Commit 818ff715 authored by dirk.wintergruen

int exchanged by float

parent 7435dd5f
......@@ -5,6 +5,7 @@ import logging
import os.path
import pandas
class TFIDF(object):
def __init__(self,freq_folder,period_len,start_year=1900,end_year=2015):
......@@ -63,6 +64,22 @@ class TFIDF(object):
df = self.DF(term, period)
return tf * math.log(d / df)
def Jaccard(self, term, period):
    """Return a Jaccard-style score for a multi-word ``term`` in ``period``.

    The numerator is ``self.DF(term, period)`` for the whole term; the
    denominator is the sum of ``self.DF(word, period)`` over the words
    obtained by splitting ``term`` on single spaces.

    NOTE(review): the denominator is a plain sum of the per-word values, not
    a set union, so this is presumably an approximation of the strict
    Jaccard index -- confirm that this is intended.

    Returns 0.0 when the summed denominator is zero instead of raising
    ZeroDivisionError.
    """
    df_and = self.DF(term, period)
    df_or = sum(self.DF(word, period) for word in term.split(" "))
    if df_or == 0:
        # 0/0: none of the words was counted at all, so there is no overlap
        # to report.
        return 0.0
    return df_and / df_or
def Odds(self, term, period):
    """Return the odds of ``term`` in ``period``: ``df / (d - df)``.

    ``df`` is ``self.DF(term, period)`` and ``d`` is ``self.D(period)``.

    When ``d - df`` is zero the plain quotient is undefined; in that case
    the result is ``float("inf")`` if ``df`` is non-zero and 0.0 if ``df``
    is zero as well, instead of raising ZeroDivisionError.
    """
    df = self.DF(term, period)
    remainder = self.D(period) - df
    if remainder == 0:
        # df == d: infinite odds, unless both are zero (nothing counted).
        return float("inf") if df else 0.0
    return df / remainder
if __name__ == '__main__':
freq_folder ="/tmp/out3_filter/"
......
......@@ -79,11 +79,11 @@ def createFromNgramFolder(folder,out_folder=None,left=False, one_gram = set()):
## check if one part is a one_gram
word,neighbour_words = getWord(ngram,one_gram,left)
neighbours_file[word][neighbour_words] += int(cnt)
neighbours_file[word][neighbour_words] += float(cnt)
#neighbours_file_rel[word].append((neighbour_words, int(cnt) / _sum))
neighbours[word][neighbour_words] += int(cnt)
neighbours_rel[word][neighbour_words] += int(cnt) / _sum
neighbours[word][neighbour_words] += float(cnt)
neighbours_rel[word][neighbour_words] += float(cnt) / _sum
sums_file[ngram] += float(cnt)
_sums_file=sum(sums_file.values())
......@@ -141,7 +141,7 @@ if __name__ == '__main__':
if args.ngrams is not None:
for freq_doc_freq in ["freq","doc_freq, freq_l1"]:
for freq_doc_freq in ["freq", "doc_freq", "freq_l1"]:
one_gram = read1ngramm(args.in_folder, freq_doc_freq)
for left in [True,False]:
for ngram in args.ngrams.split(","):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment