Commit 06d26f63 authored by dirk.wintergruen

filter one-character terms

parent 6cec575d
@@ -22,6 +22,15 @@ from multiprocessing import Pool
 #FIELD = "abstract"
 #
+
+def word_filter(x):
+    words = x.split(" ")
+    for w in words:
+        if len(w)<2:
+            return False
+    return True
+
+
 def parallelCount(url,
                   start=1900,
                   end=1905,
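For context (not part of the commit): the new word_filter rejects any term that contains a one-character token, e.g.:

    # Hypothetical checks of the behaviour above:
    word_filter("quantum mechanics")   # True  -- every token has >= 2 characters
    word_filter("a priori")            # False -- "a" has length 1
    word_filter("x")                   # False -- the single token "x" has length 1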
@@ -109,12 +118,12 @@ def count(docs, ngrams=[1,2],include_pos=["NOUN", "ADJ", "ADV"]):
     doc_cnt = defaultdict(int)
     for x in docs:
         counted = set()
-        ngramms = x.doc._.to_terms_list(ngrams=ngrams, entities=False, as_strings=True,
+        ngs = x.doc._.to_terms_list(ngrams=ngrams, entities=False, as_strings=True,
                                         normalize="lemma",
                                         #include_pos=["NOUN"])
                                         include_pos=include_pos)
-        for n in ngramms:
+        for n in filter(word_filter,ngs):
             cnt[n] += 1
             cnt_year[x._.meta["year"]][n] +=1
             if not n in counted:
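For readers of the diff, a minimal standalone sketch of the counting pattern in count() (hypothetical names, reusing word_filter from the hunk above; the real function also tracks per-year counts):

    from collections import defaultdict

    def count_filtered(docs_terms):
        cnt, doc_cnt = defaultdict(int), defaultdict(int)
        for terms in docs_terms:             # one iterable of terms per document
            counted = set()
            for n in filter(word_filter, terms):
                cnt[n] += 1                  # corpus-wide term frequency
                if n not in counted:         # document frequency: at most once per document
                    doc_cnt[n] += 1
                    counted.add(n)
        return cnt, doc_cnt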
@@ -144,7 +153,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--field",default="abstract",help="field to be indexed")
-    parser.add_argument("--ngrams", default="1", help="Comma separated list which ngramms to count, use ';' to separate ngramm for separeared folders")
+    parser.add_argument("--ngrams", default="1", help="Comma-separated list of which ngrams to count; use ';' to separate ngrams into separate folders")
     parser.add_argument("--include_pos", default="NOUN,ADJ,ADV", help="which pos-tags to include")
     parser.add_argument("--start_year", default=1900, type=int)
     parser.add_argument("--end_year", default=2005, type=int)
......
-### create right and left neighbours from n-gramm files
+### create right and left neighbours from n-gram files
 from collections import defaultdict
 import os,os.path
 import argparse
 EXCLUDE_LIST="of,from,by,with,on".split(",")
-def createFromNGrammFolder(folder,out_folder=None,left=False):
+def createFromNgramFolder(folder,out_folder=None,left=False):
     with open(os.path.join(folder,"sum.tsv"),"r",encoding="utf-8") as inf:
         sums = defaultdict(float)
         ##sum
@@ -12,11 +12,11 @@ def createFromNGrammFolder(folder,out_folder=None,left=False):
             l = l.replace("\n", "")
             if l == "":
                 continue
-            ngramm_cnt = l.split("\t")
-            ngramm = ngramm_cnt[0]
-            cnt = ngramm_cnt[1]
-            ngramm = " ".join(filter(lambda x: not x in EXCLUDE_LIST, ngramm.split(" ")))
-            sums[ngramm] += float(cnt)
+            ngram_cnt = l.split("\t")
+            ngram = ngram_cnt[0]
+            cnt = ngram_cnt[1]
+            ngram = " ".join(filter(lambda x: not x in EXCLUDE_LIST, ngram.split(" ")))
+            sums[ngram] += float(cnt)
     _sum = sum(sums.values())
     neighbours = defaultdict(lambda : defaultdict(int))
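For illustration (hypothetical input), the EXCLUDE_LIST filter above strips function words from an n-gram before summing, so variants collapse onto one key:

    # "rate of decay" -> "rate decay"; its count is added to sums["rate decay"]
    ngram = " ".join(w for w in "rate of decay".split(" ") if w not in EXCLUDE_LIST)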
@@ -34,10 +34,10 @@ def createFromNGrammFolder(folder,out_folder=None,left=False):
             l = l.replace("\n", "")
             if l == "":
                 continue
-            ngramm_cnt = l.split("\t")
-            ngramm = ngramm_cnt[0]
-            cnt = ngramm_cnt[1]
-            splitted = ngramm.split(" ")
+            ngram_cnt = l.split("\t")
+            ngram = ngram_cnt[0]
+            cnt = ngram_cnt[1]
+            splitted = ngram.split(" ")
             if left:
                 word = splitted[-1]
             # neighbour_words = filter(lambda x: not x in EXCLUDE_LIST, splitted)
@@ -51,7 +51,7 @@ def createFromNGrammFolder(folder,out_folder=None,left=False):
             neighbours[word][neighbour_words] += int(cnt)
             neighbours_rel[word][neighbour_words] += int(cnt) / _sum
-            sums_file[ngramm] += float(cnt)
+            sums_file[ngram] += float(cnt)
     _sums_file=sum(sums_file.values())
     neighbours_file_rel = defaultdict(lambda: defaultdict(float))
@@ -85,17 +85,17 @@ if __name__ == '__main__':
     parser.add_argument("-o", "--out_folder", required=True)
     parser.add_argument("-l", "--left", const=True,default=False,nargs="?",help="left neighbours instead of right ones")
-    parser.add_argument("--ngramms",default="False",help="Set this if you want to generate all countings if the ngramm are in subfolders.")
+    parser.add_argument("--ngrams",default="False",help="Set this to generate all counts if the ngrams are in subfolders.")
     args = parser.parse_args()
-    if args.ngramms is not None:
+    if args.ngrams is not None:
         for freq_doc_freq in ["freq","doc_freq"]:
             for left in [True,False]:
-                for ngramm in args.ngramms.split(","):
-                    print(f"F:{freq_doc_freq},L:{left} N:{ngramm}")
-                    out_folder = os.path.join(args.out_folder,ngramm)
+                for ngram in args.ngrams.split(","):
+                    print(f"F:{freq_doc_freq},L:{left} N:{ngram}")
+                    out_folder = os.path.join(args.out_folder,ngram)
                     if left:
                         lr = "l"
                     else:
@@ -109,8 +109,8 @@ if __name__ == '__main__':
                     os.makedirs(path_abs, exist_ok=True)
-                    in_folder = os.path.join(args.in_folder,ngramm,freq_doc_freq)
-                    nbs, nbs_rel = createFromNGrammFolder(in_folder, path, left=left)
+                    in_folder = os.path.join(args.in_folder,ngram,freq_doc_freq)
+                    nbs, nbs_rel = createFromNgramFolder(in_folder, path, left=left)
                     print("save nb")
@@ -121,7 +121,7 @@ if __name__ == '__main__':
     else:
         os.makedirs(os.path.join(args.out_folder,"absolute"),exist_ok=True)
         os.makedirs(os.path.join(args.out_folder,"relative"), exist_ok=True)
-        nbs,nbs_rel = createFromNGrammFolder(args.in_folder,args.out_folder,left=args.left)
+        nbs,nbs_rel = createFromNgramFolder(args.in_folder,args.out_folder,left=args.left)
         print("save nb")
         saveNeighboursDict(nbs,os.path.join(os.path.join(args.out_folder,"absolute"),"sum.tsv"))
         saveNeighboursDict(nbs_rel, os.path.join(os.path.join(args.out_folder, "relative"), "sum.tsv"))
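A hypothetical single-folder invocation mirroring the else branch above (paths are placeholders):

    # Build right-neighbour tables from one ngram count folder (expects a sum.tsv inside).
    nbs, nbs_rel = createFromNgramFolder("/tmp/out3/2/freq", "/tmp/nb_out", left=False)
    saveNeighboursDict(nbs, os.path.join("/tmp/nb_out", "absolute", "sum.tsv"))
    saveNeighboursDict(nbs_rel, os.path.join("/tmp/nb_out", "relative", "sum.tsv"))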
......
@@ -8,7 +8,7 @@ import textacy,textacy.doc
 spacy_lang = textacy.load_spacy_lang("en_core_web_lg")
-fileList = list(os.listdir("/media/data/nature_ngramms/3/"))
+fileList = list(os.listdir("/media/data/nature_ngrams/3/"))
 def get_doc_solr(url, start=1900, end=2005, lang="en_core_web_lg", filter_corpus=None):
     nlp = spacy.load(lang)
@@ -33,22 +33,22 @@ def term_list_func(x, save=None):
     try:
         with open(os.path.join(save, x._.meta["doi"]), "r", encoding="utf-8") as inf:
-            ngramms = inf.read().split("\n")
+            ngrams = inf.read().split("\n")
             #print("loaded")
-            #return ngramms
+            #return ngrams
     except FileNotFoundError:
         # have to create it
-        ngramms = x.doc._.to_terms_list(ngrams=[3], entities=False, as_strings=True,
+        ngrams = x.doc._.to_terms_list(ngrams=[3], entities=False, as_strings=True,
                                         normalize="lemma",
                                         include_pos=["NOUN", "ADJ", "ADV"])
         if save:
             with open(os.path.join(save, x._.meta["doi"]), "w", encoding="utf-8") as outf:
-                outf.write("\n".join(ngramms))
+                outf.write("\n".join(ngrams))
-    return (ngramms,
+    return (ngrams,
             int(int(x._.meta["year"]) / 2))  # two-year groups
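Worked example of the year bucketing above (integer division by two maps consecutive year pairs to one group):

    int(int("1900") / 2)   # -> 950
    int(int("1901") / 2)   # -> 950, same group as 1900
    int(int("1902") / 2)   # -> 951, next group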
@@ -57,4 +57,4 @@ docs = get_doc_solr("http://dw3.mpiwg-berlin.mpg.de:8983/solr/science_in_context
 for doc in docs:
-    term_list_func(doc,save="/media/data/nature_ngramms/3/")
+    term_list_func(doc,save="/media/data/nature_ngrams/3/")
@@ -9,6 +9,7 @@ class Flrc(object):
     def __init__(self,noun_candidates,nouns_left,nouns_right):
         self.fqs = None
+        self.fqs_sum = None
         for ncs in noun_candidates:
             self._getFrequenciesFromFolder(ncs, add=True)
@@ -27,8 +28,9 @@ class Flrc(object):
         #self.sum_fl_r = sum([self._sum(x, self.fls_r) for x in self.fls_r])
-    def _getFrequenciesFromFolder(self, folder, fqs,fqs_sum = (None,None), add = False):
+    def _getFrequenciesFromFolder(self, folder, fqs_fqs_sum = (None,None), add = False):
         # read all counts out of the folder (normally one per year), sum them up, and return the frequencies and the total count
+        fqs, fqs_sum = fqs_fqs_sum
         if fqs is not None and add:
             raise ValueError("if add is true fqs has to be None!")
         if add:
@@ -64,24 +66,25 @@ class Flrc(object):
         return fqs,fqs_sum
     @staticmethod
-    def _getFLs(file,fl = None):
+    def _getFLs(self,file,fl = None):
         #get the frequencies of n-grams created by neighboursFromNGrams
         if fl is None:
-            fl = defaultdict(lambda: defaultdict(Fqs))
+            fl = defaultdict(float)
         with open(file,"r",encoding="utf-8") as inf:
             for l in inf.readlines():
                 l = l.replace("\n","")
                 splitted=l.split("\t")
                 word = splitted[0]
                 for i in range(1,len(splitted),2):
-                    fl[word][splitted[i]].value = int(splitted[i+1])
-                    fl[word][splitted[i]].typ = file
+                    fl[word] += float(splitted[i+1])
         return fl
     def flr(self,cn):
-        f_cn = self.fqs[cn].value
+        f_cn = self.fqs_sum[cn]
         term2 = 1
         for l in cn.split(" "):
@@ -89,31 +92,33 @@ class Flrc(object):
         term2 = term2 ** (1/(2*len(cn.split(" "))))
-        return f_cn * term2, term2 ,f_cn
+        return f_cn * term2, term2 ,f_cn, f_cn / term2
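For reference, flr() appears to compute the FLR term-importance score of Nakagawa and Mori for a candidate noun phrase CN = N_1 ... N_L (an editorial reading of the code, not stated in the commit):

    FLR(CN) = f(CN) * ( prod_{i=1..L} (FL(N_i) + 1) * (FR(N_i) + 1) ) ** (1 / (2 * L))

where f(CN) is the candidate's frequency and FL/FR are the left/right neighbour frequencies of its tokens; the new fourth return value, f_cn / term2, reports the frequency divided by that geometric mean instead of multiplied by it.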
 if __name__ == '__main__':
-    noun_candidates = ["/media/data/nature/counts_freq/2/doc_freq", "/media/data/nature/counts_freq/1/doc_freq"]
-    nouns_left = ["/media/data/nature/neighbours/2l/all.txt", "/media/data/nature/neighbours/3l/all.txt"]
-    nouns_right = ["/media/data/nature/neighbours/2r/all.txt", "/media/data/nature/neighbours/3r/all.txt"]
-    outf = "/tmp/out_doc_freq.tsv"
+    noun_candidates = ["/tmp/out3/2/doc_freq", "/tmp/out3/1/doc_freq"]
+    #nouns_left = ["/tmp/out3_neighbours/2/l/relative/sum.tsv", "//tmp/out3_neighbours/2/l/relative/sum.tsv"]
+    #nouns_right = ["/tmp/out3_neighbours/2/r/relative/sum.tsv", "/tmp/out3_neighbours/2/r/relative/sum.tsv"]
+    nouns_left = ["/tmp/out3_neighbours/2/l/absolute/sum.tsv", "//tmp/out3_neighbours/2/l/absolute/sum.tsv"]
+    nouns_right = ["/tmp/out3_neighbours/2/r/absolute/sum.tsv", "/tmp/out3_neighbours/2/r/absolute/sum.tsv"]
+    outfn = "/tmp/out_doc_freq.tsv"
     flrcn = Flrc(noun_candidates,nouns_left=nouns_left,nouns_right=nouns_right)
     import pandas
     print (pandas.__version__)
     vals = {}
     #for x in flrcn.fqs:
     #    vals[x] = pandas.Series(flrcn.flr(x),index=["fcrn","fr","term2"])
     #sg = pandas.DataFrame(vals)
     #    sg.to_csv("/tmp/out.csv")
-    with open(outf,"w",encoding="utf-8") as outf:
-        outf.write("\t".join(["name","fcrn","term2","fr"]) + "\n")
+    with open(outfn,"w",encoding="utf-8") as outf:
+        outf.write("\t".join(["name","fcrn","term2","fr","f_icrn"]) + "\n")
         for x in flrcn.fqs:
             v = flrcn.flr(x)
             outf.write(x + "\t" + "\t".join([str(x) for x in v]) + "\n")
-    print("saved")
+    print(f"saved to: {outfn} ")