Commit 6fa119f8 authored by dirk.wintergruen's avatar dirk.wintergruen

parameters for batch creation added

parent d55d472d
"""count n-gramms per year from solr storage and writes the counts into files, it also writes the overall sum of all
in a files sum.tsv.
--- in the folder freq you find the absolute number of occurances
--- in the folder doc_freq you find the number of documents an n-gram occurs.
"""
import argparse
import os
import os.path
from collections import defaultdict
from multiprocessing import Pool

import pysolr
import spacy
def parallelCount(url,
                  start=1900,
                  end=2005,
                  lang="en_core_web_lg",
                  field="abstract",
                  ngrams=None,
                  include_pos=None,
                  out_folder=None,
                  worker=10):
    """Count n-grams per year in parallel and write per-year and summed counts.

    One pool worker processes one year (see ``do_year``); afterwards the
    per-year absolute counts and document counts are summed and written to
    ``<out_folder>/freq/sum.tsv`` and ``<out_folder>/doc_freq/sum.tsv``.

    NOTE(review): the parameter list between ``url`` and ``worker`` was not
    fully visible in the reviewed diff; defaults mirror ``nonParallelCount``
    and the ``__main__`` call site — confirm against the original signature.

    :param url: solr endpoint queried by the workers
    :param start: first year (inclusive)
    :param end: last year (inclusive)
    :param lang: spacy model name forwarded to ``get_doc_solr``
    :param field: solr document field to analyse
    :param ngrams: n-gram sizes to count (defaults to [1, 2])
    :param include_pos: pos tags to keep (defaults to NOUN/ADJ/ADV)
    :param out_folder: base folder containing the ``freq``/``doc_freq`` trees
    :param worker: number of pool processes
    :return: ``(all_cnt, all_doc_cnt)`` — summed occurrence and document counts
    """
    if ngrams is None:
        ngrams = [1, 2]
    if include_pos is None:
        include_pos = ["NOUN", "ADJ", "ADV"]
    # one batch tuple per year; end is inclusive
    batches = [(y, lang, field, url, ngrams, include_pos, out_folder)
               for y in range(start, end + 1)]
    with Pool(worker) as p:
        all_cnts = p.map(do_year, batches)
    # sum up the per-year results returned by the workers
    all_cnt = defaultdict(int)
    all_doc_cnt = defaultdict(int)
    for cnt, doc_cnt in all_cnts:
        for ngramm, v in cnt.items():
            all_cnt[ngramm] += v
        for ngramm, v in doc_cnt.items():
            all_doc_cnt[ngramm] += v
    # the freq/doc_freq sub-folders are created by the do_year workers
    with open(os.path.join(out_folder, "freq", "sum.tsv"), "w", encoding="utf-8") as outf:
        for ngramm, v in all_cnt.items():
            outf.write(f"{ngramm}\t{v}\n")
    with open(os.path.join(out_folder, "doc_freq", "sum.tsv"), "w", encoding="utf-8") as outf:
        for ngramm, v in all_doc_cnt.items():
            outf.write(f"{ngramm}\t{v}\n")
    return all_cnt, all_doc_cnt
def do_year(args):
    """Worker: count n-grams for a single year and write the per-year files.

    ``args`` is one batch tuple from ``parallelCount``:
    ``(year, lang, field, url, ngrams, include_pos, out_folder)``.
    Writes ``<out_folder>/freq/ngrams_<year>.tsv`` (absolute counts) and
    ``<out_folder>/doc_freq/ngrams_<year>.tsv`` (document counts).

    :return: ``(cnt, doc_cnt)`` — the year's total and document counters,
        summed up by ``parallelCount``.
    """
    y, lang, field, url, ngrams, include_pos, out_folder = args
    print(y)
    # start == end: fetch exactly this one year
    docs = get_doc_solr(url, start=y, end=y, lang=lang, field=field)
    cnt, cnt_year, doc_cnt, doc_cnt_year = count(docs, ngrams=ngrams, include_pos=include_pos)
    # absolute occurrence counts
    out_folder_cnt = os.path.join(out_folder, "freq")
    os.makedirs(out_folder_cnt, exist_ok=True)
    for year, res2 in cnt_year.items():
        with open(os.path.join(out_folder_cnt, f"ngrams_{year}.tsv"), "w", encoding="utf-8") as outf:
            for r, v in res2.items():
                outf.write(f"{r}\t{v}\n")
    # document frequencies (number of documents an n-gram occurs in)
    out_folder_cnt = os.path.join(out_folder, "doc_freq")
    os.makedirs(out_folder_cnt, exist_ok=True)
    for year, res2 in doc_cnt_year.items():
        with open(os.path.join(out_folder_cnt, f"ngrams_{year}.tsv"), "w", encoding="utf-8") as outf:
            for r, v in res2.items():
                outf.write(f"{r}\t{v}\n")
    return cnt, doc_cnt
def get_doc_solr(url, start=1900, end=1905, lang="en_core_web_lg", field="abstract"):
#nlp = spacy.load(lang)
solr = pysolr.Solr(url)
for y in range(start, end):
for y in range(start, end+1):
#res = solr.search(q="year:%s" % y, rows=10000000, fl="id,pdf_txt_en")
res = solr.search(q="year:%s AND journal:nature" % y, rows=10000000, fl="id," + field)
for r in res:
print(r["id"])
#print(r["id"])
txt = r[field]
if isinstance(txt,list):
txt = " ".join(txt)
......@@ -89,8 +114,6 @@ def count(docs, ngrams=[1,2],include_pos=["NOUN", "ADJ", "ADV"]):
#include_pos=["NOUN"])
include_pos=include_pos)
for n in ngramms:
cnt[n] += 1
cnt_year[x._.meta["year"]][n] +=1
def nonParallelCount(url,
                     start=1900,
                     end=2005,
                     lang="en_core_web_lg",
                     field="abstract",
                     ngrams=[1, 2],
                     include_pos=["NOUN", "ADJ", "ADV"]):
    """Count n-grams sequentially (single process) over the whole year range.

    NOTE(review): ``start``/``end`` defaults were not visible in the reviewed
    diff — confirm. ``url`` and ``lang`` are currently ignored: the solr core
    is hard-coded below and ``get_doc_solr`` is called without ``lang`` —
    confirm whether that is intended.

    :param ngrams: n-gram sizes, as a list of ints or a comma separated string
    :param include_pos: pos tags to keep, as a list or comma separated string
    :return: ``(cnt, cnt_year)`` — total counts and per-year counts
    """
    docs = get_doc_solr("http://localhost:8983/solr/science_in_context", field=field, start=start,
                        end=end)
    # accept both the raw comma separated CLI strings and already parsed
    # lists; the __main__ caller passes lists, so an unconditional .split()
    # here would raise AttributeError
    if isinstance(ngrams, str):
        ngrams = [int(i) for i in ngrams.split(",")]
    if isinstance(include_pos, str):
        include_pos = include_pos.split(",")
    # count() returns four values (see do_year); the document counts are not
    # used by the sequential code path
    cnt, cnt_year, doc_cnt, doc_cnt_year = count(docs, ngrams=ngrams, include_pos=include_pos)
    return cnt, cnt_year
......@@ -122,7 +144,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--field",default="abstract",help="field to be indexed")
parser.add_argument("--ngrams", default="1", help="Comma separated list which ngramms to cound")
parser.add_argument("--ngrams", default="1", help="Comma separated list which ngramms to count, use ';' to separate ngramm for separeared folders")
parser.add_argument("--include_pos", default="NOUN,ADJ,ADV", help="which pos-tags to include")
parser.add_argument("--start_year", default=1900, type=int)
parser.add_argument("--end_year", default=2005, type=int)
......@@ -132,42 +154,56 @@ if __name__ == '__main__':
args = parser.parse_args()
url = "http://localhost:8983/solr/science_in_context"
ngrams = [int(i) for i in args.ngrams.split(",")]
os.makedirs(args.out_folder, exist_ok=True)
if args.worker == 1:
res,res_year = nonParallelCount(url,
start=args.start_year,
end=args.end_year,
lang="en_core_web_lg",
field=args.field,
ngrams=ngrams,
include_pos=args.include_pos.split(","))
if res:
with open(os.path.join(args.out_folder, "ngrams.txt"), "w", encoding="utf-8") as outf:
for r, v in res.items():
outf.write(f"{r}\t{v}\n")
for y, res2 in res_year.items():
with open(os.path.join(args.out_folder, f"ngrams_{y}.txt"), "w", encoding="utf-8") as outf:
for r, v in res2.items():
outf.write(f"{r}\t{v}\n")
else:
res_year = parallelCount(url,
start=args.start_year,
end=args.end_year,
lang="en_core_web_lg",
field=args.field,
ngrams=ngrams,
include_pos=args.include_pos.split(","),
worker=args.worker,
out_folder=args.out_folder
)
res = None
ngram_loop = args.ngrams.split(";")
for ngs in ngram_loop:
ngrams = [int(i) for i in ngs.split(",")]
print (ngs)
if args.worker == 1:
res,res_year = nonParallelCount(url,
start=args.start_year,
end=args.end_year,
lang="en_core_web_lg",
field=args.field,
ngrams=ngrams,
include_pos=args.include_pos.split(","))
if res:
with open(os.path.join(args.out_folder, "ngrams.txt"), "w", encoding="utf-8") as outf:
for r, v in res.items():
outf.write(f"{r}\t{v}\n")
for y, res2 in res_year.items():
with open(os.path.join(args.out_folder, f"ngrams_{y}.txt"), "w", encoding="utf-8") as outf:
for r, v in res2.items():
outf.write(f"{r}\t{v}\n")
else:
res_year = parallelCount(url,
start=args.start_year,
end=args.end_year,
lang="en_core_web_lg",
field=args.field,
ngrams=ngrams,
include_pos=args.include_pos.split(","),
worker=args.worker,
out_folder=args.out_folder
)
res = None
## now I am counting all words and calculate frequencies
......@@ -5,35 +5,66 @@ import argparse
EXCLUDE_LIST="of,from,by,with,on".split(",")
def createFromNGrammFolder(folder,out_folder=None,left=False):
with open(os.path.join(folder,"sum.tsv"),"r",encoding="utf-8") as inf:
sums = defaultdict(float)
##sum
for l in inf.readlines():
l = l.replace("\n", "")
if l == "":
continue
ngramm_cnt = l.split("\t")
ngramm = ngramm_cnt[0]
cnt = ngramm_cnt[1]
ngramm = " ".join(filter(lambda x: not x in EXCLUDE_LIST, ngramm.split(" ")))
sums[ngramm] += float(cnt)
_sum = sum(sums.values())
neighbours = defaultdict(lambda : defaultdict(int))
neighbours_rel = defaultdict(lambda: defaultdict(float))
for fn in os.listdir(folder):
neighbours_file = defaultdict(list)
with open(os.path.join(folder,fn),"r",encoding="utf-8") as inf:
for l in inf.readlines():
l= l.replace("\n", "")
if l == "":
continue
ngramm_cnt = l.split("\t")
ngramm = ngramm_cnt[0]
cnt = ngramm_cnt[1]
splitted = ngramm.split(" ")
if left:
word = splitted[-1]
neighbour_words = filter(lambda x: not x in EXCLUDE_LIST, splitted[0:-1])
else:
word=splitted[0]
neighbour_words = filter(lambda x: not x in EXCLUDE_LIST, splitted[1:])
neighbour_words = " ".join(neighbour_words)
neighbours_file[word].append((neighbour_words,cnt))
neighbours[word][neighbour_words] += int(cnt)
if not fn == "sum.tsv" :
neighbours_file = defaultdict(lambda : defaultdict(int))
sums_file = defaultdict(float)
with open(os.path.join(folder,fn),"r",encoding="utf-8") as inf:
for l in inf.readlines():
l= l.replace("\n", "")
if l == "":
continue
ngramm_cnt = l.split("\t")
ngramm = ngramm_cnt[0]
cnt = ngramm_cnt[1]
splitted = ngramm.split(" ")
if left:
word = splitted[-1]
# neighbour_words = filter(lambda x: not x in EXCLUDE_LIST, splitted)
else:
word=splitted[0]
neighbour_words = filter(lambda x: not x in EXCLUDE_LIST, splitted)
neighbour_words = " ".join(neighbour_words)
neighbours_file[word][neighbour_words] += int(cnt)
#neighbours_file_rel[word].append((neighbour_words, int(cnt) / _sum))
neighbours[word][neighbour_words] += int(cnt)
neighbours_rel[word][neighbour_words] += int(cnt) / _sum
sums_file[ngramm] += float(cnt)
_sums_file=sum(sums_file.values())
neighbours_file_rel = defaultdict(lambda: defaultdict(float))
for w in neighbours_file:
for nw in neighbours_file[w]:
neighbours_file_rel[w][nw] = neighbours_file[word][neighbour_words] / _sums_file
if out_folder:
print(f"save:{fn}")
saveNeighbours(neighbours_file, os.path.join(out_folder,fn))
saveNeighboursDict(neighbours_file, os.path.join(out_folder,"absolute",fn))
saveNeighboursDict(neighbours_file_rel, os.path.join(out_folder, "relative", fn))
return neighbours
return neighbours,neighbours_rel
def saveNeighbours(neighbours,fn):
with open(fn,"w",encoding="utf-8") as outf:
......@@ -54,12 +85,47 @@ if __name__ == '__main__':
parser.add_argument("-o", "--out_folder", required=True)
parser.add_argument("-l", "--left", const=True,default=False,nargs="?",help="left neighbours instead of right ones")
parser.add_argument("--ngramms",default="False",help="Set this if you want to generate all countings if the ngramm are in subfolders.")
args = parser.parse_args()
os.makedirs(args.out_folder,exist_ok=True)
nbs = createFromNGrammFolder(args.in_folder,args.out_folder,left=args.left)
print("save nb")
saveNeighboursDict(nbs,os.path.join(args.out_folder,"all.txt"))
print("finished!")
if args.ngramms is not None:
for freq_doc_freq in ["freq","doc_freq"]:
for left in [True,False]:
for ngramm in args.ngramms.split(","):
print(f"F:{freq_doc_freq},L:{left} N:{ngramm}")
out_folder = os.path.join(args.out_folder,ngramm)
if left:
lr = "l"
else:
lr = "r"
path = os.path.join(out_folder,lr)
path_rel = os.path.join(path, "relative")
os.makedirs(path_rel, exist_ok=True)
path_abs = os.path.join(path, "absolute")
os.makedirs(path_abs, exist_ok=True)
in_folder = os.path.join(args.in_folder,ngramm,freq_doc_freq)
nbs, nbs_rel = createFromNGrammFolder(in_folder, path, left=left)
print("save nb")
saveNeighboursDict(nbs, os.path.join(path_abs, "sum.tsv"))
saveNeighboursDict(nbs_rel, os.path.join(path_rel, "sum.tsv"))
else:
os.makedirs(os.path.join(args.out_folder,"absolute"),exist_ok=True)
os.makedirs(os.path.join(args.out_folder,"relative"), exist_ok=True)
nbs,nbs_rel = createFromNGrammFolder(args.in_folder,args.out_folder,left=args.left)
print("save nb")
saveNeighboursDict(nbs,os.path.join(os.path.join(args.out_folder,"absolute"),"sum.tsv"))
saveNeighboursDict(nbs_rel, os.path.join(os.path.join(args.out_folder, "relative"), "sum.tsv"))
print("finished!")
......@@ -5,11 +5,10 @@ from tqdm import tqdm
from functools import lru_cache
class Flrc(object):
def __init__(self,noun_candidates,nouns_left,nouns_right):
self.fqs = defaultdict(int)
self.fqs = None
for ncs in noun_candidates:
self._getFrequenciesFromFolder(ncs, add=True)
......@@ -24,68 +23,69 @@ class Flrc(object):
#self.fl_l = lambda x: fl(x, fls_l)
#self.fl_r = lambda x: fl(x, fls_r)
self.sum_fl_l = sum([self._fl(x, self.fls_l) for x in self.fls_l])
self.sum_fl_r = sum([self._fl(x, self.fls_r) for x in self.fls_r])
self.sum = self._all_fl(self.fqs)
#self.sum_fl_l = sum([self._sum(x, self.fls_l) for x in self.fls_l])
#self.sum_fl_r = sum([self._sum(x, self.fls_r) for x in self.fls_r])
def _getFrequenciesFromFolder(self, folder, fqs = None, add = False):
def _getFrequenciesFromFolder(self, folder, fqs,fqs_sum = (None,None), add = False):
# get all the count out of the folder (normally per year) sum the up and give frequency and total_count back
if fqs is not None and add:
raise ValueError("if add is true fqs has to be None!")
if add:
fqs = self.fqs
fqs_sum = self.fqs_sum
if not fqs and not add:
if not fqs:
fqs = defaultdict(int)
if add:
fqs = self.fqs
if not fqs_sum:
fqs_sum = defaultdict(int)
for fn in tqdm(os.listdir(folder)):
with open(os.path.join(folder,fn),"r",encoding="utf-8") as inf:
for l in inf.readlines():
l = l.replace("\n","")
term,cnt = l.split("\t")
fqs[term] += int(cnt)
if fn != "sum.txt":
with open(os.path.join(folder,fn),"r",encoding="utf-8") as inf:
for l in inf.readlines():
l = l.replace("\n","")
term,cnt = l.split("\t")
fqs[term] += int(cnt)
else:
with open(os.path.join(folder, fn), "r", encoding="utf-8") as inf:
for l in inf.readlines():
l = l.replace("\n", "")
term, cnt = l.split("\t")
fqs_sum[term] = int(cnt)
self.fqs = fqs
return fqs
_sum = sum(fqs.values())
self.fqs_sum= {k : x/_sum for k,x in fqs.items()}
return fqs,fqs_sum
@staticmethod
def _getFLs(file,fl = None):
if fl is None:
fl = defaultdict(lambda: defaultdict(int))
fl = defaultdict(lambda: defaultdict(Fqs))
with open(file,"r",encoding="utf-8") as inf:
for l in inf.readlines():
l = l.replace("\n","")
splitted=l.split("\t")
word = splitted[0]
for i in range(1,len(splitted),2):
fl[word][splitted[i]] = splitted[i+1]
fl[word][splitted[i]].value = int(splitted[i+1])
fl[word][splitted[i]].typ = file
return fl
@staticmethod
def _all_fl(fls):
return sum(fls.values())
@staticmethod
def _fl(l,fls):
vals = [int(v) for k,v in fls[l].items()]
return sum(vals)
def fl_l(self,cn):
return self._fl(cn, self.fls_l)
def fl_r(self, cn):
return self._fl(cn, self.fls_l)
def flr(self,cn):
f_cn = self.fqs[cn]/self.sum
f_cn = self.fqs[cn].value
term2 = 1
for l in cn.split(" "):
term2 = term2 * (self.fl_l(l)/self.sum_fl_l +1) *(self.fl_r(l)/self.sum_fl_r + 1)
term2 = term2 * (self.fls_l[l] +1) *(self.fls_r[l] + 1)
term2 = term2 ** (1/(2*len(cn.split(" "))))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment