...
 
# --- Standard library ---
import logging
import multiprocessing
import sys
# Single merged import: the file previously imported Counter twice
# (once alone, once together with defaultdict).
from collections import Counter, defaultdict

# --- Third party ---
# NOTE(review): billiard's Pool deliberately replaces multiprocessing.pool.Pool
# (old import kept below for reference) — presumably for compatibility when
# running inside a Celery worker, where billiard tolerates daemonic parent
# processes; confirm against the deployment setup before removing.
#from multiprocessing.pool import Pool
from billiard.pool import Pool
import dateparser
......@@ -142,7 +142,7 @@ class Analyser(object):
data = pd.DataFrame(columns=["title", "_date", "_year", "_month","_words"])
ts = time.time()
logger.info("start creating dataset: ")
word_counts = defaultdict(int)
......@@ -198,7 +198,10 @@ class Analyser(object):
logger.debug(f"Document will be ignored {year} > {self.end} ")
continue
md["_words"] = [w.lemma_ for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
md["_words"] = [f"{w.lemma_.lower()}@{w.pos_}" for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
for w,c in Counter(md["_words"]).items():
word_counts[w] += c
#md["_doc"] = doc
......@@ -207,7 +210,9 @@ class Analyser(object):
data.loc[cnt] = series
cnt += 1
logger.info(f"Created datset in {time.time()-ts}")
return data
word_counts = pd.Series(word_counts)
self.wordCounts = word_counts
return data,word_counts
def createAllPresentWordsPerYearMonth(self,data):
......@@ -235,33 +240,6 @@ class Analyser(object):
if self._uniqueWords and self.threshold == threshold:
return self._uniqueWords
t_s = time.time()
if self.wordCounts is None:
if self.in_folder and os.path.exists(os.path.join(self.in_folder, "word_counts.csv")):
logger.info("Read wordlist")
wcs = pd.read_csv(os.path.join(self.in_folder, "word_counts.csv"),squeeze = True, index_col=0,dtype = {"0":"float64"})
logger.debug(wcs)
self.wordCounts = wcs
else:
logger.info("Start creating wordlist")
wcs = pd.Series(dtype="float64")
for corpus in self.corpus.values():
if not isinstance(corpus,Corpus):
corpus = textacy.Corpus.load(self.lang,corpus)
wcs_tmp = pd.Series(corpus.word_counts(as_strings=True))
wcs = wcs.add(wcs_tmp,fill_value=0)
self.wordCounts = wcs
logger.info(f"Creted wordlist: f{time.time()-t_s} seconds.")
if self.save_intermediate:
wcs.to_csv(os.path.join(self.out_folder,"word_counts.csv"))
logger.info(f"Creating filtered list with threshold: {threshold}")
filteredWordList = {}
......@@ -670,7 +648,7 @@ if __name__ == '__main__':
with open(all_r_path,"rb") as inf:
analyser.all_r = pickle.load(inf)
if data is None:
data = analyser.createYearMonthDataset()
data,word_counts = analyser.createYearMonthDataset()
logger.info("saving data")
......