Commit b0537399 authored by dirk.wintergruen

Fix small bugs in the analyser for new pandas

parent 818ff715
include README.md
recursive-include publications/templates *
recursive-include publications/fixtures *
recursive-include publications/static *
recursive-include publications/management *
\ No newline at end of file
@@ -7,6 +7,7 @@ from multiprocessing.pool import Pool
 import dateparser
 import matplotlib
 import textacy
+from textacy import Corpus
 from tqdm import tqdm
 logger = logging.getLogger(__name__)
@@ -95,7 +96,10 @@ def do_words(args):
     return all_bursts
 class Analyser(object):
-    def __init__(self,corpus,save_intermediate=False,out_folder=None,workers=5,in_folder=None, all_r=None):
+    def __init__(self,corpus,save_intermediate=False,out_folder=None,workers=5,in_folder=None, all_r=None,
+                 start=None,
+                 end=None,
+                 lang = "en"):
         """
         :param corpus: spacy corpus
         """
@@ -111,6 +115,9 @@ class Analyser(object):
         self.workers = workers
         self.in_folder = in_folder
         self.all_r = all_r
+        self.start = start
+        self.end = end
+        self.lang = lang
     def createYearMonthDataset(self):
         cnt = 0
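For orientation, a minimal usage sketch of the extended constructor (the corpus dict and paths are hypothetical; per the new getDocs helper below, values may be textacy Corpus objects or paths to saved corpora):

corpora = {"1950": "/data/corpora/1950.bin.gz",   # hypothetical paths
           "1951": "/data/corpora/1951.bin.gz"}
analyser = Analyser(corpora,
                    save_intermediate=True,
                    out_folder="/tmp/analysis",
                    workers=5,
                    start=1950,   # ignore documents before 1950
                    end=1951,     # ignore documents after 1951
                    lang="en")    # spaCy language used when loading corpora
data = analyser.createYearMonthDataset()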
@@ -119,53 +126,60 @@ class Analyser(object):
         logger.info("start creating dataset: ")
-        for k,corpus in self.corpus.items():
-            logger.info(f"analysing: {k}")
-            for doc in tqdm(corpus):
-                md = textacy.spacier.doc_extensions.get_meta(doc)
-                if "creator" in md:
-                    del md["creator"]
-                if not "date" in md:
-                    # print("no date")
-                    continue
-                date = md.get("date",None)
-                md["_year"] = None
-                if date:
-                    md["_month"] = dateparser.parse(date).month
-                    md["_date"] = date
-                    md["_year"] = dateparser.parse(date).year
-                    del md["date"]
-                year = md.get("year",None)
-                if year:
-                    md["_year"] = year
-                    del md["year"]
-                if not date:  # date not set, default month to 1
-                    md["_month"] = 1
-                if not md["_year"]:  # no year in either "date" or a "year" field; assume the corpus name is the year
-                    try:
-                        year = int(y)
-                        md["_year"] = year
-                        if not date:  # date not set, default month to 1
-                            md["_month"] = 1
-                    except ValueError:
-                        logger.error(f"no date for : {doc._.meta}")
-                        logger.error("Document will be ignored!")
-                        continue
-                md["_words"] = [w.lemma_ for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
-                #md["_doc"] = doc
-                series = pd.Series(md)
-                data.loc[cnt] = series
-                cnt += 1
+        for doc in getDocs(self.corpus,self.lang):
+            md = textacy.spacier.doc_extensions.get_meta(doc)
+            if "creator" in md:
+                del md["creator"]
+            if not "date" in md:
+                # print("no date")
+                continue
+            date = md.get("date",None)
+            md["_year"] = None
+            if date:
+                md["_month"] = dateparser.parse(date).month
+                md["_date"] = date
+                md["_year"] = dateparser.parse(date).year
+                del md["date"]
+            year = md.get("year",None)
+            if year:
+                md["_year"] = year
+                del md["year"]
+            if not date:  # date not set, default month to 1
+                md["_month"] = 1
+            if not md["_year"]:  # no year in either "date" or a "year" field; assume the corpus name is the year
+                try:
+                    year = int(y)
+                    md["_year"] = year
+                    if not date:  # date not set, default month to 1
+                        md["_month"] = 1
+                except ValueError:
+                    logger.error(f"no date for : {doc._.meta}")
+                    logger.error("Document will be ignored!")
+                    continue
+            if self.start and year < self.start:
+                logger.debug(f"Document will be ignored {year} < {self.start} ")
+                continue
+            if self.end and year > self.end:
+                logger.debug(f"Document will be ignored {year} > {self.end} ")
+                continue
+            md["_words"] = [w.lemma_ for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
+            #md["_doc"] = doc
+            series = pd.Series(md)
+            data.loc[cnt] = series
+            cnt += 1
         logger.info(f"Created dataset in {time.time()-ts}")
         return data
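The month/year extraction leans on dateparser coping with heterogeneous date strings; a quick illustration of the two calls used above:

import dateparser

# dateparser.parse returns a datetime.datetime, so .year and .month
# work across many input formats:
print(dateparser.parse("1951-03-12").year)      # 1951
print(dateparser.parse("12 March 1951").month)  # 3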
def createAllPresentWordsPerYearMonth(self,data):
@@ -190,7 +204,7 @@ class Analyser(object):
             return self._uniqueWords
         t_s = time.time()
-        if not self.wordCounts:
+        if self.wordCounts is None:
             if self.in_folder and os.path.exists(os.path.join(self.in_folder, "word_counts.csv")):
                 logger.info("Read wordlist")
@@ -203,6 +217,9 @@ class Analyser(object):
                 logger.info("Start creating wordlist")
                 wcs = pd.Series(dtype="float64")
                 for corpus in self.corpus.values():
+                    if not isinstance(corpus,Corpus):
+                        corpus = textacy.Corpus.load(self.lang,corpus)
                     wcs_tmp = pd.Series(corpus.word_counts(as_strings=True))
                     wcs = wcs.add(wcs_tmp,fill_value=0)
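The per-corpus counts are merged with Series.add(..., fill_value=0), which aligns on the word index and treats a word missing from one corpus as zero, e.g.:

import pandas as pd

a = pd.Series({"cell": 3.0, "gene": 1.0})
b = pd.Series({"gene": 2.0, "protein": 5.0})
print(a.add(b, fill_value=0))
# cell       3.0
# gene       3.0
# protein    5.0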
@@ -463,10 +480,10 @@ class Analyser(object):
         batches = []
         # find bursts
-        unique_words_tmp = filter(filter_func, self.uniqueWords)
+        #unique_words_tmp = filter(filter_func, self.uniqueWords)
         d = data.groupby(['_year', '_month'])['_words'].count()
         # create a dataframe to hold results
-        all_bursts = pd.DataFrame(columns=['begin', 'end', 'weight'])
+        #all_bursts = pd.DataFrame(columns=['begin', 'end', 'weight'])
         # unique_words_tmp = filter(lambda x )
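For reference, the groupby line produces a document count per (year, month) cell, since _words holds one list per document:

import pandas as pd

data = pd.DataFrame({"_year":  [1950, 1950, 1951],
                     "_month": [1, 1, 2],
                     "_words": [["cell"], ["gene", "cell"], ["protein"]]})
print(data.groupby(['_year', '_month'])['_words'].count())
# _year  _month
# 1950   1         2
# 1951   2         1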
@@ -639,3 +656,18 @@ if __name__ == '__main__':
         bursts.to_excel(os.path.join(args.out_folder,"bst.xlsx"))
+def getDocs(corpus,lang="en"):
+    for k,corps in corpus.items():
+        logging.info(f"Analysing {k}")
+        if not isinstance(corps,Corpus):
+            try:
+                corps = textacy.Corpus.load(lang,corps)
+            except:
+                logger.error(f"{corps} unknown error -- ignored!")
+                continue
+        for doc in corps:
+            yield doc
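A usage sketch for the new helper (the dict and paths are hypothetical); values can be already-loaded Corpus objects or paths that getDocs loads on demand, skipping anything that fails to load:

import textacy

corpora = {
    "1950": textacy.Corpus("en"),          # already in memory
    "1951": "/data/corpora/1951.bin.gz",   # loaded lazily via Corpus.load
}
for doc in getDocs(corpora, lang="en"):
    print(doc._.meta.get("date"))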
@@ -40,7 +40,7 @@ def getFrequenciesFromFolder(folder, fqs_fqs_sum = (None ,None),with_all_years=
             l = l.replace("\n", "")
             term, cnt = l.split("\t")
             ## in rare occasions 1-gramms can be
-            fqs[term] = int(cnt)
+            fqs[term] = float(cnt)
             # print(fqs["amino acid"],fqs_sum["amino acid"])
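The int-to-float change matters because some of the frequency files (for instance the relative sums seen further below) hold non-integer values, which int() rejects outright:

float("12")    # 12.0
float("0.75")  # 0.75
int("0.75")    # ValueError: invalid literal for int() with base 10: '0.75'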
import argparse
from collections import defaultdict
import os,os.path
from tqdm import tqdm
@@ -7,7 +8,7 @@ from functools import lru_cache
 from helpers.readWordFiles import getFrequenciesFromFolder
-def read1ngramm(folder):
+def read1ngram(folder):
     """ read all 1-grams -- terms like "North Dakota" are a 1-gram! following spaCy """
     #folder = os.path.join(in_folder, "1", freq_doc_freq)
@@ -32,8 +33,8 @@ class Flrc(object):
     def split(self,cns):
         """ split, but leave everything that is in onegram together as one term """
+        cns2 = cns
         for og in self.onegram:
-            cns2 = cns
             if og in cns:
                 og2 = og.replace(" ", "@@")
                 cns2 = cns2.replace(og, og2)
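The fix hoists cns2 = cns out of the loop: resetting it on every iteration discarded earlier replacements, so only the last matching 1-gram kept its "@@" joint. A standalone sketch of the intended behaviour:

def split_protected(cns, onegram):
    # glue the space inside each known multi-word 1-gram so a later
    # whitespace split keeps the term in one piece
    cns2 = cns
    for og in onegram:
        if og in cns:
            cns2 = cns2.replace(og, og.replace(" ", "@@"))
    return cns2

print(split_protected("from New York to North Dakota", ["North Dakota", "New York"]))
# from New@@York to North@@Dakota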
@@ -45,7 +46,7 @@ class Flrc(object):
         self.fqs = None
         self.fqs_sum = None
         print("read 1 gramm")
-        self.onegram = read1ngramm(onegram_path)
+        self.onegram = read1ngram(onegram_path)
         print("read 1 ncs")
         for ncs in noun_candidates:
             self._getFrequenciesFromFolder(ncs, add=True)
@@ -152,22 +153,35 @@ class Flrc(object):
 if __name__ == '__main__':
-    noun_candidates = ["/tmp/out3/2/doc_freq", "/tmp/out3/1/doc_freq"]
-    #nouns_left = ["/tmp/out3_neighbours/2/l/relative/sum.tsv", "//tmp/out3_neighbours/2/l/relative/sum.tsv"]
-    #nouns_right = ["/tmp/out3_neighbours/2/r/relative/sum.tsv", "/tmp/out3_neighbours/2/r/relative/sum.tsv"]
-    nouns_left = ["/tmp/out3_neighbours/2/l/absolute/sum.tsv", "//tmp/out3_neighbours/2/l/absolute/sum.tsv"]
-    nouns_right = ["/tmp/out3_neighbours/2/r/absolute/sum.tsv", "/tmp/out3_neighbours/2/r/absolute/sum.tsv"]
-    noun_candidates = ["/var/tmp/media/data/nature/counts_freq/2/doc_freq", "/var/tmp/media/data/nature/counts_freq/1/doc_freq", "/var/tmp//media/data/nature/counts_freq/3/doc_freq"]
-    nouns_left = ["//var/tmp/media/data/nature/neighbours/2/l/absolute/sum.tsv", "/var/tmp//media/data/nature/neighbours/3/l/absolute/sum.tsv"]
-    nouns_right = ["/var/tmp//media/data/nature/neighbours/2/r/absolute/sum.tsv", "/var/tmp//media/data/nature/neighbours/3/r/relative/sum.tsv"]
-    onegram_path = "/var/tmp//media/data/nature/counts_freq/1/doc_freq/sum.tsv"
-    outfn = "/tmp/out_doc_freq.tsv"
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-ng", "--base_path_ngrams", help="Path to the folders with the ngrams", required=True)
+    parser.add_argument("-nb", "--base_path_neighbours", help="Path to the folders with the neighbours", required=True)
+    parser.add_argument("-o", "--out", help="output file", required=True)
+    args = parser.parse_args()
+    base_path_ngrams = args.base_path_ngrams #"/tmp/out3_filter"
+    noun_candidates = [base_path_ngrams + "/2/freq_l1", base_path_ngrams + "/1/freq_l1", base_path_ngrams + "/3/freq_l1"]
+    onegram_path = base_path_ngrams + "/1/freq_l1/sum.tsv"
+    #nouns_left = ["/tmp/out3_neighbours/2/l/relative/sum.tsv", "//tmp/out3_neighbours/2/l/relative/sum.tsv"]
+    #nouns_right = ["/tmp/out3_neighbours/2/r/relative/sum.tsv", "/tmp/out3_neighbours/2/r/relative/sum.tsv"]
+    base_path_neigbours = args.base_path_neighbours #"/tmp/out3_filter_neighbours"
+    nouns_left = [base_path_neigbours + "/2/doc_freq/l/absolute/sum.tsv",
+                  base_path_neigbours + "/3/doc_freq/l/absolute/sum.tsv"]
+    nouns_right = [base_path_neigbours + "/2/doc_freq/r/absolute/sum.tsv",
+                   base_path_neigbours + "/3/doc_freq/r/absolute/sum.tsv"]
+    #noun_candidates = ["/var/tmp/media/data/nature/counts_freq/2/doc_freq", "/var/tmp/media/data/nature/counts_freq/1/doc_freq", "/var/tmp//media/data/nature/counts_freq/3/doc_freq"]
+    #nouns_left = ["//var/tmp/media/data/nature/neighbours/2/l/absolute/sum.tsv", "/var/tmp//media/data/nature/neighbours/3/l/absolute/sum.tsv"]
+    #nouns_right = ["/var/tmp//media/data/nature/neighbours/2/r/absolute/sum.tsv", "/var/tmp//media/data/nature/neighbours/3/r/relative/sum.tsv"]
+    #onegram_path = "/var/tmp//media/data/nature/counts_freq/1/doc_freq/sum.tsv"
+    outfn = args.out #"/tmp/out_doc_freq_l2.tsv"
     print("INIT")
     flrcn = Flrc(noun_candidates,nouns_left=nouns_left,nouns_right=nouns_right,onegram_path=onegram_path)
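The hard-coded paths give way to a small CLI; a self-contained sketch of how the new arguments parse (the example paths are the commented-out defaults from the diff):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-ng", "--base_path_ngrams", help="Path to the folders with the ngrams", required=True)
parser.add_argument("-nb", "--base_path_neighbours", help="Path to the folders with the neighbours", required=True)
parser.add_argument("-o", "--out", help="output file", required=True)

args = parser.parse_args(["-ng", "/tmp/out3_filter",
                          "-nb", "/tmp/out3_filter_neighbours",
                          "-o", "/tmp/out_doc_freq_l2.tsv"])
print(args.base_path_ngrams)   # /tmp/out3_filter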
-from setuptools import setup
+from setuptools import setup, find_packages
 setup(
     name='DataMiningAndAnalysis',
     version='',
-    packages=['helpers', 'solrTools'],
+    #packages=['helpers', 'solrTools'],
     url='',
     license='',
     author='dwinter',
     author_email='',
-    description='Tools for enriching the solr index'
+    description='Tools for enriching the solr index',
+    packages=find_packages(),
+    include_package_data=True,
 )
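find_packages() discovers every package with an __init__.py instead of the stale hard-coded list, and include_package_data=True pulls in the non-Python files declared in the MANIFEST.in added above. A quick check of what gets picked up (any package names beyond helpers and solrTools are assumptions):

from setuptools import find_packages

# run from the repository root
print(find_packages())
# e.g. ['helpers', 'solrTools', 'publications', ...]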