Commit deb1135d authored by dirk.wintergruen

deal with missing date

parent 84bb64ce
......@@ -25,6 +25,21 @@ OUTW = ["-PRON-", "et", "al", "planet", "star", "mass", "model", "use", "system"
def getDocs(corpus, lang="en"):
    """Yield ``(key, doc)`` pairs for every document of every corpus in *corpus*.

    Args:
        corpus: Mapping from a key to either a ``Corpus`` instance or a value
            that is handed to ``textacy.Corpus.load`` as its second argument
            (presumably a path to a stored corpus -- confirm against callers).
        lang: Passed to ``textacy.Corpus.load`` as its first argument when an
            entry has to be loaded.

    Yields:
        Tuples ``(key, doc)`` where ``key`` is the mapping key the document's
        corpus was stored under.

    Entries that cannot be loaded are logged and skipped instead of aborting
    the whole iteration.
    """
    for k, corps in corpus.items():
        logging.info(f"Analysing {k}")
        if not isinstance(corps, Corpus):
            try:
                corps = textacy.Corpus.load(lang, corps)
            except Exception:
                # Best effort: skip entries that fail to load, but do not
                # swallow KeyboardInterrupt/SystemExit the way a bare
                # ``except:`` would, and keep the traceback in the log so the
                # "unknown" error can actually be diagnosed.
                logger.error(f"{corps} unkown error -- ignored!", exc_info=True)
                continue
        for doc in corps:
            yield k, doc
def filter_func(x, outw=OUTW, additional_wordlist=[]):
if x in (outw + additional_wordlist):
return False
......@@ -130,20 +145,28 @@ class Analyser(object):
for k,doc in getDocs(self.corpus,self.lang):
md = textacy.spacier.doc_extensions.get_meta(doc)
year = None
if "creator" in md:
del md["creator"]
if not "date" in md:
# print("no date")
continue
if "year" in md:
year = md["year"]
else:
continue
date = md.get("date",None)
if year:
date = "%s-1-1"%year
else:
date = md.get("date")
del md["date"]
md["_year"] = None
if date:
md["_month"] = dateparser.parse(date).month
md["_date"] = date
md["_year"] = dateparser.parse(date).year
del md["date"]
year = md.get("year",None)
if year:
md["_year"] = year
......@@ -584,6 +607,7 @@ if __name__ == '__main__':
corpus = None
cnt = 0
if args.corpus_file:
if os.path.isdir(args.corpus_file): # if it is a directory, assume there is one corpus file for each year:
corpus = {}
......@@ -593,6 +617,8 @@ if __name__ == '__main__':
logger.info(f"Loading: {y}")
try:
corpus[y] = textacy.Corpus.load(args.lang,os.path.join(args.corpus_file,f))
#cnt +=1
#if cnt > 4 : break
except:
logger.error(f"Error: {y}")
continue
......@@ -603,7 +629,7 @@ if __name__ == '__main__':
import pickle
with open(args.pickle_file,"rb") as inf:
corpus = pickle.load(inf)
elif not args.debug and not args.data_file:
else:
print("Neither corpus_file nor pickle_file set!")
sys.exit(0)
#else:
......@@ -657,19 +683,5 @@ if __name__ == '__main__':
bursts.to_excel(os.path.join(args.out_folder,"bst.xlsx"))
def getDocs(corpus, lang="en"):
    """Yield ``(key, doc)`` pairs for every document of every corpus in *corpus*.

    Args:
        corpus: Mapping from a key to either a ``Corpus`` instance or a value
            that is handed to ``textacy.Corpus.load`` as its second argument
            (presumably a path to a stored corpus -- confirm against callers).
        lang: Passed to ``textacy.Corpus.load`` as its first argument when an
            entry has to be loaded.

    Yields:
        Tuples ``(key, doc)`` where ``key`` is the mapping key the document's
        corpus was stored under.

    Entries that cannot be loaded are logged and skipped instead of aborting
    the whole iteration.
    """
    for k, corps in corpus.items():
        logging.info(f"Analysing {k}")
        if not isinstance(corps, Corpus):
            try:
                corps = textacy.Corpus.load(lang, corps)
            except Exception:
                # Best effort: skip entries that fail to load, but do not
                # swallow KeyboardInterrupt/SystemExit the way a bare
                # ``except:`` would, and keep the traceback in the log so the
                # "unknown" error can actually be diagnosed.
                logger.error(f"{corps} unkown error -- ignored!", exc_info=True)
                continue
        for doc in corps:
            yield k, doc
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment