Commit dab2967c authored by dirk.wintergruen

remove control chars

parent bdaf26ec
......@@ -24,6 +24,16 @@ import os,os.path
# Fixed word list followed by every year from 1900 to 2019 rendered as a string.
# NOTE(review): the name suggests these are tokens to be filtered out of the
# analysis; the use site is not visible here, so confirm before relying on it.
OUTW = [
    "-PRON-",
    "et",
    "al",
    "planet",
    "star",
    "mass",
    "model",
    "use",
    "system",
    "arXiv",
]
OUTW.extend(str(year) for year in range(1900, 2020))
import sys, unicodedata, re
# Every Unicode code point. The upper bound is sys.maxunicode + 1 so that the
# last code point is included. This scans the whole range once at import time.
all_chars = (chr(i) for i in range(sys.maxunicode + 1))
# All characters whose general category is "Cc" (control), concatenated with
# no separator: a separator such as '|' would itself become a member of the
# character class built below and would be stripped from the input as well.
control_chars = ''.join(c for c in all_chars if unicodedata.category(c) == 'Cc')
# Character class matching any single control character.
control_char_re = re.compile('[%s]' % re.escape(control_chars))


def remove_control_chars(s):
    """Return *s* with every Unicode control character removed.

    A control character is any code point with general category ``Cc``;
    this includes tab, line feed and carriage return. All other characters,
    including ``|``, are left untouched.

    Args:
        s: The string to clean.

    Returns:
        A copy of ``s`` without control characters.
    """
    return control_char_re.sub('', s)
def getDocs(corpus,lang="en"):
......@@ -199,7 +209,7 @@ class Analyser(object):
logger.debug(f"Document will be ignored {year} > {self.end} ")
continue
md["_words"] = [f"{w.lemma_.lower()}@{w.pos_}" for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
md["_words"] = [f"{remove_control_chars(w.lemma_.lower())}@{w.pos_}" for w in textacy.extract.words(doc, filter_stops=True, filter_punct=True)]
for w,c in Counter(md["_words"]).items():
word_counts[w] += c
......@@ -514,19 +524,20 @@ class Analyser(object):
n = len(d) # number of timepoints
cnt = 0
with open("/tmp/uniq_w","w",encoding="utf-8") as outf:
for w in self.uniqueWords:
outf.write(f"{w}\n")
if logger.getEffectiveLevel() == logging.DEBUG:
with open("/tmp/uniq_w","w",encoding="utf-8") as outf:
for w in self.uniqueWords:
outf.write(f"{w}\n")
unique_words_tmp = filter(filter_func, self.uniqueWords)
unique_words_list = list(unique_words_tmp)
size = 300
with open("/tmp/uniq_w_l", "w", encoding="utf-8") as outf:
for w in unique_words_list:
outf.write(f"{w}\n")
if logger.getEffectiveLevel() == logging.DEBUG:
with open("/tmp/uniq_w_l", "w", encoding="utf-8") as outf:
for w in unique_words_list:
outf.write(f"{w}\n")
self.createAllPresentWordsPerYearMonth(data)
......@@ -538,9 +549,10 @@ class Analyser(object):
for i in range(0, len(list(unique_words_list)), size):
batch = list(unique_words_list)[i:i + size]
with open(f"/tmp/uniq_w_l_{i}", "w", encoding="utf-8") as outf:
for w in batch:
outf.write(f"{w}\n")
if logger.getEffectiveLevel() == logging.DEBUG:
with open(f"/tmp/uniq_w_l_{i}", "w", encoding="utf-8") as outf:
for w in batch:
outf.write(f"{w}\n")
batches.append((d, n, s, gam, batch, all_r))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment