Commit 8600deec authored by Seungbin Yim's avatar Seungbin Yim
Browse files

Add dh_publications scripts

parent 2fef05b5
This diff is collapsed.
import glob2, os, json, jsonlines
import glob2, os, json, jsonlines, sys
# Read a plain-text file and append one {"text": <line>} record per line to
# the module-level `data` list (building JSONL-ready records). Relies on
# `data` being defined at module scope before this is called.
def text_to_jsonl(filename):
with open(filename) as f:
# NOTE(review): the line below is a diff hunk marker from the scraped commit
# page -- the loop header (presumably `for line in f:`) is not visible in
# this view; confirm against the full file before editing.
......@@ -6,15 +8,13 @@ def text_to_jsonl(filename):
# Strip only the trailing newline, echo the line, and collect it.
line = line.strip('\n')
print(line)
data.append({"text": line})
# Entry point: first CLI argument is the folder holding the *.txt corpus.
# (The scraped diff kept both the old hard-coded 'SENTS' path and the old
# 'corpus_full_patterns.jsonl' output alongside the new lines, which would
# glob the corpus twice and write two files; this resolves to the new side.)
corpus_folder_path = sys.argv[1]

# Accumulator shared with text_to_jsonl(): one {"text": ...} dict per line.
data = []

for filename in glob2.glob(corpus_folder_path + '/*.txt'):
    text_to_jsonl(filename)

# Dump every collected record as one JSON object per line (JSONL).
with jsonlines.open('dh_corpus_full_patterns.jsonl', 'w') as writer:
    writer.write_all(data)
from bs4 import BeautifulSoup
import os
import logging
import sys
import spacy
# Load the small English spaCy pipeline once at import time; it is used for
# sentence splitting in write_to_file() below.
nlp = spacy.load('en_core_web_sm')
# Module-level logger -- configured here but not used in this chunk;
# presumably intended for later use.
logger = logging.getLogger('preprocess')
logger.setLevel(logging.INFO)
# Debug echo of the command line this script was invoked with.
print(len(sys.argv))
for arg in sys.argv:
print(arg)
# TEI XML dump directories, one per conference year (2015-2020 inclusive).
all_directories = ["../data/dh_pub/xml/" + str(year) for year in range(2015, 2021)]

# First CLI argument: root directory for the generated sentence files.
output_dir = sys.argv[1]

# Optional second argument; only bound when exactly three args are given
# (not referenced elsewhere in this chunk).
if len(sys.argv) == 3:
    mention_titles = sys.argv[2]

# Sub-directory suffixes for labelled output (unused in this chunk).
positive_dir = "/pos"
negative_dir = "/neg"
def write_to_file(output_dir, filename, paragraphs):
    """Sentence-split each paragraph and write one sentence per line.

    Args:
        output_dir: prefix prepended verbatim to the source file name.
            NOTE(review): no '/' separator is inserted -- callers must pass
            a trailing-slash path; kept as-is to preserve behavior.
        filename: an os.DirEntry (or similar) for the source .xml file; its
            .name is reused with a .txt extension for the output file.
        paragraphs: iterable of bs4 tags; getText(strip=True) extracts text.
    """
    out_filename = output_dir + filename.name.replace('.xml', '.txt')
    # Bug fix: the original compared exc.errno against errno.EEXIST, but the
    # errno module was never imported, so any OSError became a NameError.
    # exist_ok=True expresses the same create-if-missing intent race-free.
    parent_dir = os.path.dirname(out_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(out_filename, 'w+') as fd:
        for p in paragraphs:
            doc = nlp(p.getText(strip=True))
            for sent in doc.sents:
                # Bug fix: Span.string was removed in spaCy 3; Span.text
                # plus strip() yields the same stripped sentence text.
                fd.write(sent.text.strip() + '\n')
def retrieve_paragraphs(soup):
    """Collect every <p> tag found under every <text> element of *soup*.

    Returns a flat list preserving document order.
    """
    collected = []
    for text_node in soup.findAll('text'):
        collected.extend(text_node.findAll('p'))
    return collected
# Drive the pipeline: parse each TEI XML file, pull out its <p> elements,
# and write the sentence-split text under output_dir.
for directory in all_directories:
    for entry in os.scandir(directory):
        with open(entry.path, 'r') as tei:
            parsed = BeautifulSoup(tei, 'lxml')
            write_to_file(output_dir, entry, retrieve_paragraphs(parsed))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment