Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SSHOC
data-ingestion
Commits
8600deec
Commit
8600deec
authored
Nov 23, 2020
by
Seungbin Yim
Browse files
Add dh_publications scripts
parent
2fef05b5
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
repositories/__init__.py
0 → 100644
View file @
8600deec
repositories/extraction/data/articles_tapor.txt
0 → 100644
View file @
8600deec
This diff is collapsed.
Click to expand it.
repositories/extraction/preprocess/__init__.py
0 → 100644
View file @
8600deec
repositories/extraction/corpus_to_jsonl.py
→
repositories/extraction/
preprocess/
corpus_to_jsonl.py
View file @
8600deec
import
glob2
,
os
,
json
,
jsonlines
import
glob2
,
os
,
json
,
jsonlines
,
sys
def
text_to_jsonl
(
filename
):
with
open
(
filename
)
as
f
:
...
...
@@ -6,15 +8,13 @@ def text_to_jsonl(filename):
line
=
line
.
strip
(
'
\n
'
)
print
(
line
)
data
.
append
({
"text"
:
line
})
corpus_folder_path
=
sys
.
argv
[
1
]
corpusfolderpath
=
'SENTS'
data
=
[]
for
filename
in
glob2
.
glob
(
corpusfolderpath
+
'/*.txt'
):
for
filename
in
glob2
.
glob
(
corpus
_
folder
_
path
+
'/*.txt'
):
text_to_jsonl
(
filename
)
with
jsonlines
.
open
(
'corpus_full_patterns.jsonl'
,
'w'
)
as
writer
:
writer
.
write_all
(
data
)
with
jsonlines
.
open
(
'
dh_
corpus_full_patterns.jsonl'
,
'w'
)
as
writer
:
writer
.
write_all
(
data
)
repositories/extraction/preprocess/preprocess.py
0 → 100644
View file @
8600deec
from
bs4
import
BeautifulSoup
import
os
import
logging
import
sys
import
spacy
# spaCy English pipeline, loaded once at import and shared by the functions below.
nlp = spacy.load('en_core_web_sm')

# Module logger, configured to emit INFO and above.
logger = logging.getLogger('preprocess')
logger.setLevel(logging.INFO)

# Echo the argument count, then each command-line argument on its own line.
print(len(sys.argv))
for arg in sys.argv:
    print(arg)

# Input directories, one per year, that hold the XML files to convert.
all_directories = [
    "../data/dh_pub/xml/2015",
    "../data/dh_pub/xml/2016",
    "../data/dh_pub/xml/2017",
    "../data/dh_pub/xml/2018",
    "../data/dh_pub/xml/2019",
    "../data/dh_pub/xml/2020",
]

# First CLI argument: where the converted text files are written.
output_dir = sys.argv[1]

# Optional second CLI argument; the name is only bound when exactly two
# arguments were supplied.
if len(sys.argv) == 3:
    mention_titles = sys.argv[2]

positive_dir = "/pos"
negative_dir = "/neg"
def write_to_file(output_dir, filename, paragraphs):
    """Write the sentences found in ``paragraphs`` to a text file, one per line.

    The output file is named after ``filename.name`` with ``.xml`` replaced
    by ``.txt`` and is placed inside ``output_dir``.

    Args:
        output_dir: Directory that receives the output file. It is created,
            including missing parents, when it does not exist yet.
        filename: Object exposing a ``name`` attribute (the driver loop in
            this file passes ``os.scandir`` entries).
        paragraphs: Iterable of elements exposing ``getText(strip=True)``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    # os.path.join inserts the separator that plain string concatenation
    # dropped whenever output_dir had no trailing slash.
    out_filename = os.path.join(output_dir, filename.name.replace('.xml', '.txt'))

    # exist_ok=True replaces the former exists()/makedirs()/EEXIST dance,
    # whose error branch referenced the never-imported ``errno`` module.
    target_dir = os.path.dirname(out_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(out_filename, 'w+') as fd:
        for p in paragraphs:
            doc = nlp(p.getText(strip=True))
            for sent in doc.sents:
                # ``Span.string`` was removed in spaCy 3; ``Span.text`` is
                # available in both 2.x and 3.x and is identical after strip().
                fd.write(sent.text.strip() + '\n')
def retrieve_paragraphs(soup):
    """Collect every ``p`` match found under each ``text`` match of ``soup``.

    Args:
        soup: Object exposing ``findAll(name)``; every element returned for
            ``'text'`` must expose ``findAll`` as well.

    Returns:
        A flat list holding the ``p`` matches of each ``text`` element, in
        the order ``findAll`` yields them. Empty when there is no ``text``
        element.
    """
    paragraphs = []
    for text_element in soup.findAll('text'):
        # extend() grows the list in place; the previous ``list + list`` form
        # copied everything accumulated so far on each iteration.
        paragraphs.extend(text_element.findAll('p'))
    return paragraphs
# Driver: parse every entry of every configured directory with the lxml
# parser and write its sentences to the output directory.
for directory in all_directories:
    for entry in os.scandir(directory):
        # The parser consumes the handle inside the constructor, so the file
        # only needs to stay open for this one statement.
        with open(entry.path, 'r') as tei:
            soup = BeautifulSoup(tei, 'lxml')
        write_to_file(output_dir, entry, retrieve_paragraphs(soup))
repositories/extraction/text_to_jsonl.py
→
repositories/extraction/
preprocess/
text_to_jsonl.py
View file @
8600deec
File moved
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment