Skip to content
Snippets Groups Projects
Commit 94f30892 authored by Your Name
Browse files

Update SpeechTagger

parent f11e5e5f
No related branches found
Tags v1.1
No related merge requests found
......@@ -6,7 +6,7 @@ from ..global_wordlists import Q_MARKS_C, Q_MARKS_O
from ..utils_methods import add_extension
def flair_speech_tagger(doc, sentence_level=True):
def flair_speech_tagger(doc, sentence_level=False):
"""Spacy pipeline component.
Tags tokens and clauses with speech tags.
Wrapper for the "Redewiedergabe" taggers from https://github.com/redewiedergabe/tagger.
......@@ -14,7 +14,7 @@ def flair_speech_tagger(doc, sentence_level=True):
Args:
doc (`Doc`): A spacy document object.
sentence_level (boolean): If True, the taggers take each sentence separately as input;
if False, the taggers take the whole document at once as input.
if False, the taggers take chunks of up to 100 tokens as input.
Returns:
`Doc`: A spacy document object.
......@@ -33,12 +33,24 @@ def flair_speech_tagger(doc, sentence_level=True):
add_speech_tags_to_tokens(sent, text, SPEECH_TAGGERS._, "direct")
add_speech_tags_to_tokens(sent, text, SPEECH_TAGGERS._, "reported")
else:
text = " ".join([token.text for token in doc if not token.is_space])
text = Sentence(text, use_tokenizer=False)
add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "indirect")
add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "free_indirect")
add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "direct")
add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "reported")
chunks = []
chunk = []
for sent in doc.sents:
tokens = list(sent)
if len(chunk) + len(tokens) <= 100 or len(chunk) == 0:
chunk.extend(tokens)
else:
chunks.append(chunk)
chunk = tokens
if len(chunk) > 0:
chunks.append(chunk)
for chunk in chunks:
text = " ".join([token.text for token in chunk if not token.is_space])
text = Sentence(text, use_tokenizer=False)
add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "indirect")
add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "free_indirect")
add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "direct")
add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "reported")
assign_speech_tags_to_clauses(doc)
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment