Update SpeechTagger

94f30892 · Your Name · f11e5e5f · 94f30892
Commit 94f30892 authored 2 years ago by Your Name
--- a/pipeline/components/speech_tagger.py
+++ b/pipeline/components/speech_tagger.py
@@ -6,7 +6,7 @@ from ..global_wordlists import Q_MARKS_C, Q_MARKS_O
 from ..utils_methods import add_extension


-def flair_speech_tagger(doc, sentence_level=True):
+def flair_speech_tagger(doc, sentence_level=False):
    """Spacy pipeline component.
        Tags tokens and clauses with speech tags.
        Wrapper for the "Redewiedergabe" taggers from https://github.com/redewiedergabe/tagger.
@@ -14,7 +14,7 @@ def flair_speech_tagger(doc, sentence_level=True):
    Args:
        doc (`Doc`): A spacy document object.
        sentence_level (boolean): If True, the taggers take each sentence separately as input;
-            if False, the taggers take the whole document at once as input.
+            if False, the taggers take chunks of whole sentences, each up to 100 tokens (a single longer sentence forms its own chunk), as input.
    
    Returns:
        `Doc`: A spacy document object.
@@ -33,12 +33,24 @@ def flair_speech_tagger(doc, sentence_level=True):
            add_speech_tags_to_tokens(sent, text, SPEECH_TAGGERS._, "direct")
            add_speech_tags_to_tokens(sent, text, SPEECH_TAGGERS._, "reported")
    else:
-        text = " ".join([token.text for token in doc if not token.is_space])
-        text = Sentence(text, use_tokenizer=False)
-        add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "indirect")
-        add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "free_indirect")
-        add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "direct")
-        add_speech_tags_to_tokens(doc, text, SPEECH_TAGGERS._, "reported")
+        chunks = []
+        chunk = []
+        for sent in doc.sents:
+            tokens = list(sent)
+            if len(chunk) + len(tokens) <= 100 or len(chunk) == 0:
+                chunk.extend(tokens)
+            else:
+                chunks.append(chunk)
+                chunk = tokens
+        if len(chunk) > 0:
+            chunks.append(chunk)
+        for chunk in chunks:
+            text = " ".join([token.text for token in chunk if not token.is_space])
+            text = Sentence(text, use_tokenizer=False)
+            add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "indirect")
+            add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "free_indirect")
+            add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "direct")
+            add_speech_tags_to_tokens(chunk, text, SPEECH_TAGGERS._, "reported")
    
    assign_speech_tags_to_clauses(doc)