From 857440e1e0f9681f0a9f6ec904ada148865717f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tillmann=20D=C3=B6nicke?= <doenicke@MacBook-Pro-von-Tillmann.local>
Date: Wed, 9 Aug 2023 14:12:42 +0200
Subject: [PATCH] refactor(pipeline): add super classes for all pipeline
 components

---
 main/example.py                               |  2 +-
 .../annotation_reader/annotation_reader.py    | 22 +++++++++
 .../catma_annotation_reader.py                | 14 +++---
 .../attribution_tagger/attribution_tagger.py  | 17 +++++++
 .../neural_attribution_tagger.py              | 13 +++---
 src/monapipe/pipeline/clausizer/clausizer.py  | 45 +++++++++++++++++++
 .../clausizer/dependency_clausizer.py         | 25 +++--------
 src/monapipe/pipeline/coref/coref.py          | 35 +++++++++++++++
 src/monapipe/pipeline/coref/rb_coref.py       | 29 +++---------
 .../pipeline/event_tagger/event_tagger.py     | 17 +++++++
 .../event_tagger/neural_event_tagger.py       | 13 +++---
 .../pipeline/formatter/conllu_formatter.py    | 25 ++++++-----
 src/monapipe/pipeline/formatter/formatter.py  | 32 +++++++++++++
 .../pipeline/gen_tagger/gen_tagger.py         | 19 ++++++++
 .../pipeline/gen_tagger/neural_gen_tagger.py  | 12 ++---
 .../normalizer/identity_normalizer.py         | 27 +++-------
 .../pipeline/normalizer/normalizer.py         | 36 +++++++++++++++
 .../neural_reflection_tagger.py               | 12 ++---
 .../reflection_tagger/reflection_tagger.py    | 19 ++++++++
 .../germanet_semantic_tagger.py               | 13 +++---
 .../semantic_tagger/semantic_tagger.py        | 21 +++++++++
 .../pipeline/slicer/from_start_slicer.py      | 10 ++---
 src/monapipe/pipeline/slicer/slicer.py        | 21 +++++++++
 .../speaker_extractor/rb_speaker_extractor.py | 17 +++----
 .../speaker_extractor/speaker_extractor.py    | 21 +++++++++
 .../speech_tagger/flair_speech_tagger.py      | 19 +++----
 .../quotation_marks_speech_tagger.py          | 24 +++++-----
 .../pipeline/speech_tagger/speech_tagger.py   | 26 +++++++++++
 .../heideltime_temponym_tagger.py             | 14 +++---
 .../temponym_tagger/temponym_tagger.py        | 20 +++++++++
 .../verb_analyzer/rb_verb_analyzer.py         | 30 +++++-------
 .../pipeline/verb_analyzer/verb_analyzer.py   | 41 +++++++++++++++++
 tests/pipeline/test_conllu_formatter.py       |  4 +-
 33 files changed, 507 insertions(+), 188 deletions(-)
 create mode 100644 src/monapipe/pipeline/annotation_reader/annotation_reader.py
 create mode 100644 src/monapipe/pipeline/attribution_tagger/attribution_tagger.py
 create mode 100644 src/monapipe/pipeline/clausizer/clausizer.py
 create mode 100644 src/monapipe/pipeline/coref/coref.py
 create mode 100644 src/monapipe/pipeline/event_tagger/event_tagger.py
 create mode 100644 src/monapipe/pipeline/formatter/formatter.py
 create mode 100644 src/monapipe/pipeline/gen_tagger/gen_tagger.py
 create mode 100644 src/monapipe/pipeline/normalizer/normalizer.py
 create mode 100644 src/monapipe/pipeline/reflection_tagger/reflection_tagger.py
 create mode 100644 src/monapipe/pipeline/semantic_tagger/semantic_tagger.py
 create mode 100644 src/monapipe/pipeline/slicer/slicer.py
 create mode 100644 src/monapipe/pipeline/speaker_extractor/speaker_extractor.py
 create mode 100644 src/monapipe/pipeline/speech_tagger/speech_tagger.py
 create mode 100644 src/monapipe/pipeline/temponym_tagger/temponym_tagger.py
 create mode 100644 src/monapipe/pipeline/verb_analyzer/verb_analyzer.py

diff --git a/main/example.py b/main/example.py
index 25e4299..2f8ff20 100644
--- a/main/example.py
+++ b/main/example.py
@@ -121,4 +121,4 @@ if __name__ == "__main__":
     doc_data = pickle.dumps(make_pickleable(doc))
     doc = unmake_pickleable(pickle.loads(doc_data))
 
-    print(doc._.conllu_str)
+    print(doc._.format_str)
diff --git
a/src/monapipe/pipeline/annotation_reader/annotation_reader.py b/src/monapipe/pipeline/annotation_reader/annotation_reader.py new file mode 100644 index 0000000..dc07325 --- /dev/null +++ b/src/monapipe/pipeline/annotation_reader/annotation_reader.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from typing import Optional + +from spacy.language import Language +from spacy.tokens import Doc, Token + +from monapipe.pipeline.methods import add_extension + + +class AnnotationReader: + """The super class `AnnotationReader`.""" + + assigns = ["doc._.annotations", "token._.annotations"] + + def __init__(self, nlp: Language, corpus_path: Optional[str]): + self.corpus_path = corpus_path + + add_extension(Token, "annotations", {}) + add_extension(Doc, "annotations", {}) diff --git a/src/monapipe/pipeline/annotation_reader/catma_annotation_reader.py b/src/monapipe/pipeline/annotation_reader/catma_annotation_reader.py index 06e84d1..89a7e62 100644 --- a/src/monapipe/pipeline/annotation_reader/catma_annotation_reader.py +++ b/src/monapipe/pipeline/annotation_reader/catma_annotation_reader.py @@ -9,15 +9,16 @@ import xml.etree.cElementTree as ET from typing import Any, Dict, List, Optional from spacy.language import Language -from spacy.tokens import Doc, Token +from spacy.tokens import Doc from monapipe.annotation import Annotation, AnnotationList -from monapipe.pipeline.methods import add_extension, get_doc_text +from monapipe.pipeline.annotation_reader.annotation_reader import AnnotationReader +from monapipe.pipeline.methods import get_doc_text @Language.factory( "catma_annotation_reader", - assigns=["doc._.annotations", "token._.annotations"], + assigns=AnnotationReader.assigns, default_config={"corpus_path": None}, ) def catma_annotation_reader(nlp: Language, name: str, corpus_path: Optional[str]) -> Any: @@ -51,11 +52,11 @@ def catma_annotation_reader(nlp: Language, name: str, corpus_path: Optional[str] return CatmaAnnotationReader(nlp, corpus_path) -class CatmaAnnotationReader: +class CatmaAnnotationReader(AnnotationReader): """The class `CatmaAnnotationReader`.""" def __init__(self, nlp: Language, corpus_path: Optional[str]): - self.corpus_path = corpus_path + super().__init__(nlp, corpus_path) # name space for CATMA TEI/XML self._namespace = { @@ -63,9 +64,6 @@ class CatmaAnnotationReader: "xml": "http://www.w3.org/XML/1998/namespace", } - add_extension(Token, "annotations", {}) - add_extension(Doc, "annotations", {}) - def __call__(self, doc: Doc) -> Doc: if self.corpus_path is None: return doc diff --git a/src/monapipe/pipeline/attribution_tagger/attribution_tagger.py b/src/monapipe/pipeline/attribution_tagger/attribution_tagger.py new file mode 100644 index 0000000..cae3103 --- /dev/null +++ b/src/monapipe/pipeline/attribution_tagger/attribution_tagger.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class AttributionTagger: + """The super class `AttributionTagger`.""" + + assigns = {"span._.attribution": "clause._.attribution"} + + def __init__(self, nlp: Language): + add_extension(Span, "attribution") diff --git a/src/monapipe/pipeline/attribution_tagger/neural_attribution_tagger.py b/src/monapipe/pipeline/attribution_tagger/neural_attribution_tagger.py index 6748aa4..e0dec41 100644 --- 
a/src/monapipe/pipeline/attribution_tagger/neural_attribution_tagger.py +++ b/src/monapipe/pipeline/attribution_tagger/neural_attribution_tagger.py @@ -8,17 +8,18 @@ import numpy as np import torch from sklearn.preprocessing import MultiLabelBinarizer from spacy.language import Language -from spacy.tokens import Doc, Span +from spacy.tokens import Doc from transformers import BertModel, BertTokenizer import monapipe.resource_handler as resources from monapipe.config import SETTINGS -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.attribution_tagger.attribution_tagger import AttributionTagger +from monapipe.pipeline.methods import requires @Language.factory( "neural_attribution_tagger", - assigns={"span._.attribution": "clause._.attribution"}, + assigns=AttributionTagger.assigns, default_config={}, ) def neural_attribution_tagger(nlp: Language, name: str) -> Any: @@ -39,12 +40,14 @@ def neural_attribution_tagger(nlp: Language, name: str) -> Any: return NeuralAttributionTagger(nlp) -class NeuralAttributionTagger: +class NeuralAttributionTagger(AttributionTagger): """The class `NeuralAttributionTagger`.""" def __init__(self, nlp: Language): requires(self, nlp, ["clausizer"]) + super().__init__(nlp) + # BERT tokenizer and model self._tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased") self._model = BertModel.from_pretrained( @@ -56,8 +59,6 @@ class NeuralAttributionTagger: self._label_encoder = MultiLabelBinarizer() self._label_encoder.fit([self._label_names]) - add_extension(Span, "attribution") - def __call__(self, doc: Doc) -> Doc: attribution_model = resources.access("attribution") diff --git a/src/monapipe/pipeline/clausizer/clausizer.py b/src/monapipe/pipeline/clausizer/clausizer.py new file mode 100644 index 0000000..ac3400c --- /dev/null +++ b/src/monapipe/pipeline/clausizer/clausizer.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from typing import List + +from spacy.language import Language +from spacy.tokens import Doc, Span, Token + +from monapipe.pipeline.methods import add_extension + + +class Clausizer: + """The super class `Clausizer`.""" + + assigns = { + "doc._.clauses": "doc._.clauses", + "span._.clauses": "sent._.clauses", + "span._.prec_punct": "clause._.prec_punct", + "span._.succ_punct": "clause._.succ_punct", + "span._.tokens": "clause._.tokens", + "token._.clause": "token._.clause", + } + + def __init__( + self, + nlp: Language, + dep_labels: List[str], + conj_rule_labels: List[str], + xcomp_rule_labels: List[str], + handle_semi_modals: bool, + include_ws: bool, + ): + self.dep_labels = dep_labels + self.conj_rule_labels = conj_rule_labels + self.xcomp_rule_labels = xcomp_rule_labels + self.handle_semi_modals = handle_semi_modals + self.include_ws = include_ws + + add_extension(Doc, "clauses", []) + add_extension(Span, "clauses") + add_extension(Span, "tokens") + add_extension(Span, "prec_punct") + add_extension(Span, "succ_punct") + add_extension(Token, "clause") diff --git a/src/monapipe/pipeline/clausizer/dependency_clausizer.py b/src/monapipe/pipeline/clausizer/dependency_clausizer.py index d9abb98..eedeab6 100644 --- a/src/monapipe/pipeline/clausizer/dependency_clausizer.py +++ b/src/monapipe/pipeline/clausizer/dependency_clausizer.py @@ -8,19 +8,13 @@ from spacy.language import Language from spacy.tokens import Doc, Span, Token from monapipe.lookups import lookup +from monapipe.pipeline.clausizer.clausizer import Clausizer 
from monapipe.pipeline.methods import add_extension, requires @Language.factory( "dependency_clausizer", - assigns={ - "doc._.clauses": "doc._.clauses", - "span._.clauses": "sent._.clauses", - "span._.prec_punct": "clause._.prec_punct", - "span._.succ_punct": "clause._.succ_punct", - "span._.tokens": "clause._.tokens", - "token._.clause": "token._.clause", - }, + assigns=Clausizer.assigns, default_config={ "dep_labels": [ "acl", @@ -73,7 +67,7 @@ def dependency_clausizer( ) -class DependencyClausizer: +class DependencyClausizer(Clausizer): """The class `DependencyClausizer`.""" def __init__( @@ -87,20 +81,15 @@ class DependencyClausizer: ): requires(self, nlp, ["tok2vec", "morphologizer", "lemmatizer", "parser"]) + super().__init__( + nlp, dep_labels, conj_rule_labels, xcomp_rule_labels, handle_semi_modals, include_ws + ) + self.dep_labels = set(dep_labels) self.conj_rule_labels = set(conj_rule_labels) self.xcomp_rule_labels = set(xcomp_rule_labels) self.dep_labels.difference_update(self.conj_rule_labels) self.dep_labels.difference_update(self.xcomp_rule_labels) - self.handle_semi_modals = handle_semi_modals - self.include_ws = include_ws - - add_extension(Doc, "clauses", []) - add_extension(Span, "clauses") - add_extension(Span, "tokens") - add_extension(Span, "prec_punct") - add_extension(Span, "succ_punct") - add_extension(Token, "clause") def __call__(self, doc: Doc) -> Doc: semi_modals = lookup(doc.lang_, "semi_modal_verbs") diff --git a/src/monapipe/pipeline/coref/coref.py b/src/monapipe/pipeline/coref/coref.py new file mode 100644 index 0000000..feec291 --- /dev/null +++ b/src/monapipe/pipeline/coref/coref.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Doc, Span, Token + +from monapipe.pipeline.methods import add_extension + + +class Coref: + """The super class `Coref`.""" + + assigns = { + "doc._.coref_clusters", + "doc._.coref_resolved", + "doc._.coref_scores", + "doc._.has_coref", + "span._.coref_cluster", + "span._.coref_scores", + "span._.is_coref", + "token._.coref_clusters", + "token._.in_coref", + } + + def __init__(self, nlp: Language): + add_extension(Doc, "coref_clusters", []) + add_extension(Doc, "coref_resolved") + add_extension(Doc, "coref_scores", {}) + add_extension(Doc, "has_coref", False) + add_extension(Span, "coref_cluster") + add_extension(Span, "coref_scores", {}) + add_extension(Span, "is_coref", False) + add_extension(Token, "coref_clusters", []) + add_extension(Token, "in_coref", False) diff --git a/src/monapipe/pipeline/coref/rb_coref.py b/src/monapipe/pipeline/coref/rb_coref.py index 4be3479..3b29192 100644 --- a/src/monapipe/pipeline/coref/rb_coref.py +++ b/src/monapipe/pipeline/coref/rb_coref.py @@ -7,7 +7,7 @@ from typing import Any, Callable, List, Set from nltk.corpus.util import LazyCorpusLoader from spacy.language import Language -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc, Span import monapipe.resource_handler as resources from monapipe.linguistics import ( @@ -22,22 +22,13 @@ from monapipe.linguistics import ( stringify, ) from monapipe.neuralcoref import Cluster, get_resolved -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.coref.coref import Coref +from monapipe.pipeline.methods import requires @Language.factory( "rb_coref", - assigns={ - "doc._.coref_clusters", - "doc._.coref_resolved", - "doc._.coref_scores", - "doc._.has_coref", - 
"span._.coref_cluster", - "span._.coref_scores", - "span._.is_coref", - "token._.coref_clusters", - "token._.in_coref", - }, + assigns=Coref.assigns, default_config={}, ) def rb_coref(nlp: Language, name: str) -> Any: @@ -58,21 +49,13 @@ def rb_coref(nlp: Language, name: str) -> Any: return RbCoref(nlp) -class RbCoref: +class RbCoref(Coref): """The class `RbCoref`.""" def __init__(self, nlp: Language): requires(self, nlp, ["parser", "morphologizer", "lemmatizer", "speaker_extractor"]) - add_extension(Doc, "coref_clusters", []) - add_extension(Doc, "coref_resolved") - add_extension(Doc, "coref_scores", {}) - add_extension(Doc, "has_coref", False) - add_extension(Span, "coref_cluster") - add_extension(Span, "coref_scores", {}) - add_extension(Span, "is_coref", False) - add_extension(Token, "coref_clusters", []) - add_extension(Token, "in_coref", False) + super().__init__(nlp) def __call__(self, doc: Doc) -> Doc: ents = get_noun_phrases(doc) diff --git a/src/monapipe/pipeline/event_tagger/event_tagger.py b/src/monapipe/pipeline/event_tagger/event_tagger.py new file mode 100644 index 0000000..5e407dd --- /dev/null +++ b/src/monapipe/pipeline/event_tagger/event_tagger.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class EventTagger: + """The super class `EventTagger`.""" + + assigns = {"span._.event": "clause._.event"} + + def __init__(self, nlp: Language): + add_extension(Span, "event") diff --git a/src/monapipe/pipeline/event_tagger/neural_event_tagger.py b/src/monapipe/pipeline/event_tagger/neural_event_tagger.py index 524d09b..cd5fefb 100644 --- a/src/monapipe/pipeline/event_tagger/neural_event_tagger.py +++ b/src/monapipe/pipeline/event_tagger/neural_event_tagger.py @@ -6,17 +6,16 @@ import importlib from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Span +from spacy.tokens import Doc from torch.utils.data import DataLoader import monapipe.resource_handler as resources from monapipe.config import SETTINGS -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.event_tagger.event_tagger import EventTagger +from monapipe.pipeline.methods import requires -@Language.factory( - "neural_event_tagger", assigns={"span._.event": "clause._.event"}, default_config={} -) +@Language.factory("neural_event_tagger", assigns=EventTagger.assigns, default_config={}) def neural_event_tagger(nlp: Language, name: str) -> Any: """Spacy pipeline component. Integration of event classification from EvENT project. 
@@ -35,13 +34,13 @@ def neural_event_tagger(nlp: Language, name: str) -> Any: return NeuralEventTagger(nlp) -class NeuralEventTagger: +class NeuralEventTagger(EventTagger): """The class `NeuralEventTagger`.""" def __init__(self, nlp: Language): requires(self, nlp, ["clausizer"]) - add_extension(Span, "event") + super().__init__(nlp) def __call__(self, doc: Doc) -> Doc: model, tokenizer = resources.access("event_classification") diff --git a/src/monapipe/pipeline/formatter/conllu_formatter.py b/src/monapipe/pipeline/formatter/conllu_formatter.py index 0fb1f94..4c195dc 100644 --- a/src/monapipe/pipeline/formatter/conllu_formatter.py +++ b/src/monapipe/pipeline/formatter/conllu_formatter.py @@ -6,14 +6,15 @@ import re from typing import Any, Callable, Dict, List, Union from spacy.language import Language -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc, Token -from monapipe.pipeline.methods import add_extension, deserialize_config_param, optional +from monapipe.pipeline.formatter.formatter import Formatter +from monapipe.pipeline.methods import deserialize_config_param, optional @Language.factory( "conllu_formatter", - assigns={"doc._.conllu_str": "doc._.conllu_str", "span._.conllu_str": "sent._.conllu_str"}, + assigns=Formatter.assigns, default_config={ "column_names": [ "ID", @@ -64,7 +65,7 @@ def conllu_formatter( return ConlluFormatter(nlp, column_names, column_names_plus, column_funcs, delimiter) -class ConlluFormatter: +class ConlluFormatter(Formatter): """The class `ConlluFormatter`.""" def __init__( @@ -77,9 +78,12 @@ class ConlluFormatter: ): optional(self, nlp, ["parser"]) - column_funcs = deserialize_config_param(column_funcs) + super().__init__(nlp, column_names, column_names_plus, column_funcs, delimiter) + + self.column_names = self.column_names + self.column_names_plus + + column_funcs = deserialize_config_param(self.column_funcs) - self.column_names = column_names + column_names_plus self.column_funcs = { "ID": lambda token: ( list(token.sent).index(token) if token.doc.is_sentenced else token.i @@ -100,12 +104,9 @@ class ConlluFormatter: ), "DEPREL": lambda token: token.dep_, } + for column_name in column_funcs: self.column_funcs[column_name] = column_funcs[column_name] - self.delimiter = delimiter - - add_extension(Doc, "conllu_str") - add_extension(Span, "conllu_str") def __call__(self, doc: Doc) -> Doc: if doc.is_sentenced: @@ -135,8 +136,8 @@ class ConlluFormatter: sent_rows.append(row) sent_rows.append("") doc_rows.extend(sent_rows) - sent._.conllu_str = self._string_from_rows([first_row] + sent_rows, sent_char_widths) - doc._.conllu_str = self._string_from_rows([first_row] + doc_rows, doc_char_widths) + sent._.format_str = self._string_from_rows([first_row] + sent_rows, sent_char_widths) + doc._.format_str = self._string_from_rows([first_row] + doc_rows, doc_char_widths) return doc def _apply_column_func(self, token: Token, column_name: str) -> Any: diff --git a/src/monapipe/pipeline/formatter/formatter.py b/src/monapipe/pipeline/formatter/formatter.py new file mode 100644 index 0000000..85bd7f9 --- /dev/null +++ b/src/monapipe/pipeline/formatter/formatter.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from typing import Any, Callable, Dict, List, Union + +from spacy.language import Language +from spacy.tokens import Doc, Span, Token + +from monapipe.pipeline.methods import add_extension + + +class Formatter: + """The super class `Formatter`.""" + + assigns = 
{"doc._.format_str": "doc._.format_str", "span._.format_str": "sent._.format_str"} + + def __init__( + self, + nlp: Language, + column_names: List[str], + column_names_plus: List[str], + column_funcs: Union[str, Dict[str, Callable[[Token], Any]]], + delimiter: str, + ): + self.column_names = column_names + self.column_names_plus = column_names_plus + self.column_funcs = column_funcs + self.delimiter = delimiter + + add_extension(Doc, "format_str") + add_extension(Span, "format_str") diff --git a/src/monapipe/pipeline/gen_tagger/gen_tagger.py b/src/monapipe/pipeline/gen_tagger/gen_tagger.py new file mode 100644 index 0000000..688f630 --- /dev/null +++ b/src/monapipe/pipeline/gen_tagger/gen_tagger.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class GenTagger: + """The super class `GenTagger`.""" + + assigns = {"doc.spans": "doc.spans['gi']", "span._.gi": "gi_span._.gi"} + + def __init__(self, nlp: Language, label_condition: str): + self.label_condition = label_condition + + add_extension(Span, "gi", {}) diff --git a/src/monapipe/pipeline/gen_tagger/neural_gen_tagger.py b/src/monapipe/pipeline/gen_tagger/neural_gen_tagger.py index 40727b5..e1ff781 100644 --- a/src/monapipe/pipeline/gen_tagger/neural_gen_tagger.py +++ b/src/monapipe/pipeline/gen_tagger/neural_gen_tagger.py @@ -5,16 +5,17 @@ from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Span +from spacy.tokens import Doc import monapipe.resource_handler as resources -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.gen_tagger.gen_tagger import GenTagger +from monapipe.pipeline.methods import requires from monapipe.pipeline.reflection_tagger.methods import create_passages_from_clause_tags @Language.factory( "neural_gen_tagger", - assigns={"doc.spans": "doc.spans['gi']", "span._.gi": "gi_span._.gi"}, + assigns=GenTagger.assigns, default_config={"label_condition": "multi"}, ) def neural_gen_tagger(nlp: Language, name: str, label_condition: str) -> Any: @@ -35,7 +36,7 @@ def neural_gen_tagger(nlp: Language, name: str, label_condition: str) -> Any: return NeuralGenTagger(nlp, label_condition) -class NeuralGenTagger: +class NeuralGenTagger(GenTagger): """The class `NeuralGenTagger`.""" def __init__(self, nlp: Language, label_condition: str): @@ -43,9 +44,8 @@ class NeuralGenTagger: if label_condition not in ["binary", "multi"]: raise ValueError('Label condition must be "binary" or "multi".') - self.label_condition = label_condition - add_extension(Span, "gi", {}) + super().__init__(nlp, label_condition) def __call__(self, doc: Doc) -> Doc: models = resources.access("generalizing_passages_identification_bert") diff --git a/src/monapipe/pipeline/normalizer/identity_normalizer.py b/src/monapipe/pipeline/normalizer/identity_normalizer.py index 56ea9df..ff55d35 100644 --- a/src/monapipe/pipeline/normalizer/identity_normalizer.py +++ b/src/monapipe/pipeline/normalizer/identity_normalizer.py @@ -5,26 +5,14 @@ from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Token +from spacy.tokens import Doc -from monapipe.pipeline.methods import add_extension +from monapipe.pipeline.normalizer.normalizer import Normalizer @Language.factory( "identity_normalizer", - assigns=[ - "doc.text", - "doc.text_with_ws", - "doc._.text", - 
"doc._.text_with_ws", - "token.idx", - "token.text", - "token.text_with_ws", - "token._.idx", - "token._.text", - "token._.text_with_ws", - "token._.whitespace_", - ], + assigns=Normalizer.assigns, default_config={"remove_spaces": False}, ) def dependency_clausizer(nlp: Language, name: str, remove_spaces: bool) -> Any: @@ -43,18 +31,13 @@ def dependency_clausizer(nlp: Language, name: str, remove_spaces: bool) -> Any: return IdentityNormalizer(nlp, remove_spaces) -class IdentityNormalizer: +class IdentityNormalizer(Normalizer): """The class `IdentityNormalizer`.""" def __init__(self, nlp: Language, remove_spaces: bool): self.remove_spaces = remove_spaces - add_extension(Doc, "text") - add_extension(Doc, "text_with_ws") - add_extension(Token, "idx") - add_extension(Token, "text") - add_extension(Token, "text_with_ws") - add_extension(Token, "whitespace_") + super().__init__(nlp, remove_spaces) def __call__(self, doc: Doc) -> Doc: # token.text cannot be overwritten, so we create a new document diff --git a/src/monapipe/pipeline/normalizer/normalizer.py b/src/monapipe/pipeline/normalizer/normalizer.py new file mode 100644 index 0000000..710ad4e --- /dev/null +++ b/src/monapipe/pipeline/normalizer/normalizer.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Doc, Token + +from monapipe.pipeline.methods import add_extension + + +class Normalizer: + """The super class `Normalizer`.""" + + assigns = [ + "doc.text", + "doc.text_with_ws", + "doc._.text", + "doc._.text_with_ws", + "token.idx", + "token.text", + "token.text_with_ws", + "token._.idx", + "token._.text", + "token._.text_with_ws", + "token._.whitespace_", + ] + + def __init__(self, nlp: Language, remove_spaces: bool): + self.remove_spaces = remove_spaces + + add_extension(Doc, "text") + add_extension(Doc, "text_with_ws") + add_extension(Token, "idx") + add_extension(Token, "text") + add_extension(Token, "text_with_ws") + add_extension(Token, "whitespace_") diff --git a/src/monapipe/pipeline/reflection_tagger/neural_reflection_tagger.py b/src/monapipe/pipeline/reflection_tagger/neural_reflection_tagger.py index 25f12a3..ebbb69f 100644 --- a/src/monapipe/pipeline/reflection_tagger/neural_reflection_tagger.py +++ b/src/monapipe/pipeline/reflection_tagger/neural_reflection_tagger.py @@ -5,16 +5,17 @@ from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Span +from spacy.tokens import Doc import monapipe.resource_handler as resources -from monapipe.pipeline.methods import add_extension, requires, update_token_span_groups +from monapipe.pipeline.methods import requires, update_token_span_groups from monapipe.pipeline.reflection_tagger.methods import create_passages_from_clause_tags +from monapipe.pipeline.reflection_tagger.reflection_tagger import ReflectionTagger @Language.factory( "neural_reflection_tagger", - assigns={"doc.spans": "doc.spans['rp']", "span._.rp": "rp_span._.rp"}, + assigns=ReflectionTagger.assigns, default_config={"label_condition": "multi"}, ) def neural_reflection_tagger(nlp: Language, name: str, label_condition: str) -> Any: @@ -35,7 +36,7 @@ def neural_reflection_tagger(nlp: Language, name: str, label_condition: str) -> return NeuralReflectionTagger(nlp, label_condition) -class NeuralReflectionTagger: +class NeuralReflectionTagger(ReflectionTagger): """The class `NeuralReflectionTagger`.""" def __init__(self, nlp: Language, label_condition: str): @@ 
-43,9 +44,8 @@ class NeuralReflectionTagger: if label_condition not in ["binary", "multi"]: raise ValueError('Label condition must be "binary" or "multi".') - self.label_condition = label_condition - add_extension(Span, "rp", {}) + super().__init__(nlp, label_condition) def __call__(self, doc: Doc) -> Doc: models = resources.access("reflective_passages_identification_bert") diff --git a/src/monapipe/pipeline/reflection_tagger/reflection_tagger.py b/src/monapipe/pipeline/reflection_tagger/reflection_tagger.py new file mode 100644 index 0000000..c560b48 --- /dev/null +++ b/src/monapipe/pipeline/reflection_tagger/reflection_tagger.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class ReflectionTagger: + """The super class `ReflectionTagger`.""" + + assigns = {"doc.spans": "doc.spans['rp']", "span._.rp": "rp_span._.rp"} + + def __init__(self, nlp: Language, label_condition: str): + self.label_condition = label_condition + + add_extension(Span, "rp", {}) diff --git a/src/monapipe/pipeline/semantic_tagger/germanet_semantic_tagger.py b/src/monapipe/pipeline/semantic_tagger/germanet_semantic_tagger.py index 8e4eaca..f82fd5e 100644 --- a/src/monapipe/pipeline/semantic_tagger/germanet_semantic_tagger.py +++ b/src/monapipe/pipeline/semantic_tagger/germanet_semantic_tagger.py @@ -10,15 +10,13 @@ from spacy.language import Language from spacy.tokens import Doc, Span, Token import monapipe.resource_handler as resources -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.methods import requires +from monapipe.pipeline.semantic_tagger.semantic_tagger import SemanticTagger @Language.factory( "germanet_semantic_tagger", - assigns={ - "span._.verb_synset_id": "clause._.verb_synset_id", - "token._.synset_id": "token._.synset_id", - }, + assigns=SemanticTagger.assigns, default_config={}, ) def germanet_semantic_tagger(nlp: Language, name: str) -> Any: @@ -36,14 +34,13 @@ def germanet_semantic_tagger(nlp: Language, name: str) -> Any: return GermanetSemanticTagger(nlp) -class GermanetSemanticTagger: +class GermanetSemanticTagger(SemanticTagger): """The class `GermanetSemanticTagger`.""" def __init__(self, nlp: Language): requires(self, nlp, ["lemmatizer", "clausizer"]) - add_extension(Span, "verb_synset_id") - add_extension(Token, "synset_id") + super().__init__(nlp) def __call__(self, doc: Doc) -> Doc: germanet = resources.access("germanet") diff --git a/src/monapipe/pipeline/semantic_tagger/semantic_tagger.py b/src/monapipe/pipeline/semantic_tagger/semantic_tagger.py new file mode 100644 index 0000000..6e01e56 --- /dev/null +++ b/src/monapipe/pipeline/semantic_tagger/semantic_tagger.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span, Token + +from monapipe.pipeline.methods import add_extension + + +class SemanticTagger: + """The super class `SemanticTagger`.""" + + assigns = { + "span._.verb_synset_id": "clause._.verb_synset_id", + "token._.synset_id": "token._.synset_id", + } + + def __init__(self, nlp: Language): + add_extension(Span, "verb_synset_id") + add_extension(Token, "synset_id") diff --git a/src/monapipe/pipeline/slicer/from_start_slicer.py b/src/monapipe/pipeline/slicer/from_start_slicer.py index df555a8..1e09367 
100644 --- a/src/monapipe/pipeline/slicer/from_start_slicer.py +++ b/src/monapipe/pipeline/slicer/from_start_slicer.py @@ -10,11 +10,12 @@ from spacy.tokens import Doc from monapipe.pipeline.methods import add_extension from monapipe.pipeline.slicer.methods import span_to_doc +from monapipe.pipeline.slicer.slicer import Slicer @Language.factory( "from_start_slicer", - assigns=["doc.text", "doc._.fulltext"], + assigns=Slicer.assigns, default_config={"max_units": -1, "units": "sents", "complete_sentences": True}, ) def from_start_slicer( @@ -40,17 +41,14 @@ def from_start_slicer( return FromStartSlicer(nlp, max_units, units, complete_sentences) -class FromStartSlicer: +class FromStartSlicer(Slicer): """The class `FromStartSlicer`.""" def __init__(self, nlp: Language, max_units: int, units: str, complete_sentences: bool): - self.max_units = max_units if units not in ["chars", "sents", "tokens"]: raise ValueError('Units must be "chars", "sents" or "tokens".') - self.units = units - self.complete_sentences = complete_sentences - add_extension(Doc, "fulltext") + super().__init__(nlp, max_units, units, complete_sentences) def __call__(self, doc: Doc) -> Doc: if (self.units == "sents" or self.complete_sentences) and not doc.is_sentenced: diff --git a/src/monapipe/pipeline/slicer/slicer.py b/src/monapipe/pipeline/slicer/slicer.py new file mode 100644 index 0000000..9247122 --- /dev/null +++ b/src/monapipe/pipeline/slicer/slicer.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Doc + +from monapipe.pipeline.methods import add_extension + + +class Slicer: + """The super class `Slicer`.""" + + assigns = ["doc.text", "doc._.fulltext"] + + def __init__(self, nlp: Language, max_units: int, units: str, complete_sentences: bool): + self.max_units = max_units + self.units = units + self.complete_sentences = complete_sentences + + add_extension(Doc, "fulltext") diff --git a/src/monapipe/pipeline/speaker_extractor/rb_speaker_extractor.py b/src/monapipe/pipeline/speaker_extractor/rb_speaker_extractor.py index e22cd62..fb11b34 100644 --- a/src/monapipe/pipeline/speaker_extractor/rb_speaker_extractor.py +++ b/src/monapipe/pipeline/speaker_extractor/rb_speaker_extractor.py @@ -2,22 +2,20 @@ # # SPDX-License-Identifier: CC0-1.0 -from typing import Any, List, Optional, Tuple +from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc from monapipe.linguistics import get_noun_phrases, is_pronoun, is_proper_noun from monapipe.lookups import lookup -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.methods import requires +from monapipe.pipeline.speaker_extractor.speaker_extractor import SpeakerExtractor @Language.factory( "rb_speaker_extractor", - assigns={ - "span._.addressee": "speech_span._.addressee", - "span._.speaker": "speech_span._.speaker", - }, + assigns=SpeakerExtractor.assigns, default_config={}, ) def rb_speaker_extractor(nlp: Language, name: str) -> Any: @@ -36,14 +34,13 @@ def rb_speaker_extractor(nlp: Language, name: str) -> Any: return RbSpeakerExtractor(nlp) -class RbSpeakerExtractor: +class RbSpeakerExtractor(SpeakerExtractor): """The class `RbSpeakerExtractor`.""" def __init__(self, nlp: Language): requires(self, nlp, ["lemmatizer", "speech_tagger"]) - add_extension(Span, "addressee") - add_extension(Span, "speaker") + super().__init__(nlp) def 
__call__(self, doc: Doc) -> Doc: speech_verbs = lookup(doc.lang_, "speech_verbs") diff --git a/src/monapipe/pipeline/speaker_extractor/speaker_extractor.py b/src/monapipe/pipeline/speaker_extractor/speaker_extractor.py new file mode 100644 index 0000000..54b1624 --- /dev/null +++ b/src/monapipe/pipeline/speaker_extractor/speaker_extractor.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class SpeakerExtractor: + """The super class `SpeakerExtractor`.""" + + assigns = { + "span._.addressee": "speech_span._.addressee", + "span._.speaker": "speech_span._.speaker", + } + + def __init__(self, nlp: Language): + add_extension(Span, "addressee") + add_extension(Span, "speaker") diff --git a/src/monapipe/pipeline/speech_tagger/flair_speech_tagger.py b/src/monapipe/pipeline/speech_tagger/flair_speech_tagger.py index d52292d..f5339ee 100644 --- a/src/monapipe/pipeline/speech_tagger/flair_speech_tagger.py +++ b/src/monapipe/pipeline/speech_tagger/flair_speech_tagger.py @@ -9,23 +9,20 @@ import torch from flair.data import Sentence from flair.models import SequenceTagger from spacy.language import Language -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc, Span import monapipe.resource_handler as resources from monapipe.config import SETTINGS -from monapipe.pipeline.methods import add_extension, requires, update_token_span_groups +from monapipe.pipeline.methods import requires, update_token_span_groups from monapipe.pipeline.speech_tagger.methods import ( create_speech_segments_from_token_tags, ) +from monapipe.pipeline.speech_tagger.speech_tagger import SpeechTagger @Language.factory( "flair_speech_tagger", - assigns={ - "doc.spans": "doc.spans['speech']", - "span._.speech": "speech_span._.speech", - "token._.speech": "token._.speech", - }, + assigns=SpeechTagger.assigns, default_config={"sentence_level": False}, ) def flair_speech_tagger(nlp: Language, name: str, sentence_level: bool) -> Any: @@ -33,7 +30,6 @@ def flair_speech_tagger(nlp: Language, name: str, sentence_level: bool) -> Any: Tags tokens and clauses with speech tags. Wrapper for the "Redewiedergabe" taggers from https://github.com/redewiedergabe/tagger. - Args: nlp: Spacy object. name: Component name. 
@@ -47,16 +43,13 @@ def flair_speech_tagger(nlp: Language, name: str, sentence_level: bool) -> Any: return FlairSpeechTagger(nlp, sentence_level) -class FlairSpeechTagger: +class FlairSpeechTagger(SpeechTagger): """The class `FlairSpeechTagger`.""" def __init__(self, nlp: Language, sentence_level: bool): requires(self, nlp, ["parser"]) - self.sentence_level = sentence_level - - add_extension(Token, "speech", {}) - add_extension(Span, "speech", {}) + super().__init__(nlp, sentence_level) def __call__(self, doc: Doc) -> Doc: flair.device = torch.device(SETTINGS["torch_device"]) diff --git a/src/monapipe/pipeline/speech_tagger/quotation_marks_speech_tagger.py b/src/monapipe/pipeline/speech_tagger/quotation_marks_speech_tagger.py index 2555d95..563e538 100644 --- a/src/monapipe/pipeline/speech_tagger/quotation_marks_speech_tagger.py +++ b/src/monapipe/pipeline/speech_tagger/quotation_marks_speech_tagger.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: CC0-1.0 -from typing import Any +from typing import Any, Optional from spacy.language import Language from spacy.tokens import Doc, Span, Token @@ -12,40 +12,36 @@ from monapipe.pipeline.methods import add_extension, update_token_span_groups from monapipe.pipeline.speech_tagger.methods import ( create_speech_segments_from_token_tags, ) +from monapipe.pipeline.speech_tagger.speech_tagger import SpeechTagger @Language.factory( "quotation_marks_speech_tagger", - assigns={ - "doc.spans": "doc.spans['speech']", - "span._.speech": "speech_span._.speech", - "token._.speech": "token._.speech", - }, - default_config={}, + assigns=SpeechTagger.assigns, + default_config={"sentence_level": None}, ) -def quotation_marks_speech_tagger(nlp: Language, name: str) -> Any: +def quotation_marks_speech_tagger(nlp: Language, name: str, sentence_level: Optional[bool]) -> Any: """Spacy pipeline component. Tags tokens and clauses with speech tags. Detects only direct speech within (German) quotation marks. - Args: nlp: Spacy object. name: Component name. + sentence_level: Ignored. This parameter exists only for compatibility with `SpeechTagger`. Returns: `QuotationMarksSpeechTagger`. 
""" - return QuotationMarksSpeechTagger(nlp) + return QuotationMarksSpeechTagger(nlp, sentence_level) -class QuotationMarksSpeechTagger: +class QuotationMarksSpeechTagger(SpeechTagger): """The class `QuotationMarksSpeechTagger`.""" - def __init__(self, nlp: Language): - add_extension(Token, "speech", {}) - add_extension(Span, "speech", {}) + def __init__(self, nlp: Language, sentence_level: Optional[bool]): + super().__init__(nlp, sentence_level) def __call__(self, doc: Doc) -> Doc: q_marks = lookup(doc.lang_, "quotation_marks") diff --git a/src/monapipe/pipeline/speech_tagger/speech_tagger.py b/src/monapipe/pipeline/speech_tagger/speech_tagger.py new file mode 100644 index 0000000..eecf63a --- /dev/null +++ b/src/monapipe/pipeline/speech_tagger/speech_tagger.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from typing import Optional + +from spacy.language import Language +from spacy.tokens import Span, Token + +from monapipe.pipeline.methods import add_extension + + +class SpeechTagger: + """The super class `SpeechTagger`.""" + + assigns = { + "doc.spans": "doc.spans['speech']", + "span._.speech": "speech_span._.speech", + "token._.speech": "token._.speech", + } + + def __init__(self, nlp: Language, sentence_level: Optional[bool]): + self.sentence_level = sentence_level + + add_extension(Token, "speech", {}) + add_extension(Span, "speech", {}) diff --git a/src/monapipe/pipeline/temponym_tagger/heideltime_temponym_tagger.py b/src/monapipe/pipeline/temponym_tagger/heideltime_temponym_tagger.py index e89a066..6cc9ada 100644 --- a/src/monapipe/pipeline/temponym_tagger/heideltime_temponym_tagger.py +++ b/src/monapipe/pipeline/temponym_tagger/heideltime_temponym_tagger.py @@ -6,18 +6,16 @@ import re from typing import Any from spacy.language import Language -from spacy.tokens import Doc, Span, Token +from spacy.tokens import Doc import monapipe.resource_handler as resources -from monapipe.pipeline.methods import add_extension, update_token_span_groups +from monapipe.pipeline.methods import update_token_span_groups +from monapipe.pipeline.temponym_tagger.temponym_tagger import TemponymTagger @Language.factory( "heideltime_temponym_tagger", - assigns={ - "doc.spans": "doc.spans['temponym']", - "span._.temponym_norm": "temponym_span._.temponym_norm", - }, + assigns=TemponymTagger.assigns, default_config={}, ) def heideltime_temponym_tagger(nlp: Language, name: str) -> Any: @@ -37,11 +35,11 @@ def heideltime_temponym_tagger(nlp: Language, name: str) -> Any: return HeideltimeTemponymTagger(nlp) -class HeideltimeTemponymTagger: +class HeideltimeTemponymTagger(TemponymTagger): """The class `HeideltimeTemponymTagger`.""" def __init__(self, nlp: Language): - add_extension(Span, "temponym_norm") + super().__init__(nlp) def __call__(self, doc: Doc) -> Doc: doc.spans["temponym"] = [] diff --git a/src/monapipe/pipeline/temponym_tagger/temponym_tagger.py b/src/monapipe/pipeline/temponym_tagger/temponym_tagger.py new file mode 100644 index 0000000..4aa6597 --- /dev/null +++ b/src/monapipe/pipeline/temponym_tagger/temponym_tagger.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from spacy.language import Language +from spacy.tokens import Span + +from monapipe.pipeline.methods import add_extension + + +class TemponymTagger: + """The super class `TemponymTagger`.""" + + assigns = { + "doc.spans": "doc.spans['temponym']", + "span._.temponym_norm": 
"temponym_span._.temponym_norm", + } + + def __init__(self, nlp: Language): + add_extension(Span, "temponym_norm") diff --git a/src/monapipe/pipeline/verb_analyzer/rb_verb_analyzer.py b/src/monapipe/pipeline/verb_analyzer/rb_verb_analyzer.py index 2f2244c..cdef19d 100644 --- a/src/monapipe/pipeline/verb_analyzer/rb_verb_analyzer.py +++ b/src/monapipe/pipeline/verb_analyzer/rb_verb_analyzer.py @@ -12,17 +12,13 @@ from spacy.tokens import Doc, MorphAnalysis, Span, Token from monapipe.linguistics import get_morph_analyses from monapipe.lookups import lookup -from monapipe.pipeline.methods import add_extension, requires +from monapipe.pipeline.methods import requires +from monapipe.pipeline.verb_analyzer.verb_analyzer import VerbAnalyzer @Language.factory( "rb_verb_analyzer", - assigns={ - "span._.form": "clause._.form", - "span._.form_main": "clause._.form_main", - "span._.form_modals": "clause._.form_modals", - "span._.form_verbs": "clause._.form_verbs", - }, + assigns=VerbAnalyzer.assigns, default_config={ "ov": True, "conj_rule_labels": ["conj"], @@ -67,7 +63,7 @@ def rb_verb_analyzer( ) -class RbVerbAnalyzer: +class RbVerbAnalyzer(VerbAnalyzer): """The class `RbVerbAnalyzer`.""" def __init__( @@ -81,16 +77,14 @@ class RbVerbAnalyzer: ): requires(self, nlp, ["morphologizer", "lemmatizer", "clausizer"]) - self.ov = ov - self.conj_rule_labels = conj_rule_labels - self.handle_semi_modals = handle_semi_modals - self.handle_particles = handle_particles - self.handle_local_verb_movement = handle_local_verb_movement - - add_extension(Span, "form", MorphAnalysis(nlp.vocab, {})) - add_extension(Span, "form_main", None) - add_extension(Span, "form_modals", []) - add_extension(Span, "form_verbs", []) + super().__init__( + nlp, + ov, + conj_rule_labels, + handle_semi_modals, + handle_particles, + handle_local_verb_movement, + ) def __call__(self, doc: Doc) -> Doc: # read language-specific inflection table, auxiliary verbs and modal verbs diff --git a/src/monapipe/pipeline/verb_analyzer/verb_analyzer.py b/src/monapipe/pipeline/verb_analyzer/verb_analyzer.py new file mode 100644 index 0000000..8592b62 --- /dev/null +++ b/src/monapipe/pipeline/verb_analyzer/verb_analyzer.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: 2022 Georg-August-Universität Göttingen +# +# SPDX-License-Identifier: CC0-1.0 + +from typing import List + +from spacy.language import Language +from spacy.tokens import MorphAnalysis, Span + +from monapipe.pipeline.methods import add_extension + + +class VerbAnalyzer: + """The super class `VerbAnalyzer`.""" + + assigns = { + "span._.form": "clause._.form", + "span._.form_main": "clause._.form_main", + "span._.form_modals": "clause._.form_modals", + "span._.form_verbs": "clause._.form_verbs", + } + + def __init__( + self, + nlp: Language, + ov: bool, + conj_rule_labels: List[str], + handle_semi_modals: bool, + handle_particles: bool, + handle_local_verb_movement: bool, + ): + self.ov = ov + self.conj_rule_labels = conj_rule_labels + self.handle_semi_modals = handle_semi_modals + self.handle_particles = handle_particles + self.handle_local_verb_movement = handle_local_verb_movement + + add_extension(Span, "form", MorphAnalysis(nlp.vocab, {})) + add_extension(Span, "form_main", None) + add_extension(Span, "form_modals", []) + add_extension(Span, "form_verbs", []) diff --git a/tests/pipeline/test_conllu_formatter.py b/tests/pipeline/test_conllu_formatter.py index f2bc247..66dbdc9 100644 --- a/tests/pipeline/test_conllu_formatter.py +++ b/tests/pipeline/test_conllu_formatter.py @@ -13,5 +13,5 
@@ def test_conllu_formatter(): nlp = monapipe.model.load() nlp.add_pipe("conllu_formatter") doc = nlp(text_goethe_wv) - assert check_data_types([doc], "conllu_str", str) - assert check_data_types(doc.sents, "conllu_str", str) + assert check_data_types([doc], "format_str", str) + assert check_data_types(doc.sents, "format_str", str) -- GitLab
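
The refactoring above applies one convention throughout: a super class per component type declares the `assigns` metadata and registers the shared extensions in its `__init__`, while each implementation calls `requires(...)` for its pipeline dependencies and then delegates to `super().__init__(...)`; the `@Language.factory` decorator reuses `SuperClass.assigns` instead of repeating the dict. Below is a minimal sketch of how a new component would follow this convention. The component name `rb_mood_tagger`, the classes `MoodTagger`/`RbMoodTagger` and the tagging logic are invented for illustration and are not part of this patch; `add_extension`, `requires` and `doc._.clauses` are used exactly as in the components above.

from typing import Any

from spacy.language import Language
from spacy.tokens import Doc, Span

from monapipe.pipeline.methods import add_extension, requires


class MoodTagger:
    """The super class `MoodTagger` (hypothetical)."""

    # Declared once on the super class; the factory and every
    # implementation reference it instead of repeating the dict.
    assigns = {"span._.mood": "clause._.mood"}

    def __init__(self, nlp: Language):
        add_extension(Span, "mood")


@Language.factory("rb_mood_tagger", assigns=MoodTagger.assigns, default_config={})
def rb_mood_tagger(nlp: Language, name: str) -> Any:
    """Spacy pipeline component (hypothetical example).

    Args:
        nlp: Spacy object.
        name: Component name.

    Returns:
        `RbMoodTagger`.
    """
    return RbMoodTagger(nlp)


class RbMoodTagger(MoodTagger):
    """The class `RbMoodTagger` (hypothetical)."""

    def __init__(self, nlp: Language):
        # Declare pipeline dependencies first, then let the super class
        # register the shared extensions, as in the components above.
        requires(self, nlp, ["clausizer"])

        super().__init__(nlp)

    def __call__(self, doc: Doc) -> Doc:
        # Placeholder logic; a real implementation would inspect each clause.
        for clause in doc._.clauses:
            clause._.mood = "indicative"
        return doc

With this in place, `nlp.add_pipe("rb_mood_tagger")` works like any component in the patch, and the factory's `assigns` metadata stays in sync with the super class by construction.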