%% Cell type:markdown id:cb631197c03a0768 tags:
# Convert AnyStyle training files to the Prodigy format
The converter functions were written with the help of GPT-4; see https://chat.openai.com/share/3f42ae1d-3066-4563-944d-3139b5e1e867 and https://chat.openai.com/share/b6832876-9424-4ff9-bd83-157f3197f805
%% Cell type:markdown id:83bfc356ce72fe51 tags:
## Parser files
%% Cell type:markdown id:f46a123db5e7d341 tags:
For the output format, see https://prodi.gy/docs/api-interfaces#spans_manual
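
For illustration, a single output record has the following shape. This is a made-up, shortened example following the structure produced by the converter below; the text and all offsets are hypothetical:

``` python
# Hypothetical spans_manual record (shortened); values are illustrative only
{
    "text": "Weber, Max. Wirtschaft und Gesellschaft. 1922.",
    "tokens": [
        {"text": "Weber", "start": 0, "end": 5, "id": 0, "ws": False},
        {"text": ",", "start": 5, "end": 6, "id": 1, "ws": True},
        # ... one entry per spaCy token ...
    ],
    "spans": [
        {"start": 0, "end": 10, "label": "author", "token_start": 0, "token_end": 2},
        # ... one entry per labelled XML element ...
    ],
}
```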
%% Cell type:code id:initial_id tags:
``` python
import json
import xml.etree.ElementTree as ET

import spacy

# Load the German NLP model
nlp = spacy.load('de_core_news_sm')

def xml_to_jsonl(input_xml_path, output_jsonl_path):
    tree = ET.parse(input_xml_path)
    root = tree.getroot()
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            text = " ".join(element.text.strip() if element.text else '' for element in sequence)
            doc = nlp(text)
            tokens = [{'text': token.text, 'start': token.idx, 'end': token.idx + len(token)} for token in doc]
            # Add each token's ID and whitespace flag
            for idx, token in enumerate(tokens):
                token['id'] = idx
                token['ws'] = token['end'] < len(text) and text[token['end']] == " "
            spans = []
            search_pos = 0  # continue searching after the last match so repeated strings resolve to the correct occurrence
            for element in sequence:
                if not element.text:
                    continue
                element_text = element.text.strip()
                start = text.index(element_text, search_pos)
                end = start + len(element_text)
                search_pos = end
                # Find the tokens that contain the start and end of the element text
                token_start = next((i for i, token in enumerate(tokens) if token['start'] <= start < token['end']), None)
                token_end = next((i for i, token in enumerate(tokens) if token['start'] < end <= token['end']), None)
                # If both boundary tokens are found, add the span
                if token_start is not None and token_end is not None:
                    span = {
                        'start': start,
                        'end': end,
                        'label': element.tag,
                        'token_start': token_start,
                        'token_end': token_end
                    }
                    spans.append(span)
                else:
                    print(f"Error finding tokens for span from text: '{element_text}'")
            # Serialize and write to file
            f.write(json.dumps({'text': text, 'tokens': tokens, 'spans': spans}) + '\n')
```
%% Cell type:code id:fa9f2c194697a6b7 tags:
``` python
from pathlib import Path

for filename in [file.stem for file in Path('in').glob('*.xml')]:
    xml_to_jsonl(f'in/{filename}.xml', f'out/{filename}-parser.jsonl')
    xml_to_jsonl(f'in/{filename}.xml', f'out/{filename}-prodigy-parser.jsonl')
```
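%% Cell type:markdown tags:
As a quick sanity check (assuming at least one file was converted above), we can read back the first record of one parser output file and print the span labels together with the text they cover:
%% Cell type:code tags:
``` python
import json
from pathlib import Path

first_file = next(Path('out').glob('*-parser.jsonl'))
record = json.loads(first_file.read_text(encoding='utf-8').splitlines()[0])
print(record['text'][:100])
print([(s['label'], record['text'][s['start']:s['end']]) for s in record['spans']])
```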
%% Cell type:markdown id:2988615a1be96bb tags:
## Finder files
%% Cell type:code id:7764c66e149abd4f tags:
``` python
import json

def markup_to_jsonl(input_path, output_jsonl_path, break_tags_before=None, break_tags_after=None):
    if break_tags_before is None:
        break_tags_before = []
    if break_tags_after is None:
        break_tags_after = []

    # Helper function to write one block to the JSONL file handle
    def write_block(full_text, spans, file_handle):
        if full_text.strip():  # Ensure there's content to write
            result = {
                'text': full_text.rstrip(),  # Remove the trailing newline
                'spans': spans
            }
            file_handle.write(json.dumps(result) + '\n')

    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Open the output file once at the beginning
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        full_text = ''
        spans = []
        current_text = ''
        text_start_pos = 0
        current_tag = 'text'  # Default initial tag

        def close_span():
            # Append the accumulated text to the block and record its span;
            # -1 so the span does not count the trailing newline
            nonlocal full_text, current_text, text_start_pos
            if current_text:
                full_text += current_text
                spans.append({
                    'start': text_start_pos,
                    'end': text_start_pos + len(current_text) - 1,
                    'label': current_tag
                })
                current_text = ''
            text_start_pos = len(full_text)

        # Iterate through lines of the form "tag | content"; a blank tag continues the current span
        for line in lines:
            parts = line.strip().split('|', 1)
            if len(parts) < 2:
                continue
            tag = parts[0].strip()
            content = parts[1].strip()

            if tag in break_tags_before:
                # Finish and write the current block before starting a new one due to the break tag
                close_span()
                write_block(full_text, spans, f)
                full_text, spans = '', []
                text_start_pos = 0

            if tag:
                # A non-empty tag starts a new span: close the previous one first
                close_span()
                current_tag = tag
            # Accumulate the line's content; every line ends with a newline
            current_text += content + '\n'

            if tag in break_tags_after:
                # Finish and write the current block after this tag
                close_span()
                write_block(full_text, spans, f)
                full_text, spans = '', []
                text_start_pos = 0

        # Final write for any remaining text
        close_span()
        write_block(full_text, spans, f)

# Example usage:
# markup_to_jsonl('input_text.txt', 'output.jsonl', ['header'], ['footer'])
```
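%% Cell type:markdown tags:
To make the expected input format concrete, here is a minimal smoke test on a made-up `.ttx` sample. The tag names and the `tag | content` layout (with a blank tag marking a continuation line) are assumptions about the finder format, not taken from the actual training data:
%% Cell type:code tags:
``` python
from pathlib import Path

# Made-up finder-style sample: a meta line, a text line, and a two-line reference
sample = (
    "meta | Zeitschrift für Rechtssoziologie 1 (1980)\n"
    "text | Einleitender Text des Aufsatzes ...\n"
    "ref  | Weber, Max, Wirtschaft und Gesellschaft, Tübingen 1922.\n"
    "     | Luhmann, Niklas, Rechtssoziologie, Reinbek 1972.\n"
)
Path('sample.ttx').write_text(sample, encoding='utf-8')
markup_to_jsonl('sample.ttx', 'sample.jsonl', break_tags_before=['meta'])
print(Path('sample.jsonl').read_text(encoding='utf-8'))
```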
%% Cell type:markdown id:8923ed5011c1d65f tags:
Since the finder files contain no information on page breaks, documents are segmented on the `meta` tag. This is a very simple heuristic and will probably fail in many cases.
%% Cell type:code id:2b0ca72bbf70c829 tags:
``` python
from pathlib import Path

# Glob the .ttx finder files directly (the parser loop above globs the .xml files)
for filename in [file.stem for file in Path('in').glob('*.ttx')]:
    markup_to_jsonl(f'in/{filename}.ttx', f'out/{filename}-finder.jsonl', break_tags_before=['meta'])
    markup_to_jsonl(f'in/{filename}.ttx', f'out/{filename}-prodigy-finder.jsonl', break_tags_before=['meta'])
```
%% Cell type:markdown id:72233341123b80c7 tags:
## Extract labels
%% Cell type:code id:46d7e63e53b441e3 tags:
``` python
import json

def extract_unique_labels(jsonl_file_path):
    unique_labels = set()  # Using a set to store unique labels
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)  # Load JSON from each line
            if 'spans' in data:
                for span in data['spans']:
                    if 'label' in span:
                        unique_labels.add(span['label'])  # Add label to the set
    return sorted(unique_labels)

" ".join(extract_unique_labels('out/10.1515_zfrs-1980-0103-parser.jsonl'))
```
%% Output
'author backref citation-number collection-title container-title date editor ignore journal location note pages publisher signal title volume'
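%% Cell type:markdown tags:
If Prodigy is installed, this label list can be passed to the `spans.manual` recipe via `--label` to start annotating the converted file. The dataset name below is just a placeholder:
%% Cell type:code tags:
``` python
# Hypothetical dataset name; requires Prodigy to be installed in this environment
!prodigy spans.manual anystyle-parser blank:de out/10.1515_zfrs-1980-0103-parser.jsonl --label author,backref,citation-number,collection-title,container-title,date,editor,ignore,journal,location,note,pages,publisher,signal,title,volume
```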