Convert AmyStyle training data to a simple JSONL format

In [30]:
import xml.etree.ElementTree as ET
import json
import regex as re
import string
import glob
import os

def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):
    """Convert tagged sequences from an XML file into a JSONL file.

    Each ``<sequence>`` child of the XML root yields at most one output line
    of the form ``{"in": <text>, "out": [<record>, ...]}``. ``in`` is the
    stripped text of all child elements joined by single spaces. ``out`` is a
    list of dicts mapping tag names to (optionally cleaned) element text; a
    new dict is started whenever a tag is already present in the current one.
    Sequences without any matching element are skipped.

    Args:
        input_xml_path: Path of the XML file to read.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        tags: Iterable of tag specs. Each spec is either a tag name, whose
            element text is copied unchanged, or a ``(tag name, cleaner)``
            tuple, whose element text is passed through ``cleaner`` first.
            Elements without text are stored as ``None`` and never handed to
            a cleaner.
    """
    # Normalise every spec to (name, cleaner-or-None) up front so that a
    # plain-string spec can never pick up the cleaner of a preceding tuple.
    specs = []
    for spec in tags:
        name, cleaner = spec if isinstance(spec, tuple) else (spec, None)
        specs.append((name, cleaner if callable(cleaner) else None))

    root = ET.parse(input_xml_path).getroot()

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            records = []
            for element in sequence:
                for name, cleaner in specs:
                    if element.tag != name:
                        continue
                    text = element.text
                    if cleaner is not None and text is not None:
                        value = cleaner(text)
                    else:
                        value = text
                    # A repeated tag marks the beginning of the next record.
                    if not records or name in records[-1]:
                        records.append({})
                    records[-1][name] = value
            if not records:
                continue
            raw_text = " ".join(
                child.text.strip() if child.text else '' for child in sequence
            )
            f.write(json.dumps({"in": raw_text, "out": records}) + '\n')

def remove_punctuation(text):
    """Strip ASCII punctuation and whitespace from both ends of ``text``.

    Whitespace and punctuation are removed alternately until neither is left
    at either end, so mixed runs such as ``' "Title", '`` are cleaned on both
    sides. Characters inside the string are never touched.

    Args:
        text: The string to clean.

    Returns:
        The trimmed string (possibly empty).
    """
    while True:
        # Stripping punctuation can expose whitespace and vice versa, so
        # repeat until the string no longer changes.
        trimmed = text.strip().strip(string.punctuation)
        if trimmed == text:
            return text
        text = trimmed

def clean_editor(text):
    """Remove editor markers from ``text`` and trim surrounding punctuation.

    The markers ``hrsg. v.``, ``hg. v``/``hg. v.``, ``hrsg``, ``ed.``,
    ``eds.`` and ``in:`` are removed case-insensitively wherever they occur.
    A marker only matches at the start of a word, so text such as
    ``Alfred.`` or ``Berlin:`` is left intact, and ``hg. v`` only matches
    when the ``v`` is a whole word, so ``hg. von`` keeps its ``von``.

    Args:
        text: Raw editor string.

    Returns:
        The string without markers, passed through ``remove_punctuation``.
    """
    markers = r'\b(?:hrsg\. v\.|hg\. v\b\.?|hrsg|ed\.|eds\.|in:)'
    without_markers = re.sub(markers, '', text, flags=re.IGNORECASE)
    return remove_punctuation(without_markers)

def clean_container(text):
    """Remove container markers from ``text`` and trim surrounding punctuation.

    The markers ``in:``, ``aus:`` and ``from:`` are removed case-insensitively
    wherever they occur. A marker only matches at the start of a word, so
    text such as ``Berlin:`` or ``Haus:`` is left intact.

    Args:
        text: Raw container or journal title string.

    Returns:
        The string without markers, passed through ``remove_punctuation``.
    """
    markers = r'\b(?:in|aus|from):'
    without_markers = re.sub(markers, '', text, flags=re.IGNORECASE)
    return remove_punctuation(without_markers)
    
def extract_year(text):
    """Return the first four-digit year found in ``text``.

    A year is four consecutive digits whose first digit is ``1`` or ``2``.

    Args:
        text: Raw date string; may be ``None`` or empty.

    Returns:
        The year as a string, or ``None`` when ``text`` is missing or
        contains no such digit run.
    """
    # Elements without text arrive as None; treat them like "no year found".
    if not text:
        return None
    m = re.search(r'[12][0-9]{3}', text)
    return m.group(0) if m else None

# Convert every XML file in in/ to out/<name>-simple.jsonl, cleaning each
# tagged field with the matching helper.
for input_file in glob.glob('in/*.xml'):
    base_name = os.path.basename(input_file)
    output_file = f'out/{os.path.splitext(base_name)[0]}-simple.jsonl'
    # Opening a file for writing does not create missing directories.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    xml_to_jsonl(input_file, output_file, [
        ("author", remove_punctuation),
        ("editor", clean_editor),
        ("authority", remove_punctuation),
        ("title", remove_punctuation),
        ("container-title", clean_container),
        ("journal", clean_container),
        ("date", extract_year)
    ])

