# Conversion to a simple flat JSON format  

## Convert AnyStyle training data to JSONL

In [1]:
import xml.etree.ElementTree as ET
import json
import regex as re
import glob
import os

def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):
    """Convert an XML file of tagged ``<sequence>`` elements to flat JSONL.

    Each ``<sequence>`` child of the XML root that yields at least one
    reference is written as one JSON line of the form
    ``{"in": <texts of all child elements joined by spaces>,
    "out": [<reference dict>, ...]}``.

    The children of a sequence are scanned in document order. A child whose
    tag is listed in *tags* is added to the current reference dict; a new
    reference dict is started when

    * there is no reference yet,
    * the tag is ``citation-number``,
    * the output key is already present in the current reference
      (except for ``note``), or
    * a creator tag (author, editor, authority, legal-ref) follows a date.

    ``authority`` is stored under the key ``author``; the "already present"
    check uses that stored key so an authority never overwrites an author
    that was collected earlier. When ``citation-number`` is one of the
    requested tags, a new reference inherits the number of the previous one.

    Args:
        input_xml_path: Path of the XML file to parse.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        tags: Tag specs, each either a tag name or a ``(tag name, cleaner)``
            tuple. A callable cleaner is applied to the element text; it is
            not called for elements without text, whose value stays ``None``.
    """
    # Normalise the specs once so the loop variable is never rebound.
    tag_specs = [spec if isinstance(spec, tuple) else (spec, None) for spec in tags]
    inherit_number = any(name == "citation-number" for name, _ in tag_specs)
    creator_tags = ("author", "editor", "authority", "legal-ref")

    tree = ET.parse(input_xml_path)
    root = tree.getroot()

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            output = []
            for element in sequence:
                for name, fn in tag_specs:
                    if element.tag != name:
                        continue
                    text = element.text
                    # Empty elements have text None; the cleaners expect a string.
                    value = fn(text) if callable(fn) and text is not None else text
                    # merge tags: an authority is stored as an author
                    key = "author" if name == "authority" else name
                    is_new_ref = (
                        not output  # no refs yet
                        or name == "citation-number"
                        or (key in output[-1] and key != "note")  # key already exists
                        or (name in creator_tags and 'date' in output[-1])  # creator follows a date
                    )
                    if is_new_ref:
                        output.append({})
                        # add citation number from previous citation
                        if (len(output) > 1 and inherit_number
                                and name != "citation-number"
                                and 'citation-number' in output[-2]):
                            output[-1]['citation-number'] = output[-2]['citation-number']
                    output[-1][key] = value
            if output:
                instance = {
                    "in": " ".join(child.text.strip() if child.text else '' for child in sequence),
                    "out": output,
                }
                f.write(json.dumps(instance) + '\n')
                
def even_num_brackets(string: str):
    """Tell whether *string* ends with a closing bracket that is balanced by count.

    Returns True when the string ends with ")" and contains as many ")" as
    "(", or ends with "]" and contains as many "]" as "["; False otherwise.
    Only the counts are compared, not the nesting order.
    """
    for opening, closing in (("(", ")"), ("[", "]")):
        if string.endswith(closing) and string.count(closing) == string.count(opening):
            return True
    return False

def remove_punctuation(text):
    """Remove leading and trailing punctuation using a very naive heuristic.

    Leading characters are dropped while they are Unicode punctuation
    (property class P). Trailing punctuation is dropped as well, but
    trimming stops as soon as the remaining text ends with "?" or "!", or
    ends with a closing bracket whose count matches its opening bracket
    (see ``even_num_brackets``). The result is whitespace-stripped.

    The property-class pattern is not supported by the standard library
    ``re`` module; this relies on ``re`` being the ``regex`` package.
    """
    # Raw string: a backslash followed by "p" is not a valid escape in a
    # normal string literal and triggers a SyntaxWarning on Python 3.12+.
    punctuation = r"\p{P}"
    start, end = 0, len(text)
    while start < len(text) and re.match(punctuation, text[start]):
        start += 1
    while (end > start
           and re.match(punctuation, text[end - 1])
           and not even_num_brackets(text[start:end])
           and text[end - 1] not in "?!"):
        end -= 1
    return text[start:end].strip()

def clean_editor(text):
    """Clean an editor string.

    Outer punctuation is removed with ``remove_punctuation``, then a leading
    "in:" / "in " and any editor markers (hrsg. v., hg. v, hrsg., ed., eds.,
    optionally in parentheses; case-insensitive) are deleted. The result is
    stripped, because deleting a prefix or marker otherwise leaves stray
    whitespace behind (e.g. "In: Smith (Hrsg.)" -> " Smith ").
    """
    text = re.sub(r'^in(:| )', '', remove_punctuation(text), flags=re.IGNORECASE)
    # NOTE(review): the markers are not anchored to word boundaries, so "ed."
    # also matches at the end of a longer word -- confirm this is acceptable.
    text = re.sub(r'\(?(hrsg\. v\.|hg\. v|hrsg\.|ed\.|eds\.)\)?', '', text, flags=re.IGNORECASE)
    return text.strip()

def clean_container(text):
    """Clean a container title.

    The text is whitespace-stripped, a leading "in", "aus" or "from"
    (case-insensitive) followed by ":" or a space is removed, and the
    remainder is passed through ``remove_punctuation``.
    """
    trimmed = text.strip()
    without_prefix = re.sub(r'^(in|aus|from)(:| )', '', trimmed, flags=re.IGNORECASE)
    return remove_punctuation(without_prefix)
    
def extract_year(text):
    """Return the first four-digit run starting with 1 or 2 found in *text*.

    Returns None when *text* contains no such run.
    """
    match = re.search(r'[12][0-9]{3}', text)
    if match is None:
        return None
    return match.group(0)

# Tags to extract, with the cleaner applied to each tag's text.
# The list does not depend on the input file, so it is built once.
ANYSTYLE_TAGS = [
    'citation-number',
    ("author", remove_punctuation),
    ("editor", clean_editor),
    ("authority", remove_punctuation),
    ("title", remove_punctuation),
    ("legal-ref", remove_punctuation),
    ("container-title", clean_container),
    ("journal", clean_container),
    ("date", extract_year),
    ("backref", remove_punctuation),
]

# Writing the first output file fails if the target folder is missing.
os.makedirs('jsonl', exist_ok=True)

# Convert every XML file in anystyle/ to a JSONL file of the same base name.
for input_path in glob.glob('anystyle/*.xml'):
    base_name = os.path.basename(input_path)
    # Despite its name, this is the path of the JSONL output file.
    schema_file = f'jsonl/{os.path.splitext(base_name)[0]}.jsonl'
    print(f'Processing {input_path}')
    xml_to_jsonl(input_path, schema_file, ANYSTYLE_TAGS)



Processing anystyle\10.1111_1467-6478.00057.xml
Processing anystyle\10.1111_1467-6478.00080.xml
Processing anystyle\10.1515_zfrs-1980-0103.xml
Processing anystyle\10.1515_zfrs-1980-0104.xml


## Create JSON and LinkML schema from JSON data

In [1]:
# Adapted from https://github.com/linkml/schema-automator/blob/main/tests/test_generalizers/test_json_data_generalizer.py
import glob
import os
import pandas as pd
import json
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
from linkml.generators.yamlgen import YAMLGenerator
from schema_automator.utils.schemautils import write_schema
import warnings
warnings.filterwarnings("ignore") # Suppress irrelevant timezone warning

# When True, each generated schema is additionally serialized with
# YAMLGenerator and written to a "-schema2.yaml" file.
WRITE_EXTENDED_SCHEMA = False

ie = JsonDataGeneralizer()

# The output files are opened for writing below; make sure the folders exist.
for output_dir in ('json', 'schema'):
    os.makedirs(output_dir, exist_ok=True)

# For every JSONL file: flatten all "out" lists into one JSON array, then
# derive a schema from that array.
for input_path in glob.glob('jsonl/*.jsonl'):
    base_name = os.path.basename(input_path)
    base_name_no_ext = os.path.splitext(base_name)[0]
    json_file = f'json/{base_name_no_ext}.json'
    schema_file = f'schema/{base_name_no_ext}-schema.yaml'
    extended_schema_file = f'schema/{base_name_no_ext}-schema2.yaml'
    print(f'Processing {input_path}')
    # Read through the explicitly UTF-8 handle (it used to be opened but
    # ignored) and release it as soon as the data is loaded.
    with open(input_path, 'r', encoding='utf-8') as input_buf:
        df = pd.read_json(input_buf, lines=True)
    flat_list = [item for sublist in df['out'].tolist() for item in sublist if pd.notna(item)]
    with open(json_file, 'w') as f:
        json.dump(flat_list, f)
    schema = ie.convert(json_file, format='json', container_class_name="Reference")
    write_schema(schema, schema_file)
    if WRITE_EXTENDED_SCHEMA:
        s = YAMLGenerator(schema_file).serialize()
        with open(extended_schema_file, 'w') as stream:
            stream.write(s)


Processing jsonl\10.1111_1467-6478.00057.jsonl




Processing jsonl\10.1111_1467-6478.00080.jsonl
Processing jsonl\10.1515_zfrs-1980-0103.jsonl
Processing jsonl\10.1515_zfrs-1980-0104.jsonl


# Create canonical schema file from one of the auto-generated ones

In [3]:
import yaml

def merge_slot_data(input_file, slots_data_file, output_file):
    """Merge extra slot attributes into a schema file and write the result.

    Both inputs are YAML documents with a top-level ``slots`` mapping. For
    every slot of *input_file* that also appears in *slots_data_file*, the
    attributes from *slots_data_file* are merged into the slot, overriding
    attributes of the same name. Slots that only appear in
    *slots_data_file* are not added. The merged document is written to
    *output_file* in block style with the original key order.

    Args:
        input_file: Path of the YAML schema to extend.
        slots_data_file: Path of the YAML file holding the extra slot data.
        output_file: Path of the YAML file to write (overwritten).
    """
    # Explicit UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        schema = yaml.safe_load(f)
    with open(slots_data_file, 'r', encoding='utf-8') as f:
        slot_data = yaml.safe_load(f)

    schema_slots = schema['slots']
    extra_slots = slot_data['slots']
    for name in schema_slots:
        extra = extra_slots.get(name)
        if not extra:
            # Slot not listed, or listed without any attributes.
            continue
        if schema_slots[name] is None:
            # An empty slot entry loads as None; give it a mapping to fill.
            schema_slots[name] = {}
        schema_slots[name].update(extra)

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(schema, f, default_flow_style=False, sort_keys=False)

# Build the canonical schema from one auto-generated schema plus the slot data.
merge_slot_data(
    input_file='schema/10.1111_1467-6478.00080-schema.yaml',
    slots_data_file='schema/slot-data.yaml',
    output_file='schema/simple-schema.yaml',
)
