{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "## Convert AmyStyle training data to a simple JSONL format", "id": "ae7e001161d678cc" }, { "metadata": { "ExecuteTime": { "end_time": "2024-07-22T10:53:30.704827Z", "start_time": "2024-07-22T10:53:30.627637Z" } }, "cell_type": "code", "source": [ "import xml.etree.ElementTree as ET\n", "import json\n", "import regex as re\n", "import glob\n", "import os\n", "\n", "def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):\n", " tree = ET.parse(input_xml_path)\n", " root = tree.getroot()\n", "\n", " with (open(output_jsonl_path, 'w', encoding='utf-8') as f):\n", " for sequence in root.findall('sequence'):\n", " output = []\n", " for element in sequence:\n", " for tag in tags:\n", " fn = None\n", " if type(tag) is tuple:\n", " tag, fn = tag\n", " if element.tag == tag:\n", " value = fn(element.text) if callable(fn) else element.text\n", " is_new_ref = (len(output) == 0 # if no refs yet\n", " or tag == \"citation-number\" \n", " or tag in output[-1] # or tag already exists\n", " or (tag in [\"author\", \"editor\", \"authority\", \"legal-ref\"] and 'date' in output[-1])) # or a creator field follows a date field \n", " if is_new_ref:\n", " output.append({})\n", " # add citation number from previous citation \n", " if len(output) > 1 and 'citation-number' in tags and tag != \"citation-number\" and 'citation-number' in output[-2]:\n", " output[-1]['citation-number'] = output[-2]['citation-number']\n", " # merge tags\n", " if tag == \"authority\":\n", " tag = \"author\" \n", " output[-1][tag] = value\n", " if len(output) > 0:\n", " instance = {\n", " \"in\" : \" \".join(element.text.strip() if element.text else '' for element in sequence),\n", " \"out\" : output\n", " }\n", " f.write(json.dumps(instance) + '\\n')\n", "\n", "def remove_punctuation(text):\n", " start, end = 0, len(text)\n", " while start < len(text) and re.match(\"\\p{P}\", text[start]) and text[start] not in \")]\":\n", " start += 1\n", " while end > start and re.match(\"\\p{P}\", text[end - 1]) and text[end - 1] not in \")]\":\n", " end -= 1\n", " return text[start:end].strip()\n", "\n", "def clean_editor(text):\n", " text = re.sub(r'în:? ?', '', text.strip(), flags=re.IGNORECASE)\n", " text = re.sub(r'hrsg\\. v\\.|hg\\. v|hrsg|ed\\.|eds\\.', '', text, flags=re.IGNORECASE)\n", " return remove_punctuation(text)\n", "\n", "def clean_container(text):\n", " return remove_punctuation(re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE))\n", " \n", "def extract_year(text): \n", " m = re.search( r'[12][0-9]{3}', text)\n", " return m.group(0) if m else None\n", "\n", "for input_file in glob.glob('in/*.xml'):\n", " base_name = os.path.basename(input_file)\n", " schema_file = f'jsonl/{os.path.splitext(base_name)[0]}.jsonl'\n", " print(f'Processing {input_file}')\n", " xml_to_jsonl(input_file, schema_file, [\n", " 'citation-number',\n", " (\"author\", remove_punctuation),\n", " (\"editor\", clean_editor),\n", " (\"authority\", remove_punctuation),\n", " (\"title\", remove_punctuation),\n", " (\"legal-ref\", remove_punctuation),\n", " (\"container-title\", clean_container),\n", " (\"journal\", clean_container),\n", " (\"date\", extract_year),\n", " (\"backref\", remove_punctuation)\n", " ])\n", "\n" ], "id": "f101a4e2408d6313", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing in\\10.1111_1467-6478.00057.xml\n", "Processing in\\10.1111_1467-6478.00080.xml\n", "Processing in\\10.1515_zfrs-1980-0103.xml\n", "Processing in\\10.1515_zfrs-1980-0104.xml\n" ] } ], "execution_count": 37 }, { "metadata": {}, "cell_type": "markdown", "source": "## Create JSON and MarkML schema from JSON data", "id": "6b14734c9c1f6ea6" }, { "metadata": { "ExecuteTime": { "end_time": "2024-07-22T10:53:36.880258Z", "start_time": "2024-07-22T10:53:36.710296Z" } }, "cell_type": "code", "source": [ "# Adapted from https://github.com/linkml/schema-automator/blob/main/tests/test_generalizers/test_json_data_generalizer.py\n", "import glob\n", "import os\n", "import pandas as pd\n", "import json\n", "from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer\n", "from linkml.generators.yamlgen import YAMLGenerator\n", "from schema_automator.utils.schemautils import write_schema\n", "import warnings\n", "warnings.filterwarnings(\"ignore\") # Suppress irrelevant timezone warning\n", "\n", "WRITE_EXTENDED_SCHEMA = False\n", "\n", "ie = JsonDataGeneralizer()\n", "for input_file in glob.glob('jsonl/*.jsonl'):\n", " base_name = os.path.basename(input_file)\n", " base_name_no_ext = os.path.splitext(base_name)[0]\n", " json_file = f'json/{base_name_no_ext}.json'\n", " schema_file = f'schema/{base_name_no_ext}-schema.yaml'\n", " extended_schema_file = f'schema/{base_name_no_ext}-schema2.yaml'\n", " print(f'Processing {input_file}')\n", " with open(input_file, 'r', encoding='utf-8') as input_buf:\n", " df = pd.read_json(input_file, lines=True)\n", " flat_list = [item for sublist in df['out'].tolist() for item in sublist if pd.notna(item)]\n", " with open(json_file, 'w') as f:\n", " json.dump(flat_list, f)\n", " schema = ie.convert(json_file, format='json')\n", " write_schema(schema, schema_file)\n", " if WRITE_EXTENDED_SCHEMA:\n", " s = YAMLGenerator(schema_file).serialize()\n", " with open(extended_schema_file, 'w') as stream:\n", " stream.write(s)\n" ], "id": "43e2040fed89c0bd", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Processing jsonl\\10.1111_1467-6478.00057.jsonl\n", "Processing jsonl\\10.1111_1467-6478.00080.jsonl\n", "Processing jsonl\\10.1515_zfrs-1980-0103.jsonl\n", "Processing jsonl\\10.1515_zfrs-1980-0104.jsonl\n" ] } ], "execution_count": 38 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "ca27199d10d9b8bf" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }