Skip to content
Snippets Groups Projects
anystyle-to-simple-json.ipynb 7.67 KiB
Newer Older
  • Learn to ignore specific revisions
  • {
     "cells": [
      {
       "metadata": {},
       "cell_type": "markdown",
    
       "source": "## Convert AnyStyle training data to a simple JSONL format",
    
       "id": "ae7e001161d678cc"
      },
      {
       "metadata": {
        "ExecuteTime": {
    
         "end_time": "2024-07-22T10:53:30.704827Z",
         "start_time": "2024-07-22T10:53:30.627637Z"
    
        }
       },
       "cell_type": "code",
       "source": [
        "import glob\n",
        "import json\n",
        "import os\n",
        "import unicodedata\n",
        "import xml.etree.ElementTree as ET\n",
        "\n",
        "import regex as re\n",
        "\n",
        "def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):\n",
        "    \"\"\"Convert an annotated XML file to JSONL, one line per <sequence>.\n",
        "\n",
        "    Each <sequence> child of the XML root that yields at least one extracted\n",
        "    field is written as one JSON object with two keys: \"in\" (the stripped\n",
        "    texts of all child elements joined by single spaces) and \"out\" (a list\n",
        "    of dicts, one per reference detected inside the sequence).\n",
        "\n",
        "    A new reference dict is started when no reference exists yet, when a\n",
        "    citation-number is met, when the element's tag is already a key of the\n",
        "    current reference, or when an author/editor/authority/legal-ref follows\n",
        "    a date. If citation-number is among the requested tags, a reference that\n",
        "    is started by another tag copies the citation-number of the previous\n",
        "    reference when that one has it. \"authority\" values are stored under the\n",
        "    \"author\" key.\n",
        "\n",
        "    Args:\n",
        "        input_xml_path: Path of the XML file to parse.\n",
        "        output_jsonl_path: Path of the JSONL file to write (UTF-8, overwritten).\n",
        "        tags: Element names to extract. Each entry is either a tag name, whose\n",
        "            text is stored unchanged, or a (tag name, function) tuple, whose\n",
        "            function receives the element text (None for an empty element)\n",
        "            and returns the value to store. If a name occurs more than once,\n",
        "            the last entry wins.\n",
        "    \"\"\"\n",
        "    # Normalise every entry to name -> cleaner. A bare tag name maps to None so\n",
        "    # that it can never pick up the cleaner of a previously visited tuple entry\n",
        "    # (or fail because no cleaner has been bound yet).\n",
        "    handlers = {}\n",
        "    for entry in tags:\n",
        "        name, fn = entry if isinstance(entry, tuple) else (entry, None)\n",
        "        handlers[name] = fn\n",
        "    creator_tags = (\"author\", \"editor\", \"authority\", \"legal-ref\")\n",
        "\n",
        "    root = ET.parse(input_xml_path).getroot()\n",
        "    with open(output_jsonl_path, 'w', encoding='utf-8') as f:\n",
        "        for sequence in root.findall('sequence'):\n",
        "            output = []\n",
        "            for element in sequence:\n",
        "                tag = element.tag\n",
        "                if tag not in handlers:\n",
        "                    continue\n",
        "                fn = handlers[tag]\n",
        "                value = fn(element.text) if callable(fn) else element.text\n",
        "                # NOTE(review): the duplicate check uses the raw tag, so an\n",
        "                # \"authority\" never counts as already present even though it is\n",
        "                # stored under \"author\" - confirm whether that is intended.\n",
        "                is_new_ref = (\n",
        "                    len(output) == 0  # no refs yet\n",
        "                    or tag == \"citation-number\"\n",
        "                    or tag in output[-1]  # tag already exists in the current ref\n",
        "                    # a creator field follows a date field\n",
        "                    or (tag in creator_tags and 'date' in output[-1])\n",
        "                )\n",
        "                if is_new_ref:\n",
        "                    output.append({})\n",
        "                    # carry the citation number over from the previous reference\n",
        "                    if (len(output) > 1\n",
        "                            and 'citation-number' in handlers\n",
        "                            and tag != \"citation-number\"\n",
        "                            and 'citation-number' in output[-2]):\n",
        "                        output[-1]['citation-number'] = output[-2]['citation-number']\n",
        "                # merge tags\n",
        "                if tag == \"authority\":\n",
        "                    tag = \"author\"\n",
        "                output[-1][tag] = value\n",
        "            if len(output) > 0:\n",
        "                # Empty elements (text is None) contribute an empty string.\n",
        "                raw_text = \" \".join(\n",
        "                    element.text.strip() if element.text else '' for element in sequence\n",
        "                )\n",
        "                instance = {\"in\": raw_text, \"out\": output}\n",
        "                f.write(json.dumps(instance) + '\\n')\n",
        "\n",
        "def remove_punctuation(text):\n",
        "    \"\"\"Trim punctuation from both ends of a string.\n",
        "\n",
        "    Characters whose Unicode general category is punctuation (P*) are removed\n",
        "    from the start and from the end of the text; trimming stops on either side\n",
        "    at the first character that is not punctuation or that is \")\" or \"]\".\n",
        "    Surrounding whitespace of the remainder is stripped as well.\n",
        "\n",
        "    Args:\n",
        "        text: The string to clean, or None (e.g. the text of an empty element).\n",
        "\n",
        "    Returns:\n",
        "        The trimmed string; an empty string if text is None or empty.\n",
        "    \"\"\"\n",
        "    if not text:\n",
        "        return ''\n",
        "\n",
        "    def is_strippable(char):\n",
        "        # Same test as the regex class \\p{P}, minus the closing brackets that\n",
        "        # are kept on purpose.\n",
        "        return unicodedata.category(char).startswith('P') and char not in \")]\"\n",
        "\n",
        "    start, end = 0, len(text)\n",
        "    while start < end and is_strippable(text[start]):\n",
        "        start += 1\n",
        "    while end > start and is_strippable(text[end - 1]):\n",
        "        end -= 1\n",
        "    return text[start:end].strip()\n",
        "\n",
        "def clean_editor(text):\n",
        "    \"\"\"Remove editor markers from an editor string and trim its punctuation.\n",
        "\n",
        "    Deletes, case-insensitively, every occurrence of 'în' (optionally followed\n",
        "    by ':' and one space) and of the abbreviations 'hrsg. v.', 'hg. v', 'hrsg',\n",
        "    'ed.' and 'eds.', then passes the result through remove_punctuation.\n",
        "\n",
        "    Args:\n",
        "        text: The raw editor string, or None (e.g. the text of an empty element).\n",
        "\n",
        "    Returns:\n",
        "        The cleaned string; None is treated like an empty string.\n",
        "    \"\"\"\n",
        "    text = (text or '').strip()\n",
        "    # NOTE(review): 'în' may be a typo for 'in' - confirm against the data\n",
        "    # before changing it; an unanchored 'in' would also match inside names.\n",
        "    text = re.sub(r'în:? ?', '', text, flags=re.IGNORECASE)\n",
        "    text = re.sub(r'hrsg\\. v\\.|hg\\. v|hrsg|ed\\.|eds\\.', '', text, flags=re.IGNORECASE)\n",
        "    return remove_punctuation(text)\n",
    
        "\n",
        "def clean_container(text):\n",
        "    \"\"\"Drop a leading 'in', 'aus' or 'from' marker and trim punctuation.\n",
        "\n",
        "    The marker is removed only at the very start of the stripped text and only\n",
        "    when it is directly followed by ':' or a space (case-insensitive). The\n",
        "    remainder is passed through remove_punctuation.\n",
        "\n",
        "    Args:\n",
        "        text: The raw container title, or None (e.g. the text of an empty element).\n",
        "\n",
        "    Returns:\n",
        "        The cleaned string; None is treated like an empty string.\n",
        "    \"\"\"\n",
        "    text = (text or '').strip()\n",
        "    text = re.sub(r'^(in|aus|from)(:| )', '', text, flags=re.IGNORECASE)\n",
        "    return remove_punctuation(text)\n",
    
        "    \n",
        "def extract_year(text):\n",
        "    \"\"\"Return the first four-digit run starting with 1 or 2 found in text.\n",
        "\n",
        "    Args:\n",
        "        text: The raw date string, or None (e.g. the text of an empty element).\n",
        "\n",
        "    Returns:\n",
        "        The matched four digits as a string, or None if text is None, empty or\n",
        "        contains no such run.\n",
        "    \"\"\"\n",
        "    if not text:\n",
        "        return None\n",
        "    m = re.search(r'[12][0-9]{3}', text)\n",
        "    return m.group(0) if m else None\n",
        "\n",
        "# Create the output directory up front: open(..., 'w') does not create\n",
        "# missing parent directories, so a fresh checkout would fail otherwise.\n",
        "os.makedirs('jsonl', exist_ok=True)\n",
        "\n",
        "# Sort the matches so files are processed in the same order on every platform.\n",
        "for input_file in sorted(glob.glob('in/*.xml')):\n",
        "    base_name = os.path.basename(input_file)\n",
        "    jsonl_file = f'jsonl/{os.path.splitext(base_name)[0]}.jsonl'\n",
        "    print(f'Processing {input_file}')\n",
        "    # Entries are either a tag name or a (tag name, cleaning function) pair.\n",
        "    xml_to_jsonl(input_file, jsonl_file, [\n",
        "        'citation-number',\n",
        "        (\"author\", remove_punctuation),\n",
        "        (\"editor\", clean_editor),\n",
        "        (\"authority\", remove_punctuation),\n",
        "        (\"title\", remove_punctuation),\n",
        "        (\"legal-ref\", remove_punctuation),\n",
        "        (\"container-title\", clean_container),\n",
        "        (\"journal\", clean_container),\n",
        "        (\"date\", extract_year),\n",
        "        (\"backref\", remove_punctuation)\n",
        "    ])\n",
        "\n"
       ],
       "id": "f101a4e2408d6313",
    
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "Processing in\\10.1111_1467-6478.00057.xml\n",
          "Processing in\\10.1111_1467-6478.00080.xml\n",
          "Processing in\\10.1515_zfrs-1980-0103.xml\n",
          "Processing in\\10.1515_zfrs-1980-0104.xml\n"
         ]
        }
       ],
    
       "execution_count": 37
      },
      {
       "metadata": {},
       "cell_type": "markdown",
       "source": "## Create JSON and LinkML schema from JSON data",
       "id": "6b14734c9c1f6ea6"
      },
      {
       "metadata": {
        "ExecuteTime": {
         "end_time": "2024-07-22T10:53:36.880258Z",
         "start_time": "2024-07-22T10:53:36.710296Z"
        }
       },
       "cell_type": "code",
       "source": [
        "# Adapted from https://github.com/linkml/schema-automator/blob/main/tests/test_generalizers/test_json_data_generalizer.py\n",
        "import glob\n",
        "import os\n",
        "import pandas as pd\n",
        "import json\n",
        "from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer\n",
        "from linkml.generators.yamlgen import YAMLGenerator\n",
        "from schema_automator.utils.schemautils import write_schema\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\") # Suppress irrelevant timezone warning\n",
        "\n",
        "# Set to True to additionally write the YAMLGenerator serialization of each schema.\n",
        "WRITE_EXTENDED_SCHEMA = False\n",
        "\n",
        "# The output directories must exist before files are written into them.\n",
        "os.makedirs('json', exist_ok=True)\n",
        "os.makedirs('schema', exist_ok=True)\n",
        "\n",
        "ie = JsonDataGeneralizer()\n",
        "# Sort the matches so files are processed in the same order on every platform.\n",
        "for input_file in sorted(glob.glob('jsonl/*.jsonl')):\n",
        "    base_name = os.path.basename(input_file)\n",
        "    base_name_no_ext = os.path.splitext(base_name)[0]\n",
        "    json_file = f'json/{base_name_no_ext}.json'\n",
        "    schema_file = f'schema/{base_name_no_ext}-schema.yaml'\n",
        "    extended_schema_file = f'schema/{base_name_no_ext}-schema2.yaml'\n",
        "    print(f'Processing {input_file}')\n",
        "    # pandas opens the file itself, so no separate file handle is needed.\n",
        "    df = pd.read_json(input_file, lines=True, encoding='utf-8')\n",
        "    # Flatten the per-line \"out\" lists into one list of items.\n",
        "    flat_list = [item for sublist in df['out'].tolist() for item in sublist if pd.notna(item)]\n",
        "    with open(json_file, 'w', encoding='utf-8') as f:\n",
        "        json.dump(flat_list, f)\n",
        "    schema = ie.convert(json_file, format='json')\n",
        "    write_schema(schema, schema_file)\n",
        "    if WRITE_EXTENDED_SCHEMA:\n",
        "        s = YAMLGenerator(schema_file).serialize()\n",
        "        with open(extended_schema_file, 'w', encoding='utf-8') as stream:\n",
        "            stream.write(s)\n"
       ],
       "id": "43e2040fed89c0bd",
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "Processing jsonl\\10.1111_1467-6478.00057.jsonl\n",
          "Processing jsonl\\10.1111_1467-6478.00080.jsonl\n",
          "Processing jsonl\\10.1515_zfrs-1980-0103.jsonl\n",
          "Processing jsonl\\10.1515_zfrs-1980-0104.jsonl\n"
         ]
        }
       ],
       "execution_count": 38
    
      },
      {
       "metadata": {},
       "cell_type": "code",
       "outputs": [],
       "execution_count": null,
       "source": "",
    
       "id": "ca27199d10d9b8bf"
    
      }
     ],
     "metadata": {
      "kernelspec": {
       "display_name": "Python 3",
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 2
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython2",
       "version": "2.7.6"
      }
     },
     "nbformat": 4,
     "nbformat_minor": 5
    }