{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "## Convert AmyStyle training data to a simple JSONL format",
"id": "ae7e001161d678cc"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-22T10:53:30.704827Z",
"start_time": "2024-07-22T10:53:30.627637Z"
}
},
"cell_type": "code",
"source": [
"import xml.etree.ElementTree as ET\n",
"import json\n",
"import regex as re\n",
"import glob\n",
"import os\n",
"\n",
"def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):\n",
" tree = ET.parse(input_xml_path)\n",
" root = tree.getroot()\n",
"\n",
" with (open(output_jsonl_path, 'w', encoding='utf-8') as f):\n",
" for sequence in root.findall('sequence'):\n",
" output = []\n",
" for element in sequence:\n",
" for tag in tags:\n",
" if type(tag) is tuple:\n",
" tag, fn = tag\n",
" if element.tag == tag:\n",
" value = fn(element.text) if callable(fn) else element.text\n",
" is_new_ref = (len(output) == 0 # if no refs yet\n",
" or tag == \"citation-number\" \n",
" or tag in output[-1] # or tag already exists\n",
" or (tag in [\"author\", \"editor\", \"authority\", \"legal-ref\"] and 'date' in output[-1])) # or a creator field follows a date field \n",
" if is_new_ref:\n",
" output.append({})\n",
" # add citation number from previous citation \n",
" if len(output) > 1 and 'citation-number' in tags and tag != \"citation-number\" and 'citation-number' in output[-2]:\n",
" output[-1]['citation-number'] = output[-2]['citation-number']\n",
" # merge tags\n",
" if tag == \"authority\":\n",
" tag = \"author\" \n",
" output[-1][tag] = value\n",
" if len(output) > 0:\n",
" instance = {\n",
" \"in\" : \" \".join(element.text.strip() if element.text else '' for element in sequence),\n",
" \"out\" : output\n",
" }\n",
" f.write(json.dumps(instance) + '\\n')\n",
"\n",
"def remove_punctuation(text):\n",
" start, end = 0, len(text)\n",
" while start < len(text) and re.match(\"\\p{P}\", text[start]) and text[start] not in \")]\":\n",
" while end > start and re.match(\"\\p{P}\", text[end - 1]) and text[end - 1] not in \")]\":\n",
" end -= 1\n",
" return text[start:end].strip()\n",
"\n",
"def clean_editor(text):\n",
" text = re.sub(r'în:? ?', '', text.strip(), flags=re.IGNORECASE)\n",
" text = re.sub(r'hrsg\\. v\\.|hg\\. v|hrsg|ed\\.|eds\\.', '', text, flags=re.IGNORECASE)\n",
" return remove_punctuation(text)\n",
"\n",
"def clean_container(text):\n",
" return remove_punctuation(re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE))\n",
" \n",
"def extract_year(text): \n",
" m = re.search( r'[12][0-9]{3}', text)\n",
" return m.group(0) if m else None\n",
"\n",
"for input_file in glob.glob('in/*.xml'):\n",
" base_name = os.path.basename(input_file)\n",
" schema_file = f'jsonl/{os.path.splitext(base_name)[0]}.jsonl'\n",
" print(f'Processing {input_file}')\n",
" xml_to_jsonl(input_file, schema_file, [\n",
" (\"author\", remove_punctuation),\n",
" (\"editor\", clean_editor),\n",
" (\"authority\", remove_punctuation),\n",
" (\"title\", remove_punctuation),\n",
" (\"legal-ref\", remove_punctuation),\n",
" (\"container-title\", clean_container),\n",
" (\"journal\", clean_container),\n",
" (\"date\", extract_year),\n",
" (\"backref\", remove_punctuation)\n",
" ])\n",
"\n"
],
"id": "f101a4e2408d6313",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing in\\10.1111_1467-6478.00057.xml\n",
"Processing in\\10.1111_1467-6478.00080.xml\n",
"Processing in\\10.1515_zfrs-1980-0103.xml\n",
"Processing in\\10.1515_zfrs-1980-0104.xml\n"
]
}
],
"execution_count": 37
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Create JSON and MarkML schema from JSON data",
"id": "6b14734c9c1f6ea6"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-22T10:53:36.880258Z",
"start_time": "2024-07-22T10:53:36.710296Z"
}
},
"cell_type": "code",
"source": [
"# Adapted from https://github.com/linkml/schema-automator/blob/main/tests/test_generalizers/test_json_data_generalizer.py\n",
"import glob\n",
"import os\n",
"import pandas as pd\n",
"import json\n",
"from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer\n",
"from linkml.generators.yamlgen import YAMLGenerator\n",
"from schema_automator.utils.schemautils import write_schema\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\") # Suppress irrelevant timezone warning\n",
"\n",
"WRITE_EXTENDED_SCHEMA = False\n",
"\n",
"ie = JsonDataGeneralizer()\n",
"for input_file in glob.glob('jsonl/*.jsonl'):\n",
" base_name = os.path.basename(input_file)\n",
" base_name_no_ext = os.path.splitext(base_name)[0]\n",
" json_file = f'json/{base_name_no_ext}.json'\n",
" schema_file = f'schema/{base_name_no_ext}-schema.yaml'\n",
" extended_schema_file = f'schema/{base_name_no_ext}-schema2.yaml'\n",
" print(f'Processing {input_file}')\n",
" with open(input_file, 'r', encoding='utf-8') as input_buf:\n",
" df = pd.read_json(input_file, lines=True)\n",
" flat_list = [item for sublist in df['out'].tolist() for item in sublist if pd.notna(item)]\n",
" with open(json_file, 'w') as f:\n",
" json.dump(flat_list, f)\n",
" schema = ie.convert(json_file, format='json')\n",
" write_schema(schema, schema_file)\n",
" if WRITE_EXTENDED_SCHEMA:\n",
" s = YAMLGenerator(schema_file).serialize()\n",
" with open(extended_schema_file, 'w') as stream:\n",
" stream.write(s)\n"
],
"id": "43e2040fed89c0bd",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing jsonl\\10.1111_1467-6478.00057.jsonl\n",
"Processing jsonl\\10.1111_1467-6478.00080.jsonl\n",
"Processing jsonl\\10.1515_zfrs-1980-0103.jsonl\n",
"Processing jsonl\\10.1515_zfrs-1980-0104.jsonl\n"
]
}
],
"execution_count": 38
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}