Newer
Older
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "Convert AnyStyle training data to a simple JSONL format",
"id": "ae7e001161d678cc"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-07-15T09:25:32.594541Z",
"start_time": "2024-07-15T09:25:32.464600Z"
}
},
"cell_type": "code",
"source": [
"import xml.etree.ElementTree as ET\n",
"import json\n",
"import regex as re\n",
"import glob\n",
"import os\n",
"\n",
"def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):\n",
"    \"\"\"Convert an XML file of tagged reference sequences into a JSONL file.\n",
"\n",
"    Each <sequence> child of the XML root yields one output line holding the\n",
"    keys 'in' (the stripped texts of all child elements joined by spaces) and\n",
"    'out' (a list of dicts, one per detected reference). Sequences in which no\n",
"    requested tag occurs are not written.\n",
"\n",
"    A new reference dict is started when nothing has been collected yet, when a\n",
"    'citation-number' element is seen, when the element's key is already present\n",
"    in the current dict, or when a creator-type element follows a 'date'.\n",
"\n",
"    Args:\n",
"        input_xml_path: Path of the XML file to parse.\n",
"        output_jsonl_path: Path of the JSONL file to (over)write.\n",
"        tags: Iterable of tag names to extract. An entry is either a plain tag\n",
"            name (text stored unchanged) or a (tag name, callable) tuple whose\n",
"            callable cleans the element text before it is stored.\n",
"    \"\"\"\n",
"    # Resolve each entry to its own cleaner up front, so that a plain-string tag\n",
"    # never picks up the cleaner of a tuple entry and tuple-configured tags can\n",
"    # be looked up by name.\n",
"    cleaners = {}\n",
"    for entry in tags:\n",
"        name, fn = entry if isinstance(entry, tuple) else (entry, None)\n",
"        cleaners.setdefault(name, fn)\n",
"\n",
"    creator_tags = (\"author\", \"editor\", \"authority\", \"legal-ref\")\n",
"    root = ET.parse(input_xml_path).getroot()\n",
"\n",
"    with open(output_jsonl_path, 'w', encoding='utf-8') as f:\n",
"        for sequence in root.findall('sequence'):\n",
"            output = []\n",
"            for element in sequence:\n",
"                tag = element.tag\n",
"                if tag not in cleaners:\n",
"                    continue\n",
"                fn = cleaners[tag]\n",
"                # Empty elements have text None; cleaners expect a string.\n",
"                value = fn(element.text or '') if callable(fn) else element.text\n",
"                # \"authority\" is merged into \"author\"; test for an existing entry\n",
"                # under the key the value is actually stored with.\n",
"                key = \"author\" if tag == \"authority\" else tag\n",
"                is_new_ref = (\n",
"                    not output  # no refs yet\n",
"                    or tag == \"citation-number\"\n",
"                    or key in output[-1]  # key already exists in the current ref\n",
"                    or (tag in creator_tags and 'date' in output[-1])  # creator follows a date\n",
"                )\n",
"                if is_new_ref:\n",
"                    output.append({})\n",
"                    # carry the citation number over from the previous reference\n",
"                    if (len(output) > 1 and 'citation-number' in cleaners\n",
"                            and tag != \"citation-number\"\n",
"                            and 'citation-number' in output[-2]):\n",
"                        output[-1]['citation-number'] = output[-2]['citation-number']\n",
"                output[-1][key] = value\n",
"            if output:\n",
"                instance = {\n",
"                    \"in\": \" \".join(child.text.strip() if child.text else '' for child in sequence),\n",
"                    \"out\": output,\n",
"                }\n",
"                f.write(json.dumps(instance) + '\\n')\n",
"\n",
"def remove_punctuation(text):\n",
"    \"\"\"Strip leading and trailing punctuation, then surrounding whitespace.\n",
"\n",
"    Characters are tested one at a time against the Unicode punctuation\n",
"    property, which needs the third-party `regex` module (imported as `re`).\n",
"    The characters ')' and ']' are never removed, at either end.\n",
"\n",
"    Args:\n",
"        text: The string to clean.\n",
"\n",
"    Returns:\n",
"        The trimmed string; '' if nothing but strippable characters remain.\n",
"    \"\"\"\n",
"    start, end = 0, len(text)\n",
"    # Advance past leading punctuation.\n",
"    # NOTE(review): ')' and ']' are exempt here as well as at the end, so opening\n",
"    # brackets are stripped from the front but closing ones are kept - confirm.\n",
"    while start < len(text) and re.match(r\"\\p{P}\", text[start]) and text[start] not in \")]\":\n",
"        start += 1\n",
"    # Retreat past trailing punctuation, keeping closing brackets.\n",
"    while end > start and re.match(r\"\\p{P}\", text[end - 1]) and text[end - 1] not in \")]\":\n",
"        end -= 1\n",
"    return text[start:end].strip()\n",
"\n",
"def clean_editor(text):\n",
"    \"\"\"Strip a leading 'in' marker and editor abbreviations from an editor string.\n",
"\n",
"    The remaining text is passed through remove_punctuation().\n",
"\n",
"    Args:\n",
"        text: Raw text of an editor element.\n",
"\n",
"    Returns:\n",
"        The cleaned editor string.\n",
"    \"\"\"\n",
"    # Drop a leading \"in\" / \"in:\" marker. A colon or whitespace must follow so\n",
"    # that names such as \"Ingrid\" stay intact.\n",
"    # NOTE(review): 'în' stays accepted because it is the literal this pattern was\n",
"    # written with; it looks like '^i' typed with a dead-key circumflex - confirm\n",
"    # and drop it if so.\n",
"    text = re.sub(r'^(?:in|în)(?::\\s*|\\s+)', '', text.strip(), flags=re.IGNORECASE)\n",
"    # \\b keeps \"ed.\" / \"hrsg\" from matching inside a longer word (e.g. \"Alfred.\").\n",
"    text = re.sub(r'\\b(?:hrsg\\. v\\.|hg\\. v|hrsg|ed\\.|eds\\.)', '', text, flags=re.IGNORECASE)\n",
"    return remove_punctuation(text)\n",
"\n",
"def clean_container(text):\n",
"    \"\"\"Remove a leading 'in' / 'aus' / 'from' marker, then trim punctuation.\n",
"\n",
"    The marker is matched case-insensitively at the start of the stripped text\n",
"    and must be followed by a colon or a space.\n",
"    \"\"\"\n",
"    stripped = text.strip()\n",
"    without_marker = re.sub(r'^(in|aus|from)(:| )', '', stripped, flags=re.IGNORECASE)\n",
"    return remove_punctuation(without_marker)\n",
" \n",
"def extract_year(text):\n",
"    \"\"\"Return the first four-digit run starting with 1 or 2 in text, or None.\"\"\"\n",
"    found = re.search(r'[12][0-9]{3}', text)\n",
"    if not found:\n",
"        return None\n",
"    return found.group(0)\n",
"\n",
"# Tags to extract, each paired with the cleaner applied to its element text.\n",
"TAG_CLEANERS = [\n",
"    (\"author\", remove_punctuation),\n",
"    (\"editor\", clean_editor),\n",
"    (\"authority\", remove_punctuation),\n",
"    (\"title\", remove_punctuation),\n",
"    (\"legal-ref\", remove_punctuation),\n",
"    (\"container-title\", clean_container),\n",
"    (\"journal\", clean_container),\n",
"    (\"date\", extract_year),\n",
"    (\"backref\", remove_punctuation)\n",
"]\n",
"INPUT_GLOB = 'in/*.xml'\n",
"OUTPUT_DIR = 'out'\n",
"\n",
"# open(..., 'w') does not create missing directories, so make sure the target\n",
"# exists before the first file is written (e.g. on a fresh checkout).\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
"\n",
"# glob returns files in arbitrary order; sorting keeps the run log reproducible.\n",
"for input_file in sorted(glob.glob(INPUT_GLOB)):\n",
"    base_name = os.path.basename(input_file)\n",
"    output_file = f'{OUTPUT_DIR}/{os.path.splitext(base_name)[0]}.jsonl'\n",
"    print(f'Processing {input_file}')\n",
"    xml_to_jsonl(input_file, output_file, TAG_CLEANERS)\n",
"\n"
],
"id": "f101a4e2408d6313",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processing in\\10.1111_1467-6478.00057.xml\n",
"Processing in\\10.1111_1467-6478.00080.xml\n",
"Processing in\\10.1515_zfrs-1980-0103.xml\n",
"Processing in\\10.1515_zfrs-1980-0104.xml\n"
]
}
],
"execution_count": 53
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "43e2040fed89c0bd"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}