diff --git a/convert-anystyle-data/anystyle-to-tei.ipynb b/convert-anystyle-data/anystyle-to-tei.ipynb index 6f041fddc716056fddbd9411b6220bc20fe225d4..528e4e1a0ef4a7b117a4c4333a5f920ba8a4da4b 100644 --- a/convert-anystyle-data/anystyle-to-tei.ipynb +++ b/convert-anystyle-data/anystyle-to-tei.ipynb @@ -16,6 +16,10 @@ "\n", "We use `<bibl>` here instead of `<biblStruct>` because it is more loosely-structured and allows for a more flat datastructure. \n", "\n", + "Todo:\n", + "- BiblStruct mit der übergeordneten <listBibl n=\"fußnote\" src=\"Input\">\n", + "\n", + "\n", "## Collect metadata on TEI `<bibl>` tags" ], "metadata": { @@ -753,13 +757,208 @@ ], "id": "4c19609699dc79c" }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Create `biblStruct` that works as a Gold Standard\n", + "\n", + "The issue here is that we need to have a way to parse individual footnotes and have a reliable way of retrieving all contained references. This is not possible with a simple `bibl` to `biblStruct` conversion.\n", + "\n", + "Target TEI schema: \n", + "```xml\n", + "<TEI>\n", + " <teiHeader />\n", + " <standOff>\n", + " <!-- each contained footnote as listBibl -->\n", + " <listBibl n=\"footnote number\" src=\"full footnote as input string\">\n", + " <!-- each contained reference as biblStruct, including empty ones, e.g. when there is simply internal references such as \"Op. cit, p. 23\" or \"see Doe (n.5), p. 
import os
import re
from glob import glob

from lxml import etree

# Namespace of all TEI elements read and written by this cell.
TEI_NAMESPACE = "http://www.tei-c.org/ns/1.0"
TEI_NS_MAP = {"tei": TEI_NAMESPACE}

# Matches an XML declaration (any version/encoding/standalone spelling) at the
# very start of a document, optionally preceded by a BOM and whitespace.
_XML_DECLARATION = re.compile(r'^\ufeff?\s*<\?xml\b[^>]*\?>')


def remove_encoding_declaration(xml_string):
    """Return `xml_string` without its leading XML declaration.

    lxml refuses to parse a `str` that carries an encoding declaration, so the
    declaration is stripped before calling `etree.fromstring`. Strings without
    a declaration are returned unchanged.
    """
    return _XML_DECLARATION.sub('', xml_string, count=1)


def _tei(tag):
    """Return `tag` qualified with the TEI namespace in lxml's Clark notation."""
    return f"{{{TEI_NAMESPACE}}}{tag}"


def _titles(element):
    """Return the text of all `title` descendants, regardless of namespace."""
    return element.xpath('.//*[local-name()="title"]/text()')


def _adopt_into_tei(element):
    """Move all un-namespaced elements below (and including) `element` into the TEI namespace."""
    for node in element.iter():
        # Comments and processing instructions have a non-string tag.
        if isinstance(node.tag, str) and not node.tag.startswith("{"):
            node.tag = _tei(node.tag)


def _find_matching_biblstruct(biblstructs, start, title):
    """Return the index of the first biblStruct at or after `start` matching `title`, else None.

    A reference without a title matches the biblStruct at `start` as-is; otherwise
    the first title text of the candidate must equal the first title text of the
    reference.
    """
    for pos in range(start, len(biblstructs)):
        if not title:
            return pos
        candidate_titles = _titles(biblstructs[pos])
        if candidate_titles and candidate_titles[0] == title[0]:
            return pos
    return None


def _append_biblstructs(list_bibl, bibls, biblstructs, start):
    """Append the biblStruct corresponding to each `bibl` to `list_bibl`.

    Returns the index of the first biblStruct that has not been consumed yet.
    A failed match leaves the index untouched so that one bad title cannot
    swallow all remaining biblStructs.
    """
    for bibl in bibls:
        title = _titles(bibl)
        if title:
            print(f'   - Reference: {title}')
        else:
            print('   - Reference with no title element')
        pos = _find_matching_biblstruct(biblstructs, start, title)
        if pos is None:
            print('     ! No matching biblStruct found, reference skipped')
            continue
        matched = biblstructs[pos]
        # append() moves the element out of the source tree; drop the source
        # indentation so pretty-printing can re-indent the output.
        matched.tail = None
        list_bibl.append(matched)
        # Adopt after appending so the default TEI namespace declared on the
        # output root is reused instead of a generated prefix.
        _adopt_into_tei(matched)
        start = pos + 1
    return start


def create_gold_standard(ref_content, bibl_content, biblstruct_content):
    """Build the gold-standard TEI document for one input file.

    original code written by GPT-4, adapted by CB

    Every non-empty line of `ref_content` becomes one `<listBibl source="...">`
    inside `<standOff>`. Lines whose first token is a number are treated as
    footnotes: they are paired, in order, with the `<note>` elements of the
    `bibl` document and additionally get an `n` attribute. All other lines are
    treated as bibliography entries and are paired, in order, with the
    `<listBibl>/<bibl>` elements of the `bibl` document.

    NOTE(review): this assumes the biblStruct document lists its `<biblStruct>`
    elements in the same order as the references appear in `ref_content` -
    confirm against the bibl -> biblStruct conversion step.

    :param ref_content: raw reference strings, one per line
    :param bibl_content: TEI XML containing `<note n="...">`/`<bibl>` annotations
    :param biblstruct_content: TEI XML containing the `<biblStruct>` elements
    :return: the serialized TEI document as UTF-8 encoded bytes
    :raises RuntimeError: if a line has no corresponding `<note>`/`<bibl>`, or
        the footnote number of a line differs from the `n` of its `<note>`
    """
    bibl_tree = etree.fromstring(remove_encoding_declaration(bibl_content))
    bibl_struct_tree = etree.fromstring(remove_encoding_declaration(biblstruct_content))

    # Declare TEI as the default namespace instead of faking it with an
    # "xmlns" attribute, so that all children really live in that namespace.
    output_tree = etree.Element(_tei("TEI"), nsmap={None: TEI_NAMESPACE})
    etree.SubElement(output_tree, _tei("teiHeader"))  # TODO: take from biblStruct tree
    standoff = etree.SubElement(output_tree, _tei("standOff"))

    # biblStruct elements are looked up by local name so that both namespaced
    # and un-namespaced source documents work.
    biblstructs = bibl_struct_tree.xpath('//*[local-name()="biblStruct"]')
    biblstructs_idx = 0

    # footnotes
    notes = bibl_tree.xpath('//tei:note', namespaces=TEI_NS_MAP)
    notes_idx = 0

    # bibliography entries
    bibliography_bibls = bibl_tree.xpath('//tei:listBibl/tei:bibl', namespaces=TEI_NS_MAP)
    bibliography_idx = 0

    for raw_line in ref_content.split("\n"):
        ref_line = raw_line.strip()
        if not ref_line:
            continue
        print(f' - Analyzing "{ref_line[:20]}"')

        ref_no = ref_line.split()[0]
        if ref_no.isdigit():
            # Lines starting with a number are footnotes. Notes are consumed
            # with their own counter, independent of blank/bibliography lines.
            if notes_idx >= len(notes):
                raise RuntimeError(f"No corresponding note can be found for footnote {ref_no}.")
            note = notes[notes_idx]
            notes_idx += 1
            n = note.get("n")
            print(f' - Found <note n="{n}">')
            # abort if lines don't match since it's an error in the source gold standard
            if ref_no != n:
                raise RuntimeError(f"Mismatch error at note: {n}")
            list_bibl = etree.SubElement(standoff, _tei("listBibl"), {"n": n, "source": ref_line})
            bibls = list(note.iterchildren(_tei("bibl")))
        else:
            # Assume the line is part of a bibliography or an out-of-band
            # citation that is not contained in a footnote.
            if bibliography_idx >= len(bibliography_bibls):
                raise RuntimeError("No corresponding listBibl can be found.")
            bibls = [bibliography_bibls[bibliography_idx]]
            bibliography_idx += 1
            list_bibl = etree.SubElement(standoff, _tei("listBibl"), {"source": ref_line})

        biblstructs_idx = _append_biblstructs(list_bibl, bibls, biblstructs, biblstructs_idx)

    # UTF-8 keeps non-ASCII characters readable instead of emitting character references.
    return etree.tostring(output_tree, pretty_print=True, encoding="utf-8")


def create_all_gold_standards(input_dir, bibl_dir, biblstruct_dir, biblstruct_gold_dir):
    """Create a gold-standard TEI file for every `*.txt` file in `input_dir`.

    written by GPT-4, adapted by CB

    For each `<id>.txt` the companion files `<bibl_dir>/<id>.xml` and
    `<biblstruct_dir>/<id>.biblstruct.xml` are read and the result is written
    to `<biblstruct_gold_dir>/<id>.xml` (the directory is created if needed).

    :raises FileNotFoundError: if a companion file is missing
    :raises RuntimeError: propagated from `create_gold_standard`
    """
    for file_path in sorted(glob(os.path.join(input_dir, '*.txt'))):
        # splitext only removes the extension, not every ".txt" inside the name.
        file_id = os.path.splitext(os.path.basename(file_path))[0]
        print(f'Processing {file_id}')
        bibl_path = os.path.join(bibl_dir, f'{file_id}.xml')
        biblstruct_path = os.path.join(biblstruct_dir, f'{file_id}.biblstruct.xml')

        # Read explicitly as UTF-8 (tolerating a BOM) instead of the platform default.
        with open(file_path, 'r', encoding='utf-8-sig') as ref_file, \
                open(bibl_path, 'r', encoding='utf-8-sig') as bibl_file, \
                open(biblstruct_path, 'r', encoding='utf-8-sig') as biblstruct_file:
            ref_content = ref_file.read()
            bibl_content = bibl_file.read()
            biblstruct_content = biblstruct_file.read()

        output_data = create_gold_standard(ref_content, bibl_content, biblstruct_content)
        os.makedirs(biblstruct_gold_dir, exist_ok=True)
        output_path = os.path.join(biblstruct_gold_dir, f'{file_id}.xml')
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(output_data.decode('utf-8'))


create_all_gold_standards('refs', 'tei-bibl', 'tei-biblStruct', 'tei-biblStruct-gold')
+ " - Analyzing footnote 4\n", + " - Reference with no title element\n", + " - Analyzing footnote 5\n", + " - Reference with no title element\n", + " - Analyzing footnote 6\n", + " - Reference with no title element\n", + " - Analyzing footnote 7\n", + " - Reference with no title element\n", + " - Analyzing footnote 8\n", + " - Reference with no title element\n", + " - Analyzing footnote 9\n", + " - Reference with no title element\n", + " - Analyzing footnote 10\n", + " - Reference with no title element\n", + " - Analyzing footnote 11\n", + " - Reference with no title element\n", + " - Analyzing footnote 12\n", + " - Reference with no title element\n", + " - Analyzing footnote 13\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "Mismatch error at note: 13", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mRuntimeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[18], line 74\u001B[0m\n\u001B[1;32m 71\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mbiblstruct_gold_dir\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m/\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfile_id\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m.xml\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mw\u001B[39m\u001B[38;5;124m'\u001B[39m, encoding\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mutf-8\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m output_file:\n\u001B[1;32m 72\u001B[0m output_file\u001B[38;5;241m.\u001B[39mwrite(output_data\u001B[38;5;241m.\u001B[39mdecode())\n\u001B[0;32m---> 74\u001B[0m create_all_gold_standards(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mrefs\u001B[39m\u001B[38;5;124m'\u001B[39m, 
\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtei-bibl\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtei-biblStruct\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtei-biblStruct-gold\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", + "Cell \u001B[0;32mIn[18], line 70\u001B[0m, in \u001B[0;36mcreate_all_gold_standards\u001B[0;34m(input_dir, bibl_dir, biblstruct_dir, biblstruct_gold_dir)\u001B[0m\n\u001B[1;32m 67\u001B[0m bibl_content \u001B[38;5;241m=\u001B[39m bibl_file\u001B[38;5;241m.\u001B[39mread()\n\u001B[1;32m 68\u001B[0m biblStruct_content \u001B[38;5;241m=\u001B[39m biblStruct_file\u001B[38;5;241m.\u001B[39mread()\n\u001B[0;32m---> 70\u001B[0m output_data \u001B[38;5;241m=\u001B[39m create_gold_standard(ref_content, bibl_content, biblStruct_content)\n\u001B[1;32m 71\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mbiblstruct_gold_dir\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m/\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfile_id\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m.xml\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mw\u001B[39m\u001B[38;5;124m'\u001B[39m, encoding\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mutf-8\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m output_file:\n\u001B[1;32m 72\u001B[0m output_file\u001B[38;5;241m.\u001B[39mwrite(output_data\u001B[38;5;241m.\u001B[39mdecode())\n", + "Cell \u001B[0;32mIn[18], line 52\u001B[0m, in \u001B[0;36mcreate_gold_standard\u001B[0;34m(ref_content, bibl_content, biblstruct_content)\u001B[0m\n\u001B[1;32m 50\u001B[0m biblStructs_idx \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m---> 52\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m 
\u001B[38;5;167;01mRuntimeError\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mMismatch error at note: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mn\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 54\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m etree\u001B[38;5;241m.\u001B[39mtostring(output_tree, pretty_print\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n", + "\u001B[0;31mRuntimeError\u001B[0m: Mismatch error at note: 13" + ] + } + ], + "execution_count": 18 + }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, - "source": [], - "id": "1a8a57560f1f4868" + "source": "", + "id": "90477a6402855f3" } ], "metadata": {