{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Conversion to TEI (`<bibl>`)\n",
    "\n",
    "References: \n",
    "- https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html/CO.html#COBI (Overview)\n",
    "- https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats)\n",
    "- https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`)\n",
    "- https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html/ref-biblStruct.html (`biblStruct`)\n",
    "- https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples)\n",
    "- https://quod.lib.umich.edu/cgi/t/tei/tei-idx?type=HTML&rgn=DIV2&byte=647051\n",
    "- https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`)\n",
    "\n",
    "We use `<bibl>` here instead of `<biblStruct>` because it is more loosely-structured and allows for a more flat datastructure. \n",
    "\n",
    "## Collect metadata on TEI `<bibl>` tags"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "4c77ab592c98dfd"
  },
  {
   "cell_type": "code",
   "source": [
    "import xmlschema\n",
    "import os\n",
    "\n",
    "# cache for local use\n",
    "if not os.path.isdir(\"schema/tei\"):\n",
    "    schema = xmlschema.XMLSchema(\"https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd\")\n",
    "    schema.export(target='schema/tei', save_remote=True)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-27T10:10:23.856072Z",
     "start_time": "2024-07-27T10:09:57.959933Z"
    }
   },
   "id": "ff140f40df428a8f",
   "outputs": [],
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "\n",
    "# written by GPT-4\n",
    "def extract_headings_and_links(tag, doc_heading, doc_base_url):\n",
    "    # Extract heading numbers from the document\n",
    "    heading_numbers = re.findall(r'\\d+(?:\\.\\d+)*', doc_heading)\n",
    "\n",
    "    # Download the HTML page\n",
    "    url = f\"{doc_base_url}/ref-{tag}.html\"\n",
    "    response = requests.get(url)\n",
    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
    "\n",
    "    # Extract the links associated with each heading number\n",
    "    links = {}\n",
    "    for link in soup.find_all('a', class_='link_ptr'):\n",
    "        heading_value = link.find('span', class_='headingNumber').text.strip()\n",
    "        link_url = link.get('href')\n",
    "        links[heading_value] = f\"{doc_base_url}/{link_url}\"\n",
    "\n",
    "    return {heading: link_url for heading, link_url in zip(heading_numbers, links.values()) if\n",
    "            heading in heading_numbers}\n",
    "\n",
    "\n",
    "def generate_tag_docs(xsd_path):\n",
    "    namespaces = {'xs': 'http://www.w3.org/2001/XMLSchema'}\n",
    "    doc_base_url = \"https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html\"\n",
    "\n",
    "    tree = ET.parse('schema/tei/tei_all.xsd')\n",
    "    root = tree.getroot()\n",
    "    schema = xmlschema.XMLSchema(xsd_path)\n",
    "    bibl_schema = schema.find(\"tei:bibl\")\n",
    "    data_list = []\n",
    "    #names = [child_element.local_name for child_element in bibl_schema.iterchildren()]\n",
    "    names = ['author', 'biblScope', 'citedRange', 'date', 'edition', 'editor', 'idno', 'location', 'note', 'orgName', \n",
    "             'publisher', 'pubPlace', 'ptr', 'series', 'span', 'title', 'volume', 'issue']\n",
    "    for name in tqdm(names, desc=\"Processing TEI tags\"):\n",
    "        doc_node = root.find(f\".//xs:element[@name='{name}']/xs:annotation/xs:documentation\", namespaces=namespaces)\n",
    "        if doc_node is not None:\n",
    "            matches = re.search(r'^(.*)\\[(.*)]$', doc_node.text)\n",
    "            if matches is None: continue\n",
    "            description = matches.group(1)\n",
    "            doc_heading = matches.group(2)\n",
    "            doc_urls = extract_headings_and_links(name, doc_heading, doc_base_url)\n",
    "            data_list.append({'name': name, 'description': description, 'documentation': doc_heading, 'urls': doc_urls})\n",
    "\n",
    "    return pd.DataFrame(data_list)\n",
    "\n",
    "\n",
    "cache_file = \"schema/tei/tei-tags-documentation.json\"\n",
    "if not os.path.isfile(cache_file):\n",
    "    df = generate_tag_docs(\"schema/tei/tei_all.xsd\")\n",
    "    df.to_json(cache_file, index=False, orient='records')\n",
    "else:\n",
    "    df = pd.read_json(cache_file)\n",
    "df\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-07-29T08:03:30.899758Z",
     "start_time": "2024-07-29T08:03:16.672095Z"
    }
   },
   "id": "572f566fc9784238",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Processing TEI tags:   0%|          | 0/15 [00:00<?, ?it/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "2b29eb53bd824a9e8ee692483a17d0da"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "          name                                        description  \\\n",
       "0       author  (author) in a bibliographic reference, contain...   \n",
       "1    biblScope  (scope of bibliographic reference) defines the...   \n",
       "2   citedRange  (cited range) defines the range of cited conte...   \n",
       "3         date             (date) contains a date in any format.    \n",
       "4      edition  (edition) describes the particularities of one...   \n",
       "5       editor  contains a secondary statement of responsibili...   \n",
       "6     location  (location) defines the location of a place as ...   \n",
       "7         note             (note) contains a note or annotation.    \n",
       "8    publisher  (publisher) provides the name of the organizat...   \n",
       "9     pubPlace  (publication place) contains the name of the p...   \n",
       "10      series  (series information) contains information abou...   \n",
       "11        span  associates an interpretative annotation direct...   \n",
       "12       title    (title) contains a title for any kind of work.    \n",
       "\n",
       "                                        documentation  \\\n",
       "0   3.12.2.2. Titles, Authors, and Editors 2.2.1. ...   \n",
       "1   3.12.2.5. Scopes and Ranges in Bibliographic C...   \n",
       "2   3.12.2.5. Scopes and Ranges in Bibliographic C...   \n",
       "3   3.6.4. Dates and Times 2.2.4. Publication, Dis...   \n",
       "4                        2.2.2. The Edition Statement   \n",
       "5              3.12.2.2. Titles, Authors, and Editors   \n",
       "6                                      14.3.4. Places   \n",
       "7   3.9.1. Notes and Simple Annotation 2.2.6. The ...   \n",
       "8   3.12.2.4. Imprint, Size of a Document, and Rep...   \n",
       "9   3.12.2.4. Imprint, Size of a Document, and Rep...   \n",
       "10  3.12.2.1. Analytic, Monographic, and Series Le...   \n",
       "11                    18.3. Spans and Interpretations   \n",
       "12  3.12.2.2. Titles, Authors, and Editors 2.2.1. ...   \n",
       "\n",
       "                                                 urls  \n",
       "0   {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "1   {'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "2   {'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "3   {'3.6.4': 'https://vault.tei-c.de/P5/3.0.0/doc...  \n",
       "4   {'2.2.2': 'https://vault.tei-c.de/P5/3.0.0/doc...  \n",
       "5   {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "6   {'14.3.4': 'https://vault.tei-c.de/P5/3.0.0/do...  \n",
       "7   {'3.9.1': 'https://vault.tei-c.de/P5/3.0.0/doc...  \n",
       "8   {'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "9   {'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "10  {'3.12.2.1': 'https://vault.tei-c.de/P5/3.0.0/...  \n",
       "11  {'18.3': 'https://vault.tei-c.de/P5/3.0.0/doc/...  \n",
       "12  {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>description</th>\n",
       "      <th>documentation</th>\n",
       "      <th>urls</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>author</td>\n",
       "      <td>(author) in a bibliographic reference, contain...</td>\n",
       "      <td>3.12.2.2. Titles, Authors, and Editors 2.2.1. ...</td>\n",
       "      <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>biblScope</td>\n",
       "      <td>(scope of bibliographic reference) defines the...</td>\n",
       "      <td>3.12.2.5. Scopes and Ranges in Bibliographic C...</td>\n",
       "      <td>{'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>citedRange</td>\n",
       "      <td>(cited range) defines the range of cited conte...</td>\n",
       "      <td>3.12.2.5. Scopes and Ranges in Bibliographic C...</td>\n",
       "      <td>{'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>date</td>\n",
       "      <td>(date) contains a date in any format.</td>\n",
       "      <td>3.6.4. Dates and Times 2.2.4. Publication, Dis...</td>\n",
       "      <td>{'3.6.4': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>edition</td>\n",
       "      <td>(edition) describes the particularities of one...</td>\n",
       "      <td>2.2.2. The Edition Statement</td>\n",
       "      <td>{'2.2.2': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>editor</td>\n",
       "      <td>contains a secondary statement of responsibili...</td>\n",
       "      <td>3.12.2.2. Titles, Authors, and Editors</td>\n",
       "      <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>location</td>\n",
       "      <td>(location) defines the location of a place as ...</td>\n",
       "      <td>14.3.4. Places</td>\n",
       "      <td>{'14.3.4': 'https://vault.tei-c.de/P5/3.0.0/do...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>note</td>\n",
       "      <td>(note) contains a note or annotation.</td>\n",
       "      <td>3.9.1. Notes and Simple Annotation 2.2.6. The ...</td>\n",
       "      <td>{'3.9.1': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>publisher</td>\n",
       "      <td>(publisher) provides the name of the organizat...</td>\n",
       "      <td>3.12.2.4. Imprint, Size of a Document, and Rep...</td>\n",
       "      <td>{'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>pubPlace</td>\n",
       "      <td>(publication place) contains the name of the p...</td>\n",
       "      <td>3.12.2.4. Imprint, Size of a Document, and Rep...</td>\n",
       "      <td>{'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>series</td>\n",
       "      <td>(series information) contains information abou...</td>\n",
       "      <td>3.12.2.1. Analytic, Monographic, and Series Le...</td>\n",
       "      <td>{'3.12.2.1': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>span</td>\n",
       "      <td>associates an interpretative annotation direct...</td>\n",
       "      <td>18.3. Spans and Interpretations</td>\n",
       "      <td>{'18.3': 'https://vault.tei-c.de/P5/3.0.0/doc/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>title</td>\n",
       "      <td>(title) contains a title for any kind of work.</td>\n",
       "      <td>3.12.2.2. Titles, Authors, and Editors 2.2.1. ...</td>\n",
       "      <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 13
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Convert Groundd Truth to TEI",
   "id": "aaf43ee43bb6d4d"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-07-30T15:36:29.945576400Z",
     "start_time": "2024-07-30T15:36:26.673665500Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "import regex as re\n",
    "import glob\n",
    "import os\n",
    "import xml.dom.minidom\n",
    "import json\n",
    "import xmlschema\n",
    "\n",
    "\n",
    "def even_num_brackets(string: str):\n",
    "    \"\"\"\n",
    "    Simple heuristic to determine if string contains an even number of round and square brackets,\n",
    "    so that if not, trailing or leading brackets will be removed.\n",
    "    \"\"\"\n",
    "    return ((string.endswith(\")\") and string.count(\")\") == string.count(\"(\"))\n",
    "            or (string.endswith(\"]\") and string.count(\"]\") == string.count(\"[\")))\n",
    "\n",
    "def remove_punctuation(text):\n",
    "    \"\"\"This removes leading and trailing punctuation using very simple rules for German and English\"\"\"\n",
    "    start, end = 0, len(text)\n",
    "    while start < len(text) and re.match(\"\\p{P}\", text[start]) and text[end - 1]:\n",
    "        start += 1\n",
    "    while end > start and re.match(\"\\p{P}\", text[end - 1]) and not even_num_brackets(text[start:end]) and text[end - 1] not in \"?!\":\n",
    "        end -= 1\n",
    "    return text[start:end].strip()\n",
    "\n",
    "def clean_editor(text): \n",
    "    text = re.sub(r'^in(:| )', '', remove_punctuation(text), flags=re.IGNORECASE)\n",
    "    text = re.sub(r'\\(?(hrsg\\. v\\.|hg\\. v|hrsg\\.|ed\\.|eds\\.)\\)?', '', text, flags=re.IGNORECASE)\n",
    "    return text\n",
    "\n",
    "def clean_container(text):\n",
    "    return remove_punctuation(re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE))\n",
    "\n",
    "def clean_pages(text):\n",
    "    return remove_punctuation(re.sub(r'^(S\\.|p\\.|pp\\.|ff?\\.||seqq?\\.)', '', text.strip(), flags=re.IGNORECASE))\n",
    "\n",
    "def extract_year(text):\n",
    "    m = re.search( r'[12][0-9]{3}', text)\n",
    "    return m.group(0) if m else None\n",
    "\n",
    "def find_string(string, container):\n",
    "    start = container.find(string)\n",
    "    if start > -1:\n",
    "        end = start + len(string)\n",
    "        return start, end\n",
    "    raise ValueError(f\"Could not find '{string}' in '{container}'\")\n",
    "\n",
    "def add_node(parent, tag, text=\"\", attributes=None, clean_func=None, preserve=False):\n",
    "    \"\"\"\n",
    "    Adds a child node to the parent, optionally adding text and attributes. \n",
    "    If a clean_func is passed, the text is set after applying the function to it.\n",
    "    If the `preserve` flag is True, the removed preceding or trailing text is preserved in the xml,\n",
    "    outside of the node content \n",
    "    \"\"\"\n",
    "    node = ET.SubElement(parent, tag, (attributes or {}))\n",
    "    if clean_func:\n",
    "        cleaned_text = clean_func(text)\n",
    "        if preserve:\n",
    "            start, end = find_string(cleaned_text, text)\n",
    "            prefix, suffix = text[:start], text[end:]\n",
    "            if prefix !=\"\" and len(parent) > 0:\n",
    "                parent[-1].tail = prefix\n",
    "            node.text = cleaned_text\n",
    "            if suffix != \"\":\n",
    "                node.tail = suffix\n",
    "    else:\n",
    "        node.text = text\n",
    "    return node\n",
    "\n",
    "def create_tei_root():\n",
    "    return ET.Element('TEI', {\n",
    "        'xmlns': \"http://www.tei-c.org/ns/1.0\"\n",
    "    })\n",
    "\n",
    "def create_tei_header(tei_root, title):\n",
    "    tei_header = add_node(tei_root, 'teiHeader')\n",
    "    file_desc = add_node(tei_header, 'fileDesc')\n",
    "    title_stmt = add_node(file_desc, 'titleStmt')\n",
    "    add_node(title_stmt, 'title', title)\n",
    "    publication_stmt = add_node(file_desc, 'publicationStmt')\n",
    "    add_node(publication_stmt, 'publisher', 'mpilhlt')\n",
    "    source_desc = add_node(file_desc, 'sourceDesc')\n",
    "    add_node(source_desc, 'p', title)\n",
    "    return tei_header\n",
    "\n",
    "def create_body(text_root):\n",
    "    body = ET.SubElement(text_root, 'body')\n",
    "    add_node(body, 'p', 'The article text is not part of this document')\n",
    "    return body\n",
    "\n",
    "def prettify(xml_string, indentation=\"  \"):\n",
    "    \"\"\"Return a pretty-printed XML string\"\"\"\n",
    "    return xml.dom.minidom.parseString(xml_string).toprettyxml(indent=indentation)\n",
    "\n",
    "def anystyle_to_tei(input_xml_path, id, preserve=False):\n",
    "    anystyle_root = ET.parse(input_xml_path).getroot()\n",
    "    tei_root = create_tei_root()\n",
    "    create_tei_header(tei_root, title=id)\n",
    "    text_root = add_node(tei_root, 'text')\n",
    "    body = create_body(text_root)\n",
    "    # <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography\n",
    "    listBibl = add_node(body, 'listBibl')\n",
    "    # iterate over all sequences (=footnotes) and translate into TEI equivalents\n",
    "    for sequence in anystyle_root.findall('sequence'):\n",
    "        # if the sequence contains a citation-number, create a new <note> to add <bibl> elements to \n",
    "        if (cn:= sequence.findall('citation-number')):\n",
    "            attributes = {\n",
    "                'n': cn[0].text,\n",
    "                'place': 'bottom'\n",
    "            }\n",
    "            node = add_node(text_root, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve)\n",
    "        else:\n",
    "            # otherwise add to <listBibl> element\n",
    "            node = listBibl\n",
    "        bibl = None\n",
    "        for child in sequence:\n",
    "            tag = child.tag\n",
    "            text = child.text\n",
    "            if tag == \"citation-number\": continue # this has already been taken care of\n",
    "            if (bibl is None # if we do not have a bibl element yet\n",
    "                or (bibl.find(tag) and tag != \"note\") # or tag already exists in the current element\n",
    "                or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference\n",
    "                or (tag in [\"author\", \"editor\", \"authority\"] and bibl.find('date'))): # or specific tags follow a date field \n",
    "                # then create a new bibl element\n",
    "                bibl = ET.SubElement(node, 'bibl')\n",
    "            match tag:\n",
    "                case 'author':\n",
    "                    add_node(bibl, 'author', text, clean_func=remove_punctuation, preserve=preserve)\n",
    "                case 'backref':\n",
    "                    add_node(bibl, 'ref', text, clean_func=remove_punctuation, preserve=preserve)\n",
    "                case 'container-title':\n",
    "                    add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve)\n",
    "                case 'collection-title':\n",
    "                    add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve)\n",
    "                case 'date':\n",
    "                    add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve)\n",
    "                case 'editor':\n",
    "                    add_node(bibl, 'editor', text, clean_func=clean_editor, preserve=preserve)\n",
    "                case 'note':\n",
    "                    add_node(bibl, 'note', text)                    \n",
    "                case 'journal':\n",
    "                    add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve)\n",
    "                case 'legal-ref':\n",
    "                    add_node(bibl, 'ref', text, {'type': 'legal'}, clean_func = remove_punctuation, preserve=preserve)\n",
    "                case 'pages':\n",
    "                    add_node(bibl, 'biblScope', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve)\n",
    "                case 'signal':\n",
    "                    add_node(bibl, 'note', text, {'type': 'signal'}, clean_func=remove_punctuation, preserve=preserve)\n",
    "                case 'title':\n",
    "                    add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation, preserve=preserve)\n",
    "                case 'volume':\n",
    "                    add_node(bibl, 'biblScope', text, {'unit': 'vol'}, clean_func = remove_punctuation, preserve=preserve)\n",
    "            if len(bibl) == 0:\n",
    "                node.remove(bibl)\n",
    "    if len(listBibl) == 0:\n",
    "        body.remove(listBibl)\n",
    "    return ET.tostring(tei_root, 'unicode')\n",
    "\n",
    "def tei_to_json(tei_xml, schema):\n",
    "    dict_obj = xmlschema.to_dict(tei_xml, schema=schema, converter=xmlschema.XMLSchemaConverter)\n",
    "    return json.dumps(dict_obj, default=str)\n",
    "\n",
    "# main\n",
    "tei_xsd_path = \"schema/tei/tei_all.xsd\"\n",
    "if 'schema' not in locals():\n",
    "    print(\"Parsing schema file, please wait...\")\n",
    "    schema = xmlschema.XMLSchema(tei_xsd_path)\n",
    "for input_path in glob.glob('anystyle/*.xml'):\n",
    "    base_name = os.path.basename(input_path)\n",
    "    id = os.path.splitext(base_name)[0]\n",
    "    print(f'Converting {base_name} into TEI-XML and JSON...')\n",
    "    output_xml = anystyle_to_tei(input_path, id, preserve=True)\n",
    "    output_json = tei_to_json(output_xml, schema)\n",
    "    with open(f'tei/{id}.xml', 'w', encoding='utf-8') as f:\n",
    "        f.write(prettify(output_xml))\n",
    "    with open(f'tei/{id}.json', 'w', encoding='utf-8') as f:\n",
    "        f.write(output_json)\n",
    "    \n"
   ],
   "id": "b3ee84984b88f24a",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converting 10.1111_1467-6478.00057.xml into TEI-XML and JSON...\n",
      "Converting 10.1111_1467-6478.00080.xml into TEI-XML and JSON...\n",
      "Converting 10.1515_zfrs-1980-0103.xml into TEI-XML and JSON...\n",
      "Converting 10.1515_zfrs-1980-0104.xml into TEI-XML and JSON...\n"
     ]
    }
   ],
   "execution_count": 29
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Create LinkML schema from TEI XSD",
   "id": "b0a231dc7bdd8b01"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "",
   "id": "aa86435960e61937"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}