Skip to content
Snippets Groups Projects
Commit 771ba5ae authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Add XSLT code

parent 1f01804f
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:4c77ab592c98dfd tags: %% Cell type:markdown id:4c77ab592c98dfd tags:
# Conversion to TEI (`<bibl>`) # Conversion to TEI (`<bibl>`)
References: References:
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBI (Overview) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBI (Overview)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html (`biblStruct`) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html (`biblStruct`)
- https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples) - https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples)
- https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`) - https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`)
- http://www.jsonml.org/ (a JSON schema for lossless conversion from/to xml) - http://www.jsonml.org/ (a JSON schema for lossless conversion from/to xml)
We use `<bibl>` here instead of `<biblStruct>` because it is more loosely-structured and allows for a flatter data structure.
## Collect metadata on TEI `<bibl>` tags ## Collect metadata on TEI `<bibl>` tags
%% Cell type:code id:ff140f40df428a8f tags: %% Cell type:code id:ff140f40df428a8f tags:
``` python ``` python
import xmlschema
import os

# Cache the full TEI schema locally so the following cells can run offline.
if not os.path.isdir("schema/tei"):
    schema = xmlschema.XMLSchema("https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd")
    schema.export(target='schema/tei', save_remote=True)
``` ```
%% Cell type:code id:572f566fc9784238 tags: %% Cell type:code id:572f566fc9784238 tags:
``` python ``` python
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import pandas as pd import pandas as pd
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
from tqdm.notebook import tqdm from tqdm.notebook import tqdm
# written by GPT-4 # written by GPT-4
def extract_headings_and_links(tag, doc_heading, doc_base_url):
    """Map heading numbers mentioned in *doc_heading* to documentation URLs.

    Downloads the TEI reference page for *tag* and pairs, in document order,
    each heading number extracted from *doc_heading* with the corresponding
    cross-reference link found on that page.

    :param tag: TEI tag name, e.g. 'author' (page is at ref-<tag>.html)
    :param doc_heading: string containing section numbers like "3.12.2.2"
    :param doc_base_url: base URL of the TEI P5 HTML documentation
    :return: dict mapping heading number -> absolute documentation URL
    """
    # Extract heading numbers such as "3.12.2.2" from the documentation string
    heading_numbers = re.findall(r'\d+(?:\.\d+)*', doc_heading)
    # Download the HTML reference page for the tag; fail fast on HTTP errors
    # instead of parsing an error page (the original had no timeout or check)
    url = f"{doc_base_url}/ref-{tag}.html"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the links associated with each heading number
    links = {}
    for link in soup.find_all('a', class_='link_ptr'):
        heading_span = link.find('span', class_='headingNumber')
        if heading_span is None:
            # skip malformed cross-reference links instead of raising AttributeError
            continue
        links[heading_span.text.strip()] = f"{doc_base_url}/{link.get('href')}"
    # Pair headings with links positionally. The original comprehension's
    # "if heading in heading_numbers" filter was a tautology and is dropped.
    return dict(zip(heading_numbers, links.values()))
def generate_tag_docs(xsd_path):
    """Build a DataFrame documenting selected TEI <bibl> child tags.

    For each tag name, the description and section-heading string are read
    from the tag's <xs:documentation> annotation in the XSD, and the section
    headings are resolved to documentation URLs.

    :param xsd_path: path to the tei_all.xsd schema file
    :return: DataFrame with columns name, description, documentation, urls
    """
    namespaces = {'xs': 'http://www.w3.org/2001/XMLSchema'}
    doc_base_url = "https://www.tei-c.org/release/doc/tei-p5-doc/en/html"
    # BUGFIX: parse the schema file passed as a parameter — the original
    # hard-coded 'schema/tei/tei_all.xsd' here, ignoring xsd_path.
    tree = ET.parse(xsd_path)
    root = tree.getroot()
    # The unused xmlschema.XMLSchema/bibl_schema locals were removed; to derive
    # the tag list from the schema instead of the hard-coded list, use:
    #   schema = xmlschema.XMLSchema(xsd_path)
    #   names = [c.local_name for c in schema.find("tei:bibl").iterchildren()]
    names = ['author', 'biblScope', 'citedRange', 'date', 'edition', 'editor', 'idno', 'location', 'note', 'orgName',
             'publisher', 'pubPlace', 'ptr', 'series', 'title', 'volume', 'issue']
    data_list = []
    for name in tqdm(names, desc="Processing TEI tags"):
        doc_node = root.find(f".//xs:element[@name='{name}']/xs:annotation/xs:documentation", namespaces=namespaces)
        if doc_node is None:
            continue
        # Annotation text looks like "<description> [<section headings>]"
        matches = re.search(r'^(.*)\[(.*)]$', doc_node.text)
        if matches is None:
            continue
        description = matches.group(1)
        doc_heading = matches.group(2)
        doc_urls = extract_headings_and_links(name, doc_heading, doc_base_url)
        data_list.append({'name': name, 'description': description, 'documentation': doc_heading, 'urls': doc_urls})
    return pd.DataFrame(data_list)
cache_file = "schema/tei/tei-tags-documentation.json"
if os.path.isfile(cache_file):
    # reuse the previously generated documentation table
    df = pd.read_json(cache_file)
else:
    df = generate_tag_docs("schema/tei/tei_all.xsd")
    # to_json escapes "/" as "\/"; undo that for readability before caching
    json_str = df.to_json(index=False, orient='records', indent=4).replace(r"\/", "/")
    with open(cache_file, "w", encoding='utf-8') as f:
        f.write(json_str)
df
``` ```
%% Output %% Output
name description \ name description \
0 author (author) in a bibliographic reference, contain... 0 author (author) in a bibliographic reference, contain...
1 biblScope (scope of bibliographic reference) defines the... 1 biblScope (scope of bibliographic reference) defines the...
2 citedRange (cited range) defines the range of cited conte... 2 citedRange (cited range) defines the range of cited conte...
3 date (date) contains a date in any format. 3 date (date) contains a date in any format.
4 edition (edition) describes the particularities of one... 4 edition (edition) describes the particularities of one...
5 editor contains a secondary statement of responsibili... 5 editor contains a secondary statement of responsibili...
6 idno (identifier) supplies any form of identifier u... 6 idno (identifier) supplies any form of identifier u...
7 location (location) defines the location of a place as ... 7 location (location) defines the location of a place as ...
8 note (note) contains a note or annotation. 8 note (note) contains a note or annotation.
9 orgName (organization name) contains an organizational... 9 orgName (organization name) contains an organizational...
10 publisher (publisher) provides the name of the organizat... 10 publisher (publisher) provides the name of the organizat...
11 pubPlace (publication place) contains the name of the p... 11 pubPlace (publication place) contains the name of the p...
12 ptr (pointer) defines a pointer to another location. 12 ptr (pointer) defines a pointer to another location.
13 series (series information) contains information abou... 13 series (series information) contains information abou...
14 title (title) contains a title for any kind of work. 14 title (title) contains a title for any kind of work.
documentation \ documentation \
0 3.12.2.2. Titles, Authors, and Editors 2.2.1. ... 0 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
1 3.12.2.5. Scopes and Ranges in Bibliographic C... 1 3.12.2.5. Scopes and Ranges in Bibliographic C...
2 3.12.2.5. Scopes and Ranges in Bibliographic C... 2 3.12.2.5. Scopes and Ranges in Bibliographic C...
3 3.6.4. Dates and Times 2.2.4. Publication, Dis... 3 3.6.4. Dates and Times 2.2.4. Publication, Dis...
4 2.2.2. The Edition Statement 4 2.2.2. The Edition Statement
5 3.12.2.2. Titles, Authors, and Editors 5 3.12.2.2. Titles, Authors, and Editors
6 14.3.1. Basic Principles 2.2.4. Publication, D... 6 14.3.1. Basic Principles 2.2.4. Publication, D...
7 14.3.4. Places 7 14.3.4. Places
8 3.9.1. Notes and Simple Annotation 2.2.6. The ... 8 3.9.1. Notes and Simple Annotation 2.2.6. The ...
9 14.2.2. Organizational Names 9 14.2.2. Organizational Names
10 3.12.2.4. Imprint, Size of a Document, and Rep... 10 3.12.2.4. Imprint, Size of a Document, and Rep...
11 3.12.2.4. Imprint, Size of a Document, and Rep... 11 3.12.2.4. Imprint, Size of a Document, and Rep...
12 3.7. Simple Links and Cross-References 17.1. L... 12 3.7. Simple Links and Cross-References 17.1. L...
13 3.12.2.1. Analytic, Monographic, and Series Le... 13 3.12.2.1. Analytic, Monographic, and Series Le...
14 3.12.2.2. Titles, Authors, and Editors 2.2.1. ... 14 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
urls urls
0 {'3.12.2.2': 'https://www.tei-c.org/release/do... 0 {'3.12.2.2': 'https://www.tei-c.org/release/do...
1 {'3.12.2.5': 'https://www.tei-c.org/release/do... 1 {'3.12.2.5': 'https://www.tei-c.org/release/do...
2 {'3.12.2.5': 'https://www.tei-c.org/release/do... 2 {'3.12.2.5': 'https://www.tei-c.org/release/do...
3 {'3.6.4': 'https://www.tei-c.org/release/doc/t... 3 {'3.6.4': 'https://www.tei-c.org/release/doc/t...
4 {'2.2.2': 'https://www.tei-c.org/release/doc/t... 4 {'2.2.2': 'https://www.tei-c.org/release/doc/t...
5 {'3.12.2.2': 'https://www.tei-c.org/release/do... 5 {'3.12.2.2': 'https://www.tei-c.org/release/do...
6 {'14.3.1': 'https://www.tei-c.org/release/doc/... 6 {'14.3.1': 'https://www.tei-c.org/release/doc/...
7 {'14.3.4': 'https://www.tei-c.org/release/doc/... 7 {'14.3.4': 'https://www.tei-c.org/release/doc/...
8 {'3.9.1': 'https://www.tei-c.org/release/doc/t... 8 {'3.9.1': 'https://www.tei-c.org/release/doc/t...
9 {'14.2.2': 'https://www.tei-c.org/release/doc/... 9 {'14.2.2': 'https://www.tei-c.org/release/doc/...
10 {'3.12.2.4': 'https://www.tei-c.org/release/do... 10 {'3.12.2.4': 'https://www.tei-c.org/release/do...
11 {'3.12.2.4': 'https://www.tei-c.org/release/do... 11 {'3.12.2.4': 'https://www.tei-c.org/release/do...
12 {'3.7': 'https://www.tei-c.org/release/doc/tei... 12 {'3.7': 'https://www.tei-c.org/release/doc/tei...
13 {'3.12.2.1': 'https://www.tei-c.org/release/do... 13 {'3.12.2.1': 'https://www.tei-c.org/release/do...
14 {'3.12.2.2': 'https://www.tei-c.org/release/do... 14 {'3.12.2.2': 'https://www.tei-c.org/release/do...
%% Cell type:markdown id:aaf43ee43bb6d4d tags: %% Cell type:markdown id:aaf43ee43bb6d4d tags:
## Convert Ground Truth to TEI
%% Cell type:code id:b3ee84984b88f24a tags: %% Cell type:code id:b3ee84984b88f24a tags:
``` python ``` python
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import regex as re import regex as re
import glob import glob
import os import os
import xml.dom.minidom import xml.dom.minidom
import json import json
import xmlschema import xmlschema
from nameparser import HumanName from nameparser import HumanName
def even_num_brackets(string: str):
    """
    Simple heuristic: True when the string ends with a closing round or square
    bracket and the counts of that bracket pair are balanced. Callers use this
    to decide whether a trailing bracket should be kept rather than stripped.
    """
    balanced_round = string.endswith(")") and string.count(")") == string.count("(")
    balanced_square = string.endswith("]") and string.count("]") == string.count("[")
    return balanced_round or balanced_square
def remove_punctuation(text, keep_trailing_chars="?!"):
    """Strip leading and trailing punctuation using very simple rules for German and English.

    Trailing punctuation survives when it is listed in *keep_trailing_chars*
    or when it closes a balanced bracket pair (see even_num_brackets).
    Requires the third-party `regex` module (imported as `re`) for \\p{P}.

    :param text: string to trim
    :param keep_trailing_chars: trailing characters that are never stripped
    :return: trimmed string
    """
    start, end = 0, len(text)
    # Advance past leading punctuation. The original loop also tested
    # `and text[end - 1]`, which is always truthy for a non-empty slice and
    # has been removed; regex patterns are now raw strings.
    while start < len(text) and re.match(r"\p{P}", text[start]):
        start += 1
    # Retreat past trailing punctuation unless it should be kept
    while (end > start and re.match(r"\p{P}", text[end - 1])
           and not even_num_brackets(text[start:end])
           and text[end - 1] not in keep_trailing_chars):
        end -= 1
    return text[start:end].strip()
def remove_punctuation2(text):
    """Like remove_punctuation, but trailing periods are also kept."""
    return remove_punctuation(text, "?!.")
def clean_editor(text):
    """Strip an 'in:' prefix and editor designations such as '(hrsg.)' or '(eds.)'."""
    cleaned = remove_punctuation(text)
    cleaned = re.sub(r'^in(:| )', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\(?(hrsg\. v\.|hg\. v|hrsg\.|ed\.|eds\.)\)?', '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()
def clean_container(text):
    """Remove an 'in:'/'aus:'/'from:' prefix and surrounding punctuation from a container title."""
    without_prefix = re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE)
    return remove_punctuation(without_prefix)
def clean_pages(text):
    """Strip page-marker prefixes such as 'S.', 'p.', 'pp.', 'f.', 'ff.', 'seq.', 'seqq.'.

    BUGFIX: the original pattern contained an empty alternative
    (`...|ff?\\.||seqq?\\.`) which matched the empty string first and made
    `seqq?\\.` unreachable; the stray `|` is removed here.
    """
    return remove_punctuation(re.sub(r'^(S\.|p\.|pp\.|ff?\.|seqq?\.)', '', text.strip(), flags=re.IGNORECASE))
def extract_year(text):
    """Return the first four-digit year (1000-2999) found in *text*, or None."""
    match = re.search(r'[12][0-9]{3}', text)
    if match is None:
        return None
    return match.group(0)
def find_string(string, container):
    """Return the (start, end) indices of the first occurrence of *string* in *container*.

    :raises ValueError: if *string* does not occur in *container*
    """
    position = container.find(string)
    if position < 0:
        raise ValueError(f"Could not find '{string}' in '{container}'")
    return position, position + len(string)
def add_node(parent, tag, text="", attributes=None, clean_func=None, preserve=False):
    """
    Adds a child node to the parent, optionally adding text and attributes.
    If a clean_func is passed, the text is set after applying the function to it.
    If the `preserve` flag is True, the removed preceding or trailing text is preserved in the xml,
    outside of the node content.

    :param parent: element the new child is appended to
    :param tag: tag name of the new element
    :param text: raw text; may be reduced by clean_func before becoming node content
    :param attributes: optional dict of XML attributes for the new element
    :param clean_func: optional callable str -> str extracting the node content from `text`
    :param preserve: keep the stripped prefix/suffix as XML tails instead of discarding them
    :return: the newly created element
    """
    node = ET.SubElement(parent, tag, (attributes or {}))
    if clean_func:
        cleaned_text = clean_func(text)
        if preserve:
            # locate the cleaned text inside the original to recover what was stripped
            start, end = find_string(cleaned_text, text)
            prefix, suffix = text[:start], text[end:]
            # NOTE(review): the stripped prefix is only preserved when the new
            # node has at least one preceding sibling (len(parent) > 1); a
            # prefix on the first child is dropped — confirm this is intended
            if prefix !="" and len(parent) > 1:
                prev_sibling = parent[-2]
                prev_tail = (prev_sibling.tail or '')
                new_prev_tail = f'{prev_tail} {prefix}'.strip()
                prev_sibling.tail = new_prev_tail
        node.text = cleaned_text
        # NOTE(review): `suffix` is only bound when preserve is True; calling
        # with clean_func set and preserve=False would raise NameError here.
        # All call sites in this notebook pass preserve=True — verify before reuse.
        if suffix != "":
            node.tail = suffix
    else:
        node.text = text
    return node
def create_tei_root():
    """Create and return the <TEI> document root element in the TEI namespace."""
    attributes = {'xmlns': "http://www.tei-c.org/ns/1.0"}
    return ET.Element('TEI', attributes)
def create_tei_header(tei_root, title):
    """Append a minimal <teiHeader> (title statement, publisher, source description) to the TEI root."""
    tei_header = add_node(tei_root, 'teiHeader')
    file_desc = add_node(tei_header, 'fileDesc')
    add_node(add_node(file_desc, 'titleStmt'), 'title', title)
    add_node(add_node(file_desc, 'publicationStmt'), 'publisher', 'mpilhlt')
    add_node(add_node(file_desc, 'sourceDesc'), 'p', title)
    return tei_header
def create_body(text_root):
    """Create the <body> element with a placeholder paragraph (the article text is omitted)."""
    body = ET.SubElement(text_root, 'body')
    placeholder = 'The article text is not part of this document'
    add_node(body, 'p', placeholder)
    return body
def prettify(xml_string, indentation=" "):
    """Return a pretty-printed version of *xml_string* indented with *indentation*."""
    dom = xml.dom.minidom.parseString(xml_string)
    return dom.toprettyxml(indent=indentation)
def split_creators(text:str, bibl, tag, clean_func, preserve):
    """
    Split a creator string (authors/editors) on common separators and append
    one element per person to `bibl`, each holding a <persName> with
    <surname>/<forename> children parsed by nameparser.HumanName.

    :param text: raw creator string, e.g. "A. Smith; B. Jones und C. Doe"
    :param bibl: parent <bibl> element
    :param tag: tag for each creator element ('author', 'editor', 'publisher')
    :param clean_func: cleanup function passed through to add_node
    :param preserve: passed through to add_node (keeps stripped text as tails)
    """
    # separators: ";", "&", "/", " and ", " und "
    sep_regex = r'[;&/]| and | und '
    creators = re.split(sep_regex, text)
    # the actual separator strings, kept so they can be restored as tails below
    seperators = re.findall(sep_regex, text)
    for creator in creators:
        # <author>/<editor>
        creator_node = add_node(bibl, tag, creator, clean_func=clean_func, preserve=preserve)
        # <persName>
        name = HumanName(creator_node.text)
        # the creator's text moves into the <persName> children
        creator_node.text = ''
        pers_name = add_node(creator_node, 'persName')
        # invert nameparser's role->value dict to look up the role of each token
        inv_map = {v: k for k, v in name.as_dict(False).items()}
        if len(name) == 1:
            # a single token is treated as the surname
            add_node(pers_name, 'surname', list(name)[0])
        else:
            for elem in list(name):
                match inv_map[elem]:
                    case 'last':
                        # <surname>
                        add_node(pers_name, 'surname', elem)
                    case 'first' | 'middle':
                        # <forename>
                        add_node(pers_name, 'forename', elem)
        if len(seperators):
            # re-attach the original separator after this creator element
            creator_node.tail = seperators.pop(0).strip()
def anystyle_to_tei(input_xml_path, id, preserve=False): def anystyle_to_tei(input_xml_path, id, preserve=False):
anystyle_root = ET.parse(input_xml_path).getroot() anystyle_root = ET.parse(input_xml_path).getroot()
tei_root = create_tei_root() tei_root = create_tei_root()
create_tei_header(tei_root, title=id) create_tei_header(tei_root, title=id)
text_root = add_node(tei_root, 'text') text_root = add_node(tei_root, 'text')
body = create_body(text_root) body = create_body(text_root)
# <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography # <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography
listBibl = add_node(body, 'listBibl') listBibl = add_node(body, 'listBibl')
# iterate over all sequences (=footnotes) and translate into TEI equivalents # iterate over all sequences (=footnotes) and translate into TEI equivalents
for sequence in anystyle_root.findall('sequence'): for sequence in anystyle_root.findall('sequence'):
# if the sequence contains a citation-number, create a new <note> to add <bibl> elements to # if the sequence contains a citation-number, create a new <note> to add <bibl> elements to
if (cn:= sequence.findall('citation-number')): if (cn:= sequence.findall('citation-number')):
footnote_number = cn[0].text footnote_number = cn[0].text
attributes = { attributes = {
'n': footnote_number, 'n': footnote_number,
'type': 'footnote', 'type': 'footnote',
'place': 'bottom' 'place': 'bottom'
} }
node = add_node(text_root, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve) node = add_node(text_root, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve)
else: else:
# otherwise add to <listBibl> element # otherwise add to <listBibl> element
node = listBibl node = listBibl
bibl = None bibl = None
for child in sequence: for child in sequence:
tag = child.tag tag = child.tag
text = child.text text = child.text
if tag == "citation-number": continue # this has already been taken care of if tag == "citation-number": continue # this has already been taken care of
if (bibl is None # if we do not have a bibl element yet if (bibl is None # if we do not have a bibl element yet
or (bibl.find(tag) and tag != "note") # or tag already exists in the current element or (bibl.find(tag) and tag != "note") # or tag already exists in the current element
or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference
or (tag in ["author", "editor", "authority"] and bibl.find('date'))): # or specific tags follow a date field or (tag in ["author", "editor", "authority"] and bibl.find('date'))): # or specific tags follow a date field
# then create a new bibl element # then create a new bibl element
bibl = ET.SubElement(node, 'bibl') bibl = ET.SubElement(node, 'bibl')
match tag: match tag:
case 'author': case 'author':
split_creators(text, bibl, 'author', clean_func=remove_punctuation, preserve=preserve) split_creators(text, bibl, 'author', clean_func=remove_punctuation, preserve=preserve)
case 'authority': case 'authority':
split_creators(text, bibl, 'publisher', clean_func=remove_punctuation, preserve=preserve) split_creators(text, bibl, 'publisher', clean_func=remove_punctuation, preserve=preserve)
case 'backref': case 'backref':
add_node(bibl, 'ref', text, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'ref', text, clean_func=remove_punctuation2, preserve=preserve)
case 'container-title': case 'container-title':
add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve)
case 'collection-title': case 'collection-title':
add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve)
case 'date': case 'date':
add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve) add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve)
case 'edition': case 'edition':
add_node(bibl, 'edition', text, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'edition', text, clean_func=remove_punctuation2, preserve=preserve)
case 'editor': case 'editor':
split_creators(text, bibl, 'editor', clean_func=clean_editor, preserve=preserve) split_creators(text, bibl, 'editor', clean_func=clean_editor, preserve=preserve)
case 'location': case 'location':
add_node(bibl, 'pubPlace', text, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'pubPlace', text, clean_func=remove_punctuation, preserve=preserve)
case 'note': case 'note':
add_node(bibl, 'note', text, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'note', text, clean_func=remove_punctuation, preserve=preserve)
case 'journal': case 'journal':
add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve)
case 'legal-ref': case 'legal-ref':
add_node(bibl, 'ref', text, {'type': 'legal'}, clean_func = remove_punctuation, preserve=preserve) add_node(bibl, 'ref', text, {'type': 'legal'}, clean_func = remove_punctuation, preserve=preserve)
case 'pages': case 'pages':
if bibl[-1].tag == "ref": if bibl[-1].tag == "ref":
add_node(bibl, 'citedRange', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve) add_node(bibl, 'citedRange', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve)
else: else:
add_node(bibl, 'biblScope', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve) add_node(bibl, 'biblScope', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve)
case 'signal': case 'signal':
add_node(bibl, 'note', text, {'type': 'signal'}, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'note', text, {'type': 'signal'}, clean_func=remove_punctuation, preserve=preserve)
case 'title': case 'title':
add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation2, preserve=preserve)
case 'url': case 'url':
add_node(bibl, 'ptr', text, {'type':'web'}, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'ptr', text, {'type':'web'}, clean_func=remove_punctuation, preserve=preserve)
case 'volume': case 'volume':
add_node(bibl, 'biblScope', text, {'unit': 'vol'}, clean_func = remove_punctuation, preserve=preserve) add_node(bibl, 'biblScope', text, {'unit': 'vol'}, clean_func = remove_punctuation, preserve=preserve)
if len(bibl) == 0: if len(bibl) == 0:
node.remove(bibl) node.remove(bibl)
if len(listBibl) == 0: if len(listBibl) == 0:
body.remove(listBibl) body.remove(listBibl)
return ET.tostring(tei_root, 'unicode') return ET.tostring(tei_root, 'unicode')
def tei_to_json(tei_xml, schema):
    """Convert TEI XML to a JsonML JSON string, validating against *schema*."""
    jsonml = xmlschema.to_dict(tei_xml, schema=schema, converter=xmlschema.JsonMLConverter)
    return json.dumps(jsonml, default=str)
# main
# XML->JSON conversion doesn't provide anything useful, so the schema-based
# JSON export below stays disabled.
# tei_xsd_path = "schema/tei/tei_all.xsd"
# if 'schema' not in locals():
#     print("Parsing schema file, please wait...")
#     schema = xmlschema.XMLSchema(tei_xsd_path)
os.makedirs('tei', exist_ok=True)  # the original crashed if the output dir was missing
for input_path in glob.glob('anystyle/*.xml'):
    base_name = os.path.basename(input_path)
    doc_id = os.path.splitext(base_name)[0]  # renamed from `id` to avoid shadowing the builtin
    print(f'Converting {base_name} into TEI-XML ...')
    output_xml = anystyle_to_tei(input_path, doc_id, preserve=True)
    # output_json = tei_to_json(output_xml, schema)
    with open(f'tei/{doc_id}.xml', 'w', encoding='utf-8') as f:
        f.write(prettify(output_xml))
    # with open(f'tei/{doc_id}.json', 'w', encoding='utf-8') as f:
    #     f.write(output_json)
``` ```
%% Output %% Output
Converting 10.1111_1467-6478.00057.xml into TEI-XML ... Converting 10.1111_1467-6478.00057.xml into TEI-XML ...
Converting 10.1111_1467-6478.00080.xml into TEI-XML ... Converting 10.1111_1467-6478.00080.xml into TEI-XML ...
Converting 10.1515_zfrs-1980-0103.xml into TEI-XML ... Converting 10.1515_zfrs-1980-0103.xml into TEI-XML ...
Converting 10.1515_zfrs-1980-0104.xml into TEI-XML ... Converting 10.1515_zfrs-1980-0104.xml into TEI-XML ...
%% Cell type:markdown id:8c8b2d820086d461 tags:
%% Cell type:markdown id:b0a231dc7bdd8b01 tags: %% Cell type:markdown id:b0a231dc7bdd8b01 tags:
## Create LinkML schema from TEI XSD ## Extract bibliographic data from TEI files
%% Cell type:markdown id:149588c08747c4b3 tags:
### Download XSLTs
%% Cell type:code id:1f15b3af6aab73ed tags:
``` python
import requests, zipfile, io, os

# Download and unpack the TEI-to-bibliographic-data XSLT stylesheets (once).
if not os.path.isdir('lib/convert'):
    url = 'https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data/archive/refs/heads/master.zip'
    r = requests.get(url, timeout=60)
    # raise_for_status() instead of `assert r.status_code == 200`, which is
    # silently stripped when Python runs with -O
    r.raise_for_status()
    # context manager guarantees the archive handle is closed
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('lib')
    os.rename('lib/convert_tei-to-bibliographic-data-master', 'lib/convert')
```
%% Cell type:markdown id:aa86435960e61937 tags: %% Cell type:markdown id:aa86435960e61937 tags:
### Apply XSLT
%% Cell type:code id:cb3b4140ab153c08 tags:
``` python
from lxml import etree
def apply_xslt_to_xml(xslt_path, xml_path):
    """Apply the XSLT stylesheet at *xslt_path* to the document at *xml_path* and return the result as a string."""
    stylesheet = etree.parse(xslt_path)
    document = etree.parse(xml_path)
    transform = etree.XSLT(stylesheet)
    return str(transform(document))
# NOTE(review): placeholder paths — replace with real stylesheet/document files
# before running this cell, otherwise etree.parse raises OSError
new_xml_str = apply_xslt_to_xml('path_to_your_xslt_file', 'path_to_your_xml_file')
print(new_xml_str)
```
%% Cell type:markdown id:387b5b9792505b13 tags:
......
*
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment