Skip to content
Snippets Groups Projects
Commit ab263aa0 authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Update documentation

parent 48f8d5d2
No related branches found
No related tags found
No related merge requests found
Pipeline #511090 passed
%% Cell type:markdown id:4c77ab592c98dfd tags:
# Convert AnyStyle GS to TEI (`<bibl>`/`<biblStruct>`) GS
# Convert AnyStyle to TEI-bibl data
References:
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBI (Overview)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html (`<biblStruct>`)
- https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples)
- https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`)
We use `<bibl>` here instead of `<biblStruct>` because it is more loosely-structured and allows for a more flat datastructure.
We use `<bibl>` here for marking up the citation data. These annotations can then be further processed:
- [to Gold Standard based on `<biblStruct>`](tei-to-biblstruct-gs.ipynb)
- [to bibliographic data formats](tei-to-bibformats.ipynb)
- [to the prodigy annotation format](tei-to-prodigy.ipynb)
Todo:
- BiblStruct mit der übergeordneten <listBibl n="fußnote" src="Input">
Code was written with the assistance of ChatGPT-4.
%% Cell type:markdown id:dd3645db958007fe tags:
## Collect metadata on TEI `<bibl>` tags
%% Cell type:markdown id:c4ebd32b98166eb tags:
Cache XML schema for offline use
%% Cell type:code id:ff140f40df428a8f tags:
``` python
import xmlschema
import os
# Cache the full TEI schema locally so that later cells can work offline.
TEI_ALL_XSD = "https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd"
if not os.path.isdir("schema/tei"):
    schema = xmlschema.XMLSchema(TEI_ALL_XSD)
    schema.export(target='schema/tei', save_remote=True)
```
%% Cell type:markdown id:3019ff70c4b769cd tags:
This generates JSON data with information on the tags used, extracted from the schema and from the documentation pages.
%% Cell type:code id:572f566fc9784238 tags:
``` python
import os
import xmlschema
import xml.etree.ElementTree as ET
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
# written by GPT-4
def extract_headings_and_links(tag, doc_heading, doc_base_url):
    """
    Resolve the section numbers mentioned in a tag's documentation heading to
    the URLs of the corresponding TEI guideline pages.

    :param tag: local name of the TEI element (e.g. "bibl")
    :param doc_heading: heading string from the schema annotation,
        e.g. "3.12.2.2. Titles, Authors, and Editors"
    :param doc_base_url: base URL of the TEI documentation pages
    :return: dict mapping each heading number to a documentation URL
    """
    # Extract heading numbers such as "3.12.2.2" from the documentation heading
    heading_numbers = re.findall(r'\d+(?:\.\d+)*', doc_heading)
    # Download the element's reference page
    url = f"{doc_base_url}/ref-{tag}.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect the links associated with each heading number on that page
    links = {}
    for link in soup.find_all('a', class_='link_ptr'):
        heading_value = link.find('span', class_='headingNumber').text.strip()
        link_url = link.get('href')
        links[heading_value] = f"{doc_base_url}/{link_url}"
    # Pair the heading numbers with the collected link URLs positionally.
    # NOTE: the original comprehension also filtered on
    # `if heading in heading_numbers`, which is always true because `heading`
    # is drawn from `heading_numbers`; the dead condition was removed without
    # changing behavior.
    return dict(zip(heading_numbers, links.values()))
def generate_tag_docs(xsd_path):
    """
    Build a DataFrame describing the TEI child elements of <bibl>, combining
    the description text from the XSD annotations with documentation links
    scraped from the online TEI guidelines.

    :param xsd_path: path to the cached tei_all.xsd schema file
    :return: pandas DataFrame with columns name/description/documentation/urls
    """
    namespaces = {'xs': 'http://www.w3.org/2001/XMLSchema'}
    doc_base_url = "https://www.tei-c.org/release/doc/tei-p5-doc/en/html"
    # Parse the schema passed as argument (the original hard-coded
    # 'schema/tei/tei_all.xsd' here, silently ignoring the `xsd_path`
    # parameter).
    tree = ET.parse(xsd_path)
    root = tree.getroot()
    # The schema object is only needed by the commented-out dynamic retrieval
    # of child element names below; kept for that purpose.
    schema = xmlschema.XMLSchema(xsd_path)
    bibl_schema = schema.find("tei:bibl")
    data_list = []
    #names = [child_element.local_name for child_element in bibl_schema.iterchildren()]
    names = ['author', 'biblScope', 'citedRange', 'date', 'edition', 'editor', 'idno', 'issue', 'location', 'note', 'orgName', 'ptr', 'pubPlace', 'publisher', 'ref', 'seg', 'series', 'title', 'volume', 'xr']
    for name in tqdm(names, desc="Processing TEI tags"):
        doc_node = root.find(f".//xs:element[@name='{name}']/xs:annotation/xs:documentation", namespaces=namespaces)
        if doc_node is not None:
            # Annotation text has the form "<description> [<heading references>]"
            matches = re.search(r'^(.*)\[(.*)]$', doc_node.text)
            if matches is None:
                continue
            description = matches.group(1)
            doc_heading = matches.group(2)
            doc_urls = extract_headings_and_links(name, doc_heading, doc_base_url)
            data_list.append({'name': name, 'description': description, 'documentation': doc_heading, 'urls': doc_urls})
    return pd.DataFrame(data_list)
# Generate the tag documentation once and cache it as JSON; subsequent runs
# load the cached file instead of re-scraping the TEI website.
cache_file = "schema/tei/tei-tags-documentation.json"
if os.path.isfile(cache_file):
    df = pd.read_json(cache_file)
else:
    df = generate_tag_docs("schema/tei/tei_all.xsd")
    json_str = df.to_json(index=False, orient='records', indent=4).replace(r"\/", "/")
    with open(cache_file, "w", encoding='utf-8') as f:
        f.write(json_str)
df
```
%% Output
name description \
0 author (author) in a bibliographic reference, contain...
1 biblScope (scope of bibliographic reference) defines the...
2 citedRange (cited range) defines the range of cited conte...
3 date (date) contains a date in any format.
4 edition (edition) describes the particularities of one...
5 editor contains a secondary statement of responsibili...
6 idno (identifier) supplies any form of identifier u...
7 location (location) defines the location of a place as ...
8 note (note) contains a note or annotation.
9 orgName (organization name) contains an organizational...
10 publisher (publisher) provides the name of the organizat...
11 pubPlace (publication place) contains the name of the p...
12 ptr (pointer) defines a pointer to another location.
13 seg (arbitrary segment) represents any segmentatio...
14 series (series information) contains information abou...
15 title (title) contains a title for any kind of work.
documentation \
0 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
1 3.12.2.5. Scopes and Ranges in Bibliographic C...
2 3.12.2.5. Scopes and Ranges in Bibliographic C...
3 3.6.4. Dates and Times 2.2.4. Publication, Dis...
4 2.2.2. The Edition Statement
5 3.12.2.2. Titles, Authors, and Editors
6 14.3.1. Basic Principles 2.2.4. Publication, D...
7 14.3.4. Places
8 3.9.1. Notes and Simple Annotation 2.2.6. The ...
9 14.2.2. Organizational Names
10 3.12.2.4. Imprint, Size of a Document, and Rep...
11 3.12.2.4. Imprint, Size of a Document, and Rep...
12 3.7. Simple Links and Cross-References 17.1. L...
13 17.3. Blocks, Segments, and Anchors 6.2. Compo...
14 3.12.2.1. Analytic, Monographic, and Series Le...
15 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
urls
0 {'3.12.2.2': 'https://www.tei-c.org/release/do...
1 {'3.12.2.5': 'https://www.tei-c.org/release/do...
2 {'3.12.2.5': 'https://www.tei-c.org/release/do...
3 {'3.6.4': 'https://www.tei-c.org/release/doc/t...
4 {'2.2.2': 'https://www.tei-c.org/release/doc/t...
5 {'3.12.2.2': 'https://www.tei-c.org/release/do...
6 {'14.3.1': 'https://www.tei-c.org/release/doc/...
7 {'14.3.4': 'https://www.tei-c.org/release/doc/...
8 {'3.9.1': 'https://www.tei-c.org/release/doc/t...
9 {'14.2.2': 'https://www.tei-c.org/release/doc/...
10 {'3.12.2.4': 'https://www.tei-c.org/release/do...
11 {'3.12.2.4': 'https://www.tei-c.org/release/do...
12 {'3.7': 'https://www.tei-c.org/release/doc/tei...
13 {'17.3': 'https://www.tei-c.org/release/doc/te...
14 {'3.12.2.1': 'https://www.tei-c.org/release/do...
15 {'3.12.2.2': 'https://www.tei-c.org/release/do...
%% Cell type:markdown id:aaf43ee43bb6d4d tags:
## Convert AnyStyle Gold Standard to TEI
This converts the AnyStyle XML data to TEI, translating from the flat schema to the nested TEI `<bibl>` structure.
%% Cell type:code id:b3ee84984b88f24a tags:
``` python
import xml.etree.ElementTree as ET
import regex as re
import glob
import os
import xml.dom.minidom
import json
import xmlschema
from nameparser import HumanName
def even_num_brackets(string: str):
    """
    Heuristic check that the string ends in a closing round or square bracket
    which is balanced by an equal number of opening brackets, so that callers
    can decide whether trailing/leading brackets should be stripped.
    """
    balanced_round = string.endswith(")") and string.count(")") == string.count("(")
    balanced_square = string.endswith("]") and string.count("]") == string.count("[")
    return balanced_round or balanced_square
def remove_punctuation(text, keep_trailing_chars="?!"):
    """
    Strip leading and trailing punctuation using simple heuristics for German
    and English citation strings.

    Leading punctuation is always removed.  Trailing punctuation is removed
    unless it is listed in `keep_trailing_chars`, or it is a balanced closing
    bracket (see `even_num_brackets`).

    :param text: the string to clean
    :param keep_trailing_chars: trailing characters that are never stripped
    :return: the cleaned, whitespace-stripped string
    """
    start, end = 0, len(text)
    # NOTE: the original loop condition additionally tested `and text[end - 1]`,
    # which is always truthy for a single character and was removed as dead code.
    # Regexes are now raw strings; `\p{P}` requires the third-party `regex`
    # module imported as `re` above.
    while start < len(text) and re.match(r"\p{P}", text[start]):
        start += 1
    while end > start and re.match(r"\p{P}", text[end - 1]) \
            and not even_num_brackets(text[start:end]) \
            and text[end - 1] not in keep_trailing_chars:
        end -= 1
    return text[start:end].strip()
def remove_punctuation2(text):
    """Like `remove_punctuation`, but additionally keeps trailing periods."""
    return remove_punctuation(text, keep_trailing_chars="?!.")
def clean_editor(text):
    """
    Remove a leading "in:" marker and editor abbreviations such as
    "hrsg. v.", "ed.", "eds." (optionally parenthesized) from an editor string.
    """
    without_in = re.sub(r'^in(:| )', '', remove_punctuation(text), flags=re.IGNORECASE)
    without_abbrev = re.sub(r'\(?(hrsg\. v\.|hg\. v|hrsg\.|ed\.|eds\.)\)?', '', without_in, flags=re.IGNORECASE)
    return without_abbrev.strip()
def clean_container(text):
    """
    Strip a leading "in:"/"aus:"/"from:" marker and surrounding punctuation
    from a container title (journal, collection, ...).
    """
    without_marker = re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE)
    return remove_punctuation(without_marker)
def extract_page_range(text):
    """
    Parse "<from>[-<to>]" and return TEI-style attributes
    {"unit": "page", "from": ..., "to": ...}; "from"/"to" are omitted when
    not present in the text.  Requires the `regex` module for `\\p{...}`.
    """
    attributes = {"unit": "page"}
    match = re.match(r'(\p{Alnum}+)(?: *\p{Pd} *(\p{Alnum}+))?', text)
    if match:
        attributes["from"] = match.group(1)
        if match.group(2) is not None:
            attributes["to"] = match.group(2)
    return attributes
def process_range(text):
    """
    Clean a page-range string by dropping "S."/"p."/"pp." prefixes and
    "f."/"ff."/"seq."/"seqq." suffixes, then return the tuple
    (cleaned_text, attributes) where attributes encode the page range.
    """
    cleaned = re.sub(r'^(S\.|p\.|pp\.)', '', text.strip(), flags=re.IGNORECASE)
    cleaned = re.sub(r'(ff?\.|seqq?\.)$', '', cleaned.strip(), flags=re.IGNORECASE)
    cleaned = remove_punctuation(cleaned)
    return cleaned, extract_page_range(cleaned)
def handle_pages(text, bibl, tag, preserve):
    """
    Split a page specification on "," or ";" into individual page locators
    and add one <tag> node per locator to `bibl`, keeping each separator
    attached to the locator that precedes it.
    """
    if text == "": return
    # split on comma/semicolon plus any trailing spaces, keeping the separators
    parts = re.split(r'([,;] *)', text)
    # re-attach each separator to the chunk that precedes it
    page_locators = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
    # a final chunk without a trailing separator is kept as-is
    if text[-1] not in (',', ';'):
        page_locators.append(parts[-1])
    for page_locator in page_locators:
        add_node(bibl, tag, page_locator, clean_func=process_range, preserve=preserve)
def clean_volume(text):
    """
    Strip "vol."/"bd." prefixes and "issue."/"heft." suffixes from a volume
    string and return (cleaned_text, attributes) where the single volume
    value is used as both ends of the range.
    """
    cleaned = re.sub(r'(vol\.|bd\.)', '', text.strip(), flags=re.IGNORECASE)
    cleaned = re.sub(r'(issue\.|heft\.)$', '', cleaned.strip(), flags=re.IGNORECASE)
    cleaned = remove_punctuation(cleaned)
    return cleaned, {"from": cleaned, "to": cleaned}
def extract_text_in_parentheses(text):
    """
    Split a string into the part before the first parenthesized group and the
    group itself, e.g. "12 (13)" -> ("12", "(13)").  Returns (text, None)
    when no parenthesized group is found.
    """
    match = re.search(r'(.*?)\s*(\(.*?\))', text)
    if match is None:
        return text, None
    return match.group(1), match.group(2)
def extract_year(text):
    """Return the first four-digit year (1000-2999) found in `text`, or None."""
    match = re.search(r'[12][0-9]{3}', text)
    return match.group(0) if match else None
def find_string(string, container):
    """
    Return the (start, end) position of `string` within `container`.

    :raises ValueError: if `string` does not occur in `container`
    """
    start = container.find(string)
    if start < 0:
        raise ValueError(f"Could not find '{string}' in '{container}'")
    return start, start + len(string)
def add_node(parent, tag, text="", attributes=None, clean_func=None, preserve=False):
    """
    Add a child node to `parent`, optionally setting text and attributes.

    If `clean_func` is passed, it is applied to `text` before the text is set.
    A clean_func may return a `(text, attributes)` tuple, in which case the
    attributes are also set on the node.  If `preserve` is True, the text that
    the clean_func removed is kept in the XML outside of the node content:
    a removed prefix is appended to the tail of the previous sibling, a
    removed suffix becomes this node's tail.

    :param parent: the parent element
    :param tag: tag name of the new child element
    :param text: raw text content for the node
    :param attributes: optional dict of XML attributes
    :param clean_func: optional cleanup function applied to `text`
    :param preserve: keep removed text outside the node content
    :return: the newly created element
    """
    node = ET.SubElement(parent, tag, (attributes or {}))
    if clean_func:
        cleaned_text = clean_func(text)
        if type(cleaned_text) is tuple:
            # in a tuple result, the first element is the text and the second node attributes
            for key,value in cleaned_text[1].items():
                node.set(key, value)
            cleaned_text = cleaned_text[0]
        if preserve:
            # locate the cleaned text inside the raw text to recover the removed parts
            start, end = find_string(cleaned_text, text)
            prefix, suffix = text[:start], text[end:]
            # NOTE(review): a removed prefix is only preserved when the parent
            # already has a previous sibling to attach it to — confirm intended
            if prefix !="" and len(parent) > 1:
                prev_sibling = parent[-2]
                prev_tail = (prev_sibling.tail or '')
                new_prev_tail = f'{prev_tail} {prefix}'.strip()
                prev_sibling.tail = new_prev_tail
            node.text = cleaned_text
            if suffix != "":
                node.tail = suffix
    else:
        node.text = text
    # NOTE(review): when clean_func is given but preserve is False, node.text
    # is left unset; all call sites in this notebook pass preserve through,
    # so this path appears unexercised — confirm before relying on it
    return node
def create_tei_root():
    """Create and return the TEI document root element in the TEI namespace."""
    attributes = {'xmlns': "http://www.tei-c.org/ns/1.0"}
    return ET.Element('TEI', attributes)
def create_tei_header(tei_root, title):
    """
    Add a minimal <teiHeader> (title statement, publication statement and
    source description) to the TEI root element and return it.
    """
    tei_header = add_node(tei_root, 'teiHeader')
    file_desc = add_node(tei_header, 'fileDesc')
    # <titleStmt>/<title>
    add_node(add_node(file_desc, 'titleStmt'), 'title', title)
    # <publicationStmt>/<publisher>
    add_node(add_node(file_desc, 'publicationStmt'), 'publisher', 'mpilhlt')
    # <sourceDesc>/<p>
    add_node(add_node(file_desc, 'sourceDesc'), 'p', title)
    return tei_header
def create_body(text_root):
    """Add a <body> with a placeholder paragraph to the given <text> element."""
    body = ET.SubElement(text_root, 'body')
    add_node(body, 'p', 'The article text is not part of this document')
    return body
def prettify(xml_string, indentation=" "):
    """Return a pretty-printed version of the given XML string."""
    document = xml.dom.minidom.parseString(xml_string)
    return document.toprettyxml(indent=indentation)
def split_creators(text:str, bibl, tag, clean_func, preserve):
    """
    Split a creator string (authors or editors) on common separators and add
    one node per person to `bibl`, each containing a <persName> whose
    <surname>/<forename> children come from parsing with nameparser.

    :param text: raw creator string, possibly listing several names
    :param bibl: the parent <bibl> element
    :param tag: TEI tag to create per creator ("author", "editor", "publisher")
    :param clean_func: cleanup function passed through to `add_node`
    :param preserve: if True, removed text is kept outside the node content
    """
    sep_regex = r'[;&/]| and | und '
    creators = re.split(sep_regex, text)
    # keep the separators so they can be re-attached as node tails below
    seperators = re.findall(sep_regex, text)
    for creator in creators:
        # <author>/<editor>
        creator_node = add_node(bibl, tag, creator, clean_func=clean_func, preserve=preserve)
        # <persName>
        name = HumanName(creator_node.text)
        creator_node.text = ''
        pers_name = add_node(creator_node, 'persName')
        # invert the parsed-name dict so each name token maps to its role
        # ("last", "first", "middle")
        inv_map = {v: k for k, v in name.as_dict(False).items()}
        if len(name) == 1:
            # a single token is treated as a surname
            add_node(pers_name, 'surname', list(name)[0])
        else:
            for elem in list(name):
                match inv_map[elem]:
                    case 'last':
                        # <surname>
                        add_node(pers_name, 'surname', elem)
                    case 'first' | 'middle':
                        # <forename>
                        add_node(pers_name, 'forename', elem)
        if len(seperators):
            # re-attach the original separator after this creator node
            creator_node.tail = seperators.pop(0).strip()
def anystyle_to_tei(input_xml_path, id, preserve=False):
    """
    Convert an AnyStyle gold-standard XML file into a TEI document using
    <bibl> elements and return it as a serialized XML string.

    Each AnyStyle <sequence> containing a <citation-number> becomes a
    footnote <note> holding one or more <bibl> elements; sequences without
    a citation number are collected in a <listBibl>.  Within a sequence, a
    new <bibl> is started whenever the current tag already exists on the open
    <bibl>, belongs to a reference-separating group, or follows a date field.

    :param input_xml_path: path to the AnyStyle XML file
    :param id: document identifier, used as the TEI title
    :param preserve: if True, text removed by the cleanup functions is kept
        in the XML outside of the node content (see `add_node`)
    :return: the TEI document as a unicode string
    """
    anystyle_root = ET.parse(input_xml_path).getroot()
    tei_root = create_tei_root()
    create_tei_header(tei_root, title=id)
    text_root = add_node(tei_root, 'text')
    body = create_body(text_root)
    # <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography
    listBibl = add_node(body, 'listBibl')
    # iterate over all sequences (=footnotes) and translate into TEI equivalents
    for sequence in anystyle_root.findall('sequence'):
        # if the sequence contains a citation-number, create a new <note> to add <bibl> elements to
        if (cn:= sequence.findall('citation-number')):
            footnote_number = cn[0].text
            attributes = {
                'n': footnote_number,
                'type': 'footnote',
                'place': 'bottom'
            }
            node = add_node(body, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve)
        else:
            # otherwise add to <listBibl> element
            node = listBibl
        bibl = None
        for child in sequence:
            tag = child.tag
            text = child.text
            if tag == "citation-number": continue # this has already been taken care of
            if (bibl is None # if we do not have a bibl element yet
                    or (bibl.find(tag) and tag != "note") # or tag already exists in the current element
                    or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference
                    or (tag in ["author", "editor", "authority"] and bibl.find('date'))): # or specific tags follow a date field
                # then create a new bibl element
                bibl = ET.SubElement(node, 'bibl')
            match tag.lower():
                case 'author':
                    split_creators(text, bibl, 'author', clean_func=remove_punctuation, preserve=preserve)
                case 'authority':
                    split_creators(text, bibl, 'publisher', clean_func=remove_punctuation, preserve=preserve)
                case 'backref':
                    add_node(bibl, 'ref', text, clean_func=remove_punctuation2, preserve=preserve)
                case 'container-title':
                    add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve)
                case 'collection-title':
                    add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve)
                case 'date':
                    add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve)
                case 'doi':
                    add_node(bibl, 'idno', text, {'type':'DOI'})
                case 'edition':
                    add_node(bibl, 'edition', text, clean_func=remove_punctuation2, preserve=preserve)
                case 'editor':
                    split_creators(text, bibl, 'editor', clean_func=clean_editor, preserve=preserve)
                case 'location':
                    add_node(bibl, 'pubPlace', text, clean_func=remove_punctuation, preserve=preserve)
                case 'note':
                    add_node(bibl, 'seg', text, {'type': 'comment'}, clean_func=remove_punctuation, preserve=preserve)
                case 'journal':
                    add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve)
                case 'legal-ref':
                    add_node(bibl, 'idno', text, {'type': 'caseNumber'}, clean_func = remove_punctuation, preserve=preserve)
                case 'pages':
                    # pages directly following a cross-reference (<xr>) are a cited range
                    if bibl[-1].tag == "xr":
                        handle_pages(text, bibl, 'citedRange', preserve=preserve)
                    else:
                        # a parenthesized part of the page value is a cited range
                        pages, cited_range = extract_text_in_parentheses(text)
                        handle_pages(pages, bibl, 'biblScope', preserve=preserve)
                        if cited_range:
                            handle_pages(cited_range, bibl, 'citedRange', preserve=preserve)
                case 'signal':
                    add_node(bibl, 'seg', text, {'type': 'signal'})
                case 'title':
                    add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation2, preserve=preserve)
                case 'url':
                    add_node(bibl, 'ptr', text, {'type':'web'}, clean_func=remove_punctuation, preserve=preserve)
                case 'volume':
                    # a parenthesized part of the volume value is the issue number
                    volume, issue = extract_text_in_parentheses(text)
                    add_node(bibl, 'biblScope', volume, {'unit': 'volume'}, clean_func = clean_volume, preserve=preserve)
                    if issue:
                        add_node(bibl, 'biblScope', issue, {'unit': 'issue'}, clean_func = clean_volume, preserve=preserve)
            # drop the <bibl> again if nothing was added to it
            # NOTE(review): after removal, a following tag that does not
            # trigger a fresh <bibl> would be added to the detached element
            # and lost — confirm the inputs never hit this case
            if len(bibl) == 0:
                node.remove(bibl)
    # drop the <listBibl> if no bibliography entries were collected
    if len(listBibl) == 0:
        body.remove(listBibl)
    return ET.tostring(tei_root, 'unicode')
def tei_to_json(tei_xml, schema):
    """
    Convert TEI XML to a JSON string in JsonML form, validating it against
    the given schema in the process.
    """
    as_dict = xmlschema.to_dict(tei_xml, schema=schema, converter=xmlschema.JsonMLConverter)
    return json.dumps(as_dict, default=str)
# main: convert every AnyStyle XML file into a pretty-printed TEI document
print("Converting AnyStyle XML into TEI/bibl elements")
for input_path in glob.glob('anystyle/*.xml'):
    file_name = os.path.basename(input_path)
    doc_id, _ = os.path.splitext(file_name)
    print(f' - {file_name}')
    tei_xml = anystyle_to_tei(input_path, doc_id, preserve=True)
    # output_json = tei_to_json(tei_xml, schema)
    with open(f'tei-bibl/{doc_id}.xml', 'w', encoding='utf-8') as out:
        out.write(prettify(tei_xml))
```
%% Output
Converting AnyStyle XML into TEI/bibl elements
- 10.1111_1467-6478.00057.xml
- 10.1111_1467-6478.00080.xml
- 10.1515_zfrs-1980-0103.xml
- 10.1515_zfrs-1980-0104.xml
%% Cell type:markdown id:8c8b2d820086d461 tags:
%% Cell type:markdown id:bb9da323c357ca4c tags:
## Recreate input data from TEI/bibl and compare with AnyStyle input data
To see how much information is lost or which errors are introduced in the translation of AnyStyle to TEI, we compare the input data generated from the (lossless) AnyStyle markup with that "reverse-engineered" from the TEI, and save a character-level diff in the `html` directory.
The comparison is done with a copy of the files stored in `./tei-bibl-corrected` so that they are not overwritten when running the previous cell, and so that they can be manually corrected to fit the original data.
For better viewing, the result is published on gitlab pages (see links in the output).
%% Cell type:code id:4c19609699dc79c tags:
``` python
from lxml import etree
import glob
import os
import json
import regex as re
from lib.string import remove_whitespace
from difflib import HtmlDiff
from IPython.display import display, HTML,Markdown
def tei_to_ground_truth_input(tei_xml_doc):
    """
    Reconstruct the raw footnote strings from the <note type="footnote">
    elements of a TEI document.  Each returned string starts with the
    footnote number followed by the tag-free text of its <bibl> children.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    root = etree.fromstring(tei_xml_doc)
    ground_truth_list = []
    # iterate over the footnote <note> elements
    for note in root.findall('.//tei:note[@type="footnote"]', ns):
        parts = [note.attrib['n']]
        # iterate over the <bibl> elements
        for bibl in note.findall('tei:bibl', ns):
            # serialize without XML tags; whitespace is collapsed afterwards
            raw_text = etree.tostring(bibl, method="text", encoding='utf-8').decode()
            parts.append(remove_whitespace(raw_text))
        ground_truth_list.append(" ".join(parts))
    return ground_truth_list
# Compare the footnote strings reconstructed from the (corrected) TEI files
# with the original AnyStyle input and publish a character-level HTML diff.
for input_path in glob.glob('tei-bibl-corrected/*.xml'):
    file_name = os.path.basename(input_path)
    doc_id, _ = os.path.splitext(file_name)
    with open(input_path, 'r', encoding='utf-8') as f:
        tei_input_data = tei_to_ground_truth_input(f.read())
    with open(f'refs/{doc_id}.txt', 'r', encoding='utf-8') as f:
        anystyle_input_data = f.read().splitlines()
    # character-level diff between the reverse-engineered and the original raw strings
    html_diff = HtmlDiff().make_file(anystyle_input_data, tei_input_data)
    with open(f"../public/convert-anystyle-data/diffs/{doc_id}.diff.html", "w", encoding="utf-8") as f:
        f.write(html_diff)
    display(Markdown(f'Extracted and compared input data for {doc_id} ([See diff](https://experiments-boulanger-27b5c1c5c975b0350675064f0f85580e618945eef.pages.gwdg.de/convert-anystyle-data/diffs/{doc_id}.diff.html))'))
```
%% Output
Extracted and compared input data for 10.1111_1467-6478.00057 ([See diff](https://experiments-boulanger-27b5c1c5c975b0350675064f0f85580e618945eef.pages.gwdg.de/convert-anystyle-data/diffs/10.1111_1467-6478.00057.diff.html))
Extracted and compared input data for 10.1111_1467-6478.00080 ([See diff](https://experiments-boulanger-27b5c1c5c975b0350675064f0f85580e618945eef.pages.gwdg.de/convert-anystyle-data/diffs/10.1111_1467-6478.00080.diff.html))
Extracted and compared input data for 10.1515_zfrs-1980-0103 ([See diff](https://experiments-boulanger-27b5c1c5c975b0350675064f0f85580e618945eef.pages.gwdg.de/convert-anystyle-data/diffs/10.1515_zfrs-1980-0103.diff.html))
Extracted and compared input data for 10.1515_zfrs-1980-0104 ([See diff](https://experiments-boulanger-27b5c1c5c975b0350675064f0f85580e618945eef.pages.gwdg.de/convert-anystyle-data/diffs/10.1515_zfrs-1980-0104.diff.html))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment