Skip to content
Snippets Groups Projects
Commit 771ba5ae authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Add XSLT code

parent 1f01804f
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:4c77ab592c98dfd tags: %% Cell type:markdown id:4c77ab592c98dfd tags:
# Conversion to TEI (`<bibl>`) # Conversion to TEI (`<bibl>`)
References: References:
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBI (Overview) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBI (Overview)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/CO.html#COBIOT (Mapping to other bibliographic formats)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-bibl.html (`<bibl>`)
- https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html (`biblStruct`) - https://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html (`biblStruct`)
- https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples) - https://epidoc.stoa.org/gl/latest/supp-bibliography.html (Examples)
- https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`) - https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/ (Grobid examples using `<bibl>`)
- http://www.jsonml.org/ (a JSON schema for lossless conversion from/to xml) - http://www.jsonml.org/ (a JSON schema for lossless conversion from/to xml)
We use `<bibl>` here instead of `<biblStruct>` because it is more loosely-structured and allows for a flatter data structure.
## Collect metadata on TEI `<bibl>` tags ## Collect metadata on TEI `<bibl>` tags
%% Cell type:code id:ff140f40df428a8f tags: %% Cell type:code id:ff140f40df428a8f tags:
``` python ``` python
import xmlschema
import os

# Cache the full TEI schema locally so the following cells can run offline.
if not os.path.isdir("schema/tei"):
    schema = xmlschema.XMLSchema("https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd")
    schema.export(target='schema/tei', save_remote=True)
``` ```
%% Cell type:code id:572f566fc9784238 tags: %% Cell type:code id:572f566fc9784238 tags:
``` python ``` python
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import pandas as pd import pandas as pd
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re import re
from tqdm.notebook import tqdm from tqdm.notebook import tqdm
# written by GPT-4 # written by GPT-4
def extract_headings_and_links(tag, doc_heading, doc_base_url):
    """Map heading numbers mentioned in *doc_heading* to documentation URLs.

    Downloads the TEI reference page for *tag* and pairs, in document order,
    each heading number extracted from *doc_heading* with the corresponding
    cross-reference link found on that page.

    :param tag: TEI tag name, e.g. 'author' (page is at ref-<tag>.html)
    :param doc_heading: string containing section numbers like "3.12.2.2"
    :param doc_base_url: base URL of the TEI P5 HTML documentation
    :return: dict mapping heading number -> absolute documentation URL
    """
    # Extract heading numbers such as "3.12.2.2" from the documentation string
    heading_numbers = re.findall(r'\d+(?:\.\d+)*', doc_heading)
    # Download the HTML reference page for the tag; fail fast on HTTP errors
    # instead of parsing an error page (the original had no timeout or check)
    url = f"{doc_base_url}/ref-{tag}.html"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the links associated with each heading number
    links = {}
    for link in soup.find_all('a', class_='link_ptr'):
        heading_span = link.find('span', class_='headingNumber')
        if heading_span is None:
            # skip malformed cross-reference links instead of raising AttributeError
            continue
        links[heading_span.text.strip()] = f"{doc_base_url}/{link.get('href')}"
    # Pair headings with links positionally. The original comprehension's
    # "if heading in heading_numbers" filter was a tautology and is dropped.
    return dict(zip(heading_numbers, links.values()))
def generate_tag_docs(xsd_path):
    """Build a DataFrame documenting selected TEI <bibl> child tags.

    For each tag name, the description and section-heading string are read
    from the tag's <xs:documentation> annotation in the XSD, and the section
    headings are resolved to documentation URLs.

    :param xsd_path: path to the tei_all.xsd schema file
    :return: DataFrame with columns name, description, documentation, urls
    """
    namespaces = {'xs': 'http://www.w3.org/2001/XMLSchema'}
    doc_base_url = "https://www.tei-c.org/release/doc/tei-p5-doc/en/html"
    # BUGFIX: parse the schema file passed as a parameter — the original
    # hard-coded 'schema/tei/tei_all.xsd' here, ignoring xsd_path.
    tree = ET.parse(xsd_path)
    root = tree.getroot()
    # The unused xmlschema.XMLSchema/bibl_schema locals were removed; to derive
    # the tag list from the schema instead of the hard-coded list, use:
    #   schema = xmlschema.XMLSchema(xsd_path)
    #   names = [c.local_name for c in schema.find("tei:bibl").iterchildren()]
    names = ['author', 'biblScope', 'citedRange', 'date', 'edition', 'editor', 'idno', 'location', 'note', 'orgName',
             'publisher', 'pubPlace', 'ptr', 'series', 'title', 'volume', 'issue']
    data_list = []
    for name in tqdm(names, desc="Processing TEI tags"):
        doc_node = root.find(f".//xs:element[@name='{name}']/xs:annotation/xs:documentation", namespaces=namespaces)
        if doc_node is None:
            continue
        # Annotation text looks like "<description> [<section headings>]"
        matches = re.search(r'^(.*)\[(.*)]$', doc_node.text)
        if matches is None:
            continue
        description = matches.group(1)
        doc_heading = matches.group(2)
        doc_urls = extract_headings_and_links(name, doc_heading, doc_base_url)
        data_list.append({'name': name, 'description': description, 'documentation': doc_heading, 'urls': doc_urls})
    return pd.DataFrame(data_list)
cache_file = "schema/tei/tei-tags-documentation.json"
if os.path.isfile(cache_file):
    # reuse the previously generated documentation table
    df = pd.read_json(cache_file)
else:
    df = generate_tag_docs("schema/tei/tei_all.xsd")
    # to_json escapes "/" as "\/"; undo that for readability before caching
    json_str = df.to_json(index=False, orient='records', indent=4).replace(r"\/", "/")
    with open(cache_file, "w", encoding='utf-8') as f:
        f.write(json_str)
df
``` ```
%% Output %% Output
name description \ name description \
0 author (author) in a bibliographic reference, contain... 0 author (author) in a bibliographic reference, contain...
1 biblScope (scope of bibliographic reference) defines the... 1 biblScope (scope of bibliographic reference) defines the...
2 citedRange (cited range) defines the range of cited conte... 2 citedRange (cited range) defines the range of cited conte...
3 date (date) contains a date in any format. 3 date (date) contains a date in any format.
4 edition (edition) describes the particularities of one... 4 edition (edition) describes the particularities of one...
5 editor contains a secondary statement of responsibili... 5 editor contains a secondary statement of responsibili...
6 idno (identifier) supplies any form of identifier u... 6 idno (identifier) supplies any form of identifier u...
7 location (location) defines the location of a place as ... 7 location (location) defines the location of a place as ...
8 note (note) contains a note or annotation. 8 note (note) contains a note or annotation.
9 orgName (organization name) contains an organizational... 9 orgName (organization name) contains an organizational...
10 publisher (publisher) provides the name of the organizat... 10 publisher (publisher) provides the name of the organizat...
11 pubPlace (publication place) contains the name of the p... 11 pubPlace (publication place) contains the name of the p...
12 ptr (pointer) defines a pointer to another location. 12 ptr (pointer) defines a pointer to another location.
13 series (series information) contains information abou... 13 series (series information) contains information abou...
14 title (title) contains a title for any kind of work. 14 title (title) contains a title for any kind of work.
documentation \ documentation \
0 3.12.2.2. Titles, Authors, and Editors 2.2.1. ... 0 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
1 3.12.2.5. Scopes and Ranges in Bibliographic C... 1 3.12.2.5. Scopes and Ranges in Bibliographic C...
2 3.12.2.5. Scopes and Ranges in Bibliographic C... 2 3.12.2.5. Scopes and Ranges in Bibliographic C...
3 3.6.4. Dates and Times 2.2.4. Publication, Dis... 3 3.6.4. Dates and Times 2.2.4. Publication, Dis...
4 2.2.2. The Edition Statement 4 2.2.2. The Edition Statement
5 3.12.2.2. Titles, Authors, and Editors 5 3.12.2.2. Titles, Authors, and Editors
6 14.3.1. Basic Principles 2.2.4. Publication, D... 6 14.3.1. Basic Principles 2.2.4. Publication, D...
7 14.3.4. Places 7 14.3.4. Places
8 3.9.1. Notes and Simple Annotation 2.2.6. The ... 8 3.9.1. Notes and Simple Annotation 2.2.6. The ...
9 14.2.2. Organizational Names 9 14.2.2. Organizational Names
10 3.12.2.4. Imprint, Size of a Document, and Rep... 10 3.12.2.4. Imprint, Size of a Document, and Rep...
11 3.12.2.4. Imprint, Size of a Document, and Rep... 11 3.12.2.4. Imprint, Size of a Document, and Rep...
12 3.7. Simple Links and Cross-References 17.1. L... 12 3.7. Simple Links and Cross-References 17.1. L...
13 3.12.2.1. Analytic, Monographic, and Series Le... 13 3.12.2.1. Analytic, Monographic, and Series Le...
14 3.12.2.2. Titles, Authors, and Editors 2.2.1. ... 14 3.12.2.2. Titles, Authors, and Editors 2.2.1. ...
urls urls
0 {'3.12.2.2': 'https://www.tei-c.org/release/do... 0 {'3.12.2.2': 'https://www.tei-c.org/release/do...
1 {'3.12.2.5': 'https://www.tei-c.org/release/do... 1 {'3.12.2.5': 'https://www.tei-c.org/release/do...
2 {'3.12.2.5': 'https://www.tei-c.org/release/do... 2 {'3.12.2.5': 'https://www.tei-c.org/release/do...
3 {'3.6.4': 'https://www.tei-c.org/release/doc/t... 3 {'3.6.4': 'https://www.tei-c.org/release/doc/t...
4 {'2.2.2': 'https://www.tei-c.org/release/doc/t... 4 {'2.2.2': 'https://www.tei-c.org/release/doc/t...
5 {'3.12.2.2': 'https://www.tei-c.org/release/do... 5 {'3.12.2.2': 'https://www.tei-c.org/release/do...
6 {'14.3.1': 'https://www.tei-c.org/release/doc/... 6 {'14.3.1': 'https://www.tei-c.org/release/doc/...
7 {'14.3.4': 'https://www.tei-c.org/release/doc/... 7 {'14.3.4': 'https://www.tei-c.org/release/doc/...
8 {'3.9.1': 'https://www.tei-c.org/release/doc/t... 8 {'3.9.1': 'https://www.tei-c.org/release/doc/t...
9 {'14.2.2': 'https://www.tei-c.org/release/doc/... 9 {'14.2.2': 'https://www.tei-c.org/release/doc/...
10 {'3.12.2.4': 'https://www.tei-c.org/release/do... 10 {'3.12.2.4': 'https://www.tei-c.org/release/do...
11 {'3.12.2.4': 'https://www.tei-c.org/release/do... 11 {'3.12.2.4': 'https://www.tei-c.org/release/do...
12 {'3.7': 'https://www.tei-c.org/release/doc/tei... 12 {'3.7': 'https://www.tei-c.org/release/doc/tei...
13 {'3.12.2.1': 'https://www.tei-c.org/release/do... 13 {'3.12.2.1': 'https://www.tei-c.org/release/do...
14 {'3.12.2.2': 'https://www.tei-c.org/release/do... 14 {'3.12.2.2': 'https://www.tei-c.org/release/do...
%% Cell type:markdown id:aaf43ee43bb6d4d tags: %% Cell type:markdown id:aaf43ee43bb6d4d tags:
## Convert Ground Truth to TEI
%% Cell type:code id:b3ee84984b88f24a tags: %% Cell type:code id:b3ee84984b88f24a tags:
``` python ``` python
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import regex as re import regex as re
import glob import glob
import os import os
import xml.dom.minidom import xml.dom.minidom
import json import json
import xmlschema import xmlschema
from nameparser import HumanName from nameparser import HumanName
def even_num_brackets(string: str):
    """
    Simple heuristic: True when the string ends with a closing round or square
    bracket and the counts of that bracket pair are balanced. Callers use this
    to decide whether a trailing bracket should be kept rather than stripped.
    """
    balanced_round = string.endswith(")") and string.count(")") == string.count("(")
    balanced_square = string.endswith("]") and string.count("]") == string.count("[")
    return balanced_round or balanced_square
def remove_punctuation(text, keep_trailing_chars="?!"):
    """Strip leading and trailing punctuation using very simple rules for German and English.

    Trailing punctuation survives when it is listed in *keep_trailing_chars*
    or when it closes a balanced bracket pair (see even_num_brackets).
    Requires the third-party `regex` module (imported as `re`) for \\p{P}.

    :param text: string to trim
    :param keep_trailing_chars: trailing characters that are never stripped
    :return: trimmed string
    """
    start, end = 0, len(text)
    # Advance past leading punctuation. The original loop also tested
    # `and text[end - 1]`, which is always truthy for a non-empty slice and
    # has been removed; regex patterns are now raw strings.
    while start < len(text) and re.match(r"\p{P}", text[start]):
        start += 1
    # Retreat past trailing punctuation unless it should be kept
    while (end > start and re.match(r"\p{P}", text[end - 1])
           and not even_num_brackets(text[start:end])
           and text[end - 1] not in keep_trailing_chars):
        end -= 1
    return text[start:end].strip()
def remove_punctuation2(text):
    """Like remove_punctuation, but trailing periods are also kept."""
    return remove_punctuation(text, "?!.")
def clean_editor(text):
    """Strip an 'in:' prefix and editor designations such as '(hrsg.)' or '(eds.)'."""
    cleaned = remove_punctuation(text)
    cleaned = re.sub(r'^in(:| )', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\(?(hrsg\. v\.|hg\. v|hrsg\.|ed\.|eds\.)\)?', '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()
def clean_container(text):
    """Remove an 'in:'/'aus:'/'from:' prefix and surrounding punctuation from a container title."""
    without_prefix = re.sub(r'^(in|aus|from)(:| )', '', text.strip(), flags=re.IGNORECASE)
    return remove_punctuation(without_prefix)
def clean_pages(text):
    """Strip page-marker prefixes such as 'S.', 'p.', 'pp.', 'f.', 'ff.', 'seq.', 'seqq.'.

    BUGFIX: the original pattern contained an empty alternative
    (`...|ff?\\.||seqq?\\.`) which matched the empty string first and made
    `seqq?\\.` unreachable; the stray `|` is removed here.
    """
    return remove_punctuation(re.sub(r'^(S\.|p\.|pp\.|ff?\.|seqq?\.)', '', text.strip(), flags=re.IGNORECASE))
def extract_year(text):
    """Return the first four-digit year (1000-2999) found in *text*, or None."""
    match = re.search(r'[12][0-9]{3}', text)
    if match is None:
        return None
    return match.group(0)
def find_string(string, container):
    """Return the (start, end) indices of the first occurrence of *string* in *container*.

    :raises ValueError: if *string* does not occur in *container*
    """
    position = container.find(string)
    if position < 0:
        raise ValueError(f"Could not find '{string}' in '{container}'")
    return position, position + len(string)
def add_node(parent, tag, text="", attributes=None, clean_func=None, preserve=False):
    """
    Adds a child node to the parent, optionally adding text and attributes.
    If a clean_func is passed, the text is set after applying the function to it.
    If the `preserve` flag is True, the removed preceding or trailing text is preserved in the xml,
    outside of the node content.

    :param parent: element the new child is appended to
    :param tag: tag name of the new element
    :param text: raw text; may be reduced by clean_func before becoming node content
    :param attributes: optional dict of XML attributes for the new element
    :param clean_func: optional callable str -> str extracting the node content from `text`
    :param preserve: keep the stripped prefix/suffix as XML tails instead of discarding them
    :return: the newly created element
    """
    node = ET.SubElement(parent, tag, (attributes or {}))
    if clean_func:
        cleaned_text = clean_func(text)
        if preserve:
            # locate the cleaned text inside the original to recover what was stripped
            start, end = find_string(cleaned_text, text)
            prefix, suffix = text[:start], text[end:]
            # NOTE(review): the stripped prefix is only preserved when the new
            # node has at least one preceding sibling (len(parent) > 1); a
            # prefix on the first child is dropped — confirm this is intended
            if prefix !="" and len(parent) > 1:
                prev_sibling = parent[-2]
                prev_tail = (prev_sibling.tail or '')
                new_prev_tail = f'{prev_tail} {prefix}'.strip()
                prev_sibling.tail = new_prev_tail
        node.text = cleaned_text
        # NOTE(review): `suffix` is only bound when preserve is True; calling
        # with clean_func set and preserve=False would raise NameError here.
        # All call sites in this notebook pass preserve=True — verify before reuse.
        if suffix != "":
            node.tail = suffix
    else:
        node.text = text
    return node
def create_tei_root():
    """Create and return the <TEI> document root element in the TEI namespace."""
    attributes = {'xmlns': "http://www.tei-c.org/ns/1.0"}
    return ET.Element('TEI', attributes)
def create_tei_header(tei_root, title):
    """Append a minimal <teiHeader> (title statement, publisher, source description) to the TEI root."""
    tei_header = add_node(tei_root, 'teiHeader')
    file_desc = add_node(tei_header, 'fileDesc')
    add_node(add_node(file_desc, 'titleStmt'), 'title', title)
    add_node(add_node(file_desc, 'publicationStmt'), 'publisher', 'mpilhlt')
    add_node(add_node(file_desc, 'sourceDesc'), 'p', title)
    return tei_header
def create_body(text_root):
    """Create the <body> element with a placeholder paragraph (the article text is omitted)."""
    body = ET.SubElement(text_root, 'body')
    placeholder = 'The article text is not part of this document'
    add_node(body, 'p', placeholder)
    return body
def prettify(xml_string, indentation=" "):
    """Return a pretty-printed version of *xml_string* indented with *indentation*."""
    dom = xml.dom.minidom.parseString(xml_string)
    return dom.toprettyxml(indent=indentation)
def split_creators(text:str, bibl, tag, clean_func, preserve):
    """
    Split a creator string (authors/editors) on common separators and append
    one element per person to `bibl`, each holding a <persName> with
    <surname>/<forename> children parsed by nameparser.HumanName.

    :param text: raw creator string, e.g. "A. Smith; B. Jones und C. Doe"
    :param bibl: parent <bibl> element
    :param tag: tag for each creator element ('author', 'editor', 'publisher')
    :param clean_func: cleanup function passed through to add_node
    :param preserve: passed through to add_node (keeps stripped text as tails)
    """
    # separators: ";", "&", "/", " and ", " und "
    sep_regex = r'[;&/]| and | und '
    creators = re.split(sep_regex, text)
    # the actual separator strings, kept so they can be restored as tails below
    seperators = re.findall(sep_regex, text)
    for creator in creators:
        # <author>/<editor>
        creator_node = add_node(bibl, tag, creator, clean_func=clean_func, preserve=preserve)
        # <persName>
        name = HumanName(creator_node.text)
        # the creator's text moves into the <persName> children
        creator_node.text = ''
        pers_name = add_node(creator_node, 'persName')
        # invert nameparser's role->value dict to look up the role of each token
        inv_map = {v: k for k, v in name.as_dict(False).items()}
        if len(name) == 1:
            # a single token is treated as the surname
            add_node(pers_name, 'surname', list(name)[0])
        else:
            for elem in list(name):
                match inv_map[elem]:
                    case 'last':
                        # <surname>
                        add_node(pers_name, 'surname', elem)
                    case 'first' | 'middle':
                        # <forename>
                        add_node(pers_name, 'forename', elem)
        if len(seperators):
            # re-attach the original separator after this creator element
            creator_node.tail = seperators.pop(0).strip()
def anystyle_to_tei(input_xml_path, id, preserve=False): def anystyle_to_tei(input_xml_path, id, preserve=False):
anystyle_root = ET.parse(input_xml_path).getroot() anystyle_root = ET.parse(input_xml_path).getroot()
tei_root = create_tei_root() tei_root = create_tei_root()
create_tei_header(tei_root, title=id) create_tei_header(tei_root, title=id)
text_root = add_node(tei_root, 'text') text_root = add_node(tei_root, 'text')
body = create_body(text_root) body = create_body(text_root)
# <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography # <listBibl> element for <bibl> elements that are not in footnotes, such as a bibliography
listBibl = add_node(body, 'listBibl') listBibl = add_node(body, 'listBibl')
# iterate over all sequences (=footnotes) and translate into TEI equivalents # iterate over all sequences (=footnotes) and translate into TEI equivalents
for sequence in anystyle_root.findall('sequence'): for sequence in anystyle_root.findall('sequence'):
# if the sequence contains a citation-number, create a new <note> to add <bibl> elements to # if the sequence contains a citation-number, create a new <note> to add <bibl> elements to
if (cn:= sequence.findall('citation-number')): if (cn:= sequence.findall('citation-number')):
footnote_number = cn[0].text footnote_number = cn[0].text
attributes = { attributes = {
'n': footnote_number, 'n': footnote_number,
'type': 'footnote', 'type': 'footnote',
'place': 'bottom' 'place': 'bottom'
} }
node = add_node(text_root, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve) node = add_node(text_root, 'note', attributes=attributes, clean_func=remove_punctuation, preserve=preserve)
else: else:
# otherwise add to <listBibl> element # otherwise add to <listBibl> element
node = listBibl node = listBibl
bibl = None bibl = None
for child in sequence: for child in sequence:
tag = child.tag tag = child.tag
text = child.text text = child.text
if tag == "citation-number": continue # this has already been taken care of if tag == "citation-number": continue # this has already been taken care of
if (bibl is None # if we do not have a bibl element yet if (bibl is None # if we do not have a bibl element yet
or (bibl.find(tag) and tag != "note") # or tag already exists in the current element or (bibl.find(tag) and tag != "note") # or tag already exists in the current element
or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference or tag in ['signal', 'legal-ref'] # or tag belongs to a specific groups that signal a separate reference
or (tag in ["author", "editor", "authority"] and bibl.find('date'))): # or specific tags follow a date field or (tag in ["author", "editor", "authority"] and bibl.find('date'))): # or specific tags follow a date field
# then create a new bibl element # then create a new bibl element
bibl = ET.SubElement(node, 'bibl') bibl = ET.SubElement(node, 'bibl')
match tag: match tag:
case 'author': case 'author':
split_creators(text, bibl, 'author', clean_func=remove_punctuation, preserve=preserve) split_creators(text, bibl, 'author', clean_func=remove_punctuation, preserve=preserve)
case 'authority': case 'authority':
split_creators(text, bibl, 'publisher', clean_func=remove_punctuation, preserve=preserve) split_creators(text, bibl, 'publisher', clean_func=remove_punctuation, preserve=preserve)
case 'backref': case 'backref':
add_node(bibl, 'ref', text, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'ref', text, clean_func=remove_punctuation2, preserve=preserve)
case 'container-title': case 'container-title':
add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 'm'}, clean_func= clean_container, preserve=preserve)
case 'collection-title': case 'collection-title':
add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 's'}, clean_func= clean_container, preserve=preserve)
case 'date': case 'date':
add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve) add_node(bibl, 'date', text, clean_func= extract_year, preserve=preserve)
case 'edition': case 'edition':
add_node(bibl, 'edition', text, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'edition', text, clean_func=remove_punctuation2, preserve=preserve)
case 'editor': case 'editor':
split_creators(text, bibl, 'editor', clean_func=clean_editor, preserve=preserve) split_creators(text, bibl, 'editor', clean_func=clean_editor, preserve=preserve)
case 'location': case 'location':
add_node(bibl, 'pubPlace', text, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'pubPlace', text, clean_func=remove_punctuation, preserve=preserve)
case 'note': case 'note':
add_node(bibl, 'note', text, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'note', text, clean_func=remove_punctuation, preserve=preserve)
case 'journal': case 'journal':
add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve) add_node(bibl, 'title', text, {'level': 'j'}, clean_func= clean_container, preserve=preserve)
case 'legal-ref': case 'legal-ref':
add_node(bibl, 'ref', text, {'type': 'legal'}, clean_func = remove_punctuation, preserve=preserve) add_node(bibl, 'ref', text, {'type': 'legal'}, clean_func = remove_punctuation, preserve=preserve)
case 'pages': case 'pages':
if bibl[-1].tag == "ref": if bibl[-1].tag == "ref":
add_node(bibl, 'citedRange', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve) add_node(bibl, 'citedRange', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve)
else: else:
add_node(bibl, 'biblScope', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve) add_node(bibl, 'biblScope', text, {'unit': 'pp'}, clean_func= clean_pages, preserve=preserve)
case 'signal': case 'signal':
add_node(bibl, 'note', text, {'type': 'signal'}, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'note', text, {'type': 'signal'}, clean_func=remove_punctuation, preserve=preserve)
case 'title': case 'title':
add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation2, preserve=preserve) add_node(bibl, 'title', text, {'level': 'a'}, clean_func=remove_punctuation2, preserve=preserve)
case 'url': case 'url':
add_node(bibl, 'ptr', text, {'type':'web'}, clean_func=remove_punctuation, preserve=preserve) add_node(bibl, 'ptr', text, {'type':'web'}, clean_func=remove_punctuation, preserve=preserve)
case 'volume': case 'volume':
add_node(bibl, 'biblScope', text, {'unit': 'vol'}, clean_func = remove_punctuation, preserve=preserve) add_node(bibl, 'biblScope', text, {'unit': 'vol'}, clean_func = remove_punctuation, preserve=preserve)
if len(bibl) == 0: if len(bibl) == 0:
node.remove(bibl) node.remove(bibl)
if len(listBibl) == 0: if len(listBibl) == 0:
body.remove(listBibl) body.remove(listBibl)
return ET.tostring(tei_root, 'unicode') return ET.tostring(tei_root, 'unicode')
def tei_to_json(tei_xml, schema):
    """Convert TEI XML to a JsonML JSON string, validating against *schema*."""
    jsonml = xmlschema.to_dict(tei_xml, schema=schema, converter=xmlschema.JsonMLConverter)
    return json.dumps(jsonml, default=str)
# main
# XML->JSON conversion doesn't provide anything useful, so the schema-based
# JSON export below stays disabled.
# tei_xsd_path = "schema/tei/tei_all.xsd"
# if 'schema' not in locals():
#     print("Parsing schema file, please wait...")
#     schema = xmlschema.XMLSchema(tei_xsd_path)
os.makedirs('tei', exist_ok=True)  # the original crashed if the output dir was missing
for input_path in glob.glob('anystyle/*.xml'):
    base_name = os.path.basename(input_path)
    doc_id = os.path.splitext(base_name)[0]  # renamed from `id` to avoid shadowing the builtin
    print(f'Converting {base_name} into TEI-XML ...')
    output_xml = anystyle_to_tei(input_path, doc_id, preserve=True)
    # output_json = tei_to_json(output_xml, schema)
    with open(f'tei/{doc_id}.xml', 'w', encoding='utf-8') as f:
        f.write(prettify(output_xml))
    # with open(f'tei/{doc_id}.json', 'w', encoding='utf-8') as f:
    #     f.write(output_json)
``` ```
%% Output %% Output
Converting 10.1111_1467-6478.00057.xml into TEI-XML ... Converting 10.1111_1467-6478.00057.xml into TEI-XML ...
Converting 10.1111_1467-6478.00080.xml into TEI-XML ... Converting 10.1111_1467-6478.00080.xml into TEI-XML ...
Converting 10.1515_zfrs-1980-0103.xml into TEI-XML ... Converting 10.1515_zfrs-1980-0103.xml into TEI-XML ...
Converting 10.1515_zfrs-1980-0104.xml into TEI-XML ... Converting 10.1515_zfrs-1980-0104.xml into TEI-XML ...
%% Cell type:markdown id:8c8b2d820086d461 tags:
%% Cell type:markdown id:b0a231dc7bdd8b01 tags: %% Cell type:markdown id:b0a231dc7bdd8b01 tags:
## Create LinkML schema from TEI XSD ## Extract bibliographic data from TEI files
%% Cell type:markdown id:149588c08747c4b3 tags:
### Download XSLTs
%% Cell type:code id:1f15b3af6aab73ed tags:
``` python
import requests, zipfile, io, os

# Download and unpack the TEI-to-bibliographic-data XSLT stylesheets (once).
if not os.path.isdir('lib/convert'):
    url = 'https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data/archive/refs/heads/master.zip'
    r = requests.get(url, timeout=60)
    # raise_for_status() instead of `assert r.status_code == 200`, which is
    # silently stripped when Python runs with -O
    r.raise_for_status()
    # context manager guarantees the archive handle is closed
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall('lib')
    os.rename('lib/convert_tei-to-bibliographic-data-master', 'lib/convert')
```
%% Cell type:markdown id:aa86435960e61937 tags: %% Cell type:markdown id:aa86435960e61937 tags:
### Apply XSLT
%% Cell type:code id:cb3b4140ab153c08 tags:
``` python
from lxml import etree
def apply_xslt_to_xml(xslt_path, xml_path):
    """Apply the XSLT stylesheet at *xslt_path* to the document at *xml_path* and return the result as a string."""
    stylesheet = etree.parse(xslt_path)
    document = etree.parse(xml_path)
    transform = etree.XSLT(stylesheet)
    return str(transform(document))
# NOTE(review): placeholder paths — replace with real stylesheet/document files
# before running this cell, otherwise etree.parse raises OSError
new_xml_str = apply_xslt_to_xml('path_to_your_xslt_file', 'path_to_your_xml_file')
print(new_xml_str)
```
%% Cell type:markdown id:387b5b9792505b13 tags:
......
*
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment