# Convert the generated TEI to bibliographic formats


## Download required XSLT documents

We use the XSLT stylesheets provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data 

In [30]:
import os
from urllib.parse import urljoin
import requests
from lxml import etree

# URLs that have already been fetched (or queued for fetching) in this kernel session.
cache = set()

def download_xslt(url, target_dir = 'lib/xslt', timeout = 30):
    """Download an XML/XSLT document and, recursively, everything it imports.

    The document at `url` is parsed and every element whose local name is
    ``import`` is followed; its ``href`` is resolved relative to `url`.
    All files are written flat into `target_dir` under their base name.
    (The original version of this function was written by GPT-4.)

    Parameters:
        url: absolute URL of the document to download.
        target_dir: local directory the files are written to (created if missing).
        timeout: seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the notebook forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        lxml.etree.XMLSyntaxError: if the response is not well-formed XML.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Remember the document itself so that circular imports do not fetch it again.
    cache.add(url)
    doc = etree.fromstring(response.content)
    for elem in doc.xpath('//*[local-name() = "import"]'):
        href = elem.get('href')
        if not href:
            # Without an href, urljoin() would return `url` itself and re-download it.
            continue
        import_url = urljoin(url, href)
        if import_url not in cache:
            cache.add(import_url)
            download_xslt(import_url, target_dir, timeout)
    os.makedirs(target_dir, exist_ok=True)
    file_name = os.path.basename(url)
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name} to {target_dir}')

# TEI -> biblStruct / MODS / Zotero RDF stylesheets
base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'
xslt_docs = [
    'convert_tei-to-biblstruct_bibl.xsl',
    'convert_tei-to-mods_bibl.xsl',
    'convert_tei-to-zotero-rdf_bibl.xsl',
]
for xslt_doc in xslt_docs:
    download_xslt(base_url + '/' + xslt_doc)

# MODS -> Bibframe stylesheet and the language crosswalk file it is downloaded with
base_url = 'https://www.loc.gov/standards/mods/v3'
xslt_docs = ['MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', 'conf/languageCrosswalk.xml']
for xslt_doc in xslt_docs:
    # Mirror the remote sub-folder (e.g. "conf") below lib/xslt
    sub_dir = os.path.dirname(xslt_doc)
    download_xslt(base_url + '/' + xslt_doc, target_dir='lib/xslt/' + sub_dir)


Downloaded parameters.xsl to lib/xslt
Downloaded functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt
Downloaded date-functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt
Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt
Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/
Downloaded languageCrosswalk.xml to lib/xslt/conf


## Download the Saxon jar

As the XSLT stylesheets use XSLT 2.0 features, and there are no native Python XSLT 2.0 processors, we need to use the Saxon processor. Possible alternatives (untested):
 - https://pypi.org/project/saxonpy
 - https://github.com/cts2/pyjxslt


In [86]:
import requests
import zipfile
import io
import os
# Saxon-HE 12.5 (Java) release archive
url = "https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip"
# Must be the folder that transform() runs the jar from: lib/SaxonHE12-5J/saxon-he-12.5.jar
target_dir = 'lib/SaxonHE12-5J'

# Skip the download when the jar is already in place, so re-running the notebook stays cheap.
if not os.path.exists(os.path.join(target_dir, 'saxon-he-12.5.jar')):
    response = requests.get(url, timeout=120)
    # Fail on an HTTP error here instead of with an obscure BadZipFile further down.
    response.raise_for_status()
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:
        file_zip.extractall(path=target_dir)

## Run the transformation to biblStruct & MODS

In [53]:
import subprocess
import os
from glob import glob

def transform(xslt_path, input_path='tei', output_path='.', rename_extension:tuple=None,
              saxon_jar='lib/SaxonHE12-5J/saxon-he-12.5.jar'):
    """Apply an XSLT stylesheet to `input_path` by running Saxon via ``java -jar``.

    Parameters:
        xslt_path: path of the stylesheet to apply.
        input_path: source passed to Saxon as ``-s:``.
        output_path: target passed to Saxon as ``-o:`` and as the stylesheet
            parameter ``p_output-folder``.
        rename_extension: optional ``(from_suffix, to_suffix)`` pair. After a
            successful run, every ``*.xml`` file directly inside `output_path`
            whose name ends with ``from_suffix`` gets that suffix replaced by
            ``to_suffix``.
        saxon_jar: path of the Saxon jar to run.

    Returns:
        The ``subprocess.CompletedProcess`` of the Saxon run.

    Raises:
        RuntimeError: if Saxon exits with a non-zero return code; the message
            is Saxon's stderr output. No files are renamed in that case.
    """
    input_path = os.path.normpath(input_path)
    xslt_path = os.path.normpath(xslt_path)
    cmd = ['java', '-jar', saxon_jar,
           f'-s:{input_path}', f'-xsl:{xslt_path}', f'-o:{output_path}',
           'p_target-language=de', 'p_github-action=true', f'p_output-folder={output_path}']
    process = subprocess.run(cmd, capture_output=True, text=True)
    # Check for failure first so that a broken run does not rename partial output.
    if process.returncode != 0:
        raise RuntimeError(process.stderr)
    if rename_extension:
        from_extension, to_extension = rename_extension
        for filename in glob(f'{output_path}/*.xml'):
            if filename.endswith(from_extension):
                # Swap only the trailing suffix; str.replace() would also rewrite
                # any earlier occurrence of the same text in the path.
                stem = filename[:len(filename) - len(from_extension)]
                os.replace(filename, stem + to_extension)
    print(f'Applied {xslt_path} to files in {input_path} and saved result in {output_path}.')
    return process

In [54]:
# TEI -> biblStruct
transform(
    xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',
    output_path='biblStruct',
    rename_extension=('-bibl_biblStruct.TEIP5.xml', '.biblStruct.xml'),
)

# TEI -> MODS
transform(
    xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl',
    output_path='mods',
    rename_extension=('-bibl.MODS.xml', '.mods.xml'),
)

# Clean up both output folders: a file whose name starts with the folder name
# is kept with that prefix stripped; every other file is deleted
# (the original note calls these "unwanted empty files").
for dir_name in ('biblStruct', 'mods'):
    for filename in glob(dir_name + '/*'):
        base_name = os.path.basename(filename)
        if not base_name.startswith(dir_name):
            os.remove(filename)
            continue
        os.replace(filename, f'{os.path.dirname(filename)}/{base_name.removeprefix(dir_name)}')


Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct.
Applied lib\xslt\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods.


## Convert MODS to RIS tagged file format

This requires installing the [Bibutils suite of executables](https://sourceforge.net/p/bibutils/home/Bibutils), which is available in most distros.
If you are on Windows, you will need to install it to the standard WSL distro.

In [55]:
import subprocess
import platform

# Run the Bibutils wrapper script with xml2ris; on Windows it is launched through WSL.
wsl_prefix = ['wsl.exe', '-e'] if platform.system() == 'Windows' else []
cmd = wsl_prefix + ['bash', 'lib/run-bibutils.sh', 'xml2ris']
# stderr is merged into stdout; a non-zero exit status raises CalledProcessError.
completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
output = completed.stdout
print(output.decode())

Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris...
Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris...


## Convert MODS -> Bibframe RDF -> JSON-LD

See:
 - https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html
 - https://rdflib.readthedocs.io/ 

In [56]:
# MODS -> Bibframe: convert everything in mods/ and give the results a .bibframe.xml suffix
transform(
    xslt_path='lib/xslt/MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',
    input_path='mods',
    output_path='bibframe',
    rename_extension=('.mods.xml', '.bibframe.xml'),
)

Applied lib\xslt\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe.


CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='')

In [57]:
from rdflib import Graph
# Bibframe RDF/XML -> JSON-LD: parse each file in bibframe/ and re-serialize it.
# Make sure the output folder exists before the first file is written;
# nothing earlier in the notebook creates it.
os.makedirs('json-ld', exist_ok=True)
for in_path in glob('bibframe/*'):
    out_file = os.path.basename(in_path).replace('.bibframe.xml','.json')
    g = Graph()
    g.parse(in_path)
    g.serialize(destination=f'json-ld/{out_file}', format='json-ld')


## Convert MODS -> BibTeX -> CSL-JSON

This also requires Bibutils and additionally, the [pandoc executable](https://pandoc.org/installing.html).


In [2]:
# MODS -> BibTeX
import subprocess
import platform

# Run the Bibutils wrapper script with xml2bib; on Windows it is launched through WSL.
wsl_prefix = ['wsl.exe', '-e'] if platform.system() == 'Windows' else []
cmd = wsl_prefix + ['bash', 'lib/run-bibutils.sh', 'xml2bib']
# stderr is merged into stdout; a non-zero exit status raises CalledProcessError.
completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
output = completed.stdout
print(output.decode())

Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib...
Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib...


In [4]:
# BibTeX to CSL

# Run the pandoc wrapper script (bibtex -> csljson); on Windows it is launched through WSL.
wsl_prefix = ['wsl.exe', '-e'] if platform.system() == 'Windows' else []
cmd = wsl_prefix + ['bash', 'lib/run-pandoc.sh', 'bibtex', 'csljson']
# stderr is merged into stdout; a non-zero exit status raises CalledProcessError.
completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
output = completed.stdout
print(output.decode())

Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json...
Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json...
