Skip to content
Snippets Groups Projects
Commit 2d597648 authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Download Saxon programmatically

parent cf342253
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:2cdf8ba1eefa38e0 tags: %% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
# Convert the generated TEI to bibliographic formats # Convert the generated TEI to bibliographic formats
%% Cell type:markdown id:db65c4065691c578 tags: %% Cell type:markdown id:db65c4065691c578 tags:
## Download required XSLT documents ## Download required XSLT documents
We use the XSLT stylesheets provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
%% Cell type:code id:1de7cedbb3514188 tags: %% Cell type:code id:1de7cedbb3514188 tags:
``` python ``` python
import os import os
from urllib.parse import urljoin from urllib.parse import urljoin
import requests import requests
from lxml import etree from lxml import etree
def download_xslt(url, target_dir = 'lib/xslt', _seen=None):
    """Download an XSLT stylesheet and, recursively, the stylesheets it references.

    The document at `url` is fetched and parsed; every element whose local name
    is ``import`` or ``include`` and that carries an ``href`` is resolved
    relative to `url` and downloaded first. All files are written flat into
    `target_dir` under the last path segment of their URL.

    Args:
        url: Absolute URL of the stylesheet to download.
        target_dir: Directory the files are written to (created if missing).
        _seen: Internal set of URLs already handled during this call tree;
            callers should leave it unset.

    Raises:
        requests.HTTPError: If a download returns an HTTP error status.
    """
    if _seen is None:
        _seen = set()
    if url in _seen:
        # Already fetched in this run: avoids duplicate downloads and
        # endless recursion on circular references.
        return
    _seen.add(url)

    response = requests.get(url, timeout=60)
    response.raise_for_status()
    doc = etree.fromstring(response.content)
    for elem in doc.xpath('//*[local-name() = "import" or local-name() = "include"]'):
        href = elem.get('href')
        if not href:
            # urljoin(url, None) returns `url` itself, which would make the
            # function call itself on the same document forever.
            continue
        # Construct a full URL based on the href attribute relative to the original url
        download_xslt(urljoin(url, href), target_dir, _seen)
    os.makedirs(target_dir, exist_ok=True)
    file_name = os.path.basename(url)
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name}')
# Base URL under which the OpenArabicPE stylesheets are published.
base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'
# Entry-point stylesheets; download_xslt fetches their imports recursively.
# The biblStruct entry point is listed explicitly because the transformation
# step of this notebook runs it, and it is not an import of the MODS one.
# NOTE(review): assumes convert_tei-to-biblstruct_bibl.xsl is published under
# base_url like its MODS sibling - confirm.
xslt_docs = [
    'convert_tei-to-biblstruct_bibl.xsl',
    'convert_tei-to-mods_bibl.xsl',
]
for xslt_doc in xslt_docs:
    download_xslt(f'{base_url}/{xslt_doc}')
``` ```
%% Output %% Output
Downloaded date-functions.xsl Downloaded date-functions.xsl
Downloaded parameters.xsl Downloaded parameters.xsl
Downloaded functions.xsl Downloaded functions.xsl
Downloaded convert_tei-to-biblstruct_functions.xsl Downloaded convert_tei-to-biblstruct_functions.xsl
Downloaded convert_tei-to-mods_functions.xsl Downloaded convert_tei-to-mods_functions.xsl
Downloaded convert_tei-to-mods_bibl.xsl Downloaded convert_tei-to-mods_bibl.xsl
%% Cell type:markdown id:d08d51f8767602c5 tags: %% Cell type:markdown id:781d0e0e7a9dd346 tags:
## Extract bibliographic data from TEI files using XSLT ## Download the Saxon jar
### Using lxml - currently not working As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor
%% Cell type:code id:af437a5ab3cc41a3 tags: %% Cell type:code id:72b688e9b2e0d1f2 tags:
``` python ``` python
from lxml import etree
import glob
import requests import requests
import zipfile
import io
import os


# --- lxml-based attempt (the notebook marks it as currently not working) -----
def apply_xslt(xslt_path, xml_input_path, xml_output_path):
    """Apply the stylesheet at `xslt_path` to one XML file using lxml.

    On success the serialized result is written to `xml_output_path` as UTF-8
    text. If lxml cannot compile the stylesheet, the error is printed and no
    output file is written.
    """
    try:
        xslt_doc = etree.parse(xslt_path)
        xml_doc = etree.parse(xml_input_path)
        transformer = etree.XSLT(xslt_doc)
        new_xml = transformer(xml_doc)
        with open(xml_output_path, 'w', encoding='utf-8') as f:
            # The transform result is a tree object, not a str; serialize it
            # before handing it to the text-mode file.
            f.write(str(new_xml))
    except etree.XSLTParseError as e:
        print(f"Error parsing XSLT file at {xslt_path}: {e}")


for input_path in glob.glob('tei/*.xml'):
    print(f'Converting {input_path}')
    base_name = os.path.basename(input_path)
    output_path = f'tmp/{base_name.replace(".xml", "-mods.xml")}'
    apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)

# --- Download and unpack the Saxon-HE distribution ---------------------------
url = "https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip"
# Must be the directory of the jar that the transformation step passes to
# `java -jar` (lib/SaxonHE12-5J/saxon-he-12.5.jar).
target_dir = 'lib/SaxonHE12-5J'
response = requests.get(url, timeout=300)
# Fail on an HTTP error instead of handing an error page to zipfile.
response.raise_for_status()
os.makedirs(target_dir, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:
    file_zip.extractall(path=target_dir)
``` ```
%% Output %% Cell type:markdown id:1bbb36ac0f4fd1b5 tags:
Converting tei\10.1111_1467-6478.00057.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1111_1467-6478.00080.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1515_zfrs-1980-0103.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1515_zfrs-1980-0104.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
%% Cell type:markdown id:781d0e0e7a9dd346 tags:
### Using Saxon:
- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip ## Run the transformation
- unpack in lib/SaxonHE12-5J
%% Cell type:code id:34087ef2f498ffa6 tags: %% Cell type:code id:34087ef2f498ffa6 tags:
``` python ``` python
import subprocess import subprocess
import os import os
def transform_tei(xslt_path, file_path='tei', output_path='.',
                  saxon_jar='lib/SaxonHE12-5J/saxon-he-12.5.jar',
                  params=('p_target-language=de', 'p_github-action=true')):
    """Run an XSLT stylesheet over TEI input with the Saxon command-line processor.

    Args:
        xslt_path: Path of the stylesheet, passed to Saxon as ``-xsl:``.
        file_path: Source path, passed to Saxon as ``-s:``.
        output_path: Output path, passed to Saxon as ``-o:``.
        saxon_jar: Path of the Saxon jar handed to ``java -jar``.
        params: Stylesheet parameters as ``name=value`` strings, appended to
            the command line in the given order.

    Returns:
        The ``subprocess.CompletedProcess`` of the run (stdout/stderr captured
        as text).

    Raises:
        RuntimeError: If the process exits with a non-zero status. The message
            is its stderr, or the exit status when stderr is empty.
    """
    file_path = os.path.normpath(file_path)
    xslt_path = os.path.normpath(xslt_path)
    cmd = ['java', '-jar', saxon_jar,
           f'-s:{file_path}',
           f'-xsl:{xslt_path}',
           f'-o:{output_path}',
           *params]
    process = subprocess.run(cmd, capture_output=True, text=True)
    if process.returncode != 0:
        # Never raise with an empty message, even if Saxon wrote nothing to stderr.
        raise RuntimeError(process.stderr or f'Saxon exited with status {process.returncode}')
    return process
# TEI -> biblStruct, written to the 'biblStruct' directory.
transform_tei('lib/xslt/convert_tei-to-biblstruct_bibl.xsl', output_path='biblStruct')
# TEI -> MODS, written to the 'mods' directory. Kept as the last expression so
# the notebook still displays its CompletedProcess.
transform_tei('lib/xslt/convert_tei-to-mods_bibl.xsl', output_path='mods')
``` ```
%% Output %% Output
CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:tei', '-xsl:lib\\xslt\\convert_tei-to-mods_bibl.xsl', '-o:mods', 'p_target-language=de', 'p_github-action=true'], returncode=0, stdout='', stderr='') CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:tei', '-xsl:lib\\xslt\\convert_tei-to-mods_bibl.xsl', '-o:mods', 'p_target-language=de', 'p_github-action=true'], returncode=0, stdout='', stderr='')
%% Cell type:markdown id:5e75488ae4379946 tags: %% Cell type:markdown id:5e75488ae4379946 tags:
## Convert MODS to RIS tagged file format ## Convert MODS to RIS tagged file format
This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most Linux distributions.
(On Windows, you will need to install it in the default WSL distribution.)
%% Cell type:code id:fde37a9e4a182bad tags: %% Cell type:code id:fde37a9e4a182bad tags:
``` python ``` python
import subprocess import subprocess
import platform import platform
# lib/xml2ris.sh is run with bash; on Windows the command is routed through WSL.
wsl_prefix = ['wsl.exe', '-e'] if platform.system() == 'Windows' else []
cmd = wsl_prefix + ['bash', 'lib/xml2ris.sh']
# stderr is folded into stdout so everything the script reports gets printed.
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
print(output.decode())
``` ```
%% Output %% Output
Converted mods/metadata/10.1111_1467-6478.00057-bibl.MODS.xml to ris/10.1111_1467-6478.00057.ris Converted mods/metadata/10.1111_1467-6478.00057-bibl.MODS.xml to ris/10.1111_1467-6478.00057.ris
xml2ris: Processed 68 references. xml2ris: Processed 68 references.
Converted mods/metadata/10.1111_1467-6478.00080-bibl.MODS.xml to ris/10.1111_1467-6478.00080.ris Converted mods/metadata/10.1111_1467-6478.00080-bibl.MODS.xml to ris/10.1111_1467-6478.00080.ris
xml2ris: Processed 40 references. xml2ris: Processed 40 references.
Converted mods/metadata/10.1515_zfrs-1980-0103-bibl.MODS.xml to ris/10.1515_zfrs-1980-0103.ris Converted mods/metadata/10.1515_zfrs-1980-0103-bibl.MODS.xml to ris/10.1515_zfrs-1980-0103.ris
xml2ris: Processed 36 references. xml2ris: Processed 36 references.
Converted mods/metadata/10.1515_zfrs-1980-0104-bibl.MODS.xml to ris/10.1515_zfrs-1980-0104.ris Converted mods/metadata/10.1515_zfrs-1980-0104-bibl.MODS.xml to ris/10.1515_zfrs-1980-0104.ris
xml2ris: Processed 82 references. xml2ris: Processed 82 references.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment