Skip to content
Snippets Groups Projects
Commit fc6b1dd6 authored by Christian Boulanger's avatar Christian Boulanger
Browse files
parent 9ec1a8dd
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:2cdf8ba1eefa38e0 tags: %% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
# Convert the generated TEI to bibliographic formats # Convert the generated TEI to bibliographic formats
%% Cell type:markdown id:db65c4065691c578 tags: %% Cell type:markdown id:db65c4065691c578 tags:
## Download required XSLT documents ## Download required XSLT documents
we use XSLT provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data we use XSLT provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
%% Cell type:code id:1de7cedbb3514188 tags: %% Cell type:code id:1de7cedbb3514188 tags:
``` python ``` python
import os import os
from urllib.parse import urljoin from urllib.parse import urljoin
import requests import requests
from lxml import etree from lxml import etree
cache = set() cache = set()
def download_xslt(url, target_dir = 'lib/xslt'):
    """Download an XML/XSLT document together with the stylesheets it references.

    The document at ``url`` is fetched and parsed. Every element named
    ``import`` or ``include`` that carries an ``href`` attribute is resolved
    against ``url`` and downloaded first (recursively), unless that URL is
    already recorded in the module-level ``cache``. All files are written
    flat into ``target_dir`` under the last path segment of their URL.

    (The first version of this function was written by GPT-4.)

    Args:
        url: Absolute URL of the document to download.
        target_dir: Directory receiving the files; created if it is missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        lxml.etree.XMLSyntaxError: If the response body is not well-formed XML.
    """
    # Record this document too, so that a stylesheet which references it
    # later does not trigger a second download.
    cache.add(url)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = etree.fromstring(response.content)
    # xsl:include pulls in external stylesheets just like xsl:import does.
    for elem in doc.xpath('//*[local-name() = "import" or local-name() = "include"]'):
        href = elem.get('href')
        if not href:
            # e.g. an XML Schema import, which has no href attribute
            continue
        import_url = urljoin(url, href)
        if import_url not in cache:
            cache.add(import_url)
            download_xslt(import_url, target_dir)
    os.makedirs(target_dir, exist_ok=True)
    file_name = os.path.basename(url)
    # NOTE(review): references are stored flat in target_dir, so an href with
    # a sub-directory part would not resolve locally - confirm this is intended.
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name} to {target_dir}')
# Stylesheets for TEI -> biblStruct / MODS / Zotero RDF
base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'
xslt_docs = [
    'convert_tei-to-biblstruct_bibl.xsl',
    'convert_tei-to-mods_bibl.xsl',
    'convert_tei-to-zotero-rdf_bibl.xsl',
]
for xslt_doc in xslt_docs:
    download_xslt('/'.join((base_url, xslt_doc)))

# Stylesheet and language table for MODS -> BIBO-RDF
base_url = 'https://www.loc.gov/standards/mods/v3'
xslt_docs = [
    'MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',
    'conf/languageCrosswalk.xml',
]
for xslt_doc in xslt_docs:
    # keep the sub-directory of the document (if any) below lib/xslt
    download_xslt(
        '/'.join((base_url, xslt_doc)),
        target_dir='lib/xslt/' + os.path.dirname(xslt_doc),
    )
``` ```
%% Output %% Output
Downloaded parameters.xsl to lib/xslt Downloaded parameters.xsl to lib/xslt
Downloaded functions.xsl to lib/xslt Downloaded functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt
Downloaded date-functions.xsl to lib/xslt Downloaded date-functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_functions.xsl to lib/xslt Downloaded convert_tei-to-mods_functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt
Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt
Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/ Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/
Downloaded languageCrosswalk.xml to lib/xslt/conf Downloaded languageCrosswalk.xml to lib/xslt/conf
%% Cell type:markdown id:781d0e0e7a9dd346 tags: %% Cell type:markdown id:781d0e0e7a9dd346 tags:
## Download the Saxon jar ## Download the Saxon jar
As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor (I haven't tried https://pypi.org/project/saxonpy). As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor. Possible alternatives (untested):
- https://pypi.org/project/saxonpy
- https://github.com/cts2/pyjxslt
%% Cell type:code id:72b688e9b2e0d1f2 tags: %% Cell type:code id:72b688e9b2e0d1f2 tags:
``` python ``` python
import requests import requests
import zipfile import zipfile
import io import io
import os import os
# Saxon-HE 12.5 release archive (Java edition)
url = "https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip"
# Must match the directory of the jar that transform() runs
# ('lib/SaxonHE12-5J/saxon-he-12.5.jar').
# NOTE(review): assumes the jar sits at the top level of the archive - confirm.
target_dir = 'lib/SaxonHE12-5J'
response = requests.get(url, timeout=120)
# Fail here on an HTTP error instead of handing an error page to zipfile.
response.raise_for_status()
os.makedirs(target_dir, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:
    file_zip.extractall(path=target_dir)
``` ```
%% Cell type:markdown id:1bbb36ac0f4fd1b5 tags: %% Cell type:markdown id:1bbb36ac0f4fd1b5 tags:
## Run the transformation to biblStruct & MODS ## Run the transformation to biblStruct & MODS
%% Cell type:code id:d4a6c9620d0199eb tags: %% Cell type:code id:d4a6c9620d0199eb tags:
``` python ``` python
import subprocess import subprocess
import os import os
from glob import glob from glob import glob
def transform(xslt_path, input_path='tei', output_path='.', rename_extension:tuple=None):
    """Apply an XSLT stylesheet with the Saxon-HE jar and optionally rename the results.

    Runs ``java -jar lib/SaxonHE12-5J/saxon-he-12.5.jar`` with ``input_path``
    as source (``-s:``), ``xslt_path`` as stylesheet (``-xsl:``) and
    ``output_path`` as output (``-o:``). The stylesheet parameters
    ``p_target-language=de``, ``p_github-action=true`` and
    ``p_output-folder=<output_path>`` are always passed.

    Args:
        xslt_path: Path of the stylesheet to apply.
        input_path: Source path handed to Saxon.
        output_path: Output path handed to Saxon; also the folder searched
            for files to rename.
        rename_extension: Optional ``(from_extension, to_extension)`` pair.
            Every ``*.xml`` file in ``output_path`` whose name ends with
            ``from_extension`` gets that suffix replaced by ``to_extension``.

    Returns:
        The ``subprocess.CompletedProcess`` of the Saxon run.

    Raises:
        RuntimeError: If Saxon exits with a non-zero return code; the message
            is Saxon's stderr output.
    """
    input_path = os.path.normpath(input_path)
    xslt_path = os.path.normpath(xslt_path)
    cmd = ['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar',
           f'-s:{input_path}', f'-xsl:{xslt_path}', f'-o:{output_path}',
           'p_target-language=de', 'p_github-action=true', f'p_output-folder={output_path}']
    process = subprocess.run(cmd, capture_output=True, text=True)
    # Check for failure first, so the output of a broken run is not renamed.
    if process.returncode != 0:
        raise RuntimeError(process.stderr)
    if rename_extension:
        from_extension, to_extension = rename_extension[:2]
        for filename in glob(f'{output_path}/*.xml'):
            if filename.endswith(from_extension):
                # Swap only the trailing extension; str.replace() would also
                # rewrite any earlier occurrence of it in the path.
                os.replace(filename, filename.removesuffix(from_extension) + to_extension)
    print(f'Applied {xslt_path} to files in {input_path} and saved result in {output_path}.')
    return process
``` ```
%% Cell type:code id:34087ef2f498ffa6 tags: %% Cell type:code id:34087ef2f498ffa6 tags:
``` python ``` python
# TEI -> biblStruct
transform(
    xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',
    output_path='biblStruct',
    rename_extension=('-bibl_biblStruct.TEIP5.xml', '.biblStruct.xml'),
)

# TEI -> MODS
transform(
    xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl',
    output_path='mods',
    rename_extension=('-bibl.MODS.xml', '.mods.xml'),
)

# Keep the files whose name starts with the folder name (dropping that
# prefix) and delete everything else in the folder.
for dir_name in ['biblStruct', 'mods']:
    for filename in glob(f'{dir_name}/*'):
        base_name = os.path.basename(filename)
        if not base_name.startswith(f'{dir_name}'):
            os.remove(filename)
            continue
        os.replace(filename, f'{os.path.dirname(filename)}/{base_name.removeprefix(dir_name)}')
``` ```
%% Output %% Output
Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct. Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct.
Applied lib\xslt\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods. Applied lib\xslt\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods.
%% Cell type:markdown id:5e75488ae4379946 tags: %% Cell type:markdown id:5e75488ae4379946 tags:
## Convert MODS to RIS tagged file format ## Convert MODS to RIS tagged file format
This requires installing the [Bibutils suite of executables](https://sourceforge.net/p/bibutils/home/Bibutils), which is available in most distros.
If you are on Windows, you will need to install it to the standard WSL distro. If you are on Windows, you will need to install it to the standard WSL distro.
%% Cell type:code id:fde37a9e4a182bad tags: %% Cell type:code id:fde37a9e4a182bad tags:
``` python ``` python
import subprocess import subprocess
import platform import platform
# Convert the MODS files to RIS with the project's bibutils wrapper script.
cmd = ['bash', 'lib/run-bibutils.sh', 'xml2ris']
if platform.system() == 'Windows':
    # On Windows the script is executed inside WSL.
    cmd = ['wsl.exe', '-e', *cmd]
# stderr is merged into stdout so that everything the script reports is printed
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
print(output.decode())
``` ```
%% Output %% Output
Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris... Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris...
Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris... Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris... Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris... Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris...
%% Cell type:markdown id:61f6cfe7d3de482a tags: %% Cell type:markdown id:61f6cfe7d3de482a tags:
## Convert MODS -> Bibframe RDF -> JSON-LD ## Convert MODS -> Bibframe RDF -> JSON-LD
See: See:
- https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html - https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html
- https://rdflib.readthedocs.io/ - https://rdflib.readthedocs.io/
%% Cell type:code id:6ba739963096f858 tags: %% Cell type:code id:6ba739963096f858 tags:
``` python ``` python
# MODS -> Bibframe: read from 'mods', write to 'bibframe' and give the
# results a '.bibframe.xml' extension.
transform(
    xslt_path='lib/xslt/MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',
    input_path='mods',
    output_path='bibframe',
    rename_extension=('.mods.xml', '.bibframe.xml'),
)
``` ```
%% Output %% Output
Applied lib\xslt\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe. Applied lib\xslt\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe.
CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='') CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='')
%% Cell type:code id:4cb509fa7f296d1f tags: %% Cell type:code id:4cb509fa7f296d1f tags:
``` python ``` python
from rdflib import Graph from rdflib import Graph
# Graph.serialize() opens the destination file directly, so the output
# folder has to exist before the first file is written.
os.makedirs('json-ld', exist_ok=True)
for in_path in glob('bibframe/*'):
    # same base name as the Bibframe file, with a .json extension
    out_file = os.path.basename(in_path).replace('.bibframe.xml','.json')
    g = Graph()
    g.parse(in_path)
    g.serialize(destination=f'json-ld/{out_file}', format='json-ld')
``` ```
%% Cell type:markdown id:8ce07a1a294b5408 tags: %% Cell type:markdown id:8ce07a1a294b5408 tags:
%% Cell type:markdown id:be771aec518bf10a tags: %% Cell type:markdown id:be771aec518bf10a tags:
## Convert MODS -> BibTex -> CSL-JSON ## Convert MODS -> BibTex -> CSL-JSON
This also requires Bibutils and, additionally, the [pandoc executable](https://pandoc.org/installing.html).
%% Cell type:code id:3e95a38e223dae51 tags: %% Cell type:code id:3e95a38e223dae51 tags:
``` python ``` python
# MODS -> BibTeX # MODS -> BibTeX
import subprocess import subprocess
import platform import platform
# Convert the MODS files to BibTeX with the project's bibutils wrapper script.
cmd = ['bash', 'lib/run-bibutils.sh', 'xml2bib']
if platform.system() == 'Windows':
    # On Windows the script is executed inside WSL.
    cmd = ['wsl.exe', '-e', *cmd]
# stderr is merged into stdout so that everything the script reports is printed
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
print(output.decode())
``` ```
%% Output %% Output
Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib... Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib...
Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib... Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib... Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib... Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib...
%% Cell type:code id:a2cdccaf919c268e tags: %% Cell type:code id:a2cdccaf919c268e tags:
``` python ``` python
# BibTeX -> CSL-JSON via the project's pandoc wrapper script
cmd = ['bash', 'lib/run-pandoc.sh', 'bibtex', 'csljson']
if platform.system() == 'Windows':
    # On Windows the script is executed inside WSL.
    cmd = ['wsl.exe', '-e', *cmd]
# stderr is merged into stdout so that everything the script reports is printed
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
print(output.decode())
``` ```
%% Output %% Output
Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json... Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json...
Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json... Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json... Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json... Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json...
%% Cell type:code id:77ff73f83a1db70e tags: %% Cell type:code id:77ff73f83a1db70e tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment