Skip to content
Snippets Groups Projects
Commit b5d864b3 authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Added transformation MODS -> BibTeX -> CSL-JSON

parent 3bc806fc
No related branches found
No related tags found
No related merge requests found
......@@ -10,5 +10,5 @@ for input_path in $input_dir/*.$input_ext
do
out_file=$(basename ${input_path/\.$input_ext/.$output_ext})
echo "Running citeproc to convert $input_path to $output_dir/$out_file..."
pandoc -f $convert_from -t $convert_to -o "$output_dir/$out_file" $input_path 2>&1
pandoc -f "$convert_from" -t "$convert_to" -o "$output_dir/$out_file" "$input_path" 2>&1
done
\ No newline at end of file
%% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
# Convert the generated TEI to bibliographic formats
%% Cell type:markdown id:db65c4065691c578 tags:
## Download required XSLT documents
We use the XSLT stylesheets provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
%% Cell type:code id:1de7cedbb3514188 tags:
``` python
import os
from urllib.parse import urljoin
import requests
from lxml import etree
cache = set()
def download_xslt(url, target_dir='lib/xslt', timeout=30):
    """Download a stylesheet and, recursively, every document it imports.

    The document at ``url`` is fetched and parsed, and each element whose
    local name is ``import`` is followed via its ``href`` attribute (resolved
    relative to ``url``). Imported documents are saved before the importing
    one. All files are written flat into ``target_dir`` under the last path
    segment of their URL.

    Originally written by GPT-4.

    Args:
        url: Absolute URL of the document to download.
        target_dir: Folder that receives the downloaded files; created if
            it does not exist.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Register the entry document as well; otherwise a circular import chain
    # (A imports B, B imports A) would fetch A a second time.
    cache.add(url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    doc = etree.fromstring(response.content)
    for elem in doc.xpath('//*[local-name() = "import"]'):
        href = elem.get('href')
        if not href:
            # Without an href, urljoin() would hand back `url` itself and the
            # document would be downloaded again for nothing.
            continue
        import_url = urljoin(url, href)
        if import_url not in cache:
            cache.add(import_url)
            download_xslt(import_url, target_dir, timeout)
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, os.path.basename(url)), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {os.path.basename(url)} to {target_dir}')
# TEI -> biblStruct / MODS / Zotero RDF stylesheets
base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'
xslt_docs = [
    'convert_tei-to-biblstruct_bibl.xsl',
    'convert_tei-to-mods_bibl.xsl',
    'convert_tei-to-zotero-rdf_bibl.xsl',
]
for xslt_doc in xslt_docs:
    source_url = f'{base_url}/{xslt_doc}'
    download_xslt(source_url)

# MODS -> Bibframe RDF stylesheet and the language crosswalk listed with it
base_url = 'https://www.loc.gov/standards/mods/v3'
xslt_docs = [
    'MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',
    'conf/languageCrosswalk.xml',
]
for xslt_doc in xslt_docs:
    # Mirror the remote sub-folder (e.g. "conf") below lib/xslt.
    sub_dir = os.path.dirname(xslt_doc)
    download_xslt(f'{base_url}/{xslt_doc}', target_dir=f'lib/xslt/{sub_dir}')
```
%% Output
Downloaded parameters.xsl to lib/xslt
Downloaded functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt
Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt
Downloaded date-functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_functions.xsl to lib/xslt
Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt
Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt
Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/
Downloaded languageCrosswalk.xml to lib/xslt/conf
%% Cell type:markdown id:781d0e0e7a9dd346 tags:
## Download the Saxon jar
As the XSLT stylesheets use XSLT 2.0 features, and there are no native Python XSLT 2.0 processors, we need to use the Java-based Saxon processor (I haven't tried https://pypi.org/project/saxonpy).
%% Cell type:code id:72b688e9b2e0d1f2 tags:
``` python
import requests
import zipfile
import io
import os
# Download the Saxon-HE 12.5 (Java) release archive and unpack it locally.
url = "https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip"
# The folder name must agree with the jar path that transform() runs
# ('lib/SaxonHE12-5J/saxon-he-12.5.jar'); extracting to 'lib/SaxonHE12-5'
# would leave that path pointing at nothing.
target_dir = 'lib/SaxonHE12-5J'
# A timeout keeps a stalled connection from blocking the notebook forever.
response = requests.get(url, timeout=60)
# Fail here with a clear HTTP error instead of a confusing BadZipFile later.
response.raise_for_status()
os.makedirs(target_dir, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:
    file_zip.extractall(path=target_dir)
```
%% Cell type:markdown id:1bbb36ac0f4fd1b5 tags:
## Run the transformation to biblStruct & MODS
%% Cell type:code id:d4a6c9620d0199eb tags:
``` python
import subprocess
import os
from glob import glob
def transform(xslt_path, input_path='tei', output_path='.', rename_extension:tuple=None):
    """Apply an XSLT stylesheet with Saxon-HE and optionally rename the results.

    Runs ``java -jar lib/SaxonHE12-5J/saxon-he-12.5.jar`` with ``input_path``
    as source, ``xslt_path`` as stylesheet and ``output_path`` as output, and
    passes the stylesheet parameters ``p_target-language=de``,
    ``p_github-action=true`` and ``p_output-folder=<output_path>``.

    Args:
        xslt_path: Path of the stylesheet to apply.
        input_path: Source passed to Saxon via ``-s:``.
        output_path: Output location passed to Saxon via ``-o:`` and as the
            ``p_output-folder`` stylesheet parameter.
        rename_extension: Optional ``(from_extension, to_extension)`` pair.
            Every ``*.xml`` file directly inside ``output_path`` whose name
            ends with ``from_extension`` gets that suffix swapped for
            ``to_extension``.

    Returns:
        The ``subprocess.CompletedProcess`` of the Saxon run.

    Raises:
        RuntimeError: If Saxon exits with a non-zero status; the message is
            Saxon's stderr output.
    """
    input_path = os.path.normpath(input_path)
    xslt_path = os.path.normpath(xslt_path)
    cmd = ['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar',
           f'-s:{input_path}', f'-xsl:{xslt_path}', f'-o:{output_path}',
           'p_target-language=de', 'p_github-action=true', f'p_output-folder={output_path}']
    process = subprocess.run(cmd, capture_output=True, text=True)
    # Check the result before touching any files, so a failed run does not
    # rename whatever happens to be in the output folder.
    if process.returncode != 0:
        raise RuntimeError(process.stderr)
    if rename_extension:
        from_extension, to_extension = rename_extension
        for filename in glob(f'{output_path}/*.xml'):
            if filename.endswith(from_extension):
                # Swap the suffix only; str.replace() would also rewrite the
                # same text occurring earlier in the path.
                os.replace(filename, filename.removesuffix(from_extension) + to_extension)
    print(f'Applied {xslt_path} to files in {input_path} and saved result in {output_path}.')
    return process
```
%% Cell type:code id:34087ef2f498ffa6 tags:
``` python
# TEI -> biblstruct
transform(xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl', output_path='biblStruct', rename_extension=('-bibl_biblStruct.TEIP5.xml','.biblStruct.xml'))
# TEI -> MODS
transform(xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl', output_path='mods', rename_extension=('-bibl.MODS.xml','.mods.xml'))
# Keep the files whose name starts with the folder name (stripping that
# prefix) and delete everything else in the folder.
# NOTE(review): presumably the stylesheets write the real results as
# "<folder><name>" and leave empty files under the plain name - confirm.
for dir_name in ['biblStruct', 'mods']:
    wanted, unwanted = [], []
    for filename in glob(f'{dir_name}/*'):
        if os.path.basename(filename).startswith(dir_name):
            wanted.append(filename)
        else:
            unwanted.append(filename)
    # Delete first, rename second. When the cell is re-run, the un-prefixed
    # results of the previous run are still in the folder; interleaving both
    # steps could delete a file right after a fresh result was renamed onto it.
    for filename in unwanted:
        os.remove(filename)
    for filename in wanted:
        os.replace(filename, f'{os.path.dirname(filename)}/{os.path.basename(filename).removeprefix(dir_name)}')
```
%% Output
Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct.
Applied lib\xslt\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods.
%% Cell type:markdown id:5e75488ae4379946 tags:
## Convert MODS to RIS tagged file format
This requires installing the [Bibutils suite of executables](https://sourceforge.net/p/bibutils/home/Bibutils), which is available in most distros.
If you are on Windows, you will need to install it in the default WSL distro.
%% Cell type:code id:fde37a9e4a182bad tags:
``` python
import subprocess
import platform
# Run the Bibutils wrapper script with the xml2ris converter and show its
# combined stdout/stderr. On Windows the command is routed through WSL.
cmd = ['bash', 'lib/run-bibutils.sh', 'xml2ris']
if platform.system() == 'Windows':
    cmd = ['wsl.exe', '-e', *cmd]
completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
output = completed.stdout
print(output.decode())
```
%% Output
Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris...
Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris...
Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris...
%% Cell type:markdown id:61f6cfe7d3de482a tags:
## Convert MODS -> Bibframe RDF -> JSON-LD
See:
- https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html
- https://rdflib.readthedocs.io/
%% Cell type:code id:6ba739963096f858 tags:
``` python
# MODS -> Bibframe
transform(xslt_path='lib/xslt/MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',
input_path='mods', output_path='bibframe',
rename_extension=('.mods.xml','.bibframe.xml'))
```
%% Output
Applied lib\xslt\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe.
CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='')
%% Cell type:code id:4cb509fa7f296d1f tags:
``` python
from rdflib import Graph
# Convert every file in the bibframe folder into a JSON-LD document.
# Create the output folder up front; serialize() is not expected to create
# missing parent folders for its destination.
os.makedirs('json-ld', exist_ok=True)
for in_path in glob('bibframe/*'):
    out_file = os.path.basename(in_path).replace('.bibframe.xml','.json')
    g = Graph()
    # No explicit format is given, so rdflib has to work it out itself.
    g.parse(in_path)
    g.serialize(destination=f'json-ld/{out_file}', format='json-ld')
```
%% Cell type:markdown id:8ce07a1a294b5408 tags:
%% Cell type:markdown id:be771aec518bf10a tags:
## Convert MODS -> BibTeX -> CSL-JSON
This also requires Bibutils and, additionally, the [pandoc executable](https://pandoc.org/installing.html).
%% Cell type:code id:3e95a38e223dae51 tags:
``` python
# MODS -> BibTeX
import subprocess
import platform
# Run the Bibutils wrapper script with the xml2bib converter and show its
# combined stdout/stderr. On Windows the command is routed through WSL.
cmd = ['bash', 'lib/run-bibutils.sh', 'xml2bib']
if platform.system() == 'Windows':
    cmd = ['wsl.exe', '-e', *cmd]
completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
output = completed.stdout
print(output.decode())
```
%% Output
Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib...
Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib...
Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib...
%% Cell type:code id:a2cdccaf919c268e tags:
``` python
# BibTeX -> CSL-JSON: run the pandoc wrapper script with 'bibtex' as input
# format and 'csljson' as output format, and show its combined stdout/stderr.
# On Windows the command is routed through WSL.
cmd = ['bash', 'lib/run-pandoc.sh', 'bibtex', 'csljson']
if platform.system() == 'Windows':
    cmd = ['wsl.exe', '-e'] + cmd
output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
print(output.decode())
```
%% Output
Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json...
Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json...
Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json...
%% Cell type:code id:77ff73f83a1db70e tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment