Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
experiments
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Monitor
Service Desk
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Christian Boulanger
experiments
Commits
2d597648
Commit
2d597648
authored
7 months ago
by
Christian Boulanger
Browse files
Options
Downloads
Patches
Plain Diff
Download Saxon programmatically
parent
cf342253
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
convert-anystyle-data/tei-to-bibformats.ipynb
+19
-49
19 additions, 49 deletions
convert-anystyle-data/tei-to-bibformats.ipynb
with
19 additions
and
49 deletions
convert-anystyle-data/tei-to-bibformats.ipynb
+
19
−
49
View file @
2d597648
...
@@ -69,71 +69,41 @@
...
@@ -69,71 +69,41 @@
"metadata": {},
"metadata": {},
"cell_type": "markdown",
"cell_type": "markdown",
"source": [
"source": [
"##
Extract bibliographic data from TEI files using XSLT
\n",
"##
Download the Saxon jar
\n",
"\n",
"\n",
"
### Using lxml - currently not working
\n"
"
As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor
\n"
],
],
"id": "
d08d51f8767602c5
"
"id": "
781d0e0e7a9dd346
"
},
},
{
{
"metadata": {
"metadata": {
"ExecuteTime": {
"ExecuteTime": {
"end_time": "2024-08-2
1T16:24:16.226255
Z",
"end_time": "2024-08-2
2T08:58:42.538326
Z",
"start_time": "2024-08-2
1T16:24:16.196421
Z"
"start_time": "2024-08-2
2T08:58:34.687673
Z"
}
}
},
},
"cell_type": "code",
"cell_type": "code",
"source": [
"source": [
"from lxml import etree\n",
"import glob\n",
"import requests\n",
"import requests\n",
"\n",
"import zipfile\n",
"def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
"import io\n",
" try:\n",
"import os\n",
" xslt_doc = etree.parse(xslt_path)\n",
"url = \"https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\"\n",
" xml_doc = etree.parse(xml_input_path)\n",
"target_dir = 'lib/SaxonHE12-5'\n",
" transformer = etree.XSLT(xslt_doc)\n",
"response = requests.get(url, stream=True)\n",
" new_xml = transformer(xml_doc)\n",
"file_zip = zipfile.ZipFile(io.BytesIO(response.content))\n",
" with open(xml_output_path, 'w', encoding='utf-8') as f:\n",
"os.makedirs(target_dir, exist_ok=True)\n",
" f.write(new_xml)\n",
"file_zip.extractall(path=target_dir)"
" except etree.XSLTParseError as e:\n",
" print(f\"Error parsing XSLT file at {xslt_path}: {e}\")\n",
"\n",
"for input_path in glob.glob('tei/*.xml'):\n",
" print(f'Converting {input_path}')\n",
" base_name = os.path.basename(input_path)\n",
" output_path = f'tmp/{base_name.replace(\".xml\", \"-mods.xml\")}'\n",
" apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)\n"
],
"id": "af437a5ab3cc41a3",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converting tei\\10.1111_1467-6478.00057.xml\n",
"Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
"Converting tei\\10.1111_1467-6478.00080.xml\n",
"Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
"Converting tei\\10.1515_zfrs-1980-0103.xml\n",
"Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
"Converting tei\\10.1515_zfrs-1980-0104.xml\n",
"Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n"
]
}
],
],
"execution_count": 41
"id": "72b688e9b2e0d1f2",
"outputs": [],
"execution_count": 86
},
},
{
{
"metadata": {},
"metadata": {},
"cell_type": "markdown",
"cell_type": "markdown",
"source": [
"source": "## Run the transformation",
"### Using Saxon:\n",
"id": "1bbb36ac0f4fd1b5"
"\n",
"- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\n",
"- unpack in lib/SaxonHE12-5J"
],
"id": "781d0e0e7a9dd346"
},
},
{
{
"metadata": {
"metadata": {
...
...
%% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
%% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
# Convert the generated TEI to bibliographic formats
# Convert the generated TEI to bibliographic formats
%% Cell type:markdown id:db65c4065691c578 tags:
%% Cell type:markdown id:db65c4065691c578 tags:
## Download required XSLT documents
## Download required XSLT documents
we use XSLT provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
we use XSLT provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
%% Cell type:code id:1de7cedbb3514188 tags:
%% Cell type:code id:1de7cedbb3514188 tags:
```
python
```
python
import
os
import
os
from
urllib.parse
import
urljoin
from
urllib.parse
import
urljoin
import
requests
import
requests
from
lxml
import
etree
from
lxml
import
etree
def
download_xslt
(
url
,
target_dir
=
'
lib/xslt
'
):
def
download_xslt
(
url
,
target_dir
=
'
lib/xslt
'
):
"""
written by GPT-4
"""
"""
written by GPT-4
"""
response
=
requests
.
get
(
url
)
response
=
requests
.
get
(
url
)
response
.
raise_for_status
()
response
.
raise_for_status
()
doc
=
etree
.
fromstring
(
response
.
content
)
doc
=
etree
.
fromstring
(
response
.
content
)
for
elem
in
doc
.
xpath
(
'
//*[local-name() =
"
import
"
]
'
):
for
elem
in
doc
.
xpath
(
'
//*[local-name() =
"
import
"
]
'
):
import_url
=
urljoin
(
url
,
elem
.
get
(
'
href
'
))
# Construct a full URL based on the href attribute relative to the original url
import_url
=
urljoin
(
url
,
elem
.
get
(
'
href
'
))
# Construct a full URL based on the href attribute relative to the original url
download_xslt
(
import_url
,
target_dir
)
download_xslt
(
import_url
,
target_dir
)
os
.
makedirs
(
target_dir
,
exist_ok
=
True
)
os
.
makedirs
(
target_dir
,
exist_ok
=
True
)
with
open
(
os
.
path
.
join
(
target_dir
,
os
.
path
.
basename
(
url
)),
'
wb
'
)
as
f
:
with
open
(
os
.
path
.
join
(
target_dir
,
os
.
path
.
basename
(
url
)),
'
wb
'
)
as
f
:
f
.
write
(
response
.
content
)
f
.
write
(
response
.
content
)
print
(
f
'
Downloaded
{
os
.
path
.
basename
(
url
)
}
'
)
print
(
f
'
Downloaded
{
os
.
path
.
basename
(
url
)
}
'
)
base_url
=
'
https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt
'
base_url
=
'
https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt
'
xslt_docs
=
[
'
convert_tei-to-mods_bibl.xsl
'
]
xslt_docs
=
[
'
convert_tei-to-mods_bibl.xsl
'
]
for
xslt_doc
in
xslt_docs
:
for
xslt_doc
in
xslt_docs
:
download_xslt
(
f
'
{
base_url
}
/
{
xslt_doc
}
'
)
download_xslt
(
f
'
{
base_url
}
/
{
xslt_doc
}
'
)
```
```
%% Output
%% Output
Downloaded date-functions.xsl
Downloaded date-functions.xsl
Downloaded parameters.xsl
Downloaded parameters.xsl
Downloaded functions.xsl
Downloaded functions.xsl
Downloaded convert_tei-to-biblstruct_functions.xsl
Downloaded convert_tei-to-biblstruct_functions.xsl
Downloaded convert_tei-to-mods_functions.xsl
Downloaded convert_tei-to-mods_functions.xsl
Downloaded convert_tei-to-mods_bibl.xsl
Downloaded convert_tei-to-mods_bibl.xsl
%% Cell type:markdown id:
d08d51f8767602c5
tags:
%% Cell type:markdown id:
781d0e0e7a9dd346
tags:
##
Extract bibliographic data from TEI files using XSLT
##
Download the Saxon jar
### Using lxml - currently not working
As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor
%% Cell type:code id:
af437a5ab3cc41a3
tags:
%% Cell type:code id:
72b688e9b2e0d1f2
tags:
```
python
```
python
from
lxml
import
etree
import
glob
import
requests
import
requests
import
zipfile
def
apply_xslt
(
xslt_path
,
xml_input_path
,
xml_output_path
):
import
io
try
:
import
os
xslt_doc
=
etree
.
parse
(
xslt_path
)
url
=
"
https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip
"
xml_doc
=
etree
.
parse
(
xml_input_path
)
target_dir
=
'
lib/SaxonHE12-5
'
transformer
=
etree
.
XSLT
(
xslt_doc
)
response
=
requests
.
get
(
url
,
stream
=
True
)
new_xml
=
transformer
(
xml_doc
)
file_zip
=
zipfile
.
ZipFile
(
io
.
BytesIO
(
response
.
content
))
with
open
(
xml_output_path
,
'
w
'
,
encoding
=
'
utf-8
'
)
as
f
:
os
.
makedirs
(
target_dir
,
exist_ok
=
True
)
f
.
write
(
new_xml
)
file_zip
.
extractall
(
path
=
target_dir
)
except
etree
.
XSLTParseError
as
e
:
print
(
f
"
Error parsing XSLT file at
{
xslt_path
}
:
{
e
}
"
)
for
input_path
in
glob
.
glob
(
'
tei/*.xml
'
):
print
(
f
'
Converting
{
input_path
}
'
)
base_name
=
os
.
path
.
basename
(
input_path
)
output_path
=
f
'
tmp/
{
base_name
.
replace
(
"
.xml
"
,
"
-mods.xml
"
)
}
'
apply_xslt
(
'
lib/xslt/convert_tei-to-mods_bibl.xsl
'
,
input_path
,
output_path
)
```
```
%% Output
%% Cell type:markdown id:1bbb36ac0f4fd1b5 tags:
Converting tei\10.1111_1467-6478.00057.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1111_1467-6478.00080.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1515_zfrs-1980-0103.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
Converting tei\10.1515_zfrs-1980-0104.xml
Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
%% Cell type:markdown id:781d0e0e7a9dd346 tags:
### Using Saxon:
-
download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip
## Run the transformation
-
unpack in lib/SaxonHE12-5J
%% Cell type:code id:34087ef2f498ffa6 tags:
%% Cell type:code id:34087ef2f498ffa6 tags:
```
python
```
python
import
subprocess
import
subprocess
import
os
import
os
def
transform_tei
(
xslt_path
,
file_path
=
'
tei
'
,
output_path
=
'
.
'
):
def
transform_tei
(
xslt_path
,
file_path
=
'
tei
'
,
output_path
=
'
.
'
):
file_path
=
os
.
path
.
normpath
(
file_path
)
file_path
=
os
.
path
.
normpath
(
file_path
)
xslt_path
=
os
.
path
.
normpath
(
xslt_path
)
xslt_path
=
os
.
path
.
normpath
(
xslt_path
)
cmd
=
[
'
java
'
,
'
-jar
'
,
'
lib/SaxonHE12-5J/saxon-he-12.5.jar
'
,
cmd
=
[
'
java
'
,
'
-jar
'
,
'
lib/SaxonHE12-5J/saxon-he-12.5.jar
'
,
f
'
-s:
{
file_path
}
'
,
f
'
-s:
{
file_path
}
'
,
f
'
-xsl:
{
xslt_path
}
'
,
f
'
-xsl:
{
xslt_path
}
'
,
f
'
-o:
{
output_path
}
'
,
f
'
-o:
{
output_path
}
'
,
'
p_target-language=de
'
,
'
p_github-action=true
'
]
'
p_target-language=de
'
,
'
p_github-action=true
'
]
process
=
subprocess
.
run
(
cmd
,
capture_output
=
True
,
text
=
True
)
process
=
subprocess
.
run
(
cmd
,
capture_output
=
True
,
text
=
True
)
if
process
.
returncode
!=
0
:
if
process
.
returncode
!=
0
:
raise
RuntimeError
(
process
.
stderr
)
raise
RuntimeError
(
process
.
stderr
)
return
process
return
process
transform_tei
(
xslt_path
=
'
lib/xslt/convert_tei-to-biblstruct_bibl.xsl
'
,
output_path
=
'
biblStruct
'
)
transform_tei
(
xslt_path
=
'
lib/xslt/convert_tei-to-biblstruct_bibl.xsl
'
,
output_path
=
'
biblStruct
'
)
transform_tei
(
xslt_path
=
'
lib/xslt/convert_tei-to-mods_bibl.xsl
'
,
output_path
=
'
mods
'
)
transform_tei
(
xslt_path
=
'
lib/xslt/convert_tei-to-mods_bibl.xsl
'
,
output_path
=
'
mods
'
)
```
```
%% Output
%% Output
CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:tei', '-xsl:lib\\xslt\\convert_tei-to-mods_bibl.xsl', '-o:mods', 'p_target-language=de', 'p_github-action=true'], returncode=0, stdout='', stderr='')
CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:tei', '-xsl:lib\\xslt\\convert_tei-to-mods_bibl.xsl', '-o:mods', 'p_target-language=de', 'p_github-action=true'], returncode=0, stdout='', stderr='')
%% Cell type:markdown id:5e75488ae4379946 tags:
%% Cell type:markdown id:5e75488ae4379946 tags:
## Convert MODS to RIS tagged file format
## Convert MODS to RIS tagged file format
This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most distros.
This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most distros.
(On Windows, you will need to install it in the default WSL distribution.)
(On Windows, you will need to install it in the default WSL distribution.)
%% Cell type:code id:fde37a9e4a182bad tags:
%% Cell type:code id:fde37a9e4a182bad tags:
```
python
```
python
import
subprocess
import
subprocess
import
platform
import
platform
cmd
=
[
'
bash
'
,
'
lib/xml2ris.sh
'
]
cmd
=
[
'
bash
'
,
'
lib/xml2ris.sh
'
]
if
platform
.
system
()
==
'
Windows
'
:
if
platform
.
system
()
==
'
Windows
'
:
cmd
=
[
'
wsl.exe
'
,
'
-e
'
]
+
cmd
cmd
=
[
'
wsl.exe
'
,
'
-e
'
]
+
cmd
output
=
subprocess
.
check_output
(
cmd
,
stderr
=
subprocess
.
STDOUT
)
output
=
subprocess
.
check_output
(
cmd
,
stderr
=
subprocess
.
STDOUT
)
print
(
output
.
decode
())
print
(
output
.
decode
())
```
```
%% Output
%% Output
Converted mods/metadata/10.1111_1467-6478.00057-bibl.MODS.xml to ris/10.1111_1467-6478.00057.ris
Converted mods/metadata/10.1111_1467-6478.00057-bibl.MODS.xml to ris/10.1111_1467-6478.00057.ris
xml2ris: Processed 68 references.
xml2ris: Processed 68 references.
Converted mods/metadata/10.1111_1467-6478.00080-bibl.MODS.xml to ris/10.1111_1467-6478.00080.ris
Converted mods/metadata/10.1111_1467-6478.00080-bibl.MODS.xml to ris/10.1111_1467-6478.00080.ris
xml2ris: Processed 40 references.
xml2ris: Processed 40 references.
Converted mods/metadata/10.1515_zfrs-1980-0103-bibl.MODS.xml to ris/10.1515_zfrs-1980-0103.ris
Converted mods/metadata/10.1515_zfrs-1980-0103-bibl.MODS.xml to ris/10.1515_zfrs-1980-0103.ris
xml2ris: Processed 36 references.
xml2ris: Processed 36 references.
Converted mods/metadata/10.1515_zfrs-1980-0104-bibl.MODS.xml to ris/10.1515_zfrs-1980-0104.ris
Converted mods/metadata/10.1515_zfrs-1980-0104-bibl.MODS.xml to ris/10.1515_zfrs-1980-0104.ris
xml2ris: Processed 82 references.
xml2ris: Processed 82 references.
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment