Download Saxon programmatically

2d597648 · Christian Boulanger · cf342253 · 2d597648
Commit 2d597648 authored 7 months ago by Christian Boulanger
--- a/convert-anystyle-data/tei-to-bibformats.ipynb
+++ b/convert-anystyle-data/tei-to-bibformats.ipynb
@@ -69,71 +69,41 @@
   "metadata": {},
   "cell_type": "markdown",
   "source": [
-    "## Extract bibliographic data from TEI files using XSLT\n",
+    "## Download the Saxon jar\n",
    "\n",
-    "### Using lxml - currently not working\n"
+    "As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor\n"
   ],
-   "id": "d08d51f8767602c5"
+   "id": "781d0e0e7a9dd346"
  },
  {
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2024-08-21T16:24:16.226255Z",
+     "end_time": "2024-08-22T08:58:42.538326Z",
-     "start_time": "2024-08-21T16:24:16.196421Z"
+     "start_time": "2024-08-22T08:58:34.687673Z"
    }
   },
   "cell_type": "code",
   "source": [
-    "from lxml import etree\n",
-    "import glob\n",
    "import requests\n",
-    "\n",
+    "import zipfile\n",
-    "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
+    "import io\n",
-    "    try:\n",
+    "import os\n",
-    "        xslt_doc = etree.parse(xslt_path)\n",
+    "url = \"https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\"\n",
-    "        xml_doc = etree.parse(xml_input_path)\n",
+    "target_dir = 'lib/SaxonHE12-5'\n",
-    "        transformer = etree.XSLT(xslt_doc)\n",
+    "response = requests.get(url, stream=True)\n",
-    "        new_xml = transformer(xml_doc)\n",
+    "file_zip = zipfile.ZipFile(io.BytesIO(response.content))\n",
-    "        with open(xml_output_path, 'w', encoding='utf-8') as f:\n",
+    "os.makedirs(target_dir, exist_ok=True)\n",
-    "            f.write(new_xml)\n",
+    "file_zip.extractall(path=target_dir)"
-    "    except etree.XSLTParseError as e:\n",
-    "        print(f\"Error parsing XSLT file at {xslt_path}: {e}\")\n",
-    "\n",
-    "for input_path in glob.glob('tei/*.xml'):\n",
-    "    print(f'Converting {input_path}')\n",
-    "    base_name = os.path.basename(input_path)\n",
-    "    output_path = f'tmp/{base_name.replace(\".xml\", \"-mods.xml\")}'\n",
-    "    apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)\n"
-   ],
-   "id": "af437a5ab3cc41a3",
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Converting tei\\10.1111_1467-6478.00057.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1111_1467-6478.00080.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0103.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0104.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n"
-     ]
-    }
   ],
-   "execution_count": 41
+   "id": "72b688e9b2e0d1f2",
+   "outputs": [],
+   "execution_count": 86
  },
  {
   "metadata": {},
   "cell_type": "markdown",
-   "source": [
+   "source": "## Run the transformation",
-    "### Using Saxon:\n",
+   "id": "1bbb36ac0f4fd1b5"
-    "\n",
-    "- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\n",
-    "- unpack in lib/SaxonHE12-5J"
-   ],
-   "id": "781d0e0e7a9dd346"
  },
  {
   "metadata": {

 %% Cell type:markdown id:2cdf8ba1eefa38e0 tags:
 # Convert the generated TEI to bibliographic formats
 %% Cell type:markdown id:db65c4065691c578 tags:
 ## Download required XSLT documents
 We use the XSLT stylesheets provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data
 %% Cell type:code id:1de7cedbb3514188 tags:
 ``` python
 import os
 from urllib.parse import urljoin
 import requests
 from lxml import etree
 def download_xslt(url, target_dir = 'lib/xslt', _visited=None):
    """Download an XSLT stylesheet and, recursively, every stylesheet it imports.

    The document at ``url`` is fetched and parsed, each element whose local
    name is ``import`` is followed (its ``href`` is resolved relative to
    ``url``), and finally the document itself is written to ``target_dir``
    under the last path segment of its URL. Imported files are therefore
    saved (and reported) before the file that imports them.

    (The first version of this helper was written by GPT-4.)

    Args:
        url: Absolute URL of the stylesheet to download.
        target_dir: Directory the files are written to; created if missing.
        _visited: Internal set of URLs already handled during this run.
            Callers should leave it at its default.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    if _visited is None:
        _visited = set()
    if url in _visited:
        # Already fetched via another import chain (or a circular import).
        return
    _visited.add(url)

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = etree.fromstring(response.content)
    for elem in doc.xpath('//*[local-name() = "import"]'):
        href = elem.get('href')
        if not href:
            # urljoin(url, None) / urljoin(url, '') returns `url` itself,
            # which would make this function call itself endlessly.
            continue
        # Construct a full URL based on the href attribute relative to the original url
        import_url = urljoin(url, href)
        download_xslt(import_url, target_dir, _visited)
    os.makedirs(target_dir, exist_ok=True)
    file_name = os.path.basename(url)
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    print(f'Downloaded {file_name}')
 # Location the stylesheets are downloaded from; hrefs of imported stylesheets
 # are resolved relative to each file's own URL by download_xslt.
 base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'
 # Every entry-point stylesheet that the transformation cell passes to Saxon
 # has to be listed here, otherwise a fresh run has no local copy of it.
 xslt_docs = [
    'convert_tei-to-biblstruct_bibl.xsl',
    'convert_tei-to-mods_bibl.xsl',
 ]
 for xslt_doc in xslt_docs:
    download_xslt(f'{base_url}/{xslt_doc}')
 ```
 %% Output
    Downloaded date-functions.xsl
    Downloaded parameters.xsl
    Downloaded functions.xsl
    Downloaded convert_tei-to-biblstruct_functions.xsl
    Downloaded convert_tei-to-mods_functions.xsl
    Downloaded convert_tei-to-mods_bibl.xsl
-%% Cell type:markdown id:d08d51f8767602c5 tags:
+%% Cell type:markdown id:781d0e0e7a9dd346 tags:
-## Extract bibliographic data from TEI files using XSLT
+## Download the Saxon jar
-### Using lxml - currently not working
+As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor
-%% Cell type:code id:af437a5ab3cc41a3 tags:
+%% Cell type:code id:72b688e9b2e0d1f2 tags:
 ``` python
-from lxml import etree
-import glob
 import requests
+import zipfile
-def apply_xslt(xslt_path, xml_input_path, xml_output_path):
+import io
-    try:
+import os
-        xslt_doc = etree.parse(xslt_path)
+url = "https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip"
-        xml_doc = etree.parse(xml_input_path)
+target_dir = 'lib/SaxonHE12-5'
-        transformer = etree.XSLT(xslt_doc)
+response = requests.get(url, stream=True)
-        new_xml = transformer(xml_doc)
+file_zip = zipfile.ZipFile(io.BytesIO(response.content))
-        with open(xml_output_path, 'w', encoding='utf-8') as f:
+os.makedirs(target_dir, exist_ok=True)
-            f.write(new_xml)
+file_zip.extractall(path=target_dir)
-    except etree.XSLTParseError as e:
-        print(f"Error parsing XSLT file at {xslt_path}: {e}")
-for input_path in glob.glob('tei/*.xml'):
-    print(f'Converting {input_path}')
-    base_name = os.path.basename(input_path)
-    output_path = f'tmp/{base_name.replace(".xml", "-mods.xml")}'
-    apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)
 ```
-%% Output
+%% Cell type:markdown id:1bbb36ac0f4fd1b5 tags:
-    Converting tei\10.1111_1467-6478.00057.xml
-    Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
-    Converting tei\10.1111_1467-6478.00080.xml
-    Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
-    Converting tei\10.1515_zfrs-1980-0103.xml
-    Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
-    Converting tei\10.1515_zfrs-1980-0104.xml
-    Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element
-%% Cell type:markdown id:781d0e0e7a9dd346 tags:
-### Using Saxon:
- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip
+## Run the transformation
- unpack in lib/SaxonHE12-5J
 %% Cell type:code id:34087ef2f498ffa6 tags:
 ``` python
 import subprocess
 import os
 def transform_tei(xslt_path, file_path='tei', output_path='.', saxon_jar=None):
    """Run an XSLT transformation with Saxon-HE through ``java -jar``.

    Args:
        xslt_path: Path of the stylesheet; normalised and passed as ``-xsl:``.
        file_path: Source passed as ``-s:`` after normalisation.
        output_path: Destination passed unchanged as ``-o:``.
        saxon_jar: Path of the Saxon jar. When omitted, the known install
            locations are probed and the first existing jar is used.

    Returns:
        The ``subprocess.CompletedProcess`` of the Java run (stdout and
        stderr captured as text).

    Raises:
        RuntimeError: If Java exits with a non-zero return code; the message
            is the captured stderr.
    """
    if saxon_jar is None:
        # The jar used to be unpacked by hand into lib/SaxonHE12-5J, whereas
        # the notebook's download cell extracts the archive to lib/SaxonHE12-5.
        # Accept either; fall back to the historical path so that Java
        # reports the missing file if neither exists.
        candidates = ['lib/SaxonHE12-5J/saxon-he-12.5.jar',
                      'lib/SaxonHE12-5/saxon-he-12.5.jar']
        saxon_jar = next((jar for jar in candidates if os.path.isfile(jar)),
                         candidates[0])
    file_path = os.path.normpath(file_path)
    xslt_path = os.path.normpath(xslt_path)
    cmd = ['java', '-jar', saxon_jar,
           f'-s:{file_path}',
           f'-xsl:{xslt_path}',
           f'-o:{output_path}',
           # stylesheet parameters, passed to Saxon as name=value
           'p_target-language=de', 'p_github-action=true']
    process = subprocess.run(cmd, capture_output=True, text=True)
    if process.returncode != 0:
        raise RuntimeError(process.stderr)
    return process
 # Run the biblStruct stylesheet first, then the MODS stylesheet. Both read
 # from the default source ('tei'); the result of the last call is what the
 # cell displays.
 transform_tei(
    output_path='biblStruct',
    xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',
 )
 transform_tei(
    output_path='mods',
    xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl',
 )
 ```
 %% Output
    CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:tei', '-xsl:lib\\xslt\\convert_tei-to-mods_bibl.xsl', '-o:mods', 'p_target-language=de', 'p_github-action=true'], returncode=0, stdout='', stderr='')
 %% Cell type:markdown id:5e75488ae4379946 tags:
 ## Convert MODS to RIS tagged file format
 This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most distros.
 (On Windows, you need to install it in the default WSL distribution.)
 %% Cell type:code id:fde37a9e4a182bad tags:
 ``` python
 import subprocess
 import platform
 # On Windows the shell script is executed through WSL ("wsl.exe -e"),
 # everywhere else bash is invoked directly.
 cmd = (['wsl.exe', '-e'] if platform.system() == 'Windows' else []) + ['bash', 'lib/xml2ris.sh']
 # stderr is folded into stdout so the printed text contains both streams;
 # check=True raises CalledProcessError on a non-zero exit status.
 output = subprocess.run(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    check=True,
 ).stdout
 print(output.decode())
 ```
 %% Output
    Converted mods/metadata/10.1111_1467-6478.00057-bibl.MODS.xml to ris/10.1111_1467-6478.00057.ris
    xml2ris: Processed 68 references.
    Converted mods/metadata/10.1111_1467-6478.00080-bibl.MODS.xml to ris/10.1111_1467-6478.00080.ris
    xml2ris: Processed 40 references.
    Converted mods/metadata/10.1515_zfrs-1980-0103-bibl.MODS.xml to ris/10.1515_zfrs-1980-0103.ris
    xml2ris: Processed 36 references.
    Converted mods/metadata/10.1515_zfrs-1980-0104-bibl.MODS.xml to ris/10.1515_zfrs-1980-0104.ris
    xml2ris: Processed 82 references.