From 2d5976487027bdbbaf8eeff6a9de9c007827ecc6 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <info@bibliograph.org>
Date: Thu, 22 Aug 2024 11:02:12 +0200
Subject: [PATCH] Download Saxon programmatically

---
 convert-anystyle-data/tei-to-bibformats.ipynb | 68 ++++++-------------
 1 file changed, 19 insertions(+), 49 deletions(-)

diff --git a/convert-anystyle-data/tei-to-bibformats.ipynb b/convert-anystyle-data/tei-to-bibformats.ipynb
index c448d83..8520080 100644
--- a/convert-anystyle-data/tei-to-bibformats.ipynb
+++ b/convert-anystyle-data/tei-to-bibformats.ipynb
@@ -69,71 +69,41 @@
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "## Extract bibliographic data from TEI files using XSLT\n",
+    "## Download the Saxon jar\n",
     "\n",
-    "### Using lxml - currently not working\n"
+    "Because the stylesheet uses XSLT 2.0 features and there is no native Python XSLT 2.0 processor, we need the Saxon processor. The next cell downloads Saxon-HE 12.5 and unpacks it into `lib/SaxonHE12-5`.\n"
    ],
-   "id": "d08d51f8767602c5"
+   "id": "781d0e0e7a9dd346"
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-08-21T16:24:16.226255Z",
-     "start_time": "2024-08-21T16:24:16.196421Z"
+     "end_time": "2024-08-22T08:58:42.538326Z",
+     "start_time": "2024-08-22T08:58:34.687673Z"
     }
    },
    "cell_type": "code",
    "source": [
-    "from lxml import etree\n",
-    "import glob\n",
     "import requests\n",
-    "\n",
-    "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
-    "    try:\n",
-    "        xslt_doc = etree.parse(xslt_path)\n",
-    "        xml_doc = etree.parse(xml_input_path)\n",
-    "        transformer = etree.XSLT(xslt_doc)\n",
-    "        new_xml = transformer(xml_doc)\n",
-    "        with open(xml_output_path, 'w', encoding='utf-8') as f:\n",
-    "            f.write(new_xml)\n",
-    "    except etree.XSLTParseError as e:\n",
-    "        print(f\"Error parsing XSLT file at {xslt_path}: {e}\")\n",
-    "\n",
-    "for input_path in glob.glob('tei/*.xml'):\n",
-    "    print(f'Converting {input_path}')\n",
-    "    base_name = os.path.basename(input_path)\n",
-    "    output_path = f'tmp/{base_name.replace(\".xml\", \"-mods.xml\")}'\n",
-    "    apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)\n"
-   ],
-   "id": "af437a5ab3cc41a3",
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Converting tei\\10.1111_1467-6478.00057.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1111_1467-6478.00080.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0103.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0104.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n"
-     ]
-    }
+    "import zipfile\n",
+    "import io\n",
+    "import os\n",
+    "url = \"https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\"\n",
+    "target_dir = 'lib/SaxonHE12-5'\n",
+    "response = requests.get(url, timeout=60)  # no stream=True: the whole body is read via .content below anyway\n",
+    "response.raise_for_status()  # fail with the HTTP error instead of a confusing BadZipFile on an error page\n",
+    "os.makedirs(target_dir, exist_ok=True)\n",
+    "zipfile.ZipFile(io.BytesIO(response.content)).extractall(path=target_dir)  # unpack the in-memory archive"
    ],
-   "execution_count": 41
+   "id": "72b688e9b2e0d1f2",
+   "outputs": [],
+   "execution_count": 86
   },
   {
    "metadata": {},
    "cell_type": "markdown",
-   "source": [
-    "### Using Saxon:\n",
-    "\n",
-    "- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\n",
-    "- unpack in lib/SaxonHE12-5J"
-   ],
-   "id": "781d0e0e7a9dd346"
+   "source": "## Run the transformation",
+   "id": "1bbb36ac0f4fd1b5"
   },
   {
    "metadata": {
-- 
GitLab