From 0af92f46af893b0718f9a400ded11d012814bfcc Mon Sep 17 00:00:00 2001
From: cboulanger <info@bibliograph.org>
Date: Sat, 3 Aug 2024 22:36:58 +0200
Subject: [PATCH] Trying to make the XSLTs work, no luck

---
 convert-anystyle-data/anystyle-to-tei.ipynb | 143 ++++++++++++--------
 1 file changed, 86 insertions(+), 57 deletions(-)

diff --git a/convert-anystyle-data/anystyle-to-tei.ipynb b/convert-anystyle-data/anystyle-to-tei.ipynb
index 395c904..b9994e2 100644
--- a/convert-anystyle-data/anystyle-to-tei.ipynb
+++ b/convert-anystyle-data/anystyle-to-tei.ipynb
@@ -418,82 +418,111 @@
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "## Extract bibliographic data from TEI files "
+    "## Extract bibliographic data from TEI files using XSLT\n",
+    "\n",
+    "The stylesheets used below come from https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data"
    ],
    "id": "b0a231dc7bdd8b01"
   },
-  {
-   "cell_type": "markdown",
-   "source": [
-    "### Download XSLTs"
-   ],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "149588c08747c4b3"
-  },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "outputs": [],
    "source": [
-    "import requests, zipfile, io, os\n",
-    "\n",
-    "if not os.path.isdir('lib/convert'): \n",
-    "    url = 'https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data/archive/refs/heads/master.zip'\n",
-    "    r = requests.get(url)\n",
-    "    assert r.status_code == 200\n",
-    "    z = zipfile.ZipFile(io.BytesIO(r.content))\n",
-    "    z.extractall('lib')\n",
-    "    z.close()\n",
-    "    os.rename('lib/convert_tei-to-bibliographic-data-master', 'lib/convert')\n"
+    "import glob, os\n",
+    "import traceback\n",
+    "from urllib.request import urlopen\n",
+    "from lxml import etree\n",
+    "import requests\n",
+    "\n",
+    "class HttpsResolver(etree.Resolver):  # lxml resolver that downloads every requested URL with requests\n",
+    "    def resolve(self, url, id, context):\n",
+    "        response = requests.get(url, timeout=30)  # bounded wait so a stalled server cannot hang the kernel\n",
+    "        response.raise_for_status()  # surface the actual HTTP error instead of a bare AssertionError\n",
+    "        return self.resolve_string(response.content, context, base_url=url)\n",
+    "\n",
+    "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
+    "    \"\"\"Transform xml_input_path with the stylesheet at xslt_path (local path or http(s) URL) and write the result to xml_output_path; stylesheet compile errors are printed, not raised.\"\"\"\n",
+    "    try:\n",
+    "        if xslt_path.startswith('http'):\n",
+    "            with urlopen(xslt_path) as f:\n",
+    "                xml_parser = etree.XMLParser(no_network=False)\n",
+    "                xml_parser.resolvers.add(HttpsResolver())  # documents referenced while parsing are fetched via requests\n",
+    "                xslt_doc = etree.parse(f, parser=xml_parser)\n",
+    "        else:\n",
+    "            xslt_doc = etree.parse(xslt_path)\n",
+    "        xml_doc = etree.parse(xml_input_path)\n",
+    "        new_xml = etree.XSLT(xslt_doc)(xml_doc)  # NOTE(review): libxslt only implements XSLT 1.0; a 2.0+ stylesheet presumably fails to compile here - confirm\n",
+    "        with open(xml_output_path, 'w', encoding='utf-8') as f:\n",
+    "            f.write(str(new_xml))  # the transform result is a tree object, not a str; str() serializes it\n",
+    "    except etree.XSLTParseError as e:\n",
+    "        print(f\"Error parsing XSLT file at {xslt_path}: {e}\")\n",
+    "\n",
+    "xslt_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl'\n",
+    "os.makedirs('tei-biblstruct', exist_ok=True)  # apply_xslt opens the output file directly, which fails if the directory is missing\n",
+    "\n",
+    "for input_path in glob.glob('tei/*.xml'):\n",
+    "    print(f'Converting {input_path}')\n",
+    "    output_path = f'tei-biblstruct/{os.path.basename(input_path)}'\n",
+    "    apply_xslt(xslt_url, input_path, output_path)\n"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-07-31T19:42:48.723119300Z",
-     "start_time": "2024-07-31T19:42:47.234795500Z"
+     "end_time": "2024-08-03T20:28:20.087934Z",
+     "start_time": "2024-08-03T20:28:18.699591Z"
     }
    },
-   "id": "1f15b3af6aab73ed"
-  },
-  {
-   "metadata": {},
-   "cell_type": "markdown",
-   "source": [
-    "### Apply XSLT"
+   "id": "cb3b4140ab153c08",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converting tei/10.1515_zfrs-1980-0103.xml\n",
+      "Error parsing XSLT file at https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n",
+      "Converting tei/10.1515_zfrs-1980-0104.xml\n",
+      "Error parsing XSLT file at https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n",
+      "Converting tei/10.1111_1467-6478.00080.xml\n",
+      "Error parsing XSLT file at https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n",
+      "Converting tei/10.1111_1467-6478.00057.xml\n",
+      "Error parsing XSLT file at https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n"
+     ]
+    }
    ],
-   "id": "aa86435960e61937"
+   "execution_count": 28
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "from lxml import etree\n",
-    "\n",
-    "def apply_xslt_to_xml(xslt_path, xml_path):\n",
-    "    xslt = etree.parse(xslt_path)\n",
-    "    xml = etree.parse(xml_path)\n",
-    "    transformer = etree.XSLT(xslt)\n",
-    "    new_xml = transformer(xml)\n",
-    "    return str(new_xml)\n",
-    "\n",
-    "new_xml_str = apply_xslt_to_xml('path_to_your_xslt_file', 'path_to_your_xml_file')\n",
-    "print(new_xml_str)\n"
-   ],
    "metadata": {
-    "collapsed": false
+    "ExecuteTime": {
+     "end_time": "2024-08-03T20:28:45.266893Z",
+     "start_time": "2024-08-03T20:28:42.357601Z"
+    }
    },
-   "id": "cb3b4140ab153c08"
+   "cell_type": "code",
+   "source": "!saxon -s:\"tei/10.1111_1467-6478.00057.xml\" -xsl:\"https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl\"",
+   "id": "2e6d27dc670c0038",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Error on line 6 column 88 of functions.xsl:\r\n",
+      "  XTSE0165  I/O error reported by XML parser processing\r\n",
+      "  https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl.\r\n",
+      "  Caused by java.io.IOException: Server returned HTTP response code: 400 for URL:\r\n",
+      "  https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl\r\n",
+      "I/O error reported by XML parser processing https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl\r\n"
+     ]
+    }
+   ],
+   "execution_count": 29
   },
   {
-   "cell_type": "markdown",
-   "source": [],
-   "metadata": {
-    "collapsed": false
-   },
-   "id": "387b5b9792505b13"
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "1a8a57560f1f4868"
   }
  ],
  "metadata": {
-- 
GitLab