# Convert every TEI file in tei/ into a biblStruct-based TEI file in
# tei-biblstruct/ by applying the OpenArabicPE XSLT stylesheet with lxml.
import glob
import os
import traceback
from urllib.request import urlopen

import requests
from lxml import etree


class HttpsResolver(etree.Resolver):
    """lxml resolver that fetches http(s) resources with ``requests``.

    It is registered on the parser used for a remote stylesheet so that the
    resources the stylesheet references can be downloaded over HTTPS.
    """

    def resolve(self, url, id, context):
        """Return the content behind ``url`` or defer to lxml's default lookup.

        :param url: system URL of the resource requested by the parser
        :param id: public id of the resource (unused)
        :param context: opaque lxml resolver context, passed through unchanged
        :raises requests.HTTPError: if the server answers with an error status
        """
        # Anything that is not an http(s) URL (local paths, missing URLs) is
        # left to the default resolvers by returning None.
        if not url or not url.startswith(('http://', 'https://')):
            return None
        r = requests.get(url, timeout=30)
        # Explicit status check; unlike ``assert`` it is not stripped by -O.
        r.raise_for_status()
        return self.resolve_string(r.content, context, base_url=url)


def apply_xslt(xslt_path, xml_input_path, xml_output_path):
    """Transform one XML file with an XSLT stylesheet and write the result.

    :param xslt_path: local path or http(s) URL of the stylesheet
    :param xml_input_path: path of the XML document to transform
    :param xml_output_path: path of the file the result is written to; its
        parent directory is created if it does not exist yet
    :returns: None. XSLT compile and apply errors are reported on stdout and
        no output file is written in that case.
    """
    try:
        if xslt_path.startswith('http'):
            xml_parser = etree.XMLParser(no_network=False)
            xml_parser.resolvers.add(HttpsResolver())
            with urlopen(xslt_path) as f:
                # Pass the URL as base_url explicitly so relative
                # xsl:import / xsl:include hrefs are resolved against the
                # stylesheet's location.
                xslt_doc = etree.parse(f, parser=xml_parser, base_url=xslt_path)
        else:
            xslt_doc = etree.parse(xslt_path)
        xml_doc = etree.parse(xml_input_path)
        transformer = etree.XSLT(xslt_doc)
        new_xml = transformer(xml_doc)
    except etree.XSLTParseError as e:
        # NOTE(review): lxml wraps libxslt, which implements XSLT 1.0 only.
        # "Failed to compile predicate" presumably means the stylesheet uses
        # XSLT/XPath 2.0+ syntax -- confirm, and if so run it with an
        # XSLT 2.0+ processor such as Saxon instead.
        print(f"Error parsing XSLT file at {xslt_path}: {e}")
        return
    except etree.XSLTApplyError as e:
        print(f"Error applying XSLT file at {xslt_path} to {xml_input_path}: {e}")
        return

    output_dir = os.path.dirname(xml_output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(xml_output_path, 'w', encoding='utf-8') as f:
        # The XSLT result is a tree object, not a string; str() serializes it
        # the way the stylesheet's xsl:output requests.
        f.write(str(new_xml))


xslt_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl'

# glob returns files in arbitrary order; sort for a reproducible run log.
for input_path in sorted(glob.glob('tei/*.xml')):
    print(f'Converting {input_path}')
    base_name = os.path.basename(input_path)
    output_path = f'tei-biblstruct/{base_name}'
    apply_xslt(xslt_url, input_path, output_path)
https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n", + "Converting tei/10.1111_1467-6478.00057.xml\n", + "Error parsing XSLT file at https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl: Failed to compile predicate\n" + ] + } ], - "id": "aa86435960e61937" + "execution_count": 28 }, { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "from lxml import etree\n", - "\n", - "def apply_xslt_to_xml(xslt_path, xml_path):\n", - " xslt = etree.parse(xslt_path)\n", - " xml = etree.parse(xml_path)\n", - " transformer = etree.XSLT(xslt)\n", - " new_xml = transformer(xml)\n", - " return str(new_xml)\n", - "\n", - "new_xml_str = apply_xslt_to_xml('path_to_your_xslt_file', 'path_to_your_xml_file')\n", - "print(new_xml_str)\n" - ], "metadata": { - "collapsed": false + "ExecuteTime": { + "end_time": "2024-08-03T20:28:45.266893Z", + "start_time": "2024-08-03T20:28:42.357601Z" + } }, - "id": "cb3b4140ab153c08" + "cell_type": "code", + "source": "!saxon -s:\"tei/10.1111_1467-6478.00057.xml\" -xsl:\"https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl\"", + "id": "2e6d27dc670c0038", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error on line 6 column 88 of functions.xsl:\r\n", + " XTSE0165 I/O error reported by XML parser processing\r\n", + " https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl.\r\n", + " Caused by java.io.IOException: Server returned HTTP response code: 400 for URL:\r\n", + " https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl\r\n", + "I/O error reported by XML parser processing https://openarabicpe.github.io/../xslt-calendar-conversion/functions/date-functions.xsl\r\n" + ] + } + ], + "execution_count": 29 }, { - "cell_type": "markdown", - 
"source": [], - "metadata": { - "collapsed": false - }, - "id": "387b5b9792505b13" + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "1a8a57560f1f4868" } ], "metadata": { -- GitLab