From 2d5976487027bdbbaf8eeff6a9de9c007827ecc6 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <info@bibliograph.org>
Date: Thu, 22 Aug 2024 11:02:12 +0200
Subject: [PATCH] Download Saxon programmatically

---
Review notes (ignored by git am / git apply):
- This patch was restored from a copy whose line breaks and indentation had
  been collapsed. The JSON indentation (1 space per level) and the indentation
  inside the embedded Python strings (4 spaces) are reconstructed - TODO
  confirm against the pre-image blob c448d83 before applying.
- The download cell was revised during review: the pointless stream=True was
  dropped, a timeout and raise_for_status() were added, the archive is closed
  via a with-block, and the download is skipped when the target directory is
  already populated. The post-image blob hash on the index line therefore no
  longer matches the resulting file.

 convert-anystyle-data/tei-to-bibformats.ipynb | 72 ++++++-------------
 1 file changed, 23 insertions(+), 49 deletions(-)

diff --git a/convert-anystyle-data/tei-to-bibformats.ipynb b/convert-anystyle-data/tei-to-bibformats.ipynb
index c448d83..8520080 100644
--- a/convert-anystyle-data/tei-to-bibformats.ipynb
+++ b/convert-anystyle-data/tei-to-bibformats.ipynb
@@ -69,71 +69,45 @@
    "metadata": {},
    "cell_type": "markdown",
    "source": [
-    "## Extract bibliographic data from TEI files using XSLT\n",
+    "## Download the Saxon jar\n",
     "\n",
-    "### Using lxml - currently not working\n"
+    "As the XSLT stylesheet uses XSLT 2.0 features, and there are no native Python XSLT 2.0 processors, we need to use the Saxon processor.\n"
    ],
-   "id": "d08d51f8767602c5"
+   "id": "781d0e0e7a9dd346"
   },
   {
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2024-08-21T16:24:16.226255Z",
-     "start_time": "2024-08-21T16:24:16.196421Z"
+     "end_time": "2024-08-22T08:58:42.538326Z",
+     "start_time": "2024-08-22T08:58:34.687673Z"
     }
    },
    "cell_type": "code",
    "source": [
-    "from lxml import etree\n",
-    "import glob\n",
     "import requests\n",
-    "\n",
-    "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
-    "    try:\n",
-    "        xslt_doc = etree.parse(xslt_path)\n",
-    "        xml_doc = etree.parse(xml_input_path)\n",
-    "        transformer = etree.XSLT(xslt_doc)\n",
-    "        new_xml = transformer(xml_doc)\n",
-    "        with open(xml_output_path, 'w', encoding='utf-8') as f:\n",
-    "            f.write(new_xml)\n",
-    "    except etree.XSLTParseError as e:\n",
-    "        print(f\"Error parsing XSLT file at {xslt_path}: {e}\")\n",
-    "\n",
-    "for input_path in glob.glob('tei/*.xml'):\n",
-    "    print(f'Converting {input_path}')\n",
-    "    base_name = os.path.basename(input_path)\n",
-    "    output_path = f'tmp/{base_name.replace(\".xml\", \"-mods.xml\")}'\n",
-    "    apply_xslt('lib/xslt/convert_tei-to-mods_bibl.xsl', input_path, output_path)\n"
-   ],
-   "id": "af437a5ab3cc41a3",
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Converting tei\\10.1111_1467-6478.00057.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1111_1467-6478.00080.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0103.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n",
-      "Converting tei\\10.1515_zfrs-1980-0104.xml\n",
-      "Error parsing XSLT file at lib/xslt/convert_tei-to-mods_bibl.xsl: xsltParseStylesheetTop: ignoring misplaced import element\n"
-     ]
-    }
+    "import zipfile\n",
+    "import io\n",
+    "import os\n",
+    "url = \"https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\"\n",
+    "target_dir = 'lib/SaxonHE12-5'\n",
+    "# Skip the download when an earlier run has already unpacked the archive\n",
+    "if not (os.path.isdir(target_dir) and os.listdir(target_dir)):\n",
+    "    response = requests.get(url, timeout=60)\n",
+    "    # Fail with the HTTP status instead of a BadZipFile on an error page\n",
+    "    response.raise_for_status()\n",
+    "    os.makedirs(target_dir, exist_ok=True)\n",
+    "    with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:\n",
+    "        file_zip.extractall(path=target_dir)"
    ],
-   "execution_count": 41
+   "id": "72b688e9b2e0d1f2",
+   "outputs": [],
+   "execution_count": 86
   },
   {
    "metadata": {},
    "cell_type": "markdown",
-   "source": [
-    "### Using Saxon:\n",
-    "\n",
-    "- download ZIP from https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\n",
-    "- unpack in lib/SaxonHE12-5J"
-   ],
-   "id": "781d0e0e7a9dd346"
+   "source": "## Run the transformation",
+   "id": "1bbb36ac0f4fd1b5"
   },
   {
    "metadata": {
-- 
GitLab