From 771ba5ae00d0ed641598ab8cb928187f75aff187 Mon Sep 17 00:00:00 2001 From: Christian Boulanger <boulanger@lhlt.mpg.de> Date: Wed, 31 Jul 2024 21:45:59 +0200 Subject: [PATCH] Add XSLT code --- convert-anystyle-data/anystyle-to-tei.ipynb | 83 ++++++++++++++++++++- convert-anystyle-data/lib/.gitignore | 1 + 2 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 convert-anystyle-data/lib/.gitignore diff --git a/convert-anystyle-data/anystyle-to-tei.ipynb b/convert-anystyle-data/anystyle-to-tei.ipynb index f22848a..395c904 100644 --- a/convert-anystyle-data/anystyle-to-tei.ipynb +++ b/convert-anystyle-data/anystyle-to-tei.ipynb @@ -148,7 +148,9 @@ { "metadata": {}, "cell_type": "markdown", - "source": "## Convert Groundd Truth to TEI", + "source": [ + "## Convert Groundd Truth to TEI" + ], "id": "aaf43ee43bb6d4d" }, { @@ -404,19 +406,94 @@ ], "execution_count": 80 }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + }, + "id": "8c8b2d820086d461" + }, { "metadata": {}, "cell_type": "markdown", "source": [ - "## Create LinkML schema from TEI XSD" + "## Extract bibliographic data from TEI files " ], "id": "b0a231dc7bdd8b01" }, + { + "cell_type": "markdown", + "source": [ + "### Download XSLTs" + ], + "metadata": { + "collapsed": false + }, + "id": "149588c08747c4b3" + }, + { + "cell_type": "code", + "execution_count": 6, + "outputs": [], + "source": [ + "import requests, zipfile, io, os\n", + "\n", + "if not os.path.isdir('lib/convert'): \n", + " url = 'https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data/archive/refs/heads/master.zip'\n", + " r = requests.get(url)\n", + " assert r.status_code == 200\n", + " z = zipfile.ZipFile(io.BytesIO(r.content))\n", + " z.extractall('lib')\n", + " z.close()\n", + " os.rename('lib/convert_tei-to-bibliographic-data-master', 'lib/convert')\n" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-07-31T19:42:48.723119300Z", + "start_time": "2024-07-31T19:42:47.234795500Z" + } + }, + "id": "1f15b3af6aab73ed" + }, { "metadata": {}, "cell_type": "markdown", - "source": "", + "source": [ + "### Apply XSLT" + ], "id": "aa86435960e61937" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from lxml import etree\n", + "\n", + "def apply_xslt_to_xml(xslt_path, xml_path):\n", + " xslt = etree.parse(xslt_path)\n", + " xml = etree.parse(xml_path)\n", + " transformer = etree.XSLT(xslt)\n", + " new_xml = transformer(xml)\n", + " return str(new_xml)\n", + "\n", + "new_xml_str = apply_xslt_to_xml('path_to_your_xslt_file', 'path_to_your_xml_file')\n", + "print(new_xml_str)\n" + ], + "metadata": { + "collapsed": false + }, + "id": "cb3b4140ab153c08" + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + }, + "id": "387b5b9792505b13" } ], "metadata": { diff --git a/convert-anystyle-data/lib/.gitignore b/convert-anystyle-data/lib/.gitignore new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/convert-anystyle-data/lib/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file -- GitLab