From cf3422532dd6dac06dc3ed00608d594e7ae0acf2 Mon Sep 17 00:00:00 2001 From: Christian Boulanger <info@bibliograph.org> Date: Thu, 22 Aug 2024 10:43:01 +0200 Subject: [PATCH] Added transformations TEI->biblStruct, TEI->MODS and MODS->RIS via bibutils --- convert-anystyle-data/readme.md | 19 +------ convert-anystyle-data/tei-to-bibformats.ipynb | 57 +++++-------------- 2 files changed, 17 insertions(+), 59 deletions(-) diff --git a/convert-anystyle-data/readme.md b/convert-anystyle-data/readme.md index f1b25e0..4f76ac0 100644 --- a/convert-anystyle-data/readme.md +++ b/convert-anystyle-data/readme.md @@ -1,20 +1,5 @@ # Conversion of AnyStyle training data to other formats -This subrepo contains code to convert the existing training data in the AnyStyle formats (XML, TTX) into other formats -that can be used with other tools like prodigy or which are more standardized (such as LinkML) +This subrepo contains code to convert the existing training data in the AnyStyle formats (XML, TTX) into other formats. -Note: The automatic generation of a LinkML schema from the converted JSONL files using the schema-automator tool -introduces a huge dependency tree - use a virtual environment to avoid cluttering your python installation. - -## Content of directories: - -- `in`: AnyStyle Ground Truth for document-level (ttx) and footnote-level (xml) reference information -- `jsonl`: AnyStyle footnote GT converted to a JSONL objects with "in" (Complete footnote as a string) and "out" - (Structured data) fields -- `json`: json files containing a flat list of objects with the structured data of the references in the footnotes -- `schema`: LinkML schema, autogenerated from the json files, not yet annotated. 
- ## Resources -- https://prodi.gy/docs/api-interfaces#spans_manual -- https://linkml.io/linkml/index.html -- https://linkml.io/schema-automator/ \ No newline at end of file +Note: the requirements introduce a huge dependency tree - use a virtual environment to avoid cluttering your Python installation. diff --git a/convert-anystyle-data/tei-to-bibformats.ipynb b/convert-anystyle-data/tei-to-bibformats.ipynb index 8e4cc16..c448d83 100644 --- a/convert-anystyle-data/tei-to-bibformats.ipynb +++ b/convert-anystyle-data/tei-to-bibformats.ipynb @@ -86,7 +86,6 @@ "source": [ "from lxml import etree\n", "import glob\n", - "from urllib.request import urlopen\n", "import requests\n", "\n", "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n", @@ -185,30 +184,14 @@ "source": [ "## Convert MODS to RIS tagged file format\n", "\n", - "This requires the install the bibutils suite of executables https://sourceforge.net/p/bibutils/home/Bibutils/ \n", - "(in windows, install it to the standard WSL distro)" + "This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most distros.\n", + "(On Windows, you will need to install it in the standard WSL distro.)" ], "id": "5e75488ae4379946" }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-22T08:19:43.322537Z", - "start_time": "2024-08-22T08:19:43.087262Z" - } - }, + "metadata": {}, "cell_type": "code", - "source": [ - "import subprocess\n", - "import platform\n", - "\n", - "cmd = ['bash', 'lib/xml2ris.sh']\n", - "if platform.system() == 'Windows':\n", - " cmd = ['wsl.exe', '-e'] + cmd\n", - "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n", - "print(output.decode())" - ], - "id": "fde37a9e4a182bad", "outputs": [ { "name": "stdout", @@ -226,28 +209,18 @@ ] } ], - "execution_count": 83 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-22T08:18:13.159229Z", - "start_time": "2024-08-22T08:18:13.145443Z" - } - }, 
"cell_type": "code", - "source": "", - "id": "8a013a47766a81cc", - "outputs": [], - "execution_count": 80 - }, - { - "metadata": {}, - "cell_type": "code", - "outputs": [], - "execution_count": null, - "source": "", - "id": "bf5722a2500cf1a" + "execution_count": 83, + "source": [ + "import subprocess\n", + "import platform\n", + "\n", + "cmd = ['bash', 'lib/xml2ris.sh']\n", + "if platform.system() == 'Windows':\n", + " cmd = ['wsl.exe', '-e'] + cmd\n", + "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n", + "print(output.decode())" + ], + "id": "fde37a9e4a182bad" } ], "metadata": { -- GitLab