From cf3422532dd6dac06dc3ed00608d594e7ae0acf2 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <info@bibliograph.org>
Date: Thu, 22 Aug 2024 10:43:01 +0200
Subject: [PATCH] Added transformations TEI->biblStruct, TEI->MODS and
 MODS->RIS via bibutils

---
 convert-anystyle-data/readme.md               | 19 +------
 convert-anystyle-data/tei-to-bibformats.ipynb | 57 +++++--------------
 2 files changed, 17 insertions(+), 59 deletions(-)

diff --git a/convert-anystyle-data/readme.md b/convert-anystyle-data/readme.md
index f1b25e0..4f76ac0 100644
--- a/convert-anystyle-data/readme.md
+++ b/convert-anystyle-data/readme.md
@@ -1,20 +1,5 @@
 # Conversion of AnyStyle training data to other formats
 
-This subrepo contains code to convert the existing training data in the AnyStyle formats (XML, TTX) into other formats
-that can be used with other tools like prodigy or which are more standardized (such as LinkML)
+This subrepo contains code to convert the existing training data in the AnyStyle formats (XML, TTX) into other formats.
 
-Note: The automatic generation of a LinkML schema from the converted JSONL files using the schema-automator tool 
-introduces a huge dependency tree - use a virtual environment to avoid cluttering your python installation.
-
-## Content of directories:
-
-- `in`: AnyStyle Ground Truth for document-level (ttx) and footnote-level (xml) reference information
-- `jsonl`: AnyStyle footnote GT converted to a JSONL objects with "in" (Complete footnote as a string) and "out"
-  (Structured data) fields
-- `json`: json files containing a flat list of objects with the structured data of the references in the footnotes
-- `schema`: LinkML schema, autogenerated from the json files, not yet annotated.
-
-## Resources
-- https://prodi.gy/docs/api-interfaces#spans_manual
-- https://linkml.io/linkml/index.html
-- https://linkml.io/schema-automator/
\ No newline at end of file
+Note: the requirements pull in a large dependency tree - use a virtual environment to avoid cluttering your Python installation.
diff --git a/convert-anystyle-data/tei-to-bibformats.ipynb b/convert-anystyle-data/tei-to-bibformats.ipynb
index 8e4cc16..c448d83 100644
--- a/convert-anystyle-data/tei-to-bibformats.ipynb
+++ b/convert-anystyle-data/tei-to-bibformats.ipynb
@@ -86,7 +86,6 @@
    "source": [
     "from lxml import etree\n",
     "import glob\n",
-    "from urllib.request import urlopen\n",
     "import requests\n",
     "\n",
     "def apply_xslt(xslt_path, xml_input_path, xml_output_path):\n",
@@ -185,30 +184,14 @@
    "source": [
     "## Convert MODS to RIS tagged file format\n",
     "\n",
-    "This requires the install the bibutils suite of executables https://sourceforge.net/p/bibutils/home/Bibutils/ \n",
-    "(in windows, install it to the standard WSL distro)"
+    "This requires installing the bibutils suite of executables (https://sourceforge.net/p/bibutils/home/Bibutils/), which is available in most distros.\n",
+    "(on Windows, you will need to install it in the standard WSL distro)"
    ],
    "id": "5e75488ae4379946"
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-08-22T08:19:43.322537Z",
-     "start_time": "2024-08-22T08:19:43.087262Z"
-    }
-   },
+   "metadata": {},
    "cell_type": "code",
-   "source": [
-    "import subprocess\n",
-    "import platform\n",
-    "\n",
-    "cmd = ['bash', 'lib/xml2ris.sh']\n",
-    "if platform.system() == 'Windows':\n",
-    "    cmd = ['wsl.exe', '-e'] + cmd\n",
-    "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n",
-    "print(output.decode())"
-   ],
-   "id": "fde37a9e4a182bad",
    "outputs": [
     {
      "name": "stdout",
@@ -226,28 +209,18 @@
      ]
     }
    ],
-   "execution_count": 83
-  },
-  {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-08-22T08:18:13.159229Z",
-     "start_time": "2024-08-22T08:18:13.145443Z"
-    }
-   },
-   "cell_type": "code",
-   "source": "",
-   "id": "8a013a47766a81cc",
-   "outputs": [],
-   "execution_count": 80
-  },
-  {
-   "metadata": {},
-   "cell_type": "code",
-   "outputs": [],
-   "execution_count": null,
-   "source": "",
-   "id": "bf5722a2500cf1a"
+   "execution_count": 83,
+   "source": [
+    "import subprocess\n",
+    "import platform\n",
+    "\n",
+    "cmd = ['bash', 'lib/xml2ris.sh']\n",
+    "if platform.system() == 'Windows':\n",
+    "    cmd = ['wsl.exe', '-e'] + cmd\n",
+    "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n",
+    "print(output.decode())"
+   ],
+   "id": "fde37a9e4a182bad"
   }
  ],
  "metadata": {
-- 
GitLab