{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "# Convert the generated TEI to bibliographic formats\n" ], "id": "2cdf8ba1eefa38e0" }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Download required XSLT documents\n", "\n", "we use XSLT provided by https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data " ], "id": "db65c4065691c578" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-27T20:23:35.718829Z", "start_time": "2024-08-27T20:23:30.937360Z" } }, "cell_type": "code", "source": [ "import os\n", "from urllib.parse import urljoin\n", "import requests\n", "from lxml import etree\n", "\n", "cache = set()\n", "\n", "def download_xslt(url, target_dir = 'lib/xslt'):\n", " \"\"\"written by GPT-4\"\"\"\n", " response = requests.get(url)\n", " response.raise_for_status()\n", " doc = etree.fromstring(response.content)\n", " for elem in doc.xpath('//*[local-name() = \"import\"]'):\n", " import_url = urljoin(url, elem.get('href'))\n", " if import_url not in cache:\n", " cache.add(import_url)\n", " download_xslt(import_url, target_dir)\n", " os.makedirs(target_dir, exist_ok=True)\n", " with open(os.path.join(target_dir, os.path.basename(url)), 'wb') as f:\n", " f.write(response.content)\n", " print(f'Downloaded {os.path.basename(url)} to {target_dir}')\n", "\n", "# TEI -> BiblStruct/MODS\n", "base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'\n", "xslt_docs = ['convert_tei-to-biblstruct_bibl.xsl', \n", " 'convert_tei-to-mods_bibl.xsl',\n", " 'convert_tei-to-zotero-rdf_bibl.xsl']\n", "for xslt_doc in xslt_docs:\n", " download_xslt(f'{base_url}/{xslt_doc}')\n", "\n", "# MODS -> BIBO-RDF\n", "base_url = 'https://www.loc.gov/standards/mods/v3'\n", "xslt_docs = ['MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', 'conf/languageCrosswalk.xml']\n", "for xslt_doc in xslt_docs:\n", " download_xslt(f'{base_url}/{xslt_doc}', target_dir=f'lib/xslt/{os.path.dirname(xslt_doc)}')\n" ], "id": "1de7cedbb3514188", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloaded parameters.xsl to lib/xslt\n", "Downloaded functions.xsl to lib/xslt\n", "Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt\n", "Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt\n", "Downloaded date-functions.xsl to lib/xslt\n", "Downloaded convert_tei-to-mods_functions.xsl to lib/xslt\n", "Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt\n", "Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt\n", "Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/\n", "Downloaded languageCrosswalk.xml to lib/xslt/conf\n" ] } ], "execution_count": 30 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Download the Saxon jar\n", "\n", "As the xslt uses v2.0 features, and there are no native-python xslt-2.0 processors, we need to use the Saxon processor. Possible alternatives (untested):\n", " - https://pypi.org/project/saxonpy\n", " - https://github.com/cts2/pyjxslt\n" ], "id": "781d0e0e7a9dd346" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-22T08:58:42.538326Z", "start_time": "2024-08-22T08:58:34.687673Z" } }, "cell_type": "code", "source": [ "import requests\n", "import zipfile\n", "import io\n", "import os\n", "url = \"https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip\"\n", "target_dir = 'lib/SaxonHE12-5'\n", "response = requests.get(url, stream=True)\n", "file_zip = zipfile.ZipFile(io.BytesIO(response.content))\n", "os.makedirs(target_dir, exist_ok=True)\n", "file_zip.extractall(path=target_dir)" ], "id": "72b688e9b2e0d1f2", "outputs": [], "execution_count": 86 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Run the transformation to biblStruct & MODS" ], "id": "1bbb36ac0f4fd1b5" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T14:07:42.390943Z", "start_time": "2024-08-31T14:07:42.360658Z" } }, "cell_type": "code", "source": [ "import subprocess\n", "import os\n", "from glob import glob\n", "\n", "def transform(xslt_path, input_path='tei', output_path='.', rename_extension:tuple=None):\n", " input_path = os.path.normpath(input_path)\n", " xslt_path = os.path.normpath(xslt_path)\n", " cmd = ['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', \n", " f'-s:{input_path}', f'-xsl:{xslt_path}', f'-o:{output_path}',\n", " 'p_target-language=de', 'p_github-action=true', f'p_output-folder={output_path}']\n", " process = subprocess.run(cmd, capture_output=True, text=True)\n", " if rename_extension:\n", " from_extension = rename_extension[0]\n", " to_extension = rename_extension[1]\n", " for filename in glob(f'{output_path}/*.xml'):\n", " if filename.endswith(from_extension):\n", " os.replace(filename, filename.replace(from_extension, to_extension))\n", " if process.returncode != 0:\n", " raise RuntimeError(process.stderr)\n", " print(f'Applied {xslt_path} to files in {input_path} and saved result in {output_path}.')\n", " return process" ], "id": "d4a6c9620d0199eb", "outputs": [], "execution_count": 53 }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T14:07:52.212627Z", "start_time": "2024-08-31T14:07:45.196851Z" } }, "cell_type": "code", "source": [ "# TEI -> biblstruct\n", "transform(xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl', output_path='biblStruct', rename_extension=('-bibl_biblStruct.TEIP5.xml','.biblStruct.xml'))\n", "\n", "# TEI -> MODS\n", "transform(xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl', output_path='mods', rename_extension=('-bibl.MODS.xml','.mods.xml'))\n", "\n", "# rename wanted and delete unwanted empty files\n", "for dir_name in ['biblStruct', 'mods']:\n", " for filename in glob(f'{dir_name}/*'):\n", " if os.path.basename(filename).startswith(f'{dir_name}'):\n", " os.replace(filename, f'{os.path.dirname(filename)}/{os.path.basename(filename).removeprefix(dir_name)}')\n", " else:\n", " os.remove(filename)\n" ], "id": "34087ef2f498ffa6", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Applied lib\\xslt\\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct.\n", "Applied lib\\xslt\\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods.\n" ] } ], "execution_count": 54 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Convert MODS to RIS tagged file format\n", "\n", "This requires the install the [Bibutils suite of executables](https://sourceforge.net/p/bibutils/home/Bibutils) available in most distros.\n", "If you are on Windows, you will need to install it to the standard WSL distro." ], "id": "5e75488ae4379946" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T14:08:03.605705Z", "start_time": "2024-08-31T14:08:03.315458Z" } }, "cell_type": "code", "source": [ "import subprocess\n", "import platform\n", "\n", "cmd = ['bash', 'lib/run-bibutils.sh', 'xml2ris']\n", "if platform.system() == 'Windows':\n", " cmd = ['wsl.exe', '-e'] + cmd\n", "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n", "print(output.decode())" ], "id": "fde37a9e4a182bad", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris...\n", "Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris...\n", "Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris...\n", "Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris...\n" ] } ], "execution_count": 55 }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Convert MODS -> Bibframe RDF -> JSON-LD\n", "\n", "See:\n", " - https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html\n", " - https://rdflib.readthedocs.io/ " ], "id": "61f6cfe7d3de482a" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T14:08:19.285923Z", "start_time": "2024-08-31T14:08:11.824941Z" } }, "cell_type": "code", "source": [ "# MODS -> Bibframe\n", "transform(xslt_path='lib/xslt/MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', \n", " input_path='mods', output_path='bibframe', \n", " rename_extension=('.mods.xml','.bibframe.xml'))" ], "id": "6ba739963096f858", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Applied lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe.\n" ] }, { "data": { "text/plain": [ "CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\\\xslt\\\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='')" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 56 }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T14:08:24.493962Z", "start_time": "2024-08-31T14:08:22.772571Z" } }, "cell_type": "code", "source": [ "from rdflib import Graph\n", "for in_path in glob(f'bibframe/*'):\n", " out_file = os.path.basename(in_path).replace('.bibframe.xml','.json')\n", " g = Graph()\n", " g.parse(in_path)\n", " g.serialize(destination=f'json-ld/{out_file}', format='json-ld')\n" ], "id": "4cb509fa7f296d1f", "outputs": [], "execution_count": 57 }, { "metadata": {}, "cell_type": "markdown", "source": [], "id": "8ce07a1a294b5408" }, { "metadata": {}, "cell_type": "markdown", "source": [ "## Convert MODS -> BibTex -> CSL-JSON\n", "\n", "This also requires Bibutils and additionally, the [pandoc executable](https://pandoc.org/installing.html).\n" ], "id": "be771aec518bf10a" }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T20:39:19.077022Z", "start_time": "2024-08-31T20:39:18.691518Z" } }, "cell_type": "code", "source": [ "# MODS -> BibTeX\n", "import subprocess\n", "import platform\n", "\n", "cmd = ['bash', 'lib/run-bibutils.sh', 'xml2bib']\n", "if platform.system() == 'Windows':\n", " cmd = ['wsl.exe', '-e'] + cmd\n", "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n", "print(output.decode())" ], "id": "3e95a38e223dae51", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib...\n", "Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib...\n", "Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib...\n", "Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib...\n" ] } ], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2024-08-31T20:40:43.883111Z", "start_time": "2024-08-31T20:40:37.701850Z" } }, "cell_type": "code", "source": [ "# BibTeX to CSL\n", "\n", "cmd = ['bash', 'lib/run-pandoc.sh', 'bibtex', 'csljson']\n", "if platform.system() == 'Windows':\n", " cmd = ['wsl.exe', '-e'] + cmd\n", "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n", "print(output.decode())" ], "id": "a2cdccaf919c268e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json...\n", "Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json...\n", "Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json...\n", "Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json...\n" ] } ], "execution_count": 4 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [], "id": "77ff73f83a1db70e" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }