{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Convert the generated TEI to bibliographic formats\n"
   ],
   "id": "2cdf8ba1eefa38e0"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Download required XSLT documents\n",
    "\n",
    "We use the XSLT stylesheets provided by the [OpenArabicPE/convert_tei-to-bibliographic-data](https://github.com/OpenArabicPE/convert_tei-to-bibliographic-data) project."
   ],
   "id": "db65c4065691c578"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-27T20:23:35.718829Z",
     "start_time": "2024-08-27T20:23:30.937360Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import os\n",
    "from urllib.parse import urljoin\n",
    "import requests\n",
    "from lxml import etree\n",
    "\n",
    "# URLs fetched so far; shared by all calls so that a stylesheet is downloaded only once\n",
    "cache = set()\n",
    "\n",
    "def download_xslt(url, target_dir = 'lib/xslt'):\n",
    "    \"\"\"Download an XML/XSLT document together with the stylesheets it references.\n",
    "\n",
    "    The document at `url` is fetched and parsed. Every `import` or `include`\n",
    "    element that has an `href` is resolved against `url` and downloaded\n",
    "    recursively (each URL at most once, tracked in the module-level `cache`).\n",
    "    Finally the document itself is written to `target_dir` under the base name\n",
    "    of its URL. Referenced stylesheets are stored flat in `target_dir`,\n",
    "    whatever relative path their `href` had.\n",
    "\n",
    "    :param url: absolute URL of the document to download\n",
    "    :param target_dir: directory the files are written to (created if missing)\n",
    "    :raises requests.HTTPError: if a download returns an error status\n",
    "    :raises lxml.etree.XMLSyntaxError: if a downloaded document is not well-formed XML\n",
    "    \"\"\"\n",
    "    # register the document itself so that circular references do not fetch it again\n",
    "    cache.add(url)\n",
    "    # a timeout keeps the notebook from hanging forever on a stalled connection\n",
    "    response = requests.get(url, timeout=60)\n",
    "    response.raise_for_status()\n",
    "    doc = etree.fromstring(response.content)\n",
    "    # xsl:include pulls in further stylesheets just like xsl:import does\n",
    "    for elem in doc.xpath('//*[local-name() = \"import\" or local-name() = \"include\"]'):\n",
    "        href = elem.get('href')\n",
    "        if not href:\n",
    "            # without href there is nothing to resolve; urljoin would return `url` itself\n",
    "            continue\n",
    "        import_url = urljoin(url, href)\n",
    "        if import_url not in cache:\n",
    "            cache.add(import_url)\n",
    "            download_xslt(import_url, target_dir)\n",
    "    os.makedirs(target_dir, exist_ok=True)\n",
    "    with open(os.path.join(target_dir, os.path.basename(url)), 'wb') as f:\n",
    "        f.write(response.content)\n",
    "    print(f'Downloaded {os.path.basename(url)} to {target_dir}')\n",
    "\n",
    "# TEI -> biblStruct / MODS / Zotero RDF (stylesheets published by OpenArabicPE)\n",
    "base_url = 'https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt'\n",
    "xslt_docs = [\n",
    "    'convert_tei-to-biblstruct_bibl.xsl',\n",
    "    'convert_tei-to-mods_bibl.xsl',\n",
    "    'convert_tei-to-zotero-rdf_bibl.xsl',\n",
    "]\n",
    "for xslt_doc in xslt_docs:\n",
    "    download_xslt('/'.join((base_url, xslt_doc)))\n",
    "\n",
    "# MODS -> Bibframe (stylesheet and language crosswalk from the Library of Congress)\n",
    "base_url = 'https://www.loc.gov/standards/mods/v3'\n",
    "xslt_docs = [\n",
    "    'MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',\n",
    "    'conf/languageCrosswalk.xml',\n",
    "]\n",
    "for xslt_doc in xslt_docs:\n",
    "    # keep the sub-directory of the document (e.g. conf/) below lib/xslt\n",
    "    download_xslt('/'.join((base_url, xslt_doc)), target_dir='lib/xslt/' + os.path.dirname(xslt_doc))\n"
   ],
   "id": "1de7cedbb3514188",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloaded parameters.xsl to lib/xslt\n",
      "Downloaded functions.xsl to lib/xslt\n",
      "Downloaded convert_tei-to-biblstruct_functions.xsl to lib/xslt\n",
      "Downloaded convert_tei-to-biblstruct_bibl.xsl to lib/xslt\n",
      "Downloaded date-functions.xsl to lib/xslt\n",
      "Downloaded convert_tei-to-mods_functions.xsl to lib/xslt\n",
      "Downloaded convert_tei-to-mods_bibl.xsl to lib/xslt\n",
      "Downloaded convert_tei-to-zotero-rdf_bibl.xsl to lib/xslt\n",
      "Downloaded MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to lib/xslt/\n",
      "Downloaded languageCrosswalk.xml to lib/xslt/conf\n"
     ]
    }
   ],
   "execution_count": 30
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Download the Saxon jar\n",
    "\n",
    "As the stylesheets use XSLT 2.0 features and there is no native Python XSLT 2.0 processor, we need to use the Saxon processor, which is run with Java. Possible alternatives (untested):\n",
    "  - https://pypi.org/project/saxonpy\n",
    "  - https://github.com/cts2/pyjxslt\n"
   ],
   "id": "781d0e0e7a9dd346"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-22T08:58:42.538326Z",
     "start_time": "2024-08-22T08:58:34.687673Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import requests\n",
    "import zipfile\n",
    "import io\n",
    "import os\n",
    "url = 'https://github.com/Saxonica/Saxon-HE/releases/download/SaxonHE12-5/SaxonHE12-5J.zip'\n",
    "# must match the jar location used by transform(): lib/SaxonHE12-5J/saxon-he-12.5.jar\n",
    "target_dir = 'lib/SaxonHE12-5J'\n",
    "response = requests.get(url, timeout=300)\n",
    "# fail here on an HTTP error instead of handing an error page to zipfile\n",
    "response.raise_for_status()\n",
    "os.makedirs(target_dir, exist_ok=True)\n",
    "# the context manager closes the archive once everything is extracted\n",
    "with zipfile.ZipFile(io.BytesIO(response.content)) as file_zip:\n",
    "    file_zip.extractall(path=target_dir)"
   ],
   "id": "72b688e9b2e0d1f2",
   "outputs": [],
   "execution_count": 86
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Run the transformation to biblStruct & MODS"
   ],
   "id": "1bbb36ac0f4fd1b5"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T14:07:42.390943Z",
     "start_time": "2024-08-31T14:07:42.360658Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import subprocess\n",
    "import os\n",
    "from glob import glob\n",
    "\n",
    "def transform(xslt_path, input_path='tei', output_path='.', rename_extension:tuple=None,\n",
    "              saxon_jar='lib/SaxonHE12-5J/saxon-he-12.5.jar'):\n",
    "    \"\"\"Apply an XSLT stylesheet with Saxon and optionally rename the result files.\n",
    "\n",
    "    Runs `java -jar <saxon_jar>` with `input_path` as source (`-s:`),\n",
    "    `xslt_path` as stylesheet (`-xsl:`) and `output_path` as output (`-o:`).\n",
    "    The stylesheet parameters `p_target-language=de`, `p_github-action=true`\n",
    "    and `p_output-folder=<output_path>` are always passed.\n",
    "\n",
    "    :param xslt_path: path to the XSLT stylesheet\n",
    "    :param input_path: source passed to Saxon (a directory in this notebook)\n",
    "    :param output_path: output location passed to Saxon\n",
    "    :param rename_extension: optional `(from_suffix, to_suffix)` pair; after a\n",
    "        successful run, `*.xml` files in `output_path` whose name ends with\n",
    "        `from_suffix` get that suffix replaced by `to_suffix`\n",
    "    :param saxon_jar: path to the Saxon-HE jar\n",
    "    :return: the `subprocess.CompletedProcess` of the Saxon run\n",
    "    :raises RuntimeError: with Saxon's stderr if it exits with a non-zero code\n",
    "    \"\"\"\n",
    "    input_path = os.path.normpath(input_path)\n",
    "    xslt_path = os.path.normpath(xslt_path)\n",
    "    cmd = ['java', '-jar', saxon_jar,\n",
    "           f'-s:{input_path}', f'-xsl:{xslt_path}', f'-o:{output_path}',\n",
    "           'p_target-language=de', 'p_github-action=true', f'p_output-folder={output_path}']\n",
    "    process = subprocess.run(cmd, capture_output=True, text=True)\n",
    "    # check for failure first so that a broken run never renames partial or stale output\n",
    "    if process.returncode != 0:\n",
    "        raise RuntimeError(process.stderr)\n",
    "    if rename_extension:\n",
    "        from_extension, to_extension = rename_extension[:2]\n",
    "        for filename in glob(f'{output_path}/*.xml'):\n",
    "            if filename.endswith(from_extension):\n",
    "                # swap only the trailing suffix, not other occurrences inside the path\n",
    "                stem = filename[:len(filename) - len(from_extension)]\n",
    "                os.replace(filename, stem + to_extension)\n",
    "    print(f'Applied {xslt_path} to files in {input_path} and saved result in {output_path}.')\n",
    "    return process"
   ],
   "id": "d4a6c9620d0199eb",
   "outputs": [],
   "execution_count": 53
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T14:07:52.212627Z",
     "start_time": "2024-08-31T14:07:45.196851Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# TEI -> biblStruct\n",
    "transform(\n",
    "    xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',\n",
    "    output_path='biblStruct',\n",
    "    rename_extension=('-bibl_biblStruct.TEIP5.xml', '.biblStruct.xml'),\n",
    ")\n",
    "\n",
    "# TEI -> MODS\n",
    "transform(\n",
    "    xslt_path='lib/xslt/convert_tei-to-mods_bibl.xsl',\n",
    "    output_path='mods',\n",
    "    rename_extension=('-bibl.MODS.xml', '.mods.xml'),\n",
    ")\n",
    "\n",
    "# Clean up both output folders: files whose name starts with the folder name\n",
    "# lose that prefix, every other file in the folder is deleted.\n",
    "for dir_name in ('biblStruct', 'mods'):\n",
    "    for filename in glob(f'{dir_name}/*'):\n",
    "        parent, base = os.path.split(filename)\n",
    "        if not base.startswith(dir_name):\n",
    "            os.remove(filename)\n",
    "            continue\n",
    "        os.replace(filename, f'{parent}/{base.removeprefix(dir_name)}')\n"
   ],
   "id": "34087ef2f498ffa6",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Applied lib\\xslt\\convert_tei-to-biblstruct_bibl.xsl to files in tei and saved result in biblStruct.\n",
      "Applied lib\\xslt\\convert_tei-to-mods_bibl.xsl to files in tei and saved result in mods.\n"
     ]
    }
   ],
   "execution_count": 54
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Convert MODS to RIS tagged file format\n",
    "\n",
    "This requires installing the [Bibutils suite of executables](https://sourceforge.net/p/bibutils/home/Bibutils), which is available in most Linux distributions.\n",
    "If you are on Windows, you will need to install it in your default WSL distribution."
   ],
   "id": "5e75488ae4379946"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T14:08:03.605705Z",
     "start_time": "2024-08-31T14:08:03.315458Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import subprocess\n",
    "import platform\n",
    "\n",
    "# the conversion is done by a shell script; on Windows it is started through WSL\n",
    "cmd = ['bash', 'lib/run-bibutils.sh', 'xml2ris']\n",
    "cmd = ['wsl.exe', '-e', *cmd] if platform.system() == 'Windows' else cmd\n",
    "# stderr is merged into stdout so that both end up in the printed log\n",
    "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n",
    "print(output.decode())"
   ],
   "id": "fde37a9e4a182bad",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running xml2ris to convert mods/10.1111_1467-6478.00057.mods.xml to ris/10.1111_1467-6478.00057.ris...\n",
      "Running xml2ris to convert mods/10.1111_1467-6478.00080.mods.xml to ris/10.1111_1467-6478.00080.ris...\n",
      "Running xml2ris to convert mods/10.1515_zfrs-1980-0103.mods.xml to ris/10.1515_zfrs-1980-0103.ris...\n",
      "Running xml2ris to convert mods/10.1515_zfrs-1980-0104.mods.xml to ris/10.1515_zfrs-1980-0104.ris...\n"
     ]
    }
   ],
   "execution_count": 55
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Convert MODS -> Bibframe RDF -> JSON-LD\n",
    "\n",
    "See:\n",
    " - https://www.loc.gov/standards/mods/modsrdf/mods3-7-bibframe2-0-mapping.html\n",
    " - https://rdflib.readthedocs.io/   "
   ],
   "id": "61f6cfe7d3de482a"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T14:08:19.285923Z",
     "start_time": "2024-08-31T14:08:11.824941Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# MODS -> Bibframe; result files are renamed from *.mods.xml to *.bibframe.xml\n",
    "transform(\n",
    "    'lib/xslt/MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl',\n",
    "    input_path='mods',\n",
    "    output_path='bibframe',\n",
    "    rename_extension=('.mods.xml', '.bibframe.xml'),\n",
    ")"
   ],
   "id": "6ba739963096f858",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Applied lib\\xslt\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl to files in mods and saved result in bibframe.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CompletedProcess(args=['java', '-jar', 'lib/SaxonHE12-5J/saxon-he-12.5.jar', '-s:mods', '-xsl:lib\\\\xslt\\\\MODS3-7_Bibframe2-0_XSLT2-0_20230505.xsl', '-o:bibframe', 'p_target-language=de', 'p_github-action=true', 'p_output-folder=bibframe'], returncode=0, stdout='', stderr='')"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 56
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T14:08:24.493962Z",
     "start_time": "2024-08-31T14:08:22.772571Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from rdflib import Graph\n",
    "# make sure the output directory exists before serializing into it\n",
    "os.makedirs('json-ld', exist_ok=True)\n",
    "# sorted() gives a stable processing order across runs and platforms\n",
    "for in_path in sorted(glob('bibframe/*')):\n",
    "    out_file = os.path.basename(in_path).replace('.bibframe.xml','.json')\n",
    "    # one fresh graph per file so that the records are not merged\n",
    "    g = Graph()\n",
    "    g.parse(in_path)\n",
    "    g.serialize(destination=f'json-ld/{out_file}', format='json-ld')\n"
   ],
   "id": "4cb509fa7f296d1f",
   "outputs": [],
   "execution_count": 57
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [],
   "id": "8ce07a1a294b5408"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Convert MODS -> BibTex -> CSL-JSON\n",
    "\n",
    "This also requires Bibutils and, in addition, the [pandoc executable](https://pandoc.org/installing.html).\n"
   ],
   "id": "be771aec518bf10a"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T20:39:19.077022Z",
     "start_time": "2024-08-31T20:39:18.691518Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# MODS -> BibTeX\n",
    "import subprocess\n",
    "import platform\n",
    "\n",
    "# the conversion is done by a shell script; on Windows it is started through WSL\n",
    "cmd = ['bash', 'lib/run-bibutils.sh', 'xml2bib']\n",
    "cmd = ['wsl.exe', '-e', *cmd] if platform.system() == 'Windows' else cmd\n",
    "# stderr is merged into stdout so that both end up in the printed log\n",
    "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n",
    "print(output.decode())"
   ],
   "id": "3e95a38e223dae51",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running xml2bib to convert mods/10.1111_1467-6478.00057.mods.xml to bib/10.1111_1467-6478.00057.bib...\n",
      "Running xml2bib to convert mods/10.1111_1467-6478.00080.mods.xml to bib/10.1111_1467-6478.00080.bib...\n",
      "Running xml2bib to convert mods/10.1515_zfrs-1980-0103.mods.xml to bib/10.1515_zfrs-1980-0103.bib...\n",
      "Running xml2bib to convert mods/10.1515_zfrs-1980-0104.mods.xml to bib/10.1515_zfrs-1980-0104.bib...\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-08-31T20:40:43.883111Z",
     "start_time": "2024-08-31T20:40:37.701850Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# BibTeX to CSL\n",
    "\n",
    "# pandoc is driven by a shell script; on Windows it is started through WSL\n",
    "cmd = ['bash', 'lib/run-pandoc.sh', 'bibtex', 'csljson']\n",
    "cmd = ['wsl.exe', '-e', *cmd] if platform.system() == 'Windows' else cmd\n",
    "# stderr is merged into stdout so that both end up in the printed log\n",
    "output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)\n",
    "print(output.decode())"
   ],
   "id": "a2cdccaf919c268e",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running citeproc to convert bib/10.1111_1467-6478.00057.bib to csljson/10.1111_1467-6478.00057.csl.json...\n",
      "Running citeproc to convert bib/10.1111_1467-6478.00080.bib to csljson/10.1111_1467-6478.00080.csl.json...\n",
      "Running citeproc to convert bib/10.1515_zfrs-1980-0103.bib to csljson/10.1515_zfrs-1980-0103.csl.json...\n",
      "Running citeproc to convert bib/10.1515_zfrs-1980-0104.bib to csljson/10.1515_zfrs-1980-0104.csl.json...\n"
     ]
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [],
   "id": "77ff73f83a1db70e"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}