diff --git a/convert-anystyle-data/anystyle-to-tei.ipynb b/convert-anystyle-data/anystyle-to-tei.ipynb index 0cf07229b1e7c61bad533a7feaab388b19e1f62c..40464bd53e45b721e847e29d858343e496ae6824 100644 --- a/convert-anystyle-data/anystyle-to-tei.ipynb +++ b/convert-anystyle-data/anystyle-to-tei.ipynb @@ -23,13 +23,21 @@ }, "id": "4c77ab592c98dfd" }, + { + "cell_type": "markdown", + "source": [ + "Cache XML schema for offline use" + ], + "metadata": { + "collapsed": false + }, + "id": "c4ebd32b98166eb" + }, { "cell_type": "code", "source": [ "import xmlschema\n", "import os\n", - "\n", - "# cache for local use\n", "if not os.path.isdir(\"schema/tei\"):\n", " schema = xmlschema.XMLSchema(\"https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd\")\n", " schema.export(target='schema/tei', save_remote=True)" @@ -45,6 +53,16 @@ "outputs": [], "execution_count": 2 }, + { + "cell_type": "markdown", + "source": [ + "This generates JSON data with information on the tags used, extracting from the schema and from the documentation pages" + ], + "metadata": { + "collapsed": false + }, + "id": "3019ff70c4b769cd" + }, { "cell_type": "code", "source": [ @@ -149,7 +167,9 @@ "metadata": {}, "cell_type": "markdown", "source": [ - "## Convert Groundd Truth to TEI" + "## Convert Ground Truth to TEI\n", + "\n", + "This converts the AnyStyle XML data to TEI, translating from the flat schema to the nested TEI `<bibl>` structure.\n" ], "id": "aaf43ee43bb6d4d" }, @@ -582,7 +602,9 @@ } }, "cell_type": "code", - "source": "!saxon -s:\"tei/10.1111_1467-6478.00057.xml\" -xsl:\"https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl\"", + "source": [ + "!saxon -s:\"tei/10.1111_1467-6478.00057.xml\" -xsl:\"https://openarabicpe.github.io/convert_tei-to-bibliographic-data/xslt/convert_tei-to-biblstruct_bibl.xsl\"" + ], "id": "2e6d27dc670c0038", "outputs": [ { @@ -605,7 +627,7 @@ "cell_type": "code", "outputs": [], "execution_count": null, - "source": "", + "source": [], "id": "1a8a57560f1f4868" } ],