# %% Imports
import os
import re
import xml.etree.ElementTree as ET

import pandas as pd
import requests
import xmlschema
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# %% Download the TEI schema once
# cache for local use
if not os.path.isdir("schema/tei"):
    schema = xmlschema.XMLSchema("https://www.tei-c.org/release/xml/tei/custom/schema/xsd/tei_all.xsd")
    schema.export(target='schema/tei', save_remote=True)

# %% Build (or load from cache) the TEI tag documentation table

# Namespace map used for the XPath lookups in the XSD.
XS_NAMESPACES = {'xs': 'http://www.w3.org/2001/XMLSchema'}

# Base URL of the HTML rendering of the TEI guidelines; the per-tag reference
# pages are fetched from "<base>/ref-<tag>.html".
DOC_BASE_URL = "https://vault.tei-c.de/P5/3.0.0/doc/tei-p5-doc/en/html"

# Tags to document. Names without an <xs:documentation> entry in the XSD are
# skipped by generate_tag_docs().
BIBL_TAG_NAMES = ('author', 'citedRange', 'date', 'edition', 'editor', 'location', 'note', 'publisher', 'pubPlace',
                  'series', 'span', 'title', 'volume', 'issue')

# A dotted section number such as "3.12.2.2" (no trailing dot).
HEADING_NUMBER_RE = re.compile(r'\d+(?:\.\d+)*')


def extract_headings_and_links(tag, doc_heading, doc_base_url, timeout=30):
    """Map the section numbers mentioned in ``doc_heading`` to guideline URLs.

    The reference page ``{doc_base_url}/ref-{tag}.html`` is downloaded and every
    ``<a class="link_ptr">`` that contains a ``<span class="headingNumber">`` is
    collected. Headings and links are matched by section number, not by their
    position on the page.

    Args:
        tag: Element name, used to build the reference page URL.
        doc_heading: Text containing one or more dotted section numbers.
        doc_base_url: Base URL of the HTML guidelines, without trailing slash.
        timeout: Seconds to wait for the HTTP request.

    Returns:
        dict mapping each section number found in ``doc_heading`` that also has
        a link on the page to ``f"{doc_base_url}/{href}"``, in the order the
        numbers appear in ``doc_heading``.

    Raises:
        requests.HTTPError: if the reference page answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    heading_numbers = HEADING_NUMBER_RE.findall(doc_heading)

    url = f"{doc_base_url}/ref-{tag}.html"
    response = requests.get(url, timeout=timeout)
    # Fail loudly: the caller caches the result on disk, so parsing an error
    # page would persist an empty mapping.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    links = {}
    for link in soup.find_all('a', class_='link_ptr'):
        number_span = link.find('span', class_='headingNumber')
        href = link.get('href')
        if number_span is None or not href:
            continue
        # Normalise with the same pattern used for doc_heading so that
        # surrounding whitespace or a trailing dot cannot prevent a match.
        number = HEADING_NUMBER_RE.search(number_span.text)
        if number is None:
            continue
        # Keep the first link seen for a given section number.
        links.setdefault(number.group(0), f"{doc_base_url}/{href}")

    # Look up by section number; numbers without a link (for instance digits
    # that are part of a section title) are dropped.
    return {heading: links[heading] for heading in heading_numbers if heading in links}


def generate_tag_docs(xsd_path, doc_base_url=DOC_BASE_URL, names=BIBL_TAG_NAMES):
    """Collect description, guideline headings and guideline URLs per tag.

    For every name in ``names`` the ``xs:annotation/xs:documentation`` text of
    the matching ``xs:element`` in the XSD is split with the pattern
    ``^(.*)\\[(.*)]$`` into a description and a bracketed heading list. The
    headings are then resolved to URLs via ``extract_headings_and_links``.
    Names without documentation, or whose documentation does not match the
    pattern, are skipped.

    Args:
        xsd_path: Path of the XSD file to read.
        doc_base_url: Base URL of the HTML guidelines.
        names: Iterable of element names to process.

    Returns:
        pandas.DataFrame with the columns ``name``, ``description``,
        ``documentation`` and ``urls`` (one row per documented tag).
    """
    root = ET.parse(xsd_path).getroot()

    data_list = []
    for name in tqdm(names, desc="Processing TEI tags"):
        doc_node = root.find(f".//xs:element[@name='{name}']/xs:annotation/xs:documentation",
                             namespaces=XS_NAMESPACES)
        # An empty <xs:documentation/> has text None, which re.search rejects.
        if doc_node is None or not doc_node.text:
            continue
        matches = re.search(r'^(.*)\[(.*)]$', doc_node.text)
        if matches is None:
            continue
        description = matches.group(1)
        doc_heading = matches.group(2)
        doc_urls = extract_headings_and_links(name, doc_heading, doc_base_url)
        data_list.append({'name': name, 'description': description, 'documentation': doc_heading, 'urls': doc_urls})

    return pd.DataFrame(data_list)


# Generating the table needs one HTTP request per tag, so the result is cached.
cache_file = "schema/tei-tags-documentation.json"
if not os.path.isfile(cache_file):
    df = generate_tag_docs("schema/tei/tei_all.xsd")
    # orient='records' never writes the index; passing index=False is rejected
    # by pandas versions before 2.1.
    df.to_json(cache_file, orient='records')
else:
    df = pd.read_json(cache_file)
df
Titles, Authors, and Editors \n", + "5 14.3.4. Places \n", + "6 3.9.1. Notes and Simple Annotation 2.2.6. The ... \n", + "7 3.12.2.4. Imprint, Size of a Document, and Rep... \n", + "8 3.12.2.4. Imprint, Size of a Document, and Rep... \n", + "9 3.12.2.1. Analytic, Monographic, and Series Le... \n", + "10 18.3. Spans and Interpretations \n", + "11 3.12.2.2. Titles, Authors, and Editors 2.2.1. ... \n", + "\n", + " urls \n", + "0 {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "1 {'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "2 {'3.6.4': 'https://vault.tei-c.de/P5/3.0.0/doc... \n", + "3 {'2.2.2': 'https://vault.tei-c.de/P5/3.0.0/doc... \n", + "4 {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "5 {'14.3.4': 'https://vault.tei-c.de/P5/3.0.0/do... \n", + "6 {'3.9.1': 'https://vault.tei-c.de/P5/3.0.0/doc... \n", + "7 {'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "8 {'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "9 {'3.12.2.1': 'https://vault.tei-c.de/P5/3.0.0/... \n", + "10 {'18.3': 'https://vault.tei-c.de/P5/3.0.0/doc/... \n", + "11 {'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/... " ], "text/html": [ "<div>\n", @@ -129,6 +196,7 @@ " <th>name</th>\n", " <th>description</th>\n", " <th>documentation</th>\n", + " <th>urls</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", @@ -137,84 +205,109 @@ " <td>author</td>\n", " <td>(author) in a bibliographic reference, contain...</td>\n", " <td>3.12.2.2. Titles, Authors, and Editors 2.2.1. ...</td>\n", + " <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>citedRange</td>\n", " <td>(cited range) defines the range of cited conte...</td>\n", " <td>3.12.2.5. Scopes and Ranges in Bibliographic C...</td>\n", + " <td>{'3.12.2.5': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>date</td>\n", " <td>(date) contains a date in any format.</td>\n", " <td>3.6.4. Dates and Times 2.2.4. 
Publication, Dis...</td>\n", + " <td>{'3.6.4': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>edition</td>\n", " <td>(edition) describes the particularities of one...</td>\n", " <td>2.2.2. The Edition Statement</td>\n", + " <td>{'2.2.2': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>editor</td>\n", " <td>contains a secondary statement of responsibili...</td>\n", " <td>3.12.2.2. Titles, Authors, and Editors</td>\n", + " <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>location</td>\n", " <td>(location) defines the location of a place as ...</td>\n", " <td>14.3.4. Places</td>\n", + " <td>{'14.3.4': 'https://vault.tei-c.de/P5/3.0.0/do...</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>note</td>\n", " <td>(note) contains a note or annotation.</td>\n", " <td>3.9.1. Notes and Simple Annotation 2.2.6. The ...</td>\n", + " <td>{'3.9.1': 'https://vault.tei-c.de/P5/3.0.0/doc...</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>publisher</td>\n", " <td>(publisher) provides the name of the organizat...</td>\n", " <td>3.12.2.4. Imprint, Size of a Document, and Rep...</td>\n", + " <td>{'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>pubPlace</td>\n", " <td>(publication place) contains the name of the p...</td>\n", " <td>3.12.2.4. Imprint, Size of a Document, and Rep...</td>\n", + " <td>{'3.12.2.4': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>series</td>\n", " <td>(series information) contains information abou...</td>\n", " <td>3.12.2.1. Analytic, Monographic, and Series Le...</td>\n", + " <td>{'3.12.2.1': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>span</td>\n", " <td>associates an interpretative annotation direct...</td>\n", " <td>18.3. 
Spans and Interpretations</td>\n", + " <td>{'18.3': 'https://vault.tei-c.de/P5/3.0.0/doc/...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>title</td>\n", " <td>(title) contains a title for any kind of work.</td>\n", " <td>3.12.2.2. Titles, Authors, and Editors 2.2.1. ...</td>\n", + " <td>{'3.12.2.2': 'https://vault.tei-c.de/P5/3.0.0/...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 5 + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-29T07:23:41.966749Z", + "start_time": "2024-07-29T07:23:41.958890Z" + } + }, + "cell_type": "code", + "source": "", + "id": "b3ee84984b88f24a", + "outputs": [], + "execution_count": 9 }, { "metadata": {}, @@ -222,7 +315,7 @@ "outputs": [], "execution_count": null, "source": "", - "id": "b3ee84984b88f24a" + "id": "c24a62e5a69f7dad" } ], "metadata": { diff --git a/convert-anystyle-data/schema/.gitignore b/convert-anystyle-data/schema/.gitignore index 9832e29ebd08343618c18fd3ea59b91a13e02d9e..e1229fc4fff8e6ab7bd1911f7a0173559463fadc 100644 --- a/convert-anystyle-data/schema/.gitignore +++ b/convert-anystyle-data/schema/.gitignore @@ -1 +1 @@ -tei/ \ No newline at end of file +tei/* \ No newline at end of file diff --git a/convert-anystyle-data/schema/tei-tags-documentation.json b/convert-anystyle-data/schema/tei-tags-documentation.json new file mode 100644 index 0000000000000000000000000000000000000000..a5e7a9b8c5ed2643eb1c1f641c3740b665496d94 --- /dev/null +++ b/convert-anystyle-data/schema/tei-tags-documentation.json @@ -0,0 +1,110 @@ +[ + { + "name": "author", + "description": "(author) in a bibliographic reference, contains the name(s) of an author, personal or corporate, of a work; for example in the same form as that provided by a recognized bibliographic name authority. ", + "documentation": "3.12.2.2. Titles, Authors, and Editors 2.2.1. 
The Title Statement", + "urls": { + "3.12.2.2": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOR", + "2.2.1": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD21" + } + }, + { + "name": "citedRange", + "description": "(cited range) defines the range of cited content, often represented by pages or other units ", + "documentation": "3.12.2.5. Scopes and Ranges in Bibliographic Citations", + "urls": { + "3.12.2.5": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOB" + } + }, + { + "name": "date", + "description": "(date) contains a date in any format. ", + "documentation": "3.6.4. Dates and Times 2.2.4. Publication, Distribution, Licensing, etc. 2.6. The Revision Description 3.12.2.4. Imprint, Size of a Document, and Reprint Information 16.2.3. The Setting Description 14.4. Dates", + "urls": { + "3.6.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#CONADA", + "2.2.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD24", + "2.6": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD6", + "3.12.2.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOI", + "16.2.3": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CC.html#CCAHSE", + "14.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/ND.html#NDDATE" + } + }, + { + "name": "edition", + "description": "(edition) describes the particularities of one edition of a text. ", + "documentation": "2.2.2. The Edition Statement", + "urls": { + "2.2.2": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD22" + } + }, + { + "name": "editor", + "description": "contains a secondary statement of responsibility for a bibliographic item, for example the name of an individual, institution or organization, (or of several such) acting as editor, compiler, translator, etc. 
", + "documentation": "3.12.2.2. Titles, Authors, and Editors", + "urls": { + "3.12.2.2": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOR" + } + }, + { + "name": "location", + "description": "(location) defines the location of a place as a set of geographical coordinates, in terms of other named geo-political entities, or as an address. ", + "documentation": "14.3.4. Places", + "urls": { + "14.3.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/ND.html#NDGEOG" + } + }, + { + "name": "note", + "description": "(note) contains a note or annotation. ", + "documentation": "3.9.1. Notes and Simple Annotation 2.2.6. The Notes Statement 3.12.2.8. Notes and Statement of Language 10.3.5.4. Notes within Entries", + "urls": { + "3.9.1": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#CONONO", + "2.2.6": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD27", + "3.12.2.8": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICON", + "10.3.5.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/DI.html#DITPNO" + } + }, + { + "name": "publisher", + "description": "(publisher) provides the name of the organization responsible for the publication or distribution of a bibliographic item. ", + "documentation": "3.12.2.4. Imprint, Size of a Document, and Reprint Information 2.2.4. Publication, Distribution, Licensing, etc.", + "urls": { + "3.12.2.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOI", + "2.2.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD24" + } + }, + { + "name": "pubPlace", + "description": "(publication place) contains the name of the place where a bibliographic item was published. ", + "documentation": "3.12.2.4. 
Imprint, Size of a Document, and Reprint Information", + "urls": { + "3.12.2.4": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOI" + } + }, + { + "name": "series", + "description": "(series information) contains information about the series in which a book or other bibliographic item has appeared. ", + "documentation": "3.12.2.1. Analytic, Monographic, and Series Levels", + "urls": { + "3.12.2.1": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOL" + } + }, + { + "name": "span", + "description": "associates an interpretative annotation directly with a span of text. ", + "documentation": "18.3. Spans and Interpretations", + "urls": { + "18.3": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/AI.html#AISP" + } + }, + { + "name": "title", + "description": "(title) contains a title for any kind of work. ", + "documentation": "3.12.2.2. Titles, Authors, and Editors 2.2.1. The Title Statement 2.2.5. The Series Statement", + "urls": { + "3.12.2.2": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/CO.html#COBICOR", + "2.2.1": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD21", + "2.2.5": "https:\/\/vault.tei-c.de\/P5\/3.0.0\/doc\/tei-p5-doc\/en\/html\/HD.html#HD26" + } + } +] \ No newline at end of file