import xml.etree.ElementTree as ET
import spacy
import json
from pathlib import Path

# Load the German NLP model once; loading is expensive.
nlp = spacy.load('de_core_news_sm')


def xml_to_jsonl(input_xml_path, output_jsonl_path):
    """Convert an AnyStyle parser XML training file to Prodigy JSONL.

    Each ``<sequence>`` element becomes one JSONL record in Prodigy's
    ``spans_manual`` format (https://prodi.gy/docs/api-interfaces#spans_manual):
    the joined text of all child elements, the spaCy tokenization, and one
    labelled span per child element (label = the element's tag).

    :param input_xml_path: path of the AnyStyle XML file to read
    :param output_jsonl_path: path of the JSONL file to write
    """
    tree = ET.parse(input_xml_path)
    root = tree.getroot()

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            text = " ".join(element.text.strip() if element.text else ''
                            for element in sequence)
            doc = nlp(text)
            tokens = [{'text': token.text,
                       'start': token.idx,
                       'end': token.idx + len(token)}
                      for token in doc]

            # Add each token's ID and whitespace flag (both required by Prodigy).
            for idx, token in enumerate(tokens):
                token['id'] = idx
                token['ws'] = (token['end'] < len(text) and text[token['end']] == " ")

            spans = []
            # BUG FIX: resume the substring search after the previous match so
            # repeated element texts (e.g. the same number appearing twice) do
            # not all map onto the first occurrence in `text`.
            search_from = 0
            for element in sequence:
                if not element.text:
                    continue
                element_text = element.text.strip()
                start = text.find(element_text, search_from)
                if start == -1:
                    # Fall back to a search from the beginning (should not
                    # happen, since `text` was built from these same elements).
                    start = text.index(element_text)
                end = start + len(element_text)
                search_from = end

                # Find the tokens that encompass the start and end of the element text.
                token_start = next((i for i, token in enumerate(tokens)
                                    if token['start'] <= start < token['end']), None)
                token_end = next((i for i, token in enumerate(tokens)
                                  if token['start'] < end <= token['end']), None)

                # If tokens are found, add to spans list.
                if token_start is not None and token_end is not None:
                    spans.append({
                        'start': start,
                        'end': end,
                        'label': element.tag,
                        'token_start': token_start,
                        'token_end': token_end,
                    })
                else:
                    print(f"Error finding tokens for span from text: '{element_text}'")

            # Serialize and write to file.
            f.write(json.dumps({'text': text, 'tokens': tokens, 'spans': spans}) + '\n')


# Convert every AnyStyle parser file.
# BUG FIX: the original f-strings did not interpolate the loop variable, so
# every iteration read and wrote the same literal path.
for filename in [file.stem for file in Path('anystyle').glob('*.xml')]:
    xml_to_jsonl(f'anystyle/{filename}.xml', f'prodigy/{filename}-parser.jsonl')
import json
from pathlib import Path


def markup_to_jsonl(input_path, output_jsonl_path, break_tags_before=None, break_tags_after=None):
    """Convert an AnyStyle finder file (lines of ``tag| content``) to Prodigy JSONL.

    Lines with an empty tag continue the current span; a line with a non-empty
    tag closes the running span and starts a new one labelled with that tag.
    Tags in `break_tags_before` start a new JSONL record before their line;
    tags in `break_tags_after` end the record after their line.

    :param input_path: path of the AnyStyle ``.ttx`` finder file to read
    :param output_jsonl_path: path of the JSONL file to write
    :param break_tags_before: tags that begin a new document/record
    :param break_tags_after: tags that terminate the current document/record
    """
    if break_tags_before is None:
        break_tags_before = []
    if break_tags_after is None:
        break_tags_after = []

    # Helper function to write one record to the already-open JSONL file.
    def write_block(full_text, spans, file_handle):
        if full_text.strip():  # Ensure there's content to write
            result = {
                'text': full_text.rstrip(),  # Remove trailing newline/spaces
                'spans': spans
            }
            file_handle.write(json.dumps(result) + '\n')

    with open(output_jsonl_path, 'w', encoding='utf-8') as out_fh, \
         open(input_path, 'r', encoding='utf-8') as in_fh:

        full_text = ''        # text of the record being built
        spans = []            # spans of the record being built
        current_text = ''     # text of the span being built (no trailing \n)
        current_tag = 'text'  # default label for leading untagged lines
        text_start_pos = 0    # offset of the current span within full_text

        def close_span():
            # Flush the running span into full_text/spans, if it has content.
            nonlocal full_text, current_text, text_start_pos
            if current_text:
                spans.append({
                    'start': text_start_pos,
                    'end': text_start_pos + len(current_text),  # excludes the '\n' added below
                    'label': current_tag,
                })
                full_text += current_text + '\n'
                current_text = ''
                text_start_pos = len(full_text)

        for line in in_fh:
            parts = line.strip().split('|', 1)
            if len(parts) < 2:
                continue

            tag = parts[0].strip()
            content = parts[1].strip()

            if tag in break_tags_before:
                # Finish and write the current record before this tag starts a new one.
                close_span()
                write_block(full_text, spans, out_fh)
                full_text, spans = '', []
                text_start_pos = 0

            if tag:
                # BUG FIX: the original appended `content` to the running span
                # *before* closing it, so every closed span wrongly absorbed
                # the first line of the next tag, and that line was then
                # duplicated when the new span re-started with it.
                close_span()
                current_tag = tag

            # BUG FIX: the original `current_text += content + '\n' if
            # current_text else content` put the newline *after* the new
            # content, gluing the first two lines of each span together and
            # leaving a stray trailing '\n'; join with '\n' *between* lines.
            current_text = current_text + '\n' + content if current_text else content

            if tag in break_tags_after:
                # Finish and write the current record after this tag.
                close_span()
                write_block(full_text, spans, out_fh)
                full_text, spans = '', []
                text_start_pos = 0

        # Final write for any remaining text.
        close_span()
        write_block(full_text, spans, out_fh)


# Example usage:
# markup_to_jsonl('input_text.txt', 'output.jsonl', ['header'], ['footer'])

# Convert every AnyStyle finder file. Document segmentation uses the `meta`
# tag as a simple page-break heuristic.
# BUG FIX: the original globbed 'in/*.xml' but built paths under
# 'anystyle/*.ttx' — directory and extension did not match.
for filename in [file.stem for file in Path('anystyle').glob('*.ttx')]:
    markup_to_jsonl(f'anystyle/{filename}.ttx', f'prodigy/{filename}-finder.jsonl',
                    break_tags_before=['meta'])
import json


def extract_unique_labels(jsonl_file_path):
    """Return the sorted list of distinct span labels used in a Prodigy JSONL file.

    :param jsonl_file_path: path of a JSONL file whose records may carry a
        ``spans`` list of dicts with a ``label`` key
    :return: alphabetically sorted list of the unique label strings
    """
    labels = set()
    with open(jsonl_file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            record = json.loads(raw_line)
            # Records without spans (or spans without labels) are skipped.
            labels.update(span['label']
                          for span in record.get('spans', [])
                          if 'label' in span)
    return sorted(labels)