import xml.etree.ElementTree as ET
import spacy
import json
from pathlib import Path

# Load the German NLP model once; loading is expensive.
nlp = spacy.load('de_core_news_sm')


def xml_to_jsonl(input_xml_path, output_jsonl_path):
    """Convert an AnyStyle parser XML training file to Prodigy JSONL.

    Each ``<sequence>`` element becomes one JSONL record in Prodigy's
    ``spans_manual`` format (https://prodi.gy/docs/api-interfaces#spans_manual):
    the joined text of all child elements, the spaCy tokenization, and one
    labelled span per child element (label = the element's tag).

    :param input_xml_path: path of the AnyStyle XML file to read
    :param output_jsonl_path: path of the JSONL file to write
    """
    tree = ET.parse(input_xml_path)
    root = tree.getroot()

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            text = " ".join(element.text.strip() if element.text else ''
                            for element in sequence)
            doc = nlp(text)
            tokens = [{'text': token.text,
                       'start': token.idx,
                       'end': token.idx + len(token)}
                      for token in doc]

            # Add each token's ID and whitespace flag (both required by Prodigy).
            for idx, token in enumerate(tokens):
                token['id'] = idx
                token['ws'] = (token['end'] < len(text) and text[token['end']] == " ")

            spans = []
            # BUG FIX: resume the substring search after the previous match so
            # repeated element texts (e.g. the same number appearing twice) do
            # not all map onto the first occurrence in `text`.
            search_from = 0
            for element in sequence:
                if not element.text:
                    continue
                element_text = element.text.strip()
                start = text.find(element_text, search_from)
                if start == -1:
                    # Fall back to a search from the beginning (should not
                    # happen, since `text` was built from these same elements).
                    start = text.index(element_text)
                end = start + len(element_text)
                search_from = end

                # Find the tokens that encompass the start and end of the element text.
                token_start = next((i for i, token in enumerate(tokens)
                                    if token['start'] <= start < token['end']), None)
                token_end = next((i for i, token in enumerate(tokens)
                                  if token['start'] < end <= token['end']), None)

                # If tokens are found, add to spans list.
                if token_start is not None and token_end is not None:
                    spans.append({
                        'start': start,
                        'end': end,
                        'label': element.tag,
                        'token_start': token_start,
                        'token_end': token_end,
                    })
                else:
                    print(f"Error finding tokens for span from text: '{element_text}'")

            # Serialize and write to file.
            f.write(json.dumps({'text': text, 'tokens': tokens, 'spans': spans}) + '\n')


# Convert every AnyStyle parser file.
# BUG FIX: the original f-strings did not interpolate the loop variable, so
# every iteration read and wrote the same literal path.
for filename in [file.stem for file in Path('anystyle').glob('*.xml')]:
    xml_to_jsonl(f'anystyle/{filename}.xml', f'prodigy/{filename}-parser.jsonl')
import json
from pathlib import Path


def markup_to_jsonl(input_path, output_jsonl_path, break_tags_before=None, break_tags_after=None):
    """Convert an AnyStyle finder file (lines of ``tag| content``) to Prodigy JSONL.

    Lines with an empty tag continue the current span; a line with a non-empty
    tag closes the running span and starts a new one labelled with that tag.
    Tags in `break_tags_before` start a new JSONL record before their line;
    tags in `break_tags_after` end the record after their line.

    :param input_path: path of the AnyStyle ``.ttx`` finder file to read
    :param output_jsonl_path: path of the JSONL file to write
    :param break_tags_before: tags that begin a new document/record
    :param break_tags_after: tags that terminate the current document/record
    """
    if break_tags_before is None:
        break_tags_before = []
    if break_tags_after is None:
        break_tags_after = []

    # Helper function to write one record to the already-open JSONL file.
    def write_block(full_text, spans, file_handle):
        if full_text.strip():  # Ensure there's content to write
            result = {
                'text': full_text.rstrip(),  # Remove trailing newline/spaces
                'spans': spans
            }
            file_handle.write(json.dumps(result) + '\n')

    with open(output_jsonl_path, 'w', encoding='utf-8') as out_fh, \
         open(input_path, 'r', encoding='utf-8') as in_fh:

        full_text = ''        # text of the record being built
        spans = []            # spans of the record being built
        current_text = ''     # text of the span being built (no trailing \n)
        current_tag = 'text'  # default label for leading untagged lines
        text_start_pos = 0    # offset of the current span within full_text

        def close_span():
            # Flush the running span into full_text/spans, if it has content.
            nonlocal full_text, current_text, text_start_pos
            if current_text:
                spans.append({
                    'start': text_start_pos,
                    'end': text_start_pos + len(current_text),  # excludes the '\n' added below
                    'label': current_tag,
                })
                full_text += current_text + '\n'
                current_text = ''
                text_start_pos = len(full_text)

        for line in in_fh:
            parts = line.strip().split('|', 1)
            if len(parts) < 2:
                continue

            tag = parts[0].strip()
            content = parts[1].strip()

            if tag in break_tags_before:
                # Finish and write the current record before this tag starts a new one.
                close_span()
                write_block(full_text, spans, out_fh)
                full_text, spans = '', []
                text_start_pos = 0

            if tag:
                # BUG FIX: the original appended `content` to the running span
                # *before* closing it, so every closed span wrongly absorbed
                # the first line of the next tag, and that line was then
                # duplicated when the new span re-started with it.
                close_span()
                current_tag = tag

            # BUG FIX: the original `current_text += content + '\n' if
            # current_text else content` put the newline *after* the new
            # content, gluing the first two lines of each span together and
            # leaving a stray trailing '\n'; join with '\n' *between* lines.
            current_text = current_text + '\n' + content if current_text else content

            if tag in break_tags_after:
                # Finish and write the current record after this tag.
                close_span()
                write_block(full_text, spans, out_fh)
                full_text, spans = '', []
                text_start_pos = 0

        # Final write for any remaining text.
        close_span()
        write_block(full_text, spans, out_fh)


# Example usage:
# markup_to_jsonl('input_text.txt', 'output.jsonl', ['header'], ['footer'])

# Convert every AnyStyle finder file. Document segmentation uses the `meta`
# tag as a simple page-break heuristic.
# BUG FIX: the original globbed 'in/*.xml' but built paths under
# 'anystyle/*.ttx' — directory and extension did not match.
for filename in [file.stem for file in Path('anystyle').glob('*.ttx')]:
    markup_to_jsonl(f'anystyle/{filename}.ttx', f'prodigy/{filename}-finder.jsonl',
                    break_tags_before=['meta'])
import json


def extract_unique_labels(jsonl_file_path):
    """Return the sorted list of distinct span labels used in a Prodigy JSONL file.

    :param jsonl_file_path: path of a JSONL file whose records may carry a
        ``spans`` list of dicts with a ``label`` key
    :return: alphabetically sorted list of the unique label strings
    """
    labels = set()
    with open(jsonl_file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            record = json.loads(raw_line)
            # Records without spans (or spans without labels) are skipped.
            labels.update(span['label']
                          for span in record.get('spans', [])
                          if 'label' in span)
    return sorted(labels)