%% Cell type:markdown id:cb631197c03a0768 tags:
# Convert AnyStyle training files to the Prodigy format
The converter functions were written with the help of GPT-4; see https://chat.openai.com/share/3f42ae1d-3066-4563-944d-3139b5e1e867 and https://chat.openai.com/share/b6832876-9424-4ff9-bd83-157f3197f805
%% Cell type:markdown id:83bfc356ce72fe51 tags:
## Parser files
%% Cell type:markdown id:f46a123db5e7d341 tags:
For the output format, see https://prodi.gy/docs/api-interfaces#spans_manual
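
For illustration, a single output record has the following shape. This is a made-up, shortened example following the structure produced by the converter below; the text and all offsets are hypothetical:

``` python
# Hypothetical spans_manual record (shortened); values are illustrative only
{
    "text": "Weber, Max. Wirtschaft und Gesellschaft. 1922.",
    "tokens": [
        {"text": "Weber", "start": 0, "end": 5, "id": 0, "ws": False},
        {"text": ",", "start": 5, "end": 6, "id": 1, "ws": True},
        # ... one entry per spaCy token ...
    ],
    "spans": [
        {"start": 0, "end": 10, "label": "author", "token_start": 0, "token_end": 2},
        # ... one entry per labelled XML element ...
    ],
}
```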
%% Cell type:code id:initial_id tags:
``` python
import json
import xml.etree.ElementTree as ET

import spacy

# Load the German NLP model
nlp = spacy.load('de_core_news_sm')

def xml_to_jsonl(input_xml_path, output_jsonl_path):
    tree = ET.parse(input_xml_path)
    root = tree.getroot()
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            text = " ".join(element.text.strip() if element.text else '' for element in sequence)
            doc = nlp(text)
            tokens = [{'text': token.text, 'start': token.idx, 'end': token.idx + len(token)} for token in doc]
            # Add each token's ID and whitespace flag
            for idx, token in enumerate(tokens):
                token['id'] = idx
                token['ws'] = token['end'] < len(text) and text[token['end']] == " "
            spans = []
            search_pos = 0  # continue searching after the last match so repeated strings resolve to the correct occurrence
            for element in sequence:
                if not element.text:
                    continue
                element_text = element.text.strip()
                start = text.index(element_text, search_pos)
                end = start + len(element_text)
                search_pos = end
                # Find the tokens that contain the start and end of the element text
                token_start = next((i for i, token in enumerate(tokens) if token['start'] <= start < token['end']), None)
                token_end = next((i for i, token in enumerate(tokens) if token['start'] < end <= token['end']), None)
                # If both boundary tokens are found, add the span
                if token_start is not None and token_end is not None:
                    span = {
                        'start': start,
                        'end': end,
                        'label': element.tag,
                        'token_start': token_start,
                        'token_end': token_end
                    }
                    spans.append(span)
                else:
                    print(f"Error finding tokens for span from text: '{element_text}'")
            # Serialize and write to file
            f.write(json.dumps({'text': text, 'tokens': tokens, 'spans': spans}) + '\n')
```
%% Cell type:code id:fa9f2c194697a6b7 tags:
``` python
from pathlib import Path

for filename in [file.stem for file in Path('in').glob('*.xml')]:
    xml_to_jsonl(f'in/{filename}.xml', f'out/{filename}-parser.jsonl')
    xml_to_jsonl(f'in/{filename}.xml', f'out/{filename}-prodigy-parser.jsonl')
```
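%% Cell type:markdown tags:
As a quick sanity check (assuming at least one file was converted above), we can read back the first record of one parser output file and print the span labels together with the text they cover:
%% Cell type:code tags:
``` python
import json
from pathlib import Path

first_file = next(Path('out').glob('*-parser.jsonl'))
record = json.loads(first_file.read_text(encoding='utf-8').splitlines()[0])
print(record['text'][:100])
print([(s['label'], record['text'][s['start']:s['end']]) for s in record['spans']])
```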
%% Cell type:markdown id:2988615a1be96bb tags:
## Finder files
%% Cell type:code id:7764c66e149abd4f tags:
``` python
import json

def markup_to_jsonl(input_path, output_jsonl_path, break_tags_before=None, break_tags_after=None):
    if break_tags_before is None:
        break_tags_before = []
    if break_tags_after is None:
        break_tags_after = []

    # Helper function to write one block to the JSONL file handle
    def write_block(full_text, spans, file_handle):
        if full_text.strip():  # Ensure there's content to write
            result = {
                'text': full_text.rstrip(),  # Remove the trailing newline
                'spans': spans
            }
            file_handle.write(json.dumps(result) + '\n')

    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Open the output file once at the beginning
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        full_text = ''
        spans = []
        current_text = ''
        text_start_pos = 0
        current_tag = 'text'  # Default initial tag

        def close_span():
            # Append the accumulated text to the block and record its span;
            # -1 so the span does not count the trailing newline
            nonlocal full_text, current_text, text_start_pos
            if current_text:
                full_text += current_text
                spans.append({
                    'start': text_start_pos,
                    'end': text_start_pos + len(current_text) - 1,
                    'label': current_tag
                })
                current_text = ''
            text_start_pos = len(full_text)

        # Iterate through lines of the form "tag | content"; a blank tag continues the current span
        for line in lines:
            parts = line.strip().split('|', 1)
            if len(parts) < 2:
                continue
            tag = parts[0].strip()
            content = parts[1].strip()

            if tag in break_tags_before:
                # Finish and write the current block before starting a new one due to the break tag
                close_span()
                write_block(full_text, spans, f)
                full_text, spans = '', []
                text_start_pos = 0

            if tag:
                # A non-empty tag starts a new span: close the previous one first
                close_span()
                current_tag = tag
            # Accumulate the line's content; every line ends with a newline
            current_text += content + '\n'

            if tag in break_tags_after:
                # Finish and write the current block after this tag
                close_span()
                write_block(full_text, spans, f)
                full_text, spans = '', []
                text_start_pos = 0

        # Final write for any remaining text
        close_span()
        write_block(full_text, spans, f)

# Example usage:
# markup_to_jsonl('input_text.txt', 'output.jsonl', ['header'], ['footer'])
```
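%% Cell type:markdown tags:
To make the expected input format concrete, here is a minimal smoke test on a made-up `.ttx` sample. The tag names and the `tag | content` layout (with a blank tag marking a continuation line) are assumptions about the finder format, not taken from the actual training data:
%% Cell type:code tags:
``` python
from pathlib import Path

# Made-up finder-style sample: a meta line, a text line, and a two-line reference
sample = (
    "meta | Zeitschrift für Rechtssoziologie 1 (1980)\n"
    "text | Einleitender Text des Aufsatzes ...\n"
    "ref  | Weber, Max, Wirtschaft und Gesellschaft, Tübingen 1922.\n"
    "     | Luhmann, Niklas, Rechtssoziologie, Reinbek 1972.\n"
)
Path('sample.ttx').write_text(sample, encoding='utf-8')
markup_to_jsonl('sample.ttx', 'sample.jsonl', break_tags_before=['meta'])
print(Path('sample.jsonl').read_text(encoding='utf-8'))
```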
%% Cell type:markdown id:8923ed5011c1d65f tags:
Since the finder files contain no information on page breaks, documents are segmented on the `meta` tag. This is a very simple heuristic and will probably fail in many cases.
%% Cell type:code id:2b0ca72bbf70c829 tags:
``` python
from pathlib import Path

# Glob the .ttx finder files directly (the parser loop above globs the .xml files)
for filename in [file.stem for file in Path('in').glob('*.ttx')]:
    markup_to_jsonl(f'in/{filename}.ttx', f'out/{filename}-finder.jsonl', break_tags_before=['meta'])
    markup_to_jsonl(f'in/{filename}.ttx', f'out/{filename}-prodigy-finder.jsonl', break_tags_before=['meta'])
```
%% Cell type:markdown id:72233341123b80c7 tags:
## Extract labels
%% Cell type:code id:46d7e63e53b441e3 tags:
``` python
import json

def extract_unique_labels(jsonl_file_path):
    unique_labels = set()  # Using a set to store unique labels
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)  # Load JSON from each line
            if 'spans' in data:
                for span in data['spans']:
                    if 'label' in span:
                        unique_labels.add(span['label'])  # Add label to the set
    return sorted(unique_labels)

" ".join(extract_unique_labels('out/10.1515_zfrs-1980-0103-parser.jsonl'))
```
%% Output
'author backref citation-number collection-title container-title date editor ignore journal location note pages publisher signal title volume'
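%% Cell type:markdown tags:
If Prodigy is installed, this label list can be passed to the `spans.manual` recipe via `--label` to start annotating the converted file. The dataset name below is just a placeholder:
%% Cell type:code tags:
``` python
# Hypothetical dataset name; requires Prodigy to be installed in this environment
!prodigy spans.manual anystyle-parser blank:de out/10.1515_zfrs-1980-0103-parser.jsonl --label author,backref,citation-number,collection-title,container-title,date,editor,ignore,journal,location,note,pages,publisher,signal,title,volume
```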