Skip to content
Snippets Groups Projects
Commit 48cb0887 authored by cboulanger's avatar cboulanger
Browse files

Adding finder file conversion

parent 10ee6ba0
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id:cb631197c03a0768 tags:
# Convert AnyStyle training files to the Prodigy format
Converter functions written by GPT-4, see https://chat.openai.com/share/3f42ae1d-3066-4563-944d-3139b5e1e867
%% Cell type:markdown id:83bfc356ce72fe51 tags:
## Parser files
%% Cell type:code id:initial_id tags: %% Cell type:code id:initial_id tags:
``` python ``` python
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import json import json
def xml_to_jsonl(input_xml_path, output_jsonl_path):
    """Convert an AnyStyle parser XML file into a Prodigy-style JSONL file.

    Every ``<sequence>`` child of the XML root becomes one JSON line of the
    form ``{"text": ..., "spans": [...]}``. The text is built from the
    stripped text of the sequence's child elements, joined by single spaces.
    Each non-empty child yields one span whose ``label`` is the element tag
    and whose ``start``/``end`` are character offsets into the text
    (``end`` is exclusive, so ``text[start:end]`` is the element's content).

    Child elements without text are skipped: they would otherwise produce
    zero-length spans and extra spaces that shift the offsets of the
    following spans.

    Args:
        input_xml_path: Path of the XML file to read.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
    """
    root = ET.parse(input_xml_path).getroot()
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            pieces = []
            spans = []
            current_pos = 0
            for element in sequence:
                content = (element.text or '').strip()
                if not content:
                    # Nothing to annotate; skipping keeps offsets aligned
                    continue
                end = current_pos + len(content)
                spans.append({'start': current_pos, 'end': end, 'label': element.tag})
                pieces.append(content)
                # +1 accounts for the single space that joins the pieces
                current_pos = end + 1
            entry = {'text': ' '.join(pieces), 'spans': spans}
            # One JSON document per line
            f.write(json.dumps(entry) + '\n')
``` ```
%% Cell type:code id:fa9f2c194697a6b7 tags: %% Cell type:code id:fa9f2c194697a6b7 tags:
``` python ``` python
from pathlib import Path from pathlib import Path
# Convert every parser XML file in 'in/' to 'out/<name>-parser.jsonl'
for xml_file in Path('in').glob('*.xml'):
    stem = xml_file.stem
    xml_to_jsonl(f'in/{stem}.xml', f'out/{stem}-parser.jsonl')
``` ```
%% Cell type:markdown id:2988615a1be96bb tags:
## Finder files
%% Cell type:code id:7764c66e149abd4f tags: %% Cell type:code id:7764c66e149abd4f tags:
``` python ``` python
import json
def _document_entry(doc_lines, runs):
    """Build the JSON entry for one document, or None if it has no text.

    ``doc_lines`` holds the stripped content lines of the document and
    ``runs`` holds ``[label, first_line_index, last_line_index]`` triples.
    """
    text = '\n'.join(doc_lines)
    if not text.strip():
        return None

    # Character offset at which each line starts inside the joined text
    offsets = []
    pos = 0
    for line_text in doc_lines:
        offsets.append(pos)
        pos += len(line_text) + 1  # +1 for the joining newline

    spans = []
    for label, first, last in runs:
        # Trim empty lines at both ends so spans never cover bare newlines
        while first <= last and not doc_lines[first]:
            first += 1
        while last >= first and not doc_lines[last]:
            last -= 1
        if first > last:
            continue
        spans.append({
            'start': offsets[first],
            'end': offsets[last] + len(doc_lines[last]),  # exclusive
            'label': label
        })
    return {'text': text.rstrip(), 'spans': spans}


def markup_to_jsonl(input_path, output_jsonl_path, break_tags_before=None, break_tags_after=None):
    """Convert a ``tag | content`` markup file into Prodigy-style JSONL.

    Each input line is split at its first ``|`` into a tag and a content
    part (both stripped); lines without ``|`` are ignored. A line with an
    empty tag continues the run of the most recent tag (``'text'`` before
    any tag has been seen). A line with a non-empty tag always starts a new
    run.

    The content lines of a document are joined with newlines to form its
    ``text``. Each run becomes one span ``{'start', 'end', 'label'}`` where
    ``end`` is exclusive, so ``text[start:end]`` is exactly the run's
    content. Runs consisting only of empty lines yield no span. Documents
    without any non-whitespace text are not written.

    Args:
        input_path: Path of the markup file to read.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        break_tags_before: Tags that start a new document; the current
            document is written before a line carrying such a tag is added.
        break_tags_after: Tags that end a document; the current document is
            written right after a line carrying such a tag is added.
    """
    break_before = set(break_tags_before or ())
    break_after = set(break_tags_after or ())

    # Read the input first so a missing input does not truncate the output
    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        doc_lines = []  # content lines of the document being assembled
        runs = []  # [label, first_line_index, last_line_index] per run
        current_tag = 'text'  # Default initial tag

        def write_block():
            entry = _document_entry(doc_lines, runs)
            if entry is not None:
                f.write(json.dumps(entry) + '\n')
            doc_lines.clear()
            runs.clear()

        for line in lines:
            tag_part, separator, content_part = line.partition('|')
            if not separator:
                continue
            tag = tag_part.strip()
            content = content_part.strip()

            if tag in break_before and doc_lines:
                write_block()

            index = len(doc_lines)
            if tag:
                current_tag = tag
                runs.append([current_tag, index, index])
            elif runs:
                runs[-1][2] = index  # extend the current run
            else:
                # Continuation line at the start of a document
                runs.append([current_tag, index, index])
            doc_lines.append(content)

            if tag in break_after:
                write_block()

        # Final write for any remaining text
        if doc_lines:
            write_block()

# Example usage:
# markup_to_jsonl('input_text.txt', 'output.jsonl', ['header'], ['footer'])
```
%% Cell type:markdown id:8923ed5011c1d65f tags:
Since the finder files contain no information on page breaks, documents are segmented at the `meta` tag: a new document starts before each `meta` line. This is a very simple heuristic and probably fails in many cases.
%% Cell type:code id:2b0ca72bbf70c829 tags:
``` python
from pathlib import Path
# Finder files use the .ttx extension, so enumerate those directly instead of
# deriving their names from the parser XML files (which may not have a twin).
for ttx_file in Path('in').glob('*.ttx'):
    markup_to_jsonl(str(ttx_file), f'out/{ttx_file.stem}-finder.jsonl', break_tags_before=['meta'])
```
%% Cell type:code id:46d7e63e53b441e3 tags:
``` python
``` ```
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment