Skip to content
Snippets Groups Projects
Commit ab7f07ea authored by Christian Boulanger's avatar Christian Boulanger
Browse files

change LinkML container name to "Reference"

parent 8221ff9f
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id:ae7e001161d678cc tags:
## Convert AnyStyle training data to a simple JSONL format
%% Cell type:code id:f101a4e2408d6313 tags:
``` python
import xml.etree.ElementTree as ET
import json
import regex as re
import glob
import os
def xml_to_jsonl(input_xml_path, output_jsonl_path, tags):
    """Convert an XML file of ``<sequence>`` elements into a JSONL file.

    Every ``<sequence>`` child of the XML root yields one JSON line of the form
    ``{"in": <element texts joined by spaces>, "out": [<reference dict>, ...]}``.
    Sequences in which none of the requested tags occur are not written.

    Args:
        input_xml_path: Path of the XML file to read.
        output_jsonl_path: Path of the JSONL file to write (UTF-8, overwritten).
        tags: Tag specifications. Each entry is either a tag name or a
            ``(tag name, callable)`` tuple; the callable receives the element
            text and returns the value that is stored for that tag.
    """
    tags = list(tags)
    # Normalise the specs once instead of unpacking them for every element.
    specs = [spec if isinstance(spec, tuple) else (spec, None) for spec in tags]
    carry_citation_number = 'citation-number' in tags
    creator_tags = ("author", "editor", "authority", "legal-ref")

    root = ET.parse(input_xml_path).getroot()
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        for sequence in root.findall('sequence'):
            output = []
            for element in sequence:
                for name, fn in specs:
                    if element.tag != name:
                        continue
                    if callable(fn):
                        # Empty elements (e.g. <author/>) have text None; hand
                        # the cleaners an empty string instead of crashing.
                        value = fn(element.text or '')
                    else:
                        value = element.text
                    is_new_ref = (
                        not output  # if no refs yet
                        or name == "citation-number"
                        or name in output[-1]  # or tag already exists
                        # or a creator field follows a date field
                        or (name in creator_tags and 'date' in output[-1])
                    )
                    if is_new_ref:
                        output.append({})
                        # add citation number from previous citation
                        if (carry_citation_number
                                and len(output) > 1
                                and name != "citation-number"
                                and 'citation-number' in output[-2]):
                            output[-1]['citation-number'] = output[-2]['citation-number']
                    # merge tags: "authority" is stored under "author"
                    key = "author" if name == "authority" else name
                    output[-1][key] = value
                    # An element matches one spec only; a repeated spec must
                    # not open a second, bogus reference for the same element.
                    break
            if output:
                instance = {
                    "in": " ".join((el.text or '').strip() for el in sequence),
                    "out": output
                }
                f.write(json.dumps(instance) + '\n')
def even_num_brackets(string: str):
    """Tell whether *string* ends with a closing bracket of a balanced kind.

    Returns True when the string ends with ")" and contains as many ")" as
    "(", or ends with "]" and contains as many "]" as "["; False otherwise.
    """
    for opening, closing in (("(", ")"), ("[", "]")):
        if string.endswith(closing) and string.count(closing) == string.count(opening):
            return True
    return False
def remove_punctuation(text):
    """Strip leading and trailing punctuation from *text*.

    Punctuation is any character with the Unicode property P, which requires
    the third-party ``regex`` module that this file imports as ``re``.
    Trailing characters are kept as soon as the remaining text ends with a
    closing bracket whose kind is balanced (see ``even_num_brackets``), so
    "Foo (bar)" keeps its closing parenthesis. Surrounding whitespace is
    removed last.
    """
    start, end = 0, len(text)
    # Raw strings: a backslash followed by "p" is not a valid string escape
    # and triggers a warning in current Python versions.
    while start < len(text) and re.match(r"\p{P}", text[start]):
        start += 1
    while end > start and re.match(r"\p{P}", text[end - 1]) and not even_num_brackets(text[start:end]):
        end -= 1
    return text[start:end].strip()
def clean_editor(text):
    """Reduce an editor segment such as "in: Blankenburg (Hrsg.)," to the name.

    Removes a leading "in"/"in:" marker and editor abbreviations
    ("hrsg. v.", "hg. v", "hrsg", "ed.", "eds."), optionally wrapped in
    parentheses, then strips surrounding punctuation.
    """
    # Anchor the marker at the start and require a colon or a word boundary
    # after it: an unanchored pattern also deletes "in" inside names
    # ("Göppinger" -> "Göppger") and would clip names starting with "In".
    text = re.sub(r'^in(?::|\b)\s*', '', text.strip(), flags=re.IGNORECASE)
    # The word boundary keeps "ed." from matching the end of a name such as
    # "Alfred.".
    text = re.sub(r'\(?\b(?:hrsg\. v\.|hg\. v|hrsg|ed\.|eds\.)\)?', '', text, flags=re.IGNORECASE)
    return remove_punctuation(text)
def clean_container(text):
    """Drop a leading "in", "aus" or "from" marker from a container title.

    The marker is only removed at the very start and only when followed by a
    colon or a space; matching ignores case. The result is passed through
    ``remove_punctuation``.
    """
    trimmed = text.strip()
    without_marker = re.sub(r'^(in|aus|from)(:| )', '', trimmed, flags=re.IGNORECASE)
    return remove_punctuation(without_marker)
def extract_year(text):
    """Return the first run of four digits starting with 1 or 2 in *text*.

    Returns None when *text* contains no such run.
    """
    found = re.search(r'[12][0-9]{3}', text)
    if found is None:
        return None
    return found.group(0)
# Convert every XML file in in/ into a JSONL file of the same base name in jsonl/.
# open(..., 'w') does not create missing directories, so make sure the target exists.
os.makedirs('jsonl', exist_ok=True)
for input_file in glob.glob('in/*.xml'):
    base_name = os.path.basename(input_file)
    jsonl_file = f'jsonl/{os.path.splitext(base_name)[0]}.jsonl'
    print(f'Processing {input_file}')
    # Plain strings are copied verbatim; tuples pair a tag with its cleaner.
    xml_to_jsonl(input_file, jsonl_file, [
        'citation-number',
        ("author", remove_punctuation),
        ("editor", clean_editor),
        ("authority", remove_punctuation),
        ("title", remove_punctuation),
        ("legal-ref", remove_punctuation),
        ("container-title", clean_container),
        ("journal", clean_container),
        ("date", extract_year),
        ("backref", remove_punctuation)
    ])
```
%% Output
Processing in\10.1111_1467-6478.00057.xml
Processing in\10.1111_1467-6478.00080.xml
Processing in\10.1515_zfrs-1980-0103.xml
Processing in\10.1515_zfrs-1980-0104.xml
%% Cell type:markdown id:6b14734c9c1f6ea6 tags:
## Create JSON and LinkML schema from JSON data
%% Cell type:code id:43e2040fed89c0bd tags:
``` python
# Adapted from https://github.com/linkml/schema-automator/blob/main/tests/test_generalizers/test_json_data_generalizer.py
import glob
import os
import pandas as pd
import json
from schema_automator.generalizers.json_instance_generalizer import JsonDataGeneralizer
from linkml.generators.yamlgen import YAMLGenerator
from schema_automator.utils.schemautils import write_schema
import warnings
warnings.filterwarnings("ignore")  # Suppress irrelevant timezone warning
# Set to True to additionally serialize each schema with YAMLGenerator.
WRITE_EXTENDED_SCHEMA = False
ie = JsonDataGeneralizer()
for input_file in glob.glob('jsonl/*.jsonl'):
    base_name = os.path.basename(input_file)
    base_name_no_ext = os.path.splitext(base_name)[0]
    json_file = f'json/{base_name_no_ext}.json'
    schema_file = f'schema/{base_name_no_ext}-schema.yaml'
    extended_schema_file = f'schema/{base_name_no_ext}-schema2.yaml'
    print(f'Processing {input_file}')
    # Read through the explicitly UTF-8 handle instead of re-opening the path.
    with open(input_file, 'r', encoding='utf-8') as input_buf:
        df = pd.read_json(input_buf, lines=True)
    # Flatten the per-line "out" lists into one list of reference dicts.
    flat_list = [item for sublist in df['out'].tolist() for item in sublist if pd.notna(item)]
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(flat_list, f)
    # Convert once, naming the container class "Reference"; a preceding call
    # without the class name only produced a result that was thrown away.
    schema = ie.convert(json_file, format='json', container_class_name="Reference")
    write_schema(schema, schema_file)
    if WRITE_EXTENDED_SCHEMA:
        s = YAMLGenerator(schema_file).serialize()
        with open(extended_schema_file, 'w', encoding='utf-8') as stream:
            stream.write(s)
```
%% Output
Processing jsonl\10.1111_1467-6478.00057.jsonl
WARNING:quantulum3.classifier:The classifier was built using a different scikit-learn version (=1.5.0, !=1.5.1). The disambiguation tool could behave unexpectedly. Consider running classifier.train_classfier()
Processing jsonl\10.1111_1467-6478.00080.jsonl
Processing jsonl\10.1515_zfrs-1980-0103.jsonl
Processing jsonl\10.1515_zfrs-1980-0104.jsonl
%% Cell type:code id:ca27199d10d9b8bf tags:
%% Cell type:markdown id:685c3cbbeaa1b733 tags:
``` python
# Create canonical schema file from one of the auto-generated ones
```
%% Cell type:markdown id:be7693b86de42036 tags:
%% Cell type:code id:2b48a09bda0a4387 tags:
``` python
import yaml
def merge_slot_data(input_file, slots_data_file, output_file):
    """Merge additional slot attributes into a schema file.

    For every slot listed under ``slots`` in *input_file*, the attributes of
    the slot with the same name in *slots_data_file* are merged in, overriding
    attributes that already exist. Slots that only occur in *slots_data_file*
    are not added. The merged document is written to *output_file*.

    Args:
        input_file: Path of the YAML schema providing the slots to extend.
        slots_data_file: Path of the YAML file with additional slot attributes.
        output_file: Path of the YAML file to write.
    """
    # Read and write as UTF-8 like the rest of the file does, instead of
    # relying on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        schema = yaml.safe_load(f)
    with open(slots_data_file, 'r', encoding='utf-8') as f:
        slot_data = yaml.safe_load(f)
    # Tolerate an empty slot-data file or one without a "slots" section.
    extra_slots = (slot_data or {}).get('slots') or {}
    slots = schema['slots']
    for key in slots:
        extra = extra_slots.get(key)
        if not extra:
            continue
        # A slot declared without attributes is loaded as None.
        if slots[key] is None:
            slots[key] = {}
        slots[key].update(extra)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(schema, f, default_flow_style=False, sort_keys=False)


merge_slot_data('schema/10.1111_1467-6478.00080-schema.yaml', 'schema/slot-data.yaml', 'schema/simple-schema.yaml')
```
%% Cell type:code id:1690bb4c648c5538 tags:
``` python
```
......
[
{
"citation-number": "1",
"author": "Geiger",
"date": "1964"
},
{
"citation-number": "2",
"author": "Feest/Blankenburg",
"date": "1972"
},
{
"citation-number": "2",
"author": "ich",
"title": "Nichtkriminalisierung als Struktur und Routine",
"date": "1976"
},
{
"citation-number": "3",
"author": "Peter MacNaughton-Smith und Richard Rosellen",
"title": "Bereitschaft zur Anzeigeerstattung",
"date": "1978"
},
{
"citation-number": "3",
"author": "Richard Rosellen",
"title": "Private Verbrechenskontrolle \u2014 eine empirische Untersuchung zur Anzeigeerstattung",
"date": "1980"
},
{
"citation-number": "4",
"author": "Blankenburg/Sessar/Steffen",
"date": "1978"
},
{
"citation-number": "5",
"author": "Black",
"date": "1973"
},
{
"citation-number": "6",
"author": "Gessner",
"date": "1976"
},
{
"citation-number": "7",
"author": "Luhmann",
"date": "1980"
},
{
"citation-number": "8",
"author": "Gessner",
"date": "1976"
},
{
"citation-number": "9",
"author": "Sch\u00f6nholz",
"date": "1980"
},
{
"citation-number": "10",
"author": "Blankenburg/Sch\u00f6nholz; Rogowski",
"date": "1979"
},
{
"citation-number": "11",
"author": "Hilden",
"date": "1976"
},
{
"citation-number": "12",
"author": "Koch",
"date": "1975"
},
{
"citation-number": "13",
"author": "Statistisches Bundesamt Wiesbaden",
"title": "Fachserie 10 (Rechtspflege) Reihe 2.1, Tabelle 10",
"date": "1978"
},
{
"citation-number": "14",
"author": "Blankenburg/Sch\u00f6nholz; Rogowski",
"date": "1979"
},
{
"citation-number": "15"
},
{
"citation-number": "16",
"author": "Johnson",
"date": "1979"
},
{
"citation-number": "17",
"author": "Steinbach",
"date": "1979"
},
{
"citation-number": "17",
"author": "Blankenburg/Blankenburg/Morasch",
"date": "1972"
},
{
"citation-number": "18",
"title": "Projektbericht ,Rechtshilfebed\u00fcrfnisse sozial Schwacher",
"author": "Blankenburg/Gorges/Reifner; Ticmann).",
"date": "1980"
},
{
"citation-number": "19",
"author": "Baumg\u00e4rtei",
"date": "1976"
},
{
"citation-number": "20"
},
{
"citation-number": "21",
"title": "Projektbericht Rechtsschutzversicherung",
"author": "Blankenburg; Fiedler",
"date": "1980"
},
{
"citation-number": "22",
"author": "Reifner/Gorges",
"date": "1980"
},
{
"citation-number": "23",
"author": "Reifner",
"date": "1979"
},
{
"citation-number": "24",
"author": "Carlin/Howard/Messinger",
"date": "1967"
},
{
"citation-number": "25",
"author": "Galanter",
"date": "1974"
},
{
"citation-number": "25",
"author": "Sarat",
"date": "1976"
},
{
"citation-number": "26",
"author": "Bender/Schumacher",
"date": "1980"
},
{
"citation-number": "27",
"author": "Steinbach",
"date": "1979"
},
{
"citation-number": "27",
"author": "Blankenburg/Blankenburg/Morasch",
"date": "1972"
},
{
"citation-number": "28",
"author": "Reifner",
"date": "1978"
},
{
"citation-number": "29",
"author": "Blankenburg/Sch\u00f6nholz; Rogowski",
"date": "1979"
},
{
"citation-number": "30",
"title": "Recht als gradualisiertes Konzept",
"date": "1980"
},
{
"citation-number": "31",
"author": "Steinbach",
"date": "1979"
},
{
"citation-number": "31",
"author": "Bender/Schumacher",
"date": "1980"
},
{
"citation-number": "32",
"author": "Wanner",
"date": "1975"
},
{
"citation-number": "32",
"author": "Bender/Schumacher",
"date": "1980"
},
{
"citation-number": "32",
"author": "Galanter",
"date": "1974"
},
{
"citation-number": "32",
"author": "Sarat",
"date": "1976"
},
{
"citation-number": "33",
"author": "Blankenburg/Sch\u00f6nholz; Rogowski",
"date": "1979"
},
{
"citation-number": "34",
"backref": "Ebenda"
},
{
"citation-number": "35",
"author": "Luhmann",
"date": "1969"
},
{
"citation-number": "36",
"author": "Felstiner und Danzig/Lowy",
"journal": "Law and Society Review",
"date": "1974"
},
{
"author": "Baumg\u00e4rtei, Gottfried",
"date": "1976",
"title": "Gleicher Zugang zum Recht f\u00fcr alle"
},
{
"author": "Bender, Rolf und Rolf Schumacher",
"date": "1980",
"title": "Erfolgsbarrieren vor Gericht."
},
{
"author": "Black, Donald",
"date": "1973",
"title": "The Mobilization of Law",
"journal": "Journal of Legal Studies"
},
{
"author": "Blankenburg, Erhard/Blankenburg, Viola/Morasch, Helmut",
"date": "1972",
"title": "Der lange Weg in die Berufung"
},
{
"editor": "Bender, Rolf",
"container-title": "Tatsachenforschung in der Justiz"
},
{
"author": "Blankenburg, Erhard",
"date": "1976",
"title": "Nichtkriminalisierung als Struktur und Routine"
},
{
"editor": "G\u00f6ppger",
"title": "Hans und G\u00fcnter Kaiser: Kriminologie und Strafverfahren"
},
{
"author": "Blankenburg, Erhard/Sessar, Klaus/Steffen, Wiebke",
"date": "1978",
"title": "Die Staatsanwaltschaft im Proze\u00df strafrechtlicher Sozialkontrolle"
},
{
"author": "Blankenburg, Erhard; Sch\u00f6nholz, Siegfried, unter Mitarbeit von Ralf Rogowski",
"date": "1979",
"title": "Zur Soziologie des Arbeitsgerichtsverfahrens"
},
{
"author": "Blankenburg, Erhard",
"date": "1980",
"title": "Recht als gradualisiertes Konzept"
},
{
"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner",
"container-title": "Alternative Rechtsformen und Alternativen zum Recht"
},
{
"author": "Carlin, Jerome-, Jan Howard und Sheldon Messinger",
"date": "1967",
"title": "Civil Justice and the Poor"
},
{
"author": "Richard /Lowy, Michael",
"date": "1974",
"title": "Everday Disputes and Mediation in the United States: A Reply to Professor Felstiner",
"journal": "Law and Society Review"
},
{
"author": "Feest, Johannes/Blankenburg, Erhard",
"date": "1972",
"title": "Die Definitionsmacht der Polizei"
},
{
"author": "Felstiner, William L. F",
"date": "1974",
"title": "Influences of Social Organization on Dispute processing",
"journal": "Law and Society Review"
},
{
"author": "Felstiner, William L. F",
"date": "1974",
"title": "Avoidance as Dispute Processing: An Elaboration",
"journal": "Law and Society Review"
},
{
"author": "Galanter, Marc",
"date": "1974",
"title": "Why the ,Haves* Come out Ahead: Speculations on the Limits of Legal Change",
"journal": "Law and Society Review"
},
{
"author": "Geiger, Theodor",
"date": "1964",
"title": "Vorstudien zu einer Soziologie des Rechts"
},
{
"author": "Neuwied. Gessner, Volkmar",
"date": "1976",
"title": "Recht und Konflikt"
},
{
"author": "Hilden, Hartmut",
"date": "1976",
"title": "Rechtstatsachen im R\u00e4umungsstreit. Frankfurt/Main"
},
{
"author": "Johnson, Earl",
"date": "1979",
"title": "Thinking about Access: A Preliminary Typology of Possible Strategies"
},
{
"editor": "Cappelletti, Mauro und Bryant Garth",
"container-title": "Access to Justice"
},
{
"editor": "Milan und Alphen/ Rijn"
},
{
"author": "Koch, Hartmut",
"date": "1975",
"title": "Das Gerichtsverfahren als Konfliktl\u00f6sungsproze\u00df \u2014 Einstellung von Kl\u00e4gern und Beklagten zu Mietprozessen"
},
{
"author": "Luhmann, Niklas",
"date": "1969",
"title": "Legitimation durch Verfahren"
},
{
"author": "Luhmann, Niklas",
"date": "1980",
"title": "Kommunikation \u00fcber Recht in Interaktionssystemen"
},
{
"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner",
"container-title": "Alternative Rechtsformen und Alternativen zum Recht"
},
{
"author": "Reifner, Udo",
"date": "1978",
"title": "Rechtshilfebed\u00fcrfnis und Verrechtlichung am Beispiel einer Berliner Mieterinitiative"
},
{
"author": "Reifner, Udo",
"date": "1979",
"title": "Gewerkschaftlicher Rechtsschutz \u2014 Geschichte des freigewerkschaftlichen Rechtsschutzes und der Rechtsberatung der Deutschen Arbeitsfront von 1894\u20141945"
},
{
"author": "Reifner, Udo und Irmela Gorges",
"date": "1980",
"title": "Alternativen der Rechtsberatung: Dienstleistung, F\u00fcrsorge und kollektive Selbsthilfe"
},
{
"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner",
"container-title": "Alternative Rechtsformen und Alternativen zum Recht"
},
{
"author": "Sarat",
"date": "1976",
"title": "Alternatives in Dispute Processing in a Small Claim Court",
"journal": "Law and Society Review"
},
{
"author": "Sch\u00f6nholz, Siegfried",
"date": "1980",
"title": "Arbeitsplatzsicherung oder Kompensation - Rechtliche Formen des Bestandsschutzes im Vergleich"
},
{
"editor": "Vereigung f\u00fcr Rechtssoziologie",
"container-title": "Arbeitslosigkeit und Recht"
},
{
"author": "Steinbach, E",
"date": "1979",
"title": "GMD-Bericht, Manuskript. Gesellschaft f\u00fcr Mathematik und Datenverarbeitung"
},
{
"author": "Wanner, Craig",
"date": "1975",
"title": "The Public Ordering of Private Cases; Winning Civil Court Cases",
"journal": "Law and Society Review"
}
]
\ No newline at end of file
[{"citation-number": "1", "author": "Geiger", "date": "1964"}, {"citation-number": "2", "author": "Feest/Blankenburg", "date": "1972"}, {"citation-number": "2", "author": "ich", "title": "Nichtkriminalisierung als Struktur und Routine", "date": "1976"}, {"citation-number": "3", "author": "Peter MacNaughton-Smith und Richard Rosellen", "title": "Bereitschaft zur Anzeigeerstattung", "date": "1978"}, {"citation-number": "3", "author": "Richard Rosellen", "title": "Private Verbrechenskontrolle \u2014 eine empirische Untersuchung zur Anzeigeerstattung", "date": "1980"}, {"citation-number": "4", "author": "Blankenburg/Sessar/Steffen", "date": "1978"}, {"citation-number": "5", "author": "Black", "date": "1973"}, {"citation-number": "6", "author": "Gessner", "date": "1976"}, {"citation-number": "7", "author": "Luhmann", "date": "1980"}, {"citation-number": "8", "author": "Gessner", "date": "1976"}, {"citation-number": "9", "author": "Sch\u00f6nholz", "date": "1980"}, {"citation-number": "10", "author": "Blankenburg/Sch\u00f6nholz; Rogowski", "date": "1979"}, {"citation-number": "11", "author": "Hilden", "date": "1976"}, {"citation-number": "12", "author": "Koch", "date": "1975"}, {"citation-number": "13", "author": "Statistisches Bundesamt Wiesbaden", "title": "Fachserie 10 (Rechtspflege) Reihe 2.1, Tabelle 10", "date": "1978"}, {"citation-number": "14", "author": "Blankenburg/Sch\u00f6nholz; Rogowski", "date": "1979"}, {"citation-number": "15"}, {"citation-number": "16", "author": "Johnson", "date": "1979"}, {"citation-number": "17", "author": "Steinbach", "date": "1979"}, {"citation-number": "17", "author": "Blankenburg/Blankenburg/Morasch", "date": "1972"}, {"citation-number": "18", "title": "Projektbericht ,Rechtshilfebed\u00fcrfnisse sozial Schwacher", "author": "Blankenburg/Gorges/Reifner; Ticmann).", "date": "1980"}, {"citation-number": "19", "author": "Baumg\u00e4rtei", "date": "1976"}, {"citation-number": "20"}, {"citation-number": "21", "title": "Projektbericht 
Rechtsschutzversicherung", "author": "Blankenburg; Fiedler", "date": "1980"}, {"citation-number": "22", "author": "Reifner/Gorges", "date": "1980"}, {"citation-number": "23", "author": "Reifner", "date": "1979"}, {"citation-number": "24", "author": "Carlin/Howard/Messinger", "date": "1967"}, {"citation-number": "25", "author": "Galanter", "date": "1974"}, {"citation-number": "25", "author": "Sarat", "date": "1976"}, {"citation-number": "26", "author": "Bender/Schumacher", "date": "1980"}, {"citation-number": "27", "author": "Steinbach", "date": "1979"}, {"citation-number": "27", "author": "Blankenburg/Blankenburg/Morasch", "date": "1972"}, {"citation-number": "28", "author": "Reifner", "date": "1978"}, {"citation-number": "29", "author": "Blankenburg/Sch\u00f6nholz; Rogowski", "date": "1979"}, {"citation-number": "30", "title": "Recht als gradualisiertes Konzept", "date": "1980"}, {"citation-number": "31", "author": "Steinbach", "date": "1979"}, {"citation-number": "31", "author": "Bender/Schumacher", "date": "1980"}, {"citation-number": "32", "author": "Wanner", "date": "1975"}, {"citation-number": "32", "author": "Bender/Schumacher", "date": "1980"}, {"citation-number": "32", "author": "Galanter", "date": "1974"}, {"citation-number": "32", "author": "Sarat", "date": "1976"}, {"citation-number": "33", "author": "Blankenburg/Sch\u00f6nholz; Rogowski", "date": "1979"}, {"citation-number": "34", "backref": "Ebenda"}, {"citation-number": "35", "author": "Luhmann", "date": "1969"}, {"citation-number": "36", "author": "Felstiner und Danzig/Lowy", "journal": "Law and Society Review", "date": "1974"}, {"author": "Baumg\u00e4rtei, Gottfried", "date": "1976", "title": "Gleicher Zugang zum Recht f\u00fcr alle"}, {"author": "Bender, Rolf und Rolf Schumacher", "date": "1980", "title": "Erfolgsbarrieren vor Gericht."}, {"author": "Black, Donald", "date": "1973", "title": "The Mobilization of Law", "journal": "Journal of Legal Studies"}, {"author": "Blankenburg, 
Erhard/Blankenburg, Viola/Morasch, Helmut", "date": "1972", "title": "Der lange Weg in die Berufung"}, {"editor": "Bender, Rolf", "container-title": "Tatsachenforschung in der Justiz"}, {"author": "Blankenburg, Erhard", "date": "1976", "title": "Nichtkriminalisierung als Struktur und Routine"}, {"editor": "G\u00f6ppger", "title": "Hans und G\u00fcnter Kaiser: Kriminologie und Strafverfahren"}, {"author": "Blankenburg, Erhard/Sessar, Klaus/Steffen, Wiebke", "date": "1978", "title": "Die Staatsanwaltschaft im Proze\u00df strafrechtlicher Sozialkontrolle"}, {"author": "Blankenburg, Erhard; Sch\u00f6nholz, Siegfried, unter Mitarbeit von Ralf Rogowski", "date": "1979", "title": "Zur Soziologie des Arbeitsgerichtsverfahrens"}, {"author": "Blankenburg, Erhard", "date": "1980", "title": "Recht als gradualisiertes Konzept"}, {"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner", "container-title": "Alternative Rechtsformen und Alternativen zum Recht"}, {"author": "Carlin, Jerome-, Jan Howard und Sheldon Messinger", "date": "1967", "title": "Civil Justice and the Poor"}, {"author": "Richard /Lowy, Michael", "date": "1974", "title": "Everday Disputes and Mediation in the United States: A Reply to Professor Felstiner", "journal": "Law and Society Review"}, {"author": "Feest, Johannes/Blankenburg, Erhard", "date": "1972", "title": "Die Definitionsmacht der Polizei"}, {"author": "Felstiner, William L. F", "date": "1974", "title": "Influences of Social Organization on Dispute processing", "journal": "Law and Society Review"}, {"author": "Felstiner, William L. 
F", "date": "1974", "title": "Avoidance as Dispute Processing: An Elaboration", "journal": "Law and Society Review"}, {"author": "Galanter, Marc", "date": "1974", "title": "Why the ,Haves* Come out Ahead: Speculations on the Limits of Legal Change", "journal": "Law and Society Review"}, {"author": "Geiger, Theodor", "date": "1964", "title": "Vorstudien zu einer Soziologie des Rechts"}, {"author": "Neuwied. Gessner, Volkmar", "date": "1976", "title": "Recht und Konflikt"}, {"author": "Hilden, Hartmut", "date": "1976", "title": "Rechtstatsachen im R\u00e4umungsstreit. Frankfurt/Main"}, {"author": "Johnson, Earl", "date": "1979", "title": "Thinking about Access: A Preliminary Typology of Possible Strategies"}, {"editor": "Cappelletti, Mauro und Bryant Garth", "container-title": "Access to Justice"}, {"editor": "Milan und Alphen/ Rijn"}, {"author": "Koch, Hartmut", "date": "1975", "title": "Das Gerichtsverfahren als Konfliktl\u00f6sungsproze\u00df \u2014 Einstellung von Kl\u00e4gern und Beklagten zu Mietprozessen"}, {"author": "Luhmann, Niklas", "date": "1969", "title": "Legitimation durch Verfahren"}, {"author": "Luhmann, Niklas", "date": "1980", "title": "Kommunikation \u00fcber Recht in Interaktionssystemen"}, {"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner", "container-title": "Alternative Rechtsformen und Alternativen zum Recht"}, {"author": "Reifner, Udo", "date": "1978", "title": "Rechtshilfebed\u00fcrfnis und Verrechtlichung am Beispiel einer Berliner Mieterinitiative"}, {"author": "Reifner, Udo", "date": "1979", "title": "Gewerkschaftlicher Rechtsschutz \u2014 Geschichte des freigewerkschaftlichen Rechtsschutzes und der Rechtsberatung der Deutschen Arbeitsfront von 1894\u20141945"}, {"author": "Reifner, Udo und Irmela Gorges", "date": "1980", "title": "Alternativen der Rechtsberatung: Dienstleistung, F\u00fcrsorge und kollektive Selbsthilfe"}, {"editor": "Blankenburg, Erhard; Klausa, Ekkehard und Hubert Rottleuthner", 
"container-title": "Alternative Rechtsformen und Alternativen zum Recht"}, {"author": "Sarat", "date": "1976", "title": "Alternatives in Dispute Processing in a Small Claim Court", "journal": "Law and Society Review"}, {"author": "Sch\u00f6nholz, Siegfried", "date": "1980", "title": "Arbeitsplatzsicherung oder Kompensation - Rechtliche Formen des Bestandsschutzes im Vergleich"}, {"editor": "Vereigung f\u00fcr Rechtssoziologie", "container-title": "Arbeitslosigkeit und Recht"}, {"author": "Steinbach, E", "date": "1979", "title": "GMD-Bericht, Manuskript. Gesellschaft f\u00fcr Mathematik und Datenverarbeitung"}, {"author": "Wanner, Craig", "date": "1975", "title": "The Public Ordering of Private Cases; Winning Civil Court Cases", "journal": "Law and Society Review"}]
\ No newline at end of file
This diff is collapsed.
name: Container
description: Container
id: https://w3id.org/Container
name: Reference
description: Reference
id: https://w3id.org/Reference
imports:
- linkml:types
prefixes:
linkml: https://w3id.org/linkml/
Container: https://w3id.org/Container
default_prefix: Container
Reference: https://w3id.org/Reference
default_prefix: Reference
slots:
citation-number:
examples:
......@@ -45,7 +45,7 @@ slots:
- value: Dean v. District of Columbia 653 U.S. App. D.C
range: string
classes:
Container:
Reference:
slots:
- citation-number
- author
......
name: Container
description: Container
id: https://w3id.org/Container
name: Reference
description: Reference
id: https://w3id.org/Reference
imports:
- linkml:types
prefixes:
linkml: https://w3id.org/linkml/
Container: https://w3id.org/Container
default_prefix: Container
Reference: https://w3id.org/Reference
default_prefix: Reference
slots:
citation-number:
examples:
......@@ -45,7 +45,7 @@ slots:
- value: Cmnd. 2154
range: string
classes:
Container:
Reference:
slots:
- citation-number
- author
......
name: Container
description: Container
id: https://w3id.org/Container
name: Reference
description: Reference
id: https://w3id.org/Reference
imports:
- linkml:types
prefixes:
linkml: https://w3id.org/linkml/
Container: https://w3id.org/Container
default_prefix: Container
Reference: https://w3id.org/Reference
default_prefix: Reference
slots:
citation-number:
examples:
......@@ -41,7 +41,7 @@ slots:
- value: Arbeitslosigkeit und Recht
range: string
classes:
Container:
Reference:
slots:
- citation-number
- author
......
name: Container
description: Container
id: https://w3id.org/Container
name: Reference
description: Reference
id: https://w3id.org/Reference
imports:
- linkml:types
prefixes:
linkml: https://w3id.org/linkml/
Container: https://w3id.org/Container
default_prefix: Container
Reference: https://w3id.org/Reference
default_prefix: Reference
slots:
editor:
examples:
......@@ -41,7 +41,7 @@ slots:
- value: oben N. 52
range: string
classes:
Container:
Reference:
slots:
- editor
- title
......
name: Container
description: Container
id: https://w3id.org/Container
name: Reference
description: Reference
id: https://w3id.org/Reference
imports:
- linkml:types
prefixes:
linkml: https://w3id.org/linkml/
Container: https://w3id.org/Container
default_prefix: Container
Reference: https://w3id.org/Reference
default_prefix: Reference
slots:
citation-number:
examples:
......@@ -58,7 +58,7 @@ slots:
description: A legal reference, such as the name or identifier of a court decision
or law, sometimes including the source where this reference can be found
classes:
Container:
Reference:
slots:
- citation-number
- author
......
%% Cell type:markdown id:233c437cd1f9a650 tags:
# Prompt design
Template has been adapted from https://sadlynothavocdinosaur.com/posts/diagram-sentence/
%% Cell type:code id:bc99427770b546bc tags:
``` python
prompt_template = """
Below I will provide you with German language references that have been extracted from the footnotes of academic texts. Your job is to segment the references contained in the lines into their constituent parts, and to produce an XML representation of the structure of the references, in such a way that when the text content of the XML nodes are joined by whitespace, the original text can be reconstructed (additional inserted whitespace does not matter).
1. XML Structure:
The XML consists of a <dataset> root and <sequence> nodes, one for each reference in the list with subnodes having the following tags:
- author: the author or authors of the referenced work. Create one author node for each author
- backref: back-reference such as "footnote 5, above", "ibid.", "ebd.", etc., if the reference string is from a footnote
- citation-number: the number of the footnote, if applicable
- collection-title: the title of the series that contains a work, if applicable
- container-title: the title of the work containing the referenced work, if applicable
- date: the date of the work. in most cases, the year of publication
- editor: if the work has editors, provide one editor node per person.
- ignore: this node contains all text that does not belong to any other node. It is needed in the segmentation of footnotes, which contain additional commentary unrelated to the bibliographic information.
- journal: the name of the journal. In many cases, the journal name is abbreviated
- location: the location of the publisher, if given
- note: this node is for information that is not bibliographic but is relevant for the citation. see examples below
- pages: page numbers
- publisher: the name of the publisher, if given
- signal: the introductory signal phrase indicating the character of, and motivation for, the citation, such as "see also", "cf.", "so auch", "anderer Meinung", "vgl.". etc. In many cases, the signal phrase contains additional commentary.
- title: the title of the referenced work
- volume: volume and issue of the work, if applicable
Note:
2. Example input and output:
INPUT:
1 Geiger 1964, insbesondere S. 65—83.
2 Vgl. Feest/Blankenburg, 1972. Die Konsequenz einer größeren Dunkelziffer bei den von der Polizei selbst entdeckten Straftaten entwickle ich ausführlicher in meinem Beitrag über ,Nichtkriminalisierung als Struktur und Routine', 1976.
3 Angaben aus einer Befragung von Peter MacNaughton-Smith und Richard Rosellen zur 'Bereitschaft zur Anzeigeerstattung' Manuskript Max-Planck-Institut für Strafrecht, Freiburg 1978.
Der ausführliche Forschungsbericht von Richard Rosellen erscheint in Kürze unter dem Titel 'Private Verbrechenskontrolle — eine empirische Untersuchung zur Anzeigeerstattung', Berlin, voraussichtlich 1980.
4 Vgl. Blankenburg/ Sessar/ Steffen, 1978, S. 66-85.
27 Siehe etwa die Erklärung, warum das „Access to justice movement" in der Bundesrepublik vergleichsweise wenig Widerhall gefunden hat von Blankenburg, Patterns of Legal Culture as a Variable for the Chances of Legal Innovation, in: Blankenburg (Hrsg.), innovations in the Legal Services (Cambridge, Mass.; Meisenheim 1980).
2 Rabel, Aufgabe und Notwendigkeit der Rechtsvergleichung (1925), abgedruckt in: Rabel, Gesammelte Aufsätze III (Hrsg. Leser, 1967) 1 (3).
18 Zu entsprechenden Versuchen etwa Merryman, Comparative Law and Scientific Explanation, in: Law in the United States of America in Social and Technological Revolution (Brüssel 1974) 81 (89 ff.).
OUTPUT:
<dataset>
<sequence>
<citation-number>1</citation-number>
<author>Geiger</author>
<date>1964,</date>
<pages>insbesondere S. 65—83.</pages>
</sequence>
<sequence>
<citation-number>2</citation-number>
<signal>Vgl.</signal>
<author>Feest/Blankenburg,</author>
<date>1972.</date>
</sequence>
<sequence>
<signal>Die Konsequenz einer größeren Dunkelziffer bei den von der Polizei selbst entdeckten Straftaten entwickle ich ausführlicher in meinem Beitrag über </signal>
<title>,Nichtkriminalisierung als Struktur und Routine',</title>
<date>1976.</date>
</sequence>
<sequence>
<citation-number>3</citation-number>
<note>Angaben aus einer Befragung von</note>
<author>Peter MacNaughton-Smith</author>
<ignore>und</ignore>
<author>Richard Rosellen</author>
<ignore>zur</ignore>
<title>'Bereitschaft zur Anzeigeerstattung' </title>
<note>Manuskript</note>
<publisher> Max-Planck-Institut für Strafrecht,</publisher>
<location>Freiburg</location>
<date>1978.</date>
</sequence>
<sequence>
<signal>Der ausführliche Forschungsbericht von </signal>
<author>Richard Rosellen</author>
<note>erscheint in Kürze unter dem Titel</note>
<title>'Private Verbrechenskontrolle — eine empirische Untersuchung zur Anzeigeerstattung',</title>
<location>Berlin,</location>
<date>voraussichtlich 1980.</date>
</sequence>
<sequence>
<citation-number>4</citation-number>
<signal>Vgl.</signal>
<author>Blankenburg/</author>
<author>Sessar/</author>
<author>Steffen,</author>
<date>1978,</date>
<pages>S. 66-85.</pages>
</sequence>
<sequence>
<citation-number>27</citation-number>
<signal>Siehe etwa die Erklärung, warum das „Access to justice movement&quot; in der Bundesrepublik vergleichsweise wenig Widerhall gefunden hat von</signal>
<author>Blankenburg,</author>
<title>Patterns of Legal Culture as a Variable for the Chances of Legal Innovation,</title>
<editor>in: Blankenburg (Hrsg.),</editor>
<container-title>innovations in the Legal Services</container-title>
<location>(Cambridge, Mass.; Meisenheim</location>
<date>1980).</date>
</sequence>
<sequence>
<citation-number>2</citation-number>
<author>Rabel,</author>
<title>Aufgabe und Notwendigkeit der Rechtsvergleichung (1925),</title>
<note>abgedruckt in:</note>
<author>Rabel,</author>
<title>Gesammelte Aufsätze</title>
<volume>III</volume>
<editor>(Hrsg. Leser,</editor>
<date>1967)</date>
<pages>1 (3).</pages>
</sequence>
<sequence>
<citation-number>18</citation-number>
<signal>Zu entsprechenden Versuchen etwa</signal>
<author>Merryman,</author>
<title>Comparative Law and Scientific Explanation,</title>
<container-title>in: Law in the United States of America in Social and Technological Revolution</container-title>
<location>(Brüssel</location>
<date>1974)</date>
<pages>81 (89 ff.).</pages>
</sequence>
</dataset>
3. Conclusion:
Now that I've given you these specifications, your job is to make such an object for the following list of references:
{reference}
DO NOT provide any additional commentary, simply return the XML data as per the specifications:
"""
```
%% Cell type:code id:7268ce5243d868ff tags:
``` python
```
%% Cell type:markdown id:850189270f4120f tags:
%% Cell type:code id:50175a95d764932e tags:
``` python
import json
import regex as re
# Builds an example section for a system prompt from the first three records
# of an annotated JSONL file and prints the resulting prompt.
input_path = "data/10.1515_zfrs-1980-0103-finder.jsonl"
with open(input_path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# NOTE(review): `input` shadows the builtin and is never filled, so the
# f-string below emits an empty EXAMPLE INPUT - confirm the intent.
input = ""
for line in lines[:3]:
item = json.loads(line)
text, spans = item['text'], item['spans']
for span in spans:
# Cut the span out of the text and trim punctuation and spaces at both ends.
span_text = re.sub(r'^[\p{P} ]+|[\p{P} ]+$','',text[span['start']:span['end']])
# NOTE(review): `output` is not initialised in this cell; unless an earlier
# cell defines it as a list, this line raises NameError - verify.
output.append(f"Text:{span_text}\nLabel:{span['label']}")
output = "\n".join(output)
# NOTE(review): `system_prompt` is not defined in this cell either -
# presumably set by an earlier cell; confirm.
system_prompt += f'\n>>EXAMPLE INPUT{input}\n\n>>EXAMPLE OUTPUT:\n{output}\n'
print(system_prompt)
```
%% Cell type:code id:initial_id tags:
``` python
import os
from openai import OpenAI
from dotenv import load_dotenv
# Load variables from a .env file into the environment before reading the key.
load_dotenv()

# Connection settings for the chat endpoint
api_key = os.getenv("GWDG_TI_API_KEY")
base_url = "https://chat-ai.academiccloud.de/v1"
models = "intel-neural-chat-7b mixtral-8x7b-instruct qwen1.5-72b-chat meta-llama-3-70b-instruct".split()

# Client pointed at the endpoint above
client = OpenAI(base_url=base_url, api_key=api_key)

# Use the first model of the list
model = models[0]

# Send one system and one user message
response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "You will be provided with a t"},
        {"role": "user", "content": "Definiere Rechtsdogmatik"},
    ],
)

# Print the text of the first returned choice
content = response.choices[0].message.content
print(content)
```
%% Cell type:code id:a43a6014692373a9 tags:
``` python
!python -m prodigy spans.manual footnotes blank:de ./data/10.1515_zfrs-1980-0103-parser.jsonl --label "author backref citation-number collection-title container-title date editor ignore journal location note pages publisher signal title volume"
```
%% Output
^C
%% Cell type:code id:78ae68df9f88f7a1 tags:
``` python
```
%% Cell type:code id:53fc1768ed2d7ab0 tags:
``` python
!python -m prodigy stats
```
%% Output

============================= [*] Prodigy Stats =============================
Version 1.15.3
License Type Prodigy Personal
Location C:\Users\Boulanger\anaconda3\envs\experiments\Lib\site-packages\prodigy
Prodigy Home C:\Users\Boulanger\.prodigy
Platform Windows-10-10.0.19044-SP0
Python Version 3.11.7
spaCy Version 3.7.4
Database Name SQLite
Database Id sqlite
Total Datasets 0
Total Sessions 0
%% Cell type:code id:6187045fd6b21a31 tags:
``` python
```
%% Cell type:markdown id:7c3613c385449e4 tags:
adapted from https://kili-technology.com/data-labeling/machine-learning/using-chatgpt-to-pre-annotate-named-entities-recognition-labeling-tasks
%% Cell type:code id:12d506f869193ef4 tags:
``` python
# Turn the per-sentence answers into one annotation payload per sentence.
json_response_array = []
for datapoint, sentence_annotations in zip(dataset, openai_answers):
    full_sentence = datapoint["sentence"]
    annotations = []  # all annotations found for this sentence
    for category, _ in ENTITY_TYPES:
        for content in sentence_annotations[category]:
            # str.find yields the position of the first occurrence only.
            begin_offset = full_sentence.find(content)
            assert begin_offset != -1, f"Cannot find offset of '{content}' in sentence '{full_sentence}'"
            annotations.append({
                "categories": [{"name": category}],
                "beginOffset": begin_offset,
                "content": content,
            })
    json_response_array.append(
        {"NAMED_ENTITIES_RECOGNITION_JOB": {"annotations": annotations}}
    )
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment