From 2ef626ae86c23b9cd6aef8418fd04d78a7f8b864 Mon Sep 17 00:00:00 2001 From: Christian Boulanger <boulanger@lhlt.mpg.de> Date: Tue, 19 Mar 2024 18:29:59 +0100 Subject: [PATCH] Add GPT-4 optimized prompt, refactoring --- wikidata/.gitignore | 6 +- wikidata/data/Thilo Ramm.csv | 12 -- wikidata/{ => data}/scholars-qid.csv | 0 wikidata/{ => data}/scholars.csv | 0 ...s.ipynb => full-extraction-workflow.ipynb} | 52 +++--- wikidata/lib/langchain.py | 7 +- wikidata/output/.gitkeep | 1 + .../{data => output}/Erhard Blankenburg.csv | 0 wikidata/output/Thilo Ramm-gp4-prompt.csv | 7 + .../output/Thilo Ramm-handmade-prompt.csv | 14 ++ .../prompts/gp4-optimized-prompt-template.txt | 48 +++++ .../prompts/gpt4-optimized-test-prompt.txt | 58 ++++++ .../handmade-prompt-template.txt} | 3 +- wikidata/prompts/handmade-test-prompt.txt | 40 ++++ wikidata/update-wikidata.ipynb | 176 ------------------ 15 files changed, 208 insertions(+), 216 deletions(-) delete mode 100644 wikidata/data/Thilo Ramm.csv rename wikidata/{ => data}/scholars-qid.csv (100%) rename wikidata/{ => data}/scholars.csv (100%) rename wikidata/{download-wikipedia-pages.ipynb => full-extraction-workflow.ipynb} (70%) create mode 100644 wikidata/output/.gitkeep rename wikidata/{data => output}/Erhard Blankenburg.csv (100%) create mode 100644 wikidata/output/Thilo Ramm-gp4-prompt.csv create mode 100644 wikidata/output/Thilo Ramm-handmade-prompt.csv create mode 100644 wikidata/prompts/gp4-optimized-prompt-template.txt create mode 100644 wikidata/prompts/gpt4-optimized-test-prompt.txt rename wikidata/{extraction-prompt.txt => prompts/handmade-prompt-template.txt} (85%) create mode 100644 wikidata/prompts/handmade-test-prompt.txt delete mode 100644 wikidata/update-wikidata.ipynb diff --git a/wikidata/.gitignore b/wikidata/.gitignore index afab930..6f3b439 100644 --- a/wikidata/.gitignore +++ b/wikidata/.gitignore @@ -1,7 +1,7 @@ -timeline_data.xlsx +data/timeline_data.xlsx user-config.py .env apicache /throttle.ctrl -data/*-chatgpt.csv -data/*-wikipedia.txt \ No newline at end of file +input/*-chatgpt.csv +input/*-wikipedia.txt \ No newline at end of file diff --git a/wikidata/data/Thilo Ramm.csv b/wikidata/data/Thilo Ramm.csv deleted file mode 100644 index a33217a..0000000 --- a/wikidata/data/Thilo Ramm.csv +++ /dev/null @@ -1,12 +0,0 @@ -subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url -Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,educated at,P69,University of Freiburg,,1953,1953,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,,1961,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -University of Giessen,,founded by,P112,Thilo Ramm,Q59533838,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,academic appointment,P8413,Fernuniversität in Hagen,,1977,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,field of work,P101,Civil law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,field of work,P101,Labor law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,field of work,P101,Social law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 -Thilo Ramm,Q59533838,field of work,P101,Social philosophy,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 diff --git a/wikidata/scholars-qid.csv b/wikidata/data/scholars-qid.csv similarity index 100% rename from wikidata/scholars-qid.csv rename to wikidata/data/scholars-qid.csv diff --git a/wikidata/scholars.csv b/wikidata/data/scholars.csv similarity index 100% rename from wikidata/scholars.csv rename to wikidata/data/scholars.csv diff --git a/wikidata/download-wikipedia-pages.ipynb b/wikidata/full-extraction-workflow.ipynb similarity index 70% rename from wikidata/download-wikipedia-pages.ipynb rename to wikidata/full-extraction-workflow.ipynb index 038989a..7b71619 100644 --- a/wikidata/download-wikipedia-pages.ipynb +++ b/wikidata/full-extraction-workflow.ipynb @@ -24,13 +24,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-03-18T15:26:43.976993400Z", - "start_time": "2024-03-18T15:26:43.935308500Z" + "end_time": "2024-03-19T16:26:12.465730900Z", + "start_time": "2024-03-19T16:26:07.245012500Z" } }, "outputs": [ @@ -63,7 +63,7 @@ " print(f'No Wikipedia page exists for {fullName}.')\n", " continue\n", "\n", - " wikipedia_content_cache_path = f'data/{fullName}-wikipedia.txt'\n", + " wikipedia_content_cache_path = f'input/{fullName}-wikipedia.txt'\n", " if not os.path.isfile(wikipedia_content_cache_path):\n", " page_data = get_wikipedia_page_data(pagetTitle, language_code)\n", " if page_data and page_data['page'].exists: \n", @@ -106,16 +106,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\langchain_openai\\chat_models\\base.py:454: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n", - " response = response.dict()\n", - "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pydantic\\main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n", - " warnings.warn('The `dict` method is deprecated; use `model_dump` instead.', DeprecationWarning)\n" + "Prompting with handmade-prompt-template.txt took 0 minutes and 28 seconds and extracted 13 triples.\n", + "Prompting with gp4-optimized-prompt-template.txt took 0 minutes and 19 seconds and extracted 6 triples.\n" ] } ], @@ -123,19 +121,30 @@ "from lib.langchain import extract_to_csv\n", "from langchain_openai import ChatOpenAI\n", "from pathlib import Path\n", + "import time\n", + "\n", "fullName = \"Thilo Ramm\"\n", "qid=\"Q59533838\"\n", "model = ChatOpenAI(model_name=\"gpt-4\")\n", - "template = Path('extraction-prompt.txt').read_text()\n", - "website_text = Path(f'data/{fullName}-wikipedia.txt').read_text()\n", - "csv_path = f'data/{fullName}.csv'\n", - "df = extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid, website_text=website_text)" + "website_text = Path(f'input/{fullName}-wikipedia.txt').read_text()\n", + "\n", + "for template_file in ['handmade-prompt-template.txt', 'gp4-optimized-prompt-template.txt']:\n", + " start_time = time.time()\n", + " template = Path(f'prompts/{template_file}').read_text() \n", + " df = extract_to_csv(model, template, debug=False, fullName=fullName, qid=qid, website_text=website_text)\n", + " end_time = time.time()\n", + " execution_time = end_time - start_time # In seconds\n", + " minutes, seconds = divmod(execution_time, 60)\n", + " print(f\"Prompting with {template_file} took {int(minutes)} minutes and {int(seconds)} seconds and extracted {len(df)} triples.\")\n", + " csv_path = f'output/{fullName}-{template_file.split(\"-\")[0]}-prompt.csv'\n", + " df.to_csv(csv_path, index=False)\n", + " \n" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-03-18T17:03:40.614971500Z", - "start_time": "2024-03-18T17:03:11.400373800Z" + "end_time": "2024-03-19T17:19:30.764372400Z", + "start_time": "2024-03-19T17:18:42.398193300Z" } }, "id": "d904e502f8eff15d" @@ -144,13 +153,14 @@ "cell_type": "code", "execution_count": null, "outputs": [], - "source": [ - "from lib.wikidata import update_wikidata_from_csv" - ], + "source": [], "metadata": { - "collapsed": false + "collapsed": false, + "ExecuteTime": { + "start_time": "2024-03-19T16:26:16.823261600Z" + } }, - "id": "7e04f078c326387a" + "id": "2c11affcd6ffbff4" } ], "metadata": { diff --git a/wikidata/lib/langchain.py b/wikidata/lib/langchain.py index 2a8f3f1..6703184 100644 --- a/wikidata/lib/langchain.py +++ b/wikidata/lib/langchain.py @@ -6,11 +6,12 @@ from dotenv import load_dotenv load_dotenv() -def extract_to_csv(model, template, csv_path, **params): +def extract_to_csv(model, template, debug=False, **params): prompt = ChatPromptTemplate.from_template(template) parser = StrOutputParser() chain = ( prompt | model | parser ) response = chain.invoke(params) + if debug: + print(response) data = io.StringIO(response) - df = pd.read_csv(data, dtype={'start_time': str, 'end_time': str}) - df.to_csv(csv_path, index=False) \ No newline at end of file + return pd.read_csv(data, dtype={'start_time': str, 'end_time': str}) diff --git a/wikidata/output/.gitkeep b/wikidata/output/.gitkeep new file mode 100644 index 0000000..f59ec20 --- /dev/null +++ b/wikidata/output/.gitkeep @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/wikidata/data/Erhard Blankenburg.csv b/wikidata/output/Erhard Blankenburg.csv similarity index 100% rename from wikidata/data/Erhard Blankenburg.csv rename to wikidata/output/Erhard Blankenburg.csv diff --git a/wikidata/output/Thilo Ramm-gp4-prompt.csv b/wikidata/output/Thilo Ramm-gp4-prompt.csv new file mode 100644 index 0000000..91b89e7 --- /dev/null +++ b/wikidata/output/Thilo Ramm-gp4-prompt.csv @@ -0,0 +1,7 @@ +Thilo Ramm,Q59533838,educated at,P69,University of Marburg,Unnamed: 5,Unnamed: 6,1949,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949.0,1949.0,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1951.0,1953.0,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,1953.0,1961.0,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962.0,1977.0,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,founded by,P112,University of Giessen,,,1962.0,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml +Thilo Ramm,Q59533838,employer,P108,Fernuniversität Hagen,,1977.0,,https://www.fernuni-hagen.de/universitaet/aktuelles/2018/07/am16-ramm_thilo_nachruf.shtml diff --git a/wikidata/output/Thilo Ramm-handmade-prompt.csv b/wikidata/output/Thilo Ramm-handmade-prompt.csv new file mode 100644 index 0000000..ae4263c --- /dev/null +++ b/wikidata/output/Thilo Ramm-handmade-prompt.csv @@ -0,0 +1,14 @@ +subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url +Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,academic degree,P512,doctorate,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +doctorate,,conferred by,P1027,University of Marburg,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,1953,1961,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,field of work,P101,Socialism,,1953,1953,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962,1977,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,field of work,P101,Civil law,,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,field of work,P101,Labour law,,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,field of work,P101,Social law,,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,field of work,P101,Social philosophy,,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +University of Giessen,,founded by,P112,Thilo Ramm,Q59533838,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 +Thilo Ramm,Q59533838,academic appointment,P8413,FernUniversity in Hagen,,1977,1982,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391 diff --git a/wikidata/prompts/gp4-optimized-prompt-template.txt b/wikidata/prompts/gp4-optimized-prompt-template.txt new file mode 100644 index 0000000..dde184a --- /dev/null +++ b/wikidata/prompts/gp4-optimized-prompt-template.txt @@ -0,0 +1,48 @@ +**Objective:** Extract structured data from text for Wikidata entry. + +**Text Source:** The text is about "{fullName}" with the QID {qid}. It contains sections separated by "-----", each starting with a URL followed by content excerpts. + +**Output Format:** Arrange data in a CSV table with columns: +- `subject-label` +- `subject-qid` +- `predicate` +- `pid` +- `object` +- `object-qid` +- `start_time` +- `end_time` +- `reference_url` + +**Rules:** +- **subject-label/subject-qid:** Generally, "{fullName}" and {qid}. For specific entities or reverse relationships (e.g., P112), adjust accordingly. +- **predicate/pid:** Use specific predicates like P69, P1066, etc., as guided. +- **object-label/object-qid:** English labels and, if available, QIDs for related institutions/persons. +- **start_time/end_time:** Dates for the duration of the statement's validity. If not specified, leave blank. +- **reference_url:** Source URL of the extracted information. + +**Data Extraction Guidance:** +- **Educated at (P69):** Institutions where the person studied. +- **Student of (P1066):** Supervisors for doctoral theses/habilitations. +- **Employer (P108):** Organization paying the salary. +- **Academic appointment (P8413):** Department of a university, or use P108 if unknown. +- **Student (P802):** Persons educated by the subject. +- **Member of (P463):** Organizations/associations the person belongs to. +- **Affiliation (P1416):** Organization affiliated with (not P463/P108). +- **Academic degree (P512):** Academic degrees obtained, refine with "conferred by" (P1027) and "point in time" (P585). +- **Field of work (P101):** Main topics/themes of work. +- **Editor (P98):** Editorial board memberships. +- **Founded by (P112):** Journals/organizations formally founded by the subject (reverse subject and object). +- **Significant person (P3342)/Object has role (P3831):** Reverse subject and object for P3342. Use object as subject and summarize the subject's role for P3831. + +**Format:** +- CSV with quoted values. Escape quotes within values properly. + +**Example Output:** +``` +"subject-label","subject-qid","predicate","pid","object","object-qid","start_time","end_time","reference_url" +"{fullName}","{qid}","educated at","P69","<some institution>","<qid if known>","","","<some url>" +``` + +**Note:** Extract data strictly as presented. Do not infer or add information not explicitly mentioned in the source material. +----- +{website_text} \ No newline at end of file diff --git a/wikidata/prompts/gpt4-optimized-test-prompt.txt b/wikidata/prompts/gpt4-optimized-test-prompt.txt new file mode 100644 index 0000000..7cb5c95 --- /dev/null +++ b/wikidata/prompts/gpt4-optimized-test-prompt.txt @@ -0,0 +1,58 @@ +**Objective:** Extract structured data from text for Wikidata entry. + +**Text Source:** The text is about "Erhard Blankenburg" with the QID Q51595283. It contains sections separated by "-----", each starting with a URL followed by content excerpts. + +**Output Format:** Arrange data in a CSV table with columns: +- `subject-label` +- `subject-qid` +- `predicate` +- `pid` +- `object` +- `object-qid` +- `start_time` +- `end_time` +- `reference_url` + +**Rules:** +- **subject-label/subject-qid:** Generally, "{fullName}" and {qid}. For specific entities or reverse relationships (e.g., P112), adjust accordingly. +- **predicate/pid:** Use specific predicates like P69, P1066, etc., as guided. +- **object-label/object-qid:** English labels and, if available, QIDs for related institutions/persons. +- **start_time/end_time:** Dates for the duration of the statement's validity. If not specified, leave blank. +- **reference_url:** Source URL of the extracted information. + +**Data Extraction Guidance:** +- **Educated at (P69):** Institutions where the person studied. +- **Student of (P1066):** Supervisors for doctoral theses/habilitations. +- **Employer (P108):** Organization paying the salary. +- **Academic appointment (P8413):** Department of a university, or use P108 if unknown. +- **Student (P802):** Persons educated by the subject. +- **Member of (P463):** Organizations/associations the person belongs to. +- **Affiliation (P1416):** Organization affiliated with (not P463/P108). +- **Academic degree (P512):** Academic degrees obtained, refine with "conferred by" (P1027) and "point in time" (P585). +- **Field of work (P101):** Main topics/themes of work. +- **Editor (P98):** Editorial board memberships. +- **Founded by (P112):** Journals/organizations formally founded by the subject (reverse subject and object). +- **Significant person (P3342)/Object has role (P3831):** Reverse subject and object for P3342. Use object as subject and summarize the subject's role for P3831. + +**Format:** +- CSV with quoted values. Escape quotes within values properly. + +**Example Output:** +``` +"subject-label","subject-qid","predicate","pid","object","object-qid","start_time","end_time","reference_url" +"Erhard Blankenburg","Q51595283","educated at","P69","Freie Universität Berlin","Q123456","","","de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122" +``` + +**Note:** Extract data strictly as presented. Do not infer or add information not explicitly mentioned in the source material. + +----- +de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122 + +== Werdegang == +Blankenburg belegte ein Studium der Philosophie, Soziologie und Germanistik an der Universität Freiburg und FU Berlin. Es folgten Graduate Studies und eine Tätigkeit als Forschungsassistent am Department of Sociology der University of Oregon. Ein Studium der Soziologie und Wirtschaftswissenschaft an der Universität Basel beendete er mit dem Abschluss Master of Arts 1965. + +Seine [[Promotion (Doktor)|Promotion]] zum Dr. phil. erfolgte an der Universität Basel 1966. +Als Assistent am Institut für Soziologie der Universität Freiburg arbeitete er von 1966 bis 1968. +Von 1969 bis 1971 war er Organisationsberater beim Quickborner Team, Hamburg. Danach arbeitete Blankenburg in Basel als Senior Projektleiter bei der [[Prognos]] in Basel. 1973/1974 war er [[wissenschaftlicher Mitarbeiter]] am [[Max-Planck-Institut für ausländisches und internationales Strafrecht]] in Freiburg. Die [[Habilitation]] für das Fach Soziologie erwarb er 1974 an der Universität Freiburg. Blankenburg war von 1975 bis 1980 Mitglied des [[Wissenschaftszentrum Berlin für Sozialforschung|Wissenschaftszentrums Berlin]], Internationales Institut für Management und Verwaltung. + +1980 bekam er einen Ruf auf den Lehrstuhl für Rechtssoziologie der [[Vrije Universiteit Amsterdam]]. Gemeinsam mit [[Wolfgang Kaupen]] spielte er eine wichtige Rolle bei der Neubegründung der Deutschen Rechtssoziologie in den 70er-Jahren (Raiser 1998), ebenso, mit [[Volkmar Gessner]], bei der Gründung des [[International Institute for the Sociology of Law]]. Er gehörte auch zu den Initiatoren und zu den Gründungsherausgebern der [[Zeitschrift für Rechtssoziologie]]. Gemeinsam mit [[Bill Felstiner]] organisierte er 1991 in Amsterdam das erste gemeinsame Treffen der beiden bedeutenden Vereinigungen der Rechtssoziologie (LSA und RCSL). Seine Beschäftigung mit rechtssoziologischen Themen war ungewöhnlich breit, reichte von der Soziologie der Kriminalität über die des Staatsapparates bis zu der des Zivilrechts. Blankenburg war primär Empiriker und Methodiker (vgl. seine Empirische Rechtssoziologie). Seine wichtigsten Beiträge zur rechtssoziologischen Theorie betreffen die Begriffe der "Mobilisierung des Rechts" und der "[[Rechtskultur]](en)". Vor allem aber wirkte er als Koordinator, Organisator und als Vermittler zwischen Wissenschaft und Praxis: "Er bemühte sich nicht, eine 'Schule' zu gründen, ihm fiel es leicht, in stets wechselnden Teams mit wechselnden Wissenschaftlern zusammenzuarbeiten. Wie kein anderer Rechtssoziologe vermochte er, erfolgreich Tagungen zu organisieren, kompetente Referenten zu gewinnen und die Veranstaltungen mit Autorität und zugleich locker zu leiten" ([[Theo Rasehorn]] 1998, 23). \ No newline at end of file diff --git a/wikidata/extraction-prompt.txt b/wikidata/prompts/handmade-prompt-template.txt similarity index 85% rename from wikidata/extraction-prompt.txt rename to wikidata/prompts/handmade-prompt-template.txt index 732babd..b7466a7 100644 --- a/wikidata/extraction-prompt.txt +++ b/wikidata/prompts/handmade-prompt-template.txt @@ -17,7 +17,8 @@ Insert data into the columns as per the following rules: - academic degree (P512): some instance of academic degree (Q189533). After making this claim, add further triples to refine the P512 statement with triples on "conferred by" (P1027) and on "point in time" (P585). - field of work (P101): extract the main topics and themes the subject has worked and published on - editor (P98): add information on memberships in editorial boards of academic journals - - founded by (P112): add information on journals, associations or other organizations that the subject helped to establish. When adding this claim, YOU MUST switch subject and object to express the reverse relationship + - founded by (P112): add information on journals, associations or other organizations of which the subject officially was a (co-) founder. When adding this claim, YOU MUST switch subject and object to express the reverse relationship. + - "significant person" (P3342) and "object has role" (P3831): If the subject was an important factor in the establishment of some object without being an official founder. As with P112, reverse subject and object for P3342. For P3831, use the object as subject of the claim and summarize the subject's role as the value. - object-label/object-qid: here the English labels and, if known, the QIDs for the institutions and persons who are the objects of the triple. If you are not absolutely sure, leave blank - start_time: the date/year from which the triple statement is true. Leave blank if the date is not specified or cannot be inferred, or the triple involves P585 - end_time: the date/year up to which the triple statement is true. If it is an event, identical to start_time diff --git a/wikidata/prompts/handmade-test-prompt.txt b/wikidata/prompts/handmade-test-prompt.txt new file mode 100644 index 0000000..baeeb48 --- /dev/null +++ b/wikidata/prompts/handmade-test-prompt.txt @@ -0,0 +1,40 @@ +Your task is to extract data from the text and to output it in a format that is suitable as a data source for adding triples to Wikidata. + +The text is about "Erhard Blankenburg" with the QID Q51595283. It consists of one or more sections separated by "-----". The sections begin with a standalone URL followed by an excerpt of the content that can be found at this URL. + +Arrange the extracted information into a table with the following columns: subject-label, subject-qid, predicate, pid, object, object-qid, start_time, end_time, reference_url. + +Insert data into the columns as per the following rules: +- subject-label/subject-qid: In general, the subject is "{fullName}" with the QID {qid}. However, refining/qualifying statements can also be made about other entities, as with the academic degree (P512) item below. Also, in the case of P112, subject and object must be reversed +- predicate/pid: + - educated at (P69): Institutions at which the person studied + - student of (P1066): If supervisors of doctoral theses and habilitations are specified + - employer (P108): is the organization that pays the salary of a person (this can be a company, and institution or the university) + - academic appointment (P8413): usually the department of a university, if this or its QID are not known, like P108 + - student (P802): persons contained in WikiData who were educated by the subject + - member of (P463): Organizations and associations to which the person belongs (excluding P108) + - affiliation (P1416): Organization that the subject is affiliated with (not member of or employed by) + - academic degree (P512): some instance of academic degree (Q189533). After making this claim, add further triples to refine the P512 statement with triples on "conferred by" (P1027) and on "point in time" (P585). + - field of work (P101): extract the main topics and themes the subject has worked and published on + - editor (P98): add information on memberships in editorial boards of academic journals + - founded by (P112): add information on journals, associations or other organizations of which the subject officially was a (co-) founder. When adding this claim, YOU MUST switch subject and object to express the reverse relationship. + - "significant person" (P3342) and "object has role" (P3831): If the subject was an important factor in the establishment of some object without being an official founder. As with P112, reverse subject and object for P3342. For P3831, use the object as subject of the claim and summarize the subject's role as the value. +- object-label/object-qid: here the English labels and, if known, the QIDs for the institutions and persons who are the objects of the triple. If you are not absolutely sure, leave blank +- start_time: the date/year from which the triple statement is true. Leave blank if the date is not specified or cannot be inferred, or the triple involves P585 +- end_time: the date/year up to which the triple statement is true. If it is an event, identical to start_time +- reference_url: this is the source URL of the text from which the information was extracted. + +Return information as a comma-separated values (CSV). Include the column headers. Surround the values with quotes. If values contain quotes, properly escape them. + +DO NOT, UNDER ANY CIRCUMSTANCES, provide any commentary or explanations, just return the raw data. Do not make anything up that is not in the source material. +----- +de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122 + +== Werdegang == +Blankenburg belegte ein Studium der Philosophie, Soziologie und Germanistik an der Universität Freiburg und FU Berlin. Es folgten Graduate Studies und eine Tätigkeit als Forschungsassistent am Department of Sociology der University of Oregon. Ein Studium der Soziologie und Wirtschaftswissenschaft an der Universität Basel beendete er mit dem Abschluss Master of Arts 1965. + +Seine [[Promotion (Doktor)|Promotion]] zum Dr. phil. erfolgte an der Universität Basel 1966. +Als Assistent am Institut für Soziologie der Universität Freiburg arbeitete er von 1966 bis 1968. +Von 1969 bis 1971 war er Organisationsberater beim Quickborner Team, Hamburg. Danach arbeitete Blankenburg in Basel als Senior Projektleiter bei der [[Prognos]] in Basel. 1973/1974 war er [[wissenschaftlicher Mitarbeiter]] am [[Max-Planck-Institut für ausländisches und internationales Strafrecht]] in Freiburg. Die [[Habilitation]] für das Fach Soziologie erwarb er 1974 an der Universität Freiburg. Blankenburg war von 1975 bis 1980 Mitglied des [[Wissenschaftszentrum Berlin für Sozialforschung|Wissenschaftszentrums Berlin]], Internationales Institut für Management und Verwaltung. + +1980 bekam er einen Ruf auf den Lehrstuhl für Rechtssoziologie der [[Vrije Universiteit Amsterdam]]. Gemeinsam mit [[Wolfgang Kaupen]] spielte er eine wichtige Rolle bei der Neubegründung der Deutschen Rechtssoziologie in den 70er-Jahren (Raiser 1998), ebenso, mit [[Volkmar Gessner]], bei der Gründung des [[International Institute for the Sociology of Law]]. Er gehörte auch zu den Initiatoren und zu den Gründungsherausgebern der [[Zeitschrift für Rechtssoziologie]]. Gemeinsam mit [[Bill Felstiner]] organisierte er 1991 in Amsterdam das erste gemeinsame Treffen der beiden bedeutenden Vereinigungen der Rechtssoziologie (LSA und RCSL). Seine Beschäftigung mit rechtssoziologischen Themen war ungewöhnlich breit, reichte von der Soziologie der Kriminalität über die des Staatsapparates bis zu der des Zivilrechts. Blankenburg war primär Empiriker und Methodiker (vgl. seine Empirische Rechtssoziologie). Seine wichtigsten Beiträge zur rechtssoziologischen Theorie betreffen die Begriffe der "Mobilisierung des Rechts" und der "[[Rechtskultur]](en)". Vor allem aber wirkte er als Koordinator, Organisator und als Vermittler zwischen Wissenschaft und Praxis: "Er bemühte sich nicht, eine 'Schule' zu gründen, ihm fiel es leicht, in stets wechselnden Teams mit wechselnden Wissenschaftlern zusammenzuarbeiten. Wie kein anderer Rechtssoziologe vermochte er, erfolgreich Tagungen zu organisieren, kompetente Referenten zu gewinnen und die Veranstaltungen mit Autorität und zugleich locker zu leiten" ([[Theo Rasehorn]] 1998, 23). \ No newline at end of file diff --git a/wikidata/update-wikidata.ipynb b/wikidata/update-wikidata.ipynb deleted file mode 100644 index 5b738b1..0000000 --- a/wikidata/update-wikidata.ipynb +++ /dev/null @@ -1,176 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Update WikiData\n" - ], - "metadata": { - "collapsed": false - }, - "id": "8ab9671b6edfa9f" - }, - { - "cell_type": "code", - "execution_count": 5, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Retrieved wikidata item.\n", - "Claim P108 P580 {\n", - " \"after\": 0,\n", - " \"before\": 0,\n", - " \"calendarmodel\": \"http://www.wikidata.org/entity/Q1985727\",\n", - " \"precision\": 9,\n", - " \"time\": \"+00000001980-01-01T00:00:00Z\",\n", - " \"timezone\": 0\n", - "} already exists.\n", - "Claim P108 P582 {\n", - " \"after\": 0,\n", - " \"before\": 0,\n", - " \"calendarmodel\": \"http://www.wikidata.org/entity/Q1985727\",\n", - " \"precision\": 9,\n", - " \"time\": \"+00000002003-01-01T00:00:00Z\",\n", - " \"timezone\": 0\n", - "} already exists.\n", - "Reference P4656 https://de.wikipedia.org/wiki/Erhard_Blankenburg already exists for P108.\n", - "Reference P854 https://www.linkedin.com/in/erhard-blankenburg-63938058/ already exists for P108.\n", - "Modifications applied in an idempotent manner.\n" - ] - } - ], - "source": [ - "import pywikibot\n", - "from pywikibot import Claim, WbTime\n", - "from datetime import datetime\n", - "\n", - "site = pywikibot.Site(\"wikidata\", \"wikidata\")\n", - "repo = site.data_repository()\n", - "\n", - "item = pywikibot.ItemPage(repo, 'Q51595283')\n", - "item.get()\n", - "\n", - "print(\"Retrieved wikidata item.\")\n", - "\n", - "# Function to check if a qualifier exists\n", - "def qualifier_exists(claim, qualifier_property, target_value):\n", - " for qualifier in claim.qualifiers.get(qualifier_property, []):\n", - " if qualifier.getTarget() == target_value:\n", - " print(f'Claim {claim.getID()} {qualifier_property} {target_value} already exists.')\n", - " return True\n", - " return False\n", - "\n", - "# Function to check if a reference exists\n", - "def reference_exists(claim, source_property, target_url):\n", - " for source in claim.sources:\n", - " for prop_id, values in source.items():\n", - " if prop_id == source_property:\n", - " for value in values:\n", - " if value.getTarget() == target_url:\n", - " print(f'Reference {source_property} {target_url} already exists for {claim.getID()}.')\n", - " return True\n", - " return False\n", - "\n", - "# Ensure employment claim is not duplicated\n", - "employment_claim_exists = False\n", - "for claim in item.claims.get('P108', []): # P108 is 'employer'\n", - " if claim.getTarget().getID() == 'Q1065414': # University of Amsterdam\n", - " employment_claim_exists = True\n", - " break\n", - "\n", - "if not employment_claim_exists:\n", - " claim = Claim(repo, 'P108')\n", - " target = pywikibot.ItemPage(repo, 'Q1065414')\n", - " claim.setTarget(target)\n", - " item.addClaim(claim)\n", - " print(f'Created new claim {claim}...')\n", - "\n", - "# Add start and end time qualifiers if they don't already exist\n", - "start_time = WbTime(year=1980)\n", - "if not qualifier_exists(claim, 'P580', start_time):\n", - " start_qualifier = Claim(repo, 'P580')\n", - " start_qualifier.setTarget(start_time)\n", - " claim.addQualifier(start_qualifier)\n", - " print(f'Added new qualifier {start_qualifier}...')\n", - "\n", - "end_time = WbTime(year=2003)\n", - "if not qualifier_exists(claim, 'P582', end_time):\n", - " end_qualifier = Claim(repo, 'P582')\n", - " end_qualifier.setTarget(end_time)\n", - " claim.addQualifier(end_qualifier)\n", - " print(f'Added new qualifier {end_qualifier}...')\n", - "\n", - "# Add references with 'retrieved at' qualifier\n", - "current_datetime = datetime.utcnow()\n", - "retrieved_at_datetime = WbTime(year=current_datetime.year, month=current_datetime.month, day=current_datetime.day)\n", - "\n", - "wikipedia_url = 'https://de.wikipedia.org/wiki/Erhard_Blankenburg'\n", - "linkedin_url = 'https://www.linkedin.com/in/erhard-blankenburg-63938058/'\n", - "\n", - "if not reference_exists(claim, 'P4656', wikipedia_url):\n", - " # Add Wikipedia reference\n", - " wikipedia_reference = Claim(repo, 'P4656')\n", - " wikipedia_reference.setTarget(wikipedia_url)\n", - " retrieved_at_claim_wiki = Claim(repo, 'P813')\n", - " retrieved_at_claim_wiki.setTarget(retrieved_at_datetime)\n", - " wikipedia_reference.addQualifier(retrieved_at_claim_wiki)\n", - " claim.addSources([wikipedia_reference])\n", - " print(f'Added new source {wikipedia_reference}...')\n", - "\n", - "if not reference_exists(claim, 'P854', linkedin_url):\n", - " # Add LinkedIn reference\n", - " linkedin_reference = Claim(repo, 'P854')\n", - " linkedin_reference.setTarget(linkedin_url)\n", - " retrieved_at_claim_linkedin = Claim(repo, 'P813')\n", - " retrieved_at_claim_linkedin.setTarget(retrieved_at_datetime)\n", - " linkedin_reference.addQualifier(retrieved_at_claim_linkedin)\n", - " claim.addSources([linkedin_reference])\n", - " print(f'Added new source {linkedin_reference}...')\n", - "\n", - "print('Modifications applied in an idempotent manner.')\n", - "\n" - ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "end_time": "2024-03-15T08:44:19.767213300Z", - "start_time": "2024-03-15T08:44:19.382460Z" - } - }, - "id": "59d15dc93174e6ad" - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "id": "d702eb98f46957ca" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab