Skip to content
Snippets Groups Projects
Commit e3313c0b authored by Christian Boulanger's avatar Christian Boulanger
Browse files

extraction with GPT-4 and upload of the manually corrected data works

parent 40938663
No related branches found
No related tags found
No related merge requests found
OPENAI_API_KEY=''
HUGGINGFACEHUB_API_TOKEN=''
\ No newline at end of file
timeline_data.xlsx
\ No newline at end of file
timeline_data.xlsx
user-config.py
.env
apicache
/throttle.ctrl
data/*-chatgpt.csv
\ No newline at end of file
This diff is collapsed.
subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url
Erhard Blankenburg,Q51595283,educated at,P69,University of Freiburg,Q153987,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,educated at,P69,Free University Berlin,Q153006,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,educated at,P69,University of Oregon,Q766145,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,educated at,P69,University of Basel,Q372608,,1965,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,academic degree,P512,Master of Arts,Q2091008,1965,1965,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,academic degree,P512,Doctor of Philosophy,Q752297,1966,1966,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,employer,P108,University of Freiburg,Q153987,1966,1968,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,employer,P108,Quickborner Team,Q124866772,1969,1971,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,employer,P108,Prognos AG,Q2112115,,1973,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,employer,P108,Max Planck Institute for Foreign and International Criminal Law,Q832780,1973,1974,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,academic degree,P512,habilitation,Q308678,1974,1974,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,employed by,P463,WZB Berlin Social Science Center,Q475602,1975,1980,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,academic appointment,P8413,Free University of Amsterdam,Q1065414,1980,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,affiliated with,P1416,International Institute for the Sociology of Law,Q1459361,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,editor,P98,Zeitschrift fur Rechtssoziologie,Q96335163,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Zeitschrift fur Rechtssoziologie,Q65972149,founded by,P112,Erhard Blankenburg,Q51595283,,,https://de.wikipedia.org/wiki/Erhard_Blankenburg
Erhard Blankenburg,Q51595283,academic appointment,P8413,Free University of Amsterdam,Q1065414,,2003,https://www.linkedin.com/in/erhard-blankenburg-63938058/
# WikiData data retrieval
`pip install openpyxl plotly lxml tabulate`
\ No newline at end of file
## Dependencies
For data upload to WikiData
`pip install pywikibot requests-oauthlib`
For timeline plots
`pip install openpyxl plotly lxml tabulate`
For data extraction using OpenAI models
`pip install langchain_core langchain_openai`
\ No newline at end of file
%% Cell type:markdown id:8ab9671b6edfa9f tags:
# Update WikiData
%% Cell type:code id:59d15dc93174e6ad tags:
``` python
from datetime import datetime, timezone

import pywikibot
from pywikibot import Claim, WbTime
# Connect to the Wikidata site and obtain the data repository that the
# ItemPage and Claim objects below are created against.
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()
# Q51595283 is the item being updated (the subject QID used for
# "Erhard Blankenburg" in the accompanying CSV data).
item = pywikibot.ItemPage(repo, 'Q51595283')
# Fetch the item's content up front; the code below reads item.claims.
item.get()
print("Retrieved wikidata item.")
# Function to check if a qualifier exists
def qualifier_exists(claim, qualifier_property, target_value):
    """Tell whether ``claim`` already has a given qualifier value.

    Looks up the qualifiers stored on ``claim`` under
    ``qualifier_property`` and compares each qualifier's target with
    ``target_value`` using ``==``.  When a match is found, a notice is
    printed and ``True`` is returned; otherwise ``False`` is returned and
    nothing is printed.

    Args:
        claim: Object exposing a ``qualifiers`` mapping (property ID ->
            list of qualifier claims) and ``getID()``.
        qualifier_property: Property ID of the qualifier, e.g. ``'P580'``.
        target_value: Value the qualifier's ``getTarget()`` must equal.

    Returns:
        ``True`` if a matching qualifier is present, ``False`` otherwise.
    """
    existing_qualifiers = claim.qualifiers.get(qualifier_property, [])
    if any(entry.getTarget() == target_value for entry in existing_qualifiers):
        print(f'Claim {claim.getID()} {qualifier_property} {target_value} already exists.')
        return True
    return False
# Function to check if a reference exists
def reference_exists(claim, source_property, target_url):
    """Tell whether ``claim`` already cites ``target_url``.

    Walks every reference group in ``claim.sources`` and inspects the
    entries stored under ``source_property``; an entry matches when its
    ``getTarget()`` equals ``target_url``.  On the first match a notice is
    printed and ``True`` is returned; otherwise ``False`` is returned and
    nothing is printed.

    Args:
        claim: Object exposing ``sources`` (a list of mappings from
            property ID to a list of source claims) and ``getID()``.
        source_property: Property ID the URL is stored under, e.g.
            ``'P854'``.
        target_url: URL to look for.

    Returns:
        ``True`` if the reference is already present, ``False`` otherwise.
    """
    for reference_group in claim.sources:
        # .get() rather than indexing, so a missing key is never inserted
        # into a defaultdict-style reference group.
        candidates = reference_group.get(source_property, ())
        if any(snak.getTarget() == target_url for snak in candidates):
            print(f'Reference {source_property} {target_url} already exists for {claim.getID()}.')
            return True
    return False
# Idempotently record the employer statement (P108 -> Q1065414) on `item`,
# together with its start/end qualifiers and its two references.  Every step
# first checks whether the data is already present, so re-running the cell
# does not create duplicates.

# Ensure the employment claim is not duplicated.
employment_claim_exists = False
for claim in item.claims.get('P108', []):  # P108 is 'employer'
    # getTarget() is None for "unknown value" / "no value" statements, so
    # guard before asking the target for its ID.
    existing_target = claim.getTarget()
    if existing_target is not None and existing_target.getID() == 'Q1065414':  # Free University of Amsterdam
        employment_claim_exists = True
        break

if not employment_claim_exists:
    claim = Claim(repo, 'P108')
    target = pywikibot.ItemPage(repo, 'Q1065414')
    claim.setTarget(target)
    item.addClaim(claim)
    print(f'Created new claim {claim}...')

# From here on `claim` is either the existing matching statement (the loop
# above stopped on it) or the statement that was just created.

# Add start (P580) and end (P582) time qualifiers if they don't already exist.
start_time = WbTime(year=1980)
if not qualifier_exists(claim, 'P580', start_time):
    start_qualifier = Claim(repo, 'P580')
    start_qualifier.setTarget(start_time)
    claim.addQualifier(start_qualifier)
    print(f'Added new qualifier {start_qualifier}...')

end_time = WbTime(year=2003)
if not qualifier_exists(claim, 'P582', end_time):
    end_qualifier = Claim(repo, 'P582')
    end_qualifier.setTarget(end_time)
    claim.addQualifier(end_qualifier)
    print(f'Added new qualifier {end_qualifier}...')

# Add references.  Each reference is one group made of the URL plus a
# 'retrieved' (P813) date.  Both claims are handed to addSources() together:
# a qualifier added to a reference claim that is not yet attached to an item
# is only kept locally and would not be saved with the reference.
current_datetime = datetime.now(timezone.utc)  # timezone-aware replacement for the deprecated utcnow()
retrieved_at_datetime = WbTime(year=current_datetime.year, month=current_datetime.month, day=current_datetime.day)
wikipedia_url = 'https://de.wikipedia.org/wiki/Erhard_Blankenburg'
linkedin_url = 'https://www.linkedin.com/in/erhard-blankenburg-63938058/'

if not reference_exists(claim, 'P4656', wikipedia_url):
    # Add Wikipedia reference
    wikipedia_reference = Claim(repo, 'P4656')
    wikipedia_reference.setTarget(wikipedia_url)
    retrieved_at_claim_wiki = Claim(repo, 'P813')
    retrieved_at_claim_wiki.setTarget(retrieved_at_datetime)
    claim.addSources([wikipedia_reference, retrieved_at_claim_wiki])
    print(f'Added new source {wikipedia_reference}...')

if not reference_exists(claim, 'P854', linkedin_url):
    # Add LinkedIn reference
    linkedin_reference = Claim(repo, 'P854')
    linkedin_reference.setTarget(linkedin_url)
    retrieved_at_claim_linkedin = Claim(repo, 'P813')
    retrieved_at_claim_linkedin.setTarget(retrieved_at_datetime)
    claim.addSources([linkedin_reference, retrieved_at_claim_linkedin])
    print(f'Added new source {linkedin_reference}...')

print('Modifications applied in an idempotent manner.')
```
%% Output
Retrieved wikidata item.
Claim P108 P580 {
"after": 0,
"before": 0,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727",
"precision": 9,
"time": "+00000001980-01-01T00:00:00Z",
"timezone": 0
} already exists.
Claim P108 P582 {
"after": 0,
"before": 0,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727",
"precision": 9,
"time": "+00000002003-01-01T00:00:00Z",
"timezone": 0
} already exists.
Reference P4656 https://de.wikipedia.org/wiki/Erhard_Blankenburg already exists for P108.
Reference P854 https://www.linkedin.com/in/erhard-blankenburg-63938058/ already exists for P108.
Modifications applied in an idempotent manner.
%% Cell type:code id:d702eb98f46957ca tags:
``` python
```
# Pywikibot user configuration template (presumably the `user-config.py`
# that .gitignore excludes — confirm).  `usernames` and `authenticate` are
# not defined here; they are expected to be supplied by whatever loads this
# file.
# Default site: language code and family both set to 'wikidata'.
mylang = 'wikidata'
family = 'wikidata'
# Account name to use on that site; replace the placeholder.
usernames['wikidata']['wikidata'] = '<name of bot>'
# Credentials for *.wikidata.org; replace the four placeholder strings
# (consumer token/secret and access token/secret) and keep the real values
# out of version control.
authenticate['*.wikidata.org'] = ('Consumer token', 'Consumer secret',
'Access token', 'Access secret')
console_encoding = 'utf-8'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment