Commit db5a5bf2 authored by cboulanger

Fix wikipedia query

parent c9c63843
%% Cell type:code id:initial_id tags:
``` python
import os.path
import textwrap

import pandas as pd
import requests


def generate_sparql_query(fullName, property_labels_to_ids, language='en'):
    """
    Query Wikidata for the properties of the given person listed in the given property map.
    All properties that are simple values without a label must have an "_id" suffix, all date
    properties must begin with "date".
    :param fullName: full name of the person, matched against the item's rdfs:label
    :param property_labels_to_ids: dict mapping output column labels to Wikidata property IDs
    :param language: language code used for the name match and the property labels
    :return: the SPARQL query string
    """
    propSelection = ""
    for label, pid in property_labels_to_ids.items():
        if label.endswith("_id") or label.startswith("image"):
            # literal values, including URIs - this needs to be solved in a more generic way
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        elif label.startswith("date"):
            # dates, fetched directly but need special handling for formatting if desired
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        else:
            # item-valued properties: resolve the item to its label in the given language
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}Id .
                        ?{label}Id rdfs:label ?{label} FILTER(LANG(?{label}) = "{language}") .
                        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language}". }} }}"""
    query = textwrap.dedent(f"""
        SELECT DISTINCT ?item ?itemLabel {"".join([f"(SAMPLE(?{label}) AS ?{label})" for label in property_labels_to_ids])}
        WHERE {{
          ?item wdt:P31 wd:Q5; rdfs:label "{fullName}"@{language}.
          {textwrap.dedent(propSelection)}
        }}
        GROUP BY ?item ?itemLabel
        """)
    return query


def construct_image_url(filename):
    """Build a Wikimedia Commons file URL from a Commons file name."""
    return f"https://commons.wikimedia.org/wiki/Special:FilePath/{requests.utils.quote(filename)}"


def get_wikipedia_links(qid, languages):
    """
    Fetch Wikipedia links for a given Wikidata QID and a list of languages.

    Parameters:
    - qid (str): The QID of the Wikidata item.
    - languages (list): A list of language codes (e.g., ['en', 'de']).

    Returns:
    - dict: A dictionary with languages as keys and Wikipedia URLs as values.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "sitelinks/urls",  # include the full URLs in the sitelink data
        "format": "json"
    }
    response = requests.get(url, params=params)
    data = response.json()
    links = {}
    if "entities" in data and qid in data["entities"]:
        sitelinks = data["entities"][qid].get("sitelinks", {})
        for lang in languages:
            sitekey = f"{lang}wiki"
            if sitekey in sitelinks:
                links[lang] = sitelinks[sitekey]["url"]
            else:
                links[lang] = None  # or use '' to represent the absence of a link
    return links


def query_wikidata(fullName, property_map, language='en'):
    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    query = generate_sparql_query(fullName, property_map, language)
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    response = requests.get(SPARQL_ENDPOINT, headers=headers, params={'query': query, 'format': 'json'})
    if response.status_code != 200:
        response.raise_for_status()
    results = response.json()['results']['bindings']
    if not results:
        return None
    # initialize with fullName to ensure it appears first
    data = {
        'fullName': fullName
    }
    # use the first result only
    result = results[0]
    # iterate over the requested fields
    for label in property_map:
        if label in result:
            value = result[label]['value']
            data[label] = value
        else:
            data[label] = None
    # add the item QID, taken from the item URI
    data['item'] = os.path.basename(result['item']['value'])
    return data


def get_person_info_from_wikidata(names, property_map, language='en'):
    all_data = []
    for fullName in names:
        data = query_wikidata(fullName, property_map, language)
        if data:
            all_data.append(data)
    if all_data:
        # ensure fullName and item appear first by reordering the columns
        columns_order = ['fullName', 'item'] + list(property_map.keys())
        df = pd.DataFrame(all_data, columns=columns_order)
    else:
        df = pd.DataFrame(columns=['fullName'] + list(property_map.keys()))
    return df
```
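%% Cell type:markdown id:sparql-preview-note tags:
As a quick sanity check, the generated SPARQL can be printed for a single name before sending anything to the endpoint. The cell below is only an illustrative sketch: the reduced two-property map and the name `preview_query` are ad hoc and not part of the pipeline.
%% Cell type:code id:sparql-preview tags:
``` python
# Illustrative sketch: preview the SPARQL generated for one person with a
# reduced property map (ad hoc here, not the full map used below).
preview_map = {
    'dateOfBirth': 'P569',   # "date..." label -> fetched as a plain value
    'occupation': 'P106',    # other label -> resolved to its language label
}
preview_query = generate_sparql_query("Hans Kelsen", preview_map, language='en')
print(preview_query)
```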
%% Cell type:code id:19ddabbda261cc90 tags:
``` python
# Define the Wikidata properties to fetch and the list of scholars to look up
property_labels_to_ids = {
    'sexOrGender': 'P21',
    'image': 'P18',
    'countryOfCitizenship': 'P27',
    'givenName': 'P735',
    'familyName': 'P734',
    'dateOfBirth': 'P569',
    'dateOfDeath': 'P570',
    'occupation': 'P106',
    'fieldOfWork': 'P101',
    'employer': 'P108',
    'viaf_id': 'P214',
    'isni_id': 'P213',
    'gnd_id': 'P227'
}
scholars = [
    "Hans Kelsen",
    "Hugo Sinzheimer",
    "Karl Renner",
    "Ernst Fraenkel",
    "Franz Leopold Neumann",
    "Otto Kahn-Freund",
    "Otto Kirchheimer",
    "Herrmann Kantorowicz",
    "Ludwig Bendix",
    "Arthur Nussbaum",
    "Theodor Geiger",
    "Erhard Blankenburg",
    "Wolfgang Kaupen",
    "Rüdiger Lautmann",
    "Thilo Ramm",
    "Rudolf Wiethölter",
    "Niklas Luhmann",
    "Gunther Teubner",
    "Volkmar Gessner"
]
df = get_person_info_from_wikidata(scholars, property_labels_to_ids)
df
```
%% Output
fullName item sexOrGender \
0 Hans Kelsen Q84165 male
1 Hugo Sinzheimer Q86043 male
2 Karl Renner Q11726 male
3 Ernst Fraenkel Q86812 male
4 Franz Leopold Neumann Q112562068 male
5 Otto Kahn-Freund Q121832 male
6 Otto Kirchheimer Q214397 male
7 Ludwig Bendix Q28053205 male
8 Arthur Nussbaum Q103088 male
9 Theodor Geiger Q96410 male
10 Erhard Blankenburg Q51595283 male
11 Wolfgang Kaupen Q93221485 male
12 Rüdiger Lautmann Q91074 male
13 Thilo Ramm Q59533838 male
14 Rudolf Wiethölter Q1512482 male
15 Niklas Luhmann Q85691627 None
16 Gunther Teubner Q98304 male
17 Volkmar Gessner Q15435946 male
image \
0 http://commons.wikimedia.org/wiki/Special:File...
1 http://commons.wikimedia.org/wiki/Special:File...
2 http://commons.wikimedia.org/wiki/Special:File...
3 None
4 None
5 http://commons.wikimedia.org/wiki/Special:File...
6 None
7 None
8 http://commons.wikimedia.org/wiki/Special:File...
9 None
10 http://commons.wikimedia.org/wiki/Special:File...
11 None
12 http://commons.wikimedia.org/wiki/Special:File...
13 None
14 None
15 None
16 http://commons.wikimedia.org/wiki/Special:File...
17 http://commons.wikimedia.org/wiki/Special:File...
countryOfCitizenship givenName familyName dateOfBirth \
0 Cisleithania Hans Kelsen 1881-10-11T00:00:00Z
1 Germany Hugo Sinzheimer 1875-04-12T00:00:00Z
2 Cisleithania Karl Renner 1870-12-14T00:00:00Z
3 Germany Ernst Fraenkel 1898-12-26T00:00:00Z
4 None Leopold Neumann None
5 Germany Otto None 1900-11-17T00:00:00Z
6 Germany Otto Kirchheimer 1905-11-11T00:00:00Z
7 None Ludwig Bendix 1857-10-28T00:00:00Z
8 United States of America Arthur Nussbaum 1877-01-31T00:00:00Z
9 Germany Theodor Geiger 1891-11-09T00:00:00Z
10 Germany Erhard Blankenburg 1938-10-30T00:00:00Z
11 None Wolfgang None 1936-01-01T00:00:00Z
12 Germany Rüdiger None 1935-12-22T00:00:00Z
13 Germany Thilo Ramm 1925-04-04T00:00:00Z
14 Germany Rudolf None 1929-07-17T00:00:00Z
15 None None None None
16 Germany Gunther Teubner 1944-04-30T00:00:00Z
17 Germany Volkmar Gessner 1937-10-09T00:00:00Z
dateOfDeath occupation fieldOfWork \
0 1973-04-19T00:00:00Z judge international law
1 1945-09-16T00:00:00Z lawyer None
2 1950-12-31T00:00:00Z lawyer politics
3 1975-03-28T00:00:00Z lawyer None
4 None printer publishing
5 1979-08-16T00:00:00Z judge None
6 1965-11-22T00:00:00Z jurist None
7 1923-09-28T00:00:00Z university teacher None
8 1964-11-22T00:00:00Z lawyer law
9 1952-06-16T00:00:00Z university teacher None
10 2018-03-28T00:00:00Z sociology of law sociology of law
11 1981-01-01T00:00:00Z sociologist sociology of law
12 None author sociology of law
13 2018-06-17T00:00:00Z writer None
14 None jurist None
15 None researcher None
16 None jurist None
17 2014-11-08T00:00:00Z judge comparative law
employer viaf_id \
0 Charles University 31998356
1 Goethe University Frankfurt 27864307
2 Austrian Federal Government 61669459
3 Free University Berlin 27108403
4 None 637163874508945722514
5 University of Oxford 76317591
6 Office of Strategic Services 32042801
7 None 88720482
8 Columbia University 5180962
9 Technical University of Braunschweig 56667946
10 Free University of Amsterdam 64109592
11 None 32919813
12 University of Bremen 24732961
13 FernUniversität in Hagen 9924244
14 Goethe University Frankfurt 106974404
15 None None
16 Goethe University Frankfurt 108364502
17 University of Bremen 69100039
isni_id gnd_id
0 0000000121266076 118561219
1 0000000109619641 118614711
2 0000000121358165 118599739
3 0000000110230959 118534602
4 None None
5 0000000109168959 118559362
6 0000000081110244 118562371
7 0000000061811334 1023309920
8 0000000120988288 117071676
9 0000000109038951 118538187
10 0000000110676109 115459235
11 0000000035495614 124045405
12 000000011469331X 120502208
13 0000000108689541 116327391
14 0000000116961365 1034437860
15 None None
16 0000000109312017 119443562
17 0000000109127065 170469328
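%% Cell type:markdown id:wikipedia-links-note tags:
`get_wikipedia_links` is defined above but not used in this run. The sketch below shows one way the result table could be enriched with Wikipedia URLs via the QIDs in the `item` column; the column names `wikipedia_en` and `wikipedia_de` are assumptions for illustration.
%% Cell type:code id:wikipedia-links-sketch tags:
``` python
# Sketch (assumed column names): look up the sitelinks of each item's QID and
# add English and German Wikipedia URLs to the result table.
links = df['item'].apply(lambda qid: get_wikipedia_links(qid, ['en', 'de']))
df['wikipedia_en'] = links.apply(lambda d: d.get('en'))
df['wikipedia_de'] = links.apply(lambda d: d.get('de'))
df[['fullName', 'wikipedia_en', 'wikipedia_de']].head()
```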
%% Cell type:code id:c6c0cc347c8788d0 tags:
``` python
df.to_csv("scholars.csv", index=False)
```
%% Cell type:code id:b8058de7fa9212b tags:
``` python
import pandas as pd
from datetime import datetime

# df is the DataFrame produced above.
# Convert dateOfBirth and dateOfDeath to just the year, handling missing values
df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')
df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')

# Create 'Display Date' as "year of birth - year of death"
df['Display Date'] = df['Year'].astype(str).replace('<NA>', '') + ' - ' + df['End Year'].astype(str).replace('<NA>', '')

# Create 'Headline' as "fullName (year of birth - year of death)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Create 'Text' by combining occupation and fieldOfWork
df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)

# Use the image directly; the 'image' column already contains full Commons URLs
df['Media'] = df['image']

# Add a "Group" column with the value "actors" for all rows
df['Group'] = 'actors'

# Ensure the text columns contain no NaNs
df['Display Date'] = df['Display Date'].fillna('')
df['Headline'] = df['Headline'].fillna('')
df['Text'] = df['Text'].fillna('')
df['Media'] = df['Media'].fillna('')

# Select and order the columns according to the TimelineJS template requirements
columns = [
    "Year", "Month", "Day", "Time", "End Year", "End Month", "End Day", "End Time",
    "Display Date", "Headline", "Text", "Media", "Media Credit", "Media Caption",
    "Media Thumbnail", "Type", "Group", "Background", "Link"
]
for col in columns:
    if col not in df:
        df[col] = ''
timeline_df = df[columns]
timeline_df.to_excel("timeline_data.xlsx", index=False)
```
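%% Cell type:markdown id:timeline-check-note tags:
Optionally, the exported spreadsheet can be read back to confirm that all columns expected by the TimelineJS template are present. This is only an illustrative check; `check_df` is an ad hoc name.
%% Cell type:code id:timeline-check tags:
``` python
# Optional, illustrative check: re-read the exported spreadsheet and verify that
# the TimelineJS columns defined above are all present.
check_df = pd.read_excel("timeline_data.xlsx")
missing = [col for col in columns if col not in check_df.columns]
print("missing columns:", missing or "none")
check_df[['Headline', 'Year', 'End Year', 'Group']].head()
```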
%% Cell type:code id:f4b14ea7d4941e57 tags:
``` python
```