Skip to content
Snippets Groups Projects
Commit db5a5bf2 authored by cboulanger's avatar cboulanger
Browse files

Fix wikipedia query

parent c9c63843
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:initial_id tags: %% Cell type:code id:initial_id tags:
``` python ``` python
import os.path import os.path
import textwrap import textwrap
import requests import requests
def generate_sparql_query(fullName, property_labels_to_ids, language='en'):
    """
    Build a SPARQL query that looks up a person on WikiData by exact label.

    The query matches items that are instances of human (wd:Q5) whose
    rdfs:label equals ``fullName`` in ``language`` and projects one sampled
    value per requested property.

    The keys of ``property_labels_to_ids`` follow a naming convention:
    labels ending in "_id" or starting with "image" or "date" are fetched as
    raw values; every other label is treated as an entity-valued property and
    is resolved to the rdfs:label of that entity in ``language``.

    :param fullName: the person's name, matched exactly against rdfs:label
    :param property_labels_to_ids: dict mapping result variable names to
        WikiData property ids, e.g. ``{'gnd_id': 'P227'}``
    :param language: language code used for the name literal and for labels
    :return: the SPARQL query as a string
    """
    # Escape the characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL literal the name is embedded in.
    escaped_name = (
        fullName.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    clauses = []
    for label, pid in property_labels_to_ids.items():
        if label.endswith("_id") or label.startswith(("image", "date")):
            # Literal values (identifiers, image URIs, dates) are used as-is.
            clauses.append(f"OPTIONAL {{ ?item wdt:{pid} ?{label}. }}")
        else:
            # Entity values: bind the entity to ?<label>Id and expose its
            # label in the requested language as ?<label>.
            clauses.append(
                f'OPTIONAL {{ ?item wdt:{pid} ?{label}Id .\n'
                f'    ?{label}Id rdfs:label ?{label} FILTER(LANG(?{label}) = "{language}") .\n'
                f'    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language}". }} }}'
            )

    projection = " ".join(
        f"(SAMPLE(?{label}) AS ?{label})" for label in property_labels_to_ids
    )
    lines = [
        f"SELECT DISTINCT ?item ?itemLabel {projection}",
        "WHERE {",
        f'  ?item wdt:P31 wd:Q5; rdfs:label "{escaped_name}"@{language}.',
        *[f"  {clause}" for clause in clauses],
        "}",
        "GROUP BY ?item ?itemLabel",
    ]
    return "\n" + "\n".join(lines) + "\n"
def construct_image_url(filename):
    """
    Return the Wikimedia Commons ``Special:FilePath`` URL for a file name.

    :param filename: the Commons file name, e.g. ``"Hans Kelsen.jpg"``
    :return: the URL with the file name percent-encoded
    """
    # Use the standard-library function directly instead of the re-export
    # that happens to be reachable as requests.utils.quote.
    from urllib.parse import quote

    return f"https://commons.wikimedia.org/wiki/Special:FilePath/{quote(filename)}"
def get_wikipedia_links(qid, languages):
    """
    Fetch Wikipedia links for a given Wikidata QID and a list of languages.

    Parameters:
    - qid (str): The QID of the Wikidata item.
    - languages (list): A list of language codes (e.g., ['en', 'de']).

    Returns:
    - dict: A dictionary with languages as keys and Wikipedia URLs as values.
      A language without a sitelink maps to None. If the response contains
      no entity for ``qid``, the dictionary is empty.

    Raises:
    - requests.HTTPError: if the API answers with an error status.
    - requests.Timeout: if the API does not answer within the timeout.
    """
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": qid,
            "props": "sitelinks",
            "format": "json",
        },
        # Without a timeout a stalled connection would block forever.
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()

    entity = data.get("entities", {}).get(qid)
    if entity is None:
        return {}

    sitelinks = entity.get("sitelinks", {})
    # Sitelink keys are "<language code>wiki", e.g. "enwiki".
    return {
        lang: sitelinks.get(f"{lang}wiki", {}).get("url")
        for lang in languages
    }
def query_wikidata(fullName, property_map, language='en'):
    """
    Query the WikiData SPARQL endpoint for one person and return a flat record.

    :param fullName: the person's name, passed to generate_sparql_query
    :param property_map: dict mapping result field names to property ids
    :param language: language code passed to generate_sparql_query
    :return: a dict with 'fullName' first, then one entry per key of
        ``property_map`` (None when the field is unbound), then 'item'
        (the last path segment of the item URI); None if nothing matched
    :raises requests.HTTPError: if the endpoint answers with an error status
    :raises requests.Timeout: if the endpoint does not answer in time
    """
    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    query = generate_sparql_query(fullName, property_map, language)
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    response = requests.get(
        SPARQL_ENDPOINT,
        headers=headers,
        params={'query': query, 'format': 'json'},
        # Without a timeout a stalled connection would block forever.
        timeout=60,
    )
    # raise_for_status() is a no-op for successful responses, so no guard
    # on the status code is needed.
    response.raise_for_status()

    results = response.json()['results']['bindings']
    if not results:
        return None

    # NOTE(review): only the first binding is used; when several items share
    # the same label the one returned first is not necessarily the intended
    # person - verify the resulting 'item' ids.
    result = results[0]

    # Start with fullName so that it is the first key of the record.
    data = {'fullName': fullName}
    for label in property_map:
        data[label] = result[label]['value'] if label in result else None

    # The item value is an entity URI; keep only its last segment.
    data['item'] = result['item']['value'].rsplit('/', 1)[-1]
    return data
def get_person_info_from_wikidata(names, property_map, language='en'):
    """
    Look up each name on WikiData and collect the results in a DataFrame.

    :param names: iterable of full names, each passed to query_wikidata
    :param property_map: dict mapping result field names to property ids
    :param language: language code passed to query_wikidata
    :return: a pandas DataFrame with the columns 'fullName', 'item' and then
        the keys of ``property_map``, one row per name for which
        query_wikidata returned a record; names without a match are omitted
    """
    # Imported locally because this function is defined before the
    # notebook's first `import pandas as pd`.
    import pandas as pd

    lookups = (query_wikidata(name, property_map, language) for name in names)
    records = [record for record in lookups if record]

    # The same column layout is used whether or not anything was found, so
    # downstream code can rely on 'item' being present.
    columns_order = ['fullName', 'item'] + list(property_map)
    return pd.DataFrame(records, columns=columns_order)
``` ```
%% Cell type:code id:19ddabbda261cc90 tags: %% Cell type:code id:19ddabbda261cc90 tags:
``` python ``` python
# Fetch the listed properties for every scholar, using the default
# language ('en') for name matching and labels.
# Key naming matters: "*_id", "image*" and "date*" keys are fetched as raw
# values, all other keys are resolved to the label of the linked entity.
property_labels_to_ids = {
    'sexOrGender': 'P21',
    'image': 'P18',
    'countryOfCitizenship': 'P27',
    'givenName': 'P735',
    'familyName': 'P734',
    'dateOfBirth': 'P569',
    'dateOfDeath': 'P570',
    'occupation': 'P106',
    'fieldOfWork': 'P101',
    'employer': 'P108',
    'viaf_id': 'P214',
    'isni_id': 'P213',
    'gnd_id': 'P227',
}

# Names must equal the WikiData label exactly, otherwise the person is
# silently missing from the result.
scholars = [
    "Hans Kelsen",
    "Hugo Sinzheimer",
    "Karl Renner",
    "Ernst Fraenkel",
    "Franz Leopold Neumann",
    "Otto Kahn-Freund",
    "Otto Kirchheimer",
    "Hermann Kantorowicz",
    "Ludwig Bendix",
    "Arthur Nussbaum",
    "Theodor Geiger",
    "Erhard Blankenburg",
    "Wolfgang Kaupen",
    "Rüdiger Lautmann",
    "Thilo Ramm",
    "Rudolf Wiethölter",
    "Niklas Luhmann",
    "Gunther Teubner",
    "Volkmar Gessner",
]

df = get_person_info_from_wikidata(scholars, property_labels_to_ids)
df
``` ```
%% Output %% Output
fullName item sexOrGender \ fullName item sexOrGender \
0 Hans Kelsen Q84165 male 0 Hans Kelsen Q84165 male
1 Hugo Sinzheimer Q86043 male 1 Hugo Sinzheimer Q86043 male
2 Karl Renner Q11726 male 2 Karl Renner Q11726 male
3 Ernst Fraenkel Q86812 male 3 Ernst Fraenkel Q86812 male
4 Franz Leopold Neumann Q112562068 male 4 Franz Leopold Neumann Q112562068 male
5 Otto Kahn-Freund Q121832 male 5 Otto Kahn-Freund Q121832 male
6 Otto Kirchheimer Q214397 male 6 Otto Kirchheimer Q214397 male
7 Ludwig Bendix Q28053205 male 7 Ludwig Bendix Q28053205 male
8 Arthur Nussbaum Q103088 male 8 Arthur Nussbaum Q103088 male
9 Theodor Geiger Q96410 male 9 Theodor Geiger Q96410 male
10 Erhard Blankenburg Q51595283 male 10 Erhard Blankenburg Q51595283 male
11 Wolfgang Kaupen Q93221485 male 11 Wolfgang Kaupen Q93221485 male
12 Rüdiger Lautmann Q91074 male 12 Rüdiger Lautmann Q91074 male
13 Thilo Ramm Q59533838 male 13 Thilo Ramm Q59533838 male
14 Rudolf Wiethölter Q1512482 male 14 Rudolf Wiethölter Q1512482 male
15 Niklas Luhmann Q85691627 None 15 Niklas Luhmann Q85691627 None
16 Gunther Teubner Q98304 male 16 Gunther Teubner Q98304 male
17 Volkmar Gessner Q15435946 male 17 Volkmar Gessner Q15435946 male
image \ image \
0 http://commons.wikimedia.org/wiki/Special:File... 0 http://commons.wikimedia.org/wiki/Special:File...
1 http://commons.wikimedia.org/wiki/Special:File... 1 http://commons.wikimedia.org/wiki/Special:File...
2 http://commons.wikimedia.org/wiki/Special:File... 2 http://commons.wikimedia.org/wiki/Special:File...
3 None 3 None
4 None 4 None
5 http://commons.wikimedia.org/wiki/Special:File... 5 http://commons.wikimedia.org/wiki/Special:File...
6 None 6 None
7 None 7 None
8 http://commons.wikimedia.org/wiki/Special:File... 8 http://commons.wikimedia.org/wiki/Special:File...
9 None 9 None
10 http://commons.wikimedia.org/wiki/Special:File... 10 http://commons.wikimedia.org/wiki/Special:File...
11 None 11 None
12 http://commons.wikimedia.org/wiki/Special:File... 12 http://commons.wikimedia.org/wiki/Special:File...
13 None 13 None
14 None 14 None
15 None 15 None
16 http://commons.wikimedia.org/wiki/Special:File... 16 http://commons.wikimedia.org/wiki/Special:File...
17 http://commons.wikimedia.org/wiki/Special:File... 17 http://commons.wikimedia.org/wiki/Special:File...
countryOfCitizenship givenName familyName dateOfBirth \ countryOfCitizenship givenName familyName dateOfBirth \
0 Cisleithania Hans Kelsen 1881-10-11T00:00:00Z 0 Cisleithania Hans Kelsen 1881-10-11T00:00:00Z
1 Germany Hugo Sinzheimer 1875-04-12T00:00:00Z 1 Germany Hugo Sinzheimer 1875-04-12T00:00:00Z
2 Cisleithania Karl Renner 1870-12-14T00:00:00Z 2 Cisleithania Karl Renner 1870-12-14T00:00:00Z
3 Germany Ernst Fraenkel 1898-12-26T00:00:00Z 3 Germany Ernst Fraenkel 1898-12-26T00:00:00Z
4 None Leopold Neumann None 4 None Leopold Neumann None
5 Germany Otto None 1900-11-17T00:00:00Z 5 Germany Otto None 1900-11-17T00:00:00Z
6 Germany Otto Kirchheimer 1905-11-11T00:00:00Z 6 Germany Otto Kirchheimer 1905-11-11T00:00:00Z
7 None Ludwig Bendix 1857-10-28T00:00:00Z 7 None Ludwig Bendix 1857-10-28T00:00:00Z
8 United States of America Arthur Nussbaum 1877-01-31T00:00:00Z 8 United States of America Arthur Nussbaum 1877-01-31T00:00:00Z
9 Germany Theodor Geiger 1891-11-09T00:00:00Z 9 Germany Theodor Geiger 1891-11-09T00:00:00Z
10 Germany Erhard Blankenburg 1938-10-30T00:00:00Z 10 Germany Erhard Blankenburg 1938-10-30T00:00:00Z
11 None Wolfgang None 1936-01-01T00:00:00Z 11 None Wolfgang None 1936-01-01T00:00:00Z
12 Germany Rüdiger None 1935-12-22T00:00:00Z 12 Germany Rüdiger None 1935-12-22T00:00:00Z
13 Germany Thilo Ramm 1925-04-04T00:00:00Z 13 Germany Thilo Ramm 1925-04-04T00:00:00Z
14 Germany Rudolf None 1929-07-17T00:00:00Z 14 Germany Rudolf None 1929-07-17T00:00:00Z
15 None None None None 15 None None None None
16 Germany Gunther Teubner 1944-04-30T00:00:00Z 16 Germany Gunther Teubner 1944-04-30T00:00:00Z
17 Germany Volkmar Gessner 1937-10-09T00:00:00Z 17 Germany Volkmar Gessner 1937-10-09T00:00:00Z
dateOfDeath occupation fieldOfWork \ dateOfDeath occupation fieldOfWork \
0 1973-04-19T00:00:00Z judge international law 0 1973-04-19T00:00:00Z judge international law
1 1945-09-16T00:00:00Z lawyer None 1 1945-09-16T00:00:00Z lawyer None
2 1950-12-31T00:00:00Z lawyer politics 2 1950-12-31T00:00:00Z lawyer politics
3 1975-03-28T00:00:00Z lawyer None 3 1975-03-28T00:00:00Z lawyer None
4 None printer publishing 4 None printer publishing
5 1979-08-16T00:00:00Z judge None 5 1979-08-16T00:00:00Z judge None
6 1965-11-22T00:00:00Z jurist None 6 1965-11-22T00:00:00Z jurist None
7 1923-09-28T00:00:00Z university teacher None 7 1923-09-28T00:00:00Z university teacher None
8 1964-11-22T00:00:00Z lawyer law 8 1964-11-22T00:00:00Z lawyer law
9 1952-06-16T00:00:00Z university teacher None 9 1952-06-16T00:00:00Z university teacher None
10 2018-03-28T00:00:00Z sociology of law sociology of law 10 2018-03-28T00:00:00Z sociology of law sociology of law
11 1981-01-01T00:00:00Z sociologist sociology of law 11 1981-01-01T00:00:00Z sociologist sociology of law
12 None author sociology of law 12 None author sociology of law
13 2018-06-17T00:00:00Z writer None 13 2018-06-17T00:00:00Z writer None
14 None jurist None 14 None jurist None
15 None researcher None 15 None researcher None
16 None jurist None 16 None jurist None
17 2014-11-08T00:00:00Z judge comparative law 17 2014-11-08T00:00:00Z judge comparative law
employer viaf_id \ employer viaf_id \
0 Charles University 31998356 0 Charles University 31998356
1 Goethe University Frankfurt 27864307 1 Goethe University Frankfurt 27864307
2 Austrian Federal Government 61669459 2 Austrian Federal Government 61669459
3 Free University Berlin 27108403 3 Free University Berlin 27108403
4 None 637163874508945722514 4 None 637163874508945722514
5 University of Oxford 76317591 5 University of Oxford 76317591
6 Office of Strategic Services 32042801 6 Office of Strategic Services 32042801
7 None 88720482 7 None 88720482
8 Columbia University 5180962 8 Columbia University 5180962
9 Technical University of Braunschweig 56667946 9 Technical University of Braunschweig 56667946
10 Free University of Amsterdam 64109592 10 Free University of Amsterdam 64109592
11 None 32919813 11 None 32919813
12 University of Bremen 24732961 12 University of Bremen 24732961
13 FernUniversität in Hagen 9924244 13 FernUniversität in Hagen 9924244
14 Goethe University Frankfurt 106974404 14 Goethe University Frankfurt 106974404
15 None None 15 None None
16 Goethe University Frankfurt 108364502 16 Goethe University Frankfurt 108364502
17 University of Bremen 69100039 17 University of Bremen 69100039
isni_id gnd_id isni_id gnd_id
0 0000000121266076 118561219 0 0000000121266076 118561219
1 0000000109619641 118614711 1 0000000109619641 118614711
2 0000000121358165 118599739 2 0000000121358165 118599739
3 0000000110230959 118534602 3 0000000110230959 118534602
4 None None 4 None None
5 0000000109168959 118559362 5 0000000109168959 118559362
6 0000000081110244 118562371 6 0000000081110244 118562371
7 0000000061811334 1023309920 7 0000000061811334 1023309920
8 0000000120988288 117071676 8 0000000120988288 117071676
9 0000000109038951 118538187 9 0000000109038951 118538187
10 0000000110676109 115459235 10 0000000110676109 115459235
11 0000000035495614 124045405 11 0000000035495614 124045405
12 000000011469331X 120502208 12 000000011469331X 120502208
13 0000000108689541 116327391 13 0000000108689541 116327391
14 0000000116961365 1034437860 14 0000000116961365 1034437860
15 None None 15 None None
16 0000000109312017 119443562 16 0000000109312017 119443562
17 0000000109127065 170469328 17 0000000109127065 170469328
%% Cell type:code id:c6c0cc347c8788d0 tags: %% Cell type:code id:c6c0cc347c8788d0 tags:
``` python ``` python
# Persist the raw query results (without the DataFrame index) for reuse.
df.to_csv(path_or_buf="scholars.csv", index=False)
``` ```
%% Cell type:code id:b8058de7fa9212b tags: %% Cell type:code id:b8058de7fa9212b tags:
``` python ``` python
import pandas as pd
from datetime import datetime

# Derive the TimelineJS spreadsheet columns from the existing DataFrame `df`.

# Reduce dateOfBirth and dateOfDeath to the year; unparseable or missing
# dates become <NA> in the nullable Int64 columns.
df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')
df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')

# Create 'Display Date' as "birth year - death year"; a missing year is
# rendered as an empty string.
df['Display Date'] = df['Year'].astype(str).replace('<NA>','') + ' - ' + df['End Year'].astype(str).replace('<NA>','')

# Create 'Headline' as "fullName (birth year - death year)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Create 'Text' column by combining occupation and fieldOfWork, skipping
# missing values
df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)

# Use the image directly; assuming the URLs are already correctly formed in the 'image' column
df['Media'] = df['image']

# Add a "Group" column with the value "actors" for all rows
df['Group'] = 'actors'

# Ensure there are no NaNs left in the text columns
df['Display Date'] = df['Display Date'].fillna('')
df['Headline'] = df['Headline'].fillna('')
df['Text'] = df['Text'].fillna('')
df['Media'] = df['Media'].fillna('')

# Select and order the columns according to the TimelineJS template.
# The names are listed explicitly rather than split from one delimited
# string, so the result does not depend on the delimiter surviving
# copy/paste.
columns = [
    "Year", "Month", "Day", "Time",
    "End Year", "End Month", "End Day", "End Time",
    "Display Date", "Headline", "Text",
    "Media", "Media Credit", "Media Caption", "Media Thumbnail",
    "Type", "Group", "Background", "Link",
]
# Template columns that were not derived above are added as empty strings.
for col in columns:
    if col not in df:
        df[col] = ''
timeline_df = df[columns]
timeline_df.to_excel("timeline_data.xlsx", index=False)
``` ```
%% Cell type:code id:f4b14ea7d4941e57 tags: %% Cell type:code id:f4b14ea7d4941e57 tags:
``` python ``` python
``` ```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment