Commit db5a5bf2 authored by cboulanger

Fix wikipedia query

parent c9c63843
%% Cell type:code id:initial_id tags:
``` python
import os.path
import textwrap

import pandas as pd
import requests


def generate_sparql_query(fullName, property_labels_to_ids, language='en'):
    """
    Query Wikidata for the properties of the given person listed in the given property map.
    All properties that are simple values without a label must have an "_id" suffix, all date
    properties must begin with "date".
    :param fullName: full name of the person, matched against the item's rdfs:label
    :param property_labels_to_ids: dict mapping output column labels to Wikidata property IDs
    :param language: language code used for the name match and the property labels
    :return: the SPARQL query string
    """
    propSelection = ""
    for label, pid in property_labels_to_ids.items():
        if label.endswith("_id") or label.startswith("image"):
            # literal values, including URIs - this needs to be solved in a more generic way
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        elif label.startswith("date"):
            # dates, fetched directly but need special handling for formatting if desired
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        else:
            # item-valued properties: resolve the item to its label in the given language
            propSelection += f"""
            OPTIONAL {{ ?item wdt:{pid} ?{label}Id .
                        ?{label}Id rdfs:label ?{label} FILTER(LANG(?{label}) = "{language}") .
                        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language}". }} }}"""
    query = textwrap.dedent(f"""
        SELECT DISTINCT ?item ?itemLabel {"".join([f"(SAMPLE(?{label}) AS ?{label})" for label in property_labels_to_ids])}
        WHERE {{
          ?item wdt:P31 wd:Q5; rdfs:label "{fullName}"@{language}.
          {textwrap.dedent(propSelection)}
        }}
        GROUP BY ?item ?itemLabel
        """)
    return query


def construct_image_url(filename):
    """Build a Wikimedia Commons file URL from a Commons file name."""
    return f"https://commons.wikimedia.org/wiki/Special:FilePath/{requests.utils.quote(filename)}"


def get_wikipedia_links(qid, languages):
    """
    Fetch Wikipedia links for a given Wikidata QID and a list of languages.

    Parameters:
    - qid (str): The QID of the Wikidata item.
    - languages (list): A list of language codes (e.g., ['en', 'de']).

    Returns:
    - dict: A dictionary with languages as keys and Wikipedia URLs as values.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "sitelinks/urls",  # include the full URLs in the sitelink data
        "format": "json"
    }
    response = requests.get(url, params=params)
    data = response.json()
    links = {}
    if "entities" in data and qid in data["entities"]:
        sitelinks = data["entities"][qid].get("sitelinks", {})
        for lang in languages:
            sitekey = f"{lang}wiki"
            if sitekey in sitelinks:
                links[lang] = sitelinks[sitekey]["url"]
            else:
                links[lang] = None  # or use '' to represent the absence of a link
    return links


def query_wikidata(fullName, property_map, language='en'):
    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    query = generate_sparql_query(fullName, property_map, language)
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    response = requests.get(SPARQL_ENDPOINT, headers=headers, params={'query': query, 'format': 'json'})
    if response.status_code != 200:
        response.raise_for_status()
    results = response.json()['results']['bindings']
    if not results:
        return None
    # initialize with fullName to ensure it appears first
    data = {
        'fullName': fullName
    }
    # use the first result only
    result = results[0]
    # iterate over the requested fields
    for label in property_map:
        if label in result:
            value = result[label]['value']
            data[label] = value
        else:
            data[label] = None
    # add the item QID, taken from the item URI
    data['item'] = os.path.basename(result['item']['value'])
    return data


def get_person_info_from_wikidata(names, property_map, language='en'):
    all_data = []
    for fullName in names:
        data = query_wikidata(fullName, property_map, language)
        if data:
            all_data.append(data)
    if all_data:
        # ensure fullName and item appear first by reordering the columns
        columns_order = ['fullName', 'item'] + list(property_map.keys())
        df = pd.DataFrame(all_data, columns=columns_order)
    else:
        df = pd.DataFrame(columns=['fullName'] + list(property_map.keys()))
    return df
```
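%% Cell type:markdown id:sparql-preview-note tags:
As a quick sanity check, the generated SPARQL can be printed for a single name before sending anything to the endpoint. The cell below is only an illustrative sketch: the reduced two-property map and the name `preview_query` are ad hoc and not part of the pipeline.
%% Cell type:code id:sparql-preview tags:
``` python
# Illustrative sketch: preview the SPARQL generated for one person with a
# reduced property map (ad hoc here, not the full map used below).
preview_map = {
    'dateOfBirth': 'P569',   # "date..." label -> fetched as a plain value
    'occupation': 'P106',    # other label -> resolved to its language label
}
preview_query = generate_sparql_query("Hans Kelsen", preview_map, language='en')
print(preview_query)
```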
%% Cell type:code id:19ddabbda261cc90 tags:
``` python
# Define the Wikidata properties to fetch and the list of scholars to look up
property_labels_to_ids = {
    'sexOrGender': 'P21',
    'image': 'P18',
    'countryOfCitizenship': 'P27',
    'givenName': 'P735',
    'familyName': 'P734',
    'dateOfBirth': 'P569',
    'dateOfDeath': 'P570',
    'occupation': 'P106',
    'fieldOfWork': 'P101',
    'employer': 'P108',
    'viaf_id': 'P214',
    'isni_id': 'P213',
    'gnd_id': 'P227'
}
scholars = [
    "Hans Kelsen",
    "Hugo Sinzheimer",
    "Karl Renner",
    "Ernst Fraenkel",
    "Franz Leopold Neumann",
    "Otto Kahn-Freund",
    "Otto Kirchheimer",
    "Herrmann Kantorowicz",
    "Ludwig Bendix",
    "Arthur Nussbaum",
    "Theodor Geiger",
    "Erhard Blankenburg",
    "Wolfgang Kaupen",
    "Rüdiger Lautmann",
    "Thilo Ramm",
    "Rudolf Wiethölter",
    "Niklas Luhmann",
    "Gunther Teubner",
    "Volkmar Gessner"
]
df = get_person_info_from_wikidata(scholars, property_labels_to_ids)
df
```
%% Output
fullName item sexOrGender \
0 Hans Kelsen Q84165 male
1 Hugo Sinzheimer Q86043 male
2 Karl Renner Q11726 male
3 Ernst Fraenkel Q86812 male
4 Franz Leopold Neumann Q112562068 male
5 Otto Kahn-Freund Q121832 male
6 Otto Kirchheimer Q214397 male
7 Ludwig Bendix Q28053205 male
8 Arthur Nussbaum Q103088 male
9 Theodor Geiger Q96410 male
10 Erhard Blankenburg Q51595283 male
11 Wolfgang Kaupen Q93221485 male
12 Rüdiger Lautmann Q91074 male
13 Thilo Ramm Q59533838 male
14 Rudolf Wiethölter Q1512482 male
15 Niklas Luhmann Q85691627 None
16 Gunther Teubner Q98304 male
17 Volkmar Gessner Q15435946 male
image \
0 http://commons.wikimedia.org/wiki/Special:File...
1 http://commons.wikimedia.org/wiki/Special:File...
2 http://commons.wikimedia.org/wiki/Special:File...
3 None
4 None
5 http://commons.wikimedia.org/wiki/Special:File...
6 None
7 None
8 http://commons.wikimedia.org/wiki/Special:File...
9 None
10 http://commons.wikimedia.org/wiki/Special:File...
11 None
12 http://commons.wikimedia.org/wiki/Special:File...
13 None
14 None
15 None
16 http://commons.wikimedia.org/wiki/Special:File...
17 http://commons.wikimedia.org/wiki/Special:File...
countryOfCitizenship givenName familyName dateOfBirth \
0 Cisleithania Hans Kelsen 1881-10-11T00:00:00Z
1 Germany Hugo Sinzheimer 1875-04-12T00:00:00Z
2 Cisleithania Karl Renner 1870-12-14T00:00:00Z
3 Germany Ernst Fraenkel 1898-12-26T00:00:00Z
4 None Leopold Neumann None
5 Germany Otto None 1900-11-17T00:00:00Z
6 Germany Otto Kirchheimer 1905-11-11T00:00:00Z
7 None Ludwig Bendix 1857-10-28T00:00:00Z
8 United States of America Arthur Nussbaum 1877-01-31T00:00:00Z
9 Germany Theodor Geiger 1891-11-09T00:00:00Z
10 Germany Erhard Blankenburg 1938-10-30T00:00:00Z
11 None Wolfgang None 1936-01-01T00:00:00Z
12 Germany Rüdiger None 1935-12-22T00:00:00Z
13 Germany Thilo Ramm 1925-04-04T00:00:00Z
14 Germany Rudolf None 1929-07-17T00:00:00Z
15 None None None None
16 Germany Gunther Teubner 1944-04-30T00:00:00Z
17 Germany Volkmar Gessner 1937-10-09T00:00:00Z
dateOfDeath occupation fieldOfWork \
0 1973-04-19T00:00:00Z judge international law
1 1945-09-16T00:00:00Z lawyer None
2 1950-12-31T00:00:00Z lawyer politics
3 1975-03-28T00:00:00Z lawyer None
4 None printer publishing
5 1979-08-16T00:00:00Z judge None
6 1965-11-22T00:00:00Z jurist None
7 1923-09-28T00:00:00Z university teacher None
8 1964-11-22T00:00:00Z lawyer law
9 1952-06-16T00:00:00Z university teacher None
10 2018-03-28T00:00:00Z sociology of law sociology of law
11 1981-01-01T00:00:00Z sociologist sociology of law
12 None author sociology of law
13 2018-06-17T00:00:00Z writer None
14 None jurist None
15 None researcher None
16 None jurist None
17 2014-11-08T00:00:00Z judge comparative law
employer viaf_id \
0 Charles University 31998356
1 Goethe University Frankfurt 27864307
2 Austrian Federal Government 61669459
3 Free University Berlin 27108403
4 None 637163874508945722514
5 University of Oxford 76317591
6 Office of Strategic Services 32042801
7 None 88720482
8 Columbia University 5180962
9 Technical University of Braunschweig 56667946
10 Free University of Amsterdam 64109592
11 None 32919813
12 University of Bremen 24732961
13 FernUniversität in Hagen 9924244
14 Goethe University Frankfurt 106974404
15 None None
16 Goethe University Frankfurt 108364502
17 University of Bremen 69100039
isni_id gnd_id
0 0000000121266076 118561219
1 0000000109619641 118614711
2 0000000121358165 118599739
3 0000000110230959 118534602
4 None None
5 0000000109168959 118559362
6 0000000081110244 118562371
7 0000000061811334 1023309920
8 0000000120988288 117071676
9 0000000109038951 118538187
10 0000000110676109 115459235
11 0000000035495614 124045405
12 000000011469331X 120502208
13 0000000108689541 116327391
14 0000000116961365 1034437860
15 None None
16 0000000109312017 119443562
17 0000000109127065 170469328
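%% Cell type:markdown id:wikipedia-links-note tags:
`get_wikipedia_links` is defined above but not used in this run. The sketch below shows one way the result table could be enriched with Wikipedia URLs via the QIDs in the `item` column; the column names `wikipedia_en` and `wikipedia_de` are assumptions for illustration.
%% Cell type:code id:wikipedia-links-sketch tags:
``` python
# Sketch (assumed column names): look up the sitelinks of each item's QID and
# add English and German Wikipedia URLs to the result table.
links = df['item'].apply(lambda qid: get_wikipedia_links(qid, ['en', 'de']))
df['wikipedia_en'] = links.apply(lambda d: d.get('en'))
df['wikipedia_de'] = links.apply(lambda d: d.get('de'))
df[['fullName', 'wikipedia_en', 'wikipedia_de']].head()
```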
%% Cell type:code id:c6c0cc347c8788d0 tags:
``` python
df.to_csv("scholars.csv", index=False)
```
%% Cell type:code id:b8058de7fa9212b tags:
``` python
import pandas as pd
from datetime import datetime

# df is the DataFrame produced above.
# Convert dateOfBirth and dateOfDeath to just the year, handling missing values
df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')
df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')

# Create 'Display Date' as "year of birth - year of death"
df['Display Date'] = df['Year'].astype(str).replace('<NA>', '') + ' - ' + df['End Year'].astype(str).replace('<NA>', '')

# Create 'Headline' as "fullName (year of birth - year of death)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Create 'Text' by combining occupation and fieldOfWork
df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)

# Use the image directly; the 'image' column already contains full Commons URLs
df['Media'] = df['image']

# Add a "Group" column with the value "actors" for all rows
df['Group'] = 'actors'

# Ensure the text columns contain no NaNs
df['Display Date'] = df['Display Date'].fillna('')
df['Headline'] = df['Headline'].fillna('')
df['Text'] = df['Text'].fillna('')
df['Media'] = df['Media'].fillna('')

# Select and order the columns according to the TimelineJS template requirements
columns = [
    "Year", "Month", "Day", "Time", "End Year", "End Month", "End Day", "End Time",
    "Display Date", "Headline", "Text", "Media", "Media Credit", "Media Caption",
    "Media Thumbnail", "Type", "Group", "Background", "Link"
]
for col in columns:
    if col not in df:
        df[col] = ''
timeline_df = df[columns]
timeline_df.to_excel("timeline_data.xlsx", index=False)
```
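%% Cell type:markdown id:timeline-check-note tags:
Optionally, the exported spreadsheet can be read back to confirm that all columns expected by the TimelineJS template are present. This is only an illustrative check; `check_df` is an ad hoc name.
%% Cell type:code id:timeline-check tags:
``` python
# Optional, illustrative check: re-read the exported spreadsheet and verify that
# the TimelineJS columns defined above are all present.
check_df = pd.read_excel("timeline_data.xlsx")
missing = [col for col in columns if col not in check_df.columns]
print("missing columns:", missing or "none")
check_df[['Headline', 'Year', 'End Year', 'Group']].head()
```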
%% Cell type:code id:f4b14ea7d4941e57 tags:
``` python
```