Commit 53a4915d authored by Christian Boulanger's avatar Christian Boulanger

Updates

parent 617409b7
 OPENAI_API_KEY=''
-HUGGINGFACEHUB_API_TOKEN=''
\ No newline at end of file
+HUGGINGFACEHUB_API_TOKEN=''
+USER_AGENT=''
\ No newline at end of file
@@ -3,4 +3,5 @@ user-config.py
 .env
 apicache
 /throttle.ctrl
-data/*-chatgpt.csv
\ No newline at end of file
+data/*-chatgpt.csv
+data/*-wikipedia.txt
\ No newline at end of file
subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url
Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,educated at,P69,University of Freiburg,,1953,1953,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,,1961,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
University of Giessen,,founded by,P112,Thilo Ramm,Q59533838,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,academic appointment,P8413,Fernuniversität in Hagen,,1977,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,field of work,P101,Civil law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,field of work,P101,Labor law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,field of work,P101,Social law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
Thilo Ramm,Q59533838,field of work,P101,Social philosophy,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
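Rows like the ones above can be sanity-checked before they are handed to the Wikidata update step. A minimal sketch using only the stdlib `csv` module (the helper name and the check itself are hypothetical, the column names are taken from the header row above):

``` python
import csv
import io

EXPECTED_COLUMNS = [
    "subject-label", "subject-qid", "predicate", "pid", "object",
    "object-qid", "start_time", "end_time", "reference_url",
]

def validate_triples(csv_text: str) -> list[dict]:
    """Parse triple rows and verify the header and per-row field count."""
    reader = csv.DictReader(io.StringIO(csv_text))
    if reader.fieldnames != EXPECTED_COLUMNS:
        raise ValueError(f"unexpected header: {reader.fieldnames}")
    rows = list(reader)
    for row in rows:
        # DictReader signals too many fields with a None key and
        # too few fields with None values
        if None in row or None in row.values():
            raise ValueError(f"wrong field count in row: {row}")
    return rows

sample = (
    "subject-label,subject-qid,predicate,pid,object,object-qid,"
    "start_time,end_time,reference_url\n"
    "Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,,1949,"
    "de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391\n"
)
rows = validate_triples(sample)
```

Empty cells (such as a missing `object-qid`) come back as empty strings, which matches the "leave blank if unsure" rule in the extraction prompt.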
%% Cell type:markdown id:9d6a10996bfdd3cf tags:
# Download wikipedia pages as source of triple extraction
This improves on the [data-extraction notebook](./data-extraction.ipynb) by downloading the Wikipedia article from which the information is to be extracted.
%% Cell type:markdown id:2ad235b62e2efc09 tags:
## 1. Download raw Wikipedia page content for the list of scholars and save it
%% Cell type:code id:initial_id tags:
``` python
from lib.wikidata import get_wikipedia_page_content
import mwclient

# get_wikipedia_page_content("Erhard Blankenburg")
user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'
pageTitle = 'Erhard Blankenburg'
site = mwclient.Site('de.wikipedia.org', clients_useragent=user_agent)
page = site.pages[pageTitle]
rev = page.revision
url = f'{site.host}/wiki/{pageTitle.replace(" ","_")}?oldid={rev}'
url

import os.path
from urllib.parse import unquote

import pandas as pd

from lib.wikidata import get_wikipedia_page_data

df = pd.read_csv('scholars.csv')
for index, row in df.iterrows():
    fullName = row['fullName']
    language_code = None
    if pd.notna(row['wikipedia_de']):
        pageTitle = unquote(os.path.basename(row['wikipedia_de']))
        language_code = 'de'
    elif pd.notna(row['wikipedia_en']):
        pageTitle = unquote(os.path.basename(row['wikipedia_en']))
        language_code = 'en'
    else:
        print(f'No Wikipedia page exists for {fullName}.')
        continue
    wikipedia_content_cache_path = f'data/{fullName}-wikipedia.txt'
    if not os.path.isfile(wikipedia_content_cache_path):
        page_data = get_wikipedia_page_data(pageTitle, language_code)
        if page_data and page_data['page'].exists:
            file_content = f"{page_data['url']}\n\n{page_data['content']}"
            with open(wikipedia_content_cache_path, 'w', encoding='utf-8') as file:
                file.write(file_content)
        else:
            print(f'No page content could be retrieved for "{fullName}"')
```
%% Output
'de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122'
No Wikipedia page exists for Wolfgang Kaupen.
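The `oldid` URL built above pins the reference to the exact revision that was read, which keeps the extracted claims reproducible. As a standalone helper (the function name is hypothetical), the construction is:

``` python
def revision_url(host: str, page_title: str, revision: int) -> str:
    """Build a permalink to a specific revision of a wiki page."""
    return f"{host}/wiki/{page_title.replace(' ', '_')}?oldid={revision}"

revision_url('de.wikipedia.org', 'Erhard Blankenburg', 228627122)
# → 'de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122'
```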
%% Cell type:markdown id:e83e59e1974a6506 tags:
%% Cell type:markdown id:997c82c5d3d72b7 tags:
## 2. Reduce text size
To remove unnecessary information and reduce the token count, edit the downloaded files so that they contain only the biographical parts from which the information is to be extracted.
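Whether a trimmed file is small enough can be estimated before calling the model. A rough heuristic, about four characters per token for English text (an assumption, not a real tokenizer), can be sketched as:

``` python
def estimate_tokens(text: str) -> int:
    """Very rough GPT token estimate: ~4 characters per token."""
    return max(1, len(text) // 4)

# e.g. a 12,000-character biography is roughly 3,000 tokens
```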
%% Cell type:markdown id:303ddc348c4a2887 tags:
## 3. Extract the information
%% Cell type:code id:d904e502f8eff15d tags:
``` python
from lib.langchain import extract_to_csv
from langchain_openai import ChatOpenAI
from pathlib import Path
fullName = "Thilo Ramm"
qid="Q59533838"
model = ChatOpenAI(model_name="gpt-4")
template = Path('extraction-prompt.txt').read_text()
website_text = Path(f'data/{fullName}-wikipedia.txt').read_text()
csv_path = f'data/{fullName}.csv'
df = extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid, website_text=website_text)
```
%% Output
C:\Users\boulanger\AppData\Local\miniconda3\Lib\site-packages\langchain_openai\chat_models\base.py:454: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
response = response.dict()
C:\Users\boulanger\AppData\Local\miniconda3\Lib\site-packages\pydantic\main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
warnings.warn('The `dict` method is deprecated; use `model_dump` instead.', DeprecationWarning)
%% Cell type:code id:7e04f078c326387a tags:
``` python
from lib.wikidata import update_wikidata_from_csv
```
Your task is to extract data from the text and to output it in a format that is suitable as a data source for adding triples to Wikidata.
The text is about "{fullName}" with the QID {qid}. It consists of one or more sections separated by "-----". The sections begin with a standalone URL followed by an excerpt of the content that can be found at this URL.
Arrange the extracted information into a table with the following columns: subject-label, subject-qid, predicate, pid, object, object-qid, start_time, end_time, reference_url.
Insert data into the columns as per the following rules:
- subject-label/subject-qid: In general, the subject is "{fullName}" with the QID {qid}. However, refining/qualifying statements can also be made about other entities, as with the academic degree (P512) item below. Also, in the case of founded by (P112), subject and object must be reversed.
- predicate/pid:
- educated at (P69): Institutions at which the person studied
- student of (P1066): If supervisors of doctoral theses and habilitations are specified
- employer (P108): the organization that pays the salary of a person (this can be a company, an institution, or the university)
- academic appointment (P8413): usually the department of a university; if this or its QID is not known, treat like P108
- student (P802): persons contained in WikiData who were educated by the subject
- member of (P463): Organizations and associations to which the person belongs (excluding P108)
- affiliation (P1416): Organization that the subject is affiliated with (not member of or employed by)
- academic degree (P512): some instance of academic degree (Q189533). After making this claim, add further triples to refine the P512 statement with triples on "conferred by" (P1027) and on "point in time" (P585).
- field of work (P101): extract the main topics and themes the subject has worked and published on
- editor (P98): add information on memberships in editorial boards of academic journals
- founded by (P112): add information on journals, associations or other organizations that the subject helped to establish. When adding this claim, YOU MUST switch subject and object to express the reverse relationship
- object/object-qid: the English labels and, if known, the QIDs of the institutions and persons that are the objects of the triple. If you are not absolutely sure, leave blank
- start_time: the date/year from which the triple statement is true. Leave blank if the date is not specified or cannot be inferred, or if the triple involves P585
- end_time: the date/year up to which the triple statement is true. If it is an event, identical to start_time
- reference_url: this is the source URL of the text from which the information was extracted.
Return information as a comma-separated values (CSV). Include the column headers. Surround the values with quotes. If values contain quotes, properly escape them.
DO NOT, UNDER ANY CIRCUMSTANCES, provide any commentary or explanations, just return the raw data. Do not make anything up that is not in the source material.
-----
{website_text}
\ No newline at end of file
import io

import pandas as pd
from dotenv import load_dotenv
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

load_dotenv()

def extract_to_csv(model, template, csv_path, **params):
    """Run the extraction prompt through the model and save its CSV reply."""
    prompt = ChatPromptTemplate.from_template(template)
    parser = StrOutputParser()
    chain = prompt | model | parser
    response = chain.invoke(params)
    data = io.StringIO(response)
    # keep the time columns as strings so years are not coerced to floats
    df = pd.read_csv(data, dtype={'start_time': str, 'end_time': str})
    df.to_csv(csv_path, index=False)
    return df
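The model's reply is parsed directly from the response string via `io.StringIO`. The same round-trip with only the stdlib, on hypothetical sample data, looks like:

``` python
import csv
import io

response = (
    '"subject-label","subject-qid","predicate"\n'
    '"Thilo Ramm","Q59533838","educated at"\n'
)
rows = list(csv.reader(io.StringIO(response)))
header, data = rows[0], rows[1:]  # first row is the column headers
```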
# based on code written by GPT-4
import csv
import os
from pywikibot import Claim, WbTime, ItemPage, PropertyPage, Site
from datetime import datetime
import mwclient
from bs4 import BeautifulSoup
from dotenv import load_dotenv
load_dotenv()
def claim_to_string(claim):
@@ -172,10 +175,16 @@ def update_wikidata_from_csv(file_path):
             previous_claim = claim
 
-def get_wikipedia_page_content(pageTitle:str, language="en"):
-    user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'
+def get_wikipedia_page_data(pageTitle: str, language="en"):
+    user_agent = os.getenv('USER_AGENT')
     site = mwclient.Site(f'{language}.wikipedia.org', clients_useragent=user_agent)
-    api = site.api("parse", page=pageTitle.replace(" ", "_"))
-    html = api["parse"]["text"]["*"]
-    soup = BeautifulSoup(html, "html.parser")
-    return soup.get_text()
+    page = site.pages[pageTitle]
+    if not page.exists:
+        return None
+    return {
+        'page': page,
+        'revision': page.revision,
+        'url': f'{site.host}/wiki/{pageTitle.replace(" ", "_")}?oldid={page.revision}',
+        'content': page.text()
+    }
\ No newline at end of file
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838
 Rudolf Wiethölter,Q1512482
 Niklas Luhmann,Q57238
 Hubert Rottleuthner,Q55622018
-Ralf Rogowski,Q112499743
 Ralf Rogowski,Q20128038
 Gunther Teubner,Q98304
 Volkmar Gessner,Q15435946
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838,male,Ramm,Thilo,1925-04-04 00:00:00+00:00,2018-06-17 00:00:
 Rudolf Wiethölter,Q1512482,male,,Rudolf,1929-07-17 00:00:00+00:00,,1034437860,http://www.wikidata.org/entity/Q1512482,,https://de.wikipedia.org/wiki/Rudolf_Wieth%C3%B6lter
 Niklas Luhmann,Q57238,male,Luhmann,Niklas,1927-12-08 00:00:00+00:00,1998-11-06 00:00:00+00:00,118575147,http://www.wikidata.org/entity/Q57238,https://en.wikipedia.org/wiki/Niklas_Luhmann,https://de.wikipedia.org/wiki/Niklas_Luhmann
 Hubert Rottleuthner,Q55622018,male,,Hubert,1944-01-01 00:00:00+00:00,,135622751,http://www.wikidata.org/entity/Q55622018,,https://de.wikipedia.org/wiki/Hubert_Rottleuthner
-Ralf Rogowski,Q112499743,male,Rogowski,Ralf,1953-01-01 00:00:00+00:00,,17150982X,http://www.wikidata.org/entity/Q112499743,,
 Ralf Rogowski,Q20128038,male,Rogowski,Ralf,,,,http://www.wikidata.org/entity/Q20128038,https://en.wikipedia.org/wiki/Ralf_Rogowski,
 Gunther Teubner,Q98304,male,Teubner,Gunther,1944-04-30 00:00:00+00:00,,119443562,http://www.wikidata.org/entity/Q98304,https://en.wikipedia.org/wiki/Gunther_Teubner,https://de.wikipedia.org/wiki/Gunther_Teubner
 Volkmar Gessner,Q15435946,male,Gessner,Volkmar,1937-10-09 00:00:00+00:00,2014-11-08 00:00:00+00:00,170469328,http://www.wikidata.org/entity/Q15435946,https://en.wikipedia.org/wiki/Volkmar_Gessner,https://de.wikipedia.org/wiki/Volkmar_Gessner