From 53a4915dac9fd6171112998e3112b072498f109b Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Mon, 18 Mar 2024 18:11:47 +0100
Subject: [PATCH] Updates

---
 wikidata/.env.dist | 3 +-
 wikidata/.gitignore | 3 +-
 wikidata/data/Thilo Ramm.csv | 12 +++
 wikidata/download-wikipedia-pages.ipynb | 132 ++++++++++++++++++++----
 wikidata/extraction-prompt.txt | 30 ++++++
 wikidata/lib/langchain.py | 17 +++
 wikidata/lib/wikidata.py | 25 +++--
 wikidata/scholars-qid.csv | 1 -
 wikidata/scholars.csv | 1 -
 9 files changed, 191 insertions(+), 33 deletions(-)
 create mode 100644 wikidata/data/Thilo Ramm.csv
 create mode 100644 wikidata/extraction-prompt.txt
 create mode 100644 wikidata/lib/langchain.py

diff --git a/wikidata/.env.dist b/wikidata/.env.dist
index f841092..d26cd52 100644
--- a/wikidata/.env.dist
+++ b/wikidata/.env.dist
@@ -1,2 +1,3 @@
 OPENAI_API_KEY=''
-HUGGINGFACEHUB_API_TOKEN=''
\ No newline at end of file
+HUGGINGFACEHUB_API_TOKEN=''
+USER_AGENT=''
\ No newline at end of file
diff --git a/wikidata/.gitignore b/wikidata/.gitignore
index 757a946..afab930 100644
--- a/wikidata/.gitignore
+++ b/wikidata/.gitignore
@@ -3,4 +3,5 @@ user-config.py
 .env
 apicache
 /throttle.ctrl
-data/*-chatgpt.csv
\ No newline at end of file
+data/*-chatgpt.csv
+data/*-wikipedia.txt
\ No newline at end of file
diff --git a/wikidata/data/Thilo Ramm.csv b/wikidata/data/Thilo Ramm.csv
new file mode 100644
index 0000000..a33217a
--- /dev/null
+++ b/wikidata/data/Thilo Ramm.csv
@@ -0,0 +1,12 @@
+subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url
+Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,educated at,P69,University of Freiburg,,1953,1953,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,,1961,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+University of Giessen,,founded by,P112,Thilo Ramm,Q59533838,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,Fernuniversität in Hagen,,1977,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Civil law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Labor law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Social law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Social philosophy,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
diff --git a/wikidata/download-wikipedia-pages.ipynb b/wikidata/download-wikipedia-pages.ipynb
index 4a48c15..038989a 100644
--- a/wikidata/download-wikipedia-pages.ipynb
+++ b/wikidata/download-wikipedia-pages.ipynb
@@ -3,45 +3,75 @@
 {
 "cell_type": "markdown",
 "source": [
- "# Download wikipedia pages as source of triple extraction"
+ "# Download Wikipedia pages as a source for triple extraction\n",
+ "\n",
+ "This improves on the [data-extraction notebook](./data-extraction.ipynb) by downloading the Wikipedia article from which the information is to be extracted."
 ],
 "metadata": {
 "collapsed": false
 },
 "id": "9d6a10996bfdd3cf"
 },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Download raw Wikipedia page content for the list of scholars and save it"
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "2ad235b62e2efc09"
+ },
 {
 "cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
 "id": "initial_id",
 "metadata": {
 "collapsed": true,
 "ExecuteTime": {
- "end_time": "2024-03-17T19:36:34.696997900Z",
- "start_time": "2024-03-17T19:36:33.700459Z"
+ "end_time": "2024-03-18T15:26:43.976993400Z",
+ "start_time": "2024-03-18T15:26:43.935308500Z"
 }
 },
 "outputs": [
 {
- "data": {
- "text/plain": "'de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122'"
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No Wikipedia page exists for Wolfgang Kaupen.\n"
+ ]
 }
 ],
 "source": [
- "from lib.wikidata import get_wikipedia_page_content\n",
- "import mwclient\n",
- "#get_wikipedia_page_content(\"Erhard Blankenburg\")\n",
- "user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'\n",
- "pageTitle = 'Erhard Blankenburg'\n",
- "site = mwclient.Site(f'de.wikipedia.org', clients_useragent=user_agent)\n",
- "page = site.pages[pageTitle]\n",
- "rev = page.revision\n",
- "url = f'{site.host}/wiki/{pageTitle.replace(\" \",\"_\")}?oldid={rev}' \n",
- "url\n"
+ "import os.path\n",
+ "\n",
+ "from lib.wikidata import get_wikipedia_page_data\n",
+ "from urllib.parse import unquote\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv('scholars.csv')\n",
+ "for index, row in df.iterrows():\n",
+ "    fullName = row['fullName']\n",
+ "    language_code = None\n",
+ "    if pd.notna(row['wikipedia_de']):\n",
+ "        pageTitle = unquote(os.path.basename(row['wikipedia_de']))\n",
+ "        language_code = 'de'\n",
+ "    elif pd.notna(row['wikipedia_en']):\n",
+ "        pageTitle = unquote(os.path.basename(row['wikipedia_en']))\n",
+ "        language_code = 'en'\n",
+ "    else:\n",
+ "        print(f'No Wikipedia page exists for {fullName}.')\n",
+ "        continue\n",
+ "\n",
+ "    wikipedia_content_cache_path = f'data/{fullName}-wikipedia.txt'\n",
+ "    if not os.path.isfile(wikipedia_content_cache_path):\n",
+ "        page_data = get_wikipedia_page_data(pageTitle, language_code)\n",
+ "        if page_data and page_data['page'].exists:\n",
+ "            file_content = f\"{page_data['url']}\\n\\n{page_data['content']}\"\n",
+ "            with open(wikipedia_content_cache_path, 'w', encoding='utf-8') as file:\n",
+ "                file.write(file_content)\n",
+ "        else:\n",
+ "            print(f'No page content could be retrieved for \"{fullName}\"')"
 ]
 },
 {
@@ -52,11 +82,71 @@
 },
 "id": "e83e59e1974a6506"
 },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Reduce text size\n",
+ "\n",
+ "To remove unnecessary information and reduce the token count, edit the downloaded files so that they contain only the biographical parts from which the information is to be extracted."
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "997c82c5d3d72b7"
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Extract the information"
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "303ddc348c4a2887"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\langchain_openai\\chat_models\\base.py:454: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n",
+ "  response = response.dict()\n",
+ "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pydantic\\main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n",
+ "  warnings.warn('The `dict` method is deprecated; use `model_dump` instead.', DeprecationWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from lib.langchain import extract_to_csv\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "from pathlib import Path\n",
+ "fullName = \"Thilo Ramm\"\n",
+ "qid = \"Q59533838\"\n",
+ "model = ChatOpenAI(model_name=\"gpt-4\")\n",
+ "template = Path('extraction-prompt.txt').read_text()\n",
+ "website_text = Path(f'data/{fullName}-wikipedia.txt').read_text()\n",
+ "csv_path = f'data/{fullName}.csv'\n",
+ "df = extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid, website_text=website_text)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-03-18T17:03:40.614971500Z",
+ "start_time": "2024-03-18T17:03:11.400373800Z"
+ }
+ },
+ "id": "d904e502f8eff15d"
+ },
 {
 "cell_type": "code",
 "execution_count": null,
 "outputs": [],
- "source": [],
+ "source": [
+ "from lib.wikidata import update_wikidata_from_csv"
+ ],
 "metadata": {
 "collapsed": false
 },
diff --git a/wikidata/extraction-prompt.txt b/wikidata/extraction-prompt.txt
new file mode 100644
index 0000000..732babd
--- /dev/null
+++ b/wikidata/extraction-prompt.txt
@@ -0,0 +1,30 @@
+Your task is to extract data from the text and to output it in a format that is suitable as a data source for adding triples to Wikidata.
+
+The text is about "{fullName}" with the QID {qid}. It consists of one or more sections separated by "-----". The sections begin with a standalone URL followed by an excerpt of the content that can be found at this URL.
+
+Arrange the extracted information into a table with the following columns: subject-label, subject-qid, predicate, pid, object, object-qid, start_time, end_time, reference_url.
+
+Insert data into the columns as per the following rules:
+- subject-label/subject-qid: In general, the subject is "{fullName}" with the QID {qid}. However, refining/qualifying statements can also be made about other entities, as with the academic degree (P512) item below. Also, in the case of P112, subject and object must be reversed.
+- predicate/pid:
+ - educated at (P69): Institutions at which the person studied
+ - student of (P1066): If supervisors of doctoral theses and habilitations are specified
+ - employer (P108): the organization that pays the salary of a person (this can be a company, an institution, or a university)
+ - academic appointment (P8413): usually the department of a university; if the department or its QID is not known, proceed as with P108
+ - student (P802): persons contained in Wikidata who were educated by the subject
+ - member of (P463): Organizations and associations to which the person belongs (excluding P108)
+ - affiliation (P1416): Organization that the subject is affiliated with (not member of or employed by)
+ - academic degree (P512): some instance of academic degree (Q189533). After making this claim, add further triples that refine the P512 statement with "conferred by" (P1027) and "point in time" (P585).
+ - field of work (P101): extract the main topics and themes the subject has worked and published on
+ - editor (P98): add information on memberships in editorial boards of academic journals
+ - founded by (P112): add information on journals, associations, or other organizations that the subject helped to establish. When adding this claim, YOU MUST switch subject and object to express the reverse relationship.
+- object-label/object-qid: the English labels and, if known, the QIDs of the institutions and persons that are the objects of the triple. If you are not absolutely sure, leave them blank.
+- start_time: the date/year from which the triple statement is true. Leave blank if the date is not specified or cannot be inferred, or if the triple involves P585.
+- end_time: the date/year up to which the triple statement is true. If it is an event, it is identical to start_time.
+- reference_url: the source URL of the text from which the information was extracted.
+
+Return the information as comma-separated values (CSV). Include the column headers. Surround the values with quotes. If values contain quotes, escape them properly.
+
+DO NOT, UNDER ANY CIRCUMSTANCES, provide any commentary or explanations; just return the raw data. Do not make anything up that is not in the source material.
+-----
+{website_text}
\ No newline at end of file
diff --git a/wikidata/lib/langchain.py b/wikidata/lib/langchain.py
new file mode 100644
index 0000000..2a8f3f1
--- /dev/null
+++ b/wikidata/lib/langchain.py
@@ -0,0 +1,17 @@
+import io
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import pandas as pd
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def extract_to_csv(model, template, csv_path, **params):
+    prompt = ChatPromptTemplate.from_template(template)
+    parser = StrOutputParser()
+    chain = prompt | model | parser
+    response = chain.invoke(params)
+    data = io.StringIO(response)
+    df = pd.read_csv(data, dtype={'start_time': str, 'end_time': str})
+    df.to_csv(csv_path, index=False)
+    return df  # return the DataFrame so the calling notebook can inspect the result
\ No newline at end of file
diff --git a/wikidata/lib/wikidata.py b/wikidata/lib/wikidata.py
index 8f5f266..5953954 100644
--- a/wikidata/lib/wikidata.py
+++ b/wikidata/lib/wikidata.py
@@ -1,10 +1,13 @@
 # based on code written by GPT-4
 import csv
+import os
+
 from pywikibot import Claim, WbTime, ItemPage, PropertyPage, Site
 from datetime import datetime
-
 import mwclient
-from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+load_dotenv()
 
 
 def claim_to_string(claim):
@@ -172,10 +175,16 @@ def update_wikidata_from_csv(file_path):
         previous_claim = claim
 
 
-def get_wikipedia_page_content(pageTitle:str, language="en"):
-    user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'
+def get_wikipedia_page_data(pageTitle: str, language="en"):
+    user_agent = os.getenv('USER_AGENT')
     site = mwclient.Site(f'{language}.wikipedia.org', clients_useragent=user_agent)
-    api = site.api("parse", page=pageTitle.replace(" ", "_"))
-    html = api["parse"]["text"]["*"]
-    soup = BeautifulSoup(html, "html.parser")
-    return soup.get_text()
+    page = site.pages[pageTitle]
+    if not page.exists:
+        return None
+
+    return {
+        'page': page,
+        'revision': page.revision,
+        'url': f'{site.host}/wiki/{pageTitle.replace(" ", "_")}?oldid={page.revision}',
+        'content': page.text()
+    }
\ No newline at end of file
diff --git a/wikidata/scholars-qid.csv b/wikidata/scholars-qid.csv
index 52fed1c..1de6ff8 100644
--- a/wikidata/scholars-qid.csv
+++ b/wikidata/scholars-qid.csv
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838
 Rudolf Wiethölter,Q1512482
 Niklas Luhmann,Q57238
 Hubert Rottleuthner,Q55622018
-Ralf Rogowski,Q112499743
 Ralf Rogowski,Q20128038
 Gunther Teubner,Q98304
 Volkmar Gessner,Q15435946
diff --git a/wikidata/scholars.csv b/wikidata/scholars.csv
index 70aea7e..cd9bb94 100644
--- a/wikidata/scholars.csv
+++ b/wikidata/scholars.csv
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838,male,Ramm,Thilo,1925-04-04 00:00:00+00:00,2018-06-17 00:00:
 Rudolf Wiethölter,Q1512482,male,,Rudolf,1929-07-17 00:00:00+00:00,,1034437860,http://www.wikidata.org/entity/Q1512482,,https://de.wikipedia.org/wiki/Rudolf_Wieth%C3%B6lter
 Niklas Luhmann,Q57238,male,Luhmann,Niklas,1927-12-08 00:00:00+00:00,1998-11-06 00:00:00+00:00,118575147,http://www.wikidata.org/entity/Q57238,https://en.wikipedia.org/wiki/Niklas_Luhmann,https://de.wikipedia.org/wiki/Niklas_Luhmann
 Hubert Rottleuthner,Q55622018,male,,Hubert,1944-01-01 00:00:00+00:00,,135622751,http://www.wikidata.org/entity/Q55622018,,https://de.wikipedia.org/wiki/Hubert_Rottleuthner
-Ralf Rogowski,Q112499743,male,Rogowski,Ralf,1953-01-01 00:00:00+00:00,,17150982X,http://www.wikidata.org/entity/Q112499743,,
 Ralf Rogowski,Q20128038,male,Rogowski,Ralf,,,,http://www.wikidata.org/entity/Q20128038,https://en.wikipedia.org/wiki/Ralf_Rogowski,
 Gunther Teubner,Q98304,male,Teubner,Gunther,1944-04-30 00:00:00+00:00,,119443562,http://www.wikidata.org/entity/Q98304,https://en.wikipedia.org/wiki/Gunther_Teubner,https://de.wikipedia.org/wiki/Gunther_Teubner
 Volkmar Gessner,Q15435946,male,Gessner,Volkmar,1937-10-09 00:00:00+00:00,2014-11-08 00:00:00+00:00,170469328,http://www.wikidata.org/entity/Q15435946,https://en.wikipedia.org/wiki/Volkmar_Gessner,https://de.wikipedia.org/wiki/Volkmar_Gessner
--
GitLab
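
Editor's note: taken together, the changes in this patch form a three-step pipeline: download the Wikipedia page (get_wikipedia_page_data), extract triples to a CSV via the prompt template (extract_to_csv), and upload the reviewed CSV (update_wikidata_from_csv, which the last notebook cell imports but does not yet call). Below is a minimal sketch of that flow, not part of the patch, using only the call signatures introduced above. It assumes the page title equals the scholar's name and that update_wikidata_from_csv accepts the path of the CSV written by extract_to_csv:

    # Sketch only: run the pipeline for a single scholar.
    from pathlib import Path
    from langchain_openai import ChatOpenAI
    from lib.langchain import extract_to_csv
    from lib.wikidata import get_wikipedia_page_data, update_wikidata_from_csv

    fullName, qid = "Thilo Ramm", "Q59533838"

    # 1. Download the page and cache it together with its versioned source URL.
    page_data = get_wikipedia_page_data(fullName, language="de")  # assumes title == name
    if page_data is None:
        raise SystemExit(f"No Wikipedia page for {fullName}")
    cache = Path(f"data/{fullName}-wikipedia.txt")
    cache.write_text(f"{page_data['url']}\n\n{page_data['content']}", encoding="utf-8")

    # 2. Manually trim the cached file to its biographical parts (notebook step 2).

    # 3. Extract triples into a CSV for review.
    model = ChatOpenAI(model_name="gpt-4")
    template = Path("extraction-prompt.txt").read_text()
    csv_path = f"data/{fullName}.csv"
    extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid,
                   website_text=cache.read_text(encoding="utf-8"))

    # 4. After checking the CSV by hand, write the claims to Wikidata.
    update_wikidata_from_csv(csv_path)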