From 53a4915dac9fd6171112998e3112b072498f109b Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Mon, 18 Mar 2024 18:11:47 +0100
Subject: [PATCH] Updates

---
 wikidata/.env.dist | 3 +-
 wikidata/.gitignore | 3 +-
 wikidata/data/Thilo Ramm.csv | 12 +++
 wikidata/download-wikipedia-pages.ipynb | 132 ++++++++++++++++++++----
 wikidata/extraction-prompt.txt | 30 ++++++
 wikidata/lib/langchain.py | 17 +++
 wikidata/lib/wikidata.py | 25 +++--
 wikidata/scholars-qid.csv | 1 -
 wikidata/scholars.csv | 1 -
 9 files changed, 191 insertions(+), 33 deletions(-)
 create mode 100644 wikidata/data/Thilo Ramm.csv
 create mode 100644 wikidata/extraction-prompt.txt
 create mode 100644 wikidata/lib/langchain.py

diff --git a/wikidata/.env.dist b/wikidata/.env.dist
index f841092..d26cd52 100644
--- a/wikidata/.env.dist
+++ b/wikidata/.env.dist
@@ -1,2 +1,3 @@
 OPENAI_API_KEY=''
-HUGGINGFACEHUB_API_TOKEN=''
\ No newline at end of file
+HUGGINGFACEHUB_API_TOKEN=''
+USER_AGENT=''
\ No newline at end of file
diff --git a/wikidata/.gitignore b/wikidata/.gitignore
index 757a946..afab930 100644
--- a/wikidata/.gitignore
+++ b/wikidata/.gitignore
@@ -3,4 +3,5 @@ user-config.py
 .env
 apicache
 /throttle.ctrl
-data/*-chatgpt.csv
\ No newline at end of file
+data/*-chatgpt.csv
+data/*-wikipedia.txt
\ No newline at end of file
diff --git a/wikidata/data/Thilo Ramm.csv b/wikidata/data/Thilo Ramm.csv
new file mode 100644
index 0000000..a33217a
--- /dev/null
+++ b/wikidata/data/Thilo Ramm.csv
@@ -0,0 +1,12 @@
+subject-label,subject-qid,predicate,pid,object,object-qid,start_time,end_time,reference_url
+Thilo Ramm,Q59533838,educated at,P69,University of Marburg,,,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,student of,P1066,Fritz von Hippel,,1949,1949,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,educated at,P69,University of Freiburg,,1953,1953,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,University of Freiburg,,,1961,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+University of Giessen,,founded by,P112,Thilo Ramm,Q59533838,1962,1962,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,University of Giessen,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,academic appointment,P8413,Fernuniversität in Hagen,,1977,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Civil law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Labor law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Social law,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
+Thilo Ramm,Q59533838,field of work,P101,Social philosophy,,1962,,de.wikipedia.org/wiki/Thilo_Ramm?oldid=237476391
diff --git a/wikidata/download-wikipedia-pages.ipynb b/wikidata/download-wikipedia-pages.ipynb
index 4a48c15..038989a 100644
--- a/wikidata/download-wikipedia-pages.ipynb
+++ b/wikidata/download-wikipedia-pages.ipynb
@@ -3,45 +3,75 @@
 {
 "cell_type": "markdown",
 "source": [
- "# Download wikipedia pages as source of triple extraction"
+ "# Download Wikipedia pages as a source for triple extraction\n",
+ "\n",
+ "This improves on the [data-extraction notebook](./data-extraction.ipynb) by downloading the Wikipedia article from which the information is to be extracted."
 ],
 "metadata": {
 "collapsed": false
 },
 "id": "9d6a10996bfdd3cf"
 },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Download raw Wikipedia page content for the list of scholars and save it"
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "2ad235b62e2efc09"
+ },
 {
 "cell_type": "code",
- "execution_count": 26,
+ "execution_count": 7,
 "id": "initial_id",
 "metadata": {
 "collapsed": true,
 "ExecuteTime": {
- "end_time": "2024-03-17T19:36:34.696997900Z",
- "start_time": "2024-03-17T19:36:33.700459Z"
+ "end_time": "2024-03-18T15:26:43.976993400Z",
+ "start_time": "2024-03-18T15:26:43.935308500Z"
 }
 },
 "outputs": [
 {
- "data": {
- "text/plain": "'de.wikipedia.org/wiki/Erhard_Blankenburg?oldid=228627122'"
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No Wikipedia page exists for Wolfgang Kaupen.\n"
+ ]
 }
 ],
 "source": [
- "from lib.wikidata import get_wikipedia_page_content\n",
- "import mwclient\n",
- "#get_wikipedia_page_content(\"Erhard Blankenburg\")\n",
- "user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'\n",
- "pageTitle = 'Erhard Blankenburg'\n",
- "site = mwclient.Site(f'de.wikipedia.org', clients_useragent=user_agent)\n",
- "page = site.pages[pageTitle]\n",
- "rev = page.revision\n",
- "url = f'{site.host}/wiki/{pageTitle.replace(\" \",\"_\")}?oldid={rev}' \n",
- "url\n"
+ "import os.path\n",
+ "\n",
+ "from lib.wikidata import get_wikipedia_page_data\n",
+ "from urllib.parse import unquote\n",
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv('scholars.csv')\n",
+ "for index, row in df.iterrows():\n",
+ "    fullName = row['fullName']\n",
+ "    language_code = None\n",
+ "    if pd.notna(row['wikipedia_de']):\n",
+ "        pageTitle = unquote(os.path.basename(row['wikipedia_de']))\n",
+ "        language_code = 'de'\n",
+ "    elif pd.notna(row['wikipedia_en']):\n",
+ "        pageTitle = unquote(os.path.basename(row['wikipedia_en']))\n",
+ "        language_code = 'en'\n",
+ "    else:\n",
+ "        print(f'No Wikipedia page exists for {fullName}.')\n",
+ "        continue\n",
+ "\n",
+ "    wikipedia_content_cache_path = f'data/{fullName}-wikipedia.txt'\n",
+ "    if not os.path.isfile(wikipedia_content_cache_path):\n",
+ "        page_data = get_wikipedia_page_data(pageTitle, language_code)\n",
+ "        if page_data and page_data['page'].exists:\n",
+ "            file_content = f\"{page_data['url']}\\n\\n{page_data['content']}\"\n",
+ "            with open(wikipedia_content_cache_path, 'w', encoding='utf-8') as file:\n",
+ "                file.write(file_content)\n",
+ "        else:\n",
+ "            print(f'No page content could be retrieved for \"{fullName}\"')"
 ]
 },
 {
@@ -52,11 +82,71 @@
 },
 "id": "e83e59e1974a6506"
 },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Reduce text size\n",
+ "\n",
+ "To remove unnecessary information and reduce the token count, edit the downloaded files so that they contain only the biographical parts from which the information is to be extracted."
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "997c82c5d3d72b7"
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Extract the information"
+ ],
+ "metadata": {
+ "collapsed": false
+ },
+ "id": "303ddc348c4a2887"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\langchain_openai\\chat_models\\base.py:454: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n",
+ "  response = response.dict()\n",
+ "C:\\Users\\boulanger\\AppData\\Local\\miniconda3\\Lib\\site-packages\\pydantic\\main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/\n",
+ "  warnings.warn('The `dict` method is deprecated; use `model_dump` instead.', DeprecationWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from lib.langchain import extract_to_csv\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "from pathlib import Path\n",
+ "fullName = \"Thilo Ramm\"\n",
+ "qid = \"Q59533838\"\n",
+ "model = ChatOpenAI(model_name=\"gpt-4\")\n",
+ "template = Path('extraction-prompt.txt').read_text()\n",
+ "website_text = Path(f'data/{fullName}-wikipedia.txt').read_text()\n",
+ "csv_path = f'data/{fullName}.csv'\n",
+ "df = extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid, website_text=website_text)"
+ ],
+ "metadata": {
+ "collapsed": false,
+ "ExecuteTime": {
+ "end_time": "2024-03-18T17:03:40.614971500Z",
+ "start_time": "2024-03-18T17:03:11.400373800Z"
+ }
+ },
+ "id": "d904e502f8eff15d"
+ },
 {
 "cell_type": "code",
 "execution_count": null,
 "outputs": [],
- "source": [],
+ "source": [
+ "from lib.wikidata import update_wikidata_from_csv"
+ ],
 "metadata": {
 "collapsed": false
 },
diff --git a/wikidata/extraction-prompt.txt b/wikidata/extraction-prompt.txt
new file mode 100644
index 0000000..732babd
--- /dev/null
+++ b/wikidata/extraction-prompt.txt
@@ -0,0 +1,30 @@
+Your task is to extract data from the text and to output it in a format that is suitable as a data source for adding triples to Wikidata.
+
+The text is about "{fullName}" with the QID {qid}. It consists of one or more sections separated by "-----". The sections begin with a standalone URL followed by an excerpt of the content that can be found at this URL.
+
+Arrange the extracted information into a table with the following columns: subject-label, subject-qid, predicate, pid, object, object-qid, start_time, end_time, reference_url.
+
+Insert data into the columns as per the following rules:
+- subject-label/subject-qid: In general, the subject is "{fullName}" with the QID {qid}. However, refining/qualifying statements can also be made about other entities, as with the academic degree (P512) item below. Also, in the case of P112, subject and object must be reversed.
+- predicate/pid:
+ - educated at (P69): Institutions at which the person studied
+ - student of (P1066): If supervisors of doctoral theses and habilitations are specified
+ - employer (P108): the organization that pays the salary of a person (this can be a company, an institution, or a university)
+ - academic appointment (P8413): usually the department of a university; if the department or its QID is not known, proceed as with P108
+ - student (P802): persons contained in Wikidata who were educated by the subject
+ - member of (P463): Organizations and associations to which the person belongs (excluding P108)
+ - affiliation (P1416): Organization that the subject is affiliated with (not member of or employed by)
+ - academic degree (P512): some instance of academic degree (Q189533). After making this claim, add further triples that refine the P512 statement with "conferred by" (P1027) and "point in time" (P585).
+ - field of work (P101): extract the main topics and themes the subject has worked and published on
+ - editor (P98): add information on memberships in editorial boards of academic journals
+ - founded by (P112): add information on journals, associations, or other organizations that the subject helped to establish. When adding this claim, YOU MUST switch subject and object to express the reverse relationship.
+- object-label/object-qid: the English labels and, if known, the QIDs of the institutions and persons that are the objects of the triple. If you are not absolutely sure, leave them blank.
+- start_time: the date/year from which the triple statement is true. Leave blank if the date is not specified or cannot be inferred, or if the triple involves P585.
+- end_time: the date/year up to which the triple statement is true. If it is an event, it is identical to start_time.
+- reference_url: the source URL of the text from which the information was extracted.
+
+Return the information as comma-separated values (CSV). Include the column headers. Surround the values with quotes. If values contain quotes, escape them properly.
+
+DO NOT, UNDER ANY CIRCUMSTANCES, provide any commentary or explanations; just return the raw data. Do not make anything up that is not in the source material.
+-----
+{website_text}
\ No newline at end of file
diff --git a/wikidata/lib/langchain.py b/wikidata/lib/langchain.py
new file mode 100644
index 0000000..2a8f3f1
--- /dev/null
+++ b/wikidata/lib/langchain.py
@@ -0,0 +1,17 @@
+import io
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import pandas as pd
+from dotenv import load_dotenv
+
+load_dotenv()
+
+def extract_to_csv(model, template, csv_path, **params):
+    prompt = ChatPromptTemplate.from_template(template)
+    parser = StrOutputParser()
+    chain = prompt | model | parser
+    response = chain.invoke(params)
+    data = io.StringIO(response)
+    df = pd.read_csv(data, dtype={'start_time': str, 'end_time': str})
+    df.to_csv(csv_path, index=False)
+    return df  # return the DataFrame so the calling notebook can inspect the result
\ No newline at end of file
diff --git a/wikidata/lib/wikidata.py b/wikidata/lib/wikidata.py
index 8f5f266..5953954 100644
--- a/wikidata/lib/wikidata.py
+++ b/wikidata/lib/wikidata.py
@@ -1,10 +1,13 @@
 # based on code written by GPT-4
 import csv
+import os
+
 from pywikibot import Claim, WbTime, ItemPage, PropertyPage, Site
 from datetime import datetime
-
 import mwclient
-from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+
+load_dotenv()
 
 
 def claim_to_string(claim):
@@ -172,10 +175,16 @@ def update_wikidata_from_csv(file_path):
         previous_claim = claim
 
 
-def get_wikipedia_page_content(pageTitle:str, language="en"):
-    user_agent = 'github.com/cboulanger/experiments (info@bibliograph.org)'
+def get_wikipedia_page_data(pageTitle: str, language="en"):
+    user_agent = os.getenv('USER_AGENT')
     site = mwclient.Site(f'{language}.wikipedia.org', clients_useragent=user_agent)
-    api = site.api("parse", page=pageTitle.replace(" ", "_"))
-    html = api["parse"]["text"]["*"]
-    soup = BeautifulSoup(html, "html.parser")
-    return soup.get_text()
+    page = site.pages[pageTitle]
+    if not page.exists:
+        return None
+
+    return {
+        'page': page,
+        'revision': page.revision,
+        'url': f'{site.host}/wiki/{pageTitle.replace(" ", "_")}?oldid={page.revision}',
+        'content': page.text()
+    }
\ No newline at end of file
diff --git a/wikidata/scholars-qid.csv b/wikidata/scholars-qid.csv
index 52fed1c..1de6ff8 100644
--- a/wikidata/scholars-qid.csv
+++ b/wikidata/scholars-qid.csv
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838
 Rudolf Wiethölter,Q1512482
 Niklas Luhmann,Q57238
 Hubert Rottleuthner,Q55622018
-Ralf Rogowski,Q112499743
 Ralf Rogowski,Q20128038
 Gunther Teubner,Q98304
 Volkmar Gessner,Q15435946
diff --git a/wikidata/scholars.csv b/wikidata/scholars.csv
index 70aea7e..cd9bb94 100644
--- a/wikidata/scholars.csv
+++ b/wikidata/scholars.csv
@@ -16,7 +16,6 @@ Thilo Ramm,Q59533838,male,Ramm,Thilo,1925-04-04 00:00:00+00:00,2018-06-17 00:00:
 Rudolf Wiethölter,Q1512482,male,,Rudolf,1929-07-17 00:00:00+00:00,,1034437860,http://www.wikidata.org/entity/Q1512482,,https://de.wikipedia.org/wiki/Rudolf_Wieth%C3%B6lter
 Niklas Luhmann,Q57238,male,Luhmann,Niklas,1927-12-08 00:00:00+00:00,1998-11-06 00:00:00+00:00,118575147,http://www.wikidata.org/entity/Q57238,https://en.wikipedia.org/wiki/Niklas_Luhmann,https://de.wikipedia.org/wiki/Niklas_Luhmann
 Hubert Rottleuthner,Q55622018,male,,Hubert,1944-01-01 00:00:00+00:00,,135622751,http://www.wikidata.org/entity/Q55622018,,https://de.wikipedia.org/wiki/Hubert_Rottleuthner
-Ralf Rogowski,Q112499743,male,Rogowski,Ralf,1953-01-01 00:00:00+00:00,,17150982X,http://www.wikidata.org/entity/Q112499743,,
 Ralf Rogowski,Q20128038,male,Rogowski,Ralf,,,,http://www.wikidata.org/entity/Q20128038,https://en.wikipedia.org/wiki/Ralf_Rogowski,
 Gunther Teubner,Q98304,male,Teubner,Gunther,1944-04-30 00:00:00+00:00,,119443562,http://www.wikidata.org/entity/Q98304,https://en.wikipedia.org/wiki/Gunther_Teubner,https://de.wikipedia.org/wiki/Gunther_Teubner
 Volkmar Gessner,Q15435946,male,Gessner,Volkmar,1937-10-09 00:00:00+00:00,2014-11-08 00:00:00+00:00,170469328,http://www.wikidata.org/entity/Q15435946,https://en.wikipedia.org/wiki/Volkmar_Gessner,https://de.wikipedia.org/wiki/Volkmar_Gessner
--
GitLab
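
Editor's note: taken together, the changes in this patch form a three-step pipeline: download the Wikipedia page (get_wikipedia_page_data), extract triples to a CSV via the prompt template (extract_to_csv), and upload the reviewed CSV (update_wikidata_from_csv, which the last notebook cell imports but does not yet call). Below is a minimal sketch of that flow, not part of the patch, using only the call signatures introduced above. It assumes the page title equals the scholar's name and that update_wikidata_from_csv accepts the path of the CSV written by extract_to_csv:

    # Sketch only: run the pipeline for a single scholar.
    from pathlib import Path
    from langchain_openai import ChatOpenAI
    from lib.langchain import extract_to_csv
    from lib.wikidata import get_wikipedia_page_data, update_wikidata_from_csv

    fullName, qid = "Thilo Ramm", "Q59533838"

    # 1. Download the page and cache it together with its versioned source URL.
    page_data = get_wikipedia_page_data(fullName, language="de")  # assumes title == name
    if page_data is None:
        raise SystemExit(f"No Wikipedia page for {fullName}")
    cache = Path(f"data/{fullName}-wikipedia.txt")
    cache.write_text(f"{page_data['url']}\n\n{page_data['content']}", encoding="utf-8")

    # 2. Manually trim the cached file to its biographical parts (notebook step 2).

    # 3. Extract triples into a CSV for review.
    model = ChatOpenAI(model_name="gpt-4")
    template = Path("extraction-prompt.txt").read_text()
    csv_path = f"data/{fullName}.csv"
    extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid,
                   website_text=cache.read_text(encoding="utf-8"))

    # 4. After checking the CSV by hand, write the claims to Wikidata.
    update_wikidata_from_csv(csv_path)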