# Download Wikipedia pages as the source for triple extraction

This improves on the [data-extraction notebook](./data-extraction.ipynb) by downloading the Wikipedia article from which the information is to be extracted.

## 1. Download raw Wikipedia page content for the list of scholars and save it

In [7]:
import os.path

from lib.wikidata import get_wikipedia_page_data
from urllib.parse import unquote
import pandas as pd

# Cache the raw Wikipedia article of every scholar listed in scholars.csv as
# data/<fullName>-wikipedia.txt. The German article is preferred, the English
# one is the fallback. A cache file that already exists is never overwritten,
# so edits made to a downloaded file survive a re-run of this cell.
df = pd.read_csv('scholars.csv')
for index, row in df.iterrows():
    fullName = row['fullName']

    # Pick the first language for which the CSV holds a Wikipedia URL.
    if pd.notna(row['wikipedia_de']):
        language_code = 'de'
    elif pd.notna(row['wikipedia_en']):
        language_code = 'en'
    else:
        print(f'No Wikipedia page exists for {fullName}.')
        continue

    # The page title is the percent-decoded part of the URL after the last slash.
    page_title = unquote(os.path.basename(row[f'wikipedia_{language_code}']))

    wikipedia_content_cache_path = f'data/{fullName}-wikipedia.txt'
    if os.path.isfile(wikipedia_content_cache_path):
        # Already cached: skip the download.
        continue

    page_data = get_wikipedia_page_data(page_title, language_code)
    if page_data and page_data['page'].exists:
        # Create the cache directory on demand so that a fresh checkout
        # without a data/ folder does not fail when the file is opened.
        os.makedirs(os.path.dirname(wikipedia_content_cache_path), exist_ok=True)
        # First line of the file is the page URL, then a blank line, then the text.
        file_content = f"{page_data['url']}\n\n{page_data['content']}"
        with open(wikipedia_content_cache_path, 'w', encoding='utf-8') as file:
            file.write(file_content)
    else:
        print(f'No page content could be retrieved for "{fullName}"')

No Wikipedia page exists for Wolfgang Kaupen.


## 2. Reduce text size

To remove unnecessary information and reduce the token count, edit the downloaded files so that they contain only the biographical parts from which the information is to be extracted.

## 3. Extract the information 

In [8]:
from lib.langchain import extract_to_csv
from langchain_openai import ChatOpenAI
from pathlib import Path
# Run the extraction for a single scholar: the prompt template and the cached
# Wikipedia text are read from disk and handed to extract_to_csv together with
# the chat model.
fullName = "Thilo Ramm"
qid="Q59533838"  # presumably the scholar's Wikidata item id -- TODO confirm
model = ChatOpenAI(model_name="gpt-4")
template = Path('extraction-prompt.txt').read_text()
# The cache file is written as UTF-8 by the download cell, so it must be read
# back as UTF-8 explicitly. Without the argument read_text() uses the platform
# default encoding, which can mis-decode or fail on non-ASCII characters.
website_text = Path(f'data/{fullName}-wikipedia.txt').read_text(encoding='utf-8')
csv_path = f'data/{fullName}.csv'
# NOTE(review): extract_to_csv presumably writes its result to csv_path and
# returns it as a DataFrame -- confirm in lib.langchain.
df = extract_to_csv(model, template, csv_path, fullName=fullName, qid=qid, website_text=website_text)

C:\Users\boulanger\AppData\Local\miniconda3\Lib\site-packages\langchain_openai\chat_models\base.py:454: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
C:\Users\boulanger\AppData\Local\miniconda3\Lib\site-packages\pydantic\main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


In [None]:
from lib.wikidata import update_wikidata_from_csv