# Download Wikipedia pages as the source for triple extraction

This notebook improves on the [data-extraction notebook](./data-extraction.ipynb) by downloading the Wikipedia articles from which the information is to be extracted.

## 1. Download raw Wikipedia page content for the list of scholars and save it

In [1]:
import os.path

from lib.wikidata import get_wikipedia_page_data
from urllib.parse import unquote
import pandas as pd

# Download the Wikipedia article of every scholar listed in scholars.csv and
# cache it as a text file, so that each article is fetched at most once.

# The cache files are written below 'input/'; create the directory up front so
# that the cell also works on a fresh checkout.
os.makedirs('input', exist_ok=True)

df = pd.read_csv('scholars.csv')
for index, row in df.iterrows():
    fullName = row['fullName']

    # Prefer the German article and fall back to the English one.
    language_code = None
    for candidate_code in ('de', 'en'):
        wikipedia_url = row[f'wikipedia_{candidate_code}']
        if pd.notna(wikipedia_url):
            # The page title is the (percent-decoded) last path segment of the URL.
            page_title = unquote(os.path.basename(wikipedia_url))
            language_code = candidate_code
            break
    if language_code is None:
        print(f'No Wikipedia page exists for {fullName}.')
        continue

    wikipedia_content_cache_path = f'input/{fullName}-wikipedia.txt'
    if os.path.isfile(wikipedia_content_cache_path):
        # Already cached: never overwrite an existing file.
        continue

    page_data = get_wikipedia_page_data(page_title, language_code)
    # NOTE(review): 'exists' is read as an attribute and not called. If the page
    # object exposes it as a method, this check is always truthy — confirm
    # against lib.wikidata.
    if page_data and page_data['page'].exists:
        # First line holds the source URL, followed by a blank line and the text.
        file_content = f"{page_data['url']}\n\n{page_data['content']}"
        with open(wikipedia_content_cache_path, 'w', encoding='utf-8') as file:
            file.write(file_content)
    else:
        print(f'No page content could be retrieved for "{fullName}"')

No Wikipedia page exists for Wolfgang Kaupen.


## 2. Reduce text size

In order to remove unnecessary information and reduce the token count, manually edit the downloaded files in `input/` so that they contain only the biographical parts from which the information is to be extracted.

## 3. Extract the information 

In [7]:
from lib.langchain import extract_to_csv
from langchain_openai import ChatOpenAI
from pathlib import Path
import time

# Run the extraction for one scholar with each prompt template and store the
# resulting triples as one CSV file per template.

# fullName selects the cached article file; both values are passed on to
# extract_to_csv as keyword arguments.
fullName = "Thilo Ramm"
qid = "Q59533838"
model = ChatOpenAI(model_name="gpt-4")

# The cached article is written as UTF-8 by the download step, so read it back
# with the same encoding instead of the platform default.
website_text = Path(f'input/{fullName}-wikipedia.txt').read_text(encoding='utf-8')

# Make sure the target directory for the CSV files exists on a fresh checkout.
Path('output').mkdir(parents=True, exist_ok=True)

for template_file in ['handmade-prompt-template.txt', 'gp4-optimized-prompt-template.txt']:
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()
    # assumes the prompt templates are stored as UTF-8 — TODO confirm
    template = Path(f'prompts/{template_file}').read_text(encoding='utf-8')
    df = extract_to_csv(model, template, debug=False, fullName=fullName, qid=qid, website_text=website_text)
    end_time = time.perf_counter()
    execution_time = end_time - start_time  # In seconds
    minutes, seconds = divmod(execution_time, 60)
    print(f"Prompting with {template_file} took {int(minutes)} minutes and {int(seconds)} seconds and extracted {len(df)} triples.")
    # The output file is named after the part of the template name before the first hyphen.
    csv_path = f'output/{fullName}-{template_file.split("-")[0]}-prompt.csv'
    df.to_csv(csv_path, index=False)
    


Prompting with handmade-prompt-template.txt took 0 minutes and 28 seconds and extracted 13 triples.
Prompting with gp4-optimized-prompt-template.txt took 0 minutes and 19 seconds and extracted 6 triples.
