diff --git a/corpus-creation/.env.dist b/corpus-creation/.env.dist new file mode 100644 index 0000000000000000000000000000000000000000..f4b045dae6259fed99fc361c95d495cf95e88bda --- /dev/null +++ b/corpus-creation/.env.dist @@ -0,0 +1,2 @@ +EZPROXY_USER= +EZPROXY_PASS= \ No newline at end of file diff --git a/corpus-creation/.gitignore b/corpus-creation/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2eea525d885d5148108f6f3a9a8613863f783d36 --- /dev/null +++ b/corpus-creation/.gitignore @@ -0,0 +1 @@ +.env \ No newline at end of file diff --git a/corpus-creation/download-journal-corpus.ipynb b/corpus-creation/download-journal-corpus.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2a59e70a273242b8814c76ac310655e30775a0cb --- /dev/null +++ b/corpus-creation/download-journal-corpus.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-03-22T21:42:32.983419Z", + "start_time": "2024-03-22T21:42:31.435315Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://www-jstor-org.ezproxy.lhlt.mpg.de/stable/pdf/20805575.pdf\n", + "{'Server': 'Varnish', 'Retry-After': '0', 'Content-Type': '', 'Date': 'Fri, 22 Mar 2024 21:42:32 GMT', 'Via': '1.1 varnish', 'X-Served-By': 'cache-fra-eddf8230065-FRA', 'X-Cache': 'MISS', 'X-Cache-Hits': '0', 'Accept-Ranges': 'none', 'Connection': 'close'}\n" + ] + }, + { + "ename": "Exception", + "evalue": "Unexpected response of type ", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mException\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[4], line 65\u001B[0m\n\u001B[1;32m 61\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n\u001B[0;32m---> 65\u001B[0m download_via_ezproxy(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhttps://www.jstor.org/stable/pdf/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mout/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "Cell \u001B[0;32mIn[4], line 61\u001B[0m, in \u001B[0;36mdownload_via_ezproxy\u001B[0;34m(url, file_path)\u001B[0m\n\u001B[1;32m 59\u001B[0m f\u001B[38;5;241m.\u001B[39mwrite(chunk)\n\u001B[1;32m 60\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m---> 61\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n", + "\u001B[0;31mException\u001B[0m: Unexpected response of type " + ] + } + ], + "source": [ + "import os\n", + "import re\n", + "import requests\n", + "from urllib.parse import urlencode#\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "# Configure session to handle cookies\n", + "session = requests.Session()\n", + "\n", + "# Environment variables for credentials (assumed to be set in your environment)\n", + "EZPROXY_USER = os.getenv(\"EZPROXY_USER\")\n", + "EZPROXY_PASS = os.getenv(\"EZPROXY_PASS\")\n", + "\n", + "# URL and parameters setup\n", + "ezproxy_url_prefix = \"https://login.ezproxy.lhlt.mpg.de/login?qurl=\"\n", + "params = {\n", + " \"user\": EZPROXY_USER,\n", + " \"pass\": EZPROXY_PASS,\n", + " \"login\": \"Login\"\n", + "}\n", + "\n", + "params_encoded = urlencode(params)\n", + "\n", + "def download_via_ezproxy(url, file_path):\n", + " # Login with credentials and fetch content\n", + " res = session.post(ezproxy_url_prefix + url, data=params_encoded)\n", + " res = session.get(res.url) # Follow redirect\n", + " content_type = res.headers[\"Content-Type\"].split(\";\")[0]\n", + " size = 0\n", + " \n", + " if content_type == \"text/html\":\n", + " # Check for access restrictions or find the real document URL\n", + " html = res.text\n", + " \n", + " if \"You currently have no access\" in html:\n", + " raise Exception(\"No access\")\n", + "\n", + " match = re.search(r'click \\<a href=\"([^\"]+)\"', html)\n", + " if match:\n", + " url = match.group(1)\n", + " else:\n", + " with open(\"out/invalid-response.html\", \"w\", encoding=\"utf-8\") as f:\n", + " f.write(html)\n", + " raise Exception(\"Invalid html response\")\n", + "\n", + " # Refetch from the new URL\n", + " print(url)\n", + " res = session.get(url)\n", + " print(res.headers)\n", + " content_type = res.headers[\"Content-Type\"].split(\";\")[0]\n", + "\n", + " if content_type == \"application/pdf\":\n", + " # Download PDF document\n", + " with open(file_path, \"wb\") as f:\n", + " for chunk in res.iter_content(chunk_size=8192):\n", + " size += len(chunk)\n", + " f.write(chunk)\n", + " else:\n", + " raise Exception(f\"Unexpected response of type {content_type}\")\n", + "\n", + " return size\n", + "\n", + "download_via_ezproxy(\"https://www.jstor.org/stable/pdf/20805575.pdf\", \"out/20805575.pdf\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + }, + "id": "9bc974e45a72e7a" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/wikidata/scholars-de.ipynb b/wikidata/scholars-de.ipynb index deb1ec946adab91b5a8da3553fc333fcdb5c7791..da5bc724de0cba3cca17f50a68ec8c9d34638bff 100644 --- a/wikidata/scholars-de.ipynb +++ b/wikidata/scholars-de.ipynb @@ -105,6 +105,7 @@ "Ralf Poscher (Q2129347)\n", "Susanne Baer (Q101872)\n", "Gralf-Peter Calliess (Q1542033)\n", + "Rolf Bender (Q59533437) \n", "\"\"\".split(\"\\n\")\n", "\n", "from lib.wikidata import get_person_info_from_wikidata\n",