{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ezproxy-download-intro",
   "metadata": {},
   "source": [
    "# Download a PDF through EZproxy\n",
    "\n",
    "Logs in to the EZproxy instance with the credentials in `EZPROXY_USER` / `EZPROXY_PASS` (environment or `.env` file) and saves the requested document below `out/`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import html\n",
    "import os\n",
    "import re\n",
    "from urllib.parse import quote, urljoin\n",
    "\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# One shared session so the cookies set by the EZproxy login are reused\n",
    "session = requests.Session()\n",
    "\n",
    "# Credentials come from the environment (load_dotenv() also reads a local .env file)\n",
    "EZPROXY_USER = os.getenv(\"EZPROXY_USER\")\n",
    "EZPROXY_PASS = os.getenv(\"EZPROXY_PASS\")\n",
    "\n",
    "# EZproxy login endpoint; the percent-encoded target URL is appended as `qurl`\n",
    "ezproxy_url_prefix = \"https://login.ezproxy.lhlt.mpg.de/login?qurl=\"\n",
    "params = {\n",
    "    \"user\": EZPROXY_USER,\n",
    "    \"pass\": EZPROXY_PASS,\n",
    "    \"login\": \"Login\"\n",
    "}\n",
    "\n",
    "# An HTML page that cannot be interpreted is saved here for inspection\n",
    "INVALID_RESPONSE_PATH = \"out/invalid-response.html\"\n",
    "\n",
    "\n",
    "def _content_type(res):\n",
    "    \"\"\"Return the media type of a response without parameters, lower-cased ('' if absent).\"\"\"\n",
    "    return res.headers.get(\"Content-Type\", \"\").split(\";\")[0].strip().lower()\n",
    "\n",
    "\n",
    "def _ensure_parent_dir(path):\n",
    "    \"\"\"Create the directory that will contain `path` if it does not exist yet.\"\"\"\n",
    "    os.makedirs(os.path.dirname(path) or \".\", exist_ok=True)\n",
    "\n",
    "\n",
    "def download_via_ezproxy(url, file_path, timeout=60):\n",
    "    \"\"\"Log in to EZproxy, fetch `url` through it and save the PDF to `file_path`.\n",
    "\n",
    "    If the proxy answers with an HTML page, the page is searched for a\n",
    "    'click <a href=...' link and the document is fetched from that link.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the document on the publisher's site.\n",
    "        file_path: Where the PDF is written; missing parent directories are created.\n",
    "        timeout: Timeout in seconds passed to each HTTP request.\n",
    "\n",
    "    Returns:\n",
    "        Number of bytes written to `file_path`.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: credentials are missing, the page reports no access, the\n",
    "            HTML page contains no download link, or the response is not a PDF.\n",
    "        requests.HTTPError: a document request finished with an HTTP error status.\n",
    "    \"\"\"\n",
    "    if not params[\"user\"] or not params[\"pass\"]:\n",
    "        raise RuntimeError(\"EZPROXY_USER and EZPROXY_PASS must be set (environment or .env file)\")\n",
    "\n",
    "    # Pass the form fields as a dict: requests then sends them with the\n",
    "    # application/x-www-form-urlencoded Content-Type, which it does not add\n",
    "    # for a pre-encoded string body. The target URL is percent-encoded\n",
    "    # because it travels inside the `qurl` query parameter.\n",
    "    login = session.post(ezproxy_url_prefix + quote(url, safe=\"\"), data=params, timeout=timeout)\n",
    "\n",
    "    # Re-request the address the login ended up at (kept from the original flow);\n",
    "    # stream so the PDF is written chunk by chunk instead of being held in memory.\n",
    "    res = session.get(login.url, stream=True, timeout=timeout)\n",
    "    try:\n",
    "        res.raise_for_status()\n",
    "        content_type = _content_type(res)\n",
    "\n",
    "        if content_type == \"text/html\":\n",
    "            # Check for access restrictions or find the real document URL\n",
    "            page = res.text\n",
    "\n",
    "            if \"You currently have no access\" in page:\n",
    "                raise RuntimeError(\"No access\")\n",
    "\n",
    "            match = re.search(r'click <a href=\"([^\"]+)\"', page)\n",
    "            if not match:\n",
    "                _ensure_parent_dir(INVALID_RESPONSE_PATH)\n",
    "                with open(INVALID_RESPONSE_PATH, \"w\", encoding=\"utf-8\") as f:\n",
    "                    f.write(page)\n",
    "                raise RuntimeError(\"Invalid html response\")\n",
    "\n",
    "            # The href comes from HTML: decode entities such as &amp; and\n",
    "            # resolve it against the page address in case it is relative.\n",
    "            document_url = urljoin(res.url, html.unescape(match.group(1)))\n",
    "            res.close()\n",
    "            res = session.get(document_url, stream=True, timeout=timeout)\n",
    "            res.raise_for_status()\n",
    "            content_type = _content_type(res)\n",
    "\n",
    "        if content_type != \"application/pdf\":\n",
    "            raise RuntimeError(\n",
    "                f\"Unexpected response of type {content_type!r} \"\n",
    "                f\"(HTTP {res.status_code}) from {res.url}\"\n",
    "            )\n",
    "\n",
    "        # Download PDF document\n",
    "        _ensure_parent_dir(file_path)\n",
    "        size = 0\n",
    "        with open(file_path, \"wb\") as f:\n",
    "            for chunk in res.iter_content(chunk_size=8192):\n",
    "                size += len(chunk)\n",
    "                f.write(chunk)\n",
    "        return size\n",
    "    finally:\n",
    "        # Release the streamed connection back to the session pool\n",
    "        res.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bc974e45a72e7a",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "download_via_ezproxy(\"https://www.jstor.org/stable/pdf/20805575.pdf\", \"out/20805575.pdf\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}