{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-03-22T21:42:32.983419Z",
     "start_time": "2024-03-22T21:42:31.435315Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www-jstor-org.ezproxy.lhlt.mpg.de/stable/pdf/20805575.pdf\n",
      "{'Server': 'Varnish', 'Retry-After': '0', 'Content-Type': '', 'Date': 'Fri, 22 Mar 2024 21:42:32 GMT', 'Via': '1.1 varnish', 'X-Served-By': 'cache-fra-eddf8230065-FRA', 'X-Cache': 'MISS', 'X-Cache-Hits': '0', 'Accept-Ranges': 'none', 'Connection': 'close'}\n"
     ]
    },
    {
     "ename": "Exception",
     "evalue": "Unexpected response of type ",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mException\u001B[0m                                 Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[4], line 65\u001B[0m\n\u001B[1;32m     61\u001B[0m         \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m     63\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m size\n\u001B[0;32m---> 65\u001B[0m download_via_ezproxy(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhttps://www.jstor.org/stable/pdf/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mout/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
      "Cell \u001B[0;32mIn[4], line 61\u001B[0m, in \u001B[0;36mdownload_via_ezproxy\u001B[0;34m(url, file_path)\u001B[0m\n\u001B[1;32m     59\u001B[0m             f\u001B[38;5;241m.\u001B[39mwrite(chunk)\n\u001B[1;32m     60\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m---> 61\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m     63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n",
      "\u001B[0;31mException\u001B[0m: Unexpected response of type "
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import requests\n",
    "from urllib.parse import urlencode#\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "load_dotenv()\n",
    "\n",
    "# Configure session to handle cookies\n",
    "session = requests.Session()\n",
    "\n",
    "# Environment variables for credentials (assumed to be set in your environment)\n",
    "EZPROXY_USER = os.getenv(\"EZPROXY_USER\")\n",
    "EZPROXY_PASS = os.getenv(\"EZPROXY_PASS\")\n",
    "\n",
    "# URL and parameters setup\n",
    "ezproxy_url_prefix = \"https://login.ezproxy.lhlt.mpg.de/login?qurl=\"\n",
    "params = {\n",
    "    \"user\": EZPROXY_USER,\n",
    "    \"pass\": EZPROXY_PASS,\n",
    "    \"login\": \"Login\"\n",
    "}\n",
    "\n",
    "params_encoded = urlencode(params)\n",
    "\n",
    "def download_via_ezproxy(url, file_path):\n",
    "    # Login with credentials and fetch content\n",
    "    res = session.post(ezproxy_url_prefix + url, data=params_encoded)\n",
    "    res = session.get(res.url)  # Follow redirect\n",
    "    content_type = res.headers[\"Content-Type\"].split(\";\")[0]\n",
    "    size = 0\n",
    "    \n",
    "    if content_type == \"text/html\":\n",
    "        # Check for access restrictions or find the real document URL\n",
    "        html = res.text\n",
    "        \n",
    "        if \"You currently have no access\" in html:\n",
    "            raise Exception(\"No access\")\n",
    "\n",
    "        match = re.search(r'click \\<a href=\"([^\"]+)\"', html)\n",
    "        if match:\n",
    "            url = match.group(1)\n",
    "        else:\n",
    "            with open(\"out/invalid-response.html\", \"w\", encoding=\"utf-8\") as f:\n",
    "                f.write(html)\n",
    "            raise Exception(\"Invalid html response\")\n",
    "\n",
    "        # Refetch from the new URL\n",
    "        print(url)\n",
    "        res = session.get(url)\n",
    "        print(res.headers)\n",
    "        content_type = res.headers[\"Content-Type\"].split(\";\")[0]\n",
    "\n",
    "    if content_type == \"application/pdf\":\n",
    "        # Download PDF document\n",
    "        with open(file_path, \"wb\") as f:\n",
    "            for chunk in res.iter_content(chunk_size=8192):\n",
    "                size += len(chunk)\n",
    "                f.write(chunk)\n",
    "    else:\n",
    "        raise Exception(f\"Unexpected response of type {content_type}\")\n",
    "\n",
    "    return size\n",
    "\n",
    "download_via_ezproxy(\"https://www.jstor.org/stable/pdf/20805575.pdf\", \"out/20805575.pdf\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   },
   "id": "9bc974e45a72e7a"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}