Skip to content
Snippets Groups Projects
download-journal-corpus.ipynb 5.67 KiB
Newer Older
  • Learn to ignore specific revisions
  • cboulanger's avatar
    cboulanger committed
    {
     "cells": [
      {
       "cell_type": "code",
       "execution_count": 4,
       "id": "initial_id",
       "metadata": {
        "collapsed": true,
        "ExecuteTime": {
         "end_time": "2024-03-22T21:42:32.983419Z",
         "start_time": "2024-03-22T21:42:31.435315Z"
        }
       },
       "outputs": [
        {
         "name": "stdout",
         "output_type": "stream",
         "text": [
          "https://www-jstor-org.ezproxy.lhlt.mpg.de/stable/pdf/20805575.pdf\n",
          "{'Server': 'Varnish', 'Retry-After': '0', 'Content-Type': '', 'Date': 'Fri, 22 Mar 2024 21:42:32 GMT', 'Via': '1.1 varnish', 'X-Served-By': 'cache-fra-eddf8230065-FRA', 'X-Cache': 'MISS', 'X-Cache-Hits': '0', 'Accept-Ranges': 'none', 'Connection': 'close'}\n"
         ]
        },
        {
         "ename": "Exception",
         "evalue": "Unexpected response of type ",
         "output_type": "error",
         "traceback": [
          "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
          "\u001B[0;31mException\u001B[0m                                 Traceback (most recent call last)",
          "Cell \u001B[0;32mIn[4], line 65\u001B[0m\n\u001B[1;32m     61\u001B[0m         \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m     63\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m size\n\u001B[0;32m---> 65\u001B[0m download_via_ezproxy(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhttps://www.jstor.org/stable/pdf/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mout/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
          "Cell \u001B[0;32mIn[4], line 61\u001B[0m, in \u001B[0;36mdownload_via_ezproxy\u001B[0;34m(url, file_path)\u001B[0m\n\u001B[1;32m     59\u001B[0m             f\u001B[38;5;241m.\u001B[39mwrite(chunk)\n\u001B[1;32m     60\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m---> 61\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m     63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n",
          "\u001B[0;31mException\u001B[0m: Unexpected response of type "
         ]
        }
       ],
       "source": [
        "import os\n",
        "import re\n",
        "import requests\n",
        "from urllib.parse import urlencode#\n",
        "from dotenv import load_dotenv\n",
        "\n",
        "# Load variables from a local .env file (if present) into the process environment\n",
        "load_dotenv()\n",
        "\n",
        "# A single shared session keeps the EZproxy login cookies across requests\n",
        "session = requests.Session()\n",
        "\n",
        "# Credentials are read from the environment, never hardcoded in the notebook\n",
        "EZPROXY_USER = os.getenv(\"EZPROXY_USER\")\n",
        "EZPROXY_PASS = os.getenv(\"EZPROXY_PASS\")\n",
        "if not EZPROXY_USER or not EZPROXY_PASS:\n",
        "    # Fail fast: urlencode() would otherwise submit the literal string \"None\" as credentials\n",
        "    raise RuntimeError(\"EZPROXY_USER and EZPROXY_PASS must be set in the environment or in a .env file\")\n",
        "\n",
        "# Login endpoint; the target URL is appended as the value of `qurl`\n",
        "ezproxy_url_prefix = \"https://login.ezproxy.lhlt.mpg.de/login?qurl=\"\n",
        "\n",
        "# Form fields submitted with the login POST\n",
        "params = {\n",
        "    \"user\": EZPROXY_USER,\n",
        "    \"pass\": EZPROXY_PASS,\n",
        "    \"login\": \"Login\"\n",
        "}\n",
        "\n",
        "# Pre-encoded form body used by download_via_ezproxy()\n",
        "params_encoded = urlencode(params)\n",
        "\n",
        "def _media_type(res):\n",
        "    \"\"\"Return the media type of a response, lower-cased and without parameters.\n",
        "\n",
        "    A missing Content-Type header yields an empty string instead of a KeyError.\n",
        "    \"\"\"\n",
        "    return res.headers.get(\"Content-Type\", \"\").split(\";\")[0].strip().lower()\n",
        "\n",
        "\n",
        "def download_via_ezproxy(url, file_path):\n",
        "    \"\"\"Log in to EZproxy, fetch `url` through it and save the PDF to `file_path`.\n",
        "\n",
        "    Uses the module-level `session`, `ezproxy_url_prefix` and `params_encoded`.\n",
        "\n",
        "    Args:\n",
        "        url: URL of the document to fetch through the proxy.\n",
        "        file_path: Destination path of the PDF; missing parent directories are created.\n",
        "\n",
        "    Returns:\n",
        "        Number of bytes written to `file_path`.\n",
        "\n",
        "    Raises:\n",
        "        requests.HTTPError: The final response has a 4xx/5xx status.\n",
        "        Exception: The page reports missing access, the HTML contains no\n",
        "            document link, or the final response is not a PDF.\n",
        "    \"\"\"\n",
        "    # Login with credentials and fetch content\n",
        "    # NOTE(review): `data` is a pre-encoded string, so requests sends it as-is without\n",
        "    # setting a form Content-Type header - confirm EZproxy accepts this, or pass the\n",
        "    # `params` dict instead.\n",
        "    res = session.post(ezproxy_url_prefix + url, data=params_encoded)\n",
        "    res = session.get(res.url)  # Re-request the URL the login ended up on\n",
        "    content_type = _media_type(res)\n",
        "\n",
        "    if content_type == \"text/html\":\n",
        "        # Check for access restrictions or find the real document URL\n",
        "        html = res.text\n",
        "\n",
        "        if \"You currently have no access\" in html:\n",
        "            raise Exception(\"No access\")\n",
        "\n",
        "        match = re.search(r'click \\<a href=\"([^\"]+)\"', html)\n",
        "        if not match:\n",
        "            # Keep the unexpected page for inspection; make sure the folder exists first\n",
        "            os.makedirs(\"out\", exist_ok=True)\n",
        "            with open(\"out/invalid-response.html\", \"w\", encoding=\"utf-8\") as f:\n",
        "                f.write(html)\n",
        "            raise Exception(\"Invalid html response\")\n",
        "\n",
        "        # Refetch from the new URL\n",
        "        url = match.group(1)\n",
        "        print(url)\n",
        "        res = session.get(url)\n",
        "        content_type = _media_type(res)\n",
        "\n",
        "    # Surface HTTP errors explicitly instead of reporting them as an odd content type\n",
        "    res.raise_for_status()\n",
        "\n",
        "    if content_type != \"application/pdf\":\n",
        "        raise Exception(f\"Unexpected response of type {content_type!r} (HTTP {res.status_code}) from {res.url}\")\n",
        "\n",
        "    # Download PDF document\n",
        "    target_dir = os.path.dirname(file_path)\n",
        "    if target_dir:\n",
        "        os.makedirs(target_dir, exist_ok=True)\n",
        "\n",
        "    size = 0\n",
        "    with open(file_path, \"wb\") as f:\n",
        "        for chunk in res.iter_content(chunk_size=8192):\n",
        "            size += len(chunk)\n",
        "            f.write(chunk)\n",
        "\n",
        "    return size\n",
        "\n",
        "download_via_ezproxy(\n",
        "    url=\"https://www.jstor.org/stable/pdf/20805575.pdf\",\n",
        "    file_path=\"out/20805575.pdf\",\n",
        ")\n"
       ]
      },
      {
       "cell_type": "code",
       "execution_count": null,
       "outputs": [],
       "source": [],
       "metadata": {
        "collapsed": false
       },
       "id": "9bc974e45a72e7a"
      }
     ],
     "metadata": {
      "kernelspec": {
       "display_name": "Python 3",
       "language": "python",
       "name": "python3"
      },
      "language_info": {
       "codemirror_mode": {
        "name": "ipython",
        "version": 2
       },
       "file_extension": ".py",
       "mimetype": "text/x-python",
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython2",
       "version": "2.7.6"
      }
     },
     "nbformat": 4,
     "nbformat_minor": 5
    }