Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-03-22T21:42:32.983419Z",
"start_time": "2024-03-22T21:42:31.435315Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://www-jstor-org.ezproxy.lhlt.mpg.de/stable/pdf/20805575.pdf\n",
"{'Server': 'Varnish', 'Retry-After': '0', 'Content-Type': '', 'Date': 'Fri, 22 Mar 2024 21:42:32 GMT', 'Via': '1.1 varnish', 'X-Served-By': 'cache-fra-eddf8230065-FRA', 'X-Cache': 'MISS', 'X-Cache-Hits': '0', 'Accept-Ranges': 'none', 'Connection': 'close'}\n"
]
},
{
"ename": "Exception",
"evalue": "Unexpected response of type ",
"output_type": "error",
"traceback": [
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[0;31mException\u001B[0m Traceback (most recent call last)",
"Cell \u001B[0;32mIn[4], line 65\u001B[0m\n\u001B[1;32m 61\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n\u001B[0;32m---> 65\u001B[0m download_via_ezproxy(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhttps://www.jstor.org/stable/pdf/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mout/20805575.pdf\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"Cell \u001B[0;32mIn[4], line 61\u001B[0m, in \u001B[0;36mdownload_via_ezproxy\u001B[0;34m(url, file_path)\u001B[0m\n\u001B[1;32m 59\u001B[0m f\u001B[38;5;241m.\u001B[39mwrite(chunk)\n\u001B[1;32m 60\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m---> 61\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mUnexpected response of type \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcontent_type\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 63\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m size\n",
"\u001B[0;31mException\u001B[0m: Unexpected response of type "
]
}
],
"source": [
"import os\n",
"import re\n",
"from urllib.parse import quote, urlencode\n",
"\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"\n",
"# Pull variables from a local .env file (if any) into the environment\n",
"load_dotenv()\n",
"\n",
"# Configure session to handle cookies\n",
"session = requests.Session()\n",
"\n",
"# Credentials are read from the environment\n",
"EZPROXY_USER = os.getenv(\"EZPROXY_USER\")\n",
"EZPROXY_PASS = os.getenv(\"EZPROXY_PASS\")\n",
"if not EZPROXY_USER or not EZPROXY_PASS:\n",
"    # os.getenv() yields None for an unset variable, and urlencode() would\n",
"    # then submit the literal text \"None\" as the credential.\n",
"    raise RuntimeError(\n",
"        \"EZPROXY_USER and EZPROXY_PASS must be set in the environment or in a .env file\"\n",
"    )\n",
"\n",
"# URL and parameters setup\n",
"ezproxy_url_prefix = \"https://login.ezproxy.lhlt.mpg.de/login?qurl=\"\n",
"params = {\n",
"    \"user\": EZPROXY_USER,\n",
"    \"pass\": EZPROXY_PASS,\n",
"    \"login\": \"Login\",\n",
"}\n",
"\n",
"# Form body for the login POST, encoded once up front\n",
"params_encoded = urlencode(params)\n",
"\n",
"def _media_type(res):\n",
"    \"\"\"Return the response's media type, lower-cased and without parameters ('' if the header is absent).\"\"\"\n",
"    return res.headers.get(\"Content-Type\", \"\").split(\";\")[0].strip().lower()\n",
"\n",
"\n",
"def download_via_ezproxy(url, file_path):\n",
"    \"\"\"Log in to EZproxy, fetch `url` through the proxy and save the PDF to `file_path`.\n",
"\n",
"    Args:\n",
"        url: Address of the document on the publisher's site (not yet proxied).\n",
"        file_path: Destination of the PDF; a missing parent directory is created.\n",
"\n",
"    Returns:\n",
"        Number of bytes written to `file_path`.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: If one of the HTTP requests returns a 4xx/5xx status.\n",
"        Exception: If the proxy reports missing access, the HTML page contains no\n",
"            recognisable link, or the final response is not a PDF.\n",
"    \"\"\"\n",
"    # Login with credentials and fetch content. `qurl` takes a percent-encoded\n",
"    # target, so quote it; otherwise any `&` in `url` would split it into\n",
"    # separate parameters of the login URL.\n",
"    res = session.post(\n",
"        ezproxy_url_prefix + quote(url, safe=\"\"),\n",
"        data=params_encoded,\n",
"        # requests adds no Content-Type for a pre-encoded string body.\n",
"        headers={\"Content-Type\": \"application/x-www-form-urlencoded\"},\n",
"    )\n",
"    res.raise_for_status()\n",
"    res = session.get(res.url)  # Re-request the URL the login ended up on\n",
"    res.raise_for_status()\n",
"    content_type = _media_type(res)\n",
"    size = 0\n",
"\n",
"    if content_type == \"text/html\":\n",
"        # Check for access restrictions or find the real document URL\n",
"        page = res.text\n",
"\n",
"        if \"You currently have no access\" in page:\n",
"            raise Exception(\"No access\")\n",
"\n",
"        match = re.search(r'click \\<a href=\"([^\"]+)\"', page)\n",
"        if not match:\n",
"            # Keep the unexpected page for inspection before failing\n",
"            os.makedirs(\"out\", exist_ok=True)\n",
"            with open(\"out/invalid-response.html\", \"w\", encoding=\"utf-8\") as f:\n",
"                f.write(page)\n",
"            raise Exception(\"Invalid html response\")\n",
"\n",
"        # Refetch from the new URL\n",
"        res = session.get(match.group(1))\n",
"        res.raise_for_status()\n",
"        content_type = _media_type(res)\n",
"\n",
"    if content_type != \"application/pdf\":\n",
"        # Include status and URL: an empty media type alone says nothing useful\n",
"        raise Exception(\n",
"            f\"Unexpected response of type {content_type!r} \"\n",
"            f\"(HTTP {res.status_code}) from {res.url}\"\n",
"        )\n",
"\n",
"    # Download PDF document\n",
"    target_dir = os.path.dirname(file_path)\n",
"    if target_dir:\n",
"        os.makedirs(target_dir, exist_ok=True)\n",
"    with open(file_path, \"wb\") as f:\n",
"        for chunk in res.iter_content(chunk_size=8192):\n",
"            size += len(chunk)\n",
"            f.write(chunk)\n",
"\n",
"    return size\n",
"\n",
"# Example run: fetch one JSTOR article; the returned byte count is the cell's result\n",
"download_via_ezproxy(\n",
"    url=\"https://www.jstor.org/stable/pdf/20805575.pdf\",\n",
"    file_path=\"out/20805575.pdf\",\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
},
"id": "9bc974e45a72e7a"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}