Commit 33f2b7df authored by Christian Boulanger

Add Hedgedoc API experiment

parent c0bb65d0
.idea
.venv*
*.pyc
\ No newline at end of file
*.pyc
__*
\ No newline at end of file
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import argparse
def download_resource(session, url, base_dir, level=0):
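    """
    Download a single resource and save it under base_dir, mirroring the URL path.
    JavaScript and CSS files are additionally scanned for further pad.gwdg.de URLs,
    which are downloaded recursively; level only controls the log indentation.
    """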
    try:
        response = session.get(url, stream=True)
        response.raise_for_status()
        # Check the content type to ensure it's not an HTML error page
        content_type = response.headers.get('Content-Type', '')
        content_is_code = content_type.split("; ")[0] in ['application/javascript', 'text/javascript', 'text/css']
        if 'text/html' in content_type and not url.endswith(".html"):
            print(f"Skipped downloading {url}: MIME type is text/html (likely a 404 page)")
            return
        parsed_url = urlparse(url)
        resource_path = os.path.join(base_dir, parsed_url.path.lstrip('/'))
        resource_dir = os.path.dirname(resource_path)
        os.makedirs(resource_dir, exist_ok=True)
        print(f'{" "*level}Saving {parsed_url.path} to {resource_path}')
        with open(resource_path, 'wb') as f:
            content = b""
            for chunk in response.iter_content(chunk_size=8192):
                # Monkey-patch the javascript
                if resource_path.endswith('slide-pack.9fe42901cee029fba75d.js'):
                    chunk_str = chunk.decode('utf-8')
                    chunk_str = chunk_str.replace('src:serverurl+"/build/', 'src:"build/')
                    chunk = chunk_str.encode('utf-8')
                content += chunk
                f.write(chunk)
        # If the downloaded file is JavaScript or CSS, check for more URLs
        if content_is_code:
            content_str = content.decode('utf-8')
            matches = re.findall(r'https://pad\.gwdg\.de/[^\'"\s\)\]]+', content_str)
            # Use a non-capturing group so that the full relative path (not just "build"/"css"/"js") is collected
            matches += ['https://pad.gwdg.de' + p for p in re.findall(r'/(?:build|css|js)/[^\'"\s\)\]]+', content_str)]
            for match in matches:
                if not match.endswith("/"):
                    download_resource(session, match, base_dir, level + 1)
    except Exception as e:
        print(f'{" "*level}Failed to download {url}: {e}')
def remove_csp(soup):
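    """Remove Content-Security-Policy meta tags so the locally saved page is not blocked from loading its resources."""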
    for meta in soup.find_all("meta"):
        if 'http-equiv' in meta.attrs and meta.attrs['http-equiv'] == 'Content-Security-Policy':
            meta.decompose()
def replace_and_download_resources(session, soup, base_url, base_dir):
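    """
    Download every resource referenced by img/link/script tags, then scan inline scripts
    for pad.gwdg.de URLs, download those as well, and rewrite them to relative paths.
    """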
    for tag in soup.find_all(['img', 'link', 'script']):
        if tag.name == 'img' and tag.get('src'):
            resource_url = urljoin(base_url, tag['src'])
            download_resource(session, resource_url, base_dir)
        elif tag.name == 'link' and tag.get('href'):
            resource_url = urljoin(base_url, tag['href'])
            download_resource(session, resource_url, base_dir)
        elif tag.name == 'script' and tag.get('src'):
            resource_url = urljoin(base_url, tag['src'])
            download_resource(session, resource_url, base_dir)
    # Handle dynamically generated URLs in script content
    for script in soup.find_all('script'):
        if script.string:
            updated_script = script.string
            matches = re.findall(r'https://pad\.gwdg\.de/([^\'"\s]+)', updated_script)
            for match in matches:
                resource_url = urljoin(base_url, match)
                download_resource(session, resource_url, base_dir)
                updated_script = updated_script.replace(resource_url, match)
            script.string.replace_with(updated_script)
def download_uploads_resources(session, html_str, base_dir):
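    """Download every file referenced under https://pad.gwdg.de/uploads/ in the given HTML string."""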
    matches = re.findall(r'https://pad\.gwdg\.de/uploads/[^\s\'\"\)\]]+', html_str)
    for match in matches:
        resource_url = match
        download_resource(session, resource_url, base_dir)
def download_additional_resources(session, base_url, base_dir, additional_paths):
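    """Download a manually curated list of additional resource paths relative to base_url."""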
    for path in additional_paths:
        resource_url = urljoin(base_url, path)
        download_resource(session, resource_url, base_dir)
def download_html_and_resources(slide_id, output_html, base_dir, additional_paths):
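    """
    Download the presentation page for the given slide ID together with its resources
    and save a locally browsable copy as output_html inside base_dir.
    """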
    base_url = "https://pad.gwdg.de"
    page_url = f"{base_url}/p/{slide_id}"
    session = requests.Session()
    response = session.get(page_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    remove_csp(soup)  # Remove CSP settings
    # Replace occurrences of the base URL with the local paths and download resources
    replace_and_download_resources(session, soup, base_url, base_dir)
    # Convert the soup object to string to find and download uploads resources
    html_str = str(soup)
    download_uploads_resources(session, html_str, base_dir)
    # Download additional specified resources
    download_additional_resources(session, base_url, base_dir, additional_paths)
    # Convert all occurrences of the base URL to local paths
    html_str = html_str.replace(base_url + '/', './')
    # Ensure the base directory exists
    os.makedirs(base_dir, exist_ok=True)
    # Save the modified HTML
    with open(os.path.join(base_dir, output_html), 'w', encoding='utf-8') as f:
        f.write(html_str)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download and save HTML and resources from pad.gwdg.de")
    parser.add_argument('-i', '--id', required=True, help="The slide ID from pad.gwdg.de")
    parser.add_argument('-d', '--dir', required=True, help="The directory where the resources will be saved")
    args = parser.parse_args()
    slide_id = args.id
    base_dir = args.dir
    output_html = "index.html"
    # this is a manual collection of resources that were not discovered by the script, probably incomplete
    additional_paths = [
        "build/85934a8a31bd9b8b75e68eeb57b6859810055d48742953766c4a5c2b5a0d5266.woff",
        "build/8810ba3440bf482ced33d2f74b7803bba711f689d8e4caa7da5c6ae6844a1b49.woff2",
        "build/006708d6691753cfc46eec2dae88fbdafa22823a89194149d9f223050dc78998.woff",
        "build/4f319287827e35f841069eb471c092eccf97d2f7830aa4d8bd7301ded418bf49.ttf",
        "build/MathJax/jax/input/TeX/config.js?V=2.7.9",
        "build/MathJax/jax/input/MathML/config.js?V=2.7.9",
        "build/MathJax/jax/output/HTML-CSS/config.js?V=2.7.9",
        "build/MathJax/jax/output/NativeMML/config.js?V=2.7.9",
        "build/MathJax/jax/output/PreviewHTML/config.js?V=2.7.9",
        "build/MathJax/extensions/tex2jax.js?V=2.7.9",
        "build/MathJax/extensions/mml2jax.js?V=2.7.9",
        "build/MathJax/extensions/MathEvents.js?V=2.7.9",
        "build/MathJax/extensions/MathZoom.js?V=2.7.9",
        "build/MathJax/extensions/MathMenu.js?V=2.7.9",
        "build/MathJax/extensions/toMathML.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noErrors.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noUndefined.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSmath.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSsymbols.js?V=2.7.9",
        "build/MathJax/extensions/fast-preview.js?V=2.7.9",
        "build/MathJax/extensions/AssistiveMML.js?V=2.7.9",
        "build/MathJax/extensions/a11y/accessibility-menu.js?V=2.7.9",
        "build/MathJax/extensions/Safe.js?V=2.7.9",
        "build/29.5f5bdb9120d6b9c39930.js",
        "build/27.fbb6b5bbda6765f0a1f1.js",
        "build/reveal.js/plugin/notes/notes.js",
        "build/reveal.js/css/theme/white.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro.css",
        "build/reveal.js/css/print/paper.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.ttf",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.woff"
    ]
    download_html_and_resources(slide_id, output_html, base_dir, additional_paths)
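For reference, a minimal sketch of how the script above can be driven; the module name `download_slides` is an assumption, since the diff does not show the file name:

``` python
# Equivalent to: python download_slides.py -i <slide-id> -d ./mirror
# (the module name download_slides is hypothetical)
from download_slides import download_html_and_resources

# Mirror the presentation with the given slide ID into ./mirror;
# passing an empty list skips the manually collected additional_paths.
download_html_and_resources("<slide-id>", "index.html", "./mirror", [])
```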
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient
client = HedgedocClient('https://pad.gwdg.de')
markdown = client.get_note_content('wOsgnXCdR9qxkNn1-RyV7g')
print(markdown)
```
%% Output
# Test
dies ist ein Test
![](https://pad.gwdg.de/uploads/b3dce952-95c4-4143-a09e-a32cddb5168e.png)
%% Cell type:code id: tags:
``` python
client2 = HedgedocClient('http://localhost:3000')
client2.create_note(markdown)
```
%% Output
'http://localhost:3000/WRWm37-BSwSlbsvZv-44DA'
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient, find_image_urls
client = HedgedocClient('http://localhost:3000')
markdown = client.get_note_content('Wgn_PaviRj-g6CRnZChkSg')
print(find_image_urls(markdown))
```
%% Output
['http://localhost:3000/uploads/4d46ab65-797f-4fde-9329-3859a752d30c.png']
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient, find_image_urls
client1 = HedgedocClient('https://pad.gwdg.de')
client2 = HedgedocClient('http://localhost:3000')
# get content and image urls
markdown = client1.get_note_content('wOsgnXCdR9qxkNn1-RyV7g')
image_urls = find_image_urls(markdown)
# copy images
if len(image_urls):
    print("Uploading images:")
    for image_url in image_urls:
        print(f" - {image_url}")
        new_image_url = client2.upload_image_from_url(image_url)
        print(f" --> {new_image_url}")
        markdown = markdown.replace(image_url, new_image_url)
# upload note with new image urls
client2.create_note(markdown)
```
%% Output
Uploading images:
- https://pad.gwdg.de/uploads/b3dce952-95c4-4143-a09e-a32cddb5168e.png
--> http://localhost:3000/uploads/2b6965db-ffb9-4e9f-b893-3c76a6b522ab.png
'http://localhost:3000/IWxjqEXJT5SyeQAgX-fEEw'
import requests
import json
import regex as re
def find_image_urls(text):
"""Extracts all image URLs from a given text"""
url_pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.(?:jpg|gif|png|jpeg)")
urls = re.findall(url_pattern, text)
return urls
class HedgedocClient:
    BASE_URL = None

    def __init__(self, base_url) -> None:
        self.BASE_URL = base_url

    def upload_image(self, image_path):
        """
        Function to upload an image.
        The image_path parameter should be a string representing the file path to the image.
        Returns the URL of the uploaded image.
        """
        url = f'{self.BASE_URL}/uploadimage'
        with open(image_path, 'rb') as img_file:
            files = {
                'image': (image_path, img_file, 'image/png'),
            }
            # Let requests generate the multipart Content-Type header (including the boundary) itself;
            # setting it manually produces a request body the server cannot parse.
            upload_response = requests.post(url, files=files)
        if upload_response.status_code != 200:
            raise RuntimeError("Upload failed")
        return upload_response.json()['link']
    def upload_image_from_url(self, image_url):
        """
        Function to upload an image from a URL.
        The image_url parameter should be a string representing the URL of the image.
        Returns the URL of the uploaded image.
        """
        upload_url = f'{self.BASE_URL}/uploadimage'
        response = requests.get(image_url, stream=True)
        response.raise_for_status()
        filename = image_url.split("/")[-1]
        # We make use of the context manager to avoid storing the whole file in memory.
        with response:
            files = {'image': (filename, response.raw, response.headers.get('Content-type'))}
            upload_response = requests.post(upload_url, files=files)
        if upload_response.status_code != 200:
            raise RuntimeError("Upload failed")
        return upload_response.json()['link']
    def create_note(self, data):
        """
        Function to import markdown data into a new note.
        The note content will be the body of the received HTTP-request.
        """
        headers = {'Content-Type': 'text/markdown'}
        response = requests.post(f'{self.BASE_URL}/new', data=data, headers=headers)
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Request failed")
        return response.url

    def get_note_content(self, note_id):
        """
        Function to return the raw markdown content of a note.
        """
        response = requests.get(f'{self.BASE_URL}/{note_id}/download')
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Download failed")
        return response.text

    def get_note_metadata(self, note_id):
        """
        Function to return metadata about a note.
        This includes the title and description of the note as well as the creation date and viewcount.
        """
        response = requests.get(f'{self.BASE_URL}/{note_id}/info')
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Request failed")
        return response.json()

    def import_note(self, source_client, note_id):
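        """
        Copy a note from another Hedgedoc instance: downloads the markdown via source_client,
        re-uploads any embedded images to this instance, and creates a new note here.
        Returns the URL of the newly created note.
        """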
        markdown = source_client.get_note_content(note_id)
        image_urls = find_image_urls(markdown)
        # copy images
        if len(image_urls):
            for image_url in image_urls:
                new_image_url = self.upload_image_from_url(image_url)
                markdown = markdown.replace(image_url, new_image_url)
        # upload note with new image urls and return the url of the new note
        return self.create_note(markdown)
\ No newline at end of file
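The `import_note` method bundles the copy workflow that the notebook cells above demonstrate step by step. A minimal usage sketch, assuming the same two instances and note ID used in the notebook:

``` python
from hedgedoc_api import HedgedocClient

source = HedgedocClient('https://pad.gwdg.de')    # public pad instance
target = HedgedocClient('http://localhost:3000')  # local Hedgedoc instance

# Download the note, re-upload its embedded images to the local instance,
# and create a copy there; returns the URL of the new note.
new_note_url = target.import_note(source, 'wOsgnXCdR9qxkNn1-RyV7g')
print(new_note_url)
```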