Commit 33f2b7df authored by Christian Boulanger

Add Hedgedoc API experiment

parent c0bb65d0
.idea
.venv*
*.pyc
\ No newline at end of file
*.pyc
__*
\ No newline at end of file
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import argparse
def download_resource(session, url, base_dir, level=0):
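    """
    Download a single resource and save it under base_dir, mirroring the URL path.
    JavaScript and CSS files are additionally scanned for further pad.gwdg.de URLs,
    which are downloaded recursively; level only controls the log indentation.
    """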
    try:
        response = session.get(url, stream=True)
        response.raise_for_status()
        # Check the content type to ensure it's not an HTML error page
        content_type = response.headers.get('Content-Type', '')
        content_is_code = content_type.split("; ")[0] in ['application/javascript', 'text/javascript', 'text/css']
        if 'text/html' in content_type and not url.endswith(".html"):
            print(f"Skipped downloading {url}: MIME type is text/html (likely a 404 page)")
            return
        parsed_url = urlparse(url)
        resource_path = os.path.join(base_dir, parsed_url.path.lstrip('/'))
        resource_dir = os.path.dirname(resource_path)
        os.makedirs(resource_dir, exist_ok=True)
        print(f'{" "*level}Saving {parsed_url.path} to {resource_path}')
        with open(resource_path, 'wb') as f:
            content = b""
            for chunk in response.iter_content(chunk_size=8192):
                # Monkey-patch the javascript
                if resource_path.endswith('slide-pack.9fe42901cee029fba75d.js'):
                    chunk_str = chunk.decode('utf-8')
                    chunk_str = chunk_str.replace('src:serverurl+"/build/', 'src:"build/')
                    chunk = chunk_str.encode('utf-8')
                content += chunk
                f.write(chunk)
        # If the downloaded file is JavaScript or CSS, check for more URLs
        if content_is_code:
            content_str = content.decode('utf-8')
            matches = re.findall(r'https://pad\.gwdg\.de/[^\'"\s\)\]]+', content_str)
            # Use a non-capturing group so that the full relative path (not just "build"/"css"/"js") is collected
            matches += ['https://pad.gwdg.de' + p for p in re.findall(r'/(?:build|css|js)/[^\'"\s\)\]]+', content_str)]
            for match in matches:
                if not match.endswith("/"):
                    download_resource(session, match, base_dir, level + 1)
    except Exception as e:
        print(f'{" "*level}Failed to download {url}: {e}')
def remove_csp(soup):
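    """Remove Content-Security-Policy meta tags so the locally saved page is not blocked from loading its resources."""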
    for meta in soup.find_all("meta"):
        if 'http-equiv' in meta.attrs and meta.attrs['http-equiv'] == 'Content-Security-Policy':
            meta.decompose()
def replace_and_download_resources(session, soup, base_url, base_dir):
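    """
    Download every resource referenced by img/link/script tags, then scan inline scripts
    for pad.gwdg.de URLs, download those as well, and rewrite them to relative paths.
    """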
    for tag in soup.find_all(['img', 'link', 'script']):
        if tag.name == 'img' and tag.get('src'):
            resource_url = urljoin(base_url, tag['src'])
            download_resource(session, resource_url, base_dir)
        elif tag.name == 'link' and tag.get('href'):
            resource_url = urljoin(base_url, tag['href'])
            download_resource(session, resource_url, base_dir)
        elif tag.name == 'script' and tag.get('src'):
            resource_url = urljoin(base_url, tag['src'])
            download_resource(session, resource_url, base_dir)
    # Handle dynamically generated URLs in script content
    for script in soup.find_all('script'):
        if script.string:
            updated_script = script.string
            matches = re.findall(r'https://pad\.gwdg\.de/([^\'"\s]+)', updated_script)
            for match in matches:
                resource_url = urljoin(base_url, match)
                download_resource(session, resource_url, base_dir)
                updated_script = updated_script.replace(resource_url, match)
            script.string.replace_with(updated_script)
def download_uploads_resources(session, html_str, base_dir):
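    """Download every file referenced under https://pad.gwdg.de/uploads/ in the given HTML string."""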
    matches = re.findall(r'https://pad\.gwdg\.de/uploads/[^\s\'\"\)\]]+', html_str)
    for match in matches:
        resource_url = match
        download_resource(session, resource_url, base_dir)
def download_additional_resources(session, base_url, base_dir, additional_paths):
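    """Download a manually curated list of additional resource paths relative to base_url."""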
    for path in additional_paths:
        resource_url = urljoin(base_url, path)
        download_resource(session, resource_url, base_dir)
def download_html_and_resources(slide_id, output_html, base_dir, additional_paths):
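    """
    Download the presentation page for the given slide ID together with its resources
    and save a locally browsable copy as output_html inside base_dir.
    """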
    base_url = "https://pad.gwdg.de"
    page_url = f"{base_url}/p/{slide_id}"
    session = requests.Session()
    response = session.get(page_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    remove_csp(soup)  # Remove CSP settings
    # Replace occurrences of the base URL with the local paths and download resources
    replace_and_download_resources(session, soup, base_url, base_dir)
    # Convert the soup object to string to find and download uploads resources
    html_str = str(soup)
    download_uploads_resources(session, html_str, base_dir)
    # Download additional specified resources
    download_additional_resources(session, base_url, base_dir, additional_paths)
    # Convert all occurrences of the base URL to local paths
    html_str = html_str.replace(base_url + '/', './')
    # Ensure the base directory exists
    os.makedirs(base_dir, exist_ok=True)
    # Save the modified HTML
    with open(os.path.join(base_dir, output_html), 'w', encoding='utf-8') as f:
        f.write(html_str)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download and save HTML and resources from pad.gwdg.de")
    parser.add_argument('-i', '--id', required=True, help="The slide ID from pad.gwdg.de")
    parser.add_argument('-d', '--dir', required=True, help="The directory where the resources will be saved")
    args = parser.parse_args()
    slide_id = args.id
    base_dir = args.dir
    output_html = "index.html"
    # this is a manual collection of resources that were not discovered by the script, probably incomplete
    additional_paths = [
        "build/85934a8a31bd9b8b75e68eeb57b6859810055d48742953766c4a5c2b5a0d5266.woff",
        "build/8810ba3440bf482ced33d2f74b7803bba711f689d8e4caa7da5c6ae6844a1b49.woff2",
        "build/006708d6691753cfc46eec2dae88fbdafa22823a89194149d9f223050dc78998.woff",
        "build/4f319287827e35f841069eb471c092eccf97d2f7830aa4d8bd7301ded418bf49.ttf",
        "build/MathJax/jax/input/TeX/config.js?V=2.7.9",
        "build/MathJax/jax/input/MathML/config.js?V=2.7.9",
        "build/MathJax/jax/output/HTML-CSS/config.js?V=2.7.9",
        "build/MathJax/jax/output/NativeMML/config.js?V=2.7.9",
        "build/MathJax/jax/output/PreviewHTML/config.js?V=2.7.9",
        "build/MathJax/extensions/tex2jax.js?V=2.7.9",
        "build/MathJax/extensions/mml2jax.js?V=2.7.9",
        "build/MathJax/extensions/MathEvents.js?V=2.7.9",
        "build/MathJax/extensions/MathZoom.js?V=2.7.9",
        "build/MathJax/extensions/MathMenu.js?V=2.7.9",
        "build/MathJax/extensions/toMathML.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noErrors.js?V=2.7.9",
        "build/MathJax/extensions/TeX/noUndefined.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSmath.js?V=2.7.9",
        "build/MathJax/extensions/TeX/AMSsymbols.js?V=2.7.9",
        "build/MathJax/extensions/fast-preview.js?V=2.7.9",
        "build/MathJax/extensions/AssistiveMML.js?V=2.7.9",
        "build/MathJax/extensions/a11y/accessibility-menu.js?V=2.7.9",
        "build/MathJax/extensions/Safe.js?V=2.7.9",
        "build/29.5f5bdb9120d6b9c39930.js",
        "build/27.fbb6b5bbda6765f0a1f1.js",
        "build/reveal.js/plugin/notes/notes.js",
        "build/reveal.js/css/theme/white.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro.css",
        "build/reveal.js/css/print/paper.css",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.ttf",
        "build/reveal.js/lib/font/source-sans-pro/source-sans-pro-regular.woff"
    ]
    download_html_and_resources(slide_id, output_html, base_dir, additional_paths)
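For reference, a minimal sketch of how the script above can be driven; the module name `download_slides` is an assumption, since the diff does not show the file name:

``` python
# Equivalent to: python download_slides.py -i <slide-id> -d ./mirror
# (the module name download_slides is hypothetical)
from download_slides import download_html_and_resources

# Mirror the presentation with the given slide ID into ./mirror;
# passing an empty list skips the manually collected additional_paths.
download_html_and_resources("<slide-id>", "index.html", "./mirror", [])
```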
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient
client = HedgedocClient('https://pad.gwdg.de')
markdown = client.get_note_content('wOsgnXCdR9qxkNn1-RyV7g')
print(markdown)
```
%% Output
# Test
dies ist ein Test
![](https://pad.gwdg.de/uploads/b3dce952-95c4-4143-a09e-a32cddb5168e.png)
%% Cell type:code id: tags:
``` python
client2 = HedgedocClient('http://localhost:3000')
client2.create_note(markdown)
```
%% Output
'http://localhost:3000/WRWm37-BSwSlbsvZv-44DA'
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient, find_image_urls
client = HedgedocClient('http://localhost:3000')
markdown = client.get_note_content('Wgn_PaviRj-g6CRnZChkSg')
print(find_image_urls(markdown))
```
%% Output
['http://localhost:3000/uploads/4d46ab65-797f-4fde-9329-3859a752d30c.png']
%% Cell type:code id: tags:
``` python
from hedgedoc_api import HedgedocClient, find_image_urls
client1 = HedgedocClient('https://pad.gwdg.de')
client2 = HedgedocClient('http://localhost:3000')
# get content and image urls
markdown = client1.get_note_content('wOsgnXCdR9qxkNn1-RyV7g')
image_urls = find_image_urls(markdown)
# copy images
if len(image_urls):
    print("Uploading images:")
    for image_url in image_urls:
        print(f" - {image_url}")
        new_image_url = client2.upload_image_from_url(image_url)
        print(f" --> {new_image_url}")
        markdown = markdown.replace(image_url, new_image_url)
# upload note with new image urls
client2.create_note(markdown)
```
%% Output
Uploading images:
- https://pad.gwdg.de/uploads/b3dce952-95c4-4143-a09e-a32cddb5168e.png
--> http://localhost:3000/uploads/2b6965db-ffb9-4e9f-b893-3c76a6b522ab.png
'http://localhost:3000/IWxjqEXJT5SyeQAgX-fEEw'
import requests
import json
import regex as re
def find_image_urls(text):
"""Extracts all image URLs from a given text"""
url_pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.(?:jpg|gif|png|jpeg)")
urls = re.findall(url_pattern, text)
return urls
class HedgedocClient:
    BASE_URL = None

    def __init__(self, base_url) -> None:
        self.BASE_URL = base_url

    def upload_image(self, image_path):
        """
        Function to upload an image.
        The image_path parameter should be a string representing the file path to the image.
        Returns the URL of the uploaded image.
        """
        url = f'{self.BASE_URL}/uploadimage'
        with open(image_path, 'rb') as img_file:
            files = {
                'image': (image_path, img_file, 'image/png'),
            }
            # Let requests generate the multipart Content-Type header (including the boundary) itself;
            # setting it manually produces a request body the server cannot parse.
            upload_response = requests.post(url, files=files)
        if upload_response.status_code != 200:
            raise RuntimeError("Upload failed")
        return upload_response.json()['link']
    def upload_image_from_url(self, image_url):
        """
        Function to upload an image from a URL.
        The image_url parameter should be a string representing the URL of the image.
        Returns the URL of the uploaded image.
        """
        upload_url = f'{self.BASE_URL}/uploadimage'
        response = requests.get(image_url, stream=True)
        response.raise_for_status()
        filename = image_url.split("/")[-1]
        # We make use of the context manager to avoid storing the whole file in memory.
        with response:
            files = {'image': (filename, response.raw, response.headers.get('Content-type'))}
            upload_response = requests.post(upload_url, files=files)
        if upload_response.status_code != 200:
            raise RuntimeError("Upload failed")
        return upload_response.json()['link']
    def create_note(self, data):
        """
        Function to import markdown data into a new note.
        The note content will be the body of the received HTTP-request.
        """
        headers = {'Content-Type': 'text/markdown'}
        response = requests.post(f'{self.BASE_URL}/new', data=data, headers=headers)
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Request failed")
        return response.url

    def get_note_content(self, note_id):
        """
        Function to return the raw markdown content of a note.
        """
        response = requests.get(f'{self.BASE_URL}/{note_id}/download')
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Download failed")
        return response.text

    def get_note_metadata(self, note_id):
        """
        Function to return metadata about a note.
        This includes the title and description of the note as well as the creation date and viewcount.
        """
        response = requests.get(f'{self.BASE_URL}/{note_id}/info')
        response.raise_for_status()
        if response.status_code != 200:
            raise RuntimeError("Request failed")
        return response.json()

    def import_note(self, source_client, note_id):
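        """
        Copy a note from another Hedgedoc instance: downloads the markdown via source_client,
        re-uploads any embedded images to this instance, and creates a new note here.
        Returns the URL of the newly created note.
        """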
        markdown = source_client.get_note_content(note_id)
        image_urls = find_image_urls(markdown)
        # copy images
        if len(image_urls):
            for image_url in image_urls:
                new_image_url = self.upload_image_from_url(image_url)
                markdown = markdown.replace(image_url, new_image_url)
        # upload note with new image urls and return the url of the new note
        return self.create_note(markdown)
\ No newline at end of file
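The `import_note` method bundles the copy workflow that the notebook cells above demonstrate step by step. A minimal usage sketch, assuming the same two instances and note ID used in the notebook:

``` python
from hedgedoc_api import HedgedocClient

source = HedgedocClient('https://pad.gwdg.de')    # public pad instance
target = HedgedocClient('http://localhost:3000')  # local Hedgedoc instance

# Download the note, re-upload its embedded images to the local instance,
# and create a copy there; returns the URL of the new note.
new_note_url = target.import_note(source, 'wOsgnXCdR9qxkNn1-RyV7g')
print(new_note_url)
```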