second commit

c5e90b30 · srebers · fc4bbf92 · c5e90b30 · c5e90b30 · c5e90b30
Commit c5e90b30 authored 1 year ago by srebers
--- a/.gitignore
+++ b/.gitignore
+venv
--- a/crawler.py
+++ b/crawler.py
+from queue import Queue
+from urllib.parse import urljoin
 import requests
+from bs4 import BeautifulSoup
+# Breadth-first crawl: start at START_URL and follow every link that stays
+# under CRAWL_PREFIX, visiting each URL at most once.
+START_URL = 'https://vm009.rz.uos.de/crawl/index.html'
+CRAWL_PREFIX = 'https://vm009.rz.uos.de/crawl'
+REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without one
+queue = Queue()
+visitedLinks = [START_URL]  # discovery order, printed at the end
+seenLinks = {START_URL}  # same URLs as visitedLinks, for O(1) membership tests
+queue.put(START_URL)
+while not queue.empty():
+    link = queue.get()
+    try:
+        r = requests.get(link, timeout=REQUEST_TIMEOUT)
+    except requests.RequestException as err:
+        # One unreachable page should not abort the whole crawl.
+        print(f'skipping {link}: {err}')
+        continue
+    soup = BeautifulSoup(r.content, 'html.parser')
+    # href=True skips <a> tags without an href (e.g. named anchors), which
+    # would otherwise raise KeyError on anchor['href'].
+    for anchor in soup.find_all("a", href=True):
+        # Resolve relative links against the page they appear on, not a fixed
+        # base, so links on pages in subdirectories resolve correctly.
+        url = urljoin(link, anchor['href'])
+        # startswith (not a substring test) so a foreign URL that merely
+        # contains the prefix, e.g. in its query string, is not crawled.
+        if url not in seenLinks and url.startswith(CRAWL_PREFIX):
+            print(url)
+            print(anchor.text)
+            queue.put(url)
+            visitedLinks.append(url)
+            seenLinks.add(url)
+print(visitedLinks)
-r = requests.get('https://vm009.rz.uos.de/crawl/index.html')
-print(r.content)
\ No newline at end of file
--- a/indexing.py
+++ b/indexing.py
+from whoosh.index import create_in
+from whoosh.fields import *
\ No newline at end of file