Simon Rebers / AiWebProject1 · Commit 13590527
authored 1 year ago by srebers
crawler and index working, not clean & correctly commented
Parent: c5e90b30
Showing 4 changed files with 127 additions and 18 deletions:

crawler.py      34 additions, 17 deletions
crawler_alt.py  36 additions, 0 deletions
indexing.py     41 additions, 1 deletion
myapp.py        16 additions, 0 deletions
crawler.py +34 −17
import os
from queue import Queue
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

import indexing

# Index:
ix = indexing.get_index()

# hostnames the crawler may visit (the trailing comma makes this a real one-element tuple)
ALLOWED_DOMAINS = ('vm009.rz.uos.de',)

queue = Queue()
queue.put('https://vm009.rz.uos.de/crawl/index.html')  # initialize queue with start URL
visited_list = {'https://vm009.rz.uos.de/crawl/index.html'}  # set of already queued/visited websites

while not queue.empty():
    current_url = queue.get()  # take the first element of the queue
    r = requests.get(current_url, timeout=3)  # get the current url
    print(r.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)

    # add the website information to the index; the unique url field lets
    # update_document replace a stale entry for the same page
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text))
    writer.commit()  # put the crawled content into the index

    new_links = soup.find_all('a', href=True)  # find all links on this website
    for link in new_links:  # for each of the links on the current website
        href = urljoin(current_url, link['href'])  # join the domain with its relative links
        url = urlparse(href)
        # if the url has the right format, is inside the allowed domains and has not been visited yet
        if (os.path.splitext(url.path)[1] in ('.html', '', '.htm')
                and url.hostname in ALLOWED_DOMAINS
                and href not in visited_list):
            queue.put(href)
            visited_list.add(href)

print(visited_list)
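For reference, the link filter in the loop above behaves as follows; this is a minimal standalone sketch, and the sample hrefs are invented for illustration:

import os
from urllib.parse import urljoin, urlparse

page = 'https://vm009.rz.uos.de/crawl/index.html'
# hypothetical hrefs, for illustration only
for href in ('page2.html', 'sub/', 'https://example.com/a.html', 'style.css'):
    absolute = urljoin(page, href)  # relative links resolve against the current page
    parsed = urlparse(absolute)
    ext_ok = os.path.splitext(parsed.path)[1] in ('.html', '', '.htm')
    host_ok = parsed.hostname in ('vm009.rz.uos.de',)
    print(absolute, ext_ok and host_ok)  # only in-domain html-ish links would be queued

Only the first two candidates pass: the third leaves the allowed domain and the fourth has a non-HTML extension.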
crawler_alt.py (new file) +36 −0
import os.path
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')

queue = Queue()
visited_links = []

start_site = 'https://vm009.rz.uos.de/crawl/index.html'
base_site = 'https://vm009.rz.uos.de/crawl/'

queue.put(start_site)
visited_links.append(start_site)

while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # print(r.content)
    for l in soup.find_all("a"):
        url = urljoin(base_site, l['href'])  # resolve the (possibly relative) href to an absolute url
        print(os.path.split(urlparse(url).path))  # show the path split into directory and file name
        if url not in visited_links and base_site in url:
            # the link is new and stays inside the base site
            print(url)
            print(l.text)
            queue.put(url)
            visited_links.append(url)

print(visited_links)
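Unlike crawler.py above, this variant filters candidate links with a plain substring test (base_site in url): anything resolving outside https://vm009.rz.uos.de/crawl/ is skipped, but file extensions are not checked and nothing is written to the index, so it serves mainly as a minimal reference implementation of the crawl loop.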
indexing.py +41 −1
import os
from whoosh.fields import *
from whoosh.index import create_in
from whoosh import index

# Here, the structure of index entries is defined. You can add more fields with metadata, computed values etc.,
# and use them for searching and ranking.
# We use a title, a unique url and the page text.
#
# The "stored" attribute is used for all parts that we want to be able to fully retrieve from the index
ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT)


# Create the index directory if it does not exist yet, otherwise open the existing index; return it
def get_index():
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
        return create_in("indexdir", schema=ix_schema)
    return index.open_dir("indexdir")


# # now let's add some texts (=documents)
# writer.add_document(title=u"First document", content=u"This is the first document we've added!")
# writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
# writer.add_document(title=u"Songtext", content=u"Music was my first love and it will be the last")
#
# # write the index to the disk
# writer.commit()

# # Retrieving data
# from whoosh.qparser import QueryParser
#
# with ix.searcher() as searcher:
#     # find entries with the words 'first' AND 'last'
#     query = QueryParser("content", ix.schema).parse("first last")
#     results = searcher.search(query)
#
#     # print all results
#     for r in results:
#         print(r)
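Because url is declared as ID(stored=True, unique=True), update_document can re-index a page without duplicating it: Whoosh deletes any existing entry with the same url before adding the new one. A minimal sketch of that behavior, with made-up title and content strings:

import indexing

ix = indexing.get_index()
writer = ix.writer()
# first crawl of the page creates a fresh entry
writer.update_document(title="Index", url="https://vm009.rz.uos.de/crawl/index.html", content="old text")
# a later crawl with the same unique url replaces that entry instead of adding a second one
writer.update_document(title="Index", url="https://vm009.rz.uos.de/crawl/index.html", content="new text")
writer.commit()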
myapp.py (new file) +16 −0
from whoosh.fields import *
import indexing

# Retrieving data
from whoosh.qparser import QueryParser

ix = indexing.get_index()

with ix.searcher() as searcher:
    # parse the user's input as a query against the content field
    query = QueryParser("content", ix.schema).parse(input("search for something:\n"))
    results = searcher.search(query)

    # print all results
    for r in results:
        print(r)
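Since the schema stores only title and url, those are the fields a hit can hand back; print(r) shows them as a raw Hit object. Inside the with block, the result loop could print them explicitly instead (a small sketch; the output depends on whatever was crawled):

    for r in results:
        # stored fields are readable on a hit like dictionary entries
        print(r['title'], '->', r['url'])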