Commit a9ab0c0c authored by srebers

final version

parent c26ce85d
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
import indexing
from myapp import search_index
# returns True only if the time since the url was last crawled is longer than the cooldown period defined below
def time_passed(link):
    reader = ix.reader()
    try:
@@ -17,47 +18,48 @@ def time_passed(link):
    except:
        return True
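# A minimal sketch of the cooldown check whose try-block body is collapsed above, assuming
# the stored "date" field from the index schema and Whoosh's Searcher.document() lookup;
# the committed time_passed() reads the same information through ix.reader() instead.
from datetime import datetime, timedelta

def time_passed_sketch(index_obj, link, cooldown=timedelta(minutes=1440)):
    with index_obj.searcher() as searcher:
        stored = searcher.document(url=link)  # stored fields of the first document whose url matches, or None
        if stored is None:
            return True  # the url was never crawled before
        return datetime.now() - stored["date"] > cooldown  # True once the cooldown has passed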
# Index:
ix = indexing.get_index()

ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True
crawl_cooldown = timedelta(minutes=1440)  # crawl cooldown period (1440 minutes = 24 hours)
depth_limit = 2  # limit the crawl depth so the number of crawled websites stays manageable
queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))  # initialize the queue with the start URL
visited_list = set()  # set of already visited urls
while not queue.empty():
    current_url, depth = queue.get()  # take the first url out of the queue
    visited_list.add(current_url)  # add the url to the crawl history

    # extract the website content
    r = requests.get(current_url, timeout=3)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    # put the crawled content into the index
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)  # add the website information to the index
    writer.commit()

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
    if depth < depth_limit:
        if wikipedia:  # only works on wikipedia
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)  # find the first 20 links in the content body of this website
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:  # for each of the links on the current website
            href = urljoin(current_url, link['href'])  # join the current url with its relative sublinks
            url = urlparse(href)

            # queue the url only if it has the right format, is inside the allowed domains, has not been crawled recently and is not a fragment url
            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(href) and "#" not in href:
                queue.put((href, depth + 1))

print(visited_list)
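# A tiny worked example of the queueing condition above, using the urlparse and os imports
# already in this file (example urls assumed, not crawled): os.path.splitext() returns an
# empty extension for extensionless wiki paths, so they pass the format check, while links
# to files such as PDFs are rejected.
example_article = urlparse("https://en.wikipedia.org/wiki/Perceptron")
example_pdf = urlparse("https://en.wikipedia.org/wiki/Example.pdf")
print(os.path.splitext(example_article.path)[1])  # -> '' (accepted)
print(os.path.splitext(example_pdf.path)[1])      # -> '.pdf' (rejected)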
...
@@ -4,15 +4,10 @@ from whoosh.fields import *
from whoosh.index import create_in
from whoosh import index
# creating the index schema
ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT(stored=True), date=DATETIME(stored=True))
# Create an index if not created or open an existing one, then return it
def get_index():
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
...
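# The remainder of indexing.py's get_index() is collapsed above; a minimal sketch of an
# open-or-create helper with Whoosh, assuming the ix_schema and "indexdir" folder from that
# file (details of the committed implementation may differ):
import os
from whoosh import index
from whoosh.index import create_in

def get_index_sketch(schema):
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
    if not index.exists_in("indexdir"):
        return create_in("indexdir", schema)  # build a fresh index with the given schema
    return index.open_dir("indexdir")  # otherwise reuse the existing index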
from whoosh.fields import *
import indexing
# Retrieving data
from whoosh.qparser import QueryParser
import traceback
from flask import Flask, request, render_template
# get the index and search it for results, then append them to the result list
def search_index(keyword: str):
    ix = indexing.get_index()
    with ix.searcher() as searcher:
@@ -23,24 +12,23 @@ def search_index(keyword: str):
        results = searcher.search(query, limit=100)
        results.fragmenter.surround = 50
        for r in results:
            if r.highlights("content"):  # filter out results that have no content and are just embedded on the site
                result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
        return result_list, len(results)
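# The query setup collapsed above this hunk is not shown; a compact sketch of the whole
# helper, assuming a QueryParser over the indexed "content" field and reusing the indexing
# and QueryParser imports above (details of the committed code may differ):
def search_index_sketch(keyword: str):
    ix = indexing.get_index()
    result_list = []
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(keyword)  # parse the keyword against the content field
        results = searcher.search(query, limit=100)
        results.fragmenter.surround = 50  # characters of context around each highlighted term
        for r in results:
            if r.highlights("content"):  # skip hits without any highlightable content
                result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
        return result_list, len(results)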
app = Flask(__name__)
# error handler that returns the traceback for internal server errors
@app.errorhandler(500)
def internal_error(exception):
    return "<pre>" + traceback.format_exc() + "</pre>"
# render the start template with the right attributes for the current search
@app.route("/")
def start():
    search_word = request.args.get('search_word')
    if search_word is None:
        return render_template("start.html", title="start", result="start")
    else:
        return render_template("start.html", title=search_word, search=search_word, result=search_index(search_word))
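# How the app is launched is not part of this diff; the usual Flask development entry point
# would look like this (assumed, for running locally):
if __name__ == "__main__":
    app.run(debug=True)  # serve the search page on http://127.0.0.1:5000/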
static/img/sad_platypus.png (77.4 KiB)
static/img/search_platypus.png (188 KiB)
static/img/secret_perry.png (350 KiB)
@@ -34,6 +34,14 @@ ul {
}
.search-platypus {
    display: block;
    margin-left: auto;
    margin-top: 9px;
    max-width: 100px;
    height: auto;
}
.logo {
    font-family: "Courier New", monospace;
    font-size: 50px;
@@ -83,6 +91,28 @@ ul {
    background-color: white;
}
.top-section{
    display: grid;
    padding-top: 10px;
    height: 45px;
    grid-template-columns: 1fr 3.5fr;
}

.secret-perry{
    margin-top: -20px;
    height: 70px;
    width: auto;
    opacity: 0.2;
    filter: grayscale(100%);
    transition: opacity 0.5s;
}

.secret-perry:hover{
    opacity: 1;
    filter: grayscale(0%);
}
.results{
    margin-left: 15px;
    font-family: "Courier New", monospace;
@@ -92,10 +122,24 @@ ul {
}
.no-results{
    font-family: "Courier New", monospace;
    font-size: 25px;
    font-weight: bold;
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 50%;
    text-align: center;
}
.sad-platypus{
    display: block;
    margin-top: 120px;
    margin-left: auto;
    margin-right: auto;
    width: 50%;
    max-width: 200px;
    height: auto;
}
.results-word{
...
@@ -16,6 +16,9 @@
</form>
</div>
<div class="right-section">
<div class="crop-container">
<img src="{{ url_for('static', filename='img/search_platypus.png') }}" alt="Search platypus" class="search-platypus"/>
</div>
</div>
</div>
@@ -23,9 +26,16 @@
{% if result == "start" %}
{% elif result[0][0] is defined %}
<a class="results">Results for: </a> <div class="top-section">
<a class="results-word">{{ search }}</a> <div class="result-area">
<p class="result-length"> {{ result[1] }} Results</p> <a class="results">Results for: </a>
<a class="results-word">{{ search }}</a>
<p class="result-length"> {{ result[1] }} Results</p>
</div>
<img src="{{ url_for('static', filename='img/secret_perry.png') }}" class="secret-perry">
</div>
<ul>
{% for item in result[0] %}
@@ -38,6 +48,7 @@
{% endfor %}
</ul>
{% else %}
<img src="{{ url_for('static', filename='img/sad_platypus.png') }}" alt="Sad platypus" class="sad-platypus">
<a class="no-results">No Results</a> <a class="no-results">No Results</a>
{% endif %} {% endif %}
...