Search engine working, still some bugs

5917c73c · srebers · ba1b4e00 · 5917c73c · 5917c73c · 5917c73c
Commit 5917c73c authored 1 year ago by srebers
--- a/crawler.py
+++ b/crawler.py
 import os
+from datetime import datetime, timedelta
 from queue import Queue
 from urllib.parse import urlparse, urljoin

 import requests
 from bs4 import BeautifulSoup
-
 import indexing
+from myapp import search_index
+
+def time_passed(link):
+    """Return True when ``link`` may be crawled (again).
+
+    Looks the URL up in the ``url`` field of the index and compares the
+    stored ``date`` of that document with the current time.
+
+    Args:
+        link: The URL to check, either as a string or as the result of
+            ``urllib.parse.urlparse`` (converted back with ``geturl()``).
+
+    Returns:
+        True if the stored date is older than ``crawl_cooldown`` or if the
+        lookup fails for any reason (e.g. the URL is not in the index yet);
+        False if the URL was indexed within the cooldown.
+    """
+    reader = ix.reader()
+    try:
+        # The index is keyed by URL strings; a parsed URL would never match.
+        if not isinstance(link, str):
+            link = link.geturl()
+        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
+        return datetime.now() - link_time > crawl_cooldown
+    except Exception:
+        # Best effort: any failed lookup means "go ahead and crawl it".
+        # Unlike a bare except this lets KeyboardInterrupt/SystemExit through.
+        return True
+    finally:
+        reader.close()   # release the reader on every path

 #Index:

 ix = indexing.get_index()

-ALLOWED_DOMAINS = ('https://vm009.rz.uos.de/crawl/')
+ALLOWED_DOMAINS = ('minecraft.fandom.com',)   # hostnames (not full URLs); compared with the hostname of every found link
+crawl_cooldown = timedelta(hours=1)   # an indexed page is only queued again once its stored date is older than this
+depth_limit = 2   # links are only followed from pages whose depth is <= this value

 queue = Queue()
-queue.put('https://vm009.rz.uos.de/crawl/index.html')   #initialize queue with start URL
+queue.put(('https://minecraft.fandom.com/wiki/Crafting_Table', 0))   #initialize queue with (start URL, depth 0)

 visited_list = set()   #list with visited websites

 while not queue.empty():
-    current_url = queue.get()   #take the first element of the queue
-
-    if True: #TODO: Implement when index is added
-
-        r = requests.get(current_url, timeout=3)   # get the current url
-        print(r.url)
-        soup = BeautifulSoup(r.content, 'html.parser')
-        urltitle = str(soup.title.string)
-        writer = ix.writer()
-        writer.update_document(title=urltitle, url=r.url, content=str(soup.text)) #add website information to crawler
+    current_url, depth = queue.get()   #take the first element of the queue
+    # Links are only marked as visited here, so the same link may have been
+    # queued by several pages in the meantime; crawl it just once.
+    if current_url in visited_list:
+        continue
+    visited_list.add(current_url)
+    try:
+        r = requests.get(current_url, timeout=3)   # get the current url
+    except requests.RequestException as error:
+        # a single unreachable page must not abort the whole crawl
+        print(F"Skipping {current_url}: {error}")
+        continue
+    soup = BeautifulSoup(r.content, 'html.parser')
+    # pages without a <title> element fall back to their URL as title
+    urltitle = str(soup.title.string) if soup.title else r.url
+    request_time = datetime.now()
+    writer = ix.writer()
+    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time) #add website information to crawler
+    writer.commit()# put crawler content into the index

-        writer.commit() #put crawler content into the index
+    print(F"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

-        new_links = soup.find_all('a', href=True)   # find all links on this website
+    if depth <= depth_limit:
+        new_links = soup.find_all('a', href=True, limit=20)   # find all links on this website

         for link in new_links:   # for each of the links on the current website
             href = urljoin(current_url, link['href'])   # join the domain with its relative links
             url = urlparse(href)

-
-            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list:
+            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(href):
             # if the url has the right format, is inside the allowed domains and has not been visited recently
-                queue.put(href)
-                visited_list.add(href)
+                queue.put((href, depth + 1))

 print(visited_list)
+
+
--- a/indexing.py
+++ b/indexing.py
@@ -10,33 +10,11 @@ from whoosh import index
 #
 # The "stored" attribute is used for all parts that we want to be able to fully retrieve from the index

-ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT)
+# content is stored as well so that result excerpts can be produced from it; date holds the time a page was crawled.
+# NOTE(review): an index created with the previous schema presumably has to be rebuilt — confirm.
+ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT(stored=True), date=DATETIME(stored=True))

 # Create an index if not created or open an existing, then return it
 def get_index():
-    if not os.path.exists("indexdir"):
-        os.makedirs("indexdir")
-        return create_in("indexdir", schema=ix_schema)
-    return index.open_dir("indexdir")
-
-
-
-# # now let's add some texts (=documents)
-# writer.add_document(title=u"First document", content=u"This is the first document we've added!")
-# writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
-# writer.add_document(title=u"Songtext", content=u"Music was my first love and it will be the last")
-#
-# # write the index to the disk
-# writer.commit()
-
-# # Retrieving data
-# from whoosh.qparser import QueryParser
-#
-# with ix.searcher() as searcher:
-#     # find entries with the words 'first' AND 'last'
-#     query = QueryParser("content", ix.schema).parse("first last")
-#     results = searcher.search(query)
-#
-#     # print all results
-#     for r in results:
-#         print(r)
\ No newline at end of file
+    # Test for an actual index instead of only for the directory: an existing
+    # but empty "indexdir" must get a fresh index too, not be passed to open_dir().
+    os.makedirs("indexdir", exist_ok=True)
+    if not index.exists_in("indexdir"):
+        return create_in("indexdir", schema=ix_schema)
+    return index.open_dir("indexdir")
\ No newline at end of file
--- a/myapp.py
+++ b/myapp.py
@@ -9,25 +9,23 @@ from flask import Flask, request, render_template
 def search_index(keyword: str):
+    """Search the ``content`` field of the index for ``keyword``.
+
+    Returns a list with one ``(url, highlighted content excerpt, title)``
+    tuple per hit. The list is empty when nothing matches or when
+    ``keyword`` is empty or blank.
+    """
+    if not keyword or not keyword.strip():
+        return []   # nothing to search for, so the index is not opened at all
     ix = indexing.get_index()
     with ix.searcher() as searcher:
-        # find entries with the words 'first' AND 'last'
+        result_list = []
         query = QueryParser("content", ix.schema).parse(keyword)
         results = searcher.search(query)
-
-        # print all results
-        result_links = []
+        results.fragmenter.surround = 300   # characters of context kept around the matched terms
         for r in results:
-            result_links.append(r.fields()["url"])
-        return result_links
-
-search_index("platypus")
+            result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
+        # No explicit searcher.close() is needed: the with-block closes the searcher.
+        # The excerpts are built above, while the searcher is still open.
+        return result_list

 app = Flask(__name__)

 @app.route("/")
 def start():
-    return render_template("start.html", title="start")
+    """Serve the start page, or the result list when a search word was submitted."""
+    search_word = request.args.get('search_word')
+    # No parameter (first visit) or an empty/blank one (empty form submitted):
+    # show the plain start page instead of running a search for nothing.
+    if search_word is None or not search_word.strip():
+        return render_template("start.html", title="start", result="start")
+    return render_template("start.html", title=search_word, search=search_word, result=search_index(search_word))
+

-@app.route("/result")
-def reverse():
-    rev = request.args.get('rev')
-    return render_template("result.html", title="search for: "+rev, result=search_index(rev))
\ No newline at end of file
--- a/static/styles.css
+++ b/static/styles.css
+/* Page background; the top padding keeps content below the fixed 80px header. */
+body {
+    padding-top: 80px;
+    background-color: rgb(230,224,255); /* blue channel was 258, which is outside 0-255 */
+}
+
+ul {
+    list-style-type: none;
+    padding: 0;
+}
+
+/* Fixed bar at the top of the page, split into three equal columns. */
+.header {
+    height: 80px;
+
+    position: fixed;
+    top: 0;
+    left: 0;
+    right: 0;
+
+    display: grid;
+    grid-template-columns: 1fr 1fr 1fr;
+    background-color: rgb(160,154,188);
+    z-index: 100;
+}
+
+.left-section {
+
+}
+
+.middle-section {
+    text-align: center;
+}
+
+.right-section {
+
+}
+
+.logo {
+    font-family: "Courier New", monospace;
+    font-size: 50px;
+    margin-top: 10px;
+    margin-left: 20px;
+}
+
+.search-box{
+
+    padding: 15px 16px;
+    font-size: 18px;
+    font-weight: bold;
+    font-family: "Courier New", monospace;
+    color: black;
+    background-color: rgb(230,224,255);
+    border-radius: 15px;
+    border: none;
+    margin-top: 13px;
+    margin-right: 5px;
+
+
+}
+
+.submit-box{
+    cursor: pointer;
+    border-radius: 15px;
+    padding: 15px 16px;
+    font-size: 18px;
+    font-weight: bold;
+    font-family: "Courier New", monospace;
+    color: black;
+    background-color: rgb(230,224,255);
+    border-style: solid;
+    border-width: 2px; /* was "border-size", which is not a CSS property and was ignored */
+    border-color: black;
+    margin-top: 13px;
+    transition: background-color 0.3s;
+}
+
+.search-box:focus{
+    background-color: white;
+    outline: none;
+}
+
+.submit-box:hover{
+    background-color: white;
+}
+
+.results{
+    margin-left: 15px;
+    font-family: "Courier New", monospace;
+    font-size: 25px;
+    font-weight: bold;
+}
+
+.no-results{
+    margin-left: 15px;
+    font-family: "Courier New", monospace;
+    font-size: 25px;
+    font-weight: bold;
+}
+
+.results-word{
+    font-family: "Courier New", monospace;
+    font-size: 25px;
+    font-weight: bold;
+    font-style: italic;
+}
+
+/* White card around a single search result. */
+.result-box{
+    margin-left: 15px;
+    margin-right: 15px;
+    padding: 10px;
+    margin-bottom: 15px;
+    background-color: white;
+
+    border-radius: 10px;
+    /* offset-x, offset-y, blur radius, spread radius, color. The former
+       "blur-radius: 10px" line was not a CSS property and had no effect;
+       the blur is the third value here. */
+    box-shadow: 5px 5px 5px 5px grey;
+}
\ No newline at end of file
--- a/templates/start.html
+++ b/templates/start.html
@@ -2,11 +2,42 @@
 <html>
    <head>
        <title>{{ title }}</title>
+        <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}"/>
    </head>
    <body>
-        <h1>Hello World</h1>
-        <form action="result" method="GET">
-            <input type="text" name='rev'>
-        </form>
+        {# Fixed header: logo on the left, search form in the middle. #}
+        <div class = "header">
+            <div class = "left-section">
+                <p class = "logo">Search Engine</p>
+            </div>
+            <div class = "middle-section">
+                {# An empty action submits the form to the current URL as a GET request. #}
+                <form action="" method="GET">
+                    <input class = "search-box" type="text" name='search_word' placeholder="search...">
+                    <input class = "submit-box" type="submit" value="Go">
+                </form>
+            </div>
+            <div class = "right-section">
+            </div>
+
+        </div>
+
+        {# result is the string "start" when no search was made, otherwise a list of (url, excerpt, title) items. #}
+        {% if result == "start" %}
+
+        {% elif result[0] is defined %}
+            <a class="results">Results for: </a>
+            <a class="results-word">{{ search }}</a>
+            <ul>
+                {% for item in result %}
+                    <li>
+                        <div class="result-box">
+                            {# The attribute value is quoted so that a URL containing spaces cannot break out of href. #}
+                            <a href="{{ item[0] }}"> {{ item[2] }} </a>
+                            {# |safe keeps the highlight markup of the excerpt. NOTE(review): relies on the highlighter escaping the page text - confirm. #}
+                            <p> {{ item[1]|safe }} </p>
+                        </div>
+                    </li>
+                {% endfor %}
+            </ul>
+        {% else %}
+            <a class="no-results">No Results</a>
+        {% endif %}
+
    </body>
 </html>
\ No newline at end of file