Commit 30655817 authored by srebers

some updates

parent 5917c73c
@@ -21,18 +21,21 @@ def time_passed(link):
 ix = indexing.get_index()

-ALLOWED_DOMAINS = ('minecraft.fandom.com',)
+ALLOWED_DOMAINS = ('en.wikipedia.org',)
+wikipedia = True
 crawl_cooldown = timedelta(hours=1)
 depth_limit = 2

 queue = Queue()
-queue.put(('https://minecraft.fandom.com/wiki/Crafting_Table', 0)) #initialize queue with start URL
+queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0)) #initialize queue with start URL
 visited_list = set() #set of already visited websites

 while not queue.empty():
-    current_url, depth = queue.get() #take the first element of the queue
-    visited_list.add(current_url) #working?
+    current_url, depth = queue.get() #take the first element of the queue
+    visited_list.add(current_url) #needed?
     r = requests.get(current_url, timeout=3) # get the current url
     soup = BeautifulSoup(r.content, 'html.parser')
     urltitle = str(soup.title.string)
@@ -42,15 +45,18 @@ while not queue.empty():
    writer.commit() # put crawler content into the index

    print(F"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

-    if depth <= depth_limit:
-        new_links = soup.find_all('a', href=True, limit=20) # find all links on this website
+    if depth < depth_limit:
+        if wikipedia: #only works on wikipedia
+            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20) # find all links on this website
+        else:
+            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links: # for each of the links on the current website
            href = urljoin(current_url, link['href']) # join the domain with its relative links
            url = urlparse(href)
-            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(url):
+            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(url) and "#" not in href:
                # if the url has the right format, is inside the allowed domains and has not been visited recently
                queue.put((href, depth + 1))
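The new '"#" not in href' check skips fragment links entirely, even when the underlying page itself is crawlable; the TODO "cut out url after #" in the Flask file below suggests the intended fix is to strip the anchor instead. A minimal sketch with the standard library's urldefrag:

from urllib.parse import urldefrag, urljoin

href = urljoin('https://en.wikipedia.org/wiki/Artificial_neural_network', '/wiki/Deep_learning#History')
href, fragment = urldefrag(href)  # split off the '#History' anchor, keep the page
print(href)  # https://en.wikipedia.org/wiki/Deep_learning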
......
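For orientation, a self-contained sketch of the crawl loop the two hunks above modify: breadth-first search over a queue of (url, depth) pairs, a visited set, and a per-URL cooldown. Indexing is omitted, and time_passed here is an assumed in-memory stand-in; the real helper sits above the hunk and its body is not shown in this diff.

from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

ALLOWED_DOMAINS = ('en.wikipedia.org',)
depth_limit = 2
crawl_cooldown = timedelta(hours=1)
last_crawled = {}  # hypothetical: URL -> time of last visit

def time_passed(href):
    # assumed semantics: True if the page was never fetched or the cooldown elapsed
    then = last_crawled.get(href)
    return then is None or datetime.now() - then > crawl_cooldown

queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))
visited = set()

while not queue.empty():
    current_url, depth = queue.get()
    if current_url in visited:
        continue
    visited.add(current_url)
    last_crawled[current_url] = datetime.now()
    try:
        r = requests.get(current_url, timeout=3)
    except requests.RequestException:
        continue  # skip unreachable pages instead of crashing the whole crawl
    soup = BeautifulSoup(r.content, 'html.parser')
    if depth < depth_limit:
        for link in soup.find_all('a', href=True, limit=20):
            href = urljoin(current_url, link['href'])  # resolve relative links
            if urlparse(href).hostname in ALLOWED_DOMAINS and href not in visited and time_passed(href):
                queue.put((href, depth + 1))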
import os.path
from queue import Queue
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')
queue = Queue()
visited_links = []
start_site = 'https://vm009.rz.uos.de/crawl/index.html'
base_site = 'https://vm009.rz.uos.de/crawl/'
queue.put(start_site)
visited_links.append(start_site)
while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # print(r.content)
    for l in soup.find_all("a", href=True):  # skip anchors without an href
        url = urljoin(base_site, l['href'])  # resolve relative links to absolute URLs
        if url not in visited_links and url.startswith(base_site):
            print(url)
            print(l.text)
            queue.put(url)  # enqueue the URL string, not a ParseResult
            visited_links.append(url)

print(visited_links)
@@ -3,23 +3,38 @@ import indexing
 # Retrieving data
 from whoosh.qparser import QueryParser
 import traceback
 from flask import Flask, request, render_template

 #TODO:
 # fix empty results
 # cut out url after #
 # check if timedelta works
 # clean up code
 # annotate
 # features: language, result pages

 def search_index(keyword: str):
     ix = indexing.get_index()
     with ix.searcher() as searcher:
         result_list = []
         query = QueryParser("content", ix.schema).parse(keyword)
-        results = searcher.search(query)
-        results.fragmenter.surround = 300
+        results = searcher.search(query, limit=100)
+        results.fragmenter.surround = 50
         for r in results:
             result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
         #TODO: ??? searcher.close()
-        return result_list
+        return result_list, len(results)
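On the searcher.close() TODO: the "with ix.searcher() as searcher:" block already closes the searcher on exit, so the explicit call can simply be dropped. The indexing module itself is not part of this diff; a minimal sketch of what get_index() might wrap, assuming a Whoosh index with the url, title and content fields used above (content must be stored for highlights("content") to have text to work with), and with the directory name as a pure assumption:

import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

INDEX_DIR = "indexdir"  # hypothetical location; the real module is not shown in this diff

schema = Schema(
    url=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),  # stored so r.highlights("content") works
)

def get_index():
    # create the index on first use, otherwise open the existing one
    if not index.exists_in(INDEX_DIR):
        os.makedirs(INDEX_DIR, exist_ok=True)
        return index.create_in(INDEX_DIR, schema)
    return index.open_dir(INDEX_DIR)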
 app = Flask(__name__)

 @app.errorhandler(500)
 def internal_error(exception):
     return "<pre>"+traceback.format_exc()+"</pre>"

 @app.route("/")
 def start():
     search_word = request.args.get('search_word')
......
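The start() route is truncated here; a hypothetical completion wired to the new (result_list, hit_count) return value of search_index(), with the template name "result.html" assumed rather than taken from the diff:

@app.route("/")
def start():
    search_word = request.args.get('search_word')
    if not search_word:
        # no query yet: hits the template's {% if result == "start" %} branch
        return render_template("result.html", result="start", search="")
    result = search_index(search_word)  # -> ([(url, highlight, title), ...], count)
    return render_template("result.html", result=result, search=search_word)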
from myapp import app
application = app
\ No newline at end of file
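This two-line shim exposes the Flask app under the module-level name "application" that WSGI servers such as mod_wsgi and gunicorn look up by convention. A quick local smoke test with the standard library's reference server, assuming the shim is saved as wsgi.py:

from wsgiref.simple_server import make_server

from wsgi import application  # assumes the shim above lives in wsgi.py

# Flask's app object is itself a WSGI callable, so the stdlib server can host it
make_server('127.0.0.1', 8000, application).serve_forever()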
@@ -53,6 +53,7 @@ ul {
     border: none;
     margin-top: 13px;
     margin-right: 5px;
+    transition: background-color 0.5s;
 }

@@ -87,6 +88,7 @@ ul {
     font-family: "Courier New", monospace;
     font-size: 25px;
     font-weight: bold;
 }

 .no-results{

@@ -103,6 +105,13 @@ ul {
     font-style: italic;
 }

+.result-length{
+    margin-left: 15px;
+    margin-top: 0px;
+    font-family: "Courier New", monospace;
+    font-size: 15px;
+}

 .result-box{
     margin-left: 15px;
     margin-right: 15px;
......
<!DOCTYPE html>
<html>
<head>
    <title>{{ title }}</title>
</head>
<body>
    <h1>Results:</h1>
    <ul>
    {% for item in result %}
        <li>
            <a href="{{ item }}">{{ item }}</a>
        </li>
    {% endfor %}
    </ul>
</body>
</html>
\ No newline at end of file
@@ -22,11 +22,13 @@
 {% if result == "start" %}
-{% elif result[0] is defined %}
+{% elif result[0][0] is defined %}
     <a class="results">Results for: </a>
     <a class="results-word">{{ search }}</a>
+    <p class="result-length"> {{ result[1] }} Results</p>
     <ul>
-    {% for item in result %}
+    {% for item in result[0] %}
         <li>
+            <div class="result-box">
                 <a href="{{ item[0] }}">{{ item[2] }}</a>
......
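The template changes track the new return type of search_index(): result is now a (result_list, hit_count) tuple, so the loop walks result[0] and the counter reads result[1]. A standalone round-trip of that contract, with made-up result data purely for illustration:

from jinja2 import Template

tmpl = Template(
    '{{ result[1] }} Results\n'
    '{% for item in result[0] %}{{ item[2] }} -> {{ item[0] }}\n{% endfor %}'
)
result = ([('https://example.org/wiki/Example', 'highlighted snippet', 'Example title')], 1)
print(tmpl.render(result=result))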