Commit a9ab0c0c authored by srebers

final version

parent c26ce85d
@@ -8,6 +8,7 @@ from bs4 import BeautifulSoup
import indexing
from myapp import search_index
# returns True only if the time since the url was last crawled is longer than the cooldown period defined below
def time_passed(link):
    reader = ix.reader()
    try:
@@ -17,47 +18,48 @@ def time_passed(link):
    except:
        return True
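# A minimal sketch of the cooldown check whose try-block body is collapsed above, assuming
# the stored "date" field from the index schema and Whoosh's Searcher.document() lookup;
# the committed time_passed() reads the same information through ix.reader() instead.
from datetime import datetime, timedelta

def time_passed_sketch(index_obj, link, cooldown=timedelta(minutes=1440)):
    with index_obj.searcher() as searcher:
        stored = searcher.document(url=link)  # stored fields of the first document whose url matches, or None
        if stored is None:
            return True  # the url was never crawled before
        return datetime.now() - stored["date"] > cooldown  # True once the cooldown has passed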
# Index:
ix = indexing.get_index()

ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True
crawl_cooldown = timedelta(minutes=1440)  # crawl cooldown period (1440 minutes = 24 hours)
depth_limit = 2  # limit the crawl depth so the number of crawled websites stays manageable
queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))  # initialize the queue with the start URL
visited_list = set()  # set of already visited urls
while not queue.empty():
    current_url, depth = queue.get()  # take the first url out of the queue
    visited_list.add(current_url)  # add the url to the crawl history

    # extract the website content
    r = requests.get(current_url, timeout=3)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    # put the crawled content into the index
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)  # add the website information to the index
    writer.commit()

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
    if depth < depth_limit:
        if wikipedia:  # only works on wikipedia
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)  # find the first 20 links in the content body of this website
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:  # for each of the links on the current website
            href = urljoin(current_url, link['href'])  # join the current url with its relative sublinks
            url = urlparse(href)

            # queue the url only if it has the right format, is inside the allowed domains, has not been crawled recently and is not a fragment url
            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(href) and "#" not in href:
                queue.put((href, depth + 1))

print(visited_list)
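# A tiny worked example of the queueing condition above, using the urlparse and os imports
# already in this file (example urls assumed, not crawled): os.path.splitext() returns an
# empty extension for extensionless wiki paths, so they pass the format check, while links
# to files such as PDFs are rejected.
example_article = urlparse("https://en.wikipedia.org/wiki/Perceptron")
example_pdf = urlparse("https://en.wikipedia.org/wiki/Example.pdf")
print(os.path.splitext(example_article.path)[1])  # -> '' (accepted)
print(os.path.splitext(example_pdf.path)[1])      # -> '.pdf' (rejected)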
...
@@ -4,15 +4,10 @@ from whoosh.fields import *
from whoosh.index import create_in
from whoosh import index
# creating the index schema
ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT(stored=True), date=DATETIME(stored=True))
# Create an index if not created or open an existing one, then return it
def get_index():
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
...
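# The remainder of indexing.py's get_index() is collapsed above; a minimal sketch of an
# open-or-create helper with Whoosh, assuming the ix_schema and "indexdir" folder from that
# file (details of the committed implementation may differ):
import os
from whoosh import index
from whoosh.index import create_in

def get_index_sketch(schema):
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
    if not index.exists_in("indexdir"):
        return create_in("indexdir", schema)  # build a fresh index with the given schema
    return index.open_dir("indexdir")  # otherwise reuse the existing index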
from whoosh.fields import *
import indexing
# Retrieving data
from whoosh.qparser import QueryParser
import traceback
from flask import Flask, request, render_template
# get the index and search it for results, then append them to the result list
def search_index(keyword: str):
    ix = indexing.get_index()
    with ix.searcher() as searcher:
@@ -23,24 +12,23 @@ def search_index(keyword: str):
        results = searcher.search(query, limit=100)
        results.fragmenter.surround = 50
        for r in results:
            if r.highlights("content"):  # filter out results that have no content and are just embedded on the site
                result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
        return result_list, len(results)
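# The query setup collapsed above this hunk is not shown; a compact sketch of the whole
# helper, assuming a QueryParser over the indexed "content" field and reusing the indexing
# and QueryParser imports above (details of the committed code may differ):
def search_index_sketch(keyword: str):
    ix = indexing.get_index()
    result_list = []
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(keyword)  # parse the keyword against the content field
        results = searcher.search(query, limit=100)
        results.fragmenter.surround = 50  # characters of context around each highlighted term
        for r in results:
            if r.highlights("content"):  # skip hits without any highlightable content
                result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
        return result_list, len(results)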
app = Flask(__name__)
# error handler that returns the traceback for internal server errors
@app.errorhandler(500)
def internal_error(exception):
    return "<pre>" + traceback.format_exc() + "</pre>"
# render the start template with the right attributes for the current search
@app.route("/")
def start():
    search_word = request.args.get('search_word')
    if search_word is None:
        return render_template("start.html", title="start", result="start")
    else:
        return render_template("start.html", title=search_word, search=search_word, result=search_index(search_word))
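# How the app is launched is not part of this diff; the usual Flask development entry point
# would look like this (assumed, for running locally):
if __name__ == "__main__":
    app.run(debug=True)  # serve the search page on http://127.0.0.1:5000/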
static/img/sad_platypus.png (77.4 KiB)
static/img/search_platypus.png (188 KiB)
static/img/secret_perry.png (350 KiB)
@@ -34,6 +34,14 @@ ul {
}
.search-platypus {
    display: block;
    margin-left: auto;
    margin-top: 9px;
    max-width: 100px;
    height: auto;
}
.logo {
    font-family: "Courier New", monospace;
    font-size: 50px;
@@ -83,6 +91,28 @@ ul {
    background-color: white;
}
.top-section{
    display: grid;
    padding-top: 10px;
    height: 45px;
    grid-template-columns: 1fr 3.5fr;
}

.secret-perry{
    margin-top: -20px;
    height: 70px;
    width: auto;
    opacity: 0.2;
    filter: grayscale(100%);
    transition: opacity 0.5s;
}

.secret-perry:hover{
    opacity: 1;
    filter: grayscale(0%);
}
.results{
    margin-left: 15px;
    font-family: "Courier New", monospace;
@@ -92,10 +122,24 @@ ul {
}
.no-results{
    font-family: "Courier New", monospace;
    font-size: 25px;
    font-weight: bold;
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 50%;
    text-align: center;
}
.sad-platypus{
    display: block;
    margin-top: 120px;
    margin-left: auto;
    margin-right: auto;
    width: 50%;
    max-width: 200px;
    height: auto;
}
.results-word{
...
@@ -16,6 +16,9 @@
</form>
</div>
<div class="right-section">
<div class="crop-container">
<img src="{{ url_for('static', filename='img/search_platypus.png') }}" alt="Search platypus" class="search-platypus"/>
</div>
</div>
</div>
@@ -23,9 +26,16 @@
{% if result == "start" %}
{% elif result[0][0] is defined %}
<a class="results">Results for: </a> <div class="top-section">
<a class="results-word">{{ search }}</a> <div class="result-area">
<p class="result-length"> {{ result[1] }} Results</p> <a class="results">Results for: </a>
<a class="results-word">{{ search }}</a>
<p class="result-length"> {{ result[1] }} Results</p>
</div>
<img src="{{ url_for('static', filename='img/secret_perry.png') }}" class="secret-perry">
</div>
<ul>
{% for item in result[0] %}
@@ -38,6 +48,7 @@
{% endfor %}
</ul>
{% else %}
<img src="{{ url_for('static', filename='img/sad_platypus.png') }}" alt="Sad platypus" class="sad-platypus">
<a class="no-results">No Results</a> <a class="no-results">No Results</a>
{% endif %} {% endif %}
...