Skip to content
Snippets Groups Projects
Commit 5917c73c authored by srebers's avatar srebers
Browse files

Search engine working, still some bugs

parent ba1b4e00
No related branches found
No related tags found
No related merge requests found
import os
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup
import indexing
from myapp import search_index
def time_passed(link):
    """Return True if *link* should be (re-)crawled.

    Looks up the stored crawl timestamp for the URL string *link* in the
    index and compares it against the module-level ``crawl_cooldown``.
    A URL that is not in the index yet (or has no stored date) counts as
    stale and returns True.
    """
    reader = ix.reader()
    try:
        # first_id raises / returns None when the URL is not indexed yet;
        # stored_fields then fails, which we treat as "never crawled".
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:
        # URL not in the index (or missing a date field): crawl it.
        return True
    finally:
        # Readers hold index files open; always release them.
        reader.close()
# Index:
ix = indexing.get_index()

# Only follow links that stay on these hostnames.
ALLOWED_DOMAINS = ('minecraft.fandom.com',)
# Minimum age before a previously indexed page is fetched again.
crawl_cooldown = timedelta(hours=1)
# Maximum link depth from the start URL.
depth_limit = 2

queue = Queue()
queue.put(('https://minecraft.fandom.com/wiki/Crafting_Table', 0))  # initialize queue with (start URL, depth 0)
visited_list = set()  # URLs already crawled in this run

while not queue.empty():
    current_url, depth = queue.get()  # take the first element of the queue
    visited_list.add(current_url)
    r = requests.get(current_url, timeout=3)  # get the current url
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()
    writer = ix.writer()
    # "url" is the unique key of the schema, so update_document replaces
    # any previous entry for the same page instead of duplicating it.
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)
    writer.commit()  # put crawler content into the index
    print(F"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

    if depth <= depth_limit:
        new_links = soup.find_all('a', href=True, limit=20)  # find links on this website
        for link in new_links:  # for each of the links on the current website
            href = urljoin(current_url, link['href'])  # join the domain with its relative links
            url = urlparse(href)
            # Enqueue only plain HTML pages inside the allowed domains that
            # were neither visited in this run nor crawled recently.
            # NOTE: time_passed expects the URL *string* (the index stores
            # r.url), not the urlparse() result.
            if (os.path.splitext(url.path)[1] in ('.html', '', '.htm')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and time_passed(href)):
                queue.put((href, depth + 1))

print(visited_list)
# Schema of the search index.
#
# The "stored" attribute is used for all parts that we want to be able to
# fully retrieve from the index. "url" is declared unique so that
# update_document() replaces an existing entry for the same page.
ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT(stored=True), date=DATETIME(stored=True))


# Create an index if not created or open an existing, then return it
def get_index():
    """Return the whoosh index stored in "indexdir".

    Creates the directory and a fresh index on first use; afterwards the
    existing index is opened and reused.
    """
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
        return create_in("indexdir", schema=ix_schema)
    return index.open_dir("indexdir")
def search_index(keyword: str):
    """Search the index for *keyword*.

    Returns a list of (url, highlighted-snippet, title) tuples, one per
    matching document. Results are collected inside the "with" block
    because they must be read while the searcher is still open; the
    context manager then closes the searcher automatically.
    """
    ix = indexing.get_index()
    with ix.searcher() as searcher:
        result_list = []
        query = QueryParser("content", ix.schema).parse(keyword)
        results = searcher.search(query)
        # Show up to ~300 characters of context around each matched term.
        results.fragmenter.surround = 300
        for r in results:
            result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
        return result_list


app = Flask(__name__)


@app.route("/")
def start():
    """Render the start page; include search results if a word was submitted."""
    search_word = request.args.get('search_word')
    if search_word is None:
        # No query yet: the template shows only the search box.
        return render_template("start.html", title="start", result="start")
    return render_template("start.html", title=search_word, search=search_word, result=search_index(search_word))
body {
    padding-top: 80px; /* keep content clear of the fixed header */
    background-color: rgb(230, 224, 255); /* was 258 — channel values are clamped to 255 */
}

ul {
    list-style-type: none;
    padding: 0;
}

/* Fixed three-column header bar: logo | search form | (spacer). */
.header {
    height: 80px;
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    display: grid;
    grid-template-columns: 1fr 1fr 1fr;
    background-color: rgb(160, 154, 188);
    z-index: 100; /* stay above result boxes while scrolling */
}

.left-section {
}

.middle-section {
    text-align: center;
}

.right-section {
}

.logo {
    font-family: "Courier New", monospace;
    font-size: 50px;
    margin-top: 10px;
    margin-left: 20px;
}

.search-box {
    padding: 15px 16px;
    font-size: 18px;
    font-weight: bold;
    font-family: "Courier New", monospace;
    color: black;
    background-color: rgb(230, 224, 255); /* was 258 — clamped to 255 */
    border-radius: 15px;
    border: none;
    margin-top: 13px;
    margin-right: 5px;
}

.submit-box {
    cursor: pointer;
    border-radius: 15px;
    padding: 15px 16px;
    font-size: 18px;
    font-weight: bold;
    font-family: "Courier New", monospace;
    color: black;
    background-color: rgb(230, 224, 255); /* was 258 — clamped to 255 */
    border-style: solid;
    border-width: 2px; /* was the invalid property "border-size" */
    border-color: black;
    margin-top: 13px;
    transition: background-color 0.3s;
}

.search-box:focus {
    background-color: white;
    outline: none;
}

.submit-box:hover {
    background-color: white;
}

.results {
    margin-left: 15px;
    font-family: "Courier New", monospace;
    font-size: 25px;
    font-weight: bold;
}

.no-results {
    margin-left: 15px;
    font-family: "Courier New", monospace;
    font-size: 25px;
    font-weight: bold;
}

.results-word {
    font-family: "Courier New", monospace;
    font-size: 25px;
    font-weight: bold;
    font-style: italic;
}

.result-box {
    margin-left: 15px;
    margin-right: 15px;
    padding: 10px;
    margin-bottom: 15px;
    background-color: white;
    border-radius: 10px;
    /* blur is the third box-shadow value; the former "blur-radius"
       property does not exist in CSS and was ignored by browsers */
    box-shadow: 5px 5px 5px 5px grey;
}
<html>
<head>
    <title>{{ title }}</title>
    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}"/>
</head>
<body>
    <!-- Fixed header: logo on the left, search form in the middle. -->
    <div class="header">
        <div class="left-section">
            <p class="logo">Search Engine</p>
        </div>
        <div class="middle-section">
            <!-- Submits back to the same route as ?search_word=... -->
            <form action="" method="GET">
                <input class="search-box" type="text" name="search_word" placeholder="search...">
                <input class="submit-box" type="submit" value="Go">
            </form>
        </div>
        <div class="right-section">
        </div>
    </div>
    {% if result == "start" %}
        {# Initial page load: no query yet, show nothing below the header. #}
    {% elif result[0] is defined %}
        <a class="results">Results for: </a>
        <a class="results-word">{{ search }}</a>
        <ul>
            {% for item in result %}
            <li>
                <div class="result-box">
                    {# item = (url, highlighted snippet, title) #}
                    <a href={{ item[0] }}> {{ item[2] }} </a>
                    <p> {{ item[1]|safe }} </p>
                </div>
            </li>
            {% endfor %}
        </ul>
    {% else %}
        <a class="no-results">No Results</a>
    {% endif %}
</body>
</html>
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment