Commit 30655817 authored by srebers

some updates

parent 5917c73c
@@ -21,18 +21,21 @@ def time_passed(link):
 ix = indexing.get_index()

-ALLOWED_DOMAINS = ('minecraft.fandom.com',)
+ALLOWED_DOMAINS = ('en.wikipedia.org',)
+wikipedia = True
 crawl_cooldown = timedelta(hours=1)
 depth_limit = 2

 queue = Queue()
-queue.put(('https://minecraft.fandom.com/wiki/Crafting_Table', 0)) #initialize queue with start URL
+queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0)) #initialize queue with start URL
 visited_list = set() #set of already visited websites

 while not queue.empty():
-    current_url, depth = queue.get() #take the first element of the queue
-    visited_list.add(current_url) #working?
+    current_url, depth = queue.get() #take the first element of the queue
+    visited_list.add(current_url) #needed?
     r = requests.get(current_url, timeout=3) # get the current url
     soup = BeautifulSoup(r.content, 'html.parser')
     urltitle = str(soup.title.string)
@@ -42,15 +45,18 @@ while not queue.empty():
    writer.commit() # put crawler content into the index

    print(F"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

-    if depth <= depth_limit:
-        new_links = soup.find_all('a', href=True, limit=20) # find all links on this website
+    if depth < depth_limit:
+        if wikipedia: #only works on wikipedia
+            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20) # find all links on this website
+        else:
+            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links: # for each of the links on the current website
            href = urljoin(current_url, link['href']) # join the domain with its relative links
            url = urlparse(href)
-            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(url):
+            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(url) and "#" not in href:
                # if the url has the right format, is inside the allowed domains and has not been visited recently
                queue.put((href, depth + 1))
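The new '"#" not in href' check skips fragment links entirely, even when the underlying page itself is crawlable; the TODO "cut out url after #" in the Flask file below suggests the intended fix is to strip the anchor instead. A minimal sketch with the standard library's urldefrag:

from urllib.parse import urldefrag, urljoin

href = urljoin('https://en.wikipedia.org/wiki/Artificial_neural_network', '/wiki/Deep_learning#History')
href, fragment = urldefrag(href)  # split off the '#History' anchor, keep the page
print(href)  # https://en.wikipedia.org/wiki/Deep_learning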
......
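For orientation, a self-contained sketch of the crawl loop the two hunks above modify: breadth-first search over a queue of (url, depth) pairs, a visited set, and a per-URL cooldown. Indexing is omitted, and time_passed here is an assumed in-memory stand-in; the real helper sits above the hunk and its body is not shown in this diff.

from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

ALLOWED_DOMAINS = ('en.wikipedia.org',)
depth_limit = 2
crawl_cooldown = timedelta(hours=1)
last_crawled = {}  # hypothetical: URL -> time of last visit

def time_passed(href):
    # assumed semantics: True if the page was never fetched or the cooldown elapsed
    then = last_crawled.get(href)
    return then is None or datetime.now() - then > crawl_cooldown

queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))
visited = set()

while not queue.empty():
    current_url, depth = queue.get()
    if current_url in visited:
        continue
    visited.add(current_url)
    last_crawled[current_url] = datetime.now()
    try:
        r = requests.get(current_url, timeout=3)
    except requests.RequestException:
        continue  # skip unreachable pages instead of crashing the whole crawl
    soup = BeautifulSoup(r.content, 'html.parser')
    if depth < depth_limit:
        for link in soup.find_all('a', href=True, limit=20):
            href = urljoin(current_url, link['href'])  # resolve relative links
            if urlparse(href).hostname in ALLOWED_DOMAINS and href not in visited and time_passed(href):
                queue.put((href, depth + 1))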
import os.path
from queue import Queue
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')
queue = Queue()
visited_links = []
start_site = 'https://vm009.rz.uos.de/crawl/index.html'
base_site = 'https://vm009.rz.uos.de/crawl/'
queue.put(start_site)
visited_links.append(start_site)
while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # print(r.content)
    for l in soup.find_all("a", href=True):  # skip anchors without an href
        url = urljoin(base_site, l['href'])  # resolve relative links to absolute URLs
        if url not in visited_links and url.startswith(base_site):
            print(url)
            print(l.text)
            queue.put(url)  # enqueue the URL string, not a ParseResult
            visited_links.append(url)

print(visited_links)
@@ -3,23 +3,38 @@ import indexing
 # Retrieving data
 from whoosh.qparser import QueryParser
 import traceback
 from flask import Flask, request, render_template

 #TODO:
 # fix empty results
 # cut out url after #
 # check if timedelta works
 # clean up code
 # annotate
 # features: language, result pages

 def search_index(keyword: str):
     ix = indexing.get_index()
     with ix.searcher() as searcher:
         result_list = []
         query = QueryParser("content", ix.schema).parse(keyword)
-        results = searcher.search(query)
-        results.fragmenter.surround = 300
+        results = searcher.search(query, limit=100)
+        results.fragmenter.surround = 50
         for r in results:
             result_list.append((r.fields()["url"], r.highlights("content"), r.fields()["title"]))
         #TODO: ??? searcher.close()
-        return result_list
+        return result_list, len(results)
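On the searcher.close() TODO: the "with ix.searcher() as searcher:" block already closes the searcher on exit, so the explicit call can simply be dropped. The indexing module itself is not part of this diff; a minimal sketch of what get_index() might wrap, assuming a Whoosh index with the url, title and content fields used above (content must be stored for highlights("content") to have text to work with), and with the directory name as a pure assumption:

import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

INDEX_DIR = "indexdir"  # hypothetical location; the real module is not shown in this diff

schema = Schema(
    url=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),  # stored so r.highlights("content") works
)

def get_index():
    # create the index on first use, otherwise open the existing one
    if not index.exists_in(INDEX_DIR):
        os.makedirs(INDEX_DIR, exist_ok=True)
        return index.create_in(INDEX_DIR, schema)
    return index.open_dir(INDEX_DIR)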
 app = Flask(__name__)

 @app.errorhandler(500)
 def internal_error(exception):
     return "<pre>"+traceback.format_exc()+"</pre>"

 @app.route("/")
 def start():
     search_word = request.args.get('search_word')
......
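The start() route is truncated here; a hypothetical completion wired to the new (result_list, hit_count) return value of search_index(), with the template name "result.html" assumed rather than taken from the diff:

@app.route("/")
def start():
    search_word = request.args.get('search_word')
    if not search_word:
        # no query yet: hits the template's {% if result == "start" %} branch
        return render_template("result.html", result="start", search="")
    result = search_index(search_word)  # -> ([(url, highlight, title), ...], count)
    return render_template("result.html", result=result, search=search_word)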
from myapp import app
application = app
\ No newline at end of file
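This two-line shim exposes the Flask app under the module-level name "application" that WSGI servers such as mod_wsgi and gunicorn look up by convention. A quick local smoke test with the standard library's reference server, assuming the shim is saved as wsgi.py:

from wsgiref.simple_server import make_server

from wsgi import application  # assumes the shim above lives in wsgi.py

# Flask's app object is itself a WSGI callable, so the stdlib server can host it
make_server('127.0.0.1', 8000, application).serve_forever()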
@@ -53,6 +53,7 @@ ul {
     border: none;
     margin-top: 13px;
     margin-right: 5px;
+    transition: background-color 0.5s;
 }

@@ -87,6 +88,7 @@ ul {
     font-family: "Courier New", monospace;
     font-size: 25px;
     font-weight: bold;
 }

 .no-results{

@@ -103,6 +105,13 @@ ul {
     font-style: italic;
 }

+.result-length{
+    margin-left: 15px;
+    margin-top: 0px;
+    font-family: "Courier New", monospace;
+    font-size: 15px;
+}

 .result-box{
     margin-left: 15px;
     margin-right: 15px;
......
<!DOCTYPE html>
<html>
<head>
    <title>{{ title }}</title>
</head>
<body>
    <h1>Results:</h1>
    <ul>
    {% for item in result %}
        <li>
            <a href="{{ item }}">{{ item }}</a>
        </li>
    {% endfor %}
    </ul>
</body>
</html>
\ No newline at end of file
@@ -22,11 +22,13 @@
 {% if result == "start" %}
-{% elif result[0] is defined %}
+{% elif result[0][0] is defined %}
     <a class="results">Results for: </a>
     <a class="results-word">{{ search }}</a>
+    <p class="result-length"> {{ result[1] }} Results</p>
     <ul>
-    {% for item in result %}
+    {% for item in result[0] %}
         <li>
+            <div class="result-box">
                 <a href="{{ item[0] }}">{{ item[2] }}</a>
......
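The template changes track the new return type of search_index(): result is now a (result_list, hit_count) tuple, so the loop walks result[0] and the counter reads result[1]. A standalone round-trip of that contract, with made-up result data purely for illustration:

from jinja2 import Template

tmpl = Template(
    '{{ result[1] }} Results\n'
    '{% for item in result[0] %}{{ item[2] }} -> {{ item[0] }}\n{% endfor %}'
)
result = ([('https://example.org/wiki/Example', 'highlighted snippet', 'Example title')], 1)
print(tmpl.render(result=result))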