Commit 13590527 authored by srebers

crawler and index working, not clean & correctly commented

parent c5e90b30
import os
from queue import Queue
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

import indexing

# Index:
ix = indexing.get_index()

ALLOWED_DOMAINS = ('vm009.rz.uos.de',)  # hostnames the crawler is allowed to visit

queue = Queue()
queue.put('https://vm009.rz.uos.de/crawl/index.html')  # initialize queue with start URL
visited_list = set()  # set of already visited URLs
visited_list.add('https://vm009.rz.uos.de/crawl/index.html')  # mark the start URL so it is not queued again

while not queue.empty():
    current_url = queue.get()  # take the first element of the queue
    if True:  # TODO: Implement when index is added
        r = requests.get(current_url, timeout=3)  # fetch the current url
        print(r.url)
        soup = BeautifulSoup(r.content, 'html.parser')
        urltitle = str(soup.title.string)
        writer = ix.writer()
        writer.update_document(title=urltitle, url=r.url, content=str(soup.text))  # add the page's title, URL and text to the index
        writer.commit()  # write the new document to the index on disk

        new_links = soup.find_all('a', href=True)  # find all links on this website
        for link in new_links:  # for each of the links on the current website
            href = urljoin(current_url, link['href'])  # join the current page's URL with its relative links
            url = urlparse(href)
            if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list:
                # if the url has the right format, is inside the allowed domains and has not been visited yet
                queue.put(href)
                visited_list.add(href)
                # TODO: Analyze it and update the index

print(visited_list)
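
For reference, the link filter near the end of the crawl loop combines urljoin, urlparse and os.path.splitext. Below is a minimal standalone sketch of that check with made-up example links; only the standard-library behaviour shown here is assumed.

import os
from urllib.parse import urljoin, urlparse

ALLOWED_DOMAINS = ('vm009.rz.uos.de',)
current_url = 'https://vm009.rz.uos.de/crawl/index.html'

for raw_href in ('page2.html', 'sub/dir/', 'https://example.com/file.pdf'):  # made-up sample links
    href = urljoin(current_url, raw_href)         # resolve the link relative to the current page
    url = urlparse(href)                          # split into scheme, hostname, path, ...
    extension = os.path.splitext(url.path)[1]     # '.html', '.pdf' or '' for extension-less paths
    keep = extension in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS
    print(href, '->', 'crawl' if keep else 'skip')
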
import os.path
from queue import Queue
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')

queue = Queue()
visited_links = []

start_site = 'https://vm009.rz.uos.de/crawl/index.html'
base_site = 'https://vm009.rz.uos.de/crawl/'

queue.put(start_site)
visited_links.append(start_site)

while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # print(r.content)
    for l in soup.find_all("a"):
        url = urlparse(urljoin(base_site, l['href']))
        print(os.path.split(url))
        if url not in visited_links and base_site in url:
            print(url)
            print(l.text)
            queue.put(url)
            visited_links.append(url)

print(visited_links)
import os
from whoosh.fields import *
from whoosh.index import create_in
from whoosh import index
# Here, the structure of index entries is defined. You can add more fields with metadata, computed values etc.,
# and use them for searching and ranking.
# We only use a title, the page URL and the text.
#
# The "stored" attribute is used for all parts that we want to be able to fully retrieve from the index
ix_schema = Schema(title=TEXT(stored=True), url=ID(stored=True, unique=True), content=TEXT)
# Create an index if not created or open an existing, then return it
def get_index():
    if not os.path.exists("indexdir"):
        os.makedirs("indexdir")
        return create_in("indexdir", schema=ix_schema)
    return index.open_dir("indexdir")
# # now let's add some texts (=documents)
# writer.add_document(title=u"First document", content=u"This is the first document we've added!")
# writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
# writer.add_document(title=u"Songtext", content=u"Music was my first love and it will be the last")
#
# # write the index to the disk
# writer.commit()
# # Retrieving data
# from whoosh.qparser import QueryParser
#
# with ix.searcher() as searcher:
#     # find entries with the words 'first' AND 'last'
#     query = QueryParser("content", ix.schema).parse("first last")
#     results = searcher.search(query)
#
#     # print all results
#     for r in results:
#         print(r)
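
Because url is declared as a unique, stored ID field in the schema, the crawler can call writer.update_document() on every fetch: Whoosh removes any existing document with the same value in a unique field before adding the new one, so re-crawling a page replaces its entry rather than duplicating it. A small sketch of that behaviour against a throw-away index; the directory name and the texts are made up for illustration.

import os
from whoosh.index import create_in
import indexing

if not os.path.exists("demo_indexdir"):          # throw-away index directory, name made up
    os.makedirs("demo_indexdir")
ix = create_in("demo_indexdir", schema=indexing.ix_schema)

writer = ix.writer()
writer.update_document(title=u"Old title", url=u"https://vm009.rz.uos.de/crawl/index.html",
                       content=u"text from the first crawl")
writer.commit()

writer = ix.writer()
writer.update_document(title=u"New title", url=u"https://vm009.rz.uos.de/crawl/index.html",
                       content=u"text from the second crawl")
writer.commit()

with ix.searcher() as searcher:
    print(searcher.doc_count())                  # 1: the second call replaced the first entry
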
myapp.py 0 → 100644
from whoosh.fields import *
import indexing
# Retrieving data
from whoosh.qparser import QueryParser
ix = indexing.get_index()
with ix.searcher() as searcher:
    # parse the search terms typed by the user against the "content" field
    query = QueryParser("content", ix.schema).parse(input("search for something:\n"))
    results = searcher.search(query)

    # print all results
    for r in results:
        print(r)
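
Printing a Hit only shows its stored fields. A slightly richer result loop (not part of this commit) could read the stored title and url individually and cap the number of hits via the limit parameter of search(); the search term below is made up.

from whoosh.qparser import QueryParser

import indexing

ix = indexing.get_index()
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema).parse("example")  # made-up search term
    results = searcher.search(query, limit=10)                  # cap the number of hits returned
    for r in results:
        print(r['title'], '-', r['url'])                        # only stored fields can be read from a Hit
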