import os
import requests
from queue import Queue
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import indexing  # the project's own indexing module, which provides get_index()
# Index: open the search index provided by the project's indexing module
# (a possible implementation of get_index() is sketched after this listing)
ix = indexing.get_index()

ALLOWED_DOMAINS = ('vm009.rz.uos.de',)  # hostnames the crawler is allowed to visit
queue = Queue()  # queue of URLs that still need to be crawled
queue.put('https://vm009.rz.uos.de/crawl/index.html')  # initialize queue with start URL
visited_list = set()  # set of URLs that have already been discovered and queued
while not queue.empty():  # keep crawling until no unvisited links remain
    current_url = queue.get()  # take the first element of the queue
    r = requests.get(current_url, timeout=3)  # fetch the current URL
    print(r.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text))  # add the page's title, URL and text to the index
    writer.commit()  # write the new document into the index
    new_links = soup.find_all('a', href=True)  # find all links on this page
    for link in new_links:  # for each of the links on the current page
        href = urljoin(current_url, link['href'])  # resolve the (possibly relative) link against the current URL
        url = urlparse(href)
        if os.path.splitext(url.path)[1] in ('.html', '.htm', '') and url.hostname in ALLOWED_DOMAINS and href not in visited_list:
            # only follow links that look like HTML pages, stay inside the allowed domain and have not been queued yet
            queue.put(href)
            visited_list.add(href)
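
The calls ix.writer(), writer.update_document() and writer.commit() match Whoosh's API, so ix is presumably a Whoosh index. For context, here is a minimal sketch of what the indexing module's get_index() could look like under that assumption; the directory name and field names are not part of the listing above, so treat them as illustrative.

# indexing.py -- minimal sketch, assuming a Whoosh index stored on disk
import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID

INDEX_DIR = 'indexdir'  # assumed directory for the index files

# url is the unique key, so update_document() can replace an existing entry
schema = Schema(title=TEXT(stored=True),
                url=ID(stored=True, unique=True),
                content=TEXT(stored=True))

def get_index():
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
    if index.exists_in(INDEX_DIR):
        return index.open_dir(INDEX_DIR)  # reuse an existing index
    return index.create_in(INDEX_DIR, schema)  # otherwise create a fresh one

Using update_document() instead of add_document() means re-crawling a page overwrites its old entry rather than duplicating it, which is why the url field is declared unique in this sketch.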