import os
from queue import Queue
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

import indexing

# search index provided by the local indexing module
ix = indexing.get_index()

# only follow links whose hostname is in this tuple
ALLOWED_DOMAINS = ('vm009.rz.uos.de',)

start_url = 'https://vm009.rz.uos.de/crawl/index.html'
queue = Queue()
queue.put(start_url)  # initialize the queue with the start URL
visited_list = {start_url}  # set of URLs that have already been queued, so no page is crawled twice

while not queue.empty():
    current_url = queue.get()  # take the next URL from the queue

    try:
        r = requests.get(current_url, timeout=3)  # fetch the current page
    except requests.RequestException:
        continue  # skip pages that could not be fetched
    if not r.ok:
        continue
    print(r.url)

    soup = BeautifulSoup(r.content, 'html.parser')
    # fall back to the URL if the page has no <title>
    urltitle = str(soup.title.string) if soup.title and soup.title.string else r.url

    # add the page's title, URL and text content to the index
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=soup.text)
    writer.commit()

    new_links = soup.find_all('a', href=True)  # find all links on this page
    for link in new_links:
        href = urljoin(current_url, link['href'])  # resolve relative links against the current page
        url = urlparse(href)
        # follow the link only if it looks like an HTML page, stays inside
        # the allowed domains and has not been queued before
        if (os.path.splitext(url.path)[1] in ('.html', '.htm', '')
                and url.hostname in ALLOWED_DOMAINS
                and href not in visited_list):
            queue.put(href)
            visited_list.add(href)

print(visited_list)
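
# ---------------------------------------------------------------------------
# Note: the `indexing` module imported above lives in a separate file that is
# not shown here. The function below is only a minimal sketch of what its
# get_index() might look like, assuming a Whoosh index whose fields match the
# update_document() call above (title, url, content); the "indexdir"
# directory name and the function name are assumptions for illustration, and
# this sketch is not used by the script itself.
# ---------------------------------------------------------------------------
def _example_get_index(index_dir='indexdir'):
    # imports kept local so this sketch adds no hard dependency to the script
    from whoosh import index
    from whoosh.fields import Schema, TEXT, ID

    # url is declared unique so update_document() can replace an existing
    # entry for the same page instead of adding a duplicate
    schema = Schema(title=TEXT(stored=True),
                    url=ID(stored=True, unique=True),
                    content=TEXT(stored=True))
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)
    return index.create_in(index_dir, schema)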