import os
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

import indexing
from myapp import search_index


def time_passed(link):
    """Return True only if the URL was crawled longer ago than the cooldown period set below."""
    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:
        # URL is not in the index yet, so it may be crawled
        return True
    finally:
        reader.close()


ix = indexing.get_index()

ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True
crawl_cooldown = timedelta(days=1)   # minimum time between two crawls of the same URL
depth_limit = 2                      # limit how far the crawler follows links from the start page

queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))  # initialize queue with the start URL at depth 0
visited_list = set()  # URLs visited during this run

while not queue.empty():
    current_url, depth = queue.get()   # take the next URL out of the queue
    visited_list.add(current_url)      # add the URL to the crawl history

    # fetch and parse the page
    r = requests.get(current_url, timeout=3)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    # write the page content into the index
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)
    writer.commit()

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

    if depth < depth_limit:
        if wikipedia:
            # only works on Wikipedia: restrict link extraction to the article body
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)  # first 20 links in the content body
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:  # for each link found on the current page
            href = urljoin(current_url, link['href'])  # resolve relative links against the current URL
            url = urlparse(href)
            # enqueue the URL if it has the right extension, is inside the allowed domains,
            # has not been crawled recently and is not a fragment link
            if (os.path.splitext(url.path)[1] in ('.html', '', '.htm')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and time_passed(href)
                    and "#" not in href):
                queue.put((href, depth + 1))

print(visited_list)
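
# ---------------------------------------------------------------------------
# Hypothetical sketch (not part of the crawler above, and the original
# `indexing` module is not shown here): one way `indexing.get_index()` could
# be implemented with Whoosh so that its schema matches the fields the
# crawler writes and reads (title, url, content, date). The directory name
# "indexdir" is an assumption.
# ---------------------------------------------------------------------------
# import os
# from whoosh import index
# from whoosh.fields import Schema, TEXT, ID, DATETIME
#
# INDEX_DIR = "indexdir"
#
# def get_index():
#     """Open the index if it exists, otherwise create it with the expected schema."""
#     schema = Schema(
#         title=TEXT(stored=True),
#         url=ID(stored=True, unique=True),  # unique key used by update_document()
#         content=TEXT(stored=True),
#         date=DATETIME(stored=True),        # stored so time_passed() can read it back
#     )
#     if not os.path.exists(INDEX_DIR):
#         os.mkdir(INDEX_DIR)
#     if index.exists_in(INDEX_DIR):
#         return index.open_dir(INDEX_DIR)
#     return index.create_in(INDEX_DIR, schema)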