from urllib.parse import urlparse, urljoin
from myapp import search_index
def time_passed(link):
    """Report whether the crawl cooldown for *link* has elapsed.

    Looks up the document stored under ``link`` in the module-level index
    ``ix`` and compares the age of its stored ``date`` field against the
    module-level ``crawl_cooldown``.

    Args:
        link: URL to look up, either as a string or as an object exposing
            ``geturl()`` (such as the result of ``urllib.parse.urlparse``).

    Returns:
        True if no stored date can be retrieved for the URL, or if the
        stored date is older than ``crawl_cooldown``; False otherwise.
    """
    from datetime import datetime  # local import keeps this block self-contained

    # The lookup below is by URL text; flatten parsed URLs to a string first.
    if not isinstance(link, str) and hasattr(link, "geturl"):
        link = link.geturl()

    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
    except Exception:
        # Deliberate best effort: a URL whose entry or date cannot be read is
        # treated as never crawled, so the caller is free to fetch it.
        # NOTE(review): narrow this to the index library's "term not found"
        # error once that exception type is importable here.
        return True
    finally:
        # Release the reader on every path instead of leaking it per call.
        reader.close()

    # NOTE(review): assumes the stored date is a naive local datetime, like
    # datetime.now(); confirm against the code that writes the "date" field.
    return datetime.now() - link_time > crawl_cooldown
# --- Crawler state and configuration ---
# Index handle; read by time_passed() and used to create the writer below.
# NOTE(review): `indexing` is not among the imports visible in this part of
# the file (only `search_index` is) - confirm where it comes from.
ix = indexing.get_index()
# Minimum age a stored page must exceed before time_passed() reports True.
crawl_cooldown = timedelta(hours=1)
# Links are only collected from pages whose depth is below this limit.
depth_limit = 2
# NOTE(review): `queue` has no visible definition before this call, and the
# start URL is an empty string - confirm both against the full file.
queue.put(('', 0)) # seed the queue with a (start URL, depth 0) pair
visited_list = set() # URLs already taken from the queue (a set, despite the name)
# --- Fetch one queued page and store it in the index ---
current_url, depth = queue.get() # take the next (url, depth) pair from the queue
visited_list.add(current_url) # remember the URL; the link filter further down skips hrefs found in this set
# NOTE(review): no handling of request errors or non-200 responses is visible
# around this call - confirm whether a surrounding try block exists.
r = requests.get(current_url, timeout=3) # fetch the current URL, passing a 3-second timeout to requests
soup = BeautifulSoup(r.content, 'html.parser')
# NOTE(review): soup.title is None when the page has no <title> element, in
# which case this line raises AttributeError - confirm this is handled.
urltitle = str(soup.title.string)
# NOTE(review): the right-hand side of this assignment is missing in this
# view; the value is stored as the document's "date" field below, which
# time_passed() later reads - presumably the current time, confirm.
request_time =
writer = ix.writer()
writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time) # store title, response URL, page text and request time in the index
writer.commit()# commit the pending document to the index
# Progress line: remaining queue size, current depth and the URL just indexed.
print(F"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
if depth < depth_limit:
if wikipedia: #only works on wikipedia
new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20) # find all links on this website
new_links = soup.find_all('a', href=True, limit=20)
for link in new_links: # for each of the links on the current website
href = urljoin(current_url, link['href']) # join the domain with its relative links
url = urlparse(href)
if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list and time_passed(url) and "#" not in href:
# if the url has the right format, is inside the allowed domains and has not been visited recently