crawler.py
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

from myapp import search_index as ix   # search index (Whoosh-style API) built in myapp
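# The crawler assumes that myapp exposes a Whoosh index whose schema stores the
# fields used below (title, url, content, date) and marks url as unique so that
# update_document() can replace re-crawled pages. A minimal sketch of such a
# setup (hypothetical "indexdir" path, for reference only):
#
#   from whoosh import index
#   from whoosh.fields import Schema, TEXT, ID, DATETIME
#
#   schema = Schema(title=TEXT(stored=True),
#                   url=ID(stored=True, unique=True),
#                   content=TEXT,
#                   date=DATETIME(stored=True))
#   os.makedirs("indexdir", exist_ok=True)
#   search_index = index.create_in("indexdir", schema)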
    
def time_passed(link):
    """Return True if the URL is not in the index yet or was last crawled more than crawl_cooldown ago."""
    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:   # URL not found in the index
        return True
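# Example: time_passed('https://en.wikipedia.org/wiki/Artificial_neural_network')
# is True if that URL has never been indexed or if its stored crawl date is
# older than crawl_cooldown.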
ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True   # restrict link extraction to Wikipedia's article body

crawl_cooldown = timedelta(hours=1)   # minimum time before a page is crawled again
depth_limit = 2                       # maximum crawl depth


queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))   # initialize the queue with the start URL at depth 0

visited_list = set()   # set of URLs that have already been crawled
while not queue.empty():
    current_url, depth = queue.get()   # take the first element of the queue
    visited_list.add(current_url)      # mark the URL as visited

    r = requests.get(current_url, timeout=3)   # fetch the current URL
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)   # store the page in the index
    writer.commit()   # persist the crawled content

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
    if depth < depth_limit:
        if wikipedia:   # only works on Wikipedia: restrict to the article body
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)   # find links on this page
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:   # for each of the links on the current page
            href = urljoin(current_url, link['href'])   # join the domain with its relative links
            url = urlparse(href)

            # follow the link only if it points to an HTML page, stays inside the
            # allowed domains, carries no fragment, and has not been visited or
            # crawled within the cooldown period
            if (os.path.splitext(url.path)[1] in ('.html', '.htm', '')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and "#" not in href
                    and time_passed(href)):
                queue.put((href, depth + 1))
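# Once the crawl finishes, the indexed pages can be searched with Whoosh, e.g.
# (sketch, assuming the schema outlined above):
#
#   from whoosh.qparser import QueryParser
#
#   with ix.searcher() as searcher:
#       query = QueryParser("content", ix.schema).parse("neural network")
#       for hit in searcher.search(query, limit=10):
#           print(hit["title"], hit["url"])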