crawler.py
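"""Simple breadth-first web crawler that feeds the project's search index.

Starts from a seed Wikipedia article, follows links up to `depth_limit`,
and stores each page's title, URL, text content and crawl time in the
index. Pages are only re-crawled after `crawl_cooldown` has passed.
"""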
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

from myapp import search_index

# Index object used below (the reader/writer calls match the Whoosh API);
# the attribute name on search_index is an assumption.
ix = search_index.ix
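# `ix` is assumed to be a Whoosh index whose schema matches the fields used
# below (url, title, content, date). A minimal sketch of what
# myapp/search_index.py might define (illustrative only; the real module may
# differ):
#
#   import os
#   from whoosh import index
#   from whoosh.fields import Schema, TEXT, ID, DATETIME
#
#   schema = Schema(url=ID(stored=True, unique=True),
#                   title=TEXT(stored=True),
#                   content=TEXT(stored=True),
#                   date=DATETIME(stored=True))
#
#   if not os.path.exists("indexdir"):
#       os.mkdir("indexdir")
#       ix = index.create_in("indexdir", schema)
#   else:
#       ix = index.open_dir("indexdir")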
# Returns True only if the time since the URL was last crawled is longer than
# the cooldown period set further down.
def time_passed(link):
    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:
        # URL is not in the index yet (or has no stored date), so crawl it.
        return True
    finally:
        reader.close()
ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True   # restrict link extraction to the Wikipedia article body

crawl_cooldown = timedelta(days=1)   # minimum time before a page may be re-crawled
depth_limit = 2   # limit how deep the crawler follows links

queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))   # initialize queue with the start URL at depth 0

visited_list = set()   # set of already visited URLs
while not queue.empty():
    current_url, depth = queue.get()   # take the next URL out of the queue
    visited_list.add(current_url)      # add the URL to the crawl history

    # extract the website content
    r = requests.get(current_url, timeout=3)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    # put the crawled content into the index
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)   # add the page's information to the index
    writer.commit()

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
    if depth < depth_limit:
        if wikipedia:   # Wikipedia-specific: only follow links inside the article body
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)   # first 20 links in the content body of this page
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:   # for each link found on the current page
            href = urljoin(current_url, link['href'])   # resolve the link against the current URL
            url = urlparse(href)   # parse it so the path and hostname can be checked

            # queue the URL if it has the right format, is inside the allowed domains,
            # has not been crawled recently and is not a fragment URL
            if (os.path.splitext(url.path)[1] in ('.html', '', '.htm')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and time_passed(href)
                    and "#" not in href):
                queue.put((href, depth + 1))