crawler.py
from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

from myapp import search_index as ix   # search index (Whoosh-style API) built in myapp
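# The crawler assumes that myapp exposes a Whoosh index whose schema stores the
# fields used below (title, url, content, date) and marks url as unique so that
# update_document() can replace re-crawled pages. A minimal sketch of such a
# setup (hypothetical "indexdir" path, for reference only):
#
#   from whoosh import index
#   from whoosh.fields import Schema, TEXT, ID, DATETIME
#
#   schema = Schema(title=TEXT(stored=True),
#                   url=ID(stored=True, unique=True),
#                   content=TEXT,
#                   date=DATETIME(stored=True))
#   os.makedirs("indexdir", exist_ok=True)
#   search_index = index.create_in("indexdir", schema)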
    
def time_passed(link):
    """Return True if the URL is not in the index yet or was last crawled more than crawl_cooldown ago."""
    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:   # URL not found in the index
        return True
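# Example: time_passed('https://en.wikipedia.org/wiki/Artificial_neural_network')
# is True if that URL has never been indexed or if its stored crawl date is
# older than crawl_cooldown.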
ALLOWED_DOMAINS = ('en.wikipedia.org',)
wikipedia = True   # restrict link extraction to Wikipedia's article body

crawl_cooldown = timedelta(hours=1)   # minimum time before a page is crawled again
depth_limit = 2                       # maximum crawl depth


queue = Queue()
queue.put(('https://en.wikipedia.org/wiki/Artificial_neural_network', 0))   # initialize the queue with the start URL at depth 0

visited_list = set()   # set of URLs that have already been crawled
while not queue.empty():
    current_url, depth = queue.get()   # take the first element of the queue
    visited_list.add(current_url)      # mark the URL as visited

    r = requests.get(current_url, timeout=3)   # fetch the current URL
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)   # store the page in the index
    writer.commit()   # persist the crawled content

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")
    if depth < depth_limit:
        if wikipedia:   # only works on Wikipedia: restrict to the article body
            new_links = soup.find(id="bodyContent").find_all('a', href=True, limit=20)   # find links on this page
        else:
            new_links = soup.find_all('a', href=True, limit=20)

        for link in new_links:   # for each of the links on the current page
            href = urljoin(current_url, link['href'])   # join the domain with its relative links
            url = urlparse(href)

            # follow the link only if it points to an HTML page, stays inside the
            # allowed domains, carries no fragment, and has not been visited or
            # crawled within the cooldown period
            if (os.path.splitext(url.path)[1] in ('.html', '.htm', '')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and "#" not in href
                    and time_passed(href)):
                queue.put((href, depth + 1))
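# Once the crawl finishes, the indexed pages can be searched with Whoosh, e.g.
# (sketch, assuming the schema outlined above):
#
#   from whoosh.qparser import QueryParser
#
#   with ix.searcher() as searcher:
#       query = QueryParser("content", ix.schema).parse("neural network")
#       for hit in searcher.search(query, limit=10):
#           print(hit["title"], hit["url"])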