crawler.py

from datetime import datetime, timedelta
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

from myapp import search_index

# assumption: search_index builds the index (e.g. with Whoosh) and exposes it as "ix"
ix = search_index.ix
    
def time_passed(link):
    """Return True if the URL is not in the index yet or its last crawl is older than crawl_cooldown."""
    reader = ix.reader()
    try:
        link_time = reader.stored_fields(reader.first_id("url", link))["date"]
        time_difference = datetime.now() - link_time
        return time_difference > crawl_cooldown
    except Exception:   # URL has never been indexed -> treat it as due for crawling
        return True
    finally:
        reader.close()
    
ALLOWED_DOMAINS = ('minecraft.fandom.com',)
crawl_cooldown = timedelta(hours=1)   # minimum age before a page is crawled again
depth_limit = 2                       # maximum link depth from the start URL

queue = Queue()
queue.put(('https://minecraft.fandom.com/wiki/Crafting_Table', 0))   # initialize the queue with the start URL

visited_list = set()   # set of URLs already visited in this run
    
while not queue.empty():
    current_url, depth = queue.get()            # take the first element of the queue
    visited_list.add(current_url)               # mark it as visited
    r = requests.get(current_url, timeout=3)    # fetch the current URL
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string)
    request_time = datetime.now()

    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text), date=request_time)   # store title, URL, text and crawl time for this page
    writer.commit()   # write the crawled content into the index

    print(f"Queue length: {queue.qsize()}, searching on depth: {depth}, site: {r.url}")

    if depth <= depth_limit:
        new_links = soup.find_all('a', href=True, limit=20)   # find all links on this page
    
    
        for link in new_links:                          # for each link on the current page
            href = urljoin(current_url, link['href'])   # resolve relative links against the current URL
            url = urlparse(href)

            # follow the link only if it looks like an HTML page, stays inside the allowed
            # domains, has not been visited in this run and is past the crawl cooldown
            if (os.path.splitext(url.path)[1] in ('.html', '', '.htm')
                    and url.hostname in ALLOWED_DOMAINS
                    and href not in visited_list
                    and time_passed(href)):
                queue.put((href, depth + 1))
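
For context, the crawler expects myapp.search_index to provide an index object "ix" whose schema stores the four fields written above. A minimal sketch of such a module, assuming Whoosh and an index directory named "indexdir" (both assumptions, not part of the original snippet):

import os

from whoosh import index
from whoosh.fields import Schema, TEXT, ID, DATETIME

# schema matching the fields used by crawler.py; "url" is the unique key
# that writer.update_document() needs in order to replace re-crawled pages
schema = Schema(title=TEXT(stored=True),
                url=ID(stored=True, unique=True),
                content=TEXT(stored=True),
                date=DATETIME(stored=True))

# create the index directory on the first run, otherwise reopen the existing index
if not os.path.exists("indexdir"):
    os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
else:
    ix = index.open_dir("indexdir")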