crawler.py
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

import indexing   # local module that creates/opens the search index


# Index:
ix = indexing.get_index()
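# The indexing module is not part of this file. Below is a minimal sketch of
# what it might provide, assuming Whoosh as the backend (update_document()
# only replaces an existing entry if the url field is declared unique=True).
# The field names match the writer call further down; everything else,
# including the directory name, is an assumption:
#
#   # indexing.py (hypothetical sketch)
#   import os
#   from whoosh.index import create_in, open_dir
#   from whoosh.fields import Schema, TEXT, ID
#
#   def get_index(dirname='indexdir'):
#       schema = Schema(title=TEXT(stored=True),
#                       url=ID(stored=True, unique=True),
#                       content=TEXT)
#       if not os.path.exists(dirname):
#           os.mkdir(dirname)
#           return create_in(dirname, schema)
#       return open_dir(dirname)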
    
ALLOWED_DOMAINS = ('vm009.rz.uos.de',)   # hostnames the crawler may visit; compared against url.hostname below, so no scheme or path
    
    
START_URL = 'https://vm009.rz.uos.de/crawl/index.html'

queue = Queue()
queue.put(START_URL)   # initialize the queue with the start URL

visited_list = {START_URL}   # set of URLs that have already been queued, so no page is crawled twice
    
while not queue.empty():
    current_url = queue.get()   # take the first element of the queue

    try:
        r = requests.get(current_url, timeout=3)   # fetch the current URL
    except requests.RequestException:
        continue   # skip pages that time out or cannot be reached
    print(r.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string) if soup.title else r.url   # fall back to the URL if the page has no <title>
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text))   # add the page's title, URL and text to the index
    writer.commit()   # write the document to the index

    new_links = soup.find_all('a', href=True)   # find all links on this page
    
    
    
    for link in new_links:   # for each of the links on the current page
        href = urljoin(current_url, link['href'])   # resolve relative links against the current URL
        url = urlparse(href)

        # follow the link only if it looks like an HTML page, stays on an
        # allowed host and has not been queued before
        if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list:
            queue.put(href)
            visited_list.add(href)
    
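# Once the crawl has finished, the index can be queried. A minimal sketch,
# assuming the hypothetical Whoosh backend outlined above (the search term
# "example" is just a placeholder):
#
#   from whoosh.qparser import QueryParser
#
#   with ix.searcher() as searcher:
#       query = QueryParser("content", ix.schema).parse("example")
#       for hit in searcher.search(query):
#           print(hit["title"], hit["url"])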