crawler.py
from queue import Queue
from urllib.parse import urlparse, urljoin
import os

import requests
from bs4 import BeautifulSoup

import indexing   # local module that creates/opens the search index


# Index:
ix = indexing.get_index()
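# The indexing module is not part of this file. Below is a minimal sketch of
# what it might provide, assuming Whoosh as the backend (update_document()
# only replaces an existing entry if the url field is declared unique=True).
# The field names match the writer call further down; everything else,
# including the directory name, is an assumption:
#
#   # indexing.py (hypothetical sketch)
#   import os
#   from whoosh.index import create_in, open_dir
#   from whoosh.fields import Schema, TEXT, ID
#
#   def get_index(dirname='indexdir'):
#       schema = Schema(title=TEXT(stored=True),
#                       url=ID(stored=True, unique=True),
#                       content=TEXT)
#       if not os.path.exists(dirname):
#           os.mkdir(dirname)
#           return create_in(dirname, schema)
#       return open_dir(dirname)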
    
ALLOWED_DOMAINS = ('vm009.rz.uos.de',)   # hostnames the crawler may visit; compared against url.hostname below, so no scheme or path
    
    
START_URL = 'https://vm009.rz.uos.de/crawl/index.html'

queue = Queue()
queue.put(START_URL)   # initialize the queue with the start URL

visited_list = {START_URL}   # set of URLs that have already been queued, so no page is crawled twice
    
while not queue.empty():
    current_url = queue.get()   # take the first element of the queue

    try:
        r = requests.get(current_url, timeout=3)   # fetch the current URL
    except requests.RequestException:
        continue   # skip pages that time out or cannot be reached
    print(r.url)
    soup = BeautifulSoup(r.content, 'html.parser')
    urltitle = str(soup.title.string) if soup.title else r.url   # fall back to the URL if the page has no <title>
    writer = ix.writer()
    writer.update_document(title=urltitle, url=r.url, content=str(soup.text))   # add the page's title, URL and text to the index
    writer.commit()   # write the document to the index

    new_links = soup.find_all('a', href=True)   # find all links on this page
    
    
    
    for link in new_links:   # for each of the links on the current page
        href = urljoin(current_url, link['href'])   # resolve relative links against the current URL
        url = urlparse(href)

        # follow the link only if it looks like an HTML page, stays on an
        # allowed host and has not been queued before
        if os.path.splitext(url.path)[1] in ('.html', '', '.htm') and url.hostname in ALLOWED_DOMAINS and href not in visited_list:
            queue.put(href)
            visited_list.add(href)
    
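# Once the crawl has finished, the index can be queried. A minimal sketch,
# assuming the hypothetical Whoosh backend outlined above (the search term
# "example" is just a placeholder):
#
#   from whoosh.qparser import QueryParser
#
#   with ix.searcher() as searcher:
#       query = QueryParser("content", ix.schema).parse("example")
#       for hit in searcher.search(query):
#           print(hit["title"], hit["url"])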