crawler.py
from queue import Queue
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Breadth-first crawl: the queue holds URLs still to fetch,
# visitedLinks records every URL that has already been queued.
# (A set would make the membership test faster, but a list keeps the visit order.)
queue = Queue()
visitedLinks = []
queue.put('https://vm009.rz.uos.de/crawl/index.html')
visitedLinks.append('https://vm009.rz.uos.de/crawl/index.html')

while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Only follow anchors that actually carry an href attribute.
    for a in soup.find_all('a', href=True):
        # Resolve relative links against the crawl base URL.
        url = urljoin('https://vm009.rz.uos.de/crawl/', a['href'])
        # Enqueue unseen URLs that stay inside the crawl site.
        if url not in visitedLinks and url.startswith('https://vm009.rz.uos.de/crawl'):
            print(url)
            print(a.text)
            queue.put(url)
            visitedLinks.append(url)

print(visitedLinks)