Newer
Older
from bs4 import BeautifulSoup
# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')
# Breadth-first crawl starting at START_URL. Every link that stays under
# CRAWL_PREFIX is fetched exactly once; each newly discovered URL is printed
# together with its anchor text, and the full list of discovered URLs is
# printed (in discovery order) once the queue is exhausted.
START_URL = 'https://vm009.rz.uos.de/crawl/index.html'
CRAWL_PREFIX = 'https://vm009.rz.uos.de/crawl'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

queue = Queue()
visitedLinks = [START_URL]  # discovery order, printed at the end
seen = {START_URL}          # same URLs as visitedLinks, for O(1) membership tests
queue.put(START_URL)

while not queue.empty():
    link = queue.get()
    r = requests.get(link, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.content, 'html.parser')
    # href=True skips <a> tags that carry no href attribute (e.g. named
    # anchors), which would otherwise raise KeyError on anchor['href'].
    for anchor in soup.find_all("a", href=True):
        # Resolve relative links against the page they were found on, not a
        # fixed base directory, so links on nested pages resolve correctly.
        url = urljoin(link, anchor['href'])
        # Prefix test (not substring) keeps the crawl on the target site even
        # if a foreign URL happens to contain the prefix somewhere inside it.
        if url in seen or not url.startswith(CRAWL_PREFIX):
            continue
        print(url)
        print(anchor.text)
        seen.add(url)
        visitedLinks.append(url)
        queue.put(url)

print(visitedLinks)