diff --git a/crawler_alt.py b/crawler_alt.py
deleted file mode 100644
index 420487d576d247f4ea807eee47bc8313b2c5fcbd..0000000000000000000000000000000000000000
--- a/crawler_alt.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os.path
-from queue import Queue
-from urllib.parse import urljoin, urlparse
-
-import requests
-from bs4 import BeautifulSoup
-
-# r = requests.get('https://vm009.rz.uos.de/crawl/index.html')
-
-
-queue = Queue()
-visited_links = []
-start_site = 'https://vm009.rz.uos.de/crawl/index.html'
-base_site = 'https://vm009.rz.uos.de/crawl/'
-queue.put(start_site)
-visited_links.append(start_site)
-
-while not queue.empty():
-    link = queue.get()
-    r = requests.get(link)
-    soup = BeautifulSoup(r.content, 'html.parser')
-    # print(r.content)
-    for l in soup.find_all("a"):
-        url = urlparse(urljoin(base_site, l['href']))
-        print(os.path.split(url))
-        if url not in visited_links and base_site in url :
-            print(url)
-            print(l.text)
-            queue.put(url)
-            visited_links.append(url)
-
-print(visited_links)
-
-
-
-
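
A note on the deleted logic: crawler_alt.py enqueues the ParseResult returned by urlparse, so `base_site in url` tests membership in a tuple and the next `requests.get(link)` receives a tuple rather than a URL string. A minimal sketch of the same breadth-first crawl working on plain URL strings might look like the following; the `href=True` filter, the fragment stripping via urldefrag, and the set-based visited_links are assumptions for illustration, not code from this repository.

from queue import Queue
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

start_site = 'https://vm009.rz.uos.de/crawl/index.html'
base_site = 'https://vm009.rz.uos.de/crawl/'

queue = Queue()
visited_links = {start_site}   # a set makes the membership test O(1)
queue.put(start_site)

while not queue.empty():
    link = queue.get()
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        # Resolve relative hrefs against the page they appear on and drop
        # any '#fragment' so variants of the same page are not re-queued.
        url, _ = urldefrag(urljoin(link, anchor['href']))
        # Stay on the crawl site and skip anything already seen.
        if url.startswith(base_site) and url not in visited_links:
            visited_links.add(url)
            queue.put(url)

print(visited_links)

Resolving each href against the page it was found on (rather than the fixed base_site) also keeps relative links from subdirectories pointing at the right place.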