Skip to content
Snippets Groups Projects
Commit c5e90b30 authored by srebers's avatar srebers
Browse files

second commit

parent fc4bbf92
No related branches found
No related tags found
No related merge requests found
venv
from queue import Queue
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Breadth-first crawl of every page reachable under CRAWL_PREFIX, starting at
# START_URL. For each newly discovered in-scope link the script prints its
# absolute URL and its anchor text; once the frontier is empty it prints the
# list of all visited URLs in discovery order.
START_URL = 'https://vm009.rz.uos.de/crawl/index.html'
CRAWL_PREFIX = 'https://vm009.rz.uos.de/crawl'
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without a timeout

queue = Queue()
visitedLinks = [START_URL]  # discovery order, printed at the end
seenLinks = {START_URL}  # same URLs as a set for O(1) membership tests
queue.put(START_URL)

while not queue.empty():
    link = queue.get()
    try:
        r = requests.get(link, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as exc:
        # One dead, slow, or erroring page must not abort the whole crawl.
        print(f'skipping {link}: {exc}')
        continue
    soup = BeautifulSoup(r.content, 'html.parser')
    # href=True skips anchors without an href attribute; indexing those with
    # anchor['href'] would raise KeyError.
    for anchor in soup.find_all('a', href=True):
        # Resolve against the URL of the page actually fetched so relative
        # links on pages in subdirectories point to the right place.
        url = urljoin(r.url, anchor['href'])
        # Prefix test rather than substring test: an off-site URL that merely
        # contains the crawl prefix (e.g. in its query string) is out of scope.
        if url in seenLinks or not url.startswith(CRAWL_PREFIX):
            continue
        print(url)
        print(anchor.text)
        queue.put(url)
        visitedLinks.append(url)
        seenLinks.add(url)

print(visitedLinks)
# Fetch the crawl start page and print its raw response body (bytes).
# The timeout keeps the script from blocking forever on an unresponsive
# server; requests applies no timeout by default.
r = requests.get('https://vm009.rz.uos.de/crawl/index.html', timeout=10)
print(r.content)
\ No newline at end of file
from whoosh.index import create_in
from whoosh.fields import *
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment