Skip to content
Snippets Groups Projects
Commit c5e90b30 authored by srebers
Browse files

second commit

parent fc4bbf92
Branches
Tags 6.0.0
No related merge requests found
venv
from queue import Queue
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Breadth-first crawl starting at START_URL. Every newly discovered URL that
# lies under CRAWL_ROOT is printed together with its anchor text and queued
# for a later visit; the full list of discovered URLs is printed at the end.
START_URL = 'https://vm009.rz.uos.de/crawl/index.html'
CRAWL_ROOT = 'https://vm009.rz.uos.de/crawl'
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

queue = Queue()
# Discovery-ordered list (printed at the end) plus a set for O(1) membership.
visitedLinks = [START_URL]
seen = {START_URL}
queue.put(START_URL)

while not queue.empty():
    link = queue.get()
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable or broken page must not abort the whole crawl.
        print('skipping', link, exc)
        continue

    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors without an href (e.g. <a name="...">), which
    # would otherwise raise KeyError on anchor['href'].
    for anchor in soup.find_all('a', href=True):
        # Resolve relative links against the page they were found on, not the
        # crawl root, so links on nested pages point to the right place.
        url = urljoin(response.url, anchor['href'])
        # Prefix test rather than substring test: an external URL that merely
        # contains the crawl root somewhere (e.g. in a query string) is not
        # part of the crawl.
        if url in seen or not url.startswith(CRAWL_ROOT):
            continue
        print(url)
        print(anchor.text)
        queue.put(url)
        seen.add(url)
        visitedLinks.append(url)

print(visitedLinks)
# Fetch the crawl start page and print its raw response body (bytes).
# The explicit timeout (seconds) keeps the script from blocking forever when
# the server does not answer; requests applies no timeout by default.
r = requests.get('https://vm009.rz.uos.de/crawl/index.html', timeout=10)
print(r.content)
\ No newline at end of file
from whoosh.index import create_in
from whoosh.fields import *
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment