Below that, we declare the myssl context, which we'll pass in as the context for our HTTPS requests. And finally, we instantiate two new sets: crawledLinks, which records every link we have successfully crawled, and errorLinks, which will be populated with any links that throw an error, rather than a favorable 200 status code, when we attempt to crawl them.
from urllib.request import Request, urlopen
from urllib.parse import urlparse, urljoin
from urllib.error import URLError
import ssl
from bs4 import BeautifulSoup
class Crawler:
    base_url = ''
    # SSL context for our HTTPS requests, with certificate verification disabled
    myssl = ssl.create_default_context()
    myssl.check_hostname = False
    myssl.verify_mode = ssl.CERT_NONE
    # every link we have successfully crawled
    crawledLinks = set()
    # links that threw an error when we attempted to crawl them
    errorLinks = set()
We then go on to declare our Crawler's constructor, which sets base_url.
After this, we declare the crawl static method and the enqueueLinks static method. The crawl method joins the given url onto base_url, requests the page using our SSL context, records the link as crawled, and passes every anchor tag found in the response to enqueueLinks. enqueueLinks takes in a list of links and linksToCrawl, a queue object. It iterates through the links, and if a link has not already been crawled and is not already enqueued in linksToCrawl, it adds that link to the queue object.
    def __init__(self, base_url):
        Crawler.base_url = base_url
    @staticmethod
    def crawl(thread_name, url, linksToCrawl):
        try:
            link = urljoin(Crawler.base_url, url)
            # only crawl links on our own domain that we haven't already crawled
            if (urlparse(link).netloc == 'tutorialedge.net') and (link not in Crawler.crawledLinks):
                request = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
                response = urlopen(request, context=Crawler.myssl)
                Crawler.crawledLinks.add(link)
                print("Url {} Crawled with Status: {} : {} Crawled In Total".format(
                    response.geturl(), response.getcode(), len(Crawler.crawledLinks)))
                # parse the response and enqueue every link found on the page
                soup = BeautifulSoup(response.read(), "html.parser")
                Crawler.enqueueLinks(soup.find_all('a'), linksToCrawl)
        except URLError as e:
            print("URL {} threw this error when trying to parse: {}".format(link, e.reason))
            Crawler.errorLinks.add(link)
    @staticmethod
    def enqueueLinks(links, linksToCrawl):
        for link in links:
            full_link = urljoin(Crawler.base_url, link.get('href'))
            # enqueue only links we haven't crawled and haven't already queued;
            # the second check assumes linksToCrawl supports the `in` operator
            # (e.g. a Queue subclass that implements __contains__)
            if full_link not in Crawler.crawledLinks:
                if full_link not in linksToCrawl:
                    linksToCrawl.put(link.get('href'))
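To see how these pieces fit together, here is a minimal sketch of a runner script that drives our Crawler with a handful of worker threads. This runner is not part of the crawler itself: the CheckableQueue subclass, the worker function, the thread count, and the five-second timeout are purely illustrative assumptions. The queue subclass is only there because enqueueLinks checks membership with the in operator, which a plain Queue does not support.
import threading
from queue import Queue, Empty

class CheckableQueue(Queue):
    # allow `item in queue` checks by inspecting the queue's underlying deque
    def __contains__(self, item):
        with self.mutex:
            return item in self.queue

def worker(thread_name, linksToCrawl):
    while True:
        try:
            # wait briefly for new work; stop once the queue has stayed empty
            url = linksToCrawl.get(timeout=5)
        except Empty:
            return
        Crawler.crawl(thread_name, url, linksToCrawl)
        linksToCrawl.task_done()

if __name__ == '__main__':
    Crawler("https://tutorialedge.net")
    linksToCrawl = CheckableQueue()
    linksToCrawl.put("https://tutorialedge.net")

    threads = []
    for i in range(4):
        thread = threading.Thread(target=worker, args=("Thread-{}".format(i), linksToCrawl))
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()

    print("Crawled {} links, {} errors".format(len(Crawler.crawledLinks), len(Crawler.errorLinks)))
Each worker simply pulls a url off the queue, hands it to Crawler.crawl along with the queue so that newly discovered links can be enqueued, and exits once no new work has appeared for five seconds.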