Okay, so the first thing we are going to do is refactor the way we implement multithreading in our application. We want to move away from managing the startup and shutdown of all our threads ourselves and, instead, leave that in the capable hands of ThreadPoolExecutor.
So, if we look back at our crawler code, in order to start up numerous threads, we would have to do something like this:
import threading
import queue
from crawler import *
from CheckableQueue import *
# Number of worker threads that createCrawlers() starts.
THREAD_COUNT = 20
# Shared work queue of URLs: main() seeds it and run() takes URLs from it.
# CheckableQueue is project-defined (presumably a queue.Queue variant --
# confirm in CheckableQueue.py).
linksToCrawl = CheckableQueue()
def createCrawlers():
    """Start THREAD_COUNT daemon threads, each of which executes run()."""
    for _ in range(THREAD_COUNT):
        # Daemon threads do not keep the interpreter alive once main() returns.
        worker = threading.Thread(target=run, daemon=True)
        worker.start()
def run():
    """Worker loop: take URLs from linksToCrawl and crawl them one at a time.

    The loop ends when a ``None`` item is taken from the queue. Every item
    taken from the queue -- including that ``None`` sentinel and any URL whose
    crawl raised -- is acknowledged with ``task_done()`` so that
    ``linksToCrawl.join()`` in main() cannot hang on an unacknowledged item.
    """
    while True:
        url = linksToCrawl.get()
        try:
            if url is None:
                # Sentinel: stop this worker. The ``finally`` clause below
                # still runs before the loop is left.
                break
            Crawler.crawl(threading.current_thread(), url, linksToCrawl)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so SystemExit and
            # KeyboardInterrupt are not silently swallowed.
            print("Exception thrown with link: {}".format(url))
        finally:
            linksToCrawl.task_done()
def main():
    """Prompt for a start URL, crawl from it with worker threads, print totals.

    The start URL is handed to ``Crawler(...)`` (presumably to set up
    crawler-wide state -- confirm in crawler.py) and placed on linksToCrawl.
    The function then blocks on ``linksToCrawl.join()`` until every queued
    item has been acknowledged by a worker.
    """
    seed = input("Website > ")
    Crawler(seed)
    linksToCrawl.put(seed)

    createCrawlers()
    linksToCrawl.join()

    crawled_report = "Total Links Crawled: {}"
    print(crawled_report.format(len(Crawler.crawledLinks)))
    error_report = "Total Errors: {}"
    print(error_report.format(len(Crawler.errorLinks)))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
However, using ThreadPoolExecutor, we can condense this down to just a couple of lines by using the with statement. Our code becomes more succinct and easier to follow as a result:
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
from crawler import *
from CheckableQueue import *
# Upper bound on worker threads; passed as max_workers to ThreadPoolExecutor.
THREAD_COUNT = 20
# Shared work queue of URLs: main() seeds and drains it, and it is also passed
# to Crawler.crawl(). CheckableQueue is project-defined (presumably a
# queue.Queue variant -- confirm in CheckableQueue.py).
linksToCrawl = CheckableQueue()
def run(url):
    """Crawl a single URL on the calling worker thread.

    Args:
        url: The link to crawl; it is passed to ``Crawler.crawl`` together
            with the current thread and the shared linksToCrawl queue.

    A failed crawl is reported on stdout rather than raised. ``task_done()``
    is called on linksToCrawl whether or not the crawl succeeded.
    """
    try:
        Crawler.crawl(threading.current_thread(), url, linksToCrawl)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so SystemExit and
        # KeyboardInterrupt are not silently swallowed.
        print("Exception thrown with link: {}".format(url))
    finally:
        linksToCrawl.task_done()
def main():
    """Prompt for a start URL, crawl from it on a thread pool, print totals.

    A single ThreadPoolExecutor is created for the whole crawl. Creating the
    pool inside the loop (one pool per URL) makes the ``with`` block wait for
    each lone task before the next URL is even fetched, so the crawl runs
    serially and THREAD_COUNT has no effect.

    The loop finishes only when the queue is empty AND no submitted crawl is
    still running, because a running crawl is handed linksToCrawl and may
    still add links to it (presumably how discovered links are queued --
    confirm in crawler.py).
    """
    # Function-scope import: the file-level import only brings in
    # ThreadPoolExecutor.
    from concurrent.futures import FIRST_COMPLETED, wait

    url = input("Website > ")
    Crawler(url)
    linksToCrawl.put(url)

    with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor:
        pending = set()
        while pending or not linksToCrawl.empty():
            # Hand everything currently queued to the pool. Only this thread
            # removes items, so get() cannot block after empty() is False.
            while not linksToCrawl.empty():
                url = linksToCrawl.get()
                if url is not None:
                    pending.add(executor.submit(run, url))
            if pending:
                # Sleep until at least one crawl finishes, then re-check the
                # queue for links that finished crawls may have added.
                _, pending = wait(pending, return_when=FIRST_COMPLETED)

    print("Total Links Crawled: {}".format(len(Crawler.crawledLinks)))
    print("Total Errors: {}".format(len(Crawler.errorLinks)))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()