多线程函数：多个线程以非阻塞方式从共享队列中取出链接，并抓取指定网页的内容。
import queue
import threading
import time
import urllib
from urllib import request

# Module-level constant; not referenced by the code visible in this file.
BASE_URL = ''

# Shared work queue that the worker threads drain. It is pre-filled with ten
# copies of the same URL.
URL_QUEUE = queue.Queue()
url = 'http://www.baidu.com'
for _ in range(10):
    URL_QUEUE.put(url)
def fetch_url(url_queue, thid):
    """Drain ``url_queue``, fetching each URL and printing its page source.

    Meant to be used as a thread target: several workers can share one queue,
    each taking URLs with a non-blocking ``get_nowait()`` until none are left.

    Args:
        url_queue: A ``queue.Queue`` holding the URL strings to fetch.
        thid: Caller-supplied worker number, used only in the log output.

    Returns:
        None. Page contents and errors are reported with ``print``.
    """
    while True:
        try:
            # Non-blocking read. Another worker may take the last item at any
            # moment, so an empty queue is the normal way out of the loop.
            url = url_queue.get_nowait()
        except queue.Empty:
            break

        print(f'current thread id:{threading.current_thread().name}-{thid},url:{url}')

        try:
            # The context manager closes the connection even if reading fails.
            with urllib.request.urlopen(url) as response:
                response_code = response.getcode()
                # Read the page source while the connection is still open.
                html = response.read() if response_code == 200 else b''
        except Exception as e:
            # Worker boundary: report the failure and move on to the next URL
            # instead of letting one bad URL end this thread.
            print(f'get{url},error--->{e}')
            continue

        if response_code == 200:
            # Decode the raw bytes as UTF-8 before printing.
            print(html.decode('utf-8'))
            # Pause after each successful fetch.
            time.sleep(1)


if __name__ == '__main__':
    print('-' * 4 + 'ALL START' + '-' * 4)
    thread_num = 3
    threads = []
    for x in range(thread_num):
        thread = threading.Thread(target=fetch_url, args=(URL_QUEUE, x))
        threads.append(thread)
        thread.start()
    # Wait for every worker so 'ALL DONE' is printed after the last fetch.
    for t in threads:
        t.join()
    print('-' * 4 + 'ALL DONE' + '-' * 4)

评论