Python多线程抓取网页HTML代码示例

多线程函数：多个线程以非阻塞方式从共享队列中取出 URL，并抓取对应网页的 HTML 源代码

import threading, queue, time, urllib
from urllib import request
# Base URL placeholder; it is empty and not referenced by the code below.
BASE_URL = ''
# Work queue shared by all worker threads; each entry is one URL to fetch.
URL_QUEUE = queue.Queue()
# Enqueue the same demo URL ten times so that several workers have work to do.
for _ in range(10):
    URL_QUEUE.put('http://www.baidu.com')
def fetch_url(url_queue, thid):
    """Drain URLs from ``url_queue`` and print the HTML of each page.

    Meant to be used as the target of several worker threads that share one
    queue. Each iteration takes one URL without blocking, downloads it with
    ``urllib.request.urlopen`` and, when the response code is 200, prints the
    decoded body and then sleeps for one second.

    Args:
        url_queue: A ``queue.Queue`` holding URL strings, shared by workers.
        thid: Caller-supplied worker index; used only in the log output.

    Returns:
        None. A URL that cannot be downloaded is reported and skipped.
    """
    while True:
        # Checking empty() and then calling get_nowait() is racy when several
        # threads drain the same queue, so just try to take an item and treat
        # queue.Empty as the normal "no more work" signal.
        try:
            url = url_queue.get_nowait()
        except queue.Empty:
            break

        print(f'current thread id:{threading.current_thread().name}-{thid},url:{url}\n')
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url) as response:
                response_code = response.getcode()
                html = response.read() if response_code == 200 else None
        except Exception as e:
            # Worker boundary: report the failure and move on to the next URL
            # instead of letting one bad URL kill the thread.
            print(f'get{url},error--->{e}')
            continue
        if html is not None:
            # Undecodable bytes are replaced rather than raising, so a page in
            # another encoding does not terminate the worker.
            print(html.decode('utf-8', errors='replace'))
            time.sleep(1)
        

if __name__ == '__main__':
    # Script entry point: start a fixed number of worker threads that all
    # drain the shared URL_QUEUE, then wait for them to finish.
    print('-' * 4 + 'ALL START' + '-' * 4)
    threads = []
    thread_num = 3
    for x in range(thread_num):
        thread = threading.Thread(target=fetch_url, args=(URL_QUEUE, x))
        threads.append(thread)
        thread.start()
    # Block until every worker has returned before announcing completion.
    for t in threads:
        t.join()
    print('-' * 4 + 'ALL DONE' + '-' * 4)

Python 简单获取网页Html

相关文章

linux卸载python

Python GUI tkinter 计算器

Python 打开摄像头

评论