Python urllib.request 模块详解：urlopen参数、GET/POST请求及高级用法

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

url:地址

data：bytes 类型的内容，可通过 bytes() 函数（或 str.encode()）将字符串转化为字节流。它是可选参数。传入 data 参数后，请求方式会变为 POST，以表单形式提交；数据应采用标准的 application/x-www-form-urlencoded 格式（可用 urllib.parse.urlencode() 生成）。

timeout ：设置请求超时时间。单位是s。

cafile、capath：CA 证书文件和 CA 证书目录的路径，用于在 HTTPS 请求中验证服务器证书。注意：这两个参数（以及 cadefault）自 Python 3.6 起已弃用，并在 Python 3.13 中被移除，建议改用 context 参数。

context：该参数必须是 ssl.SSLContext 类型，用来指定 SSL 设置。

import urllib.request
# Minimal GET example: fetch a page with a 3-second timeout and print it.
url = "http://www.baidu.com"
# Use the response as a context manager so the underlying connection is
# closed even if read() raises.
with urllib.request.urlopen(url, timeout=3) as response:
    html = response.read()
# The body is assumed to be UTF-8 encoded.
print(html.decode('utf-8'))

带data

import urllib.parse
import urllib.request
# Minimal POST example: passing ``data`` to urlopen() switches the request
# method from GET to POST.
url = "http://www.baidu.com/s?"
params = {
    'wd': 'python',
}
# urlencode() yields a str in application/x-www-form-urlencoded format;
# urlopen() requires bytes, so encode it once here.
post_data = urllib.parse.urlencode(params).encode('utf-8')
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(url, data=post_data) as response:
    print(response.read().decode('utf-8'))

其它示例如下：

from urllib import request, parse
def get_request(url):
    """Send a GET request to ``url`` and print status, headers and body.

    The status line is printed first, then one ``name->value`` line per
    response header, and finally the body decoded as UTF-8.
    """
    with request.urlopen(url) as resp:
        body = resp.read()
        print(f'Status-->{resp.status}:{resp.reason}')
        for header in resp.getheaders():
            name, value = header
            print(f'{name}->{value}')
        text = body.decode('utf-8')
        print(text)
def post_request(url, post_data):
    """POST ``post_data`` as a URL-encoded form to ``url`` and print the reply.

    ``post_data`` is anything ``urllib.parse.urlencode`` accepts (a mapping or
    a sequence of two-item tuples). The status line, each response header and
    the UTF-8 decoded body are printed.
    """
    encoded_form = parse.urlencode(post_data).encode('utf-8')
    # Headers and method are supplied up front; the payload is still handed
    # to urlopen(), which attaches it to the request.
    req = request.Request(
        url,
        headers={'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5;Windows NT)'},
        method='POST',
    )
    with request.urlopen(req, data=encoded_form) as resp:
        print(f'Status-->{resp.status}:{resp.reason}')
        body = resp.read()
        for name, value in resp.getheaders():
            print(f'{name}->{value}')
        print(body.decode('utf-8'))
# Script entry point. The guard compares the module's ``__name__`` with
# '__main__' so the demo only runs when the file is executed directly.
if __name__ == '__main__':
    # get_request('https://www.baidu.com/s')
    # Form fields are given as (name, value) pairs; urlencode() accepts a
    # sequence of two-item tuples as well as a dict.
    data = [
        ('wd', '123'),
        ('r', '123'),
    ]
    post_request('https://www.baidu.com/s', data)

通用的方法

import urllib.request
class reqhelper:
    """Collection of common ``urllib.request`` usage patterns.

    Every ``get_content*`` method fetches ``url`` and returns the response
    body decoded as UTF-8. The methods differ only in how the request or the
    opener is configured (plain, custom User-Agent, cookies, proxy, proxy
    with basic authentication).
    """

    def __init__(self):
        """Set up the sample User-Agent pool and the default request headers."""
        # Sample User-Agent strings.
        # Generator used to produce them: https://gongjux.com/userAgentGenerator/
        self.useragent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 '
            'Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:83.0) Gecko/20100101 Firefox/83.0',
            'Mozilla/5.0 (Linux; Android 10; ELS-AN00) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.92 Mobile '
            'Safari/537.36 '
        )
        # Headers sent by the methods that build an explicit Request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/96.0.4664.93 Safari/537.36', }

    def get_content(self, url):
        """Fetch ``url`` with a bare ``urlopen`` call and return the body as text."""
        print('*' * 3, '直接使用', '*' * 3)
        # The context manager closes the response once the body is read.
        with urllib.request.urlopen(url) as res:
            return res.read().decode('utf-8')

    def get_content_with_useragent(self, url):
        """Fetch ``url`` sending ``self.headers`` (custom User-Agent)."""
        req = urllib.request.Request(url=url, headers=self.headers)
        with urllib.request.urlopen(req) as res:
            return res.read().decode('utf-8')

    def get_content_with_cookie(self, url):
        """Fetch ``url`` through an opener that stores and resends cookies."""
        print('*' * 3, '使用cookie', '*' * 3)
        from http import cookiejar
        # A fresh in-memory CookieJar is created for every call.
        cookie_support = urllib.request.HTTPCookieProcessor(cookiejar.CookieJar())
        opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
        req = urllib.request.Request(url, headers=self.headers)
        with opener.open(req) as res:
            return res.read().decode("utf-8")

    def get_content_with_proxy(self, url):
        """Fetch ``url`` through a proxy picked at random from a sample list.

        The request goes through a private opener instead of
        ``urllib.request.install_opener``, so the proxy setting does not leak
        into later ``urlopen`` calls made elsewhere in the process.
        """
        print('*' * 3, '使用代理服务器', '*' * 3)
        import random
        # Sample proxies. Only the 'http' scheme is mapped, so requests for
        # other schemes are not routed through these entries.
        proxy_list = [
            {'http': '127.0.0.1:8000'},
            {'http': '127.0.0.1:8001'},
            {'http': '127.0.0.1:8002'}
        ]
        proxy_support = urllib.request.ProxyHandler(random.choice(proxy_list))
        opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPHandler)
        # Honour the caller's ``url`` rather than a hard-coded address.
        req = urllib.request.Request(url)
        with opener.open(req) as res:
            return res.read().decode("utf-8")

    def get_content_with_proxy_up(self, url, user="user", passwd="passwd",
                                  proxyserver="x.x.x.x:xx"):
        """Fetch ``url`` through a proxy that requires basic authentication.

        Args:
            url: Address to fetch.
            user: Proxy account name.
            passwd: Proxy account password.
            proxyserver: Proxy address as ``host:port``. The default is a
                placeholder and must be replaced for real use.

        Returns:
            The response body decoded as UTF-8.
        """
        print('*' * 3, '使用带账号密码的代码', '*' * 3)
        # Password manager holding the proxy credentials; the first argument
        # (realm) is None so the credentials apply to any realm.
        pwdmgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        pwdmgr.add_password(None, proxyserver, user, passwd)
        # ProxyHandler routes the request via the proxy; ProxyBasicAuthHandler
        # answers the proxy's 407 challenge with the stored credentials.
        # Both are needed: the auth handler alone never selects a proxy.
        proxy_handler = urllib.request.ProxyHandler(
            {'http': proxyserver, 'https': proxyserver}
        )
        proxyauth_handler = urllib.request.ProxyBasicAuthHandler(pwdmgr)
        opener = urllib.request.build_opener(proxy_handler, proxyauth_handler)
        req = urllib.request.Request(url)
        with opener.open(req) as response:
            return response.read().decode("utf-8")
# Script entry point. The guard compares the module's ``__name__`` with
# '__main__' so the demo only runs when the file is executed directly.
if __name__ == '__main__':
    r = reqhelper()
    url = "https://www.baidu.com/"
    # Exercise each fetching strategy in turn and print what it returns.
    print(r.get_content(url))
    print(r.get_content_with_cookie(url))
    print(r.get_content_with_useragent(url))
    print(r.get_content_with_proxy(url))
    print(r.get_content_with_proxy_up(url))

Python urllib.request

相关文章

Docker部署Code-server的两种方案

Ubuntu Server 原生部署 KasmVNC 中文桌面指南

三条测试流式curl命令（OpenAI / Anthropic，分别对应 /chat、/responses、/message 流式接口）

评论