Python 爬虫代理示例

[!NOTE] 代码示例说明

  1. 代码样例不能直接运行,请替换成您自己的代理信息。
  2. 在不同编程语言的代码示例中,需注意其环境版本。
  3. 示例代码使用遇到问题请联系售后客服,我们会为您提供技术支持。

requests

=== "随机IP访问"

    #! -*- encoding:utf-8 -*-

    import requests
    import random

    # Target page to request
    targetUrl = "http://httpbin.org/ip"

    # Target HTTPS page
    # targetUrl = "https://httpbin.org/ip"

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http and https traffic through the HTTP proxy
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    # Random Proxy-Tunnel header: each distinct value maps to a different exit IP
    tunnel = random.randint(1, 10000)
    headers = {"Proxy-Tunnel": str(tunnel)}

    resp = requests.get(targetUrl, proxies=proxies, headers=headers)

    # print() function form: the original Python 2 print statement is a
    # SyntaxError on Python 3
    print(resp.status_code)
    print(resp.text)

=== "Proxy-Tunnel保持IP不变"

    #! -*- encoding:utf-8 -*-
    import requests
    import random
    import requests.adapters

    # Target pages to request
    targetUrlList = [
        "https://httpbin.org/ip",
        "https://httpbin.org/headers",
        "https://httpbin.org/user-agent",
    ]

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http and https traffic through the HTTP proxy
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    # Pick one random tunnel id; reusing the same value keeps the same exit IP
    tunnel = random.randint(1, 10000)


    class HTTPAdapter(requests.adapters.HTTPAdapter):
        """Transport adapter that injects a Proxy-Tunnel header into the
        headers sent to the proxy during the HTTPS CONNECT handshake."""

        def proxy_headers(self, proxy):
            headers = super(HTTPAdapter, self).proxy_headers(proxy)
            if hasattr(self, 'tunnel'):
                headers['Proxy-Tunnel'] = self.tunnel
            return headers


    # Request the sites three times; the shared tunnel id keeps the same exit IP
    for i in range(3):
        s = requests.session()

        a = HTTPAdapter()

        # Pin this adapter (and therefore all its requests) to the tunnel id
        a.tunnel = tunnel
        s.mount('https://', a)

        for url in targetUrlList:
            r = s.get(url, proxies=proxies)
            # print() function form: the original Python 2 print statement is a
            # SyntaxError on Python 3
            print(r.text)

=== "Keep-Alive保持IP不变"

    #! -*- encoding:utf-8 -*-
    import requests
    import random
    import requests.adapters

    # Target pages to request
    targetUrlList = [
        "https://httpbin.org/ip",
        "https://httpbin.org/headers",
        "https://httpbin.org/user-agent",
    ]

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http and https traffic through the HTTP proxy
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    # Reusing one Session (keep-alive) keeps the underlying TCP connection
    # open, so all requests through it keep the same exit IP
    s = requests.session()

    # Optional: preset cookies on the session
    # cookie_dict = {"JSESSION":"123456789"}
    # cookies = requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
    # s.cookies = cookies

    for i in range(3):
        for url in targetUrlList:
            r = s.get(url, proxies=proxies)
            # print() function form: the original Python 2 print statement is a
            # SyntaxError on Python 3
            print(r.text)

urllib2(Python 3 中为 urllib.request)

=== "随机IP访问"


    #! -*- encoding:utf-8 -*-

    from urllib import request

    # Target page to request
    targetUrl = "http://httpbin.org/ip"

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http and https traffic through the HTTP proxy, then install
    # the opener globally so request.urlopen() picks it up
    handler = request.ProxyHandler({
        "http": proxyMeta,
        "https": proxyMeta,
    })
    opener = request.build_opener(handler)
    request.install_opener(opener)

    resp = request.urlopen(targetUrl).read()

    print(resp)

=== "Proxy-Tunnel保持IP不变"

    #! -*- encoding:utf-8 -*-
    # NOTE(review): Python 2 only — urllib2/httplib were renamed in Python 3
    import urllib2
    import random
    import httplib


    class HTTPSConnection(httplib.HTTPSConnection):
        """HTTPSConnection that injects a Proxy-Tunnel header into the proxy
        CONNECT request."""

        def set_tunnel(self, host, port=None, headers=None):
            # Let the base class record the tunnel target, then append the
            # Proxy-Tunnel header that the proxy uses to pin the exit IP
            httplib.HTTPSConnection.set_tunnel(self, host, port, headers)
            if hasattr(self, 'proxy_tunnel'):
                self._tunnel_headers['Proxy-Tunnel'] = self.proxy_tunnel


    class HTTPSHandler(urllib2.HTTPSHandler):
        """HTTPSHandler that opens connections via the patched HTTPSConnection."""

        def https_open(self, req):
            return urllib2.HTTPSHandler.do_open(self, HTTPSConnection, req, context=self._context)


    # Target pages to request
    targetUrlList = [
        "https://httpbin.org/ip",
        "https://httpbin.org/headers",
        "https://httpbin.org/user-agent",
    ]

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http and https traffic through the HTTP proxy
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }

    # Pick one random tunnel id; reusing the same value keeps the same exit IP
    tunnel = random.randint(1, 10000)
    headers = {"Proxy-Tunnel": str(tunnel)}
    HTTPSConnection.proxy_tunnel = tunnel


    proxy = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy, HTTPSHandler)
    urllib2.install_opener(opener)

    # Request the sites three times; the shared tunnel id keeps the same exit IP
    for i in range(3):
        for url in targetUrlList:
            r = urllib2.Request(url)
            print(urllib2.urlopen(r).read())

[!WARNING] urllib2无法使用Keep-alive

urllib2对于HTTP/1.1默认会关闭连接,请通过设置相同的Proxy-Tunnel来保持相同的外网IP。

scrapy

=== "scrapy中间件"

在项目中新建middlewares.py文件(./项目名/middlewares.py)

        #! -*- encoding:utf-8 -*-
        import base64            
        import sys
        import random

        PY3 = sys.version_info[0] >= 3

        def base64ify(bytes_or_str):
            """URL-safe base64-encode str or bytes, returning the native str type."""
            if PY3 and isinstance(bytes_or_str, str):
                raw = bytes_or_str.encode('utf8')
            else:
                raw = bytes_or_str

            encoded = base64.urlsafe_b64encode(raw)
            return encoded.decode('ascii') if PY3 else encoded

        class ProxyMiddleware(object):
            """Scrapy downloader middleware that routes every request through the proxy."""

            def process_request(self, request, spider):
                # Proxy server (product site: www.16yun.cn)
                proxyHost = "t.16yun.cn"
                proxyPort = "31111"

                # Proxy authentication
                proxyUser = "username"
                proxyPass = "password"

                # Scrapy >= 2.6.2 adds the Proxy-Authorization header automatically
                # when credentials are embedded in the proxy URL:
                # https://docs.scrapy.org/en/latest/news.html?highlight=2.6.2#scrapy-2-6-2-2022-07-25
                request.meta['proxy'] = "http://{0}:{1}@{2}:{3}".format(proxyUser, proxyPass, proxyHost, proxyPort)

                # Scrapy < 2.6.2: add the auth header manually instead
                # request.meta['proxy'] = "http://{0}:{1}".format(proxyHost,proxyPort)
                # request.headers['Proxy-Authorization'] = 'Basic ' +  base64ify(proxyUser + ":" + proxyPass)

                # Optional: random Proxy-Tunnel header to force a new exit IP
                # tunnel = random.randint(1,10000)
                # request.headers['Proxy-Tunnel'] = str(tunnel)

                # Close the TCP connection after every request so each request
                # is assigned a new exit IP
                request.headers['Connection'] = "Close"

修改项目配置文件 (./项目名/settings.py)

    DOWNLOADER_MIDDLEWARES = {
        '项目名.middlewares.ProxyMiddleware': 100,
    }

=== "scrapy-splash中间件"

在start_requests给splash调用添加代理信息


    def start_requests(self):
        """Yield a proxied SplashRequest (execute endpoint + Lua script) for
        every URL in self.start_urls."""
        script = '''
                function main(splash)
                    local url = splash.args.url
                    assert(splash:go(url))
                    assert(splash:wait(0.5))
                    local entries = splash:history()
                    local last_response = entries[#entries].response
                    return {
                    url = splash:url(),
                    http_status = last_response.status,
                    cookies = splash:get_cookies(),
                    html = splash:html(),
                    headers = last_response.headers,
                    }
                end
            '''
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy authentication — placeholders, replace with your own
        # credentials (the original sample shipped real-looking values,
        # inconsistent with the other examples in this document)
        proxyUser = "username"
        proxyPass = "password"

        proxy = "http://{}:{}@{}:{}".format(proxyUser, proxyPass, proxyHost, proxyPort)
        try:
            for url in self.start_urls:
                yield SplashRequest(
                    url,
                    self.parse,
                    endpoint="execute",
                    args={
                        "lua_source": script,
                        "wait": 5,
                        "timeout": 600,
                        "target_count": self.target_count,
                        'proxy': proxy
                    },
                )
        except Exception as exc:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; chain the cause so the real error is visible
            raise CloseSpider("Could not load Lua script.") from exc

在项目中新建middlewares.py文件(./项目名/middlewares.py)

        #! -*- encoding:utf-8 -*-
        import base64            
        import sys
        import random

        PY3 = sys.version_info[0] >= 3

        def base64ify(bytes_or_str):
            """URL-safe base64-encode str or bytes, returning the native str type."""
            if PY3 and isinstance(bytes_or_str, str):
                raw = bytes_or_str.encode('utf8')
            else:
                raw = bytes_or_str

            encoded = base64.urlsafe_b64encode(raw)
            return encoded.decode('ascii') if PY3 else encoded

        class ProxyMiddleware(object):
            """Downloader middleware: set the proxy, its auth header, and a
            random Proxy-Tunnel id on every request."""

            def process_request(self, request, spider):
                # Proxy server (product site: www.16yun.cn)
                proxyHost = "t.16yun.cn"
                proxyPort = "31111"

                # Proxy authentication
                proxyUser = "username"
                proxyPass = "password"

                request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

                # Add the proxy auth header manually
                encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
                request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

                # Random Proxy-Tunnel per request: each request gets a new exit IP
                tunnel = random.randint(1, 10000)
                request.headers['Proxy-Tunnel'] = str(tunnel)

给splash设置代理修改项目配置文件 (./项目名/settings.py)

    DOWNLOADER_MIDDLEWARES = {
        '项目名.middlewares.ProxyMiddleware': 100,
    }

=== "scrapy-pyppeteer中间件"

在项目中新建middlewares.py文件(./项目名/middlewares.py)

        #! -*- encoding:utf-8 -*-    
                
        import websockets
        from scrapy.http import HtmlResponse
        from logging import getLogger
        import asyncio
        import pyppeteer
        import logging
        from concurrent.futures._base import TimeoutError
        import base64
        import sys
        import random
        
        # Silence verbose logging from the browser-automation dependencies
        pyppeteer_level = logging.WARNING
        logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
        logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

        PY3 = sys.version_info[0] >= 3

        def base64ify(bytes_or_str):
            """URL-safe base64-encode str or bytes, returning the native str type."""
            raw = bytes_or_str.encode('utf8') if PY3 and isinstance(bytes_or_str, str) else bytes_or_str
            encoded = base64.urlsafe_b64encode(raw)
            return encoded.decode('ascii') if PY3 else encoded
        
        class ProxyMiddleware(object):
            """Downloader middleware: per-request proxy, auth header, random
            Proxy-Tunnel id, and a random User-Agent."""

            # Read the user-agent pool once at import time. Using `with` closes
            # the file handle (the original `open(...).readlines()` leaked it),
            # and strip() drops the trailing newline that readlines() keeps —
            # which would otherwise be sent inside the User-Agent header value.
            with open('useragents.txt') as _f:
                USER_AGENT = [line.strip() for line in _f if line.strip()]
            del _f

            def process_request(self, request, spider):
                # Proxy server
                proxyHost = "t.16yun.cn"
                proxyPort = "31111"

                # Proxy authentication
                proxyUser = "username"
                proxyPass = "password"

                request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

                # Add the proxy auth header manually
                encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
                request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

                # Random Proxy-Tunnel per request: each request gets a new exit IP
                tunnel = random.randint(1, 10000)
                request.headers['Proxy-Tunnel'] = str(tunnel)
                request.headers['User-Agent'] = random.choice(self.USER_AGENT)
                    
        class PyppeteerMiddleware(object):
            """Downloader middleware that renders requests flagged with
            meta['render'] through a shared headless pyppeteer (Chromium)
            browser."""

            def __init__(self, **args):
                """
                init logger, loop, browser
                :param args:
                """
                self.logger = getLogger(__name__)
                # One event loop and one headless browser instance are shared
                # by every render this middleware performs
                self.loop = asyncio.get_event_loop()
                self.browser = self.loop.run_until_complete(
                    pyppeteer.launch(headless=True))
                self.args = args
        
            def __del__(self):
                """
                close loop
                :return:
                """
                self.loop.close()
        
            def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
                        timeout=8.0, keep_page=False):
                """
                render page with pyppeteer
                :param url: page url
                :param retries: max retry times
                :param script: js script to evaluate
                :param wait: number of seconds to wait before loading the page, preventing timeouts
                :param scrolldown: how many times to page down
                :param sleep: how many long to sleep after initial render
                :param timeout: the longest wait time, otherwise raise timeout error
                :param keep_page: keep page not to be closed, browser object needed
                :param browser: pyppetter browser object
                :param with_result: return with js evaluation result
                :return: content, [result]
                """
        
                # define async render
                async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
                    try:
                        # basic render
                        page = await self.browser.newPage()
                        await asyncio.sleep(wait)
                        response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                        # non-200 responses are treated as failures: no content
                        if response.status != 200:
                            return None, None, response.status
                        result = None
                        # evaluate with script
                        if script:
                            result = await page.evaluate(script)
        
                        # scroll down for {scrolldown} times
                        if scrolldown:
                            for _ in range(scrolldown):
                                await page._keyboard.down('PageDown')
                                await asyncio.sleep(sleep)
                        else:
                            await asyncio.sleep(sleep)
                        if scrolldown:
                            await page._keyboard.up('PageDown')
        
                        # get html of page
                        content = await page.content()
        
                        return content, result, response.status
                    except TimeoutError:
                        return None, None, 500
                    finally:
                        # if keep page, do not close it
                        # NOTE(review): if newPage() itself raises, `page` is
                        # unbound here and page.close() would raise NameError —
                        # confirm before relying on this cleanup path
                        if not keep_page:
                            await page.close()
        
                content, result, status = [None] * 3
        
                # retry for {retries} times; stop as soon as content was obtained
                for i in range(retries):
                    if not content:
                        content, result, status = self.loop.run_until_complete(
                            async_render(url=url, script=script, sleep=sleep, wait=wait,
                                            scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
                    else:
                        break
        
                # if need to return js evaluation result
                return content, result, status
        
            def process_request(self, request, spider):
                """
                :param request: request object
                :param spider: spider object
                :return: HtmlResponse
                """
                # Only requests explicitly flagged with meta['render'] are
                # rendered; all others fall through to the default downloader
                if request.meta.get('render'):
                    try:
                        self.logger.debug('rendering %s', request.url)
                        html, result, status = self.render(request.url)
                        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                            status=status)
                    except websockets.exceptions.ConnectionClosed:
                        pass
        
            @classmethod
            def from_crawler(cls, crawler):
                # Instantiated by Scrapy with keyword args from PYPPETEER_ARGS
                return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))

修改项目配置文件,启用上述中间件 (./项目名/settings.py)

    DOWNLOADER_MIDDLEWARES = {
        '项目名.middlewares.PyppeteerMiddleware': 543,
        '项目名.middlewares.ProxyMiddleware': 100,    
    }

=== "scrapy环境变量"

通过设置环境变量,来使用爬虫代理

Windows


    C:\>set http_proxy=http://username:password@ip:port
    

Linux/MacOS


    csh% 
    setenv http_proxy http://username:password@ip:port
    
    sh$ 
    export http_proxy=http://username:password@ip:port

=== "scrapy HTTPS会话指定IP"

新建文件 (./项目名/tunnel_fix.py),并在项目代码文件的最开始位置添加 `from . import tunnel_fix` 以应用补丁


    from scrapy.utils.python import to_bytes, to_unicode
    import scrapy.core.downloader.handlers.http11
    from random import randint
    
    def tunnel_request_data(host, port, proxy_auth_header=None):
        """Build the raw CONNECT request sent to the proxy, adding a random
        Proxy-Tunnel header next to Proxy-Authorization so each HTTPS session
        is pinned to one exit IP."""
        host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
        tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
        tunnel_req += b'Host: ' + host_value + b'\r\n'
        if proxy_auth_header:
            tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
            # Random Proxy-Tunnel value for this CONNECT
            proxy_tunnel = '{}'.format(randint(1,9999))
            tunnel_req += b'Proxy-Tunnel: '+ to_bytes(proxy_tunnel)  + b'\r\n'
        tunnel_req += b'\r\n'
        return tunnel_req
    
    # Monkey-patch Scrapy's HTTP/1.1 download handler to use this CONNECT builder
    scrapy.core.downloader.handlers.http11.tunnel_request_data = tunnel_request_data

aiohttp

=== "aiohttp 使用Proxy-Tunnel切换IP"

    import aiohttp, asyncio
    import random
    def main():

        # Target page to request
        targetUrl = "https://httpbin.org/headers"
    
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"
    
        # Proxy authentication
        proxyUser = "username"
        proxyPass = "password"
    
        proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host" : proxyHost,
            "port" : proxyPort,
            "user" : proxyUser,
            "pass" : proxyPass,
        }
    
        userAgent = "Chrome/83.0.4103.61"
        # Use one shared Proxy-Tunnel value for all requests to pin a fixed IP
        # proxy_tunnel = "{}".format(random.randint(1,10000))

        async def entry():            
            async with aiohttp.ClientSession(headers={"User-Agent": userAgent}) as session:
                while True:
                    # Random Proxy-Tunnel per request: each request gets a random IP
                    proxy_tunnel = "{}".format(random.randint(1,10000))
                    async with session.get(targetUrl, proxy=proxyServer, proxy_headers={"Proxy-Tunnel":proxy_tunnel}) as resp:
                        body = await resp.read()
                        print(resp.status)
                        print(body)
    
        loop = asyncio.get_event_loop()
        loop.run_until_complete(entry())
        # NOTE(review): unreachable in practice — entry() loops forever above
        loop.run_forever()
    
    if __name__ == '__main__':
        main()

=== "aiohttp 使用TCP方式切换IP"

    #! -*- encoding:utf-8 -*-

    import aiohttp, asyncio

    # Target page to request
    targetUrl = "http://httpbin.org/ip"

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host" : proxyHost,
        "port" : proxyPort,
        "user" : proxyUser,
        "pass" : proxyPass,
    }

    userAgent = "Chrome/83.0.4103.61"

    async def entry():
        # A fresh connector (and session) per request prevents connection
        # reuse, so every request is assigned a new exit IP
        while True:
            connector = aiohttp.TCPConnector(verify_ssl=False)
            session = aiohttp.ClientSession(headers={"User-Agent": userAgent}, connector=connector)
            async with session:
                response = await session.get(targetUrl, proxy=proxyServer)
                async with response:
                    payload = await response.read()
                    print(response.status)
                    print(payload)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(entry())
    loop.run_forever()

[!NOTE] 注意

  • aiohttp库实现了TCP链接池功能,如果没设置随机Proxy-Tunnel或断开TCP链接,会导致多个请求始终没有切换IP。
  • 如需切换IP,需每个请求新建一个session;同时设置connector_owner参数让session关闭后链接也关闭。
  • 如需切换IP,也可以设置Proxy-Tunnel为随机数,使用随机IP。

httpx

=== "随机IP访问"

    
    import asyncio
    import httpx

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    # Route both http:// and https:// traffic through the HTTP proxy
    proxies = {scheme: proxyMeta for scheme in ("http://", "https://")}

    client = httpx.AsyncClient(proxies=proxies)

    # To enable HTTP/2 support: pip install httpx[http2]
    # client = httpx.AsyncClient(http2=True,proxies=proxies)

    async def test():
        response = await client.get("https://httpbin.org/ip")
        print(response.http_version)
        print(response.text)

    asyncio.run(test())

=== "Proxy-Tunnel保持IP不变"

    import httpx
    import random
    
    # Proxy server (product site: www.16yun.cn)
    proxy_host = "t.16yun.cn"
    proxy_port = "31111"
    
    # Proxy authentication
    proxy_user = "username"
    proxy_pwd = "password"
    
    proxy_url = f"http://{proxy_user}:{proxy_pwd}@{proxy_host}:{proxy_port}"
    
    proxy = httpx.Proxy(
        url=proxy_url,
        # Fixed Proxy-Tunnel value: as long as it stays the same, the exit IP
        # stays the same
        headers={"Proxy-Tunnel": f"{random.randint(1, 10000)}"}
    )
    print(proxy_url)
    proxies = {
        "http://": proxy,
        "https://": proxy,
    }
    target_url = "https://httpbin.org/ip"
    
    async def test_async():
        # All three requests keep the same exit IP
        # To enable HTTP/2 support: pip install httpx[http2]
        # client = httpx.AsyncClient(http2=True,proxies=proxies)
        for _ in range(3):
            async with httpx.AsyncClient(
                    proxies=proxies,
            ) as client:
                response = await client.get(target_url)
                print("test_async:", response.text)
    
    def test():
        # All three requests keep the same exit IP
        for _ in range(3):
            # To enable HTTP/2 support: pip install httpx[http2]
            # client = httpx.Client(http2=True,proxies=proxies)
            client = httpx.Client(
                proxies=proxies,
            )
    
            response = client.get(target_url)
            print("test:", response.text)
    
            client.close()
    
    if __name__ == '__main__':
        import asyncio
    
        test()
        asyncio.run(test_async())