Python Crawler Proxy Examples
Notes on the code examples
- The samples cannot be run as-is; replace the proxy details with your own credentials.
- Mind the required runtime version for the examples in each programming language.
- If you run into problems using the sample code, contact us and we will provide technical support.
requests
```python
#! -*- encoding:utf-8 -*-

import requests
import random

# Target page to request
targetUrl = "http://httpbin.org/ip"

# Target HTTPS page
# targetUrl = "https://httpbin.org/ip"

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy authentication
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Route both http and https traffic through the HTTP proxy
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Proxy-Tunnel header controls IP switching
tunnel = random.randint(1, 10000)
headers = {"Proxy-Tunnel": str(tunnel)}

resp = requests.get(targetUrl, proxies=proxies, headers=headers)

print(resp.status_code)
print(resp.text)
```
urllib2
```python
#! -*- encoding:utf-8 -*-

from urllib import request

# Target page to request
targetUrl = "http://httpbin.org/ip"

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy authentication
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

proxy_handler = request.ProxyHandler({
    "http": proxyMeta,
    "https": proxyMeta,
})

opener = request.build_opener(proxy_handler)

request.install_opener(opener)
resp = request.urlopen(targetUrl).read()

print(resp)
```
Note: urllib2 cannot use Keep-Alive; for HTTP/1.1 it closes the connection after each request by default. To keep the same outgoing IP across requests, send the same Proxy-Tunnel value with every request (a minimal sketch follows).
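A minimal sketch of that idea, reusing the proxy opener installed in the example above; the Proxy-Tunnel header follows the vendor convention shown in the requests example, and the fixed value here is arbitrary:

```python
from urllib import request

targetUrl = "http://httpbin.org/ip"

# Keep this value constant so the proxy keeps routing requests through one outgoing IP
tunnel_value = "1234"

req = request.Request(targetUrl, headers={"Proxy-Tunnel": tunnel_value})
resp = request.urlopen(req).read()  # goes through the proxy opener installed above
print(resp)
```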
scrapy
Create a new middlewares.py file in the project (./project_name/middlewares.py)
```python
#! -*- encoding:utf-8 -*-
import base64
import sys
import random

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy authentication
        proxyUser = "username"
        proxyPass = "password"

        # [Scrapy >= 2.6.2](https://docs.scrapy.org/en/latest/news.html?highlight=2.6.2#scrapy-2-6-2-2022-07-25):
        # no extra auth header is needed; Proxy-Authorization is set automatically from the proxy URL
        request.meta['proxy'] = "http://{0}:{1}@{2}:{3}".format(proxyUser, proxyPass, proxyHost, proxyPort)

        # Scrapy < 2.6.2: add the proxy auth header manually
        # request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)
        # request.headers['Proxy-Authorization'] = 'Basic ' + base64ify(proxyUser + ":" + proxyPass)

        # Proxy-Tunnel header to control IP switching (optional)
        # tunnel = random.randint(1, 10000)
        # request.headers['Proxy-Tunnel'] = str(tunnel)

        # Close the TCP connection after every request to force an IP switch each time
        request.headers['Connection'] = "Close"
```
Edit the project settings file (./project_name/settings.py)
```python
DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.ProxyMiddleware': 100,
}
```
Add the proxy information to the Splash call in start_requests
```python
def start_requests(self):
    script = '''
    function main(splash)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(0.5))
        local entries = splash:history()
        local last_response = entries[#entries].response
        return {
            url = splash:url(),
            http_status = last_response.status,
            cookies = splash:get_cookies(),
            html = splash:html(),
            headers = last_response.headers,
        }
    end
    '''
    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "16111YVL"
    proxyPass = "11111"

    proxy = "http://{}:{}@{}:{}".format(proxyUser, proxyPass, proxyHost, proxyPort)
    try:
        for url in self.start_urls:
            yield SplashRequest(
                url,
                self.parse,
                endpoint="execute",
                args={
                    "lua_source": script,
                    "wait": 5,
                    "timeout": 600,
                    "target_count": self.target_count,
                    'proxy': proxy
                },
            )
    except:
        raise CloseSpider("Could not load Lua script.")
```
Create a new middlewares.py file in the project (./project_name/middlewares.py)
```python
#! -*- encoding:utf-8 -*-
import base64
import sys
import random

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy authentication
        proxyUser = "username"
        proxyPass = "password"

        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

        # Add the proxy auth header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

        # Proxy-Tunnel header to control IP switching (optional)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)
```
To set the proxy for Splash, edit the project settings file (./project_name/settings.py)
```python
DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.ProxyMiddleware': 100,
}
```
For Pyppeteer, create a new middlewares.py file in the project (./project_name/middlewares.py)
```python
#! -*- encoding:utf-8 -*-

import websockets
from scrapy.http import HtmlResponse
from logging import getLogger
import asyncio
import pyppeteer
import logging
from concurrent.futures._base import TimeoutError
import base64
import sys
import random

pyppeteer_level = logging.WARNING
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    USER_AGENT = open('useragents.txt').readlines()

    def process_request(self, request, spider):
        # Proxy server
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy authentication
        proxyUser = "username"
        proxyPass = "password"

        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

        # Add the proxy auth header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

        # Proxy-Tunnel header to control IP switching (optional)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)
        request.headers['User-Agent'] = random.choice(self.USER_AGENT)

class PyppeteerMiddleware(object):
    def __init__(self, **args):
        """
        init logger, loop, browser
        :param args:
        """
        self.logger = getLogger(__name__)
        self.loop = asyncio.get_event_loop()
        self.browser = self.loop.run_until_complete(
            pyppeteer.launch(headless=True))
        self.args = args

    def __del__(self):
        """
        close loop
        :return:
        """
        self.loop.close()

    def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
               timeout=8.0, keep_page=False):
        """
        render page with pyppeteer
        :param url: page url
        :param retries: max retry times
        :param script: js script to evaluate
        :param wait: number of seconds to wait before loading the page, preventing timeouts
        :param scrolldown: how many times to page down
        :param sleep: how long to sleep after the initial render
        :param timeout: the longest wait time, otherwise raise timeout error
        :param keep_page: keep page not to be closed, browser object needed
        :return: content, result, status
        """

        # define async render
        async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
            try:
                # basic render
                page = await self.browser.newPage()
                await asyncio.sleep(wait)
                response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                if response.status != 200:
                    return None, None, response.status
                result = None
                # evaluate with script
                if script:
                    result = await page.evaluate(script)

                # scroll down for {scrolldown} times
                if scrolldown:
                    for _ in range(scrolldown):
                        await page._keyboard.down('PageDown')
                        await asyncio.sleep(sleep)
                else:
                    await asyncio.sleep(sleep)
                if scrolldown:
                    await page._keyboard.up('PageDown')

                # get html of page
                content = await page.content()

                return content, result, response.status
            except TimeoutError:
                return None, None, 500
            finally:
                # if keep page, do not close it
                if not keep_page:
                    await page.close()

        content, result, status = [None] * 3

        # retry for {retries} times
        for i in range(retries):
            if not content:
                content, result, status = self.loop.run_until_complete(
                    async_render(url=url, script=script, sleep=sleep, wait=wait,
                                 scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
            else:
                break

        # return content, js evaluation result and status
        return content, result, status

    def process_request(self, request, spider):
        """
        :param request: request object
        :param spider: spider object
        :return: HtmlResponse
        """
        if request.meta.get('render'):
            try:
                self.logger.debug('rendering %s', request.url)
                html, result, status = self.render(request.url)
                return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                    status=status)
            except websockets.exceptions.ConnectionClosed:
                pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))
```
Edit the project settings file (./project_name/settings.py) to enable the Pyppeteer and proxy middlewares
```python
DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.PyppeteerMiddleware': 543,
    'project_name.middlewares.ProxyMiddleware': 100,
}
```
Use the crawler proxy by setting environment variables
Windows
```
C:\> set http_proxy=http://username:password@ip:port
```
Linux/MacOS
```
csh% setenv http_proxy http://username:password@ip:port

sh$ export http_proxy=http://username:password@ip:port
```
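Once the variable is exported, libraries that honour the standard proxy environment variables pick it up without an explicit proxies argument; for example, requests reads them by default (trust_env=True). A minimal sketch under that assumption:

```python
import os
import requests

# Assumes http_proxy / https_proxy were exported as shown above
print(os.environ.get("http_proxy"))

# No proxies= argument needed: requests takes the proxy from the environment
resp = requests.get("http://httpbin.org/ip")
print(resp.text)
```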
Create a new file (./project_name/tunnel_fix.py)
At the very top of your project code, add `from . import tunnel_fix`
```python
from scrapy.utils.python import to_bytes, to_unicode
import scrapy.core.downloader.handlers.http11
from random import randint

def tunnel_request_data(host, port, proxy_auth_header=None):
    host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
    tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
    tunnel_req += b'Host: ' + host_value + b'\r\n'
    if proxy_auth_header:
        tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
    # Add a Proxy-Tunnel value to the CONNECT request
    proxy_tunnel = '{}'.format(randint(1, 9999))
    tunnel_req += b'Proxy-Tunnel: ' + to_bytes(proxy_tunnel) + b'\r\n'
    tunnel_req += b'\r\n'
    return tunnel_req

# Monkey-patch Scrapy's HTTP/1.1 download handler to use the custom CONNECT request
scrapy.core.downloader.handlers.http11.tunnel_request_data = tunnel_request_data
```
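Where exactly the import goes depends on your project layout; one placement that should work with the standard Scrapy layout is the project's settings.py, since it lives in the same package as tunnel_fix.py and is imported before the downloader starts. A minimal sketch (the module and bot names are placeholders):

```python
# ./project_name/settings.py -- assumed placement, standard Scrapy layout
from . import tunnel_fix  # importing applies the Proxy-Tunnel monkey patch

BOT_NAME = "project_name"
SPIDER_MODULES = ["project_name.spiders"]
NEWSPIDER_MODULE = "project_name.spiders"
```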
aiohttp
```python
import aiohttp, asyncio
import random

def main():
    targetUrl = "https://httpbin.org/headers"

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy authentication
    proxyUser = "username"
    proxyPass = "password"

    proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    userAgent = "Chrome/83.0.4103.61"
    # Use the same Proxy-Tunnel for all requests to keep a fixed IP
    # proxy_tunnel = "{}".format(random.randint(1, 10000))

    async def entry():
        async with aiohttp.ClientSession(headers={"User-Agent": userAgent}) as session:
            while True:
                # Set a random Proxy-Tunnel per request to use a random IP
                proxy_tunnel = "{}".format(random.randint(1, 10000))
                async with session.get(targetUrl, proxy=proxyServer,
                                       proxy_headers={"Proxy-Tunnel": proxy_tunnel}) as resp:
                    body = await resp.read()
                    print(resp.status)
                    print(body)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(entry())
    loop.run_forever()

if __name__ == '__main__':
    main()
```
Notes
- aiohttp keeps a TCP connection pool. If you neither set a random Proxy-Tunnel nor close the TCP connection, multiple requests will keep going out from the same IP without switching.
- To switch IPs, create a new session for each request and set the connector_owner parameter so that closing the session also closes the connection (see the sketch after this list).
- Alternatively, set Proxy-Tunnel to a random number on each request to use a random IP.
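A minimal sketch of the "new session per request" approach described above; the function names and the three-request loop are illustrative only, and the credentials are placeholders in the same format as the earlier examples:

```python
import asyncio
import aiohttp

async def fetch_once(url, proxy_server):
    # A fresh connector + session for each request; connector_owner=True means the
    # session also closes the connector, so the next request starts from a new
    # TCP connection (and, with this proxy, potentially a new IP).
    connector = aiohttp.TCPConnector(force_close=True)
    async with aiohttp.ClientSession(connector=connector, connector_owner=True) as session:
        async with session.get(url, proxy=proxy_server) as resp:
            return resp.status, await resp.text()

async def demo():
    proxy_server = "http://username:password@t.16yun.cn:31111"
    for _ in range(3):
        status, body = await fetch_once("https://httpbin.org/ip", proxy_server)
        print(status, body)

asyncio.run(demo())
```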
httpx
```python
import asyncio
import httpx

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy authentication
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Route both http and https traffic through the HTTP proxy
proxies = {
    "http://": proxyMeta,
    "https://": proxyMeta,
}

client = httpx.AsyncClient(proxies=proxies)

# To enable HTTP/2 support, install with: pip install httpx[http2]
# client = httpx.AsyncClient(http2=True, proxies=proxies)

async def test():
    resp = await client.get("https://httpbin.org/ip")
    print(resp.http_version)
    print(resp.text)

asyncio.run(test())
```