Python Crawler Proxy Examples


Notes on the code examples

  1. The sample code cannot be run as-is; replace the proxy details with your own.
  2. Note the runtime/environment version required by each language's example.
  3. If you run into problems using the sample code, contact us and we will provide technical support.

requests

1
#! -*- encoding:utf-8 -*-
2
3
import requests
4
import random
5
6
# 要访问的目标页面
7
targetUrl = "http://httpbin.org/ip"
8
9
# 要访问的目标HTTPS页面
10
# targetUrl = "https://httpbin.org/ip"
11
12
# 代理服务器(产品官网 www.16yun.cn)
13
proxyHost = "t.16yun.cn"
14
proxyPort = "31111"
15
16
# 代理验证信息
17
proxyUser = "username"
18
proxyPass = "password"
19
20
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
21
"host" : proxyHost,
22
"port" : proxyPort,
23
"user" : proxyUser,
24
"pass" : proxyPass,
25
}
26
27
# 设置 http和https访问都是用HTTP代理
28
proxies = {
29
"http" : proxyMeta,
30
"https" : proxyMeta,
31
}
32
33
34
# 设置IP切换头
35
tunnel = random.randint(1,10000)
36
headers = {"Proxy-Tunnel": str(tunnel)}
37
38
39
40
resp = requests.get(targetUrl, proxies=proxies, headers=headers)
41
42
print resp.status_code
43
print resp.text
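The example above draws one random Proxy-Tunnel value and uses it for a single request. Based on the Proxy-Tunnel behaviour described later in this document (the same value keeps the same outgoing IP, a new value asks for a new IP), here is a minimal sketch, not part of the original example, that reuses one tunnel value across several requests through a requests.Session and then switches per request; host, port, and credentials are the same placeholders used above.

import requests
import random

proxyMeta = "http://username:password@t.16yun.cn:31111"

session = requests.Session()
session.proxies.update({"http": proxyMeta, "https": proxyMeta})

# Reuse one Proxy-Tunnel value: these requests should share the same outgoing IP.
fixed_tunnel = str(random.randint(1, 10000))
for _ in range(3):
    resp = session.get("http://httpbin.org/ip", headers={"Proxy-Tunnel": fixed_tunnel})
    print("fixed tunnel:", resp.text.strip())

# Draw a new Proxy-Tunnel per request to ask the proxy for a different IP each time.
for _ in range(3):
    resp = session.get("http://httpbin.org/ip",
                       headers={"Proxy-Tunnel": str(random.randint(1, 10000))})
    print("random tunnel:", resp.text.strip())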

urllib.request (urllib2)

# -*- coding: utf-8 -*-

from urllib import request

# Target page to access
targetUrl = "http://httpbin.org/ip"

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy credentials
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

proxy_handler = request.ProxyHandler({
    "http": proxyMeta,
    "https": proxyMeta,
})

opener = request.build_opener(proxy_handler)

request.install_opener(opener)
resp = request.urlopen(targetUrl).read()

print(resp)

urllib / urllib2 cannot use Keep-Alive

urllib / urllib2 closes the connection after each HTTP/1.1 request by default. To keep the same outgoing IP across requests, send the same Proxy-Tunnel value on every request, as in the sketch below.
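A minimal sketch, assuming the Proxy-Tunnel header is honoured on plain-HTTP requests forwarded through the proxy (as in the requests example above): one fixed tunnel value is attached to every request made through the opener, so these requests should exit from the same IP.

# -*- coding: utf-8 -*-
from urllib import request
import random

proxyMeta = "http://username:password@t.16yun.cn:31111"
opener = request.build_opener(request.ProxyHandler({
    "http": proxyMeta,
    "https": proxyMeta,
}))

# Pick one tunnel value and attach it to every request made through this opener
tunnel = str(random.randint(1, 10000))
opener.addheaders = [("Proxy-Tunnel", tunnel)]

for _ in range(3):
    print(opener.open("http://httpbin.org/ip").read())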

scrapy

Create a middlewares.py file in the project (./project_name/middlewares.py):

# -*- coding: utf-8 -*-
import base64
import sys
import random

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy credentials
        proxyUser = "username"
        proxyPass = "password"

        # For Scrapy >= 2.6.2 (https://docs.scrapy.org/en/latest/news.html?highlight=2.6.2#scrapy-2-6-2-2022-07-25)
        # no extra auth header is needed; Proxy-Authorization is set automatically from the proxy URL.
        request.meta['proxy'] = "http://{0}:{1}@{2}:{3}".format(proxyUser, proxyPass, proxyHost, proxyPort)

        # For Scrapy < 2.6.2, add the proxy auth header manually:
        # request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)
        # request.headers['Proxy-Authorization'] = 'Basic ' + base64ify(proxyUser + ":" + proxyPass)

        # Set the IP-switching header (as needed)
        # tunnel = random.randint(1, 10000)
        # request.headers['Proxy-Tunnel'] = str(tunnel)

        # Close the TCP connection after each request to force an IP switch on every request
        request.headers['Connection'] = "Close"

Modify the project settings file (./project_name/settings.py):

DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.ProxyMiddleware': 100,
}

Add the proxy information to the Splash call in start_requests:

# Module-level imports assumed by this snippet:
#   from scrapy_splash import SplashRequest
#   from scrapy.exceptions import CloseSpider
def start_requests(self):
    script = '''
    function main(splash)
        local url = splash.args.url
        assert(splash:go(url))
        assert(splash:wait(0.5))
        local entries = splash:history()
        local last_response = entries[#entries].response
        return {
            url = splash:url(),
            http_status = last_response.status,
            cookies = splash:get_cookies(),
            html = splash:html(),
            headers = last_response.headers,
        }
    end
    '''
    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy credentials
    proxyUser = "16111YVL"
    proxyPass = "11111"

    proxy = "http://{}:{}@{}:{}".format(proxyUser, proxyPass, proxyHost, proxyPort)
    try:
        for url in self.start_urls:
            yield SplashRequest(
                url,
                self.parse,
                endpoint="execute",
                args={
                    "lua_source": script,
                    "wait": 5,
                    "timeout": 600,
                    "target_count": self.target_count,  # spider-defined attribute
                    'proxy': proxy
                },
            )
    except Exception:
        raise CloseSpider("Could not load Lua script.")
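SplashRequest comes from the scrapy-splash package, which also needs its own entries in the project settings in addition to the ProxyMiddleware shown below. The following sketch is an assumption based on the scrapy-splash README, not part of the original example; in particular, SPLASH_URL must point at your own Splash instance.

# settings.py additions assumed for scrapy-splash
SPLASH_URL = 'http://localhost:8050'  # your Splash endpoint (assumption)

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'project_name.middlewares.ProxyMiddleware': 100,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'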

Create a middlewares.py file in the project (./project_name/middlewares.py):

# -*- coding: utf-8 -*-
import base64
import sys
import random

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # Proxy server (product site: www.16yun.cn)
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy credentials
        proxyUser = "username"
        proxyPass = "password"

        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

        # Add the proxy auth header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

        # Set the IP-switching header (as needed)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)

To set the proxy for Splash, modify the project settings file (./project_name/settings.py):

DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.ProxyMiddleware': 100,
}

Create a middlewares.py file in the project (./project_name/middlewares.py):

# -*- coding: utf-8 -*-

import websockets
from scrapy.http import HtmlResponse
from logging import getLogger
import asyncio
import pyppeteer
import logging
from concurrent.futures import TimeoutError
import base64
import sys
import random

pyppeteer_level = logging.WARNING
logging.getLogger('websockets.protocol').setLevel(pyppeteer_level)
logging.getLogger('pyppeteer').setLevel(pyppeteer_level)

PY3 = sys.version_info[0] >= 3

def base64ify(bytes_or_str):
    if PY3 and isinstance(bytes_or_str, str):
        input_bytes = bytes_or_str.encode('utf8')
    else:
        input_bytes = bytes_or_str

    output_bytes = base64.urlsafe_b64encode(input_bytes)
    if PY3:
        return output_bytes.decode('ascii')
    else:
        return output_bytes

class ProxyMiddleware(object):
    # One user agent per line
    USER_AGENT = open('useragents.txt').readlines()

    def process_request(self, request, spider):
        # Proxy server
        proxyHost = "t.16yun.cn"
        proxyPort = "31111"

        # Proxy credentials
        proxyUser = "username"
        proxyPass = "password"

        request.meta['proxy'] = "http://{0}:{1}".format(proxyHost, proxyPort)

        # Add the proxy auth header
        encoded_user_pass = base64ify(proxyUser + ":" + proxyPass)
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

        # Set the IP-switching header (as needed)
        tunnel = random.randint(1, 10000)
        request.headers['Proxy-Tunnel'] = str(tunnel)
        request.headers['User-Agent'] = random.choice(self.USER_AGENT).strip()

class PyppeteerMiddleware(object):
    def __init__(self, **args):
        """
        init logger, loop, browser
        :param args:
        """
        self.logger = getLogger(__name__)
        self.loop = asyncio.get_event_loop()
        self.browser = self.loop.run_until_complete(
            pyppeteer.launch(headless=True))
        self.args = args

    def __del__(self):
        """
        close loop
        :return:
        """
        self.loop.close()

    def render(self, url, retries=1, script=None, wait=0.3, scrolldown=False, sleep=0,
               timeout=8.0, keep_page=False):
        """
        render page with pyppeteer
        :param url: page url
        :param retries: max retry times
        :param script: js script to evaluate
        :param wait: number of seconds to wait before loading the page, preventing timeouts
        :param scrolldown: how many times to page down
        :param sleep: how long to sleep after the initial render
        :param timeout: the longest wait time, otherwise raise a timeout error
        :param keep_page: keep the page open instead of closing it
        :return: content, result, status
        """

        # define async render
        async def async_render(url, script, scrolldown, sleep, wait, timeout, keep_page):
            try:
                # basic render
                page = await self.browser.newPage()
                await asyncio.sleep(wait)
                response = await page.goto(url, options={'timeout': int(timeout * 1000)})
                if response.status != 200:
                    return None, None, response.status
                result = None
                # evaluate with script
                if script:
                    result = await page.evaluate(script)

                # scroll down {scrolldown} times
                if scrolldown:
                    for _ in range(scrolldown):
                        await page._keyboard.down('PageDown')
                        await asyncio.sleep(sleep)
                else:
                    await asyncio.sleep(sleep)
                if scrolldown:
                    await page._keyboard.up('PageDown')

                # get the html of the page
                content = await page.content()

                return content, result, response.status
            except TimeoutError:
                return None, None, 500
            finally:
                # if keep_page, do not close the page
                if not keep_page:
                    await page.close()

        content, result, status = [None] * 3

        # retry up to {retries} times
        for i in range(retries):
            if not content:
                content, result, status = self.loop.run_until_complete(
                    async_render(url=url, script=script, sleep=sleep, wait=wait,
                                 scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
            else:
                break

        # return content together with the js evaluation result and the status
        return content, result, status

    def process_request(self, request, spider):
        """
        :param request: request object
        :param spider: spider object
        :return: HtmlResponse
        """
        if request.meta.get('render'):
            try:
                self.logger.debug('rendering %s', request.url)
                html, result, status = self.render(request.url)
                return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                                    status=status)
            except websockets.exceptions.ConnectionClosed:
                pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(**crawler.settings.get('PYPPETEER_ARGS', {}))

Modify the project settings file (./project_name/settings.py):

DOWNLOADER_MIDDLEWARES = {
    'project_name.middlewares.PyppeteerMiddleware': 543,
    'project_name.middlewares.ProxyMiddleware': 100,
}
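PyppeteerMiddleware only renders requests whose meta contains a truthy render key, so the spider has to opt in per request. A minimal sketch of that opt-in; the spider name and URL are placeholders, not part of the original.

import scrapy

class RenderSpider(scrapy.Spider):
    name = "render_demo"                      # placeholder name
    start_urls = ["http://httpbin.org/ip"]    # placeholder URL

    def start_requests(self):
        for url in self.start_urls:
            # meta={'render': True} is what PyppeteerMiddleware.process_request checks
            yield scrapy.Request(url, callback=self.parse, meta={"render": True})

    def parse(self, response):
        self.logger.info("rendered page length: %d", len(response.text))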

Using the crawler proxy by setting environment variables

Windows

C:\>set http_proxy=http://username:password@ip:port

Linux/MacOS

csh% setenv http_proxy http://username:password@ip:port

sh$ export http_proxy=http://username:password@ip:port
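Once http_proxy (and, for HTTPS targets, https_proxy) is set in the environment, libraries that honour these variables use the proxy without any code changes; requests does this by default. A minimal sketch under that assumption:

import requests

# No proxies= argument: requests reads http_proxy / https_proxy from the environment
# (trust_env is True by default), so this request is sent through the configured proxy.
resp = requests.get("http://httpbin.org/ip")
print(resp.text)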

Create a new file (./project_name/tunnel_fix.py) and put from . import tunnel_fix at the very beginning of the project file (a placement sketch follows the code):

from scrapy.utils.python import to_bytes
import scrapy.core.downloader.handlers.http11
from random import randint

def tunnel_request_data(host, port, proxy_auth_header=None):
    host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
    tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
    tunnel_req += b'Host: ' + host_value + b'\r\n'
    if proxy_auth_header:
        tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
    # Set a random Proxy-Tunnel on the CONNECT request
    proxy_tunnel = '{}'.format(randint(1, 9999))
    tunnel_req += b'Proxy-Tunnel: ' + to_bytes(proxy_tunnel) + b'\r\n'
    tunnel_req += b'\r\n'
    return tunnel_req

# Monkey-patch Scrapy's HTTP/1.1 download handler so every CONNECT carries the header
scrapy.core.downloader.handlers.http11.tunnel_request_data = tunnel_request_data
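Where exactly "the very beginning of the project file" is remains open in the original. One reasonable placement, an assumption rather than something prescribed by the source, is the top of the project's settings.py: Scrapy imports it at startup, before any request is made, so the patch is applied early enough.

# ./project_name/settings.py  (placement is an assumption)
from . import tunnel_fix  # importing the module applies the monkey-patch

BOT_NAME = 'project_name'
# ... the rest of your existing settings ...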

aiohttp

import aiohttp, asyncio
import random

def main():

    targetUrl = "https://httpbin.org/headers"

    # Proxy server (product site: www.16yun.cn)
    proxyHost = "t.16yun.cn"
    proxyPort = "31111"

    # Proxy credentials
    proxyUser = "username"
    proxyPass = "password"

    proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }

    userAgent = "Chrome/83.0.4103.61"
    # Use the same Proxy-Tunnel for every request to keep a fixed IP
    # proxy_tunnel = "{}".format(random.randint(1, 10000))

    async def entry():
        async with aiohttp.ClientSession(headers={"User-Agent": userAgent}) as session:
            while True:
                # Set a random Proxy-Tunnel per request to get a random IP
                proxy_tunnel = "{}".format(random.randint(1, 10000))
                async with session.get(targetUrl, proxy=proxyServer,
                                       proxy_headers={"Proxy-Tunnel": proxy_tunnel}) as resp:
                    body = await resp.read()
                    print(resp.status)
                    print(body)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(entry())
    loop.run_forever()

if __name__ == '__main__':
    main()

Notes

  • The aiohttp library maintains a TCP connection pool; if you neither set a random Proxy-Tunnel nor close the TCP connection, multiple requests will keep going out through the same IP.
  • To switch IPs, create a new session for each request, and set the connector_owner parameter so that the connection is also closed when the session closes (see the sketch below).
  • Alternatively, set Proxy-Tunnel to a random number per request to get a random IP.
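A minimal sketch of the new-session-per-request approach from the list above. Creating a fresh connector per call and passing connector_owner=True (which is also aiohttp's default when the session owns its connector) are illustrative choices; the proxy URL and target are the same placeholders used in the example.

import aiohttp
import asyncio

proxyServer = "http://username:password@t.16yun.cn:31111"
targetUrl = "https://httpbin.org/ip"

async def fetch_once():
    # A fresh connector for each session; connector_owner=True means the session
    # closes the connector (and its TCP connections) when the session is closed,
    # so the next call starts on a new connection and can get a new IP.
    connector = aiohttp.TCPConnector()
    async with aiohttp.ClientSession(connector=connector, connector_owner=True) as session:
        async with session.get(targetUrl, proxy=proxyServer) as resp:
            return await resp.text()

async def main():
    for _ in range(3):
        print(await fetch_once())

asyncio.run(main())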

httpx

import asyncio
import httpx

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy credentials
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Use the HTTP proxy for both http and https traffic
proxies = {
    "http://": proxyMeta,
    "https://": proxyMeta,
}

client = httpx.AsyncClient(proxies=proxies)

# To enable HTTP/2 support, install with: pip install httpx[http2]
# client = httpx.AsyncClient(http2=True, proxies=proxies)

async def test():
    resp = await client.get("https://httpbin.org/ip")
    print(resp.http_version)
    print(resp.text)

asyncio.run(test())
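The example above leaves the AsyncClient open for the life of the process. httpx also lets you use the client as an async context manager so that its pooled connections are closed deterministically; a minimal variant of the same request, assuming the same proxies mapping as above:

import asyncio
import httpx

proxyMeta = "http://username:password@t.16yun.cn:31111"
proxies = {"http://": proxyMeta, "https://": proxyMeta}

async def test():
    # The async-with block closes the client and its pooled connections on exit
    async with httpx.AsyncClient(proxies=proxies) as client:
        resp = await client.get("https://httpbin.org/ip")
        print(resp.http_version)
        print(resp.text)

asyncio.run(test())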