JavaScript/Node.js 爬虫代理示例

阅读模式

代码示例说明

  1. 代码样例不能直接运行,请替换成您自己的代理信息。
  2. 在不同编程语言的代码示例中,需注意其环境版本。
  3. 示例代码使用遇到问题请联系我们,我们会为您提供技术支持。

Node.js

// Fetch a page through an HTTP proxy using only Node's built-in http module.
// The request is sent to the proxy itself, with the absolute target URL as
// the request path and the credentials in a Proxy-Authorization header.
const http = require("http");
const { URL } = require("url");

// Target page to fetch.
const targetUrl = "http://httpbin.org/ip";
const urlParsed = new URL(targetUrl);

// Proxy server (vendor site: www.16yun.cn).
const proxyHost = "t.16yun.cn";
const proxyPort = 36600;

// Random integer in [0, 10000) sent as the Proxy-Tunnel header.
// The previous seeded generator always started from seed 1, so every run
// produced the same (non-integer) value instead of a random one.
// NOTE(review): the accepted value range is defined by the proxy provider — confirm.
const tunnel = Math.floor(Math.random() * 10000);

// Proxy credentials — replace with your own.
const proxyUser = "username";
const proxyPass = "password";

// Buffer.from is a factory function; it must not be called with `new`.
const base64 = Buffer.from(`${proxyUser}:${proxyPass}`).toString("base64");

const options = {
  host: proxyHost,
  port: proxyPort,
  // A proxied plain-HTTP request carries the absolute URL as its path.
  path: targetUrl,
  method: "GET",
  headers: {
    // `host` (not `hostname`) keeps a non-default port if the target has one.
    "Host": urlParsed.host,
    "Proxy-Tunnel": String(tunnel),
    "Proxy-Authorization": `Basic ${base64}`,
  },
};

http
  .request(options, function (res) {
    console.log("got response: " + res.statusCode);
    res.pipe(process.stdout);
  })
  .on("error", function (err) {
    console.log(err);
  })
  .end();

request

// Fetch a page through an authenticated HTTP proxy using the `request` package.
const request = require("request");

// Target page to fetch.
const targetUrl = "http://httpbin.org/ip";

// Proxy server (vendor site: www.16yun.cn).
const proxyHost = "t.16yun.cn";
const proxyPort = "31111";

// Proxy credentials — replace with your own.
const proxyUser = "username";
const proxyPass = "password";

// Percent-encode the credentials so characters such as "@", ":" or "/" in a
// real username/password cannot corrupt the proxy URL.
const proxyUrl =
  `http://${encodeURIComponent(proxyUser)}:${encodeURIComponent(proxyPass)}` +
  `@${proxyHost}:${proxyPort}`;

const proxiedRequest = request.defaults({ proxy: proxyUrl });

const options = {
  url: targetUrl,
  headers: {},
};

proxiedRequest
  .get(options, function (err, res) {
    // On failure `res` is undefined; reading res.statusCode would throw.
    // The error itself is logged by the "error" listener below.
    if (err) {
      return;
    }
    console.log("got response: " + res.statusCode);
  })
  .on("error", function (err) {
    console.log(err);
  });

superagent

// Fetch a page through an authenticated HTTP proxy using superagent.
const request = require("superagent");

// superagent-proxy patches the superagent module passed to it; the code
// below relies on the `.proxy()` request method it provides.
require("superagent-proxy")(request);

// Target page to fetch.
const targetUrl = "http://httpbin.org/ip";

// Proxy server (vendor site: www.16yun.cn).
const proxyHost = "t.16yun.cn";
const proxyPort = 31111;

// Proxy credentials — replace with your own.
const proxyUser = "username";
const proxyPass = "password";

// Percent-encode the credentials so characters such as "@", ":" or "/" in a
// real username/password cannot corrupt the proxy URL.
const proxyUrl =
  `http://${encodeURIComponent(proxyUser)}:${encodeURIComponent(proxyPass)}` +
  `@${proxyHost}:${proxyPort}`;

request
  .get(targetUrl)
  .proxy(proxyUrl)
  .end(function onResponse(err, res) {
    if (err) {
      return console.log(err);
    }

    console.log(res.status, res.headers);
    console.log(res.text);
  });

axios

// Fetch a page through an authenticated HTTP proxy using axios.
const axios = require('axios');

// Target pages. Only the plain-HTTP URL is requested below.
const targetUrl = "http://httpbin.org/ip";
const targetHttpsUrl = "https://httpbin.org/ip";

// Proxy server (vendor site: www.16yun.cn).
const proxyHost = "t.16yun.cn";
const proxyPort = 31111;

// Proxy credentials — replace with your own.
const proxyUser = "username";
const proxyPass = "password";

// Proxy settings object handed to axios via the `proxy` request option.
const proxy = {
  host: proxyHost,
  port: proxyPort,
  auth: {
    username: proxyUser,
    password: proxyPass,
  },
};

(async () => {
  try {
    const response = await axios.get(targetUrl, { proxy });
    // Success: print the response body.
    console.log(response.data);
  } catch (error) {
    // Failure: print the error.
    console.log(error);
  }
})();

// axios support for HTTPS targets through a proxy is buggy; not recommended.
// See https://github.com/axios/axios/issues/4531

浏览器与自动化(JS)

PhantomJS

以参数方式传递代理信息,示例如下:

1
# Run http-demo.js with PhantomJS, passing the proxy address and credentials as
# command-line options. Replace USERNAME:PASSWORD with your own proxy credentials.
phantomjs --proxy-auth=USERNAME:PASSWORD --proxy=http://t.16yun.cn:31111 --ignore-ssl-errors=true http-demo.js

http-demo.js 内容如下:

// PhantomJS script: opens a page and prints the received response headers and
// the page content. Proxy settings are supplied on the phantomjs command line.
var page = require('webpage').create();
page.settings.userAgent = 'Mozilla/5.0 UCBrowser/9.4.1.362 U3/0.8.0 Mobile Safari/533.1';

console.log('The user agent is ' + page.settings.userAgent);

// Random integer in [0, 10000) for an optional proxy-tunnel header.
// The previous seeded generator always started from seed 1, so every run
// produced the same (non-integer) value instead of a random one.
// NOTE(review): the accepted value range is defined by the proxy provider — confirm.
var tunnel = Math.floor(Math.random() * 10000);

// Optional: uncomment to send the tunnel value with every request.
//page.customHeaders = {
//  "proxy-tunnel": String(tunnel)
//};

// Print every header of each received resource.
page.onResourceReceived = function (resource) {
  resource.headers.forEach(function (header) {
    console.log(header.name + ': ' + header.value);
  });
};

page.open("http://httpbin.org/ip", {}, function (status) {
  console.log('status> ' + status);
  console.log(page.content);
  // Exit after a 3-second delay rather than immediately.
  setTimeout(function () {
    phantom.exit();
  }, 3000);
});

CasperJS

以参数方式传递代理信息,示例如下:

1
# Run http-demo.js with CasperJS, passing the proxy address and credentials as
# command-line options. Replace USERNAME:PASSWORD with your own proxy credentials.
casperjs --proxy-auth=USERNAME:PASSWORD --proxy=http://t.16yun.cn:31111 --ignore-ssl-errors=true --ssl-protocol=any http-demo.js

http-demo.js 内容如下:

// CasperJS script: opens a page with custom request headers (including a
// Proxy-Tunnel value) and prints the page content. Proxy settings are
// supplied on the casperjs command line.
var casper = require('casper').create();

// Random integer in [0, 10000) sent as the Proxy-Tunnel header.
// The previous seeded generator always started from seed 1, so every run
// produced the same (non-integer) value instead of a random one.
// NOTE(review): the accepted value range is defined by the proxy provider — confirm.
var tunnel = Math.floor(Math.random() * 10000);

// Install the custom headers once the run has started.
casper.on('started', function () {
  this.page.customHeaders = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:26.0) Gecko/20100101 Firefox/26.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    // Header values are text; send the number as a string.
    "Proxy-Tunnel": String(tunnel)
  };
});

casper.start("http://httpbin.org/headers");

casper.then(function () {
  console.log('First Page: ' + this.page.content);
});

casper.run();

playwright

// Open a page in Chromium through an authenticated proxy using Playwright and
// log every request and response.
const { chromium, webkit, firefox } = require('playwright');

(async () => {
  const browser = await chromium.launch({
    proxy: {
      server: 'http://t.16yun.cn:31111',
      // Proxy credentials — replace with your own.
      username: 'username',
      password: 'password',
    },
  });

  // Close the browser even if navigation fails, so no browser process is
  // left running after an error.
  try {
    const page = await browser.newPage();

    // Subscribe to 'request' and 'response' events.
    page.on('request', (request) =>
      console.log('>>', request.method(), request.url()));
    page.on('response', (response) =>
      console.log('<<', response.status(), response.url()));

    await page.goto('https://httpbin.org/ip');
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Report failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

Puppeteer

// Open a page through an authenticated proxy using Puppeteer, print its
// cookies and save a full-page screenshot.
const puppeteer = require('puppeteer');

// Proxy server (vendor site: www.16yun.cn).
const proxyServer = 'http://t.16yun.cn:31111';

// Proxy credentials — replace with your own.
const username = 'username';
const password = 'password';

(async () => {
  const browser = await puppeteer.launch({
    // NOTE(review): the two sandbox flags are kept from the original sample —
    // confirm they are needed in your environment.
    args: [`--proxy-server=${proxyServer}`, '--no-sandbox', '--disable-setuid-sandbox'],
  });

  // Close the browser even if a step fails, so no browser process is left
  // running after an error.
  try {
    const page = await browser.newPage();
    // Supply the proxy credentials before navigating.
    await page.authenticate({ username, password });
    await page.goto('https://www.baidu.com');

    const cookies = await page.cookies();
    console.log(cookies);

    await page.setViewport({ width: 320, height: 480 });
    await page.screenshot({ path: '/screenshots/full.png', fullPage: true });
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Report failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});