Java 爬虫代理示例

阅读模式

代码示例说明

  1. 代码样例不能直接运行,请替换成您自己的代理信息。
  2. 在不同编程语言的代码示例中,需注意其环境版本。
  3. 示例代码使用遇到问题请联系我们,我们会为您提供技术支持。

通过代理访问HTTP2网站

需要保证JDK的版本支持HTTP2网站的访问,Java 9及以上版本才能完整支持

407错误

// Change in Java 8 Update 111 以上版本需要下面代码
System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false");
System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");

HttpClient

1
import org.apache.commons.httpclient.Credentials;
2
import org.apache.commons.httpclient.HostConfiguration;
3
import org.apache.commons.httpclient.HttpClient;
4
import org.apache.commons.httpclient.HttpMethod;
5
import org.apache.commons.httpclient.HttpStatus;
6
import org.apache.commons.httpclient.UsernamePasswordCredentials;
7
import org.apache.commons.httpclient.auth.AuthScope;
8
import org.apache.commons.httpclient.methods.GetMethod;
9
10
import java.io.IOException;
11
12
public class Main {
13
# 代理服务器(产品官网 www.16yun.cn)
14
private static final String PROXY_HOST = "t.16yun.cn";
15
private static final int PROXY_PORT = 31111;
16
17
public static void main(String[] args) {
18
HttpClient client = new HttpClient();
19
HttpMethod method = new GetMethod("https://httpbin.org/ip");
20
21
HostConfiguration config = client.getHostConfiguration();
22
config.setProxy(PROXY_HOST, PROXY_PORT);
23
24
client.getParams().setAuthenticationPreemptive(true);
25
26
String username = "16ABCCKJ";
27
String password = "712323";
28
Credentials credentials = new UsernamePasswordCredentials(username, password);
29
AuthScope authScope = new AuthScope(PROXY_HOST, PROXY_PORT);
30
31
client.getState().setProxyCredentials(authScope, credentials);
32
33
try {
34
client.executeMethod(method);
35
36
if (method.getStatusCode() == HttpStatus.SC_OK) {
37
String response = method.getResponseBodyAsString();
38
System.out.println("Response = " + response);
39
}
40
} catch (IOException e) {
41
e.printStackTrace();
42
} finally {
43
method.releaseConnection();
44
}
45
}
46
}

JSoup

1
import java.io.IOException;
2
import java.net.Authenticator;
3
import java.net.InetSocketAddress;
4
import java.net.PasswordAuthentication;
5
import java.net.Proxy;
6
7
import org.jsoup.Jsoup;
8
import org.jsoup.nodes.Document;
9
10
11
public class Demo
12
{
13
// 代理验证信息
14
final static String ProxyUser = "username";
15
final static String ProxyPass = "password";
16
17
// 代理服务器(产品官网 www.16yun.cn)
18
final static String ProxyHost = "t.16yun.cn";
19
final static Integer ProxyPort = 31111;
20
21
// 设置IP切换头
22
final static String ProxyHeadKey = "Proxy-Tunnel";
23
24
25
public static String getUrlProxyContent(String url)
26
{
27
Authenticator.setDefault(new Authenticator() {
28
public PasswordAuthentication getPasswordAuthentication()
29
{
30
return new PasswordAuthentication(ProxyUser, ProxyPass.toCharArray());
31
}
32
});
33
// 设置Proxy-Tunnel
34
Random random = new Random();
35
int tunnel = random.nextInt(10000);
36
String ProxyHeadVal = String.valueOf(tunnel);
37
38
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ProxyHost, ProxyPort));
39
40
try
41
{
42
// 处理异常、其他参数
43
Document doc = Jsoup.connect(url).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).proxy(proxy).get();
44
45
if(doc != null) {
46
System.out.println(doc.body().html());
47
}
48
}
49
catch (IOException e)
50
{
51
e.printStackTrace();
52
}
53
54
return null;
55
}
56
57
public static void main(String[] args) throws Exception
58
{
59
// 要访问的目标页面
60
String targetUrl = "http://httpbin.org/ip";
61
62
63
getUrlProxyContent(targetUrl);
64
}
65
}

JSoup无法使用Keep-alive

  • JSoup默认会关闭连接.
  • 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
  • 访问HTTPS网站请使用其他库,保持相同的外网IP.
1
import java.io.IOException;
2
import org.jsoup.Jsoup;
3
import org.jsoup.nodes.Document;
4
5
public class Demo {
6
7
public static void main(String[] args) {
8
9
try{
10
11
// 代理服务器(产品官网 www.16yun.cn)
12
final static String ProxyHost = "t.16yun.cn";
13
final static String ProxyPort = "31111";
14
15
System.setProperty("http.proxyHost", ProxyHost);
16
System.setProperty("https.proxyHost", ProxyHost);
17
18
System.setProperty("http.proxyPort", ProxyPort);
19
System.setProperty("https.proxyPort", ProxyPort);
20
21
// 代理验证信息
22
final static String ProxyUser = "username";
23
final static String ProxyPass = "password";
24
25
System.setProperty("http.proxyUser", ProxyUser);
26
System.setProperty("http.proxyPassword", ProxyPass);
27
28
System.setProperty("https.proxyUser", ProxyUser);
29
System.setProperty("https.proxyPassword", ProxyPass);
30
31
// 设置IP切换头
32
final static String ProxyHeadKey = "Proxy-Tunnel";
33
34
// 设置Proxy-Tunnel
35
Random random = new Random();
36
int tunnel = random.nextInt(10000);
37
String ProxyHeadVal = String.valueOf(tunnel);
38
39
// 处理异常、其他参数
40
Document doc = Jsoup.connect(url).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).get();
41
42
if(doc != null) {
43
System.out.println(doc.body().html());
44
}
45
46
}catch (IOException e)
47
{
48
e.printStackTrace();
49
}
50
51
}
52
}

JSoup无法使用Keep-alive

  • JSoup默认会关闭连接.
  • 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
  • 访问HTTPS网站请使用其他库,保持相同的外网IP.

HttpURLConnection Connection

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.Authenticator;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/**
 * Authenticator that always answers with one fixed user name and password.
 */
class ProxyAuthenticator extends Authenticator {
    private final String user;
    private final String password;

    /**
     * @param user     proxy user name
     * @param password proxy password
     */
    public ProxyAuthenticator(String user, String password) {
        this.user = user;
        this.password = password;
    }

    @Override
    protected PasswordAuthentication getPasswordAuthentication() {
        return new PasswordAuthentication(user, password.toCharArray());
    }
}

/**
 * Performs one HTTP GET through an authenticated proxy with
 * {@link HttpURLConnection} and prints the response body.
 *
 * <p>Note from the original sample: this only issues a single stateless
 * request. According to the vendor, each request may leave through a
 * different IP; when used from several threads, embed this code in your own
 * logic and track IP usage yourself if duplicates matter.
 */
public class Demo {
    public static void main(String args[]) throws Exception {
        // Needed on Java 8 Update 111 and later:
        // System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false");
        // System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");

        // Target page to visit
        String targetUrl = "http://httpbin.org/ip";

        // Proxy server (vendor site: www.16yun.cn)
        String proxyServer = "t.16yun.cn";
        int proxyPort = 31111;

        // Proxy credentials
        String proxyUser = "username";
        String proxyPass = "password";

        try {
            URL url = new URL(targetUrl);

            Authenticator.setDefault(new ProxyAuthenticator(proxyUser, proxyPass));

            // Address of the proxy server
            InetSocketAddress addr = new InetSocketAddress(proxyServer, proxyPort);
            // HTTP-type proxy bound to that address
            Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);

            // Open the target page through the proxy
            HttpURLConnection connection = (HttpURLConnection) url.openConnection(proxy);
            try {
                // Optional: keep-alive
                // connection.setRequestProperty("Connection", "keep-alive");
                // connection.setRequestProperty("Keep-Alive", "timeout=5, max=100");

                // Optional: Proxy-Tunnel header
                // Random random = new Random();
                // int tunnel = random.nextInt(10000);
                // connection.setRequestProperty("Proxy-Tunnel",String.valueOf(tunnel));

                // Read and print the response
                byte[] response = readStream(connection.getInputStream());

                // Decode explicitly rather than with the platform default charset.
                System.out.println(new String(response, StandardCharsets.UTF_8));
            } finally {
                connection.disconnect();
            }
        } catch (Exception e) {
            // Print the exception itself: its message alone may be null.
            System.err.println("Request failed: " + e);
        }
    }

    /**
     * Reads the stream to its end and returns everything that was read.
     * The stream is closed before returning, including when reading fails.
     *
     * @param inStream the stream to drain
     * @return all bytes read from {@code inStream}
     * @throws Exception if reading from the stream fails
     */
    public static byte[] readStream(InputStream inStream) throws Exception {
        try (InputStream in = inStream;
             ByteArrayOutputStream outStream = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int len;

            while ((len = in.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }

            return outStream.toByteArray();
        }
    }
}

Htmlunit

1
package htmlunit;
2
3
import org.apache.http.auth.AuthScope;
4
import org.apache.http.auth.UsernamePasswordCredentials;
5
import org.apache.http.client.CredentialsProvider;
6
import org.apache.http.impl.client.BasicCredentialsProvider;
7
8
import com.gargoylesoftware.htmlunit.BrowserVersion;
9
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
10
import com.gargoylesoftware.htmlunit.WebClient;
11
import com.gargoylesoftware.htmlunit.html.HtmlPage;
12
13
public class HtmlunitDemo {
14
// 代理服务器(产品官网 www.16yun.cn)
15
final static String proxyHost = "t.16yun.cn";
16
final static Integer proxyPort = 31111;
17
18
// 代理验证信息
19
final static String proxyUser = "USERNAME";
20
final static String proxyPass = "PASSWORD";
21
22
public static void main(String[] args) {
23
24
CredentialsProvider credsProvider = new BasicCredentialsProvider();
25
credsProvider.setCredentials(
26
27
new AuthScope(proxyHost, proxyPort),
28
new UsernamePasswordCredentials(proxyUser, proxyPass));
29
30
WebClient webClient = new WebClient(BrowserVersion.CHROME,proxyHost, proxyPort);
31
32
webClient.setCredentialsProvider(credsProvider);
33
34
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
35
webClient.getOptions().setJavaScriptEnabled(true);
36
webClient.getOptions().setThrowExceptionOnScriptError(false);
37
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
38
webClient.getOptions().setActiveXNative(false);
39
webClient.getOptions().setCssEnabled(false);
40
41
HtmlPage page = null;
42
43
try {
44
page = webClient.getPage("http://httpbin.org/ip");
45
} catch (Exception e) {
46
e.printStackTrace();
47
} finally {
48
webClient.close();
49
}
50
51
webClient.waitForBackgroundJavaScript(30000);
52
53
54
String pageXml = page.asXml();
55
56
System.out.println(pageXml);
57
}
58
}

Okhttp

1
2
import okhttp3.*;
3
4
import java.io.IOException;
5
import java.net.InetSocketAddress;
6
import java.net.Proxy;
7
import java.util.concurrent.TimeUnit;
8
9
public class OkHttp {
10
11
// 代理服务器(产品官网 www.16yun.cn)
12
final static String proxyHost = "t.16yun.cn";
13
final static Integer proxyPort = 31111;
14
15
// 代理验证信息
16
final static String proxyUser = "USERNAME";
17
final static String proxyPass = "PASSWORD";
18
19
static OkHttpClient client = null;
20
21
static {
22
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));
23
24
Authenticator proxyAuthenticator = new Authenticator() {
25
public Request authenticate(Route route, Response response) {
26
String credential = Credentials.basic(proxyUser, proxyPass);
27
return response.request().newBuilder()
28
.header("Proxy-Authorization", credential)
29
.build();
30
}
31
};
32
33
client = new OkHttpClient().newBuilder()
34
.connectTimeout(5, TimeUnit.SECONDS)
35
.readTimeout(5, TimeUnit.SECONDS)
36
.proxy(proxy)
37
.proxyAuthenticator(proxyAuthenticator)
38
.connectionPool(new ConnectionPool(5, 1, TimeUnit.SECONDS))
39
.build();
40
}
41
42
public static Response doGet() throws IOException {
43
// 要访问的目标页面
44
String targetUrl = "http://httpbin.org/ip";
45
46
Request request = new Request.Builder()
47
.url(targetUrl)
48
.build();
49
Response response = client.newCall(request).execute();
50
return response;
51
}
52
53
public static void main(String[] args) throws IOException {
54
Response response1 = doGet();
55
System.out.println("GET请求返回结果:");
56
System.out.println(response1.body().string());
57
}
58
59
}

Selenium(Java 示例)

HtmlUnitDriver(Java)

1
import org.json.JSONException;
2
import org.json.JSONObject;
3
import org.openqa.selenium.Platform;
4
import org.openqa.selenium.Proxy;
5
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
6
import org.openqa.selenium.remote.CapabilityType;
7
import org.openqa.selenium.remote.DesiredCapabilities;
8
9
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
10
import com.gargoylesoftware.htmlunit.WebClient;
11
12
public class HtmlUnitDriverProxyDemo
13
{
14
// 代理验证信息
15
final static String proxyUser = "username";
16
final static String proxyPass = "password";
17
18
// 代理服务器
19
final static String proxyServer = "t.16yun.cn:31111";
20
21
public static void main(String[] args) throws JSONException
22
{
23
HtmlUnitDriver driver = getHtmlUnitDriver();
24
25
driver.get("https://httpbin.org/ip");
26
27
String title = driver.getTitle();
28
System.out.println(title);
29
}
30
31
public static HtmlUnitDriver getHtmlUnitDriver()
32
{
33
HtmlUnitDriver driver = null;
34
35
Proxy proxy = new Proxy();
36
37
proxy.setHttpProxy(proxyServer);
38
39
DesiredCapabilities capabilities = DesiredCapabilities.htmlUnit();
40
capabilities.setCapability(CapabilityType.PROXY, proxy);
41
capabilities.setJavascriptEnabled(true);
42
capabilities.setPlatform(Platform.WIN8_1);
43
44
driver = new HtmlUnitDriver(capabilities) {
45
@Override
46
protected WebClient modifyWebClient(WebClient client) {
47
DefaultCredentialsProvider creds = new DefaultCredentialsProvider();
48
creds.addCredentials(proxyUser, proxyPass);
49
client.setCredentialsProvider(creds);
50
return client;
51
}
52
};
53
54
driver.setJavascriptEnabled(true);
55
56
return driver;
57
}
58
}

Firefox(Java)

1
import org.json.JSONException;
2
import org.json.JSONObject;
3
import org.openqa.selenium.Platform;
4
import org.openqa.selenium.Proxy;
5
import org.openqa.selenium.firefox.FirefoxDriver;
6
import org.openqa.selenium.firefox.FirefoxProfile;
7
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
8
import org.openqa.selenium.remote.CapabilityType;
9
import org.openqa.selenium.remote.DesiredCapabilities;
10
11
import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
12
import com.gargoylesoftware.htmlunit.WebClient;
13
14
public class FirefoxDriverProxyDemo
15
{
16
// 代理隧道验证信息
17
final static String proxyUser = "username";
18
final static String proxyPass = "password";
19
20
// 代理服务器
21
final static String proxyHost = "t.16yun.cn";
22
final static int proxyPort = 31111;
23
24
final static String firefoxBin = "C:/Program Files/Mozilla Firefox/firefox.exe";
25
26
public static void main(String[] args) throws JSONException
27
{
28
System.setProperty("webdriver.firefox.bin", firefoxBin);
29
30
FirefoxProfile profile = new FirefoxProfile();
31
32
profile.setPreference("network.proxy.type", 1);
33
34
35
profile.setPreference("network.proxy.http", proxyHost);
36
profile.setPreference("network.proxy.http_port", proxyPort);
37
38
profile.setPreference("network.proxy.ssl", proxyHost);
39
profile.setPreference("network.proxy.ssl_port", proxyPort);
40
41
profile.setPreference("username", proxyUser);
42
profile.setPreference("password", proxyPass);
43
44
45
profile.setPreference("network.proxy.share_proxy_settings", true);
46
47
48
profile.setPreference("network.proxy.no_proxies_on", "localhost");
49
50
51
FirefoxDriver driver = new FirefoxDriver(profile);
52
}
53
}