Java 爬虫代理示例
代码示例说明
- 代码样例不能直接运行,请替换成您自己的代理信息。
- 在不同编程语言的代码示例中,需注意其环境版本。
- 使用示例代码遇到问题请联系我们,我们会为您提供技术支持。
通过代理访问HTTP2网站
需要保证JDK的版本支持HTTP2网站的访问,Java 9 及以上版本才能完整支持
407错误
// Change in Java 8 Update 111 以上版本需要下面代码 System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false"); System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");
HttpClient
1import org.apache.commons.httpclient.Credentials;2import org.apache.commons.httpclient.HostConfiguration;3import org.apache.commons.httpclient.HttpClient;4import org.apache.commons.httpclient.HttpMethod;5import org.apache.commons.httpclient.HttpStatus;6import org.apache.commons.httpclient.UsernamePasswordCredentials;7import org.apache.commons.httpclient.auth.AuthScope;8import org.apache.commons.httpclient.methods.GetMethod;910import java.io.IOException;1112public class Main {13# 代理服务器(产品官网 www.16yun.cn)14private static final String PROXY_HOST = "t.16yun.cn";15private static final int PROXY_PORT = 31111;1617public static void main(String[] args) {18HttpClient client = new HttpClient();19HttpMethod method = new GetMethod("https://httpbin.org/ip");2021HostConfiguration config = client.getHostConfiguration();22config.setProxy(PROXY_HOST, PROXY_PORT);2324client.getParams().setAuthenticationPreemptive(true);2526String username = "16ABCCKJ";27String password = "712323";28Credentials credentials = new UsernamePasswordCredentials(username, password);29AuthScope authScope = new AuthScope(PROXY_HOST, PROXY_PORT);3031client.getState().setProxyCredentials(authScope, credentials);3233try {34client.executeMethod(method);3536if (method.getStatusCode() == HttpStatus.SC_OK) {37String response = method.getResponseBodyAsString();38System.out.println("Response = " + response);39}40} catch (IOException e) {41e.printStackTrace();42} finally {43method.releaseConnection();44}45}46}
JSoup
1import java.io.IOException;2import java.net.Authenticator;3import java.net.InetSocketAddress;4import java.net.PasswordAuthentication;5import java.net.Proxy;67import org.jsoup.Jsoup;8import org.jsoup.nodes.Document;91011public class Demo12{13// 代理验证信息14final static String ProxyUser = "username";15final static String ProxyPass = "password";1617// 代理服务器(产品官网 www.16yun.cn)18final static String ProxyHost = "t.16yun.cn";19final static Integer ProxyPort = 31111;2021// 设置IP切换头22final static String ProxyHeadKey = "Proxy-Tunnel";232425public static String getUrlProxyContent(String url)26{27Authenticator.setDefault(new Authenticator() {28public PasswordAuthentication getPasswordAuthentication()29{30return new PasswordAuthentication(ProxyUser, ProxyPass.toCharArray());31}32});33// 设置Proxy-Tunnel34Random random = new Random();35int tunnel = random.nextInt(10000);36String ProxyHeadVal = String.valueOf(tunnel);3738Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ProxyHost, ProxyPort));3940try41{42// 处理异常、其他参数43Document doc = Jsoup.connect(url).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).proxy(proxy).get();4445if(doc != null) {46System.out.println(doc.body().html());47}48}49catch (IOException e)50{51e.printStackTrace();52}5354return null;55}5657public static void main(String[] args) throws Exception58{59// 要访问的目标页面60String targetUrl = "http://httpbin.org/ip";616263getUrlProxyContent(targetUrl);64}65}
JSoup无法使用Keep-alive
- JSoup默认会关闭连接.
- 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
- 访问HTTPS网站请使用其他库,保持相同的外网IP.
1import java.io.IOException;2import org.jsoup.Jsoup;3import org.jsoup.nodes.Document;45public class Demo {67public static void main(String[] args) {89try{1011// 代理服务器(产品官网 www.16yun.cn)12final static String ProxyHost = "t.16yun.cn";13final static String ProxyPort = "31111";1415System.setProperty("http.proxyHost", ProxyHost);16System.setProperty("https.proxyHost", ProxyHost);1718System.setProperty("http.proxyPort", ProxyPort);19System.setProperty("https.proxyPort", ProxyPort);2021// 代理验证信息22final static String ProxyUser = "username";23final static String ProxyPass = "password";2425System.setProperty("http.proxyUser", ProxyUser);26System.setProperty("http.proxyPassword", ProxyPass);2728System.setProperty("https.proxyUser", ProxyUser);29System.setProperty("https.proxyPassword", ProxyPass);3031// 设置IP切换头32final static String ProxyHeadKey = "Proxy-Tunnel";3334// 设置Proxy-Tunnel35Random random = new Random();36int tunnel = random.nextInt(10000);37String ProxyHeadVal = String.valueOf(tunnel);3839// 处理异常、其他参数40Document doc = Jsoup.connect(url).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).get();4142if(doc != null) {43System.out.println(doc.body().html());44}4546}catch (IOException e)47{48e.printStackTrace();49}5051}52}
JSoup无法使用Keep-alive
- JSoup默认会关闭连接.
- 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
- 访问HTTPS网站请使用其他库,保持相同的外网IP.
HttpURLConnection Connection
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.Authenticator;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/** Supplies a fixed user name and password whenever authentication is requested. */
class ProxyAuthenticator extends Authenticator {
    private String user, password;

    public ProxyAuthenticator(String user, String password) {
        this.user = user;
        this.password = password;
    }

    @Override
    protected PasswordAuthentication getPasswordAuthentication() {
        return new PasswordAuthentication(user, password.toCharArray());
    }
}

/**
 * Fetches a page through an authenticated HTTP proxy with HttpURLConnection.
 *
 * Note: this code only issues a single stateless HTTP request. The outgoing IP
 * is switched for this request, and the next request's IP will change again.
 * For multi-threaded use, embed the code in your own business logic and every
 * request will go out with a new IP; if you are worried about repeated IPs,
 * track the IPs you have used and validate them yourself.
 */
public class Demo {
    public static void main(String args[]) throws Exception {
        // Needed on Java 8 Update 111 and later:
        // System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false");
        // System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");

        // Target page
        String targetUrl = "http://httpbin.org/ip";

        // Proxy server (vendor site: www.16yun.cn)
        String proxyServer = "t.16yun.cn";
        int proxyPort = 31111;

        // Proxy credentials
        String proxyUser = "username";
        String proxyPass = "password";

        HttpURLConnection connection = null;
        try {
            URL url = new URL(targetUrl);

            Authenticator.setDefault(new ProxyAuthenticator(proxyUser, proxyPass));

            // Address of the proxy server
            InetSocketAddress addr = new InetSocketAddress(proxyServer, proxyPort);
            // HTTP-type proxy object
            Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);

            // Open the target page through the proxy
            connection = (HttpURLConnection) url.openConnection(proxy);

            // Optional: KeepAlive
            // connection.setRequestProperty("Connection", "keep-alive");
            // connection.setRequestProperty("Keep-Alive", "timeout=5, max=100");

            // Optional: Proxy-Tunnel
            // Random random = new Random();
            // int tunnel = random.nextInt(10000);
            // connection.setRequestProperty("Proxy-Tunnel",String.valueOf(tunnel));

            // Read the response; readStream closes the stream for us.
            byte[] response = readStream(connection.getInputStream());

            // Decode with an explicit charset instead of the platform default.
            System.out.println(new String(response, StandardCharsets.UTF_8));
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getLocalizedMessage());
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads the whole input stream into a byte array and closes the stream,
     * even when a read fails part-way through.
     *
     * @param inStream the stream to drain
     * @return all bytes read from the stream
     * @throws Exception if reading from the stream fails
     */
    public static byte[] readStream(InputStream inStream) throws Exception {
        try (InputStream in = inStream;
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int len;

            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }

            return out.toByteArray();
        }
    }
}
Htmlunit
1package htmlunit;23import org.apache.http.auth.AuthScope;4import org.apache.http.auth.UsernamePasswordCredentials;5import org.apache.http.client.CredentialsProvider;6import org.apache.http.impl.client.BasicCredentialsProvider;78import com.gargoylesoftware.htmlunit.BrowserVersion;9import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;10import com.gargoylesoftware.htmlunit.WebClient;11import com.gargoylesoftware.htmlunit.html.HtmlPage;1213public class HtmlunitDemo {14// 代理服务器(产品官网 www.16yun.cn)15final static String proxyHost = "t.16yun.cn";16final static Integer proxyPort = 31111;1718// 代理验证信息19final static String proxyUser = "USERNAME";20final static String proxyPass = "PASSWORD";2122public static void main(String[] args) {2324CredentialsProvider credsProvider = new BasicCredentialsProvider();25credsProvider.setCredentials(2627new AuthScope(proxyHost, proxyPort),28new UsernamePasswordCredentials(proxyUser, proxyPass));2930WebClient webClient = new WebClient(BrowserVersion.CHROME,proxyHost, proxyPort);3132webClient.setCredentialsProvider(credsProvider);3334webClient.setAjaxController(new NicelyResynchronizingAjaxController());35webClient.getOptions().setJavaScriptEnabled(true);36webClient.getOptions().setThrowExceptionOnScriptError(false);37webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);38webClient.getOptions().setActiveXNative(false);39webClient.getOptions().setCssEnabled(false);4041HtmlPage page = null;4243try {44page = webClient.getPage("http://httpbin.org/ip");45} catch (Exception e) {46e.printStackTrace();47} finally {48webClient.close();49}5051webClient.waitForBackgroundJavaScript(30000);525354String pageXml = page.asXml();5556System.out.println(pageXml);57}58}
Okhttp
12import okhttp3.*;34import java.io.IOException;5import java.net.InetSocketAddress;6import java.net.Proxy;7import java.util.concurrent.TimeUnit;89public class OkHttp {1011// 代理服务器(产品官网 www.16yun.cn)12final static String proxyHost = "t.16yun.cn";13final static Integer proxyPort = 31111;1415// 代理验证信息16final static String proxyUser = "USERNAME";17final static String proxyPass = "PASSWORD";1819static OkHttpClient client = null;2021static {22Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));2324Authenticator proxyAuthenticator = new Authenticator() {25public Request authenticate(Route route, Response response) {26String credential = Credentials.basic(proxyUser, proxyPass);27return response.request().newBuilder()28.header("Proxy-Authorization", credential)29.build();30}31};3233client = new OkHttpClient().newBuilder()34.connectTimeout(5, TimeUnit.SECONDS)35.readTimeout(5, TimeUnit.SECONDS)36.proxy(proxy)37.proxyAuthenticator(proxyAuthenticator)38.connectionPool(new ConnectionPool(5, 1, TimeUnit.SECONDS))39.build();40}4142public static Response doGet() throws IOException {43// 要访问的目标页面44String targetUrl = "http://httpbin.org/ip";4546Request request = new Request.Builder()47.url(targetUrl)48.build();49Response response = client.newCall(request).execute();50return response;51}5253public static void main(String[] args) throws IOException {54Response response1 = doGet();55System.out.println("GET请求返回结果:");56System.out.println(response1.body().string());57}5859}
Selenium(Java 示例)
HtmlUnitDriver(Java)
1import org.json.JSONException;2import org.json.JSONObject;3import org.openqa.selenium.Platform;4import org.openqa.selenium.Proxy;5import org.openqa.selenium.htmlunit.HtmlUnitDriver;6import org.openqa.selenium.remote.CapabilityType;7import org.openqa.selenium.remote.DesiredCapabilities;89import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;10import com.gargoylesoftware.htmlunit.WebClient;1112public class HtmlUnitDriverProxyDemo13{14// 代理验证信息15final static String proxyUser = "username";16final static String proxyPass = "password";1718// 代理服务器19final static String proxyServer = "t.16yun.cn:31111";2021public static void main(String[] args) throws JSONException22{23HtmlUnitDriver driver = getHtmlUnitDriver();2425driver.get("https://httpbin.org/ip");2627String title = driver.getTitle();28System.out.println(title);29}3031public static HtmlUnitDriver getHtmlUnitDriver()32{33HtmlUnitDriver driver = null;3435Proxy proxy = new Proxy();3637proxy.setHttpProxy(proxyServer);3839DesiredCapabilities capabilities = DesiredCapabilities.htmlUnit();40capabilities.setCapability(CapabilityType.PROXY, proxy);41capabilities.setJavascriptEnabled(true);42capabilities.setPlatform(Platform.WIN8_1);4344driver = new HtmlUnitDriver(capabilities) {45@Override46protected WebClient modifyWebClient(WebClient client) {47DefaultCredentialsProvider creds = new DefaultCredentialsProvider();48creds.addCredentials(proxyUser, proxyPass);49client.setCredentialsProvider(creds);50return client;51}52};5354driver.setJavascriptEnabled(true);5556return driver;57}58}
Firefox(Java)
1import org.json.JSONException;2import org.json.JSONObject;3import org.openqa.selenium.Platform;4import org.openqa.selenium.Proxy;5import org.openqa.selenium.firefox.FirefoxDriver;6import org.openqa.selenium.firefox.FirefoxProfile;7import org.openqa.selenium.htmlunit.HtmlUnitDriver;8import org.openqa.selenium.remote.CapabilityType;9import org.openqa.selenium.remote.DesiredCapabilities;1011import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;12import com.gargoylesoftware.htmlunit.WebClient;1314public class FirefoxDriverProxyDemo15{16// 代理隧道验证信息17final static String proxyUser = "username";18final static String proxyPass = "password";1920// 代理服务器21final static String proxyHost = "t.16yun.cn";22final static int proxyPort = 31111;2324final static String firefoxBin = "C:/Program Files/Mozilla Firefox/firefox.exe";2526public static void main(String[] args) throws JSONException27{28System.setProperty("webdriver.firefox.bin", firefoxBin);2930FirefoxProfile profile = new FirefoxProfile();3132profile.setPreference("network.proxy.type", 1);333435profile.setPreference("network.proxy.http", proxyHost);36profile.setPreference("network.proxy.http_port", proxyPort);3738profile.setPreference("network.proxy.ssl", proxyHost);39profile.setPreference("network.proxy.ssl_port", proxyPort);4041profile.setPreference("username", proxyUser);42profile.setPreference("password", proxyPass);434445profile.setPreference("network.proxy.share_proxy_settings", true);464748profile.setPreference("network.proxy.no_proxies_on", "localhost");495051FirefoxDriver driver = new FirefoxDriver(profile);52}53}