Java 爬虫代理示例

[!NOTE]代码示例说明

  1. 代码样例不能直接运行,请替换成您自己的代理信息。
  2. 在不同编程语言的代码示例中,需注意其环境版本。
  3. 示例代码使用遇到问题请联系售后客服,我们会为您提供技术支持。

[!WARNING] 通过代理访问HTTP2网站

需要保证JDK的版本支持HTTP2网站的访问,Java 9及以上版本才能完整支持。

[!WARNING] 407错误

// Java 8 Update 111 及以上版本需要添加下面的代码
System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false");
System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");

HttpClient

=== "HttpClient3.1"

import org.apache.commons.httpclient.Credentials;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.UsernamePasswordCredentials;
import org.apache.commons.httpclient.auth.AuthScope;
import org.apache.commons.httpclient.methods.GetMethod;

import java.io.IOException;

public class Main {
    # 代理服务器(产品官网 www.16yun.cn)
    private static final String PROXY_HOST = "t.16yun.cn";
    private static final int PROXY_PORT = 31111;

    public static void main(String[] args) {
        HttpClient client = new HttpClient();
        HttpMethod method = new GetMethod("https://httpbin.org/ip");

        HostConfiguration config = client.getHostConfiguration();
        config.setProxy(PROXY_HOST, PROXY_PORT);

        client.getParams().setAuthenticationPreemptive(true);

        String username = "16ABCCKJ";
        String password = "712323";
        Credentials credentials = new UsernamePasswordCredentials(username, password);
        AuthScope authScope = new AuthScope(PROXY_HOST, PROXY_PORT);

        client.getState().setProxyCredentials(authScope, credentials);

        try {
            client.executeMethod(method);

            if (method.getStatusCode() == HttpStatus.SC_OK) {
                String response = method.getResponseBodyAsString();
                System.out.println("Response = " + response);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            method.releaseConnection();
        }
    }
}

=== "HttpClient4.x"


//*感谢 “情歌”提供的代码

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.NameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Apache HttpClient 4.x example: sends a GET request through an authenticated
 * HTTP proxy using a pooled connection manager and prints the status code and
 * response body.
 *
 * <p>The proxy host, port, user name and password are placeholders and must
 * be replaced with real proxy details before running.
 */
public class Demo
{
    // Proxy server (product site: www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    // NOTE(review): the other examples on this page use port 31111 - confirm which port applies.
    final static Integer proxyPort = 31000;

    // Proxy credentials
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    private static PoolingHttpClientConnectionManager cm = null;
    // Currently unused by this example.
    private static HttpRequestRetryHandler httpRequestRetryHandler = null;
    private static HttpHost proxy = null;

    private static CredentialsProvider credsProvider = null;
    private static RequestConfig reqConfig = null;

    static {
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();

        // Typed registry (the original used a raw Registry, producing unchecked warnings).
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
            .register("http", plainsf)
            .register("https", sslsf)
            .build();

        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        proxy = new HttpHost(proxyHost, proxyPort, "http");

        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass));

        // Reuse the same HttpHost instance that is registered in the auth cache below.
        reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(proxy)
            .build();
    }

    /**
     * Executes the given request through the proxy and prints the status code
     * followed by the response body, line by line.
     *
     * <p>Any exception is caught and its stack trace printed; nothing is rethrown.
     *
     * @param httpReq the request to execute
     */
    public static void doRequest(HttpRequestBase httpReq) {
        try {
            setHeaders(httpReq);

            httpReq.setConfig(reqConfig);

            CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();

            // Enable TCP keep-alive so the exit IP stays the same when visiting HTTPS sites:
            // SocketConfig socketConfig = SocketConfig.custom().setSoKeepAlive(true).setSoTimeout(3600000).build();
            // CloseableHttpClient httpClient =  HttpClients.custom()
            //    .setConnectionManager(cm)
            //    .setDefaultCredentialsProvider(credsProvider)
            //    .setDefaultSocketConfig(socketConfig)
            //    .build();

            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());
            // If a 407 is returned, register the scheme for proxy authentication instead:
            // authCache.put(proxy, new BasicScheme(ChallengeState.PROXY));

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            // try-with-resources closes the response even when reading the body fails.
            try (CloseableHttpResponse httpResp = httpClient.execute(httpReq, localContext)) {
                int statusCode = httpResp.getStatusLine().getStatusCode();

                System.out.println(statusCode);

                // Some responses carry no entity; there is nothing to print in that case.
                if (httpResp.getEntity() != null) {
                    try (BufferedReader rd = new BufferedReader(new InputStreamReader(
                            httpResp.getEntity().getContent(),
                            java.nio.charset.StandardCharsets.UTF_8))) {
                        String line;
                        while ((line = rd.readLine()) != null) {
                            System.out.println(line);
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Sets the request headers used by this example.
     *
     * @param httpReq the request whose headers are set
     */
    private static void setHeaders(HttpRequestBase httpReq) {

        // Optional: set a Proxy-Tunnel header
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        // NOTE(review): presumably intended to avoid requesting compressed responses - confirm.
        httpReq.setHeader("Accept-Encoding", null);

    }

    /**
     * Issues a GET request for the demo target URL through the proxy.
     */
    public static void doGetRequest() {
        // Target page to fetch
        String targetUrl = "https://httpbin.org/ip";

        try {
            HttpGet httpGet = new HttpGet(targetUrl);

            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: runs the GET example.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        doGetRequest();
    }
}

JSoup

=== "JSoup"

    import java.io.IOException;
    import java.net.Authenticator;
    import java.net.InetSocketAddress;
    import java.net.PasswordAuthentication;
    import java.net.Proxy;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;


    /**
     * JSoup example: fetches a page through an authenticated HTTP proxy that is
     * passed explicitly to the JSoup connection.
     *
     * <p>The proxy host, port, user name and password are placeholders and must
     * be replaced with real proxy details before running.
     */
    public class Demo
    {
        // Proxy credentials
        final static String ProxyUser = "username";
        final static String ProxyPass = "password";

        // Proxy server (product site: www.16yun.cn)
        final static String ProxyHost = "t.16yun.cn";
        final static Integer ProxyPort = 31111;

        // Name of the header carrying the tunnel id
        final static String ProxyHeadKey = "Proxy-Tunnel";

        /**
         * Fetches {@code url} through the proxy, prints the HTML of the document
         * body and returns it.
         *
         * @param url the page to fetch
         * @return the HTML of the document body, or {@code null} if the request failed
         */
        public static String getUrlProxyContent(String url)
        {
            // Supply the proxy credentials whenever the JDK asks for authentication.
            Authenticator.setDefault(new Authenticator() {
                @Override
                public PasswordAuthentication getPasswordAuthentication()
                {
                    return new PasswordAuthentication(ProxyUser, ProxyPass.toCharArray());
                }
            });

            // Random Proxy-Tunnel value. java.util.Random is written fully qualified
            // because this snippet's import list does not include it.
            java.util.Random random = new java.util.Random();
            int tunnel = random.nextInt(10000);
            String ProxyHeadVal = String.valueOf(tunnel);

            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ProxyHost, ProxyPort));

            try
            {
                Document doc = Jsoup.connect(url).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).proxy(proxy).get();

                if (doc != null) {
                    String html = doc.body().html();
                    System.out.println(html);
                    // Return the content instead of always returning null.
                    return html;
                }
            }
            catch (IOException e)
            {
                e.printStackTrace();
            }

            return null;
        }

        /**
         * Entry point: fetches the demo target URL through the proxy.
         *
         * @param args unused
         * @throws Exception declared for compatibility; failures are handled inside
         *                   {@link #getUrlProxyContent(String)}
         */
        public static void main(String[] args) throws Exception
        {
            // Target page to fetch
            String targetUrl = "http://httpbin.org/ip";

            getUrlProxyContent(targetUrl);
        }
    }

[!WARNING] JSoup无法使用Keep-alive

  • JSoup默认会关闭连接.
  • 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
  • 访问HTTPS网站请使用其他库,保持相同的外网IP.

=== "JSoup 全局代理"

    import java.io.IOException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    /**
     * JSoup example using JVM-wide proxy settings: the proxy is configured via
     * system properties, so it applies to every JDK HTTP(S) connection.
     *
     * <p>The proxy host, port, user name and password are placeholders and must
     * be replaced with real proxy details before running.
     */
    public class Demo {

        /**
         * Configures the global proxy, fetches the demo target URL and prints the
         * HTML of the document body.
         *
         * @param args unused
         */
        public static void main(String[] args) {

            // Target page to fetch (the original referenced an undefined variable "url").
            final String targetUrl = "http://httpbin.org/ip";

            // Proxy server (product site: www.16yun.cn).
            // Local variables cannot be declared "static", so plain finals are used.
            final String ProxyHost = "t.16yun.cn";
            final String ProxyPort = "31111";

            // Proxy credentials
            final String ProxyUser = "username";
            final String ProxyPass = "password";

            // Name of the header carrying the tunnel id
            final String ProxyHeadKey = "Proxy-Tunnel";

            try {
                System.setProperty("http.proxyHost", ProxyHost);
                System.setProperty("https.proxyHost", ProxyHost);

                System.setProperty("http.proxyPort", ProxyPort);
                System.setProperty("https.proxyPort", ProxyPort);

                System.setProperty("http.proxyUser", ProxyUser);
                System.setProperty("http.proxyPassword", ProxyPass);

                System.setProperty("https.proxyUser", ProxyUser);
                System.setProperty("https.proxyPassword", ProxyPass);

                // The JDK networking properties cover proxy host/port only; credentials
                // are requested through the default Authenticator, so register one.
                // Names are fully qualified because this snippet does not import them.
                java.net.Authenticator.setDefault(new java.net.Authenticator() {
                    @Override
                    protected java.net.PasswordAuthentication getPasswordAuthentication() {
                        return new java.net.PasswordAuthentication(ProxyUser, ProxyPass.toCharArray());
                    }
                });

                // Random Proxy-Tunnel value
                java.util.Random random = new java.util.Random();
                int tunnel = random.nextInt(10000);
                String ProxyHeadVal = String.valueOf(tunnel);

                Document doc = Jsoup.connect(targetUrl).timeout(3000).header(ProxyHeadKey, ProxyHeadVal).get();

                if (doc != null) {
                    System.out.println(doc.body().html());
                }

            } catch (IOException e) {
                e.printStackTrace();
            }

        }
    }

[!WARNING] JSoup无法使用Keep-alive

  • JSoup默认会关闭连接.
  • 访问HTTP网站请通过设置相同Proxy-Tunnel来保持相同的外网IP.
  • 访问HTTPS网站请使用其他库,保持相同的外网IP.

HttpURLConnection Connection

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.Authenticator;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.PasswordAuthentication;
import java.net.Proxy;
import java.net.URL;
import java.util.Random;

/**
 * Authenticator that answers every authentication request with a fixed
 * user name and password.
 */
class ProxyAuthenticator extends Authenticator {
    private final String user;
    private final String password;

    /**
     * @param user     the user name to present
     * @param password the password to present
     */
    public ProxyAuthenticator(String user, String password) {
        this.user     = user;
        this.password = password;
    }

    /**
     * @return the configured credentials
     */
    @Override
    protected PasswordAuthentication getPasswordAuthentication() {
        return new PasswordAuthentication(user, password.toCharArray());
    }
}

/**
 * HttpURLConnection example: fetches a page through an authenticated HTTP proxy.
 *
 * <p>Note: this code only performs a single HTTP request and keeps no state
 * between requests. The exit IP is switched for this request and may differ on
 * the next one. For multi-threaded use, embed this logic in your own business
 * code so each request goes out with a new IP; if duplicate IPs are a concern,
 * track IP usage yourself and validate it.
 */
public class Demo {
    /**
     * Sends a GET request to the demo target URL through the proxy and prints
     * the response body. Failures are reported by printing the exception message.
     *
     * @param args unused
     * @throws Exception declared for compatibility; request failures are caught internally
     */
    public static void main(String[] args) throws Exception {
        // Required on Java 8 Update 111 and later:
        // System.setProperty("jdk.http.auth.tunneling.disabledSchemes", "false");
        // System.setProperty("jdk.http.auth.proxying.disabledSchemes", "false");

        // Target page to fetch
        String targetUrl = "http://httpbin.org/ip";

        // Proxy server (product site: www.16yun.cn)
        String proxyServer = "t.16yun.cn";
        int proxyPort      = 31111;

        // Proxy credentials
        String proxyUser  = "username";
        String proxyPass  = "password";

        HttpURLConnection connection = null;
        try {
            URL url = new URL(targetUrl);

            Authenticator.setDefault(new ProxyAuthenticator(proxyUser, proxyPass));

            // Address of the proxy server
            InetSocketAddress addr = new InetSocketAddress(proxyServer, proxyPort);
            // HTTP-type proxy object
            Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);

            // Open the connection to the target page through the proxy
            connection = (HttpURLConnection) url.openConnection(proxy);

            // Optional: keep-alive
            // connection.setRequestProperty("Connection", "keep-alive");
            // connection.setRequestProperty("Keep-Alive", "timeout=5, max=100");

            // Optional: Proxy-Tunnel header
            // Random random = new Random();
            // int tunnel = random.nextInt(10000);
            // connection.setRequestProperty("Proxy-Tunnel",String.valueOf(tunnel));

            // Read the response body
            byte[] response = readStream(connection.getInputStream());

            // Decode explicitly as UTF-8 rather than with the platform default charset.
            System.out.println(new String(response, java.nio.charset.StandardCharsets.UTF_8));
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
        } finally {
            // Release the connection's resources whether or not the request succeeded.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads the whole input stream into a byte array and closes the stream.
     *
     * <p>The stream is closed even when reading fails part-way.
     *
     * @param inStream the stream to drain; closed before this method returns
     * @return all bytes read from the stream
     * @throws Exception if reading from or closing the stream fails
     */
    public static byte[] readStream(InputStream inStream) throws Exception {
        // try-with-resources guarantees both streams are closed on every path.
        try (InputStream in = inStream;
             ByteArrayOutputStream outSteam = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int len;

            while ((len = in.read(buffer)) != -1) {
                outSteam.write(buffer, 0, len);
            }

            return outSteam.toByteArray();
        }
    }
}

Htmlunit

package htmlunit;

import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlunitDemo {
    // 代理服务器(产品官网 www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31111;

    // 代理验证信息
    final static String proxyUser = "USERNAME";
    final static String proxyPass = "PASSWORD";

    public static void main(String[] args) {
        
        CredentialsProvider credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
        
        new AuthScope(proxyHost, proxyPort),
        new UsernamePasswordCredentials(proxyUser, proxyPass));
  
        WebClient webClient = new WebClient(BrowserVersion.CHROME,proxyHost, proxyPort);
  
        webClient.setCredentialsProvider(credsProvider);
     
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setCssEnabled(false);

        HtmlPage page = null;

        try {
            page = webClient.getPage("http://httpbin.org/ip");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            webClient.close();
        }

        webClient.waitForBackgroundJavaScript(30000);

        
        String pageXml = page.asXml();

        System.out.println(pageXml);
    }
}

Okhttp


    import okhttp3.*;

    import java.io.IOException;
    import java.net.InetSocketAddress;
    import java.net.Proxy;
    import java.util.concurrent.TimeUnit;

    public class OkHttp {

        // 代理服务器(产品官网 www.16yun.cn)
        final static String proxyHost = "t.16yun.cn";
        final static Integer proxyPort = 31111;

        // 代理验证信息
        final static String proxyUser = "USERNAME";
        final static String proxyPass = "PASSWORD";

        static OkHttpClient client = null;

        static {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(proxyHost, proxyPort));

            Authenticator proxyAuthenticator = new Authenticator() {
                public Request authenticate(Route route, Response response) {
                    String credential = Credentials.basic(proxyUser, proxyPass);
                    return response.request().newBuilder()
                            .header("Proxy-Authorization", credential)
                            .build();
                }
            };

            client = new OkHttpClient().newBuilder()
                    .connectTimeout(5, TimeUnit.SECONDS)
                    .readTimeout(5, TimeUnit.SECONDS)
                    .proxy(proxy)
                    .proxyAuthenticator(proxyAuthenticator)
                    .connectionPool(new ConnectionPool(5, 1, TimeUnit.SECONDS))
                    .build();
        }

        public static Response doGet() throws IOException {
            // 要访问的目标页面
            String targetUrl = "http://httpbin.org/ip";

            Request request = new Request.Builder()
                    .url(targetUrl)
                    .build();
            Response response = client.newCall(request).execute();
            return response;
        }

        public static void main(String[] args) throws IOException {
            Response response1 = doGet();
            System.out.println("GET请求返回结果:");
            System.out.println(response1.body().string());
        }

    }

Selenium(Java 示例)

HtmlUnitDriver(Java)

import org.json.JSONException;
import org.json.JSONObject;
import org.openqa.selenium.Platform;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;

import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.WebClient;

/**
 * Selenium HtmlUnitDriver example: loads a page through an authenticated
 * proxy and prints the page title.
 *
 * <p>The proxy address, user name and password are placeholders and must be
 * replaced with real proxy details before running.
 */
public class HtmlUnitDriverProxyDemo
{
    // Proxy credentials
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    // Proxy server, as host:port
    final static String proxyServer = "t.16yun.cn:31111";

    /**
     * Loads the demo target URL through the proxy and prints the page title.
     *
     * @param args unused
     * @throws JSONException declared by the original example; kept for compatibility
     */
    public static void main(String[] args) throws JSONException
    {
        HtmlUnitDriver driver = getHtmlUnitDriver();

        try {
            driver.get("https://httpbin.org/ip");

            String title = driver.getTitle();
            System.out.println(title);
        } finally {
            // Release the driver's resources once the demo is done.
            driver.quit();
        }
    }

    /**
     * Builds an {@code HtmlUnitDriver} that sends HTTP and HTTPS traffic through
     * the proxy and supplies the proxy credentials.
     *
     * @return the configured driver
     */
    public static HtmlUnitDriver getHtmlUnitDriver()
    {
        HtmlUnitDriver driver = null;

        Proxy proxy = new Proxy();

        proxy.setHttpProxy(proxyServer);
        // The demo target is an https:// URL, so the proxy must also cover SSL traffic.
        proxy.setSslProxy(proxyServer);

        DesiredCapabilities capabilities = DesiredCapabilities.htmlUnit();
        capabilities.setCapability(CapabilityType.PROXY, proxy);
        capabilities.setJavascriptEnabled(true);
        capabilities.setPlatform(Platform.WIN8_1);

        driver = new HtmlUnitDriver(capabilities) {
            @Override
            protected WebClient modifyWebClient(WebClient client) {
                // Attach the proxy credentials to the underlying HtmlUnit client.
                DefaultCredentialsProvider creds = new DefaultCredentialsProvider();
                creds.addCredentials(proxyUser, proxyPass);
                client.setCredentialsProvider(creds);
                return client;
            }
        };

        driver.setJavascriptEnabled(true);

        return driver;
    }
}

Firefox(Java)

import org.json.JSONException;
import org.json.JSONObject;
import org.openqa.selenium.Platform;
import org.openqa.selenium.Proxy;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;

import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
import com.gargoylesoftware.htmlunit.WebClient;

/**
 * Selenium FirefoxDriver example: starts Firefox with a profile whose manual
 * proxy settings point at the proxy server.
 *
 * <p>The proxy host, port, user name, password and Firefox binary path are
 * placeholders and must be replaced before running.
 */
public class FirefoxDriverProxyDemo
{
    // Proxy tunnel credentials
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    // Proxy server
    final static String proxyHost = "t.16yun.cn";
    final static int proxyPort = 31111;

    final static String firefoxBin = "C:/Program Files/Mozilla Firefox/firefox.exe";

    /**
     * Points Selenium at the Firefox binary and launches the browser with the
     * proxy-enabled profile.
     *
     * @param args unused
     * @throws JSONException declared by the original example; kept for compatibility
     */
    public static void main(String[] args) throws JSONException
    {
        System.setProperty("webdriver.firefox.bin", firefoxBin);

        FirefoxDriver driver = new FirefoxDriver(buildProxyProfile());
    }

    /**
     * Creates the Firefox profile carrying the proxy preferences.
     *
     * @return a new profile with the proxy preferences applied
     */
    private static FirefoxProfile buildProxyProfile()
    {
        FirefoxProfile proxyProfile = new FirefoxProfile();

        // 1 = manual proxy configuration
        proxyProfile.setPreference("network.proxy.type", 1);

        // Proxy for plain HTTP traffic
        proxyProfile.setPreference("network.proxy.http", proxyHost);
        proxyProfile.setPreference("network.proxy.http_port", proxyPort);

        // Proxy for SSL traffic
        proxyProfile.setPreference("network.proxy.ssl", proxyHost);
        proxyProfile.setPreference("network.proxy.ssl_port", proxyPort);

        // NOTE(review): "username" / "password" do not look like standard Firefox
        // preference names - verify that they actually supply the proxy credentials.
        proxyProfile.setPreference("username", proxyUser);
        proxyProfile.setPreference("password", proxyPass);

        proxyProfile.setPreference("network.proxy.share_proxy_settings", true);

        // Hosts that bypass the proxy
        proxyProfile.setPreference("network.proxy.no_proxies_on", "localhost");

        return proxyProfile;
    }
}