構造httpclient的時候可以setRetryHandler(HttpRequestRetryHandler)。**HttpRequestRetryHandler** 是Http請求出錯後的重試處理接口類,對於某些要求比較嚴格的業務情況,這個參數還是比較重要的。
**HttpRequestRetryHandler** 的已知實現類有 DefaultHttpRequestRetryHandler 和繼承了 DefaultHttpRequestRetryHandler 的 StandardHttpRequestRetryHandler。
DefaultHttpRequestRetryHandler
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
*/
package org.apache.http.impl.client;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.ConnectException;
import java.net.UnknownHostException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import javax.net.ssl.SSLException;
import org.apache.http.HttpEntityEnclosingRequest;
import org.apache.http.HttpRequest;
import org.apache.http.annotation.Immutable;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.Args;
/**
 * Default {@link HttpRequestRetryHandler} implementation used by request executors.
 * <p>
 * A failed request is retried only while the execution count has not exceeded the
 * configured limit, the failure is not one of the configured non-retriable
 * {@link IOException} types, and the request has not been aborted.
 *
 * @since 4.0
 */
@Immutable
public class DefaultHttpRequestRetryHandler implements HttpRequestRetryHandler {

    /** Shared handler built with the no-argument constructor. */
    public static final DefaultHttpRequestRetryHandler INSTANCE = new DefaultHttpRequestRetryHandler();

    /** Maximum number of retries allowed for one request. */
    private final int retryCount;

    /** Whether a request that has already been sent may be retried. */
    private final boolean requestSentRetryEnabled;

    /** Exception types (subclasses included) that are never retried. */
    private final Set<Class<? extends IOException>> nonRetriableClasses;

    /**
     * Creates a retry handler with an explicit set of non-retriable exception types.
     *
     * @param retryCount how many times to retry; 0 means no retries
     * @param requestSentRetryEnabled true if it's OK to retry requests that have been sent
     * @param clazzes the IOException types that should not be retried
     * @since 4.3
     */
    protected DefaultHttpRequestRetryHandler(
            final int retryCount,
            final boolean requestSentRetryEnabled,
            final Collection<Class<? extends IOException>> clazzes) {
        super();
        this.retryCount = retryCount;
        this.requestSentRetryEnabled = requestSentRetryEnabled;
        // Private copy, so later changes to the caller's collection are not seen here.
        this.nonRetriableClasses = new HashSet<Class<? extends IOException>>(clazzes);
    }

    /**
     * Creates a retry handler that never retries the following exception types:
     * <ul>
     * <li>InterruptedIOException</li>
     * <li>UnknownHostException</li>
     * <li>ConnectException</li>
     * <li>SSLException</li>
     * </ul>
     *
     * @param retryCount how many times to retry; 0 means no retries
     * @param requestSentRetryEnabled true if it's OK to retry non-idempotent requests that have been sent
     */
    @SuppressWarnings("unchecked")
    public DefaultHttpRequestRetryHandler(final int retryCount, final boolean requestSentRetryEnabled) {
        this(retryCount, requestSentRetryEnabled, Arrays.asList(
                InterruptedIOException.class,
                UnknownHostException.class,
                ConnectException.class,
                SSLException.class));
    }

    /**
     * Creates a retry handler with a retry count of 3 and requestSentRetryEnabled
     * set to false, using the same non-retriable exception types as
     * {@link #DefaultHttpRequestRetryHandler(int, boolean)}.
     */
    public DefaultHttpRequestRetryHandler() {
        this(3, false);
    }

    /**
     * Decides whether the failed request should be executed again, based on
     * {@code retryCount}, the non-retriable exception types and
     * {@code requestSentRetryEnabled}.
     *
     * @param exception the failure that interrupted the request; must not be null
     * @param executionCount how many times the request has been executed so far
     * @param context the execution context; must not be null
     * @return {@code true} if the request should be retried
     */
    @Override
    public boolean retryRequest(
            final IOException exception,
            final int executionCount,
            final HttpContext context) {
        Args.notNull(exception, "Exception parameter");
        Args.notNull(context, "HTTP context");

        // Retry budget exhausted.
        if (executionCount > this.retryCount) {
            return false;
        }
        // The failure type is one that must never be retried.
        if (isNonRetriable(exception)) {
            return false;
        }

        final HttpClientContext clientContext = HttpClientContext.adapt(context);
        final HttpRequest request = clientContext.getRequest();
        if (requestIsAborted(request)) {
            return false;
        }

        // Retry when the request is treated as idempotent, when it has not been
        // sent fully, or when retrying sent requests is explicitly allowed.
        return handleAsIdempotent(request)
                || !clientContext.isRequestSent()
                || this.requestSentRetryEnabled;
    }

    /**
     * Tells whether the exception is an instance of any configured non-retriable type.
     */
    private boolean isNonRetriable(final IOException exception) {
        for (final Class<? extends IOException> rejected : this.nonRetriableClasses) {
            if (rejected.isInstance(exception)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return {@code true} if this handler will retry methods that have
     * successfully sent their request, {@code false} otherwise
     */
    public boolean isRequestSentRetryEnabled() {
        return this.requestSentRetryEnabled;
    }

    /**
     * @return the maximum number of times a method will be retried
     */
    public int getRetryCount() {
        return this.retryCount;
    }

    /**
     * Treats every request that does not enclose an entity as idempotent.
     *
     * @since 4.2
     */
    protected boolean handleAsIdempotent(final HttpRequest request) {
        final boolean enclosesEntity = request instanceof HttpEntityEnclosingRequest;
        return !enclosesEntity;
    }

    /**
     * Tells whether the request (or the original behind a {@code RequestWrapper})
     * is an aborted {@link HttpUriRequest}.
     *
     * @since 4.2
     *
     * @deprecated (4.3)
     */
    @Deprecated
    protected boolean requestIsAborted(final HttpRequest request) {
        // A RequestWrapper does not forward to the original request, so unwrap it first.
        final HttpRequest actual = (request instanceof RequestWrapper)
                ? ((RequestWrapper) request).getOriginal()
                : request;
        return actual instanceof HttpUriRequest && ((HttpUriRequest) actual).isAborted();
    }
}
默認構造函數是
// No-argument constructor: delegates with retryCount = 3 and requestSentRetryEnabled = false.
public DefaultHttpRequestRetryHandler() {
this(3, false);
}
參數 requestSentRetryEnabled 表示請求即使已經發送成功是否仍然重試,這裡設置了 false。我覺得一般情況下都不要設為 true。
主要實現的方法是
// Decides whether a request that failed with the given IOException should be executed again.
boolean retryRequest(IOException exception, int executionCount, HttpContext context);
類StandardHttpRequestRetryHandler並沒有重寫該方法
/**
 * Retry handler that decides idempotence from the HTTP method name: GET, HEAD,
 * PUT, DELETE, OPTIONS and TRACE are treated as idempotent. Everything else is
 * inherited from {@link DefaultHttpRequestRetryHandler}.
 */
@Immutable
public class StandardHttpRequestRetryHandler extends DefaultHttpRequestRetryHandler {

    /** Upper-case method names that are treated as idempotent, each mapped to TRUE. */
    private final Map<String, Boolean> idempotentMethods;

    /**
     * @param retryCount how many times to retry
     * @param requestSentRetryEnabled whether requests that have been sent may be retried
     */
    public StandardHttpRequestRetryHandler(final int retryCount, final boolean requestSentRetryEnabled) {
        super(retryCount, requestSentRetryEnabled);
        this.idempotentMethods = new ConcurrentHashMap<String, Boolean>();
        for (final String name : new String[] {"GET", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"}) {
            this.idempotentMethods.put(name, Boolean.TRUE);
        }
    }

    /** Creates a handler with a retry count of 3 and requestSentRetryEnabled set to false. */
    public StandardHttpRequestRetryHandler() {
        this(3, false);
    }

    /**
     * Looks the request's method up, case-insensitively, in the idempotent-method table.
     *
     * @return {@code true} only when the method is registered as idempotent
     */
    @Override
    protected boolean handleAsIdempotent(final HttpRequest request) {
        final String verb = request.getRequestLine().getMethod();
        final Boolean registered = this.idempotentMethods.get(verb.toUpperCase(Locale.ROOT));
        return Boolean.TRUE.equals(registered);
    }
}
只是重寫了
// Hook consulted by retryRequest: returning true means the request is treated as idempotent and retried.
protected boolean handleAsIdempotent(final HttpRequest request)
我們參考後完全可以實現自己的HttpRequestRetryHandler
初始化httpClient
在httpClient4.5中,初始化的方式已經和以前的版本有差異
// Shared client obtained from HttpClients.createDefault(); no custom options are applied.
static CloseableHttpClient client = HttpClients.createDefault();
和
// Shared client created through the custom() builder; options can be chained before build().
static CloseableHttpClient httpClient=HttpClients.custom().build();
在該方式中可以添加一些網絡請求的設置
可以直接使用匿名類
HttpRequestRetryHandler handler = new HttpRequestRetryHandler() {
    /**
     * Decides whether a request that failed with an I/O error is retried.
     * <ul>
     * <li>never after more than 5 executions;</li>
     * <li>never for an {@code SSLException};</li>
     * <li>always for an unknown host, a connect timeout or a missing response;</li>
     * <li>otherwise only when the request carries no entity, i.e. is treated as idempotent.</li>
     * </ul>
     *
     * @param exception the failure that interrupted the request
     * @param retryTimes how many times the request has been executed so far
     * @param context the execution context of the failed request
     * @return {@code true} if the request should be executed again
     */
    @Override
    public boolean retryRequest(IOException exception, int retryTimes, HttpContext context) {
        if (retryTimes > 5) {
            return false;
        }
        // The original condition contained "!(e instanceof SSLException)", which is
        // true for every non-SSL failure. That made the three named checks dead and
        // retried non-idempotent requests on any error. SSL failures are now
        // rejected explicitly instead.
        if (exception instanceof SSLException) {
            return false;
        }
        if (exception instanceof UnknownHostException
                || exception instanceof ConnectTimeoutException
                || exception instanceof NoHttpResponseException) {
            return true;
        }
        HttpClientContext clientContext = HttpClientContext.adapt(context);
        HttpRequest request = clientContext.getRequest();
        // Retry when the request is considered idempotent, i.e. repeating it
        // has no additional effect.
        return !(request instanceof HttpEntityEnclosingRequest);
    }
};
還可以設置路由策略,即設置以代理方式訪問
// Proxy endpoint that requests are routed through.
HttpHost proxy = new HttpHost("127.0.0.1", 80);// set the proxy address
// Route planner built from the proxy host above.
DefaultProxyRoutePlanner routePlanner = new DefaultProxyRoutePlanner(proxy);
// Build a client with the route planner, the retry handler, a one-day connection time-to-live and the cookie store.
CloseableHttpClient httpClient = HttpClients.custom().setRoutePlanner(routePlanner).setRetryHandler(handler)
.setConnectionTimeToLive(1, TimeUnit.DAYS).setDefaultCookieStore(cookieStore).build();
附錄:
Httpclient4.5後對於get請求方式的改變
// Shared request configuration: connect and socket timeouts of 6000
// (presumably milliseconds; confirm against the RequestConfig docs) and the STANDARD cookie spec.
static RequestConfig config = RequestConfig.custom().setConnectTimeout(6000).setSocketTimeout(6000)
.setCookieSpec(CookieSpecs.STANDARD).build(); // set the timeouts and the cookie policy
/**
 * Fetches {@code url} with an HTTP GET, prints the response headers and body,
 * and returns the body as text.
 * <p>
 * The body is decoded with the charset declared by the response; UTF-8 is the
 * fallback when none is declared. Change the fallback for sites that serve
 * another encoding (for example gb2312) without declaring it.
 *
 * @param url the address to request
 * @return the response body, or {@code null} if the request failed or the
 *         response carried no entity
 */
public static String getDemo(String url) {
    HttpGet get = new HttpGet(url);
    get.setConfig(config);
    HttpResponse response = null;
    String html = null;
    try {
        response = client.execute(get);
        // Dump the response headers.
        for (Header header : response.getAllHeaders()) {
            System.out.println(header);
        }
        if (response.getEntity() != null) {
            // Decode the bytes once, with the right charset. Round-tripping an
            // already decoded String through getBytes("gb2312") and
            // new String(..., "utf8") corrupts the text.
            html = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(html);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (response != null) {
            // Make sure the entity is fully consumed and its stream closed on every path.
            EntityUtils.consumeQuietly(response.getEntity());
        }
    }
    return html;
}
大致流程:新建httpget對象->用httpClient執行->解析返回的response得到自己需要的內容
cookieSpec:即cookie策略。參數為CookieSpecs的一些字段。作用:1、如果網站header中有set-cookie字段,採用默認方式可能會被cookie reject,無法寫入cookie。將此屬性設置成CookieSpecs.STANDARD_STRICT可避免此情況。2、如果想要忽略cookie訪問,則將此屬性設置成CookieSpecs.IGNORE_COOKIES。
tips:注意網站編碼,否則容易出現亂碼
執行post請求:
/**
 * Sends a URL-encoded form to {@code url} with an HTTP POST and prints the
 * response body.
 * <p>
 * The five form fields are placeholders; replace them with the real field
 * names and values.
 *
 * @param url the address to post to
 */
public static void postDemo(String url) {
    HttpPost post = new HttpPost(url);
    post.setConfig(config);
    // Present a desktop-browser User-Agent string and ask for a persistent connection.
    post.setHeader("User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36");
    post.setHeader("Connection", "keep-alive");
    List<NameValuePair> list = new ArrayList<NameValuePair>();
    list.add(new BasicNameValuePair("key", "value"));
    list.add(new BasicNameValuePair("key", "value"));
    list.add(new BasicNameValuePair("key", "value"));
    list.add(new BasicNameValuePair("key", "value"));
    list.add(new BasicNameValuePair("key", "value"));
    HttpResponse response = null;
    try {
        HttpEntity entity = new UrlEncodedFormEntity(list, "utf-8");
        post.setEntity(entity);
        response = client.execute(post);
        // A response may carry no entity (e.g. 204 No Content). Passing null to
        // EntityUtils.toString would throw an unchecked exception.
        if (response.getEntity() != null) {
            String responseHtml = EntityUtils.toString(response.getEntity());
            System.out.println(responseHtml);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (response != null) {
            // Make sure the entity is fully consumed and its stream closed on every path.
            EntityUtils.consumeQuietly(response.getEntity());
        }
    }
}
大致流程:新建post對象->新建需要的表單頁->將表單內容設置入請求中->執行並獲得response
解析response
// Read the response body as text.
String responseHtml = EntityUtils.toString(response.getEntity());
int statusCode = response.getStatusLine().getStatusCode();// HTTP status code
// Read response headers.
response.getFirstHeader("key");// first header named "key"
response.getHeaders("key");// all headers named "key", returned as an array
response.getLastHeader("key");
// Get the body as an InputStream. Some resources (captcha images, for example) may
// require cookies to download, in which case they have to be fetched through httpClient.
InputStream inputStream = response.getEntity().getContent();
管理cookie
// Cookie store handed to the client as its default store, so cookies can be read and added through it.
CookieStore cookieStore = new BasicCookieStore();
CloseableHttpClient httpClient= HttpClients.custom()
.setDefaultCookieStore(cookieStore).build();
httpClient裡默認自動管理cookie,如果想要提取cookie或者發送自定義的cookie,則需要在httpClient對象初始化時設置一個默認的cookiestore來保存。(方法見初始化httpClient對象裡的setDefaultCookieStore)。
得到當前所有cookie:
// Print every cookie currently held by the cookie store, framed by separator lines.
List<Cookie> list = cookieStore.getCookies();// get all cookies
System.out.println("cookie is:");
System.out.println("-----------------------");
for (Cookie cookie : list) {
System.out.println(cookie);
}
System.out.println("-----------------------");
清除所有cookie:
// Remove every cookie held by the store.
cookieStore.clear();
發送自定義cookie:(new了一個對象之後可以設置多種屬性。)
// Create a cookie with the given name and value.
BasicClientCookie cookie = new BasicClientCookie("name", "value");
// Set the cookie's properties.
cookie.setDomain("domain");
// The expiry date must lie in the future (one day from now here). A cookie whose
// expiry is "now" is already expired, and the cookie store does not keep expired cookies.
cookie.setExpiryDate(new Date(System.currentTimeMillis() + 24L * 60 * 60 * 1000));
// Register the cookie so the client sends it on matching requests.
cookieStore.addCookie(cookie);
管理header:
在平常抓取過程中,經常需要在請求中加入許多header,偽裝成一個正常的瀏覽器,以免被服務器認出是爬蟲而被封。
設置一些常見header:
// Present a desktop-browser User-Agent string on the request.
post.setHeader("User-Agent",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36");
// Ask for the connection to be kept alive.
post.setHeader("Connection", "keep-alive");
注意:下載某些網站的資源時,服務器會獲取你的來源站,並發出對應的響應。如果來源站不對,可能會被服務器拒絕。此時只需要在請求中加個header就行。
// Declare the referring site via the Referer header.
get1.setHeader("Referer", "http://www.a.com");
ps:
1、爬蟲也要遵守基本法,在多次請求之間為了不給對方服務器造成負擔(避免被封),盡量在請求間sleep一個隨機數值。
2、爬取非英文網站時注意編碼格式,國內一般為utf-8,也有一些是gb2312。獲取時注意轉碼。
3、多獲得一些可靠IP(備胎),一旦自身ip被封,趕快去找備胎。附帶一個簡單的判斷網站是否需要代理的方法:
/**
 * Probes whether the target site can be reached directly, i.e. without a proxy.
 * <p>
 * The decision is based on the HTTP status code, not the content length: the
 * content length is -1 whenever the server does not announce it (chunked
 * responses, for example), which would wrongly report a reachable site as
 * needing a proxy.
 *
 * @return {@code false} when the direct request is answered with a 2xx or 3xx
 *         status; {@code true} when it fails or gets any other status
 */
private boolean isNeedProxy() {
    boolean result = true;
    HttpURLConnection connection = null;
    try {
        URL url = new URL("http://apkpure.com/");
        connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(6000);
        // Bound the wait for the response too, so an unresponsive host cannot hang the probe.
        connection.setReadTimeout(6000);
        int status = connection.getResponseCode();
        if (status >= 200 && status < 400) {
            result = false;
        }
    } catch (IOException e) {
        // Any I/O failure is treated as "not reachable directly".
        e.printStackTrace();
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
    return result;
}