elements = document.getElementsByAttributeValue("class", "phdnews_txt fr").first()
62 | .getElementsByAttributeValue("class", "phdnews_hdline");
63 | elements.forEach(element -> {
64 | for (Element e : element.getElementsByTag("a")) {
65 | System.out.println(e.attr("href"));
66 | System.out.println(e.text());
67 | }
68 | });
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/test/java/xuyihao/util/HttpUtilsTest.java:
--------------------------------------------------------------------------------
1 | package xuyihao.util;
2 |
3 | import org.jsoup.nodes.Document;
4 | import org.junit.Test;
5 |
6 | import java.util.HashMap;
7 |
8 | public class HttpUtilsTest {
9 | private static final String TEST_URL = "http://www.google.com/";
10 |
11 | @Test
12 | public void testExecuteGetAsDocument() {
13 | HttpUtils httpUtils = HttpUtils.getInstance();
14 | httpUtils.setCharset("UTF-8");
15 | httpUtils.setTimeout(30000);
16 | try {
17 | Document document = httpUtils.executeGetAsDocument(TEST_URL);
18 | //TODO
19 | System.out.println(document);
20 | } catch (Exception e) {
21 | e.printStackTrace();
22 | }
23 | }
24 |
25 | @Test
26 | public void testExecuteGet() {
27 | HttpUtils httpUtils = HttpUtils.getInstance();
28 | httpUtils.setCharset("UTF-8");
29 | httpUtils.setTimeout(30000);
30 | try {
31 | String responseStr = httpUtils.executeGet(TEST_URL);
32 | //TODO
33 | System.out.println(responseStr);
34 | } catch (Exception e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | @Test
40 | public void testExecuteGetWithSSLAsDocument() {
41 | HttpUtils httpUtils = HttpUtils.getInstance();
42 | httpUtils.setCharset("UTF-8");
43 | httpUtils.setTimeout(30000);
44 | try {
45 | Document document = httpUtils.executeGetWithSSLAsDocument(TEST_URL);
46 | //TODO
47 | System.out.println(document);
48 | } catch (Exception e) {
49 | e.printStackTrace();
50 | }
51 | }
52 |
53 | @Test
54 | public void testExecuteGetWithSSL() {
55 | HttpUtils httpUtils = HttpUtils.getInstance();
56 | httpUtils.setCharset("UTF-8");
57 | httpUtils.setTimeout(30000);
58 | try {
59 | String responseStr = httpUtils.executeGetWithSSL(TEST_URL);
60 | //TODO
61 | System.out.println(responseStr);
62 | } catch (Exception e) {
63 | e.printStackTrace();
64 | }
65 | }
66 |
67 | @Test
68 | public void testExecutePostAsDocument() {
69 | HttpUtils httpUtils = HttpUtils.getInstance();
70 | httpUtils.setCharset("UTF-8");
71 | httpUtils.setTimeout(30000);
72 | try {
73 | Document document = httpUtils.executePostAsDocument(TEST_URL, new HashMap<>());
74 | //TODO
75 | System.out.println(document);
76 | } catch (Exception e) {
77 | e.printStackTrace();
78 | }
79 | }
80 |
81 | @Test
82 | public void testExecutePost() {
83 | HttpUtils httpUtils = HttpUtils.getInstance();
84 | httpUtils.setCharset("UTF-8");
85 | httpUtils.setTimeout(30000);
86 | try {
87 | String responseStr = httpUtils.executePost(TEST_URL, new HashMap<>());
88 | //TODO
89 | System.out.println(responseStr);
90 | } catch (Exception e) {
91 | e.printStackTrace();
92 | }
93 | }
94 |
95 | @Test
96 | public void testExecutePostWithSSLAsDocument() {
97 | HttpUtils httpUtils = HttpUtils.getInstance();
98 | httpUtils.setCharset("UTF-8");
99 | httpUtils.setTimeout(30000);
100 | try {
101 | Document document = httpUtils.executePostWithSSLAsDocument(TEST_URL, new HashMap<>());
102 | //TODO
103 | System.out.println(document);
104 | } catch (Exception e) {
105 | e.printStackTrace();
106 | }
107 | }
108 |
109 | @Test
110 | public void testExecutePostWithSSL() {
111 | HttpUtils httpUtils = HttpUtils.getInstance();
112 | httpUtils.setCharset("UTF-8");
113 | httpUtils.setTimeout(30000);
114 | try {
115 | String responseStr = httpUtils.executePostWithSSL(TEST_URL, new HashMap<>());
116 | //TODO
117 | System.out.println(responseStr);
118 | } catch (Exception e) {
119 | e.printStackTrace();
120 | }
121 | }
122 |
123 | @Test
124 | public void testExecutePostWithJson() {
125 | HttpUtils httpUtils = HttpUtils.getInstance();
126 | httpUtils.setCharset("UTF-8");
127 | httpUtils.setTimeout(30000);
128 | try {
129 | String responseStr = httpUtils.executePostWithJson(TEST_URL, "{\"name\":\"testName001\"}");
130 | //TODO
131 | System.out.println(responseStr);
132 | } catch (Exception e) {
133 | e.printStackTrace();
134 | }
135 | }
136 |
137 | @Test
138 | public void testExecutePostWithJsonAndSSL() {
139 | HttpUtils httpUtils = HttpUtils.getInstance();
140 | httpUtils.setCharset("UTF-8");
141 | httpUtils.setTimeout(30000);
142 | try {
143 | String responseStr = httpUtils.executePostWithJsonAndSSL(TEST_URL, "{\"name\":\"testName001\"}");
144 | //TODO
145 | System.out.println(responseStr);
146 | } catch (Exception e) {
147 | e.printStackTrace();
148 | }
149 | }
150 |
151 | @Test
152 | public void testGetHtmlPageResponse() {
153 | HttpUtils httpUtils = HttpUtils.getInstance();
154 | httpUtils.setCharset("UTF-8");
155 | httpUtils.setTimeout(30000);
156 | httpUtils.setWaitForBackgroundJavaScript(30000);
157 | try {
158 | String htmlPageStr = httpUtils.getHtmlPageResponse(TEST_URL);
159 | //TODO
160 | System.out.println(htmlPageStr);
161 | } catch (Exception e) {
162 | e.printStackTrace();
163 | }
164 | }
165 |
166 | @Test
167 | public void testGetHtmlPageResponseAsDocument() {
168 | HttpUtils httpUtils = HttpUtils.getInstance();
169 | httpUtils.setCharset("UTF-8");
170 | httpUtils.setTimeout(30000);
171 | httpUtils.setWaitForBackgroundJavaScript(30000);
172 | try {
173 | Document document = httpUtils.getHtmlPageResponseAsDocument(TEST_URL);
174 | //TODO
175 | System.out.println(document);
176 | } catch (Exception e) {
177 | e.printStackTrace();
178 | }
179 | }
180 |
181 | }
182 |
--------------------------------------------------------------------------------
/src/main/java/xuyihao/util/HttpUtils.java:
--------------------------------------------------------------------------------
1 | package xuyihao.util;
2 |
3 | import org.apache.http.HttpEntity;
4 | import org.apache.http.NameValuePair;
5 | import org.apache.http.client.CookieStore;
6 | import org.apache.http.client.config.RequestConfig;
7 | import org.apache.http.client.entity.UrlEncodedFormEntity;
8 | import org.apache.http.client.methods.CloseableHttpResponse;
9 | import org.apache.http.client.methods.HttpGet;
10 | import org.apache.http.client.methods.HttpPost;
11 | import org.apache.http.client.protocol.HttpClientContext;
12 | import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
13 | import org.apache.http.cookie.Cookie;
14 | import org.apache.http.entity.ContentType;
15 | import org.apache.http.entity.StringEntity;
16 | import org.apache.http.impl.client.CloseableHttpClient;
17 | import org.apache.http.impl.client.HttpClientBuilder;
18 | import org.apache.http.impl.client.HttpClients;
19 | import org.apache.http.message.BasicNameValuePair;
20 | import org.apache.http.ssl.SSLContextBuilder;
21 | import org.apache.http.util.EntityUtils;
22 | import org.jsoup.Jsoup;
23 | import org.jsoup.nodes.Document;
24 |
25 | import com.gargoylesoftware.htmlunit.BrowserVersion;
26 | import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
27 | import com.gargoylesoftware.htmlunit.WebClient;
28 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
29 |
30 | import javax.net.ssl.*;
31 | import java.io.IOException;
32 | import java.security.GeneralSecurityException;
33 | import java.util.ArrayList;
34 | import java.util.HashMap;
35 | import java.util.List;
36 | import java.util.Map;
37 |
38 | /**
39 | *
40 | * Http工具,包含:
41 | * 1.普通http请求工具(使用httpClient进行http,https请求的发送)
42 | * 2.高级http工具(使用net.sourceforge.htmlunit获取完整的html页面,即完成后台js代码的运行)
43 | *
44 | * Created by xuyh at 2017/7/17 19:08.
45 | */
46 | public class HttpUtils {
47 | /**
48 | * 请求超时时间,默认20000ms
49 | */
50 | private int timeout = 20000;
51 | /**
52 | * 等待异步JS执行时间,默认20000ms
53 | */
54 | private int waitForBackgroundJavaScript = 20000;
55 | /**
56 | * cookie表
57 | */
58 | private Map cookieMap = new HashMap<>();
59 |
60 | /**
61 | * 请求编码(处理返回结果),默认UTF-8
62 | */
63 | private String charset = "UTF-8";
64 |
65 | private static HttpUtils httpUtils;
66 |
67 | private HttpUtils() {
68 | }
69 |
70 | /**
71 | * 获取实例
72 | *
73 | * @return
74 | */
75 | public static HttpUtils getInstance() {
76 | if (httpUtils == null)
77 | httpUtils = new HttpUtils();
78 | return httpUtils;
79 | }
80 |
81 | /**
82 | * 清空cookieMap
83 | */
84 | public void invalidCookieMap() {
85 | cookieMap.clear();
86 | }
87 |
88 | public int getTimeout() {
89 | return timeout;
90 | }
91 |
92 | /**
93 | * 设置请求超时时间
94 | *
95 | * @param timeout
96 | */
97 | public void setTimeout(int timeout) {
98 | this.timeout = timeout;
99 | }
100 |
101 | public String getCharset() {
102 | return charset;
103 | }
104 |
105 | /**
106 | * 设置请求字符编码集
107 | *
108 | * @param charset
109 | */
110 | public void setCharset(String charset) {
111 | this.charset = charset;
112 | }
113 |
114 | public int getWaitForBackgroundJavaScript() {
115 | return waitForBackgroundJavaScript;
116 | }
117 |
118 | /**
119 | * 设置获取完整HTML页面时等待异步JS执行的时间
120 | *
121 | * @param waitForBackgroundJavaScript
122 | */
123 | public void setWaitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
124 | this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
125 | }
126 |
127 | /**
128 | * 将网页返回为解析后的文档格式
129 | *
130 | * @param html
131 | * @return
132 | * @throws Exception
133 | */
134 | public static Document parseHtmlToDoc(String html) throws Exception {
135 | return removeHtmlSpace(html);
136 | }
137 |
138 | private static Document removeHtmlSpace(String str) {
139 | Document doc = Jsoup.parse(str);
140 | String result = doc.html().replace(" ", "");
141 | return Jsoup.parse(result);
142 | }
143 |
144 | /**
145 | * 执行get请求,返回doc
146 | *
147 | * @param url
148 | * @return
149 | * @throws Exception
150 | */
151 | public Document executeGetAsDocument(String url) throws Exception {
152 | return parseHtmlToDoc(executeGet(url));
153 | }
154 |
155 | /**
156 | * 执行get请求
157 | *
158 | * @param url
159 | * @return
160 | * @throws Exception
161 | */
162 | public String executeGet(String url) throws Exception {
163 | HttpGet httpGet = new HttpGet(url);
164 | httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
165 | httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
166 | CloseableHttpClient httpClient = null;
167 | String str = "";
168 | try {
169 | httpClient = HttpClientBuilder.create().build();
170 | HttpClientContext context = HttpClientContext.create();
171 | CloseableHttpResponse response = httpClient.execute(httpGet, context);
172 | getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
173 | int state = response.getStatusLine().getStatusCode();
174 | if (state == 404) {
175 | str = "";
176 | }
177 | try {
178 | HttpEntity entity = response.getEntity();
179 | if (entity != null) {
180 | str = EntityUtils.toString(entity, charset);
181 | }
182 | } finally {
183 | response.close();
184 | }
185 | } catch (IOException e) {
186 | throw e;
187 | } finally {
188 | try {
189 | if (httpClient != null)
190 | httpClient.close();
191 | } catch (IOException e) {
192 | throw e;
193 | }
194 | }
195 | return str;
196 | }
197 |
198 | /**
199 | * 用https执行get请求,返回doc
200 | *
201 | * @param url
202 | * @return
203 | * @throws Exception
204 | */
205 | public Document executeGetWithSSLAsDocument(String url) throws Exception {
206 | return parseHtmlToDoc(executeGetWithSSL(url));
207 | }
208 |
209 | /**
210 | * 用https执行get请求
211 | *
212 | * @param url
213 | * @return
214 | * @throws Exception
215 | */
216 | public String executeGetWithSSL(String url) throws Exception {
217 | HttpGet httpGet = new HttpGet(url);
218 | httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
219 | httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
220 | CloseableHttpClient httpClient = null;
221 | String str = "";
222 | try {
223 | httpClient = createSSLInsecureClient();
224 | HttpClientContext context = HttpClientContext.create();
225 | CloseableHttpResponse response = httpClient.execute(httpGet, context);
226 | getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
227 | int state = response.getStatusLine().getStatusCode();
228 | if (state == 404) {
229 | str = "";
230 | }
231 | try {
232 | HttpEntity entity = response.getEntity();
233 | if (entity != null) {
234 | str = EntityUtils.toString(entity, charset);
235 | }
236 | } finally {
237 | response.close();
238 | }
239 | } catch (IOException e) {
240 | throw e;
241 | } catch (GeneralSecurityException ex) {
242 | throw ex;
243 | } finally {
244 | try {
245 | if (httpClient != null)
246 | httpClient.close();
247 | } catch (IOException e) {
248 | throw e;
249 | }
250 | }
251 | return str;
252 | }
253 |
254 | /**
255 | * 执行post请求,返回doc
256 | *
257 | * @param url
258 | * @param params
259 | * @return
260 | * @throws Exception
261 | */
262 | public Document executePostAsDocument(String url, Map params) throws Exception {
263 | return parseHtmlToDoc(executePost(url, params));
264 | }
265 |
266 | /**
267 | * 执行post请求
268 | *
269 | * @param url
270 | * @param params
271 | * @return
272 | * @throws Exception
273 | */
274 | public String executePost(String url, Map params) throws Exception {
275 | String reStr = "";
276 | HttpPost httpPost = new HttpPost(url);
277 | httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
278 | httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
279 | List paramsRe = new ArrayList<>();
280 | for (Map.Entry entry : params.entrySet()) {
281 | paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
282 | }
283 | CloseableHttpClient httpclient = HttpClientBuilder.create().build();
284 | CloseableHttpResponse response;
285 | try {
286 | httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
287 | HttpClientContext context = HttpClientContext.create();
288 | response = httpclient.execute(httpPost, context);
289 | getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
290 | HttpEntity entity = response.getEntity();
291 | reStr = EntityUtils.toString(entity, charset);
292 | } catch (IOException e) {
293 | throw e;
294 | } finally {
295 | httpPost.releaseConnection();
296 | }
297 | return reStr;
298 | }
299 |
300 | /**
301 | * 用https执行post请求,返回doc
302 | *
303 | * @param url
304 | * @param params
305 | * @return
306 | * @throws Exception
307 | */
308 | public Document executePostWithSSLAsDocument(String url, Map params) throws Exception {
309 | return parseHtmlToDoc(executePostWithSSL(url, params));
310 | }
311 |
312 | /**
313 | * 用https执行post请求
314 | *
315 | * @param url
316 | * @param params
317 | * @return
318 | * @throws Exception
319 | */
320 | public String executePostWithSSL(String url, Map params) throws Exception {
321 | String re = "";
322 | HttpPost post = new HttpPost(url);
323 | List paramsRe = new ArrayList<>();
324 | for (Map.Entry entry : params.entrySet()) {
325 | paramsRe.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
326 | }
327 | post.setHeader("Cookie", convertCookieMapToString(cookieMap));
328 | post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
329 | CloseableHttpResponse response;
330 | try {
331 | CloseableHttpClient httpClientRe = createSSLInsecureClient();
332 | HttpClientContext contextRe = HttpClientContext.create();
333 | post.setEntity(new UrlEncodedFormEntity(paramsRe));
334 | response = httpClientRe.execute(post, contextRe);
335 | HttpEntity entity = response.getEntity();
336 | if (entity != null) {
337 | re = EntityUtils.toString(entity, charset);
338 | }
339 | getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
340 | } catch (Exception e) {
341 | throw e;
342 | }
343 | return re;
344 | }
345 |
346 | /**
347 | * 发送JSON格式body的POST请求
348 | *
349 | * @param url 地址
350 | * @param jsonBody json body
351 | * @return
352 | * @throws Exception
353 | */
354 | public String executePostWithJson(String url, String jsonBody) throws Exception {
355 | String reStr = "";
356 | HttpPost httpPost = new HttpPost(url);
357 | httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
358 | httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
359 | CloseableHttpClient httpclient = HttpClientBuilder.create().build();
360 | CloseableHttpResponse response;
361 | try {
362 | httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
363 | HttpClientContext context = HttpClientContext.create();
364 | response = httpclient.execute(httpPost, context);
365 | getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
366 | HttpEntity entity = response.getEntity();
367 | reStr = EntityUtils.toString(entity, charset);
368 | } catch (IOException e) {
369 | throw e;
370 | } finally {
371 | httpPost.releaseConnection();
372 | }
373 | return reStr;
374 | }
375 |
376 | /**
377 | * 发送JSON格式body的SSL POST请求
378 | *
379 | * @param url 地址
380 | * @param jsonBody json body
381 | * @return
382 | * @throws Exception
383 | */
384 | public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
385 | String re = "";
386 | HttpPost post = new HttpPost(url);
387 | post.setHeader("Cookie", convertCookieMapToString(cookieMap));
388 | post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
389 | CloseableHttpResponse response;
390 | try {
391 | CloseableHttpClient httpClientRe = createSSLInsecureClient();
392 | HttpClientContext contextRe = HttpClientContext.create();
393 | post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
394 | response = httpClientRe.execute(post, contextRe);
395 | HttpEntity entity = response.getEntity();
396 | if (entity != null) {
397 | re = EntityUtils.toString(entity, charset);
398 | }
399 | getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
400 | } catch (Exception e) {
401 | throw e;
402 | }
403 | return re;
404 | }
405 |
406 | /**
407 | * 获取页面文档字串(等待异步JS执行)
408 | *
409 | * @param url 页面URL
410 | * @return
411 | * @throws Exception
412 | */
413 | public String getHtmlPageResponse(String url) throws Exception {
414 | String result = "";
415 |
416 | final WebClient webClient = new WebClient(BrowserVersion.CHROME);
417 |
418 | webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常
419 | webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常
420 | webClient.getOptions().setActiveXNative(false);
421 | webClient.getOptions().setCssEnabled(false);//是否启用CSS
422 | webClient.getOptions().setJavaScriptEnabled(true); //很重要,启用JS
423 | webClient.setAjaxController(new NicelyResynchronizingAjaxController());//很重要,设置支持AJAX
424 |
425 | webClient.getOptions().setTimeout(timeout);//设置“浏览器”的请求超时时间
426 | webClient.setJavaScriptTimeout(timeout);//设置JS执行的超时时间
427 |
428 | HtmlPage page;
429 | try {
430 | page = webClient.getPage(url);
431 | } catch (Exception e) {
432 | webClient.close();
433 | throw e;
434 | }
435 | webClient.waitForBackgroundJavaScript(waitForBackgroundJavaScript);//该方法阻塞线程
436 |
437 | result = page.asXml();
438 | webClient.close();
439 |
440 | return result;
441 | }
442 |
443 | /**
444 | * 获取页面文档Document对象(等待异步JS执行)
445 | *
446 | * @param url 页面URL
447 | * @return
448 | * @throws Exception
449 | */
450 | public Document getHtmlPageResponseAsDocument(String url) throws Exception {
451 | return parseHtmlToDoc(getHtmlPageResponse(url));
452 | }
453 |
454 | private void getCookiesFromCookieStore(CookieStore cookieStore, Map cookieMap) {
455 | List cookies = cookieStore.getCookies();
456 | for (Cookie cookie : cookies) {
457 | cookieMap.put(cookie.getName(), cookie.getValue());
458 | }
459 | }
460 |
461 | private String convertCookieMapToString(Map map) {
462 | String cookie = "";
463 | for (Map.Entry entry : map.entrySet()) {
464 | cookie += (entry.getKey() + "=" + entry.getValue() + "; ");
465 | }
466 | if (map.size() > 0) {
467 | cookie = cookie.substring(0, cookie.length() - 2);
468 | }
469 | return cookie;
470 | }
471 |
472 | /**
473 | * 创建 SSL连接
474 | *
475 | * @return
476 | * @throws GeneralSecurityException
477 | */
478 | private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
479 | try {
480 | SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true).build();
481 | SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext,
482 | (s, sslContextL) -> true);
483 | return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
484 | } catch (GeneralSecurityException e) {
485 | throw e;
486 | }
487 | }
488 | }
489 |
--------------------------------------------------------------------------------