java爬蟲之Xsoup(xpath) 解析

Git地址:https://github.com/code4craft/xsoup

引入包

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.3</version>
        </dependency>

案例:

package com.example.power_spider.test;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.xsoup.Xsoup;
import java.io.IOException;



/**
 * Minimal crawler example: downloads one HTML page with Apache HttpClient and
 * extracts parts of it with Xsoup XPath expressions evaluated over a Jsoup document.
 */
public class HttpClientTest {

    /** Page downloaded by this example. */
    private static final String TARGET_URL = "https://auto.gasgoo.com/a/70147658.html";

    /** Charset name used to decode the response body. */
    private static final String RESPONSE_CHARSET = "utf-8";

    /**
     * Sends a GET request to {@link #TARGET_URL} and returns the response body.
     *
     * <p>The method name keeps its original spelling so existing callers keep compiling.
     *
     * @return the response body decoded as UTF-8 when the status is 200; {@code null}
     *         when the status is not 200 or a {@link ClientProtocolException} occurred
     * @throws IOException if the request or closing the connection fails for a reason
     *         other than an HTTP protocol error
     */
    public static String getRosponse() throws IOException {
        String html = null;
        // 1. Create the client (comparable to opening a browser). try-with-resources
        //    closes both the client and the response, so the connection is always released.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            // 2. Build the GET request (comparable to typing the URL into the address bar).
            HttpGet httpget = new HttpGet(TARGET_URL);

            httpget.setHeader("Accept", "text/html, */*; q=0.01");
            // Only advertise encodings this code can handle; "sdch" is not decoded here.
            httpget.setHeader("Accept-Encoding", "gzip, deflate");
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36");

            // 3. Execute the request (comparable to pressing Enter).
            try (CloseableHttpResponse response = httpClient.execute(httpget)) {
                // 5. Read the body; a response is not guaranteed to carry an entity.
                HttpEntity httpEntity = response.getEntity();
                String body = (httpEntity == null)
                        ? null
                        : EntityUtils.toString(httpEntity, RESPONSE_CHARSET);

                // 4. Only a 200 response is treated as a usable page.
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    html = body;
                } else {
                    // Non-200 (e.g. 404): just report it; real handling is left to the reader.
                    System.out.println("返回狀態不是200");
                    System.out.println(body);
                }
            }
        } catch (ClientProtocolException e) {
            // Kept from the original: a protocol error is reported and null is returned.
            e.printStackTrace();
        }
        return html;
    }

    /**
     * Fetches the page and prints the values selected by a few XPath expressions.
     *
     * @param args unused
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        String html = getRosponse();
        if (html == null) {
            // Nothing was downloaded; Jsoup.parse would reject a null input.
            System.err.println("No HTML was fetched; nothing to parse.");
            return;
        }

        Document document = Jsoup.parse(html);
        // XPath expressions start with "//"; the "http:" prefix was a copy/paste artifact.
        String title = Xsoup.compile("//title").evaluate(document).get();
        String result = Xsoup.compile("//div[@class='scrap minwidth']").evaluate(document).getElements().text();
        // Alternative API: select directly from the raw HTML string.
        String data = Xsoup.select(html, "//title").getElements().text();
        System.out.println("===>>>" + title);
        System.out.println("===>>>" + result);
        System.out.println(data);
    }

}

©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容