爬蟲學(xué)習(xí)(二)數(shù)據(jù)解析

1.xpath語法

xpath語法

2.1用lxml庫解析html字符串和文件

from lxml import etree

# Parse an HTML *string* into an element tree.
html = etree.HTML(text)
print(etree.tostring(html, encoding="utf-8").decode("utf-8"))

# Parse an HTML *file*: use etree.parse(), not etree.HTML().
# (etree.HTML() treats its argument as markup, so passing a file name
# would parse the literal string "lagou.html" instead of the file.)
html = etree.parse("lagou.html")
print(etree.tostring(html, encoding="utf-8").decode("utf-8"))

# etree.parse() defaults to the XML parser; for malformed HTML create an
# explicit HTML parser and pass it in to avoid parse errors.
parser = etree.HTMLParser(encoding='utf-8')
html = etree.parse("lagou.html", parser=parser)
print(etree.tostring(html, encoding="utf-8").decode("utf-8"))

2.2xpath和lxml庫配合使用

from lxml import etree

parser = etree.HTMLParser(encoding="utf-8")
html = etree.parse('tencet.html', parser=parser)

# 1. Get every <tr> tag.  NOTE: xpath() always returns a list.
# (The original snippet's paths were mangled to "http://tr" by the blog
# platform's auto-linking; the real XPath starts with "//".)
trs = html.xpath("//tr")
for tr in trs:
    print(etree.tostring(tr, encoding='utf-8').decode("utf-8"))

# 2. Get the second <tr> tag (fixes the original ".[0]" syntax error).
tr = html.xpath("//tr[2]")[0]
print(etree.tostring(tr, encoding='utf-8').decode("utf-8"))

# 3. Get every <tr> with class="even" (fixes the "@clss" typo; stored in
#    its own variable so step 5 still operates on the element from step 2).
trs_even = html.xpath("//tr[@class='even']")

# 4. Get the href attribute of every <a> tag (fixes the "@herf" typo).
alist = html.xpath("//a/@href")

# 5. Get the text nodes under a specific element.
title = tr.xpath(".//td[1]//text()")

示例 電影天堂爬取

import requests
from lxml import etree
Base_DOMAIN = 'https://www.dytt8.net/'
# url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    "Referer": "https://www.dytt8.net/html/gndy/dyzz/list_23_2.html"}


def get_url(url):
    """Fetch one movie-list page and return the detail-page URLs on it.

    Returns a lazy map object of absolute URLs (Base_DOMAIN + relative link).
    """
    resp = requests.get(url, headers=HEADERS)
    # Let requests guess the encoding; the explicit-gbk variant is kept
    # below for reference.
    # text = resp.content.decode('gbk', errors='ignore')
    text = resp.text
    html = etree.HTML(text)
    # FIX: the XPath was mangled to "http://table..." by auto-linking;
    # the detail links live in the table with class "tbspan".
    links = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are site-relative; prefix the domain to make them absolute.
    urls = map(lambda link: Base_DOMAIN + link, links)
    return urls


def jx(url):
    """Scrape one movie detail page and return its fields as a dict.

    Keys: title, haibao (poster URL), years, scores, actors (list),
    profile (synopsis), download_url (list of links).
    """
    movies = {}
    resp = requests.get(url, headers=HEADERS)
    # The site serves GBK-encoded pages; ignore undecodable bytes.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # FIX: all four XPaths below were mangled to "http://..." by the blog
    # platform's auto-linking; they must start with "//".
    movie_title = html.xpath(
        "//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movies["title"] = movie_title
    Zoom = html.xpath("//div[@id='Zoom']")[0]
    photos = Zoom.xpath(".//img/@src")
    haibao = photos[0]  # the first image is the poster
    movies["haibao"] = haibao
    infos = Zoom.xpath(".//text()")
    # enumerate() yields (index, text); the index is needed to walk the
    # lines that follow a "cast" or "synopsis" heading.
    for index, info in enumerate(infos):
        # startswith() tests whether the line begins with a given heading.
        if info.startswith("◎年  代"):
            year = info_1(info, "◎年  代")
            movies["years"] = year
        elif info.startswith("◎豆瓣評分"):
            scores = info_1(info, "◎豆瓣評分")
            movies["scores"] = scores
        elif info.startswith("◎主  演"):
            # First actor is on the heading line; the rest follow one per
            # line until the next "◎" heading.
            info = info_1(info, "◎主  演")
            actor = [info]
            for x in range(index + 1, len(infos)):
                actors = infos[x].strip()
                if actors.startswith("◎"):
                    break
                actor.append(actors)
            movies["actors"] = actor
        elif info.startswith("◎簡  介"):
            info = info_1(info, "◎簡  介")
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("◎"):
                    break
                # NOTE(review): this keeps only the last non-heading line;
                # presumably the synopsis is a single line — verify.
                movies["profile"] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    movies["download_url"] = download_url
    return movies


def info_1(info, rule):
    """Remove the heading *rule* from *info* and trim surrounding whitespace."""
    without_heading = info.replace(rule, "")
    return without_heading.strip()


def spider():
    """Crawl the movie list pages and collect each movie's detail dict."""
    # The {} slot receives the page number.
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    film = []
    for page in range(1, 2):
        list_url = base_url.format(page)
        for detail_url in get_url(list_url):
            film.append(jx(detail_url))
            print(film)  # progress: prints the accumulated list so far
    # Optional persistence (disabled):
    # with open("E:/桌面/電影.txt","w")as f:
    #     for x in film:
    #         f.write("\n"+str(x))



# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    spider()

3.BeautifulSoup4庫

BeautifulSoup也是HTML/XML的解析器,主要用于解析和提取HTML/XML。它作用于HTML DOM,會載入整個文檔,而xpath只是局部遍歷,因此BeautifulSoup性能上低于xpath,但它解析HTML比xpath簡單
安裝方法:pip安裝

pip install beautifulsoup4

基本使用

from bs4 import BeautifulSoup

# Parse the HTML with the lxml backend.
# FIX: the original assigned to `bs` but used `soup` everywhere below
# (NameError); use one consistent name.
soup = BeautifulSoup(html, 'lxml')

# 1. Get every <tr> tag.
trs = soup.find_all('tr')

# 2. Get the second <tr> tag.  `limit` caps how many results are
#    returned; find_all always returns a list.
tr_2 = soup.find_all('tr', limit=2)[1]

# 3. Get every <tr> with class="even".
tr_even = soup.find_all('tr', class_='even')
tr_even = soup.find_all('tr', attrs={'class': 'even'})  # attrs filters on attributes

# 4. Get <a> tags having both id="test" and class="test".
alist = soup.find_all('a', id='test', class_='test')
alist = soup.find_all('a', attrs={'id': 'test', 'class': 'test'})

# 5. Get the href attribute of every <a> tag.
alist = soup.find_all('a')
for a in alist:
    # Method 1: item access on the tag.
    href = a['href']
    # Method 2: subscript the attrs dict.
    # FIX: attrs is a dict — a.attrs('href') would raise TypeError.
    href = a.attrs['href']

# 6. Get all text.
tr_3 = soup.find_all('tr')[1:]  # skip the header row
for tr in tr_3:
    # .strings keeps whitespace such as "\n"; .string returns a single
    # string; get_text() returns joined text rather than a list.
    infos = tr.stripped_strings  # only non-empty, stripped strings
    infos = list(infos)  # materialize so elements can be indexed

CSS選擇器 select

#1.通過標(biāo)簽名查找
print(soup.select('a'))
#2.通過類名查找,如查找class=sy
print(soup.select('.sy'))
#3.通過id查找
print(soup.select('#sy'))
#4.組合查找 標(biāo)簽+id/class等
print(soup.select('p #sy'))
#5.通過屬性查找
print(soup.select("a[href='http://......']"))

實(shí)例

#BeautifulSoup實(shí)例及數(shù)據(jù)可視化簡單應(yīng)用
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar #數(shù)據(jù)可視化庫,版本1.7.1,新版本改動
from pyecharts import options as opts

weather = []
def page_parse(url):
    """Parse one regional forecast page and append each city's minimum
    temperature to the module-level `weather` list."""
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    page = requests.get(url, headers=headers).content.decode('utf-8')
    # html5lib tolerates the broken markup of the HK/Macao/Taiwan page.
    soup = BeautifulSoup(page, 'html5lib')
    container = soup.find('div', class_='conMidtab')
    for table in container.find_all('table'):
        # The first two rows of each provincial table are headers.
        for row_no, row in enumerate(table.find_all('tr')[2:]):
            cells = row.find_all('td')
            # In the first data row the province name occupies cell 0,
            # pushing the city name into cell 1.
            city_cell = cells[1] if row_no == 0 else cells[0]
            city = list(city_cell.stripped_strings)[0]
            # The minimum temperature is the second-to-last cell.
            temp = list(cells[-2].stripped_strings)[0]
            weather.append({'city': city, 'min_temp': int(temp)})


def main():
    """Crawl every regional forecast page, then chart the ten coldest cities."""
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml'
    ]
    for url in urls:
        page_parse(url)
    # Sort ascending by minimum temperature and keep the ten coldest.
    weather.sort(key=lambda item: item['min_temp'])
    data = weather[0:10]
    cities = list(map(lambda x: x['city'], data))
    min_temps = list(map(lambda x: x['min_temp'], data))
    chart = Bar()  # bar chart
    chart.add_xaxis(cities)
    chart.add_yaxis('', min_temps)
    # BUG FIX: pyecharts' set_global_opts() resets every option that is
    # not passed in, so the original three separate calls left only the
    # last one (y-axis name) applied and wiped the title and x-axis name.
    # Configure everything in a single call instead.
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="天氣預(yù)報"),
        xaxis_opts=opts.AxisOpts(name='城市'),
        yaxis_opts=opts.AxisOpts(name='溫度'),
    )
    chart.render('E:/桌面/天氣.html')



# Run the weather crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

4.正則表達(dá)式

基本知識


匹配單個字符.jpg

匹配多個字符.jpg

正則表達(dá)式常用小案例

import re

# Use raw strings for every regex so "\d", "\w", "\s" are not treated as
# string escapes (non-raw forms emit DeprecationWarning / SyntaxWarning
# on modern Python).

# 1. Match a Chinese mobile phone number.
text = '13691612426'
ret = re.match(r'1[34578]\d{9}', text)
print(ret.group())
# 2. Match an e-mail address.
text = '1871759153@qq.com'
ret = re.match(r'\w+@[a-z0-9]+\.[a-z]+', text)
print(ret.group())
# 3. Match a URL.
text = 'https://www.runoob.com/python3/python3-tutorial.html'
ret = re.match(r'(http|https|ftp)://[^\s]+', text)
print(ret.group())
# 4. Validate an ID-card number: 17 digits plus a digit or x/X.
text = '32042519121281241x'
ret = re.match(r'\d{17}[\dxX]', text)
print(ret.group())
# 5. Match a number from 1 to 100.
text = '98'
ret = re.match(r'[1-9]\d?$|100$', text)
print(ret.group())

group()分組

import re

# group(n) returns the text captured by the n-th parenthesised group
# (group(0) / group() is the whole match).  Raw string fixes the "\$"
# and "\d" escapes.
text = 'apple prince is $5,iphone price is $300'
ret = re.match(r'.*(\$\d+).*(\$\d+)', text)
print(ret.group(1))
print(ret.group(2))

re模塊常用函數(shù)

import re

# Common re-module functions (raw strings used for every pattern).

# 1. findall(): return every match as a list.
text = 'apple prince is $5,iphone price is $300'
ret = re.findall(r'\d+', text)
print(ret)
# 2. sub(): replace every match.
text = 'apple prince is $5,iphone price is $300'
ret = re.sub(r'\d+', '0', text)
print(ret)
# 3. split(): split on the pattern, returning a list.
text = 'hello world ni hao'
ret = re.split(' ', text)
print(ret)
# 4. compile(): pre-compile a frequently used pattern for efficiency.
#    re.VERBOSE lets the pattern contain whitespace and inline comments.
text = 'the number is 20.50'
r = re.compile(r"""
                \d+   # digits before the decimal point
                \.?   # the decimal point (optional)
                \d+   # digits after the decimal point
                """, re.VERBOSE)
ret = re.search(r, text)
print(ret.group())

實(shí)例分析

#古詩詞網(wǎng)爬?。ㄕ齽t表達(dá)式的應(yīng)用)
import requests
import re

poems = []
def page_parse(url):
    """Scrape one list page of gushiwen.cn and append every poem's fields
    to the module-level `poems` list."""
    headers = {
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    text = requests.get(url, headers=headers).text
    # re.DOTALL lets "." (and therefore ".*?") match newlines in the HTML.
    titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    contents_tags = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.DOTALL)
    # Strip any leftover tags from each poem body.
    contents = [re.sub('<.*?>', '', raw).strip() for raw in contents_tags]
    # zip() walks the four parallel lists in lockstep, yielding one
    # (title, dynasty, author, content) tuple per poem.
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        poems.append({
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content,
        })



def main1():
    """Crawl the first three list pages of the poetry site."""
    base_url = 'https://www.gushiwen.cn/default_{}.aspx'
    for page in range(1, 4):
        # Fill the page number into the URL template, then scrape it.
        page_parse(base_url.format(page))

# When run as a script: crawl, then dump every collected poem to a file.
if __name__ == '__main__':
    main1()
    with open('E:/桌面/poems.txt','w')as fp:
        for poem in poems:
            fp.write("\n"+str(poem))
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

友情鏈接更多精彩內(nèi)容