python_day8爬蟲之BeautifulSoup&MongoDB

今日作業(yè)

今日作業(yè)
程序代碼
'''
url:
    https://www.wandoujia.com/category/6001


https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9

32個(gè)頁(yè)面
'''

import requests
from bs4 import BeautifulSoup as bs
import lxml
import re
from pymongo import MongoClient

# Issue a plain GET request and hand the raw response back to the caller.
def get_page(url):
    return requests.get(url)

# Parse an app's detail page and store the extracted fields in MongoDB.
def parse_detail(data):
    """Parse a wandoujia app detail page (HTML string) and write the
    extracted fields as one document to the ``wandoujia.detail``
    collection.

    Relies on the module-level ``client`` (MongoClient) being open.
    """
    soup = bs(data, 'lxml')

    # App name.
    # BUG FIX: ``attrs`` must be a dict mapping attribute -> value; the
    # original passed a set ({'class', 'title'}), which bs4 cannot use.
    name = soup.find(name='span', attrs={'class': 'title'}).text

    # Approval rate.
    love = soup.find(name='span', attrs={'class': 'love'}).text

    # Number of comments.
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text

    # Editor's review.
    commit = soup.find(name='div', attrs={'class': 'con'}).text

    # Download link (trailing space removed from the class name so the
    # class-token match can succeed).
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']

    # Introduction text.
    intro = soup.find(name='div', attrs={'class': 'desc-info'}).text

    # First user review (star rating, user name, comment, time); the
    # section is optional, so fall back to None when any piece is missing.
    star_dict = {'width: 20%': '1顆星', 'width: 40%': '2顆星', 'width: 60%': '3顆星', 'width: 80%': '4顆星', 'width: 100%': '5顆星'}
    try:
        star_style = soup.find(name='i', attrs={'class': 'score-current'}).attrs['style']
        # BUG FIX: the original evaluated ``star_dict[star]`` after the
        # except branch had set ``star_dict = None``, so any page without
        # reviews crashed with a TypeError; resolve the label here instead.
        star = star_dict.get(star_style)
        user_name = soup.find(name='span', attrs={'class': 'name'}).text
        comment_time = soup.find(name='span', attrs={'class': 'time'}).text
        user_commit = soup.find(name='p', attrs={'class': 'cmt-content'}).text
    except (AttributeError, KeyError):
        star = user_name = comment_time = user_commit = None

    # Screenshot links. The page carries five screenshots (data-index 0-4);
    # the original ``range(0, 4)`` silently dropped the fifth.
    link = []
    for i in range(5):
        img_tag = soup.find(name='img', attrs={'data-index': '{}'.format(i)})
        if img_tag is not None:
            link.append(img_tag.attrs['src'])

    # BUG FIX: the original issued eleven insert() calls, producing eleven
    # unrelated one-field documents per app; store one document per app.
    # insert() is also deprecated in pymongo 3.x — use insert_one().
    client['wandoujia']['detail'].insert_one({
        'app_name': name,
        'love': love,
        'commit_num': commit_num,
        'commit': commit,
        'download_link': download,
        'intro': intro,
        'user_name': user_name,
        'time': comment_time,
        'star': star,
        'user_commit': user_commit,
        'pic_link': link,
    })



# Parse the category index page: store per-app index data in MongoDB
# and then visit each app's detail page.
def parse_text(data):
    """Parse the category-page HTML fragment, write one index document
    per app card to ``wandoujia.index``, then fetch and parse every
    app's detail page.

    Relies on the module-level ``client`` (MongoClient) being open.
    """
    soup = bs(data, 'lxml')

    # Every app card is rendered as <li class="card">.
    li_data = soup.find_all(name='li', attrs={'class': 'card'})

    for li in li_data:
        # Icon address: images are lazy-loaded, so the real URL sits in
        # the data-original attribute rather than src.
        img = li.find(name='img').attrs['data-original']

        # Install count, e.g. "13.9万人安装".
        download_count = li.find(name='span', attrs={'class': 'install-count'}).text

        # Package size, e.g. "33.67MB".
        # BUG FIX: raw string for the regex — '\w' in a plain string is an
        # invalid escape sequence (warning now, error in future Pythons).
        size = li.find(name='span', text=re.compile(r'\w+B')).text

        # Detail-page link: the first <a> in the card.
        detail_url = li.find(name='a').attrs['href']

        # BUG FIX: the original issued four insert() calls per app,
        # producing four unrelated one-field documents; write one document
        # per app. insert() is deprecated in pymongo 3.x — use insert_one().
        client['wandoujia']['index'].insert_one({
            'icon_addr': img,
            'download_count': download_count,
            'size': size,
            'detail_url': detail_url,
        })

        # Fetch the detail page and parse it.
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)


def main():
    """Crawl all 32 pages of category 6001, then close the Mongo client."""
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)

        response = get_page(url)
        # The endpoint answers with JSON; the app-card markup is the HTML
        # fragment stored under data.content.
        data = response.json()
        app_li = data['data']['content']

        # Parse the index fragment (inserts documents, visits detail pages).
        parse_text(app_li)

    # BUG FIX: the original called client.close() INSIDE the loop, so every
    # page after the first was processed against a closed MongoClient.
    client.close()

if __name__ == '__main__':
    # Module-level Mongo client; parse_text/parse_detail rely on it.
    client = MongoClient('localhost', 27017)
    print(client)
    # NOTE(review): these two bare lookups have no effect — MongoDB creates
    # databases and collections lazily on first insert.
    client['wandoujia']['index']
    client['wandoujia']['detail']
    main()

1.基本使用

'''
BeautifulSoup4
1、什么是bs4
    是一個(gè)可以從HTML或XML文檔中提取數(shù)據(jù)的解析庫(kù)（并非基于re開(kāi)發(fā)）

'''

'''
1.基本使用
'''
from bs4 import BeautifulSoup as bs

import lxml

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>

<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a  class="sister" >Elsie</a>,
<a  class="sister" id="link2">Lacie</a> and
<a  class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Instantiate BeautifulSoup to get a soup object.
# Argument 1: the markup to parse; argument 2: the parser to use.
soup = bs(html_doc, 'lxml')

print(soup)

# Pretty-print (re-indent) the parsed document.
html = soup.prettify()
print(html)

2.遍歷文檔樹(shù)

from bs4 import BeautifulSoup as bs

import lxml


html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a  class="sister" >Elsie</a>,<a  class="sister" id="link2">Lacie</a> and<a  class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')

'''
    1、直接使用
    2、獲取標(biāo)簽的名稱
    3、獲取標(biāo)簽的屬性
    4、獲取標(biāo)簽的內(nèi)容
    5、嵌套選擇
    6、子節(jié)點(diǎn)、子孫節(jié)點(diǎn)
    7、父節(jié)點(diǎn)、祖先節(jié)點(diǎn)
    8、兄弟節(jié)點(diǎn)
'''

'''
1、直接使用
'''
# Attribute-style access finds the FIRST tag with that name.
print(soup.p)
print(soup.a)

'''
2、獲取標(biāo)簽的名稱
'''
print(soup.head.name)

'''
3、獲取標(biāo)簽的屬性
'''
# All attributes of the first <a> tag, as a dict.
print(soup.a.attrs)

# The href attribute of the first <a> tag.
# NOTE(review): the sample <a> tags above carry no href attribute, so
# this lookup raises KeyError — the hrefs were lost when the markup was
# copied; confirm against the original document.
print(soup.a.attrs['href'])

'''
4、獲取標(biāo)簽的內(nèi)容
'''
print(soup.p.text)

'''
5、嵌套選擇
'''
print(soup.html.head)

'''
6、子節(jié)點(diǎn)、子孫節(jié)點(diǎn)
'''
# All direct children of <body>; returns an iterator.
print(soup.body.children)
# Materialize the iterator into a list.
print(list(soup.body.children))

# All descendants of <body>; returns a generator.
print(soup.body.descendants)
# Materialize the generator into a list.
print(list(soup.body.descendants))

'''
7、父節(jié)點(diǎn)、祖先節(jié)點(diǎn)
'''
# Parent node of the first <p>.
print(soup.p.parent)
# All ancestors of the first <p>; returns a generator.
print(soup.p.parents)
# Materialize the generator into a list.
print(list(soup.p.parents))

'''
8、兄弟節(jié)點(diǎn)
'''
# Next sibling of the first <p>.
print(soup.p.next_sibling)
# All following siblings of the first <p> (generator).
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))

# Previous sibling of the first <a>.
print(soup.a.previous_sibling)
# All preceding siblings of the first <a> (generator).
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))

3.搜索文檔樹(shù)

'''
搜索文檔樹(shù):
    find()      找一個(gè)
    find_all()  找所有

標(biāo)簽查找與屬性查找:

    標(biāo)簽:
        name 屬性匹配
        attrs 屬性查找匹配
        text 文本匹配

        - 字符串過(guò)濾器
            字符串全局匹配

        - 正則過(guò)濾器
            re模塊匹配

        - 列表過(guò)濾器
            列表內(nèi)的數(shù)據(jù)匹配

        - bool過(guò)濾器
            True匹配

        - 方法過(guò)濾器
            用于一些要的屬性以及不需要的屬性查找。

    屬性:
        - class_
        - id
'''

from bs4 import BeautifulSoup as bs
import lxml

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a  class="sister" >Elsie</a>,<a  class="sister" id="link2">Lacie</a> and<a  class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')


'''
字符串過(guò)濾器
'''
# name
# Look up by exact tag name: the first <p>.
p_tag = soup.find(name='p')
print(p_tag)

# All <p> tags.
p_all_tag = soup.find_all(name='p')
print(p_all_tag)

# attrs
# First node whose class attribute is "sister".
p_class = soup.find(attrs={'class': 'sister'})
print(p_class)

# All nodes whose class attribute is "sister".
p_all_class = soup.find_all(attrs={'class': 'sister'})
print(p_all_class)

# text
# Search by exact text content; returns the matching string.
p_text = soup.find(text='$37')
print(p_text)

# Filters combined: an <a> tag with id="link2" whose text is "Lacie".
p_all = soup.find(name='a', attrs={'id': 'link2'},text='Lacie')
print(p_all)


'''
正則過(guò)濾器
'''
import re
# name
# Look up by tag name via regex (matches any name containing "p").
p_tag = soup.find(name=re.compile('p'))
print(p_tag)


'''
列表過(guò)濾器
'''
import re
# name
# A list of filters matches a tag when ANY element matches.
p_tags = soup.find_all(name=['p', 'a', re.compile('html')])
print(p_tags)


'''
bool過(guò)濾器
'''
# True as an attribute filter: first <p> that HAS an id attribute.
p_tag = soup.find(name='p', attrs={'id': True})
print(p_tag)


'''
方法過(guò)濾器
'''
# Method filter: match an <a> tag that has an "id" attribute but NO
# "class" attribute. Returning the tag marks it as a match; returning
# None (implicitly) rejects it.
def have_id_not_class(tag):
    # BUG FIX: the original tested ``tag.has_attr('class')`` without the
    # negation, contradicting the stated intent ("has id, no class").
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag

# Apply the method filter to the tree and show the first tag it accepts.
tag = soup.find(have_id_not_class)
print(tag)

4.爬取豌豆莢app數(shù)據(jù)

'''
url:
    https://www.wandoujia.com/category/6001


https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9

32個(gè)
'''

import requests
from bs4 import BeautifulSoup as bs
import lxml
import re

# Fetch *url* with a GET request; the caller receives the raw response.
def get_page(url):
    return requests.get(url)

# Parse an app's detail page and print the extracted fields.
def parse_detail(data):
    """Parse a wandoujia app detail page (HTML string) and print the
    app name, approval rate, comment count, editor review and
    download link."""
    soup = bs(data, 'lxml')

    # App name.
    # BUG FIX: ``attrs`` must be a dict mapping attribute -> value; the
    # original passed a set ({'class', 'title'}), which bs4 cannot use.
    name = soup.find(name='span', attrs={'class': 'title'}).text

    # Approval rate.
    love = soup.find(name='span', attrs={'class': 'love'}).text

    # Number of comments.
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text

    # Editor's review.
    commit = soup.find(name='div', attrs={'class': 'con'}).text

    # Download link (trailing space removed from the class name so the
    # class-token match can succeed).
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']

    print(
        '''
        APP名稱:{}
        好評(píng)率:{}
        評(píng)論人數(shù):{}
        小編點(diǎn)評(píng):{}
        下載鏈接:{}
        '''
        .format(name, love, commit_num, commit, download)
    )

# Parse the category index page and visit each app's detail page.
def parse_text(data):
    """Parse the category-page HTML fragment, print each app card's
    index data, then fetch and parse every app's detail page."""
    soup = bs(data, 'lxml')

    # Each app is one <li class="card"> holding the icon (lazy-loaded
    # via data-original), install count, size and a detail-page link.
    li_data = soup.find_all(name='li', attrs={'class': 'card'})

    for li in li_data:
        # Icon address: the real URL sits in data-original, not src.
        img = li.find(name='img').attrs['data-original']
        print('圖標(biāo)地址:'+img)

        # Install count.
        count = li.find(name='span', attrs={'class': 'install-count'}).text
        print('下載人數(shù):'+count)

        # Package size, e.g. "33.67MB".
        # BUG FIX: raw string for the regex — '\d' in a plain string is an
        # invalid escape sequence (warning now, error in future Pythons).
        size = li.find(name='span', text=re.compile(r'\d+MB')).text
        # BUG FIX: the original label literal was corrupted by a bad
        # encoding round-trip; restored to match the clean form used in
        # the first version of this function.
        print('大小:'+size)

        # Detail-page link: the first <a> in the card.
        detail_url = li.find(name='a').attrs['href']
        print('詳情頁(yè)鏈接:'+detail_url)

        # Fetch the detail page and parse it.
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)

# Entry point: walk all 32 pages of category 6001.
def main():
    """Iterate over every page of the category listing and parse each one."""
    for page in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(page)

        # The endpoint answers with JSON; deserialize it and pull out the
        # HTML fragment that holds the app cards.
        payload = get_page(url).json()
        cards_html = payload['data']['content']

        # Hand the fragment to the index-page parser.
        parse_text(cards_html)

if __name__ == '__main__':
    main()

5.MongoDB

MongoDB是一款強(qiáng)大、靈活、且易于擴(kuò)展的通用型非關(guān)系型數(shù)據(jù)庫(kù)。


關(guān)系型數(shù)據(jù)庫(kù)與非關(guān)系型數(shù)據(jù)庫(kù)

MongoDB與SQL對(duì)比
'''
一、安裝運(yùn)行
1.下載安裝
    -https://www.mongodb.com/download-center#community

2.安裝路徑為D:\MongoDB,將D:\MongoDB\bin目錄加入環(huán)境變量

3.新建目錄與文件
    -D:\MongoDB\data\db
    -D:\MongoDB\log\mongod.log

4.在C盤建立文件夾C:/data/db
    -數(shù)據(jù)存放路徑

5.輸入mongod啟動(dòng)服務(wù)
    進(jìn)入終端(以管理員身份),輸入mongod啟動(dòng)MongoDB服務(wù)

6.輸入mongo進(jìn)入MongoDB客戶端(不要關(guān)閉服務(wù)端)
    打開(kāi)一個(gè)新的cmd,輸入mongo進(jìn)入客戶端

二、數(shù)據(jù)庫(kù)操作
1.切換庫(kù)
SQL:
    use admin;  有則切換,無(wú)則報(bào)錯(cuò)

MongoDB:
    use tank;   有則切換,無(wú)則創(chuàng)建并切換
    
2.查數(shù)據(jù)庫(kù)
SQL:
    show databases;

MongoDB:
    show dbs;   僅顯示有數(shù)據(jù)的庫(kù)

3.刪除庫(kù)
SQL:
    drop database;
    
MongoDB:
    db.dropDatabase();
    
三、集合操作  mysql中叫做表
1.創(chuàng)建集合
SQL:
    create table f1,f2...
    
MongoDB:
    # 在當(dāng)前庫(kù)中通過(guò).創(chuàng)建集合
    db.student

2.插入數(shù)據(jù)
    # 插入一條數(shù)據(jù)
    db.student.insert({"name":"hy"})
    
    # 插入多條數(shù)據(jù)
    db.student.insert({"name1":"hy1"},{"name2":"hy2"})

3.查找數(shù)據(jù)
    # 查找student集合中所有數(shù)據(jù)
    db.student.find({})
    
    # 查找一條數(shù)據(jù),一條name為hy的數(shù)據(jù)
    db.student.find({"name":"hy"})
'''

6.pymongo使用

from pymongo import MongoClient

# 1. Connect to the MongoDB server.
#    Argument 1: host; argument 2: port (27017 is the MongoDB default).
client = MongoClient('localhost', 27017)
print(client)


# 2. Access the hy_db database (created lazily on first write).
print(client['hy_db'])

# 3. Access a collection (typo fixed: 'prople' -> 'people', matching the
#    collection every other statement below uses).
print(client['hy_db']['people'])

# 4. Insert a single document.
#    BUG FIX: insert() is deprecated since pymongo 3.0 and removed in
#    4.0; insert_one() is the supported API.
data1 = {'name': 'hy', 'age': '23', 'sex': 'male'}
client['hy_db']['people'].insert_one(data1)

# 5. Insert several documents at once with insert_many().
#    BUG FIX: the original also called insert_one()/insert_many() with
#    no arguments, which raises TypeError; pass the documents instead.
data1 = {'name': 'hy1', 'age': '23', 'sex': 'male'}
data2 = {'name': 'hy2', 'age': '22', 'sex': 'male'}
data3 = {'name': 'hy3', 'age': '21', 'sex': 'male'}
client['hy_db']['people'].insert_many([data1, data2, data3])


# 6. Query documents.
# All documents in the collection (a cursor):
data_s = client['hy_db']['people'].find()
# Iterate the cursor and print every document.
for data in data_s:
    print(data)

# A single document:
data = client['hy_db']['people'].find_one()
print(data)

7.MongoDB可視化工具

https://robomongo.org/

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

  • 一、MongoDB簡(jiǎn)介 1.概述 ? MongoDB是一個(gè)基于分布式文件存儲(chǔ)的數(shù)據(jù)庫(kù),由C++語(yǔ)言編寫。旨在為WE...
    鄭元吉閱讀 1,118評(píng)論 0 2
  • MongoDB 是一個(gè)基于分布式文件存儲(chǔ)的數(shù)據(jù)庫(kù)。由 C++ 語(yǔ)言編寫,旨在為 WEB 應(yīng)用提供可擴(kuò)展的高性能數(shù)據(jù)...
    LittlePy閱讀 1,681評(píng)論 0 4
  • 在Python 應(yīng)用中使用 MongoDB 1、SQL vs NoSQL 如果你不是很熟悉NoSQL這個(gè)概念,Mo...
    _士心_閱讀 663評(píng)論 0 5
  • 為什么要使用 MongoDB 以及 Pymongo 在程序開(kāi)發(fā)實(shí)踐中,除了學(xué)習(xí)代碼、算法之外,其他開(kāi)發(fā)有用的程序一...
    speculatecat閱讀 3,668評(píng)論 2 13
  • 月黑雁飛高,咪嚕夜秉燭。欲把書盡歡,魚片滿手沾。 麻麻做主的家庭,通常做一件事主次不分。 明明是要看書的,卻不想白...
    MiluJoy閱讀 273評(píng)論 0 2

友情鏈接更多精彩內(nèi)容