今日作业

今日作业
程序代码
'''
url:
https://www.wandoujia.com/category/6001
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
32个页面
'''
import requests
from bs4 import BeautifulSoup as bs
import lxml
import re
from pymongo import MongoClient
def get_page(url, timeout=10):
    """Send an HTTP GET request and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the whole crawl.

    Returns:
        The response object produced by ``requests.get``. The HTTP status
        is not checked here; callers read ``.text`` / ``.json()`` directly.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    return requests.get(url, timeout=timeout)
# Maps the inline ``style`` value of the rating bar to a star label.
_STAR_LABELS = {
    'width: 20%': '1顆星',
    'width: 40%': '2顆星',
    'width: 60%': '3顆星',
    'width: 80%': '4顆星',
    'width: 100%': '5顆星',
}


def _class_text(soup, tag_name, css_class):
    """Return the text of the first ``tag_name`` with ``css_class``, or None."""
    node = soup.find(name=tag_name, attrs={'class': css_class})
    return node.text if node is not None else None


def parse_detail(data):
    """Parse an app detail page and store it as one MongoDB document.

    Fields that are absent from the page are stored as ``None`` (or an
    empty list for screenshots) instead of aborting the crawl.

    Args:
        data: HTML text of the detail page.

    Returns:
        None. One document is written to ``client['wandoujia']['detail']``;
        ``client`` is the module-level MongoClient created in the
        ``__main__`` block.
    """
    soup = bs(data, 'lxml')

    # Download link. The class name must not carry a trailing space.
    download_tag = soup.find(name='a', attrs={'class': 'normal-dl-btn'})
    download = download_tag.get('href') if download_tag is not None else None

    # Star rating of the first user comment, read from the bar's style.
    # An unknown or missing style yields None rather than a KeyError/TypeError.
    star_tag = soup.find(name='i', attrs={'class': 'score-current'})
    star_style = star_tag.get('style') if star_tag is not None else None
    star = _STAR_LABELS.get(star_style)

    # Screenshot links for data-index 0..4 (up to five); missing ones are skipped.
    links = []
    for index in range(5):
        img_tag = soup.find(name='img', attrs={'data-index': str(index)})
        if img_tag is not None and img_tag.get('src'):
            links.append(img_tag['src'])

    # All fields go into a single document so they stay associated with
    # the same app.
    client['wandoujia']['detail'].insert_one({
        'app_name': _class_text(soup, 'span', 'title'),
        'love': _class_text(soup, 'span', 'love'),
        'commit_num': _class_text(soup, 'a', 'comment-open'),
        'commit': _class_text(soup, 'div', 'con'),
        'download_link': download,
        'intro': _class_text(soup, 'div', 'desc-info'),
        'user_name': _class_text(soup, 'span', 'name'),
        'time': _class_text(soup, 'span', 'time'),
        'star': star,
        'user_commit': _class_text(soup, 'p', 'cmt-content'),
        'pic_link': links,
    })
def parse_text(data):
    """Parse one page of the app list, store each entry and crawl its detail page.

    Args:
        data: HTML fragment containing ``<li class="card">`` items.

    Returns:
        None. For every card one document is written to
        ``client['wandoujia']['index']`` (``client`` is the module-level
        MongoClient created in the ``__main__`` block), then the card's
        detail page is fetched with ``get_page`` and passed to
        ``parse_detail``.

    Raises:
        AttributeError, KeyError: If a card lacks one of the expected
            elements or attributes.
    """
    soup = bs(data, 'lxml')
    # Compiled once; raw string avoids the invalid "\w" escape warning.
    size_pattern = re.compile(r'\w+B')
    index_collection = client['wandoujia']['index']

    for li in soup.find_all(name='li', attrs={'class': 'card'}):
        # Icon address (the real URL is in the lazy-load attribute).
        img = li.find(name='img').attrs['data-original']
        # Install count.
        download_count = li.find(name='span', attrs={'class': 'install-count'}).text
        # Package size.
        size = li.find(name='span', text=size_pattern).text
        # Detail page link.
        detail_url = li.find(name='a').attrs['href']

        # One document per app keeps the four fields associated.
        index_collection.insert_one({
            'icon_addr': img,
            'download_count': download_count,
            'size': size,
            'detail_url': detail_url,
        })

        # Fetch and parse the detail page.
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)
def main():
    """Crawl category pages 1-32 and close the MongoDB client afterwards.

    Each page is fetched with ``get_page``; the JSON reply is decoded and
    its ``data.content`` value is handed to ``parse_text``. The
    module-level ``client`` is closed even when a page fails.
    """
    url_template = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'
    try:
        for i in range(1, 33):
            response = get_page(url_template.format(i))
            # Deserialize the JSON reply into a dict.
            data = response.json()
            app_li = data['data']['content']
            # Parse the list page.
            parse_text(app_li)
    finally:
        # Close the mongo client whether or not the crawl succeeded.
        client.close()
if __name__ == "__main__":
    # Module-level handle read by parse_text, parse_detail and main.
    client = MongoClient(host='localhost', port=27017)
    print(client)
    # These bare lookups only build collection handles; the results are discarded.
    client['wandoujia']['index']
    client['wandoujia']['detail']
    main()
1.基本使用
'''
BeautifulSoup4
1、什么是bs4
是一个基于re开发的解析库
'''
'''
1.基本使用
'''
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a class="sister" >Elsie</a>,
<a class="sister" id="link2">Lacie</a> and
<a class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Instantiate BeautifulSoup to get a soup object:
# the markup to parse comes first, the parser is named by ``features``.
soup = bs(html_doc, features='lxml')
print(soup)
# Pretty-printed rendering of the parsed document.
html = soup.prettify()
print(html)
2.遍历文档树
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a class="sister" >Elsie</a>,<a class="sister" id="link2">Lacie</a> and<a class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
soup = bs(html_doc, 'lxml')
# Topics demonstrated below:
#   1. direct access
#   2. tag name
#   3. tag attributes
#   4. tag content
#   5. nested selection
#   6. children and descendants
#   7. parent and ancestors
#   8. siblings

# 1. Direct access: find the first <p> tag / first <a> tag.
print(soup.p)
print(soup.a)

# 2. Tag name.
print(soup.head.name)

# 3. Tag attributes.
# All attributes of the first <a> tag.
print(soup.a.attrs)
# The href attribute of the first <a> tag. That tag has no href in
# html_doc, so indexing attrs['href'] would raise KeyError; .get()
# prints None instead.
print(soup.a.attrs.get('href'))

# 4. Tag content.
print(soup.p.text)

# 5. Nested selection.
print(soup.html.head)

# 6. Children and descendants.
# All direct children of <body>; this is an iterator object.
print(soup.body.children)
# Materialized as a list.
print(list(soup.body.children))
# All descendants of <body>; this is a generator object.
print(soup.body.descendants)
# Materialized as a list.
print(list(soup.body.descendants))

# 7. Parent and ancestors.
# Parent of the first <p> tag.
print(soup.p.parent)
# Ancestors of the first <p> tag; this is a generator object.
print(soup.p.parents)
# Materialized as a list.
print(list(soup.p.parents))

# 8. Siblings.
# Next sibling of the first <p> tag.
print(soup.p.next_sibling)
# All following siblings of the first <p> tag.
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# Previous sibling of the first <a> tag.
print(soup.a.previous_sibling)
# All preceding siblings of the first <a> tag.
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
3.搜索文档树
'''
搜索文档树:
find() 找一个
find_all() 找所有
标签查找与属性查找:
标签:
name 属性匹配
attrs 属性查找匹配
text 文本匹配
- 字符串过滤器
字符串全局匹配
- 正则过滤器
re模块匹配
- 列表过滤器
列表内的数据匹配
- bool过滤器
True匹配
- 方法过滤器
用于一些需要的属性以及不需要的属性查找。
属性:
- class_
- id
'''
from bs4 import BeautifulSoup as bs
import lxml
html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a class="sister" >Elsie</a>,<a class="sister" id="link2">Lacie</a> and<a class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""
soup = bs(html_doc, features='lxml')

# --- String filter ---
# name: look up by tag name; first <p> tag.
p_tag = soup.find('p')
print(p_tag)
# Every <p> tag.
p_all_tag = soup.find_all('p')
print(p_all_tag)
# attrs: first node whose class is "sister".
p_class = soup.find(class_='sister')
print(p_class)
# Every node whose class is "sister".
p_all_class = soup.find_all(class_='sister')
print(p_all_class)
# text: look up by text content.
p_text = soup.find(text='$37')
print(p_text)
# Combined: an <a> tag with id "link2" and text "Lacie".
p_all = soup.find('a', id='link2', text='Lacie')
print(p_all)

# --- Regex filter ---
import re
# name: look up by a pattern applied to the tag name.
p_tag = soup.find(re.compile('p'))
print(p_tag)

# --- List filter ---
# name: any tag whose name matches one entry of the list.
p_tags = soup.find_all(['p', 'a', re.compile('html')])
print(p_tags)

# --- Bool filter ---
# A <p> tag that has an id attribute.
p_tag = soup.find('p', id=True)
print(p_tag)

# --- Function filter ---
def have_id_not_class(tag):
    """Filter for <a> tags that have an ``id`` attribute but no ``class``.

    Args:
        tag: Object exposing ``name`` and ``has_attr(key)``, e.g. a
            BeautifulSoup tag.

    Returns:
        ``tag`` itself when it matches, otherwise ``None``.
    """
    # "not" is required here: the filter is meant to reject tags that
    # carry a class attribute.
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag
    return None
# Search the tree using the have_id_not_class filter function.
tag = soup.find(name=have_id_not_class)
print(tag)
4.爬取豌豆荚app数据
'''
url:
https://www.wandoujia.com/category/6001
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
32个页面
'''
import requests
from bs4 import BeautifulSoup as bs
import lxml
import re
def get_page(url, timeout=10):
    """Send an HTTP GET request and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the whole crawl.

    Returns:
        The response object produced by ``requests.get``. The HTTP status
        is not checked here; callers read ``.text`` / ``.json()`` directly.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    return requests.get(url, timeout=timeout)
def parse_detail(data):
    """Parse an app detail page and print its headline fields.

    Args:
        data: HTML text of the detail page.

    Returns:
        None. The app name, approval rating, comment count, editor's
        review and download link are printed.

    Raises:
        AttributeError, KeyError: If one of the expected elements or the
            download link's ``href`` is missing from the page.
    """
    soup = bs(data, 'lxml')
    # attrs must be a dict ({'class': value}); the class name must not
    # carry a trailing space.
    # App name.
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Approval rating.
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments.
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review.
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link.
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    print(
        '''
APP名稱:{}
好評(píng)率:{}
評(píng)論人數(shù):{}
小編點(diǎn)評(píng):{}
下載鏈接:{}
'''
        .format(name, love, commit_num, commit, download)
    )
def parse_text(data):
    """Parse one page of the app list, print each entry and crawl its detail page.

    Args:
        data: HTML fragment containing ``<li class="card">`` items.

    Returns:
        None. For every card the icon address, install count, size and
        detail link are printed, then the detail page is fetched with
        ``get_page`` and passed to ``parse_detail``.

    Raises:
        AttributeError, KeyError: If a card lacks one of the expected
            elements or attributes.
    """
    soup = bs(data, 'lxml')
    # Shape of one card, abbreviated:
    #   <li class="card">
    #     <div class="icon-wrap"><a><img data-original="...png" class="icon lazy"></a></div>
    #     <div class="app-desc">
    #       <div class="meta">
    #         <span class="install-count">...</span>
    #         <span title="33.67MB">33.67MB</span>
    #       </div>
    #     </div>
    #   </li>
    # Raw string, and the same unit-agnostic pattern as the Mongo variant
    # of this function, so sizes given in KB or GB are found too.
    size_pattern = re.compile(r'\w+B')

    # Every app card on the page.
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon address (the real URL is in the lazy-load attribute).
        img = li.find(name='img').attrs['data-original']
        print('圖標(biāo)地址:'+img)
        # Install count.
        count = li.find(name='span', attrs={'class': 'install-count'}).text
        print('下載人數(shù):'+count)
        # Package size.
        size = li.find(name='span', text=size_pattern).text
        print('大小:'+size)
        # Detail page link.
        detail_url = li.find(name='a').attrs['href']
        print('詳情頁(yè)鏈接:'+detail_url)
        # Fetch and parse the detail page.
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)
def main():
    """Walk category pages 1 through 32 and pass each page's content to parse_text."""
    page_url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'
    for page_no in range(1, 33):
        # Decode the JSON reply, then pull out the value under data -> content.
        payload = get_page(page_url.format(page_no)).json()
        parse_text(payload['data']['content'])
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
5.MongoDB
MongoDB是一款强大、灵活、且易于扩展的通用型非关系型数据库。

关系型数据库与非关系型数据库

MongoDB与SQL对比
'''
一、安装运行
1.下载安装
-https://www.mongodb.com/download-center#community
2.安装路径为D:\MongoDB,将D:\MongoDB\bin目录加入环境变量
3.新建目录与文件
-D:\MongoDB\data\db
-D:\MongoDB\log\mongod.log
4.在C盘建立文件夹C:/data/db
-数据存放路径
5.输入mongod启动服务
进入终端(以管理员身份),输入mongod启动MongoDB服务
6.输入mongo进入MongoDB客户端(不要关闭服务端)
打开一个新的cmd,输入mongo进入客户端
二、数据库操作
1.切换库
SQL:
use admin; 有则切换,无则报错
MongoDB:
use tank; 有则切换,无则创建并切换
2.查数据库
SQL:
show databases;
MongoDB:
show dbs; 仅显示有数据的库
3.删除库
SQL:
drop database 库名;
MongoDB:
db.dropDatabase();
三、集合操作 mysql中叫做表
1.创建集合
SQL:
create table f1,f2...
MongoDB:
# 在当前库中通过.创建集合
db.student
2.插入数据
# 插入一条数据
db.student.insert({"name":"hy"})
# 插入多条数据(多个文档需要放在数组中)
db.student.insert([{"name1":"hy1"},{"name2":"hy2"}])
3.查找数据
# 查找student集合中所有数据
db.student.find({})
# 查找一条数据,一条name为hy的数据
db.student.findOne({"name":"hy"})
'''
6.pymongo使用
from pymongo import MongoClient
# 1. Connect to the MongoDB server.
#    First argument: IP address / host of the MongoDB server.
#    Second argument: MongoDB port (27017 here).
client = MongoClient('localhost', 27017)
print(client)
# 2. Get the hy_db database handle.
print(client['hy_db'])
# 3. Get the people collection handle (the same collection used below).
people = client['hy_db']['people']
print(people)
# 4. Insert a single document.
data1 = {'name': 'hy', 'age': '23', 'sex': 'male'}
people.insert_one(data1)
# 5. Insert several documents at once.
data1 = {'name': 'hy1', 'age': '23', 'sex': 'male'}
data2 = {'name': 'hy2', 'age': '22', 'sex': 'male'}
data3 = {'name': 'hy3', 'age': '21', 'sex': 'male'}
people.insert_many([data1, data2, data3])
# insert_one / insert_many are the officially recommended calls; each
# must be given the document(s) to store.
# Insert one.
people.insert_one({'name': 'hy4', 'age': '20', 'sex': 'male'})
# Insert many.
people.insert_many([
    {'name': 'hy5', 'age': '19', 'sex': 'male'},
    {'name': 'hy6', 'age': '18', 'sex': 'male'},
])
# 6. Query data.
# Fetch every document.
data_s = people.find()
# Print them one by one.
for data in data_s:
    print(data)
# Fetch a single document.
data = people.find_one()
print(data)
# Release the connection once the demo is done.
client.close()