from bs4 import BeautifulSoup
import re
# Scrape the product cards from a locally saved HTML page and print one dict
# per product with its title, image URL, price, star rating and review count.
html_file = '/Users/XXX/muggle/Plan-for-combating/week1/1_2/1_2answer_of_homework/index.html'

# Open the file with a ``with`` statement so it is closed automatically.
# 'r' opens it for reading; the explicit encoding avoids garbled text on
# platforms whose default encoding is not UTF-8.
with open(html_file, 'r', encoding='utf-8') as web_data:
    content = web_data.read()

soup = BeautifulSoup(content, 'lxml')

# Each selector returns one element per product card, in document order.
titles = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a")
images = soup.select("body > div > div > div.col-md-9 > div > div > div > img")
reviews = soup.select("body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right")
prices = soup.select("body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right")
stars = soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# Require at least one digit. The previous pattern r'\d*' could match the
# empty string at position 0 (e.g. when the text starts with whitespace),
# which made int('') raise ValueError.
review_count_re = re.compile(r'\d+')

# NOTE: zip() stops at the shortest list, so a card missing one of the
# elements would shift or drop entries; the page is assumed to be regular.
for title, image, price, star, review in zip(titles, images, prices, stars, reviews):
    review_match = review_count_re.search(review.get_text())
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'price': price.get_text(),
        # The rating is the number of filled-star <span> elements; the
        # class string is matched against the full class attribute value.
        'star': len(star.find_all('span', class_="glyphicon glyphicon-star")),
        # First run of digits in the review text; None when there is none.
        'review': int(review_match.group()) if review_match else None,
    }
    print(data)