爬取思路
1.分析頁面,定義爬取字段
2.觀察網頁,分析接口url,通過xpath和json解析爬取內容字段
3.在pipelines.py寫入存儲方式
4.開始爬取
5.GitHub地址:https://github.com/HAOyanWEI24/Crawler/tree/master/jingdongspider 歡迎fork和star
1.分析網(wǎng)頁,定義字段
通過觀察頁面,我將字段分為了兩塊:一塊為商品詳情,包括價格名稱,評論數量等等內容,另一塊主要從商品得到評論,會員的相關信息,定義如下:
1.商品詳情:
- link = scrapy.Field()
商品鏈接- project_id = scrapy.Field()
商品ID- name = scrapy.Field()
商品名字- comment_num = scrapy.Field()
評論人數- shop_name = scrapy.Field()
店家名字- price = scrapy.Field()
價錢- GoodCountStr = scrapy.Field()
好評- AfterCount = scrapy.Field()
中評- PoorCount = scrapy.Field()
差評
2.評論詳情:
- user_name = scrapy.Field()
評論用戶的名字- user_id = scrapy.Field()
評論用戶的ID- userProvince = scrapy.Field()
評論用戶來自的地區- content = scrapy.Field()
評論內(nèi)容- good_id = scrapy.Field()
評論的商品ID- good_name = scrapy.Field()
評論的商品名字- date = scrapy.Field()
評論時間- replyCount = scrapy.Field()
回復數- score = scrapy.Field()
評分- status = scrapy.Field()
狀態(tài)- userLevelId = scrapy.Field()
用戶等級- productColor = scrapy.Field()
商品顏色- productSize = scrapy.Field()
商品大小- userLevelName = scrapy.Field()
銀牌會員,鉆石會員等- userClientShow = scrapy.Field()
來自什么 比如來自京東客戶端- isMobile = scrapy.Field()
是否來自手機- days = scrapy.Field()
天數
接口思路解析:
京東網頁中的很多數據是寫在js中的,需要在network中查找接口路由,從而獲得其真正所在的url地址,通過不同的id與接口組合得到不同的解析內容,分析如下:
分析接口:

商品價格接口分析.png
京東價格js接口url: https://p.3.cn/prices/mgets?callback=jQuery8876824&skuIds=J_4471753

Image 5.png
京東評論數(shù)量js接口url: https://club.jd.com/comment/productCommentSummaries.action?referenceIds=4471753

Image 6.png
京東評論js接口url: https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv2394&productId=6023682&score=0&sortType=5&page=2&pageSize=10&isShadowSku=0&fold=1
思路闡述,由商品list頁面轉入商品詳情頁面,解析商品的詳情屬性,邏輯很簡單,話不多說,直接上代碼
"""京東商品詳情頁代碼"""
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import JingdongspiderItem
import scrapy
import re
import json
from scrapy import Request
class JingdongSpider(scrapy.Spider):
    """Crawl JD.com category list pages and scrape per-product details.

    Flow: start page -> category list (parseMainPage) -> product detail
    page (parseDetails) -> comment-summary JSON API (parse_getCommentnum),
    where the completed item is finally yielded.
    """
    name = 'jingdong'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """Entry point: jump straight to a fixed category list URL."""
        url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        """Parse a list page: schedule every product detail page, then follow pagination."""
        anchors = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for anchor in anchors:
            # Create one item per product link. The original built a single
            # item per anchor (and indexed url[0] unguarded), which would
            # share mutable state across requests and crash on empty hrefs.
            for href in anchor.xpath('@href').extract():
                item = JingdongspiderItem()
                detail_url = response.urljoin(href)
                item['link'] = detail_url  # product page URL
                yield Request(detail_url, meta={'meta': item}, callback=self.parseDetails)
        # Recursively follow the "next page" link; dont_filter allows
        # revisiting pagination URLs the dupe filter might otherwise drop.
        for next_link in response.xpath('//a[@class="pn-next"]'):
            hrefs = next_link.xpath('@href').extract()
            if hrefs:
                next_url = response.urljoin(hrefs[0])
                yield Request(next_url, callback=self.parseMainPage, dont_filter=True)

    def parseDetails(self, response):
        """Extract id/shop/name/price from the detail page, then query the comment API."""
        item = response.meta['meta']
        # SKU id; `sku_id` avoids shadowing the builtin `id`.
        sku_id = response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0]
        item['project_id'] = sku_id
        item['shop_name'] = response.xpath('//div[@class="name"]/a/text()').extract()[0]  # shop name
        item['name'] = response.xpath('//div[@class="sku-name"]/text()').extract()[0].strip()  # product name
        # Price is served by a separate JSONP endpoint keyed by SKU id.
        # NOTE(review): requests.get() is a blocking call inside Scrapy's event
        # loop; kept for behavioral parity, but a scrapy.Request is preferable.
        price_url = "https://p.3.cn/prices/mgets?callback=jQuery8876824&skuIds=" + str(sku_id)
        price_body = requests.get(price_url).text
        money = re.findall(r'\"p\"\:\"(.*?)\"}]\)', price_body)
        # Guard against an empty/unexpected JSONP response instead of IndexError.
        item['price'] = money[0] if money else ''
        # Comment counts come from another JSON endpoint; the item is
        # completed and yielded in parse_getCommentnum.
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(sku_id)
        yield scrapy.Request(comment_num, meta={'item': item}, callback=self.parse_getCommentnum)

    def parse_getCommentnum(self, response):
        """Parse the comment-summary JSON and yield the completed item."""
        item = response.meta['item']
        data = json.loads(response.text)  # response.text is a JSON document
        summary = data['CommentsCount'][0]
        item['comment_num'] = summary['CommentCountStr']  # total comment count
        # NOTE: the original comments had GoodCountStr/AfterCount swapped
        # relative to the item definitions; the key mapping itself is kept.
        item['GoodCountStr'] = summary['GoodCountStr']  # positive reviews
        item['AfterCount'] = summary['AfterCount']      # follow-up ("after use") reviews
        item['PoorCount'] = summary['PoorCount']        # negative reviews
        yield item
"""京東評論詳情頁代碼"""
# -*- coding: utf-8 -*-
import requests
from jingdongspider.items import commentItem
import json
import xlrd
import scrapy
from scrapy import Request
class JingdongCommentSpider(scrapy.Spider):
    """Crawl JD.com product pages and scrape every comment via the paged JSON API."""
    name = 'comment'
    allowed_domains = ['jd.com']
    start_urls = ['https://www.jd.com']

    def parse(self, response):
        """Entry point: jump straight to a fixed category list URL."""
        url = "https://list.jd.com/list.html?cat=670,671,672&page=1&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main"
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        """Schedule every product detail page found on the list page."""
        anchors = response.xpath('//li[@class="gl-item"]/div/div[@class="p-img"]/a')
        for anchor in anchors:
            for href in anchor.xpath('@href').extract():
                yield Request(response.urljoin(href), callback=self.parseDetails)

    def parseDetails(self, response):
        """Read the SKU id, fetch the total comment count, then request each comment page.

        The comment API serves 10 comments per page, so the number of pages
        is ceil(total / 10).
        """
        # `sku_id` avoids shadowing the builtin `id`.
        sku_id = response.xpath('//a[@class="compare J-compare J_contrast"]/@data-sku').extract()[0]
        # Total comment count from the summary endpoint.
        # NOTE(review): requests.get() blocks Scrapy's event loop; kept for parity.
        comment_num = "https://club.jd.com/comment/productCommentSummaries.action?referenceIds=" + str(sku_id)
        summary = json.loads(requests.get(comment_num).text)
        comment_total = int(summary['CommentsCount'][0]['ShowCount'])
        # Equivalent to the original if/else on comment_total % 10.
        pages = (comment_total + 9) // 10
        for k in range(pages):
            # Paged comment-list endpoint for this SKU.
            com_url = ('https://sclub.jd.com/comment/productPageComments.action?productId='
                       + str(sku_id) + '&score=0&sortType=5&page=' + str(k) + '&pageSize=10')
            yield scrapy.Request(com_url, callback=self.parse_getCommentnum)

    def parse_getCommentnum(self, response):
        """Parse one page of the comment JSON into a list of commentItem objects."""
        data = json.loads(response.text)
        items = []
        for comment in data['comments']:  # all comments on this page
            item1 = commentItem()
            item1['user_name'] = comment['nickname']            # commenter nickname
            item1['user_id'] = comment['id']                    # comment/user id
            item1['userProvince'] = comment['userProvince']     # commenter's region
            item1['content'] = comment['content']               # comment text
            item1['good_id'] = comment['referenceId']           # reviewed product id
            item1['good_name'] = comment['referenceName']       # reviewed product name
            item1['date'] = comment['referenceTime']            # comment time
            item1['replyCount'] = comment['replyCount']         # number of replies
            item1['score'] = comment['score']                   # rating score
            item1['status'] = comment['status']                 # status flag
            item1['userLevelId'] = comment['userLevelId']       # user level id
            item1['productColor'] = comment['productColor']     # product colour
            item1['productSize'] = comment['productSize']       # product size
            item1['userLevelName'] = comment['userLevelName']   # membership tier (silver, diamond, ...)
            item1['isMobile'] = comment['isMobile']             # whether posted from mobile
            # Fixed comment: this is the client label (e.g. "from JD app"),
            # not a second is-mobile flag as the original comment claimed.
            item1['userClientShow'] = comment['userClientShow']
            item1['days'] = comment['days']                     # days since purchase
            items.append(item1)
        return items
存入數據庫
"""
pipelines.py存儲方法
"""
import MySQLdb.cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings
SETTINGS = get_project_settings()
class MySQLPipeline(object):
    """Persist scraped product items into the `jingdong` MySQL table.

    Uses twisted's adbapi connection pool so inserts run asynchronously
    and do not block the crawl.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor: build the adbapi pool from scrapy settings.

        A classmethod receives the class itself (`cls`) rather than an
        instance, so scrapy can invoke it before any pipeline object exists
        (like a static factory in other languages).
        """
        connection_kwargs = dict(
            host=settings['MYSQL_HOST'],  # values are read from settings.py
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # required to avoid mojibake with Chinese text
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=False,
        )
        # ** expands the dict into keyword arguments (host=..., db=..., ...).
        pool = adbapi.ConnectionPool('MySQLdb', **connection_kwargs)
        return pool and cls(pool)

    # Called by scrapy for every item passing through the pipeline.
    def process_item(self, item, spider):
        """Schedule the insert on the pool and attach the error handler."""
        deferred = self.dbpool.runInteraction(self._conditional_insert, item)
        deferred.addErrback(self._handle_error, item, spider)
        return item

    # Runs inside a pool transaction; `tx` is the cursor.
    def _conditional_insert(self, tx, item):
        """Execute the parameterized INSERT for one item."""
        sql = "insert into jingdong(project_id,name,comment_num,shop_name,link,GoodCountStr,AfterCount,PoorCount,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        params = (
            item["project_id"], item["name"], item["comment_num"], item["shop_name"], item["link"], item["GoodCountStr"],
            item["AfterCount"], item["PoorCount"], item["price"])
        tx.execute(sql, params)

    # Errback: log database failures without crashing the crawl.
    def _handle_error(self, failure, item, spider):
        """Print the twisted Failure produced by a failed insert."""
        print('--------------database operation exception!!-----------------')
        print(failure)
settings.py數據庫配置
# MySQL database connection settings (read by MySQLPipeline.from_settings)
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'jingdong' # database name; change to match your setup
MYSQL_USER = 'user' # database account; change to match your setup
MYSQL_PASSWD = 'pwd' # database password; change to match your setup
MYSQL_PORT = 3306 # database port, used in dbhelper

商品詳情.png

評論詳情 2.png