抓包之分類數(shù)據(jù)抓取

https://www.hardtofind.com.au/

Paste_Image.png

1.獲取分類的鏈接,寫成數(shù)組的形式,保存在一個(gè)文件中
2.獲取每個(gè)分類的網(wǎng)頁數(shù)據(jù),保存在csv文件。
3.讀取每個(gè)分類的商品鏈接,并獲取該鏈接下的商品。

引入插件。

from selenium import webdriver
# get encode uncode by base64
import base64

# get xpath by BeautifulSoup  == make base sdk
from bs4 import BeautifulSoup

import time
# get get time by sleep  eg:time.sleep(0.1)
from time import sleep

import random
# get re way   eg:match = re.search(r'[\w.-]+@[\w.-]+', Fi_Email)
import re
import csv
import unicodecsv
#import unicodecsv               ========make
from cStringIO import StringIO
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import json

import urllib2

from lxml import etree

圖片地址分析:
大圖:https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_580,w_580/b_rgb:ffffff,h_580,w_580/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
小圖:
https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends

實(shí)現(xiàn)

#_*_coding:utf8_*_
#!/usr/local/bin/python    MAC
# Filename: hard.py
#intend: image,title,price,class


# --- standard library ---
import codecs
import csv
import os
import time
import urllib2

# --- third-party ---
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import unicodecsv
# Configure a PhantomJS headless browser with a spoofed desktop user agent.
# NOTE(review): `driver` is never used by the scraping loop below (all page
# fetches go through urllib2) — presumably left over from an earlier version
# or used by code outside this listing; confirm before removing, since
# constructing it spawns a phantomjs process as a side effect.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
    "(KHTML, like Gecko) Chrome/15.0.87"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)

# step 1 get all class
hrefarray = ['gifts','home-garden','prints-art','fashion','jewellery','men','kids','weddings']
for href in hrefarray:
    page = -1;
    while (page < 200):
        page = page + 1
        strPageIndex = str(page)
# step 2 get all href
        href_a = "https://www.hardtofind.com.au/categories/"+href+"?page="+strPageIndex
        print "======"+strPageIndex+"==="+href
        csvPageindexfile = file('hrefhardtofind.csv', 'a+')
        writer = unicodecsv.writer(csvPageindexfile,encoding='utf-8')
        #writer.writerow(['name', 'class', 'type','Address','Po_Address','Phone','Fax','Ma_Office','email'])
        hrefdb = (strPageIndex,href_a)
        hrefdata = [
         hrefdb
           ]
        writer.writerows(hrefdata)
        csvPageindexfile.close()
        content = urllib2.urlopen(href_a).read()
        s = BeautifulSoup(content,"html5lib")
        nav_Data = s.find("div", {"id": "products"})
        roothref_data = nav_Data.findAll("div", {"class": "sale-item-text-wrap"})
        index = 0
        maxindex = len(roothref_data) - 1

        for webhref in roothref_data:
            if (index >= maxindex):
                break
            index = index + 1
            lablehref = webhref.find("a",href=True)
            page_href =  lablehref["href"]
            threecontent = urllib2.urlopen(page_href).read()
            s = BeautifulSoup(threecontent,"html5lib")
            page_form = s.find("form",{"id":"form_add_to_cart"})
            page_title = page_form.find("div",{"class":"title"})
            title = page_title.h1.string
            print title
            price = page_title.find("span",{"itemprop":"price"}).string
            print price
            page_image =s.find("div",{"class":"galleryContainer"})
            page_image_a = page_image.findAll("ul",{"id":"imageGallery"})
            # image = page_image.findAll("a")
            print page_image_a[0].findAll("a")
            # print image
            page_class = s.find("div",{"class":"pagination"})
            classname = page_class.get_text().strip().encode('utf-8')
            classname = classname.replace(' ', '').replace('\n','>')
            print classname
# step 3 get all content
            csvfile = file('hardtofind_com_au.csv', 'a+')
            writer = unicodecsv.writer(csvfile,encoding='utf-8')
            csvfile.write(codecs.BOM_UTF8)
            writer = csv.writer(csvfile)
# step 4  seve to csv
            db = (title,price,classname, page_image_a)
            data = [
            db
            ]
            writer.writerows(data)
            csvfile.close()
            # break



    page += 1
    time.sleep(3)



最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容