抓包之分類數(shù)據(jù)抓取

https://www.hardtofind.com.au/

Paste_Image.png

1.獲取分類的鏈接,寫成數(shù)組的形式,保存在一個(gè)文件中
2.獲取每個(gè)分類的網(wǎng)頁數(shù)據(jù),保存在csv文件。
3.讀取每個(gè)分類的商品鏈接,并獲取該鏈接下的商品。

引入插件。

from selenium import webdriver
# get encode uncode by base64
import base64

# get xpath by BeautifulSoup  == make base sdk
from bs4 import BeautifulSoup

import time
# get get time by sleep  eg:time.sleep(0.1)
from time import sleep

import random
# get re way   eg:match = re.search(r'[\w.-]+@[\w.-]+', Fi_Email)
import re
import csv
import unicodecsv
#import unicodecsv               ========make
from cStringIO import StringIO
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import json

import urllib2

from lxml import etree

圖片地址分析:
大圖:https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_580,w_580/b_rgb:ffffff,h_580,w_580/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends
小圖:
https://res.cloudinary.com/hardtofind/image/upload/c_pad,h_235,w_235/b_rgb:ffffff,h_235,w_235/cs_srgb,f_auto,fl_lossy/v1/product_image/santa_friends

實(shí)現(xiàn)

#_*_coding:utf8_*_
#!/usr/local/bin/python    MAC
# Filename: hard.py
#intend: image,title,price,class


# --- standard library ---
import codecs
import csv
import os
import time
import urllib2

# --- third-party ---
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import unicodecsv
# Configure a PhantomJS headless browser with a spoofed desktop user agent.
# NOTE(review): `driver` is never used by the scraping loop below (all page
# fetches go through urllib2) — presumably left over from an earlier version
# or used by code outside this listing; confirm before removing, since
# constructing it spawns a phantomjs process as a side effect.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
    "(KHTML, like Gecko) Chrome/15.0.87"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)

# step 1 get all class
hrefarray = ['gifts','home-garden','prints-art','fashion','jewellery','men','kids','weddings']
for href in hrefarray:
    page = -1;
    while (page < 200):
        page = page + 1
        strPageIndex = str(page)
# step 2 get all href
        href_a = "https://www.hardtofind.com.au/categories/"+href+"?page="+strPageIndex
        print "======"+strPageIndex+"==="+href
        csvPageindexfile = file('hrefhardtofind.csv', 'a+')
        writer = unicodecsv.writer(csvPageindexfile,encoding='utf-8')
        #writer.writerow(['name', 'class', 'type','Address','Po_Address','Phone','Fax','Ma_Office','email'])
        hrefdb = (strPageIndex,href_a)
        hrefdata = [
         hrefdb
           ]
        writer.writerows(hrefdata)
        csvPageindexfile.close()
        content = urllib2.urlopen(href_a).read()
        s = BeautifulSoup(content,"html5lib")
        nav_Data = s.find("div", {"id": "products"})
        roothref_data = nav_Data.findAll("div", {"class": "sale-item-text-wrap"})
        index = 0
        maxindex = len(roothref_data) - 1

        for webhref in roothref_data:
            if (index >= maxindex):
                break
            index = index + 1
            lablehref = webhref.find("a",href=True)
            page_href =  lablehref["href"]
            threecontent = urllib2.urlopen(page_href).read()
            s = BeautifulSoup(threecontent,"html5lib")
            page_form = s.find("form",{"id":"form_add_to_cart"})
            page_title = page_form.find("div",{"class":"title"})
            title = page_title.h1.string
            print title
            price = page_title.find("span",{"itemprop":"price"}).string
            print price
            page_image =s.find("div",{"class":"galleryContainer"})
            page_image_a = page_image.findAll("ul",{"id":"imageGallery"})
            # image = page_image.findAll("a")
            print page_image_a[0].findAll("a")
            # print image
            page_class = s.find("div",{"class":"pagination"})
            classname = page_class.get_text().strip().encode('utf-8')
            classname = classname.replace(' ', '').replace('\n','>')
            print classname
# step 3 get all content
            csvfile = file('hardtofind_com_au.csv', 'a+')
            writer = unicodecsv.writer(csvfile,encoding='utf-8')
            csvfile.write(codecs.BOM_UTF8)
            writer = csv.writer(csvfile)
# step 4  seve to csv
            db = (title,price,classname, page_image_a)
            data = [
            db
            ]
            writer.writerows(data)
            csvfile.close()
            # break



    page += 1
    time.sleep(3)



最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容