# chezhiwang_spider

#! /usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2020/7/5 15:10
# @File : chezhiwangspider
# @Software: PyCharm


#http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml
#10510



from fake_useragent import UserAgent
ua = UserAgent()

import pymysql
import random
import time

# Connection settings for the local MySQL instance that stores the scrape.
data = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '*******',
    'charset': 'utf8',
    'db': 'chezhiwang',
}

# One connection and cursor opened at import time; reused by the crawl loop.
conn = pymysql.connect(**data)
cur = conn.cursor()

# Parameterized insert for a single complaint record.
sql = (
    "insert into chezhiwang.complaint(complaint_id, car_brand, car_series, "
    "car_model, description, topical_prob, cp_tm, cp_status, crawler_tm) "
    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)

import requests
from bs4 import BeautifulSoup as bs
import re
# from w3lib import *
from datetime import datetime

# Sample listing page (page 7 of the unfiltered complaint list); the crawl
# loop below rebuilds this URL per page number.
url = 'http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-7.shtml'

# Browser-like request headers.  The User-Agent is drawn once per process
# from fake_useragent's pool.
headers = dict([
    ('Host', 'www.12365auto.com'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Language', 'zh-CN,zh;q=0.9'),
    ('Connection', 'keep-alive'),
    ('User-Agent', ua.random),
])

def get_html(url, timeout=10):
    """Download *url* and return the parsed HTML document.

    Args:
        url: Page address to fetch.
        timeout: Seconds to wait for the server (new, backward-compatible
            parameter).  The original call had no timeout, so a single stalled
            response could hang the whole 10k-page crawl indefinitely.

    Returns:
        A BeautifulSoup tree of the response body (lxml parser).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # NOTE(review): the site is Chinese; when the server omits a charset,
    # requests assumes ISO-8859-1 and `.text` comes out as mojibake.  Fall
    # back to the sniffed encoding in that case — confirm against live pages.
    if response.encoding and response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    return bs(response.text, 'lxml')




def _flush(rows):
    """Insert buffered complaint rows, committing once per batch.

    A failure on one row (duplicate key, malformed data) is printed and
    skipped so a single bad record cannot abort the crawl.
    """
    for row in rows:
        try:
            cur.execute(sql, tuple(row))
        except Exception as e:
            print(e)
    conn.commit()


if __name__ == '__main__':

    # Rows buffered since the last database flush.  Keeping this OUTSIDE the
    # page loop fixes the original bug: `alist` was reset on every page, so
    # only every 10th page (the flush page itself) was ever written and the
    # other nine pages of each batch were silently discarded.
    pending_rows = []

    # Page numbers 1..10509 per the site's URL scheme (see sample URL above).
    for page in range(1, 10510):
        url = "http://www.12365auto.com/zlts/0-0-0-0-0-0_0-0-0-0-0-0-0-{}.shtml".format(str(page))

        soup = get_html(url)
        # The first <tr> is the table header; the rest are complaint records.
        for node in soup.select('div.tslb_b table  tr')[1:]:
            cells = node.select('td')
            complaint_id = cells[0].text        # complaint number
            car_brand = cells[1].text           # brand
            car_series = cells[2].text          # series
            car_model = cells[3].text           # model
            description = cells[4].text         # problem summary
            topical_problem = cells[5].text     # typical problem tags
            complain_tm = cells[6].text         # complaint time
            complain_status = cells[7].text     # complaint status
            crawler_tm = str(datetime.now())    # scrape timestamp

            pending_rows.append([complaint_id, car_brand, car_series,
                                 car_model, description, topical_problem,
                                 complain_tm, complain_status, crawler_tm])

        # Flush every ten pages, then pause briefly to be polite to the site.
        if page % 10 == 0:
            _flush(pending_rows)
            pending_rows = []
            time.sleep(random.randint(1, 3))
        print(page, datetime.now(), url)

    # Flush the final partial batch — the original version dropped it because
    # 10509 is not a multiple of 10 and nothing was written after the loop.
    _flush(pending_rows)

    cur.close()
    conn.close()
# --- Page boilerplate copied from the original blog post (not code) ---
# 最后編輯于
# ?著作權歸作者所有,轉載或內容合作請聯系作者
# 【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
# 平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。
# 友情鏈接更多精彩內容