使用python的selenium庫還有多線程抓取CET4成績

沒有requests快,但好寫

# -*- coding: utf-8 -*-
#使用selenium的webdriver的方法
import csv
import os
import time
import re
import threading
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
## Mutex shared by the worker threads: write_in holds it while touching the
## name/password stacks and while writing rows through the csv writers.
lock=threading.Lock()
############ Count the rows of a CSV file (not called anywhere in this script yet) ############
def raw_num(filepath):
    """Return the number of rows in the CSV file at *filepath*.

    Args:
        filepath: Path of the CSV file to count. Undecodable bytes are
            ignored while reading.

    Returns:
        The number of rows produced by ``csv.reader`` (0 for an empty file).
    """
    # The context manager closes the file even if the reader raises mid-way.
    with open(filepath,"r",errors="ignore") as cs:
        return sum(1 for _ in csv.reader(cs))
############ Simulated login / score lookup ############
def post_get(html,driver,name,password):
    """Submit one candidate's details to the query page and return the score text.

    Args:
        html: URL of the query page; loaded with ``driver.get``.
        driver: Selenium WebDriver used to drive the page.
        name: Candidate name, typed into the element whose id is ``name``.
        password: Admission-ticket number, typed into the element whose id
            is ``id``.

    Returns:
        The text of the second element with class ``liright`` found after
        the form is submitted, or ``""`` when fewer than two are present.
    """
    driver.get(html)
    # find_element(By.X, ...) replaces the find_element_by_* helpers, which
    # were removed in Selenium 4; this form also works on older releases.
    ticket_box=driver.find_element(By.ID,"id")
    ticket_box.clear()
    ticket_box.send_keys(password)
    name_box=driver.find_element(By.ID,"name")
    name_box.clear()
    name_box.send_keys(name)
    driver.find_element(By.ID,"btn").click()
    # find_elements returns a list, so a missing result yields [] rather
    # than raising.
    cells=driver.find_elements(By.CLASS_NAME,"liright")
    return cells[1].text if len(cells)>1 else ""
#################### Load candidate data from the CSV file ####################
def read_out():
    """Load candidates from ``2017.csv`` onto the module-level stacks.

    For every row, column index 6 is appended to the global ``name`` list
    and column index 5 to the global ``password`` list. Rows with fewer
    than seven columns (for example blank lines) are skipped, because
    indexing them would raise ``IndexError`` and abort the whole load.
    """
    global name
    global password
    # The context manager closes the file even if parsing fails part-way.
    with open("2017.csv","r",errors="ignore") as cs:
        for line in csv.reader(cs):
            if len(line)<7:
                continue
            name.append(line[6])
            password.append(line[5])
######## Write results to the CSV files (the page scraping happens inside this loop) ########
def write_in(driver,i):
    """Worker loop: pop candidates off the shared stacks, look each one up, record the result.

    Reads the module-level ``name`` and ``password`` stacks, the ``lock``
    that guards them, and the ``writer`` (results) and ``error`` (failed
    lookups) csv writers.

    Each candidate is looked up with ``post_get`` up to six times. On
    success the row ``[name, ticket, grade]`` goes to ``writer``; if every
    attempt raises, ``[name, ticket]`` goes to ``error`` instead. If
    writing the result row itself fails, the candidate is pushed back onto
    the stacks so it is processed again.

    Args:
        driver: Selenium WebDriver passed to ``post_get``; it is quit when
            the loop ends, including when an unexpected exception escapes.
        i: Worker index, used only in the progress messages.
    """
    # Lookups attempted for one candidate before it is written to error.csv.
    max_attempts=6
    html="http://cet.99sushe.com"
    try:
        while True:
            # Test and pop under a single lock hold: checking the length
            # outside the lock lets two workers race for the last entry,
            # and a pop that raised while the lock was held left it locked.
            with lock:
                if not password:
                    break
                n=name.pop()
                p=password.pop()
            for _ in range(max_attempts):
                try:
                    grade=post_get(html,driver,n,p)
                except Exception as e:
                    print(e)
                else:
                    break
            else:
                # Every attempt raised: report the candidate once and move
                # on, without re-queueing it or writing a stale grade.
                print("\n/******write in error.csv")
                print("第%d個線程:"%i)
                print("準(zhǔn)考證號: "+p+"  姓名: "+n+"******/\n")
                with lock:
                    error.writerow([n,p])
                continue
            print("第%d個線程:"%i)
            print("準(zhǔn)考證號: "+p+"  姓名: "+n)
            print(grade)
            try:
                with lock:
                    writer.writerow([n,p,grade])
            except Exception as e:
                print(">>>>>>>error")
                print(e)
                # Best effort: put the candidate back so the row is retried.
                with lock:
                    password.append(p)
                    name.append(n)
    finally:
        # Always shut the browser down, even if the loop dies unexpectedly.
        driver.quit()
########### Main script ####################
# Wall-clock reference for the elapsed-time report printed at the end.
start_time=time.perf_counter()
## Shared stacks
threads=[]
password=[]
name=[]
## Load the input data onto the stacks
read_out()
##############
grade=0
### Number of worker threads
thread_num=3
### CSV file handles and writers ####
html="http://cet.99sushe.com"
# Without newline="" the csv writer leaves an extra blank line after each row.
error_file=open("error.csv","w",newline="",errors="ignore")
csv_file=open("result.csv","w",newline="",errors="ignore")
try:
    writer=csv.writer(csv_file)
    error=csv.writer(error_file)
    # NOTE(review): writerow() receives a str here, so every character of
    # the text becomes its own cell in the first row - confirm this is the
    # intended header/placeholder row.
    writer.writerow("讓我們先空一行!")
    error.writerow("讓我們先空一行!")
    ########### Worker threads ###############
    for i in range(thread_num):
        print("第%d個線程入棧:"%i)
        driver=webdriver.Firefox()
        driver.implicitly_wait(10)
        # args must be a tuple, one entry per positional argument of write_in.
        t=threading.Thread(target=write_in,args=(driver,i))
        threads.append(t)
    # Start all workers, then wait for every one of them to finish.
    for i,t in enumerate(threads):
        print("start第%d個線程"%i)
        t.start()
    for t in threads:
        t.join()
    #####################################
    print("一共花了:")
    # time.clock() was removed in Python 3.8; report elapsed wall-clock
    # seconds measured with perf_counter instead.
    print(time.perf_counter()-start_time)
finally:
    # Close the output files even if thread setup or a join fails.
    csv_file.close()
    error_file.close()
           
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容