代碼如下:
在 Windows 下測試可用,但執行時偶爾會出現卡死的問題,應該是 Windows 平台遺留的問題。
from threading import Thread
from multiprocessing import Pool
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
import json
from selenium import webdriver
from urllib.parse import urljoin, quote
# Exclusive upper bound of the page numbers that main() requests.
PAGES = 34
# Search term; main() URL-quotes it and substitutes it into BASEURL.
KEYWORD = 'yourkeyword'
# Search URL template with two positional placeholders: the quoted keyword,
# then the page number. The rest of the query string is fixed, including the
# URL-encoded `ev=exbrand_...` value (it contains the brand name "Penfolds")
# and `s=918`.
BASEURL = 'https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&bs=1&suggest=2.def.0.V19--12s0&ev=exbrand_%E5%A5%94%E5%AF%8C%EF%BC%88Penfolds%EF%BC%89%5E&stock=1&page={}&s=918&click=0'
def _tag_text(tag):
    """Return the text content of *tag*, or '' when *tag* is None."""
    return tag.get_text() if tag is not None else ''


def _tag_attr(tag, name):
    """Return attribute *name* of *tag*, or '' when the tag or attribute is missing."""
    value = tag.get(name) if tag is not None else None
    return value if value is not None else ''


def _listing_row(item):
    """Return (price, title, link, sales text, shop name) for one listing tag.

    Every lookup tolerates a missing element: the affected field becomes ''
    instead of raising AttributeError/TypeError.
    """
    # All fields except the price are looked up below the listing's first
    # <div>; fall back to the listing itself when it has no <div> at all.
    box = item.div if item.div is not None else item

    name_div = box.find('div', 'p-name')
    name_link = name_div.a if name_div is not None else None

    shop_div = box.find('div', 'p-shop')
    shop_span = shop_div.span if shop_div is not None else None
    shop_link = shop_span.a if shop_span is not None else None

    link = _tag_attr(name_link, 'href')
    if link.startswith('//'):
        # Protocol-relative link: give it a scheme. Other forms are left
        # untouched rather than being turned into an invalid 'http:...' URL.
        link = 'http:' + link

    return (
        _tag_text(item.find('div', 'p-price')),
        _tag_attr(name_link, 'title'),
        link,
        _tag_text(box.find('div', 'p-commit')),
        _tag_attr(shop_link, 'title'),
    )


def parse(source):
    """Extract every product listing from a result page and append it to ``filename.csv``.

    The HTML is parsed with BeautifulSoup using the ``lxml`` parser. Each
    ``<li class="gl-item">`` produces one row (price, title, link, sales text,
    shop name). Rows are printed as they are extracted and then appended to
    ``filename.csv`` in a single write, without a header row, encoded as
    ``utf_8_sig``.

    A listing that lacks one of the expected elements gets '' for that field,
    so one unusual listing does not discard the rest of the page.

    Args:
        source: HTML text of the search-result page.
    """
    soup = BeautifulSoup(source, 'lxml')
    items = soup.find_all('li', 'gl-item')
    # NOTE(review): the "(shù)" / "(jù)" fragments in this message look like
    # transliteration residue; the runtime text is left unchanged here.
    print('總共{}個數(shù)據(jù)'.format(len(items)))

    rows = []
    for item in items:
        row = _listing_row(item)
        print(*row)
        rows.append(row)

    if not rows:
        # Nothing matched: do not touch the output file.
        return

    # One append per page instead of one DataFrame and one file open per row.
    frame = pd.DataFrame(rows, columns=['價格', '標題', '鏈接', '銷量', '店鋪名'])
    frame.to_csv('filename.csv', mode='a', index=False, header=False, encoding='utf_8_sig')
def run(url):
    """Open *url* in a fresh Chrome session, scroll down the page, and parse it.

    Any exception raised while loading, scrolling or parsing is printed and
    swallowed so that one failing page does not take the worker down. The
    browser session is always shut down afterwards.

    Args:
        url: Address of the search-result page to load.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Scroll down step by step, pausing after each step (presumably so
        # that content loaded on scroll has time to render).
        for _ in range(10):
            driver.execute_script('window.scrollBy(0,1500)')
            time.sleep(1)
        # Parsing is inside the try block so that its failures are reported
        # here instead of disappearing inside the pool worker.
        parse(driver.page_source)
    except Exception as e:
        print("err: ", e)
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process); close() would only close the current window.
        driver.quit()
def main():
    """Scrape the configured result pages with a pool of two worker processes.

    Builds one URL per page number in ``range(21, PAGES, 2)`` from BASEURL and
    the URL-quoted KEYWORD, schedules ``run`` for each URL, waits for all
    workers to finish, and reports any exception that escaped a worker.
    """
    urls = []
    for page in range(21, PAGES, 2):
        url = BASEURL.format(quote(KEYWORD), page)
        print('url: ', url)
        urls.append(url)

    pool = Pool(2)
    try:
        # Keep the AsyncResult handles: they are the only place an exception
        # raised in a worker is stored.
        pending = [(url, pool.apply_async(run, args=(url, ))) for url in urls]
        pool.close()
        pool.join()
    finally:
        # Harmless after a clean close()/join(); on an interruption it stops
        # the workers instead of leaving them running.
        pool.terminate()

    for url, result in pending:
        try:
            result.get()
        except Exception as e:
            print("err: ", url, e)

    print('task over')
# The guard keeps main() from running when this module is imported, which
# multiprocessing does in child processes on platforms that spawn workers
# (e.g. Windows).
if __name__ == '__main__':
    main()