初學爬蟲入坑爬煎蛋的教程,無奈煎蛋有反爬蟲機制獲取不到真實圖片地址,研究了兩天,自己寫了一個,代碼很簡單,便於理解。
import os
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# Directory the downloaded images are written to (created on demand).
FOLDER_PATH = 'C:\\Users\\Administrator\\Desktop\\JD\\'
# Pages are visited from FIRST_PAGE down to page 1.
FIRST_PAGE = 48
# Seconds to wait for a single image request before giving up on it.
REQUEST_TIMEOUT = 30


def image_filename(url, index):
    """Return the file name under which the image at *url* is saved.

    The name is the last component of the URL's path, so query strings and
    fragments never leak into it and it cannot contain a path separator.
    When the path has no final component (e.g. it ends with ``/``), a
    synthetic ``image_<index>`` name is returned instead.
    """
    name = os.path.basename(urlparse(url).path)
    return name if name else 'image_{}'.format(index)


def collect_image_links(page_html, page_url):
    """Return absolute URLs for every ``<img src=...>`` found in *page_html*.

    Tags without a ``src`` attribute are skipped. Each ``src`` is resolved
    against *page_url*, so relative and protocol-relative (``//host/path``)
    values become absolute URLs that ``requests`` can fetch.
    """
    soup = BeautifulSoup(page_html, 'lxml')
    links = []
    for pic_tag in soup.find_all('img'):
        src = pic_tag.get('src')
        if src:
            links.append(urljoin(page_url, src))
    return links


def download_image(url, target_path):
    """Fetch *url* and write the response body to *target_path*.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
        OSError: if the target file cannot be written.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail before opening the file so an HTTP error page is never saved as
    # an image.
    response.raise_for_status()
    with open(target_path, 'wb') as f:
        f.write(response.content)


def main():
    """Download every image from the listing pages and print a summary.

    Each page is rendered in Chrome through Selenium and its HTML is read
    from ``page_source``. A failed download is reported and skipped; it does
    not stop the run. The browser is always shut down, even if a page load
    raises.
    """
    start = time.time()
    os.makedirs(FOLDER_PATH, exist_ok=True)
    attempted = 0  # images tried so far; doubles as the 1-based image number
    total = 0      # images downloaded successfully
    browser = webdriver.Chrome()
    try:
        for num in range(FIRST_PAGE, 0, -1):
            page_url = 'http://jandan.net/ooxx/page-' + str(num) + '#comments'
            browser.get(page_url)
            for link in collect_image_links(browser.page_source, page_url):
                attempted += 1
                target = os.path.join(FOLDER_PATH,
                                      image_filename(link, attempted))
                try:
                    download_image(link, target)
                except (requests.RequestException, OSError):
                    print('第{}張圖片下載出錯(cuò),已跳過'.format(attempted))
                else:
                    print('正在下載第{}圖片'.format(attempted))
                    total += 1
    finally:
        # quit() ends the whole driver session, not just the current window.
        browser.quit()
    end = time.time()
    print('總共用時(shí){}分'.format((end - start) / 60))
    print('成功下載{}張圖片,失敗{}張圖片'.format(total, attempted - total))


if __name__ == '__main__':
    main()