寫在前面
感謝@Cstances學長的幫助。
貼代碼:
import re
import os
import requests
from bs4 import BeautifulSoup
'''
第一步:獲取單頁圖片的鏈接
第二步:獲取頁數(shù)的數(shù)字
第三步:獲取所有圖片的鏈接
第四步:保存圖片
'''
def get_images(url):
    """Fetch one listing page and return the image URLs found on it.

    Parses the comment list (<ol class="commentlist">) and collects every
    <a href> whose target is a protocol-relative .jpg link, prefixing each
    with 'http:' to make it absolute.

    Args:
        url: URL of a single jandan.net listing page.

    Returns:
        list[str]: absolute image URLs found on that page (empty if the
        expected markup is missing, e.g. layout change or blocked request).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Mobile Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=10)  # timeout: never hang forever
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    comment_list = soup.find('ol', {'class': 'commentlist'})
    if comment_list is None:
        # BUGFIX: the original crashed with AttributeError when the
        # comment list was absent; degrade gracefully instead.
        return []
    # BUGFIX: the original loop variable shadowed the `url` parameter.
    return ['http:' + anchor['href']
            for anchor in comment_list.find_all('a', href=re.compile(r'//(.*?\.jpg)'))]
def get_pages(url):
    """Return the current comment-page number shown on the listing page.

    Scrapes the text inside <span class="current-comment-page">[N]</span>.

    Args:
        url: URL of the jandan.net front page (e.g. http://jandan.net/ooxx).

    Returns:
        str: the current page number as a string (caller converts to int).

    Raises:
        ValueError: if the page-number marker cannot be found in the HTML.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Mobile Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=10)  # timeout: never hang forever
    response.encoding = 'utf-8'
    # Search the raw HTML directly; the original round-tripped it through
    # BeautifulSoup and str() for no benefit. Non-greedy group so a second
    # bracket on the same line cannot over-match.
    match = re.search(r'<span class="current-comment-page">\[(.*?)\]</span>', response.text)
    if match is None:
        # BUGFIX: the original crashed with AttributeError on a missing
        # marker; raise an explicit, descriptive error instead.
        raise ValueError('could not locate current-comment-page marker in ' + url)
    return match.group(1)
def get_all_images(max_pages):
    """Collect image URLs from the newest `max_pages` listing pages.

    Pages are numbered ascending on the site, so scraping starts at the
    current (highest) page number and walks backwards.

    Args:
        max_pages: how many pages to scrape, counting back from the newest.

    Returns:
        list[str]: image URLs gathered from all scraped pages.
    """
    all_images_url = []  # accumulates links across pages
    newest_page = int(get_pages('http://jandan.net/ooxx'))
    # BUGFIX: clamp the lower bound so we never request page 0 or a
    # negative page when max_pages exceeds the current page count.
    oldest_page = max(newest_page - max_pages, 0)
    for page in range(newest_page, oldest_page, -1):  # newest first
        page_url = 'http://jandan.net/ooxx/page-' + str(page) + '#comments'
        all_images_url.extend(get_images(page_url))
    return all_images_url
def save_images(url, dir_name='ooxx'):
    """Download one image and save it under `dir_name`.

    The file name is the last path component of the URL.

    Args:
        url: absolute image URL to download.
        dir_name: target directory, created if missing (default 'ooxx').
    """
    # BUGFIX: os.makedirs(exist_ok=True) is race-free, unlike the original
    # exists()-then-mkdir() pair which could raise FileExistsError.
    os.makedirs(dir_name, exist_ok=True)
    file_path = os.path.join(dir_name, url.split('/')[-1])
    response = requests.get(url, timeout=10)  # timeout: never hang forever
    with open(file_path, 'wb') as fp:
        fp.write(response.content)
def main():
    """Entry point: ask how many pages to scrape, then download every image.

    Raises:
        ValueError: if the user's input is not an integer.
    """
    # BUGFIX: the original prompt contained mojibake ("頁數(shù)(shù)") from a
    # pinyin annotation fused into the text; restored to clean Chinese.
    page_count = int(input('請輸入要下載的頁數(shù):'))
    for image_url in get_all_images(page_count):
        save_images(image_url)


if __name__ == '__main__':
    main()

效果圖