思路分析
首先利用 Python 的 Requests 庫獲取參數中指定頁面的內容 , 然後使用 BeautifulSoup 庫進行解析 , 解析到圖片以後開啟多線程進行下載保存
截圖展示 :

Paste_Image.png

Paste_Image.png
代碼實現 :
#!/usr/bin/env python
#encoding:utf8
import requests
import threading
from bs4 import BeautifulSoup
import sys
import os
# config-start
# The page URL is the one required command-line argument. Exit with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.stderr.write("Usage : python %s [URL]\n" % sys.argv[0])
    sys.exit(1)
url = sys.argv[1]
threadNumber = 20  # number of threads
# config-end
def getContent(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.encoding = 'utf-8'  # force the charset used to decode the body
    return response.text
def getTitle(soup):
    """Return the page title encoded as GBK bytes.

    Falls back to ``"UnTitled"`` when the document has no ``<title>`` tag,
    when the tag has no single string, or when the title is only whitespace.
    Characters that GBK cannot represent are replaced instead of raising
    ``UnicodeEncodeError``.

    Args:
        soup: Parsed document exposing a ``title`` attribute.

    Returns:
        The stripped title text, GBK-encoded.
    """
    titleTag = getattr(soup, "title", None)
    text = getattr(titleTag, "string", None)
    if text is not None:
        # Surrounding whitespace/newlines would end up in the directory name.
        text = text.strip()
    if not text:
        text = u"UnTitled"
    return text.encode("GBK", "replace")
def getImageLinks(soup):
    """Return every ``<img>`` element of *soup* that has a non-empty ``src``.

    Elements without a usable ``src`` are skipped so that callers can index
    ``img['src']`` on each result without hitting ``KeyError``.

    Args:
        soup: Parsed document providing ``find_all``.

    Returns:
        A list of the matching image elements, in document order.
    """
    return [img for img in soup.find_all("img") if img.get("src")]
def makeDirectory(dicName):
    """Create the directory *dicName* unless it already exists.

    The directory is created first and the "already exists" case is handled
    afterwards, so there is no gap between checking and creating.

    Args:
        dicName: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, or if *dicName* exists
            but is not a directory.
    """
    try:
        os.mkdir(dicName)
    except OSError:
        # An existing directory is fine; anything else is a real failure.
        if not os.path.isdir(dicName):
            raise
def downloadImage(imgUrl, savePath, timeout=30):
    """Stream the resource at *imgUrl* into a file inside *savePath*.

    The file name is the last ``/``-separated piece of the URL, passed
    through ``formatFileName``.

    Args:
        imgUrl: Address of the image to download.
        savePath: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure or timeout.
        IOError / OSError: If the target file cannot be written.
    """
    local_filename = formatFileName(imgUrl.split('/')[-1])
    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash only forms a directory path on Windows.
    targetPath = os.path.join(savePath, local_filename)
    r = requests.get(imgUrl, stream=True, timeout=timeout)
    try:
        # The with-block closes the file even if the transfer fails midway.
        with open(targetPath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    finally:
        r.close()  # release the streamed connection
def formatFileName(fileName):
    """Return *fileName* with characters unsafe in file names replaced by "_".

    The replaced characters are: / \\ : * ? " > < | and the space.
    """
    for unsafeChar in '/\\:*?"><| ':
        fileName = fileName.replace(unsafeChar, "_")
    return fileName
def threadFunction(imgSrc,directoryName):
    """Thread entry point: download *imgSrc* into *directoryName*."""
    downloadImage(imgUrl=imgSrc, savePath=directoryName)
class myThread(threading.Thread):
    """Thread that downloads one image into one directory."""

    def __init__(self, imgSrc, directoryName):
        """Remember the image address and the target directory."""
        super(myThread, self).__init__()
        self.imgSrc, self.directoryName = imgSrc, directoryName

    def run(self):
        """Run the download via ``threadFunction``."""
        threadFunction(self.imgSrc, self.directoryName)
content = getContent(url)
soup = BeautifulSoup(content, "html.parser")
images = getImageLinks(soup)
title = getTitle(soup)
title = formatFileName(title)
print u"頁面標(biāo)題 : " , title
print u"本頁圖片數(shù)量 :",len(images)
print u"正在創(chuàng)建文件夾以用來保存所有圖片"
makeDirectory(title)
threads = []
for image in images:
src = image['src']
print u"圖片地址 : " + src
threads.append(myThread(str(src), str(title)))
for t in threads:
t.start()
while True:
if(len(threading.enumerate()) < threadNumber):
break
print u"所有圖片下載完成 ! "
后記 :
之前的腳本在處理某些網站的時候容錯性還不是很強 , 這裡將腳本簡單修改了一下
新版本代碼如下
#!/usr/bin/env python
#encoding:utf8
import requests
import threading
from bs4 import BeautifulSoup
import sys
import os
# Exactly one argument (the page URL) is expected.
if len(sys.argv) != 2:
    print("Usage : ")
    print(" python main.py [URL]")
    # sys.exit is always available; the bare exit() helper is injected by the
    # site module and is meant for interactive use.
    sys.exit(1)
# config-start
url = sys.argv[1]
threadNumber = 20  # number of threads
# config-end
def getContent(url, timeout=30):
    """Fetch *url* and return its body as text, or "" on failure.

    The text is decoded with the encoding detected from the body
    (``apparent_encoding``). Request failures, including HTTP error statuses
    and timeouts, are printed and reported as an empty document rather than
    raised, so the error message is never parsed as if it were page HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body, or an empty string if the request failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return ""
    response.encoding = response.apparent_encoding
    return response.text
def getTitle(soup):
    """Return the page title, or "UnTitled" when none is usable.

    The fallback covers a missing ``<title>`` tag, a tag whose ``string`` is
    None, and a title consisting only of whitespace. The lookups are done
    explicitly instead of hiding every error behind a bare ``except``.

    Args:
        soup: Parsed document exposing a ``title`` attribute.

    Returns:
        The title text with surrounding whitespace removed.
    """
    titleTag = getattr(soup, "title", None)
    text = getattr(titleTag, "string", None)
    if text is not None:
        # Surrounding whitespace/newlines would end up in the directory name.
        text = text.strip()
    return text if text else "UnTitled"
def getImageLinks(soup, baseUrl=None):
    """Return the absolute http(s) URLs of all ``<img>`` elements in *soup*.

    Each ``src`` is resolved against the page address with ``urljoin``, so
    root-relative ("/a.png"), page-relative ("img/a.png") and
    protocol-relative ("//host/a.png") references all become valid URLs.
    Images without a ``src``, and results that are not http(s) (for example
    ``data:`` URIs), are skipped.

    Args:
        soup: Parsed document providing ``find_all``.
        baseUrl: Address the page was loaded from. Defaults to the
            module-level ``url``.

    Returns:
        A list of absolute image URLs, in document order.
    """
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2
        from urlparse import urljoin
    if baseUrl is None:
        baseUrl = url
    result = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        absolute = urljoin(baseUrl, src.strip())
        if absolute.startswith(("http://", "https://")):
            result.append(absolute)
    return result
def makeDirectory(dicName):
    """Create the directory *dicName* unless it already exists.

    The directory is created first and the "already exists" case is handled
    afterwards, so there is no gap between checking and creating.

    Args:
        dicName: Path of the directory to create.

    Raises:
        OSError: If the directory cannot be created, or if *dicName* exists
            but is not a directory.
    """
    try:
        os.mkdir(dicName)
    except OSError:
        # An existing directory is fine; anything else is a real failure.
        if not os.path.isdir(dicName):
            raise
def downloadImage(imgUrl, savePath, timeout=30):
    """Stream the resource at *imgUrl* into a ".png" file inside *savePath*.

    The file name is the last ``/``-separated piece of the URL, passed
    through ``formatFileName``, with ".png" always appended.

    Args:
        imgUrl: Address of the image to download.
        savePath: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure or timeout.
        OSError: If the target file cannot be written.
    """
    local_filename = formatFileName(imgUrl.split('/')[-1])
    targetPath = os.path.join(savePath, local_filename + ".png")
    r = requests.get(imgUrl, stream=True, timeout=timeout)
    try:
        # The with-block closes the file even if the transfer fails midway.
        with open(targetPath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    finally:
        r.close()  # release the streamed connection
def formatFileName(fileName):
    """Return *fileName* with characters unsafe in file names replaced by "_".

    The replaced characters are: / \\ : * ? " > < | and the space.
    """
    unsafeChars = '/\\:*?"><| '
    return "".join("_" if ch in unsafeChars else ch for ch in fileName)
def threadFunction(imgSrc,directoryName):
    """Thread entry point: download *imgSrc* into *directoryName*."""
    downloadImage(imgUrl=imgSrc, savePath=directoryName)
class myThread(threading.Thread):
    """Thread that downloads one image into one directory."""

    def __init__(self, imgSrc, directoryName):
        """Remember the image address and the target directory."""
        super(myThread, self).__init__()
        self.imgSrc, self.directoryName = imgSrc, directoryName

    def run(self):
        """Run the download via ``threadFunction``."""
        threadFunction(self.imgSrc, self.directoryName)
def getPrefix(url):
    """Return the first four "/"-separated pieces of *url*, each followed by "/".

    Example: "http://domain/dir/page.html" -> "http://domain/dir/".
    """
    leadingPieces = url.split("/")[:4]
    return "/".join(leadingPieces) + "/"
def getDomain(url):
    """Return the first three "/"-separated pieces of *url*, each followed by "/".

    Example: "http://domain/dir/page.html" -> "http://domain/".
    """
    # Splitting at most three times leaves the first three pieces unchanged.
    schemeAndHost = url.split("/", 3)[:3]
    return "/".join(schemeAndHost) + "/"
# Script body: fetch the page, collect its image URLs and start one download
# thread per image, keeping the number of live threads bounded.
import time

content = getContent(url)
prefix = getPrefix(url)
domain = getDomain(url)
soup = BeautifulSoup(content, "html.parser")
images = getImageLinks(soup)
title = getTitle(soup)
title = formatFileName(title)
print(u"頁面標(biāo)題 : " , title)
print(u"本頁圖片數(shù)量 :",len(images))
print(u"正在創(chuàng)建文件夾以用來保存所有圖片")
makeDirectory(title)
threads = []
for image in images:
    print(u"圖片地址 : " + image)
    threads.append(myThread(image, title))
for t in threads:
    # Throttle: sleep instead of spinning while the thread limit is reached.
    while threading.active_count() >= threadNumber:
        time.sleep(0.05)
    t.start()
print(u"所有圖片已加入下載隊列 ! 正在下載...")
測試結果 :

image.png

image.png