使用工具
爬蟲:
- Python 3.6
- urllib
- requests
爬取目標
音悅臺任意藝人的mv(最清晰)
步驟


點擊第一個視頻,F12

在瀏覽器中輸入這個鏈接看一下:

這就是mv真實地址了,但是...



我不知道怎么搞這個啊!如果有大佬會的,請一定告訴我,謝謝!
慫了,面向搜索引擎一下,結果找到

轉自:https://www.toubiec.cn/81.html
好吧,這就是我需要的了,將videoId換成我們需要的Id看看

到此,已經拿到了真實視頻地址,再看看mv列表頁的總頁數,id,mv標題

周董的mv有1000+,但是只有42頁,大膽預測一下,42頁應該是封頂了,再看看楊宗緯的

尋找一下怎么獲取總頁數

可以看到pageCount就是總頁數了

刪去不必要的參數,訪問看一下(懶得裝jsonview,看著有點累):

這里包含了總頁數,id,mv標題
mv真實地址和需要的參數都到手了,整一下代碼
1.獲取mv總頁數:
def get_page_count(self):
    """Return the total number of search-result pages for the artist.

    Requests the first page of the video-search API for the module-level
    ``keyword`` and reads ``pageInfo.pageCount`` from the JSON reply.

    Returns:
        The page count reported by the API, or None when the request fails,
        the HTTP status is not 200, or the reply lacks the expected fields.
    """
    get_page_url = 'http://soapi.yinyuetai.com/search/video-search?keyword={0}&pageIndex=1&pageSize=24'\
        .format(keyword)  # keyword is the artist name
    try:
        response = requests.get(get_page_url)
    except (requests.exceptions.RequestException, ConnectionError):
        # requests raises its own exception hierarchy, which does not derive
        # from the built-in ConnectionError, so both are listed here.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()['pageInfo']['pageCount']  # total number of pages
    except (ValueError, KeyError, TypeError):
        # The body was not JSON, or the expected structure is missing.
        return None
2.訪問包含mv列表頁信息的api接口(返回json)
def get_index(self, url):
    """Fetch one MV-list API page and return its decoded JSON.

    Args:
        url: Full URL of the search API page to request.

    Returns:
        The decoded JSON object, or None when the request fails, the HTTP
        status is not 200, or the body is not valid JSON.
    """
    try:
        response = requests.get(url)
    except (requests.exceptions.RequestException, ConnectionError):
        # requests raises its own exception hierarchy, which does not derive
        # from the built-in ConnectionError, so both are listed here.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The server answered 200 but the body could not be decoded as JSON.
        return None
3.獲取mv的id、title
def get_mv_info(self, json):
    """Yield the id and title of every MV listed in one search-result page.

    Args:
        json: Decoded JSON of a search-result page, or None when the page
            could not be fetched.

    Yields:
        Dicts of the form ``{'video_id': ..., 'title': ...}``. Nothing is
        yielded when ``json`` is empty or has no ``videos.data`` list.
    """
    if not json:
        # get_index() returns None on failure; treat that as an empty page.
        return
    videos = json.get('videos') or {}
    for item in videos.get('data') or []:
        yield {
            'video_id': item.get('id'),
            'title': item.get('title'),
        }
4.通過上文面向搜索引擎找到的包含mv真實地址的api接口,獲取mv真實地址
def get_mv_source_url(self, video_id):
    """Return the real video URLs of one MV, ordered from lowest to highest quality.

    Args:
        video_id: Id of the MV as reported by the search API.

    Returns:
        A list holding the values of ``hdVideoUrl``, ``hcVideoUrl`` and
        ``heVideoUrl`` (in that order) that are present and non-empty, so the
        last element is the best available rendition. The list is empty when
        the request fails or no URL is reported.
    """
    mv_source_url = 'http://www.yinyuetai.com/api/info/get-video-urls?flex=true&videoId={}'.format(video_id)
    try:
        json_dict = requests.get(mv_source_url).json()
    except (requests.exceptions.RequestException, ValueError):
        # Network failure or a non-JSON reply: report "no sources".
        return []
    if not isinstance(json_dict, dict):
        return []
    # An explicit tuple fixes the quality order instead of relying on dict
    # iteration order.
    quality_keys = ('hdVideoUrl', 'hcVideoUrl', 'heVideoUrl')
    return [json_dict[key] for key in quality_keys if json_dict.get(key)]
因為很多mv不會有各種畫質,所以需要進行一下判斷,這里可以保證mv_source_list里的最后一個元素是mv最高畫質的真實地址
5.下載最高畫質的mv,并根據keyword創建文件夾,下載后的mv文件以mv的title命名
def download_mv(self, mv_source_list, title, video_id):  # download the highest-quality video
    """Download the best available rendition of one MV into the artist's folder.

    The file is first written to ``<name>.mp4.part`` and renamed only after a
    complete download, so an interrupted attempt is never mistaken for a
    finished file on the next run.

    Args:
        mv_source_list: Source URLs ordered from lowest to highest quality;
            the last entry is the one downloaded.
        title: MV title, used (after sanitising) in the file name.
        video_id: MV id, appended to the file name.
    """
    if not mv_source_list:
        # No rendition was reported for this MV; indexing [-1] would raise.
        print('No source url found for MV:', title)
        return
    # Create the folder that holds this artist's videos
    file = 'C:/Users/Administrator/Desktop/{}/'.format(keyword)
    if not os.path.exists(file):
        os.makedirs(file)
        print('創建文件夾:', file)
    # Characters that are illegal in file names are replaced with '-'
    target = file + clean_title(title) + '-' + str(video_id) + '.mp4'
    if os.path.exists(target):
        print('MV:{}-已存在'.format(clean_title(title)))
        return
    partial = target + '.part'
    print('Start Download MV:' + title + '...:', mv_source_list[-1])
    # One initial attempt plus up to five retries, because a stalled download
    # usually recovers when it is simply restarted.
    max_retries = 5
    for attempt in range(max_retries + 1):
        try:
            urllib.request.urlretrieve(url=mv_source_list[-1], filename=partial)
        except (socket.timeout, urllib.error.URLError):
            # Read timeouts raise socket.timeout; connect timeouts and HTTP
            # errors arrive wrapped in URLError.
            if attempt:
                err_info = 'Reloading for %d time' % attempt if attempt == 1 else 'Reloading for %d times' % attempt
                print(err_info)
            continue
        os.replace(partial, target)
        print('MV Download Success:', title)
        return
    # Every attempt failed: drop the incomplete file so a later run retries.
    if os.path.exists(partial):
        os.remove(partial)
    print("Downloading MV Failed!")
需要解釋一下的是,文件命名是有規範的:

如果mv的標題包含這些字符,那就會保存失敗,因此我寫了一個函數來替換這些非法字符:
def clean_title(filename):
    """Return *filename* with characters that are illegal in Windows file names replaced by '-'.

    The replaced characters are ``\\ / : * ? " < > |``.
    """
    # Raw string with an escaped backslash so that '\' itself is replaced too;
    # the earlier non-raw pattern only covered '/' and used an invalid escape.
    title = re.sub(r'[\\/:*?"<>|]', '-', filename)
    return title
還有一點是,在我實際運行程序進行下載的時候,會有個別時候,某個mv的下載速度出奇的慢,甚至有僵住的可能,但是如果暫停重新下載,下載速度又很大概率恢復正常,具體原因我不清楚,但是針對這個現象,我設置了一個sockettimeout,表示若下載阻塞超過3秒,就算超時
# Default timeout, in seconds, applied to sockets created after this call.
timeout = 3.0
socket.setdefaulttimeout(timeout)
# Excerpt from download_mv: the handler that runs when the first download attempt times out.
except socket.timeout:
# Work around downloads that take too long or even hang forever
count = 1
# Retry the same download up to five times
while count <= 5:
try:
urllib.request.urlretrieve(url=mv_source_list[-1], filename=file + clean_title(title) + '-' + str(video_id) + '.mp4')
print('MV Download Success:', title)
break
except socket.timeout:
err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
print(err_info)
count += 1
# count only exceeds 5 when every retry timed out
if count > 5:
print("Downloading MV Failed!")
超時后重新進行下載,我給了它5次機會,如果實在太倔強,只能放棄
6.main函數
def main(self):
    """Crawl every search-result page for ``keyword`` and download each MV found.

    For each listed MV the real source URLs are resolved and the best one is
    downloaded; a running count of processed MVs is printed after each one.
    """
    page_count = self.get_page_count()
    if not page_count:
        # get_page_count() returns None on failure, which range() cannot take.
        print('Failed to get page count for:', keyword)
        return
    mv_count = 0
    # pageIndex is 1-based, so the last page is page_count itself.
    for page in range(1, page_count + 1):
        print('Crawl Page:', page)
        url = 'http://soapi.yinyuetai.com/search/video-search?keyword={0}&pageIndex={1}&pageSize=24'.format(keyword,
                                                                                                          page)
        page_json = self.get_index(url)
        if page_json is None:
            # The page could not be fetched; move on to the next one.
            continue
        for item in self.get_mv_info(page_json):
            mv_count += 1
            video_id = item['video_id']
            title = item['title']
            mv_source_list = self.get_mv_source_url(video_id)
            self.download_mv(mv_source_list, title, video_id)
            print('已下載MV數量:', mv_count)
程序完整代碼:
spider.py
import requests
import urllib.request
import os
import urllib.error
import socket
from common import clean_title
# Artist name used in the search URL and as the download folder name.
keyword = '周杰倫'
# Default timeout, in seconds, applied to sockets created after this call,
# so a stalled download raises socket.timeout instead of blocking forever.
timeout = 3.0
socket.setdefaulttimeout(timeout)
class YinYueTaiSpider(object):
    """Crawler that downloads the best-quality MV files of one artist from YinYueTai.

    ``YinYueTaiSpider()`` keeps the original behaviour: the artist is the
    module-level ``keyword`` and files go to the Administrator desktop.
    """

    def __init__(self, artist=None, save_root='C:/Users/Administrator/Desktop'):
        """Store the crawl settings.

        Args:
            artist: Artist name to search for. When None, the module-level
                ``keyword`` is used.
            save_root: Directory under which the per-artist folder is created.
        """
        # ``keyword`` is only looked up when no artist was passed in.
        self.artist = keyword if artist is None else artist
        self.save_root = save_root

    def get_index(self, url):
        """Fetch one MV-list API page and return its decoded JSON.

        Returns:
            The decoded JSON object, or None when the request fails, the HTTP
            status is not 200, or the body is not valid JSON.
        """
        try:
            response = requests.get(url)
        except (requests.exceptions.RequestException, ConnectionError):
            # requests raises its own exception hierarchy, which does not
            # derive from the built-in ConnectionError, so both are listed.
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError:
            # The server answered 200 but the body was not JSON.
            return None

    def get_page_count(self):
        """Return the total number of search-result pages for the artist.

        Returns:
            The ``pageInfo.pageCount`` value of the first result page, or
            None when that page cannot be fetched or lacks the field.
        """
        get_page_url = 'http://soapi.yinyuetai.com/search/video-search?keyword={0}&pageIndex=1&pageSize=24'\
            .format(self.artist)
        first_page = self.get_index(get_page_url)
        try:
            return first_page['pageInfo']['pageCount']
        except (KeyError, TypeError):
            # first_page is None, or the expected structure is missing.
            return None

    def get_mv_info(self, json):
        """Yield ``{'video_id': ..., 'title': ...}`` for every MV on one result page.

        Nothing is yielded when ``json`` is None/empty or has no
        ``videos.data`` list.
        """
        if not json:
            # get_index() returns None on failure; treat that as an empty page.
            return
        videos = json.get('videos') or {}
        for item in videos.get('data') or []:
            yield {
                'video_id': item.get('id'),
                'title': item.get('title'),
            }

    def get_mv_source_url(self, video_id):
        """Return the real video URLs of one MV, ordered from lowest to highest quality.

        Returns:
            The non-empty values of ``hdVideoUrl``, ``hcVideoUrl`` and
            ``heVideoUrl`` in that order, so the last element is the best
            available rendition. Empty when the request fails or no URL is
            reported.
        """
        mv_source_url = 'http://www.yinyuetai.com/api/info/get-video-urls?flex=true&videoId={}'.format(video_id)
        try:
            json_dict = requests.get(mv_source_url).json()
        except (requests.exceptions.RequestException, ValueError):
            # Network failure or a non-JSON reply: report "no sources".
            return []
        if not isinstance(json_dict, dict):
            return []
        # An explicit tuple fixes the quality order instead of relying on
        # dict iteration order.
        quality_keys = ('hdVideoUrl', 'hcVideoUrl', 'heVideoUrl')
        return [json_dict[key] for key in quality_keys if json_dict.get(key)]

    def download_mv(self, mv_source_list, title, video_id):  # download the highest-quality video
        """Download the best available rendition of one MV into the artist's folder.

        The file is first written to ``<name>.mp4.part`` and renamed only
        after a complete download, so an interrupted attempt is never
        mistaken for a finished file on the next run.

        Args:
            mv_source_list: Source URLs ordered from lowest to highest
                quality; the last entry is the one downloaded.
            title: MV title, used (after sanitising) in the file name.
            video_id: MV id, appended to the file name.
        """
        if not mv_source_list:
            # No rendition was reported; indexing [-1] would raise.
            print('No source url found for MV:', title)
            return
        # Create the folder that holds this artist's videos
        file = '{}/{}/'.format(self.save_root, self.artist)
        if not os.path.exists(file):
            os.makedirs(file)
            print('創建文件夾:', file)
        # Characters that are illegal in file names are replaced with '-'
        target = file + clean_title(title) + '-' + str(video_id) + '.mp4'
        if os.path.exists(target):
            print('MV:{}-已存在'.format(clean_title(title)))
            return
        partial = target + '.part'
        print('Start Download MV:' + title + '...:', mv_source_list[-1])
        # One initial attempt plus up to five retries, because a stalled
        # download usually recovers when it is simply restarted.
        max_retries = 5
        for attempt in range(max_retries + 1):
            try:
                urllib.request.urlretrieve(url=mv_source_list[-1], filename=partial)
            except (socket.timeout, urllib.error.URLError):
                # Read timeouts raise socket.timeout; connect timeouts and
                # HTTP errors arrive wrapped in URLError.
                if attempt:
                    err_info = 'Reloading for %d time' % attempt if attempt == 1 else 'Reloading for %d times' % attempt
                    print(err_info)
                continue
            os.replace(partial, target)
            print('MV Download Success:', title)
            return
        # Every attempt failed: drop the incomplete file so a later run retries.
        if os.path.exists(partial):
            os.remove(partial)
        print("Downloading MV Failed!")

    def main(self):
        """Crawl every search-result page for the artist and download each MV found.

        A running count of processed MVs is printed after each one.
        """
        page_count = self.get_page_count()
        if not page_count:
            # None (lookup failed) or 0 pages: nothing to crawl.
            print('Failed to get page count for:', self.artist)
            return
        mv_count = 0
        # pageIndex is 1-based, so the last page is page_count itself.
        for page in range(1, page_count + 1):
            print('Crawl Page:', page)
            url = 'http://soapi.yinyuetai.com/search/video-search?keyword={0}&pageIndex={1}&pageSize=24'.format(
                self.artist, page)
            page_json = self.get_index(url)
            for item in self.get_mv_info(page_json):
                mv_count += 1
                video_id = item['video_id']
                title = item['title']
                mv_source_list = self.get_mv_source_url(video_id)
                self.download_mv(mv_source_list, title, video_id)
                print('已下載MV數量:', mv_count)
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    spider = YinYueTaiSpider()
    spider.main()
common.py(寫過濾非法字符的函數)
import re
def clean_title(filename):
    """Return *filename* with characters that are illegal in Windows file names replaced by '-'.

    The replaced characters are ``\\ / : * ? " < > |``.
    """
    # Raw string with an escaped backslash so that '\' itself is replaced too;
    # the earlier non-raw pattern only covered '/' and used an invalid escape.
    title = re.sub(r'[\\/:*?"<>|]', '-', filename)
    return title
運行程序

可以看到桌面上已經有了相應文件夾


下載速度還不錯
暫停,重新運行看看:


由于硬盤空間不夠,就不繼續運行了
使用
只需修改spider.py里的keyword參數,換成你需要的藝人名即可
keyword = '周杰倫'
最后發現
后來寫完了程序...發現了一個音悅臺mv1080p的接口,我是懶得重新整了
詳情請見:https://www.lylares.com/yinyuetai-videourl-online-analysis-api.html