多線程爬取

import os
import time
from threading import Thread

import requests
from lxml import etree
from selenium import webdriver

from kaisha import str2url

# Module-level Chrome WebDriver used by get_page(); it is created as soon as
# this module is imported or run. The chromedriver path is hard-coded to an
# absolute location on one machine.
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 —
# confirm the installed Selenium version still accepts it.
browser = webdriver.Chrome(executable_path="/Users/apple/Desktop/tool/chromedriver")

def get_page():
    """Load the Xiami chart page in the shared browser and return its HTML.

    Drives the module-level ``browser`` WebDriver instance.

    Returns:
        The value of ``browser.page_source`` after the page has been
        requested and a one-second pause has elapsed.
    """
    url = 'https://www.xiami.com/chart'
    browser.get(url)
    # Fixed pause before reading the DOM; presumably gives client-side
    # scripts time to render the chart rows.
    time.sleep(1)
    return browser.page_source

def get_mp3(url):
    """Download *url* and return the response body as bytes.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The raw response content when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: On connection failure or when the server
            does not respond within the timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    # requests applies no timeout by default; without one a stalled server
    # would block the calling thread indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 200:
        return response.content
    return None

def process_mp3(mp3_url, mp3_title):
    """Resolve, download and save a single track.

    Args:
        mp3_url: Value passed to ``str2url`` to obtain the download URL
            (presumably an obfuscated location string — confirm against
            the ``kaisha`` module).
        mp3_title: Track title, used as the output file name.
    """
    mp3_url = str2url(mp3_url)
    print(mp3_url, mp3_title)
    mp3_content = get_mp3(mp3_url)
    if mp3_content is None:
        # get_mp3 returns None for any non-200 response; handing that to
        # save_mp3 would fail with TypeError inside f.write().
        print('download failed, skipped:', mp3_title)
        return
    save_mp3(mp3_content, mp3_title)

def save_mp3(mp3_content, mp3_title):
    """Write *mp3_content* to ``./mp3/<mp3_title>.mp3``.

    Args:
        mp3_content: Bytes to write.
        mp3_title: File name stem; it is interpolated into the path as-is.
    """
    # open() does not create missing parent directories. exist_ok=True keeps
    # this safe when several threads reach this line at the same time.
    os.makedirs('./mp3', exist_ok=True)
    with open('./mp3/%s.mp3' % mp3_title, 'wb') as f:
        f.write(mp3_content)

def parse_page(html):
    """Extract every chart row from *html* and download the tracks in parallel.

    One thread is created per ``<tr class="songwrapper">`` row, running
    ``process_mp3`` with the row's ``data-mp3`` and ``data-title`` attribute
    values. The function returns once all threads have finished.

    Args:
        html: Page source of the chart page.
    """
    etree_html = etree.HTML(html)
    items = etree_html.xpath('//tr[@class="songwrapper"]')
    threads = []
    for item in items:
        mp3_urls = item.xpath('./@data-mp3')
        mp3_titles = item.xpath('./@data-title')
        if not mp3_urls or not mp3_titles:
            # A row lacking either attribute cannot be downloaded; indexing
            # [0] on the empty result would raise IndexError.
            continue
        thread = Thread(target=process_mp3, args=(mp3_urls[0], mp3_titles[0]))
        threads.append(thread)
    for thread in threads:
        thread.start()
    # Wait for the workers so the caller knows the downloads are done.
    for thread in threads:
        thread.join()

def main():
    """Fetch the chart page, then download every track listed on it."""
    try:
        html = get_page()
    finally:
        # The page source is a plain string, so the browser is no longer
        # needed; quitting here stops the Chrome/chromedriver processes
        # from being left running after the script ends.
        browser.quit()
    parse_page(html)

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

©著作權歸作者所有,轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容