9songSpider

# coding:utf-8

# Site home page: https://9song.me/cy/%e5%ae%b6%e5%ba%ad%e4%ba%82%e5%80%ab/
# Listing-page address suffix: page/1/
# Article link XPath: //h2//a/@href
# Article title XPath: //h1
# Article content XPath: //div[@class="entry-inner"]

import urllib
import urllib2
import Queue
import threading
import requests
from lxml import etree


class Spider(object):
    """Single-threaded scraper for one listing page of 9song.me.

    Fetches the listing page, extracts every article link, and (via
    load_image) can download each article's paragraphs into a local
    ``<title>.txt`` file.
    """

    def __init__(self, url, ag_header):
        # url: listing-page URL to scrape; ag_header: dict of HTTP headers.
        self.url = url
        self.head = ag_header

    def load_page(self):
        """Fetch the listing page and print every article URL found.

        NOTE(review): the call to load_image() is deliberately commented
        out, so this method currently only prints the links.
        """
        print("[*]獲取頁面中每個版塊的url中...")
        request = urllib2.Request(self.url, headers=self.head)
        html = urllib2.urlopen(request).read().decode("utf-8")
        path_list = etree.HTML(html).xpath('//h2//a/@href')
        for section_url in path_list:
            # self.load_image(section_url)
            print(section_url)

    def load_image(self, section_url):
        """Download one article page and append its paragraphs to a file
        named after the article title."""
        print("[*]正在獲取文章數(shù)據(jù)中...")
        request = urllib2.Request(section_url, headers=self.head)
        html = urllib2.urlopen(request).read().decode("utf-8")
        # FIX: parse the document once instead of building two etree trees.
        tree = etree.HTML(html)
        file_name = tree.xpath('//h1/text()')
        content_txt = tree.xpath('//div[@class="entry-inner"]//p/text()')
        for title in file_name:
            print("[*]正在保存文章<%s>..." % title.encode("utf-8"))
            for each_txt in content_txt:
                self.save_text(title, each_txt.encode("utf-8"))

    def save_text(self, title, each_txt):
        """Append one UTF-8-encoded paragraph (bytes) to ``<title>.txt``."""
        file_name = title + ".txt"
        with open(file_name, "ab") as f:
            f.write(each_txt + "\r\n")


class CrawThread(threading.Thread):
    """Stage-1 worker: pulls page numbers from ``page_queue``, downloads
    the corresponding listing page, and puts its HTML into
    ``each_page_queue``."""

    def __init__(self, name, page_queue, each_page_queue, header):
        super(CrawThread, self).__init__(name=name)
        self.page_queue = page_queue            # queue of int page numbers
        self.headers = header                   # HTTP request headers
        self.each_page_queue = each_page_queue  # output: listing-page HTML

    def run(self):
        print("啟動%s" % self.name)
        # FIX: drain the queue instead of handling a single item, so pages
        # are no longer silently dropped when there are more pages than
        # worker threads. Also catch only Queue.Empty instead of a bare
        # ``except`` that swallowed every error.
        while True:
            try:
                page = self.page_queue.get(False)
            except Queue.Empty:
                break
            full_url = "https://9song.me/cy/%e6%a0%a1%e5%9c%92%e5%ad%b8%e7%94%9f/page/" + str(page)
            # BUG FIX: the second positional parameter of requests.get()
            # is ``params``; the headers must be passed by keyword.
            html = requests.get(full_url, headers=self.headers).text
            self.each_page_queue.put(html)
        print("結(jié)束%s" % self.name)


class ParseThread(threading.Thread):
    """Stage-3 worker: pulls article-page HTML from ``data_queue``,
    extracts title and paragraphs, and appends the text to a file."""

    def __init__(self, thread_name, data_queue, mylock):
        super(ParseThread, self).__init__(name=thread_name)
        self.lock = mylock          # guards concurrent appends to one file
        self.data_queue = data_queue

    def run(self):
        print("啟動%s" % self.name)
        # FIX: the original did a single unconditional blocking get(),
        # which hangs forever when fewer articles than threads arrive.
        # Consume items until the queue stays empty for 5 seconds.
        while True:
            try:
                html = self.data_queue.get(True, 5)
            except Queue.Empty:
                break
            self.handle(html)

    def handle(self, html):
        """``html`` is the full HTML source of one article page."""
        # FIX: parse the document once instead of building two etree trees.
        tree = etree.HTML(html)
        file_name = tree.xpath('//h1/text()')
        content_txt = tree.xpath('//div[@class="entry-inner"]//p/text()')
        for title in file_name:
            print("[*]正在保存文章<%s>..." % title.encode("utf-8"))
            for each_txt in content_txt:
                self.save_text(title, each_txt.encode("utf-8"))

    def save_text(self, title, each_txt):
        """Append one UTF-8-encoded paragraph (bytes) to ``<title>.txt``."""
        file_name = title + ".txt"
        # FIX: the lock was accepted but never used; take it so parallel
        # parse threads cannot interleave writes to the same file.
        with self.lock:
            with open(file_name, "ab") as f:
                f.write(each_txt + "\r\n")
        print("[*]保存文章%s完成?。?!" % file_name.encode("utf-8"))


class CrawEachThread(threading.Thread):
    """Stage-2 worker: scans listing-page HTML from ``each_page_queue``
    for article links, downloads each article, and puts the article HTML
    into ``data_queue``."""

    def __init__(self, thread_name, each_page_queue, data_queue, header):
        # BUG FIX: Thread's first positional parameter is ``group``; the
        # original passed thread_name there. It must go in ``name=``.
        super(CrawEachThread, self).__init__(name=thread_name)
        self.each_page_queue = each_page_queue
        self.data_queue = data_queue
        self.header = header

    def run(self):
        print("啟動%s" % self.name)
        # FIX: drain the queue instead of handling a single item, and catch
        # only Queue.Empty instead of a bare ``except`` hiding real errors.
        while True:
            try:
                html = self.each_page_queue.get(False)
            except Queue.Empty:
                break
            self.handle(html)
        print("結(jié)束%s" % self.name)

    def handle(self, html):
        path_list = etree.HTML(html).xpath('//h2//a/@href')
        for section_url in path_list:
            # BUG FIX: put the HTML text into the queue — ParseThread runs
            # etree.HTML() on what it receives, but the original enqueued
            # the Response object. Also send the headers, which were stored
            # but never used.
            each_html = requests.get(section_url, headers=self.header).text
            self.data_queue.put(each_html)


if __name__ == '__main__':

    # --- Basic configuration ---
    url = "https://9song.me/cy/%e6%a0%a1%e5%9c%92%e5%ad%b8%e7%94%9f/page/"
    # BUG FIX: every header value carried a stray leading space copied
    # from a browser dump (e.g. "Host": " 9song.me"); values are now clean.
    ag_header = {
        "Host": "9song.me",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9"
    }

    mylock = threading.Lock()

    # --- Queues wiring the three pipeline stages together ---
    page_queue = Queue.Queue()        # page numbers to fetch
    data_queue = Queue.Queue()        # article-page HTML to parse
    each_page_queue = Queue.Queue()   # listing-page HTML to scan for links

    # --- Page range requested by the user ---
    start_page = int(raw_input("起始頁:"))
    end_page = int(raw_input("結(jié)束頁:"))

    for page in range(start_page, end_page + 1):
        page_queue.put(page)

    # --- Stage 1: download the listing pages ---
    craw_thread = []
    for thread_name in ["craw-1", "craw-2", "craw-3"]:
        thread = CrawThread(thread_name, page_queue, each_page_queue, ag_header)
        thread.start()
        craw_thread.append(thread)
    # BUG FIX: the original started all three stages at once; the stage-2
    # and stage-3 workers did a non-blocking get() on still-empty queues,
    # raised Empty immediately, and exited without doing any work. Join
    # each stage before starting the next.
    for thread in craw_thread:
        thread.join()

    # --- Stage 2: extract article links, download each article ---
    craw_each_thread = []
    for thread_name in ["each-1", "each-2", "each-3"]:
        thread = CrawEachThread(thread_name, each_page_queue, data_queue, ag_header)
        thread.start()
        craw_each_thread.append(thread)
    for thread in craw_each_thread:
        thread.join()

    # --- Stage 3: parse article pages and save the text ---
    parse_thread = []
    for parse_name in ["parse-1", "parse-2", "parse-3"]:
        thread = ParseThread(parse_name, data_queue, mylock)
        thread.start()
        parse_thread.append(thread)
    for thread in parse_thread:
        thread.join()
著作權歸作者所有，轉載或內容合作請聯系作者
【社區內容提示】社區部分內容疑似由AI輔助生成，瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明：文章內容（如有圖片或視頻亦包括在內）由作者上傳並發布，文章內容僅代表作者本人觀點，簡書系信息發布平臺，僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

  • 據(jù)美媒報道,作為活塞主教練的范甘迪似乎正尋求與騎士之間有關(guān)歐文的交易。 雖說歐文的價值看似已經(jīng)是逐漸下滑,但仍會有...
    zoneball閱讀 252評論 0 0
  • 昨天老婆順利住進婦產(chǎn)科病房,在大醫(yī)院一床難求的時代可算是幸運。經(jīng)過各類檢查,醫(yī)院建議觀察后在決定生產(chǎn)方案。到了午飯...
    紀光星閱讀 136評論 0 0
  • 或許多年以后 再回想與你走過的路 吃過飯的餐館 看過書的涼亭 聊的天翻地覆的話題 看過每一場都是回憶滿滿的電影 還...
    蘭亭沒有序閱讀 260評論 6 5
  • 人生就如同一條波浪線,起起伏伏,有波谷,也有波峰。這條波浪線也有兩種形態(tài),一種是向上的波浪線,另一種是向下的波浪...
    風起云勇閱讀 1,427評論 0 2

友情鏈接更多精彩內(nèi)容