欧美久久一区二区三区,日韩大香蕉视频

核心解析

# Fetch an article URL and extract its title and body content
def AnalysisUrl(url):
    """Download one article page, normalise its HTML body and save it.

    The body is taken from the first ``div.show-content-free`` element.
    Every ``div.image-package`` wrapper found in the page is replaced in the
    body by a plain ``<img>`` tag, ``<b>`` is rewritten to ``<strong>`` and
    the remaining ``div`` wrappers are stripped.  The result is passed to
    ``writeArticle`` together with the page's ``og:title`` meta value.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None.  Any error while fetching or parsing is reported on stdout and
        the article is skipped.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        # Parse the whole page
        html = BeautifulSoup(resHtml)
        # Article body container
        body = html.select('div[class="show-content-free"]')
        # Every image wrapper in the page
        packages = html.select('div[class="image-package"]')
        content = str(body[0])
        # Suffix appended to each image URL; the display width follows it
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Captures the max-width (in px) declared on the image container
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)
        for package in packages:
            markup = str(package)
            # Read the <img> from inside this wrapper rather than pairing two
            # independent lists by index: an <img> outside any wrapper would
            # otherwise shift every following image onto the wrong source.
            picture = package.select('img')[0]
            src = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(markup)[0]
            # Swap the whole wrapper for a plain <img> tag
            content = content.replace(markup, "<img src=\"" + src + "\">")
        # Markup adjustments required by the target format
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        # Article title
        title = html.select('meta[property="og:title"]')[0].attrs['content']

        # Persist the extracted content under its title
        writeArticle(content, title)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still stop
        # the crawler instead of being reported as a parse failure.
        print("該文章解析失敗 url:" + url)

解析過濾簡書自定義標簽，文章再用其他富文本逆向解析即可

用vue quill editor解析

vue quill editor解析顯示.PNG

完整代碼

# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import time

# HTTP request headers sent with every article fetch.
# NOTE(review): the Cookie is a hard-coded session id, presumably copied from
# a browser session; confirm it is still required or valid.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    # Full ASCII user-agent string; the previous value contained a literal
    # ellipsis character left over from a truncated copy, which is not a
    # valid HTTP header value.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
    "Content-Type": "application/json;charset=UTF-8",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Cookie": "JSESSIONID=2D1E55287F8B056E83FD29B114FBA389"
}

# Fetch an article URL and extract its title and body content
def AnalysisUrl(url):
    """Download one article page, normalise its HTML body and save it.

    The body is taken from the first ``div.show-content-free`` element.
    Every ``div.image-package`` wrapper found in the page is replaced in the
    body by a plain ``<img>`` tag, ``<b>`` is rewritten to ``<strong>`` and
    the remaining ``div`` wrappers are stripped.  The result is passed to
    ``writeArticle`` together with the page's ``og:title`` meta value.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None.  Any error while fetching or parsing is reported on stdout and
        the article is skipped.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        # Parse the whole page
        html = BeautifulSoup(resHtml)
        # Article body container
        body = html.select('div[class="show-content-free"]')
        # Every image wrapper in the page
        packages = html.select('div[class="image-package"]')
        content = str(body[0])
        # Suffix appended to each image URL; the display width follows it
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Captures the max-width (in px) declared on the image container
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)
        for package in packages:
            markup = str(package)
            # Read the <img> from inside this wrapper rather than pairing two
            # independent lists by index: an <img> outside any wrapper would
            # otherwise shift every following image onto the wrong source.
            picture = package.select('img')[0]
            src = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(markup)[0]
            # Swap the whole wrapper for a plain <img> tag
            content = content.replace(markup, "<img src=\"" + src + "\">")
        # Markup adjustments required by the target format
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        content = content.replace("</div>", "")

        # Article title
        title = html.select('meta[property="og:title"]')[0].attrs['content']

        # Persist the extracted content under its title
        writeArticle(content, title)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still stop
        # the crawler instead of being reported as a parse failure.
        print("該文章解析失敗 url:" + url)


# Write the article to disk so another rich-text editor can render it
def writeArticle(content, title):
    """Save *content* to ``<title>.txt`` in the current working directory.

    Characters that cannot appear in a file name on common platforms (path
    separators and ``: * ? " < > |``) are replaced with ``_`` so that a title
    such as ``a/b`` neither fails to open nor writes outside the directory.
    Titles without those characters produce the same file name as before.

    Args:
        content: Text to write to the file.
        title: Article title, used as the file name stem.
    """
    safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
    with open(safe_title + ".txt", "w") as f:
        f.write(content)

if __name__ == "__main__":
    # Parse every article URL listed in articleUrl.txt (produced by auto.py),
    # one URL per line.
    processed = 0
    # The with-block closes the file even if a fetch raises.
    with open("articleUrl.txt") as url_file:
        for line in url_file:
            # Drop surrounding whitespace, including "\r\n" line endings
            url = line.strip()
            if not url:
                # Blank lines are not URLs; skip them instead of reporting
                # a parse failure for an empty address.
                continue
            AnalysisUrl(url)
            processed += 1
            if processed >= 10:
                # Throttle: pause 3 seconds after every 10 articles
                time.sleep(3)
                processed = 0

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

簡書文章爬蟲解析

簡書文章爬蟲解析

核心解析

用vue quill editor解析

完整代碼

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

簡書文章爬蟲解析

核心解析

用vue quill editor解析

完整代碼

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av