核心解析
# Parse an article URL and extract its title and content.
def AnalysisUrl(url):
    """Download one article page, simplify its HTML body and save it.

    The page is requested with the module-level ``headers``.  The article
    body is the first ``div.show-content-free`` element; every
    ``div.image-package`` wrapper in it is collapsed into a plain ``<img>``
    tag, ``<b>`` is rewritten to ``<strong>``, and the outer container
    markup is stripped.  The result is handed to ``writeArticle`` together
    with the title read from the ``og:title`` meta tag.

    Any ordinary error (network failure, missing element, unexpected
    markup) is reported on stdout and swallowed so that a caller looping
    over many URLs keeps going.  ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately not caught, so the run can still be aborted.

    Args:
        url: Address of the article page to download.

    Returns:
        None.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        # Parse the whole page.
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; pin one once the expected output has been confirmed.
        html = BeautifulSoup(resHtml)
        # The article body is the first "show-content-free" container.
        content = str(html.select('div[class="show-content-free"]')[0])
        # Suffix appended to each image URL; the width follows it.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Captures the max-width (in px) declared on the image container.
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)
        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Look the <img> up inside this very wrapper instead of pairing
            # two separately selected lists by index, which goes wrong as
            # soon as the lists differ in length or order.
            picture = package.find('img')
            d = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(package_html)[0]
            # Swap the whole wrapper for a bare <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + d + "\">")
        # Adjustments required by the target format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        # Drops every closing </div>, not only the container's own.
        content = content.replace("</div>", "")
        # The title comes from the Open Graph meta tag.
        title = html.select('meta[property="og:title"]')[0].attrs['content']
        # Hand the extracted content and title over for writing.
        writeArticle(content, title)
    except Exception:
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement this file is written for.
        print("該文章解析失敗 url:" + url)
- 解析過濾簡書自定義標簽,文章再用其他富文本逆向解析即可
用vue quill editor解析
vue quill editor解析顯示.PNG
完整代碼
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import time
# HTTP request headers; AnalysisUrl passes this dict to urllib2.Request.
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
# NOTE(review): this literal contains a "…" character, which looks like a
# value truncated when it was copied — confirm and restore the full string.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/56.0",
"Content-Type": "application/json;charset=UTF-8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
# NOTE(review): hard-coded session id; presumably stale by now — verify
# whether the target site needs it at all.
"Cookie": "JSESSIONID=2D1E55287F8B056E83FD29B114FBA389"
}
# Parse an article URL and extract its title and content.
def AnalysisUrl(url):
    """Download one article page, simplify its HTML body and save it.

    The page is requested with the module-level ``headers``.  The article
    body is the first ``div.show-content-free`` element; every
    ``div.image-package`` wrapper in it is collapsed into a plain ``<img>``
    tag, ``<b>`` is rewritten to ``<strong>``, and the outer container
    markup is stripped.  The result is handed to ``writeArticle`` together
    with the title read from the ``og:title`` meta tag.

    Any ordinary error (network failure, missing element, unexpected
    markup) is reported on stdout and swallowed so that a caller looping
    over many URLs keeps going.  ``KeyboardInterrupt`` and ``SystemExit``
    are deliberately not caught, so the run can still be aborted.

    Args:
        url: Address of the article page to download.

    Returns:
        None.
    """
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            resHtml = response.read()
        finally:
            # Release the connection even when reading the body fails.
            response.close()
        # Parse the whole page.
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; pin one once the expected output has been confirmed.
        html = BeautifulSoup(resHtml)
        # The article body is the first "show-content-free" container.
        content = str(html.select('div[class="show-content-free"]')[0])
        # Suffix appended to each image URL; the width follows it.
        imgpatt = '?imageMogr2/auto-orient/strip%7CimageView2/2/w/'
        # Captures the max-width (in px) declared on the image container.
        patternImgSize = re.compile(r'<div class="image-container" '
                                    r'style="max-width: (.*?)px;', re.S)
        for package in html.select('div[class="image-package"]'):
            package_html = str(package)
            # Look the <img> up inside this very wrapper instead of pairing
            # two separately selected lists by index, which goes wrong as
            # soon as the lists differ in length or order.
            picture = package.find('img')
            d = picture.attrs['data-original-src'] + imgpatt \
                + patternImgSize.findall(package_html)[0]
            # Swap the whole wrapper for a bare <img> tag.
            content = content.replace(package_html,
                                      "<img src=\"" + d + "\">")
        # Adjustments required by the target format.
        content = content.replace("<b>", "<strong>")
        content = content.replace("</b>", "</strong>")
        content = content.replace('<div class="show-content-free">', "")
        # Drops every closing </div>, not only the container's own.
        content = content.replace("</div>", "")
        # The title comes from the Open Graph meta tag.
        title = html.select('meta[property="og:title"]')[0].attrs['content']
        # Hand the extracted content and title over for writing.
        writeArticle(content, title)
    except Exception:
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement this file is written for.
        print("該文章解析失敗 url:" + url)
# Write the article to disk so that a rich-text editor can load it later.
def writeArticle(content, title):
    """Save *content* to ``<title>.txt`` relative to the working directory.

    Path separators and the characters Windows reserves in file names, as
    well as CR/LF/TAB, are replaced with ``_``.  Without this a title such
    as ``"a/b: c?"`` makes ``open`` fail (or point into another directory)
    and the article is lost.  Titles without such characters map to the
    same file name as before.

    Args:
        content: Article markup to store; written unchanged.
        title: Article title, used as the stem of the file name.

    Returns:
        None.
    """
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)
    with open(safe_title + ".txt", "w") as f:
        f.write(content)
if __name__ == "__main__":
    # Parse every article whose URL is listed in articleUrl.txt (the file
    # produced by auto.py), one URL per line.
    processed = 0
    # The with-block closes the file; iterating it stops cleanly at EOF,
    # so no empty URL is handed to AnalysisUrl after the last line.
    with open("articleUrl.txt") as url_file:
        for line in url_file:
            # Drop surrounding spaces, tabs and the line terminator.
            url = line.strip()
            if not url:
                # Blank lines carry no URL; skip them without counting.
                continue
            AnalysisUrl(url)
            processed += 1
            if processed >= 10:
                # Throttling: pause 3 seconds after every 10 articles.
                time.sleep(3)
                processed = 0