從HTML文件獲取純文本
通過(guò)BeautifulSoup獲取純文本
之前是通過(guò)BeautifulSoup (bs4)獲取純文本的,簡(jiǎn)單演示如下:
from bs4 import BeautifulSoup

htmfile = 'myweb.htm'
# Use a context manager so the file handle is always closed,
# even if read() raises — the original leaked the open handle.
with open(htmfile, 'r', encoding='utf-8') as html:
    htmlpage = html.read()
soup = BeautifulSoup(htmlpage.strip(), 'html.parser')
print(soup.text)
但是這樣做的問(wèn)題在于,其控制文本的格式或許與瀏覽器端顯示的不一樣
瀏覽器端顯示的格式:

bs4抓取的文本格式如下:

為什么會(huì)這樣呢?
因?yàn)樵趆tml源碼中,存在這樣的換行:

可以理解為 bs4 僅僅去除了 HTML 中的標(biāo)簽等標(biāo)記,但并沒有考慮讓文本格式與瀏覽器的顯示保持一致。
經(jīng)過(guò)一番搜索,終于找到相應(yīng)的方案
通過(guò)HTMLParser獲取純文本
HTMLParser是可以無(wú)視tag中的換行符的,如同瀏覽器一樣,只要在tag中的文本,如<p>等,無(wú)論是否換行,在瀏覽器都是顯示為一行。
安裝:
pip install HTMLParser
但是安裝之后運(yùn)行,或許會(huì)提示找不到markupbase module的錯(cuò)誤。
可以去如下地址下載:markupbase
然后將_markupbase.py更名為markupbase.py,并拷貝到python安裝路徑的:Lib\site-packages目錄下,如: D:\python36\Lib\site-packages
具體的代碼如下,我是將其單獨(dú)寫在一個(gè)python代碼文件中,方便重用:
from re import sub
from sys import stderr
#need download https://pypi.org/project/micropython-_markupbase/3.3.3-1/#files,
# and copy _markupbase.py to \Lib\site-packages, then rename it to markupbase.py
# this library make html content to be right format
# 不能直接使用HTMLParser,而是用html.parser,
# 否則如果環(huán)境是python3.x,會(huì)遇到兼容問(wèn)題
# from HTMLParser import HTMLParser
from html.parser import HTMLParser
from traceback import print_exc
from bs4 import BeautifulSoup
class _DeHTMLParser(HTMLParser):
    """Turn HTML into plain text whose line breaks mimic a browser's rendering."""

    # Newline prefix a browser would render when one of these tags opens.
    _BLOCK_BREAKS = {'p': '\n\n', 'br': '\n', 'div': '\n'}

    def __init__(self):
        HTMLParser.__init__(self)
        self._chunks = []

    def handle_data(self, data):
        """Collapse runs of whitespace inside text, like a browser does."""
        stripped = data.strip()
        if stripped:
            self._chunks.append(sub('[ \t\r\n]+', ' ', stripped) + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit the line break(s) matching the block-level tag, if any."""
        brk = self._BLOCK_BREAKS.get(tag)
        if brk is not None:
            self._chunks.append(brk)

    def handle_startendtag(self, tag, attrs):
        """A self-closing <br/> is treated as a paragraph break."""
        if tag == 'br':
            self._chunks.append('\n\n')

    def text(self):
        """Return the accumulated plain text, trimmed at both ends."""
        return ''.join(self._chunks).strip()
def dehtml(text):
    """Return the plain text of the HTML string *text*.

    Prefers _DeHTMLParser so line breaks match what a browser renders.
    If parsing fails, the traceback is printed to stderr and the function
    falls back to BeautifulSoup's flat text extraction.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    # Catch Exception, not everything: a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit, which should propagate.
    except Exception:
        print_exc(file=stderr)
        # Degrade gracefully: bs4 ignores source-level line breaks but is robust.
        soup = BeautifulSoup(text, 'html.parser')
        return soup.text
之后提取的文本就是符合我們格式要求的:

主程序代碼
包括:解壓htm壓縮包,提取純文本,拆分文本文件
拆分文本文件,我是每500行就拆為一個(gè)小的文本文件
但是,為了避免把一個(gè)完整的段落拆到多個(gè)文本文件中,加入了「最后一行必須以英文句號(hào)「.」結(jié)尾」的判斷(針對(duì)英文文本);如果處理的是漢語(yǔ)文本,可以改為判斷結(jié)尾是否為漢字句號(hào)「。」。
壓縮文件相對(duì)于程序代碼路徑:./htmzip
解壓的htm文件相對(duì)于程序代碼路徑:./htm
txt文件相對(duì)于程序代碼路徑:./txt
完整主程序代碼:
import os
import parsehtmltext
import zipfile
def startjob():
    """Process every zip archive under ./htmzip: unzip it, then pull the
    plain text out of its first .htm file and split it into .txt chunks."""
    archives = getfilelist('./htmzip', '.zip')
    for index, archivepath in enumerate(archives, start=1):
        zipname = os.path.split(archivepath)[1]
        print('handle the %d file: %s start' % (index, zipname))
        target = './htm/%s' % zipname
        unzip(archivepath, target)
        htmfiles = getfilelist(target, '.htm')
        # Only the first .htm of each archive is converted, matching the
        # expected one-document-per-zip layout.
        if htmfiles:
            produced = splithtmltotxt(htmfiles[0], './txt/%s' % zipname)
            print(produced)
def getfilelist(folderpath, extension):
    """Recursively collect files under *folderpath* whose name ends with
    *extension* (case-insensitive); returned paths use forward slashes."""
    suffix = extension.lower()
    matches = []
    for root, _dirs, names in os.walk(folderpath):
        matches.extend(
            os.path.join(root, name).replace('\\', '/')
            for name in names
            if name.lower().endswith(suffix)
        )
    return matches
def splithtmltotxt(htmfile, txtfolder):
    """Extract plain text from *htmfile* and split it into ~500-line .txt files.

    A chunk is only flushed once it holds at least 500 lines AND the current
    line ends with '.', so an English paragraph is never split across files.
    Returns the list of paths of the files written under *txtfolder*.
    """
    # Close the source file promptly instead of leaking the handle.
    with open(htmfile, 'r', encoding='utf-8') as html:
        htmlpage = html.read()
    wholetextlines = parsehtmltext.dehtml(htmlpage).split('\n')
    # A single very long line means the HTML yielded no usable breaks:
    # fall back to splitting on sentence boundaries instead.
    if len(wholetextlines) == 1 and len(wholetextlines[0].strip()) > 100:
        wholetextlines = wholetextlines[0].split('.')
    # makedirs tolerates an existing folder and creates missing parents,
    # where the original os.mkdir would raise.
    os.makedirs(txtfolder, exist_ok=True)
    txtfilelist = []
    textblock = []        # lines buffered for the current output file
    count = 1             # 1-based index over all input lines
    eachcount = 0         # lines buffered so far for the current file
    txtfilecount = 1      # sequence number of the next output file
    totalcount = len(wholetextlines)
    for line in wholetextlines:
        if line.split():  # skip whitespace-only lines
            textblock.append(line.strip() + '\n')
            eachcount += 1
        # Flush on a sentence boundary after >=500 lines, or at end of input.
        # NOTE(review): line.endswith('\n') can never be true after
        # split('\n'); kept for behavioral compatibility — confirm intent.
        if (eachcount >= 500
                and (line.endswith('\n') or line.strip().endswith('.'))) \
                or count == totalcount:
            txtfilename = '%s/%d.txt' % (txtfolder, txtfilecount)
            with open(txtfilename, 'w', encoding='utf-8') as f:
                f.writelines(textblock)
            txtfilelist.append(txtfilename)
            # Reset the per-file buffer and counter.
            textblock = []
            eachcount = 0
            txtfilecount += 1
        count += 1
    return txtfilelist
def unzip(sourcefile, unzipfolder):
    """Extract every member of the zip archive *sourcefile* into *unzipfolder*.

    The target folder (including any missing parent directories) is created
    when absent.
    """
    # makedirs(exist_ok=True) also creates missing parents, where the
    # original os.mkdir would raise FileNotFoundError.
    os.makedirs(unzipfolder, exist_ok=True)
    with zipfile.ZipFile(sourcefile) as zip_file:
        # extractall() is equivalent to extracting each name in namelist().
        zip_file.extractall(unzipfolder)
# Run the whole pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    startjob()