# Simplify unnecessary details — a real-world exercise in consciously optimizing the code o(▽)o
# -*- coding: utf-8 -*-
#導(dǎo)入re模塊
import re
import urllib.request
from bs4 import BeautifulSoup
# Regex patterns for link extraction on zdic.net pages.
# A radical (bushou) identifier appears URL-encoded as three percent-escaped
# bytes, i.e. one UTF-8 CJK character: %E4%B8%80 etc.
bspattern = re.compile(r"(?:%[^%']{2}){3}")
# Word-entry links of the form /c/....htm (kept for reference; the character
# pages below are scanned with an inline /z/ pattern instead).
cipattern = re.compile(r"/c/[^']*?htm")


def _fetch(url, referer=None):
    """Fetch *url* and return the response body decoded as UTF-8.

    A Referer header is attached when *referer* is given — zdic.net serves
    different content without it.  NOTE(review): the site is assumed to be
    UTF-8 encoded; confirm if pages start coming back garbled.
    """
    req = urllib.request.Request(url)
    if referer is not None:
        req.add_header('Referer', referer)
    response = urllib.request.urlopen(req)
    return response.read().decode('utf8')


# Step 1: fetch the radical index page and extract the URL-encoded radicals.
index_page1 = _fetch('http://www.zdic.net/c/cibs/')
bslist = re.findall(bspattern, index_page1)

# Step 2: walk radical -> character -> word-entry pages, collecting word URLs.
# BUG FIX: the original wrote to an undefined name `outfile`; open it here
# (the `with` block also guarantees the file is closed on any exit path).
with open('zdic_words.txt', 'w', encoding='utf8') as outfile:
    for bs in bslist:
        print(bs)
        # BUG FIX: the original referenced an undefined name `b` and
        # clobbered its own loop variable; build the URL from `bs` instead.
        bu = "http://www.zdic.net/c/cibs/bs/?bs=" + bs
        index_z = _fetch(bu, referer='http://www.zdic.net/c/cibs/')
        # The same percent-escape pattern also matches the URL-encoded
        # characters listed on each radical page.
        zlist = re.findall(bspattern, index_z)
        for z in zlist:
            if len(z) != 0:
                zurl = "http://www.zdic.net/c/cibs/ci/?z=" + z
                print(zurl)
                index_c = _fetch(zurl, referer='http://www.zdic.net/c/cibs/')
                # Extract word-entry links (/z/....htm) from the page.
                clist = re.findall(r"/z/[^']*?\.htm", index_c)
                for uc in clist:
                    # write() needs a str, not a generator — join pieces here.
                    outfile.write("http://www.zdic.net/" + uc + '\n')