# Regular-expression demo: extract the parts of a string that match a pattern.
import re  # imported here because these statements run before the file's main import block

str1 = 'ghmjfdhngbrghmgjhngbfrthnhfgbfv'
# findall returns a list with the text captured by the (non-greedy) group
# for every match of the pattern.
list1 = re.findall(r'gj(.*?)fr', str1)
print(list1)

str2 = '''hellobtevrqtyjumuynthbg
mybrtymuntbrgev
juntybrwtvewbynutybworld'''
# re.S makes "." match newlines too, so the capture may span several lines.
list1 = re.findall(r'hello(.*?)world', str2, re.S)
print(list1)
# 爬取全書網小說 (scrape a novel from the Quanshu site, quannovel.com)
import html
import os
import re
from urllib.parse import urljoin

import requests
# Scrape the first few chapters of a novel and save each chapter as a UTF-8
# text file inside a directory named after the book.
url = 'http://www.quannovel.com/read/620/'  # table-of-contents page of the book to scrape


def fetch_html(page_url, timeout=10):
    """Download *page_url* with GET and return the decoded body text.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error page
    is never parsed as if it were content.
    """
    resp = requests.get(page_url, timeout=timeout)
    resp.raise_for_status()
    # requests falls back to ISO-8859-1 when the server declares no charset,
    # which garbles Chinese text; let it detect the encoding in that case.
    if resp.encoding is None or resp.encoding.lower() == 'iso-8859-1':
        resp.encoding = resp.apparent_encoding
    return resp.text


def safe_filename(name):
    """Return *name* with characters that are invalid in file names replaced by ``_``."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).strip(' .')
    return cleaned or 'untitled'


def parse_book_name(index_html):
    """Return the book title found in the index page, or ``''`` if none is found.

    NOTE(review): the original pattern lost its HTML tags (it read
    '(.*?)(.*?)'), so the page's <title> element is used as a stand-in.
    Confirm against the real page markup and tighten the pattern if the
    title carries a site-name suffix.
    """
    match = re.search(r'<title>(.*?)</title>', index_html, re.S | re.I)
    if match is None:
        return ''
    return html.unescape(match.group(1)).strip()


def parse_chapters(index_html, base_url):
    """Return ``[(chapter_title, absolute_url), ...]`` in page order.

    The title and the link are captured from the same ``<a>`` element so the
    two can never get out of step. NOTE(review): assumes the chapter title is
    the anchor text of each ``.html`` link -- confirm against the real page.
    """
    chapters = []
    anchor = r'<a href="([^"]*?)\.html[^>]*>(.*?)</a>'
    for href, raw_title in re.findall(anchor, index_html, re.S):
        # Drop nested tags (e.g. <span>) and decode entities such as &amp;.
        title = html.unescape(re.sub(r'<[^>]+>', '', raw_title)).strip()
        if title:
            # urljoin handles both relative ("1.html") and absolute
            # ("/read/620/1.html") hrefs.
            chapters.append((title, urljoin(base_url, f'{href}.html')))
    return chapters


def extract_chapter_text(chapter_html):
    """Return the plain text of a chapter page, or ``None`` if the body is missing.

    NOTE(review): the two original ``replace`` literals were lost (both read
    ''), presumably an HTML entity and a line-break tag. Here line breaks are
    turned into newlines, remaining tags are removed and entities are decoded.
    """
    match = re.search(r'class="page-content ">(.*?)<div class', chapter_html, re.S)
    if match is None:
        return None
    body = re.sub(r'<br\s*/?>|</p>', '\n', match.group(1), flags=re.I)
    body = re.sub(r'<[^>]+>', '', body)
    body = html.unescape(body).replace('\xa0', ' ')
    return body.strip()


def main(index_url=url, out_root='D:/', max_chapters=5):
    """Download up to *max_chapters* chapters of the book at *index_url*.

    Files are written to ``<out_root>/<book name>/<chapter title>.txt``.
    """
    index_html = fetch_html(index_url)
    book_name = safe_filename(parse_book_name(index_html))
    chapters = parse_chapters(index_html, index_url)
    if not chapters:
        print(f'No chapter links found on {index_url}')
        return

    book_dir = os.path.join(out_root, book_name)
    os.makedirs(book_dir, exist_ok=True)  # create the book directory if needed

    for count, (title, chapter_url) in enumerate(chapters[:max_chapters], start=1):
        text = extract_chapter_text(fetch_html(chapter_url))
        if text is None:
            print(f'Skipped {title!r}: chapter body not found at {chapter_url}')
            continue
        chapter_path = os.path.join(book_dir, f'{safe_filename(title)}.txt')
        # Explicit UTF-8 so Chinese text is written correctly regardless of
        # the platform's default encoding.
        with open(chapter_path, 'w', encoding='utf-8') as file1:
            file1.write(text)
        print(f'第{count}章爬取完畢')


if __name__ == '__main__':
    main()