1. Exception handling
import urllib.request
import urllib.error

url = "http://www.mobiletrain.org/afd?pinzhuanbdtg=biaoti"
req = urllib.request.Request(url=url)
# res = urllib.request.urlopen(req)
try:
    res = urllib.request.urlopen(req)
    print("0")
except urllib.error.HTTPError as e:
    # HTTPError is raised when the request reaches the server but the
    # response comes back with an error status (e.g. 404 or 500)
    print("1")
    print(e)
except urllib.error.URLError as e:
    # URLError is raised when the URL itself is bad, e.g. the domain
    # does not exist or the network is unreachable
    print("2")
    print(e)
except Exception as e:
    print(e)
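An HTTPError object also exposes the status code, reason phrase, and response headers; a minimal sketch of inspecting them (the missing-page URL is a made-up example):

import urllib.request
import urllib.error

try:
    urllib.request.urlopen("http://www.mobiletrain.org/no-such-page")
except urllib.error.HTTPError as e:
    print(e.code)     # numeric status code, e.g. 404
    print(e.reason)   # reason phrase, e.g. "Not Found"
    print(e.headers)  # headers sent with the error response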
2. Weibo login with a Cookie header, saving the page to an HTML file
import urllib.request

url = "https://weibo.cn/6370062528/info"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Cookie": "SCF=Ahz9Jk7TyV7zzLvwoCxFjRRTbASUHA9Jp8RcRyRaht68K11D0lYQBg5j9No1B157Zgv7Lx5COUC7DNdjo8APyKc.; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W59rVpGeZ7yg7I7HR0hyYPg5JpX5KzhUgL.Foq0S057Sozfeon2dJLoI05LxKML1heLB-BLxKqL1heL1h-LxKML1-2L1hBLxKqLBoeLBKzLxKqLBoeLBKz41K.t; SUB=_2A253_53KDeRhGeBN7FIR9izJyTSIHXVVAyOCrDV6PUJbkdANLW75kW1NRFYvPyNwa0XZ3DB_CJthDJ6-896SR-uQ; SUHB=0FQbkbkZxmWpX1; _T_WM=ae4d1af9302c562727db8db7ad8ef936"
}
# HTTP is a stateless protocol: it keeps no record of the current session.
# When we do need state (e.g. to access pages that require a logged-in
# user), we rely on cookies or sessions to carry the user's identity.
# Create the request object
req = urllib.request.Request(url=url, headers=headers)
# Send the request
res = urllib.request.urlopen(req)
# Write the response body to weibo.html
with open("weibo.html", "wb") as fp:
    fp.write(res.read())
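If the hard-coded cookie has expired, Weibo typically redirects to its login page instead of returning the profile; a small sanity check (a sketch) before trusting the saved file:

# geturl() returns the final URL after any redirects, so landing on the
# passport host suggests the cookie is no longer valid
if "passport.weibo.cn" in res.geturl():
    print("Cookie expired; log in again and copy a fresh Cookie header")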
3. Handlers and the opener + handler request mechanism
import urllib.request

# A Request object cannot keep session state by itself; that job belongs
# to handler objects, and requests that need them go through the
# opener + handler mechanism.
# 1. Create a handler object
handler = urllib.request.HTTPHandler()  # handles plain-HTTP transport; cookie storage specifically comes from HTTPCookieProcessor (next section)
# 2. Create an opener object
opener = urllib.request.build_opener(handler)  # used to send requests
# The opener carries the handler, so every request it sends passes
# through whatever the handler provides.
# Build the request
url = "http://www.baidu.com/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
req = urllib.request.Request(headers=headers, url=url)
# urllib.request.urlopen(req)  # a bare urlopen() call bypasses our opener
# Send the request through the opener
res = opener.open(req)  # with a cookie-aware handler attached, this is where stored cookies for "http://www.baidu.com" would be sent
print(res)
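To make every later urlopen() call go through the same opener, the standard library also offers install_opener (a sketch):

import urllib.request

handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
# Register the opener globally; from here on urllib.request.urlopen()
# routes through it, so the opener does not have to be passed around
urllib.request.install_opener(opener)
res = urllib.request.urlopen("http://www.baidu.com/")
print(res.status)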
4. Full Weibo login
import urllib.request
import urllib.parse
# Handling cookies also needs the cookiejar module
import http.cookiejar

# Login flow
# 1. Create a CookieJar object to store cookies
cookie = http.cookiejar.CookieJar()
# 2. Create a handler object that carries the cookie jar
handler = urllib.request.HTTPCookieProcessor(cookie)  # another handler class (a BaseHandler subclass)
# 3. Create an opener object that carries the handler
opener = urllib.request.build_opener(handler)
# The opener's handler now has the cookie jar attached, so requests sent
# with this opener will store and resend cookies automatically
# Login URL
url = "https://passport.weibo.cn/sso/login"
# Request headers
headers_login = {
    'Accept': '*/*',
    # Accept-Encoding: gzip, deflate, br
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # Content-Length: 162
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'passport.weibo.cn',
    'Origin': 'https://passport.weibo.cn',
    # 'Referer': 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=',
    'Referer': 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https%3A%2F%2Fweibo.cn%2F%3Fluicode%3D20000174&backTitle=%CE%A2%B2%A9&vt=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Request body (form fields)
form_data = {
    'username': '18610593606',
    'password': 'f12345678',
    'savestate': '1',
    'r': 'https://weibo.cn/',
    'ec': '0',
    'pagerefer': '',
    'entry': 'mweibo',
    'wentry': '',
    'loginfrom': '',
    'client_id': '',
    'code': '',
    'qq': '',
    'mainpageflag': '1',
    'hff': '',
    'hfp': ''
}
# Encode the form fields into URL-parameter form (the POST body)
data = urllib.parse.urlencode(form_data).encode("utf-8")
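# urlencode joins the dict into "key=value" pairs, e.g. {'a': '1', 'b': '2'}
# becomes "a=1&b=2"; .encode("utf-8") then yields the bytes object that
# Request expects as a POST body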
# print(data)
# Build a POST request (passing data makes it a POST)
req = urllib.request.Request(url=url, headers=headers_login, data=data)
# Log in
# urllib.request.urlopen(req)  # a bare urlopen() call cannot carry the cookie
res = opener.open(req)
print(res.read().decode("utf-8"))
# Fetch the profile page
info_url = "https://weibo.cn/6370062528/info"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
req = urllib.request.Request(url=info_url, headers=headers)
res = opener.open(req)  # if the login succeeded, the opener carries this user's cookies, so we can fetch the user's profile
print(res.read().decode("gb2312"))
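To inspect what the login actually stored, the CookieJar is directly iterable (a sketch; the cookie names depend on what the server set):

# Each cookie the server set during login lives in the jar and is
# attached automatically by opener.open() on later requests
for c in cookie:
    print(c.name, c.value, c.domain)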
5. Using a proxy server to send requests on our client's behalf
import urllib.request

url = "https://www.baidu.com/s?wd=ip"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
# Some sites' anti-crawling logic watches how often an IP address visits,
# decides whether it is a crawler, and then blocks it.
# By configuring a proxy server, we let the proxy send the request in
# place of our client and relay the response back to us.
# Create the request object
req = urllib.request.Request(url=url, headers=headers)
# Configure the proxy: ProxyHandler maps URL schemes to proxy addresses,
# so the key must match the scheme of the URL being requested
handler = urllib.request.ProxyHandler({"http": "114.113.126.86:80",
                                       "https": "114.113.126.86:80"})
# The handler object carries the proxy server's IP address and port
# Create an opener that carries the handler
opener = urllib.request.build_opener(handler)
# Send the request
res = opener.open(req)
with open("ip.html", "wb") as fp:
    fp.write(res.read())
# In real projects, proxy servers usually have to be purchased
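Free proxies go down often, so in practice the request is worth wrapping in the error handling from section 1 (a sketch; same placeholder proxy):

import urllib.error

try:
    res = opener.open(req, timeout=10)  # fail fast if the proxy is dead
    with open("ip.html", "wb") as fp:
        fp.write(res.read())
except urllib.error.URLError as e:
    print("proxy failed:", e.reason)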
6. A small regex example
import re
# Import the regular-expression module

string = "<div><title>按時(shí)發(fā)大水</title><div>asdfdsafsadasfds</div></div>"
pat = r'<div>.*'
s = re.findall(pat, string)
print(s)
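The pattern above is greedy: .* runs from the first <div> to the end of the string. A sketch of a non-greedy group that extracts just the title text:

import re

string = "<div><title>按時(shí)發(fā)大水</title><div>asdfdsafsadasfds</div></div>"
# (.*?) is non-greedy: it captures as little as possible, stopping at the
# first </title> rather than the last
pat = r'<title>(.*?)</title>'
print(re.findall(pat, string))  # ['按時(shí)發(fā)大水']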
7. Scraping Qiushibaike images to local disk
import urllib.request
import re
import os

# A function that builds the request object for one page URL
def handler_url(url, page):
    # Join the base url and the page number
    page_url = url + str(page)
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Create the request object
    req = urllib.request.Request(url=page_url, headers=headers)
    return req
# A function that sends one request and returns the page HTML
def request_qiubai(req):
    res = urllib.request.urlopen(req)
    html = res.read().decode("utf-8")
    # print(html)
    return html
# Parse the page data: extract the image urls
def anlysis_qiubai(html):
    # Match the image urls with a regular expression; re.S (dot-all mode)
    # is needed so that .*? can match across newlines
    pat = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=.*?>.*?</div>', re.S)
    res = pat.findall(html)
    # res holds protocol-relative urls, so prefix each one with "http:"
    imgs = []
    for url in res:
        img_url = "http:" + url
        imgs.append(img_url)
    print(imgs)
    return imgs
# Download one image into the ./images directory
def download_imgs(img_url, name):
    urllib.request.urlretrieve(img_url, "./images/" + name + '.jpg')
# Main logic
def main():
    url = "https://www.qiushibaike.com/pic/page/"
    # Ask for the page range to fetch
    start_page = input("Start page: ")
    end_page = input("End page: ")
    print("Downloading...")
    # Make sure the output directory exists, otherwise urlretrieve fails
    os.makedirs("./images", exist_ok=True)
    # Loop from the start page to the end page, inclusive
    img_name = 0
    for i in range(int(start_page), int(end_page) + 1):
        # Build the request object for this page
        page_req = handler_url(url, i)
        # Send the request
        qiu_html = request_qiubai(page_req)
        # Parse the page
        res = anlysis_qiubai(qiu_html)
        # Download the images
        for img in res:
            print("Downloading: " + img)
            download_imgs(img, str(img_name))
            img_name += 1
    print("Done!")

if __name__ == '__main__':
    main()
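urlretrieve sends urllib's default User-Agent, which some sites reject; a sketch of an alternative downloader that reuses the same headers as handler_url (the function name is made up):

def download_imgs_with_headers(img_url, name):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    req = urllib.request.Request(url=img_url, headers=headers)
    res = urllib.request.urlopen(req)
    with open("./images/" + name + ".jpg", "wb") as fp:
        fp.write(res.read())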
8. XPath syntax basics
# XPath is a syntax for extracting target elements from an XML document
# based on its document structure.
# HTML is essentially a special kind of XML document.
from lxml import etree
# The lxml library parses and extracts content from XML files; etree
# represents the content of an XML (or HTML) document as a tree.

# How to find content in HTML with XPath:
# 1. Read the whole HTML document with etree and build the tree
html_tree = etree.parse("./test.html")
print(html_tree)  # <lxml.etree._ElementTree object at 0x000001FE48F7AB08>
# 2. Locate target nodes by walking the tree structure
ret = html_tree.xpath("/html/body/ol/li")
ret = html_tree.xpath("/html/body/div/div/a")
# 3. Get a node's text content or attribute values
ret = html_tree.xpath("/html/body/ol/li/text()")
ret = html_tree.xpath("/html/body/div/div/a/@href")
# 4. Locating nodes
# 1) By hierarchy
# Find all li elements
ret = html_tree.xpath("//body/div//li/text()")
# In XPath, "//" allows any number of levels in between, while "/" is a
# direct parent-child step with nothing in between
# 2) By attribute
# Find all li elements that have an id attribute
ret = html_tree.xpath("//li[@id]/text()")
# Find all li elements whose class is 'dudu'
ret = html_tree.xpath("//li[@class='dudu']/text()")
# Fuzzy matching
# Find all li elements whose class starts with 'h'
ret = html_tree.xpath("//li[starts-with(@class,'h')]/text()")
# Find all li elements whose class contains 'a'
ret = html_tree.xpath("//li[contains(@class,'a')]/text()")
# Logical operators
# Find li elements whose class is 'hehe' and whose id is 'tata'
ret = html_tree.xpath("//li[@class='hehe' and @id='tata']/text()")
# Find li elements whose class is 'hehe' or 'haha'
ret = html_tree.xpath("//li[@class='hehe' or @class='haha']")
ret = html_tree.xpath("//div[@class='hh kk']/text()")
# [Note] A node returned by xpath can itself serve as the root for
# further queries; prefix the expression with "." to make it relative to
# that node (a bare "//" still searches the whole document)
obj = html_tree.xpath("//div[@id='pp']")[0]
ret = obj.xpath(".//div")
print(ret)
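The examples above assume a local test.html; when the HTML comes back from a crawl as a string, etree.HTML builds the same kind of tree (a sketch with made-up markup):

from lxml import etree

html = "<html><body><ol><li id='one'>first</li><li>second</li></ol></body></html>"
# etree.HTML parses a string (filling in missing html/body tags if
# needed) and returns the root element of the tree
root = etree.HTML(html)
print(root.xpath("//li/text()"))       # ['first', 'second']
print(root.xpath("//li[@id]/text()"))  # ['first']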