Urllib库是Python用于操作Url的标准模块,Python2.x时分为Urllib和Urllib2,Python3.x时合并到Urllib里面。这里把常见的变化列举一下,便于查找修改。
官方文档:https://docs.python.org/3.6/library/urllib.html
| Python2.x | Python3.x |
|---|---|
| import urllib2 | import urllib.request,urllib.error |
| import urllib | import urllib.request,urllib.error,urllib.parse |
| import urlparse | import urllib.parse |
| urllib2.urlopen | urllib.request.urlopen |
| urllib2.Request | urllib.request.Request |
| urllib.quote | urllib.parse.quote |
| urllib.urlencode | urllib.parse.urlencode |
| cookielib.CookieJar | http.cookiejar.CookieJar |
1.简单读取网页信息:urllib需指定内容的解码方式,requests可自动解码。
import urllib.request

# urlopen() returns a response object whose read() yields bytes and consumes
# the stream. Read once and decode the saved bytes: a second read() on the
# same response returns b'' and would leave html2 empty.
f = urllib.request.urlopen('http://python.org/')
html1 = f.read()                # raw bytes
html2 = html1.decode('utf-8')   # text; the decoded form is the one normally used
f.close()

# The same thing written with a context manager, which closes the response
# automatically:
with urllib.request.urlopen('http://python.org') as f:
    html = f.read().decode('utf-8')
    print(f.status)
    print(html)

# `html` above corresponds to r.text in the requests library:
import requests
r = requests.get('http://python.org')
print(r.status_code)
print(r.text)         # r.text decodes the body using the encoding requests guessed
print(r.encoding)     # the encoding the Response object is currently using
r.encoding = 'utf-8'  # assign to override the encoding used by later r.text accesses
2.urllib对含中文的URL进行手动编码
import urllib.parse

# quote()/unquote() are documented in urllib.parse; urllib.request only
# happens to re-export them as an implementation detail.
a = urllib.parse.quote("中文")   # percent-encode the UTF-8 bytes of the text
b = urllib.parse.unquote(a)      # decode back to the original text
print(a, b)
结果为:%E4%B8%AD%E6%96%87 中文
3.使用Request对象添加headers进行请求
import urllib.request

hds = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
req = urllib.request.Request('http://python.org')
# add_header(key, val) takes the header name and value as two separate
# arguments. Reuse the dict above so both variants send the same User-Agent.
req.add_header('User-Agent', hds['User-Agent'])
with urllib.request.urlopen(req) as f:  # urlopen accepts a URL string or a Request
    html = f.read().decode('utf-8')

# The requests equivalent: pass the whole header dict via headers=.
import requests
hds = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
r = requests.get('http://python.org', headers=hds)
4.超时设置
import urllib.request

# Pass timeout= (in seconds) to urlopen; it works with a Request object or a
# plain URL string. `req` is the Request built in the headers example.
f = urllib.request.urlopen(req, timeout=1)
f.close()
f = urllib.request.urlopen('http://python.org', timeout=1)
f.close()

# Full usage: retry on failure. One second suits a normally responsive site;
# raise the timeout for slow servers.
for i in range(10):  # at most 10 attempts
    try:
        with urllib.request.urlopen('http://python.org', timeout=1) as f:
            print(f.read().decode('utf-8')[:100])
        break
    # URLError, socket timeouts and connection resets all derive from OSError;
    # catching it retries network failures without hiding programming errors.
    except OSError as e:
        print("出现异常: " + str(e))

# The requests library works the same way.
import requests
for i in range(10):  # at most 10 attempts
    try:
        r = requests.get('http://python.org', timeout=0.25)
        print(r.text[:100])
        break
    # RequestException is the base class of the errors requests raises,
    # including timeouts and connection errors.
    except requests.exceptions.RequestException as e:
        print("第{}次请求出现异常:".format(i + 1) + str(e))
        print(type(e))
5.下载HTML文件到本地
同理:图片、MP3、视频等文件格式也是用‘wb’形式下载。
# Method 1: read the raw bytes and write them in binary mode.
import urllib.request

with urllib.request.urlopen("http://www.baidu.com") as resp:
    html = resp.read()
with open("1.html", "wb") as f:  # binary mode, so the bytes need no decoding
    f.write(html)

# Method 2 (most convenient):
# urlretrieve(url, filename=None, reporthook=None, data=None)
#   reporthook (optional): callback that can be used to show download progress.
#   data (optional): data to POST to the server.
#   filename may also be passed positionally as the second argument.
urllib.request.urlretrieve("http://www.baidu.com", filename="1.html")

# Method 3: requests; r.content is the raw body as bytes.
import requests
r = requests.get("http://www.baidu.com")
with open("1.html", 'wb') as f:
    f.write(r.content)

# Other file types work the same way. "XXX" is a placeholder for the server
# address and must be replaced with a full URL (including the scheme) first.
urllib.request.urlretrieve("XXX.jpg", filename="1.jpg")
urllib.request.urlretrieve("XXX.mp3", filename="1.mp3")
urllib.request.urlretrieve("XXX.rmvb", filename="1.rmvb")
6.get请求实例
get请求的url地址格式:http://网址?字段名1=内容1&字段名2=内容2
http://www.baidu.com/s?wd="python"&rqlang=cn # wd代表关键字, rqlang代表区域
import urllib.parse
import urllib.request

base_url = "http://www.baidu.com/s?wd="
keyword = "Python爬虫"
# urlopen needs an ASCII URL, so percent-encode the non-ASCII keyword first.
url = base_url + urllib.parse.quote(keyword)
with urllib.request.urlopen(url) as resp:
    html = resp.read()
with open("1.html", "wb") as f:
    f.write(html)

# requests variant: the library percent-encodes non-ASCII characters in the
# URL itself; r.url holds the URL that was actually requested.
import requests
base_url = "http://www.baidu.com/s?wd="
keyword = "Python爬虫"
url = base_url + keyword
r = requests.get(url)
with open("2.html", "wb") as f:
    f.write(r.content)
7.使用代理:urllib.request.ProxyHandler
import urllib.request

# Proxy mapping: keys are the URL scheme of the request to be proxied
# ('http', 'https', ...), values are the proxy address. A key such as 'sock5'
# never matches an http:// request, so that proxy would be silently bypassed;
# urllib.request has no SOCKS proxy support.
proxy2 = {'http': '183.51.191.203:9797'}
# Build a handler from the mapping.
proxy_handler = urllib.request.ProxyHandler(proxy2)
# Create an opener that routes requests through the proxy handler.
opener = urllib.request.build_opener(proxy_handler)
# Install it as the global default so plain urlopen() calls use it too.
urllib.request.install_opener(opener)
with urllib.request.urlopen("http://www.baidu.com") as resp:
    a = resp.read().decode("utf8")
print(len(a))
8.开启Debuglog:urllib.request.HTTPHandler,urllib.request.HTTPSHandler
import urllib.request

# Create one handler per scheme with debuglevel=1 so the HTTP exchange is
# printed while the request runs.
http_handler, https_handler = (
    handler_cls(debuglevel=1)
    for handler_cls in (urllib.request.HTTPHandler, urllib.request.HTTPSHandler)
)
# Combine both handlers into an opener and make it the global default, so the
# plain urlopen() call below goes through them.
opener = urllib.request.build_opener(http_handler, https_handler)
urllib.request.install_opener(opener)
urllib.request.urlopen("https://www.baidu.com")
9.异常处理:URLError,子类HTTPError
- 触发URLError的原因有以下四种可能:
①连接不上服务器
②远程URL不存在
③无网络
④触发HTTPError
# Approach 1: catch the subclass HTTPError first, then the base URLError.
import urllib.request
import urllib.error

try:
    # Swap in "http://www.google.com" to exercise the URLError branch instead.
    urllib.request.urlopen("https://login.taobao.com/member")  # HTTPError case
except urllib.error.HTTPError as err:
    print(err.code, err.reason)
except urllib.error.URLError as err:
    print(err.reason)

# Approach 2: catch only URLError and probe for the optional attributes.
import urllib.request
import urllib.error

try:
    # Swap in "http://www.google.com" to exercise the plain URLError case.
    urllib.request.urlopen("https://login.taobao.com/member")
except urllib.error.URLError as err:
    # hasattr is a builtin (its help text is quoted below); print whichever
    # of the two attributes this exception actually carries.
    for attr in ("code", "reason"):
        if hasattr(err, attr):
            print(getattr(err, attr))
'''
hasattr(obj, name, /)
Return whether the object has an attribute with the given name.
This is done by calling getattr(obj, name) and catching AttributeError.
'''
- HTTP状态码以及含义
| 状态码 (e.code) | 英文(e.reason) | 含义 |
|---|---|---|
| 200 | OK | 一切正常 |
| 301 | Moved Permanently | 重定向到新的URL,永久性 |
| 302 | Found | 重定向到新的URL,非永久性 |
| 304 | Not Modified | 请求的资源未更新 |
| 400 | Bad Request | 非法请求 |
| 401 | Unauthorized | 请求未经授权 |
| 403 | Forbidden | 禁止访问 |
| 404 | Not Found | 没有找到对应页面 |
| 500 | Internal Server Error | 服务器内部错误 |
| 501 | Not Implemented | 服务器不支持实现请求所需要的功能 |
10.post请求
import urllib.request
import urllib.parse

url = "https://www.douban.com/accounts/login"
params = {
    'source': 'index_nav',
    'form_email': 'XXXX',     # account
    'form_password': 'XXXX',  # password
}
# urlencode() builds the "a=XXX&b=XXX" form body; it must be bytes for urlopen.
postdata = urllib.parse.urlencode(params).encode('utf-8')
req = urllib.request.Request(url, postdata)  # supplying data makes this a POST
with urllib.request.urlopen(req) as resp:
    html = resp.read()
with open('1.html', 'wb') as f:
    f.write(html)

# requests variant: pass the dict as data= and the library form-encodes it.
import requests
url = "https://www.douban.com/accounts/login"
params = {
    'source': 'index_nav',
    'form_email': 'XXXX',     # account
    'form_password': 'XXXX',  # password
}
r = requests.post(url, data=params)
with open('1.html', 'wb') as f:
    f.write(r.content)
#注:
urlencode:对key-value的字典数据进行编码转换,返回类似“a=XXX&b=XXX”的结果。
quote:对单个字符串进行编码转换,返回编码后的一串字符,多用于中文字符的编码。
11.使用cookies
import urllib.request
import urllib.parse
import http.cookiejar

url = "https://www.douban.com/accounts/login"
params = {
    'source': 'index_nav',
    'form_email': 'XXXX',     # account
    'form_password': 'XXXX',  # password
}
postdata = urllib.parse.urlencode(params).encode('utf-8')  # form-encode as bytes
# Build the Request via the fully qualified name: only `urllib.request` is
# imported, so a bare `request.Request` would raise NameError.
req = urllib.request.Request(url, postdata, method="POST")
# A CookieJar stores the cookies; HTTPCookieProcessor connects it to an opener.
cj = http.cookiejar.CookieJar()
pro = urllib.request.HTTPCookieProcessor(cj)
opener = urllib.request.build_opener(pro)
# Install the opener as the global default so plain urlopen() calls use it.
urllib.request.install_opener(opener)
with urllib.request.urlopen(req) as resp:
    html1 = resp.read()
with open('1.html', 'wb') as f:
    f.write(html1)

# requests variant: send an existing cookie string in the Cookie header.
import requests
url = "https://www.douban.com/accounts/login"
headers = {
    'Cookie': 'xxxxxxx'
}
r = requests.get(url, headers=headers)
print(r.text)