Python學(xué)習(xí)筆記之 爬蟲
urllib 模塊提供了一系列用于操作URL的功能
urllib 爬取網(wǎng)頁
import urllib.request

# Send a request to the given URL and get back the server's response
# (a file-like object).
# BUG FIX: the original URL "http://www.baicu.com" was a typo for
# "http://www.baidu.com" (see the urlretrieve example below, which uses baidu).
response = urllib.request.urlopen("http://www.baidu.com")
# read() returns the whole payload as a single bytes object
data = response.read()
print(data)
print(type(data))
# Read a single line instead:
# data = response.readline()
# Read the whole payload as a list of lines:
# data = response.readlines()
'''
print(data)
print(type(data))
print(len(data))
print(type(data[100].decode("utf-8")))
'''
# Write the crawled page to a local file:
# with open(r"C:\Users\xlg\Desktop\Python-1704\day18\file\file1.html", "wb") as f:
#     f.write(data)

# response attributes
# Response headers / environment information
print(response.info())
# HTTP status code
print(response.getcode())
# if response.getcode() == 200 or response.getcode() == 304:
#     pass  # process the page here
# The URL that was actually fetched (after any redirects)
print(response.geturl())
將爬取的網(wǎng)頁直接寫入文件
import urllib.request

# Download a page straight into a local file in one call.
urllib.request.urlretrieve(
    "http://www.baidu.com",
    filename=r"C:\Users\xlg\Desktop\Python-1704\day18\file\file2.html",
)
# urlretrieve keeps some cached state while it runs; clear it afterwards.
urllib.request.urlcleanup()
模擬瀏覽器
import urllib.request
import random

# Pretend to be a real browser by sending a User-Agent request header.
url = "http://www.baidu.com"

# Alternative approach (disabled): build the Request with a full header dict.
# headers = {
#     "Accept" : "application/json, text/javascript, */*; q=0.01",
#     "X-Requested-With" : "XMLHttpRequest",
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
#     "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8"
# }
# req = urllib.request.Request(url, headers=headers)
# response = urllib.request.urlopen(req)
# data = response.read().decode("utf-8")
# print(data)

# A pool of real browser User-Agent strings; pick one at random per request
# so repeated requests look less like a single bot.
agentsList = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
]
agentStr = random.choice(agentsList)
req = urllib.request.Request(url)
# Attach the chosen User-Agent header to the request object
req.add_header("User-Agent", agentStr)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
設(shè)置超時(shí)
import urllib.request
import urllib.error
import socket

# If the page takes too long to respond, treat it as a timeout and move on.
# BUG FIX: the original used a bare `except:`, which also swallows
# KeyboardInterrupt/SystemExit; catch only the network-related errors that
# urlopen can actually raise on failure or timeout.
for i in range(1, 100):
    try:
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.5)
        print(len(response.read().decode("utf-8")))
    except (urllib.error.URLError, socket.timeout):
        print("請(qǐng)求超時(shí),繼續(xù)下一個(gè)爬取")
HTTP請(qǐng)求
- 使用場景:進(jìn)行客戶端與服務(wù)端之間的消息傳遞時(shí)使用
GET: 通過URL網(wǎng)址傳遞信息,可以直接在URL網(wǎng)址上添加要傳遞的信息
POST: 可以向服務(wù)器提交數(shù)據(jù),是一種比較流行的比較安全的數(shù)據(jù)傳遞方式
PUT: 請(qǐng)求服務(wù)器存儲(chǔ)一個(gè)資源,通常要指定存儲(chǔ)的位置
DELETE: 請(qǐng)求服務(wù)器刪除一個(gè)資源
HEAD: 請(qǐng)求獲取對(duì)應(yīng)的HTTP報(bào)頭信息
OPTIONS:可以獲取當(dāng)前URL所支持的請(qǐng)求類型
GET請(qǐng)求
# GET request demo.
# Trait: the data rides on the request path itself.
# Pro:  fast.
# Cons: small payload, not secure.
import urllib.request

url = "http://www.sunck.wang:8085/sunck"
reply = urllib.request.urlopen(url)
data = reply.read().decode("utf-8")
print(data)
print(type(data))
json數(shù)據(jù)解析
# JSON basics.
# JSON is a data-interchange format: it can be saved as a local .json file or
# sent over the wire, and is commonly called a lightweight transfer format.
# Building blocks:
#   {}  object (maps to a Python dict)
#   []  array  (maps to a Python list)
#   :   key/value separator
#   ,   element separator
import json

jsonStr = '{"name":"sunck凱", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}'

# Deserialize: JSON text -> Python object
jsonData = json.loads(jsonStr)
print(jsonData)
print(type(jsonData))
print(jsonData["hobby"])

# Serialize: Python object -> JSON text
jsonData2 = {"name":"sunck凱", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}
jsonStr2 = json.dumps(jsonData2)
print(jsonStr2)
print(type(jsonStr2))
# Load a local JSON file into a Python object.
path1 = r"C:\Users\xlg\Desktop\Python-1704\day18\Json\caidanJson.json"
with open(path1, "rb") as f:
    data = json.load(f)
print(data)
print(type(data))  # dict

# Dump a Python dict out to a local JSON file.
path2 = r"C:\Users\xlg\Desktop\Python-1704\day18\Json\test.json"
jsonData3 = {"name":"sunck凱", "age":18, "hobby":["money","power","english"], "parames":{"a":1,"b":2}}
with open(path2, "w") as f:
    json.dump(jsonData3, f)
post請(qǐng)求
# POST request demo.
# Trait: parameters are packaged and sent separately from the URL.
# Pros:  larger payload, safer (prefer POST when modifying server-side data).
# Con:   slower.
import urllib.request
import urllib.parse

url = "http://www.sunck.wang:8085/form"

# Form fields to submit; the keys come from the page's <input name="..."> attributes.
form = {
    "username": "sunck",
    "passwd": "666",
}
# URL-encode the form and convert to bytes — urlopen requires a bytes payload.
payload = urllib.parse.urlencode(form).encode("utf-8")
# Supplying a data payload turns the request into a POST.
request = urllib.request.Request(url, payload)
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36")
reply = urllib.request.urlopen(request)
print(reply.read().decode("utf-8"))
抓取網(wǎng)頁動(dòng)態(tài)Ajax請(qǐng)求的數(shù)據(jù)
import urllib.request
import ssl
import json
def ajaxCrawler(url):
    """Fetch a JSON Ajax endpoint and return the parsed Python object."""
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    # SECURITY NOTE(review): this disables TLS certificate verification —
    # acceptable for a classroom demo, never for untrusted hosts.
    context = ssl._create_unverified_context()
    body = urllib.request.urlopen(req, context=context).read()
    return json.loads(body.decode("utf-8"))
# Single-page example (disabled):
# url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
# info = ajaxCrawler(url)
# print(info)

# Crawl pages 1..10 of the Douban top list, 20 entries per page.
# BUG FIX: the original wrote `for i in (1, 11):`, which iterates over the
# two-element tuple (1, 11) — only i=1 and i=11 — instead of the intended
# pages 1 through 10 via range(1, 11).
for i in range(1, 11):
    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start="+ str(i * 20)+"&limit=20"
    info = ajaxCrawler(url)
    print(len(info))
糗事百科爬蟲練習(xí)
import urllib.request
import re
def jokeCrawler(url):
    """Scrape a qiushibaike text page and return a {username: joke} dict."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    HTML = urllib.request.urlopen(req).read().decode("utf-8")

    # One match per joke: everything from the author header up to the vote counter.
    pat = r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
    re_joke = re.compile(pat, re.S)
    # Patterns for the pieces inside each div, compiled once outside the loop.
    re_user = re.compile(r"<h2>(.*?)</h2>", re.S)
    re_text = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)

    jokes = {}
    for div in re_joke.findall(HTML):
        username = re_user.findall(div)[0]
        duanzi = re_text.findall(div)[0]
        jokes[username] = duanzi
    return jokes

# Dump the raw page for inspection (disabled):
# with open(r"C:\Users\xlg\Desktop\Python-1704\day18\file\file3.html", "w") as f:
#     f.write(HTML)
url = "https://www.qiushibaike.com/text/page/1/"
info = jokeCrawler(url)
# Print each scraped entry as "<author>說" followed by the joke text.
for author, joke in info.items():
    print(author + "說\n" + joke)
#https://www.douban.com/group/topic/41562980/?start=0