# 該案例調用搜狗微信公眾號搜索的接口,實現了輸入關鍵字搜索然後返回對應的公眾號名稱、公眾號,以及公眾號描述的功能
# coding:utf-8
import requests
import urllib
from bs4 import BeautifulSoup
# 爬取的網址url:http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E4%BA%A7%E5%93%81&ie=utf8&_sug_=n&_sug_type_=
# 定義獲取url列表的方法
def get_url(keyword, page):
    """Build the Sogou WeChat official-account search URLs for a keyword.

    Args:
        keyword: Search term. Text is UTF-8 encoded before being
            percent-escaped (the URL declares ``ie=utf8``); byte strings are
            escaped as they are.
        page: Number of result pages wanted. URLs are produced for pages
            1 through ``page`` inclusive.

    Returns:
        A list of URL strings in ascending page order. The list is empty when
        ``page`` is zero or negative.
    """
    # Imported here so the function works on both Python 2 (urllib.quote) and
    # Python 3 (urllib.parse.quote) without touching the file-level imports.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    # Python 2's quote() raises KeyError on non-ASCII unicode text, so always
    # hand it UTF-8 bytes. On Python 2 ``bytes`` is ``str``, so byte-string
    # keywords pass through untouched.
    if not isinstance(keyword, bytes):
        keyword = keyword.encode('utf-8')

    # Everything except the page number is identical for every URL.
    prefix = (
        'http://weixin.sogou.com/weixin?type=1&s_from=input&ie=utf8&_sug_=n&_sug_type_=&query='
        + quote(keyword)
        + '&page='
    )
    return [prefix + str(number) for number in range(1, page + 1)]
# 定義獲取搜索結果的方法
def get_info(keyword, page):
    """Search Sogou's WeChat official-account index and print each match.

    Fetches the result pages produced by ``get_url(keyword, page)`` in order,
    extracts every account's display name, WeChat ID and description, and
    prints them to stdout once crawling has finished.

    Args:
        keyword: Search term, passed through to ``get_url``.
        page: Maximum number of result pages to fetch.

    Returns:
        None. The results are only printed.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the request timeout is exceeded.
    """
    # Sogou returns an inconsistent number of hits per page (its anti-crawling
    # measure), so 7 is used as the cut-off: a page with at least 7 hits is
    # assumed to have a successor, anything shorter ends the crawl.
    # TODO: find a more reliable end-of-results signal.
    full_page_threshold = 7
    # Seconds. Without a timeout requests.get() can block indefinitely.
    request_timeout = 10

    def to_native(text):
        # On Python 2 ``str`` is ``bytes``, so text must be UTF-8 encoded
        # before it is interpolated into the byte-string labels printed below.
        # On Python 3 it is used unchanged.
        return text.encode('utf-8') if str is bytes else text

    resList = []
    for url in get_url(keyword, page):
        response = requests.get(url, timeout=request_timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        names = soup.find_all('p', attrs={'class': 'tit'})
        enames = soup.find_all('label', attrs={'name': 'em_weixinhao'})
        summaries = soup.select('.gzh-box2 + dl > dd')

        # zip() stops at the shortest list, so a page where one selector
        # matched fewer elements no longer raises IndexError.
        for name, ename, summary in zip(names, enames, summaries):
            resList.append({
                'name': name.text.strip('\n'),
                'ename': ename.text,
                'summary': summary.text,
            })

        # A short page is taken to be the last one; skip the remaining URLs.
        if len(names) < full_page_threshold:
            break

    # Single-argument print(...) behaves the same as a statement on Python 2
    # and as a function on Python 3.
    for weixin in resList:
        print('名字:%s' % to_native(weixin['name']))
        print('公眾號(hào):%s' % to_native(weixin['ename']))
        print('描述:%s' % to_native(weixin['summary']))
        print('\n')
# Script entry point: ask for a keyword and a page count, then run the search.
if __name__ == '__main__':
    # Python 2's input() evaluates whatever the user types as an expression,
    # which lets arbitrary code run. Read plain text instead and convert it
    # explicitly. raw_input exists only on Python 2; on Python 3 input()
    # already returns text.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    keyword = read_line('請(qǐng)輸入關(guān)鍵字:')
    # int() raises ValueError when the answer is not a whole number.
    page = int(read_line('請(qǐng)輸入搜索結(jié)果的頁(yè)數(shù):'))
    get_info(keyword, page)