python爬蟲爬取網站郵箱電話等
-
目標:
1. 通過爬取網頁,分析郵箱、電話等信息,并記錄到數據庫;自動爬取門戶網站的外鏈,訪問對方網站,獲取網站上的相關信息,并記錄到數據庫 2. 能夠自動向收集到的郵箱發送郵件,或發送短信
第一步:實現采集單網頁的數據
分為采集頁面和數據庫連接頁面兩部分
采集頁面代碼(用的是python2.7):
# encoding: utf-8
import urllib2
import re
import chardet
import sys
import chain_mysql
# Scrape the e-mail addresses found on the target web page
def writeData(listData):
    """Store one scraped record by passing it to ``chain_mysql.dbInsert``.

    Args:
        listData: The record to store; it is forwarded unchanged.
    """
    chain_mysql.dbInsert(listData)
def outputemail(url):
    """Download ``url`` and extract its title, contact links, e-mails and phones.

    The page is fetched with ``urllib2``, its character set is guessed with
    ``chardet`` (gb2312 is used when detection returns ``None``) and the markup
    is re-encoded to UTF-8 before the regular expressions are applied.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with these keys:
          * ``title``: text of the ``<title>`` element with all whitespace
            removed, or ``""`` when the page has no title.
          * ``url``: the ``url`` argument, unchanged.
          * ``value``: list of ``<a ...>...</a>`` fragments whose link text
            contains a "contact us" label.
          * ``email``: sorted list of unique e-mail addresses.
          * ``phone``: sorted list of unique 11-digit numbers starting with 1.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise (for example
        ``urllib2.URLError``) is propagated to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        # Release the connection even when the read fails part-way through.
        response.close()

    # chardet reports encoding=None when it cannot decide; fall back to gb2312.
    charset = chardet.detect(raw).get('encoding', 'utf-8')
    if charset is None:
        charset = "gb2312"
    # Normalise to UTF-8 so the literals in the patterns below (this source
    # file is UTF-8) compare equal to the page text.
    html = raw.decode(charset, 'ignore').encode('utf-8')

    # Exactly 11 digits starting with 1. The lookarounds reject the tail of a
    # longer digit run and still accept a number at the very end of the page.
    phone = sorted(set(re.findall(r"(?<!\d)1\d{10}(?!\d)", html)))

    # Tolerate attributes on <title>, upper-case tags and multi-line titles,
    # and return an empty title instead of crashing when the tag is absent.
    title_match = re.search(r'<title[^>]*>(.*?)</title>', html, re.S | re.I)
    if title_match:
        title = re.sub(r'\s+', '', title_match.group(1))
    else:
        title = ''

    # Anchors whose text contains "contact us" (Simplified and Traditional
    # spellings). The previous literal carried a stray "(lián)" group, which
    # made findall return only that group and never match real pages.
    # The tempered dot keeps a match from starting in an earlier, unrelated
    # <a> element and running across its closing tag.
    contacturl = re.findall(
        r'<a\s[^>]*>(?:(?!</a>).)*?(?:联系我们|聯繫我們|聯系我們).*?</a>',
        html)

    # Allow dots, plus and hyphen in the local part and multi-level domains,
    # so "first.last@mail.example.com" is captured whole.
    emaillist = sorted(set(re.findall(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", html)))

    return {
        'title': title,
        'url': url,
        'value': contacturl,
        'email': emaillist,
        'phone': phone,
    }
try:
    from urlparse import urljoin  # Python 2 (the version this script targets)
except ImportError:
    from urllib.parse import urljoin  # same function under its Python 3 name

# Not used below; kept because the original script defined it at module level.
listData = []

# Entry page of the crawl.
url = "http://www.klineng.com"

# Scrape the entry page and store what was found on it.
showdata = outputemail(url)
writeData(showdata)

# Follow every "contact us" link found on the entry page and store the data
# scraped from each linked page as its own record.
href_pattern = re.compile(r'''href\s*=\s*["']([^"']+)["']''', re.I)
visited = set()
for data in showdata['value']:
    hrefs = href_pattern.findall(data)
    if not hrefs:
        # No link target in this fragment: skip it rather than re-crawling
        # the entry page and inserting a duplicate row.
        continue
    # A fragment may begin in an earlier anchor; the href that belongs to the
    # matched link text is the last one in the fragment.
    # urljoin resolves relative, root-relative and absolute hrefs alike.
    cturl = urljoin(url, hrefs[-1].strip())
    if not cturl.startswith(('http://', 'https://')):
        # javascript:, mailto: and similar targets cannot be fetched.
        continue
    if cturl in visited:
        continue
    visited.add(cturl)
    writeData(outputemail(cturl))
第二步:創建數據庫及數據表,建立連接文件
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import MySQLdb
def dbInsert(listData):
    """Insert one scraped record into the ``t_data`` table.

    The list-valued fields are flattened into single strings in which every
    item is followed by ``|`` (so a non-empty field ends with a trailing
    ``|``), matching the format of rows already stored.

    Args:
        listData: Mapping with the keys ``title``, ``url``, ``value``,
            ``email`` and ``phone``; the last three are iterables of strings.

    Returns:
        None. A failure while executing or committing the statement is rolled
        back and reported on stderr; it is not raised to the caller.
    """
    import traceback

    strdata = ''.join(item + '|' for item in listData['value'])
    emaildata = ''.join(item + '|' for item in listData['email'])
    calldata = ''.join(item + '|' for item in listData['phone'])

    # Placeholders let the driver quote the values. The text comes from
    # arbitrary web pages, so splicing it into the statement would break on
    # any quote character and allow SQL injection.
    sql = ("INSERT INTO t_data (title,url,value,email,phone) "
           "VALUES (%s,%s,%s,%s,%s)")
    params = (listData['title'], str(listData['url']),
              strdata, emaildata, calldata)

    # NOTE(review): connection settings are hard-coded here, as in the
    # original; consider moving them to configuration.
    db = MySQLdb.connect("localhost", "root", "1234", "testmail", charset='utf8')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, params)
            db.commit()
        except Exception:
            # Stay best-effort like the original (one bad row must not stop
            # the crawl), but undo the statement and make the failure visible
            # instead of swallowing it silently.
            db.rollback()
            traceback.print_exc()
    finally:
        # Always release the connection, even if cursor creation failed.
        db.close()
實現單網頁采集數據功能,并做數據去重;由于各頁面的html代碼不一致,采集數據方面還要進一步完善