沒啥技巧,就兩個(gè)文件配置的爬蟲,目的是將企查查網(wǎng)站上一些公司信息抓取下來。
所有源碼
配置文件:config.py
# Crawl configuration, pulled into spider.py via `from config import *`.
MONGO_URL='localhost' # MongoDB host (local instance)
MONGO_DB='qichacha' # database name
MONGO_TABLE='qichacha' # collection ("table") name
KEYWORD = '廣州合道' # company-name search keyword for the index query
爬取代碼:spider.py
# -*- coding: utf-8 -*-
# @Time : 2018/10/26 21:16
# @Author : Xin
# @File : spider_nologin.py
# @Software: PyCharm
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
import re
from pyquery import PyQuery as pq
from multiprocessing import Pool
from config import *
import pymongo
client = pymongo.MongoClient(MONGO_URL)# connect to MongoDB
db = client[MONGO_DB]# get (lazily create) the target database
# Request headers sent with every HTTP call.
# NOTE(review): the QCCSESSID cookie is a hard-coded session value — it will
# expire; refresh it from a live browser session if requests start failing.
headers = {
"cookie":"QCCSESSID=ejvdgnsi2rddlb9pbaue9ooch4; UM_distinctid=166b0853ff3287-096d0c0c314aee-3c604504-1fa400-166b0853ff5131; zg_did=%7B%22did%22%3A%20%22166b08540b44ac-08922195bb52cf-3c604504-1fa400-166b08540b54c4%22%7D; _uab_collina=154055981732518461862276; acw_tc=0ed717a715405598380581012e9866f92d13a6a352f024efbe09a35d3d; CNZZDATA1254842228=639133721-1540559402-null%7C1540899954; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1540559815,1540734830,1540818836,1540902534; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1540904548; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201540902533286%2C%22updated%22%3A%201540904549411%2C%22info%22%3A%201540559814844%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22f6d5e6bd81b4649daa269182ad60cf95%22%7D",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
def get_page_index(page, keyword):
    """Fetch one search-result (index) page for *keyword*.

    page: 1-based page number; keyword: search term.
    Returns the response body as text, or None on a non-200 status
    or a request failure.
    """
    data = {
        'key': keyword,
        'ajaxflag': 1,
        'p': page,
    }
    url = 'https://www.qichacha.com/search_index?' + urlencode(data)
    try:
        # timeout keeps a stalled connection from hanging the worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請(qǐng)求索引頁url:{0}出錯(cuò)".format(url))
        return None
def parse_page_index(html):
    """Yield {'detail_url', 'company'} dicts from an index-page HTML string.

    html: raw index-page HTML; may be None/empty when the fetch failed,
    in which case nothing is yielded (the original crashed with a
    TypeError inside re.findall on None).
    """
    if not html:
        return
    # Only capture the detail-page href and the company-name cell of each row.
    pattern = re.compile(r'<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)
    for detail_url, raw_name in re.findall(pattern, html):
        yield {
            'detail_url': detail_url,
            # The site wraps matched keyword characters in <em> highlight
            # tags; strip them to get the plain company name.
            'company': re.sub(r'<em>|</em>', '', raw_name),
        }
def get_page_detail(company, detailurl):
    """Fetch the detail page for one company.

    company: company name (used only for logging);
    detailurl: site-relative path, e.g. '/firm_xxx.html'.
    Returns the page HTML as text, or None on a non-200 status
    or a request failure.
    """
    url = 'https://www.qichacha.com' + detailurl
    print('開始爬取:{0},網(wǎng)址:{1}'.format(company, url))
    try:
        # timeout keeps a stalled connection from hanging the worker forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請(qǐng)求詳情頁公司名{0},url:{1}出錯(cuò)!".format(company, url))
        return None  # was implicit; made explicit for clarity
def parse_page_detail(html, detailurl):
    """Parse a company detail page into a flat result dict.

    html: detail-page HTML; detailurl: site-relative path used to
    rebuild the absolute page URL. Returns a dict keyed by English
    field names (text fields via .text(), link fields via href).
    """
    doc = pq(html)
    # (result key, CSS selector, True when the value is the href attribute).
    # Listed in output order so the result dict's key order is stable.
    fields = (
        ('company', '.container.p-t > #company-top > div.row > div.content > div.row.title > h1', False),  # company name
        ('state', '.container.p-t >#company-top > div.row > div.content > div.row.title > span', False),  # operating status
        ('phone', '.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.fc > span.cvlu > span', False),  # contact phone
        ('official_website', '.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.cvlu > a:nth-child(1)', False),  # official site
        ('email', '.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.fc > span.cvlu > a', False),  # e-mail
        ('address', '.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.cvlu > a:nth-child(1)', False),  # address
        ('boss', '#Cominfo > table:nth-child(3) > tr:nth-child(2) > td.ma_left > div > div.clearfix > div:nth-child(2) > a.bname > h2', False),  # legal representative
        ('business_relations', '#Cominfo > table:nth-child(3) > tr:nth-child(2) > td:nth-child(2) > div.ba-table-base > a', True),  # relation-graph link
        ('registered_capital', '#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)', False),  # registered capital
        ('paid_capital', '#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(4)', False),  # paid-in capital
        ('create_date', '#Cominfo > table:nth-child(4) > tr:nth-child(2) > td:nth-child(4)', False),  # founding date
        ('credit_code', '#Cominfo > table:nth-child(4) > tr:nth-child(3) > td:nth-child(2)', False),  # unified social credit code
        ('registration_number', '#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(2)', False),  # registration number
        ('organization_code', '#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(4)', False),  # organization code
        ('company_type', '#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(2)', False),  # company type
        ('industry_involved', '#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(4)', False),  # industry
        ('approval_date', '#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(2)', False),  # approval date
        ('registration_authority', '#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(4)', False),  # registration authority
        ('area', '#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(2)', False),  # region
        ('english_name', '#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(4)', False),  # English name
        ('former_name', '#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(2)', False),  # former name
        ('insured_number', '#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(4)', False),  # insured headcount
        ('staff_size', '#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(2)', False),  # staff size
        ('business_term', '#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(4)', False),  # business term
        ('business_scope', '#Cominfo > table:nth-child(4) > tr:nth-child(11) > td:nth-child(2)', False),  # business scope
        ('equity_through', '#guquanIframeTool > a:nth-child(1)', True),  # equity-penetration chart link
    )
    result = {'url': 'https://www.qichacha.com' + detailurl}
    for key, selector, want_href in fields:
        node = doc(selector)
        result[key] = node.attr('href') if want_href else node.text()
    return result
def write_to_file(company, result):
    """Append *result* as one JSON line (UTF-8, non-ASCII kept) to result.txt.

    company: company name, used only for the progress message.
    """
    with open('result.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
    # Report success only AFTER the write completed (the original printed
    # "saved" before writing, so a failed write still claimed success).
    print('{0},保存成功'.format(company))
def save_to_mongo(result):
    """Insert one crawl result into the configured MongoDB collection.

    Returns True on success, False on any pymongo error.
    NOTE: insert_one mutates *result* by adding an '_id' field.
    """
    try:
        # insert_one replaces Collection.insert, which is deprecated in
        # pymongo 3.x and removed in pymongo 4.x.
        db[MONGO_TABLE].insert_one(result)
        print('存儲(chǔ)到MongoDB成功', result['company'])
        return True
    except pymongo.errors.PyMongoError:
        print('存儲(chǔ)到MongoDB失敗', result['company'])
        return False
def main(i):
    """Crawl index page *i*: fetch the listing, then fetch, parse and
    store the detail page of every company found on it."""
    html = get_page_index(i, KEYWORD)
    if not html:
        # Index fetch failed; parse_page_index on None would raise TypeError.
        return
    for item in parse_page_index(html):
        text = get_page_detail(item['company'], item['detail_url'])
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)
if __name__ == "__main__":
    # One worker per index page (pages 1-10). The with-block closes and
    # joins the pool, so workers are reaped before the final message;
    # the original never called close()/join() on the pool.
    with Pool() as pool:
        pool.map(main, range(1, 11))
    print('爬取結(jié)束!')