爬取企查查

沒啥技巧,就是一個(gè)只有兩個(gè)文件(配置文件 config.py 與爬蟲文件 spider.py)的爬蟲,目的是將企查查網(wǎng)站上的一些公司信息抓取下來。

所有源碼

配置文件:config.py

MONGO_URL='localhost' # MongoDB host (local instance)
MONGO_DB='qichacha' # database name
MONGO_TABLE='qichacha' # collection ("table") name

KEYWORD = '廣州合道' # search keyword for the crawl

爬取代碼:spider.py

# -*- coding: utf-8 -*-
# @Time    : 2018/10/26 21:16
# @Author  : Xin
# @File    : spider_nologin.py
# @Software: PyCharm

import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
import re
from pyquery import PyQuery as pq
from multiprocessing import Pool
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)#connect to MongoDB
db = client[MONGO_DB]#get (lazily created) database handle

# Request headers: a hard-coded logged-in session cookie plus a desktop
# User-Agent. NOTE(review): the QCCSESSID session cookie expires, so requests
# will start failing once the session dies — refresh it before each run.
headers = {
        "cookie":"QCCSESSID=ejvdgnsi2rddlb9pbaue9ooch4; UM_distinctid=166b0853ff3287-096d0c0c314aee-3c604504-1fa400-166b0853ff5131; zg_did=%7B%22did%22%3A%20%22166b08540b44ac-08922195bb52cf-3c604504-1fa400-166b08540b54c4%22%7D; _uab_collina=154055981732518461862276; acw_tc=0ed717a715405598380581012e9866f92d13a6a352f024efbe09a35d3d; CNZZDATA1254842228=639133721-1540559402-null%7C1540899954; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1540559815,1540734830,1540818836,1540902534; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1540904548; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201540902533286%2C%22updated%22%3A%201540904549411%2C%22info%22%3A%201540559814844%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22f6d5e6bd81b4649daa269182ad60cf95%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

# Fetch one page of search results.
def get_page_index(page, keyword):
    """Fetch one qichacha search-result page as HTML.

    Args:
        page: 1-based result page number.
        keyword: search keyword (company name fragment).

    Returns:
        Response body text on HTTP 200, otherwise None (also on request
        failure, which is logged to stdout).
    """
    params = {
        'key': keyword,
        'ajaxflag': 1,
        'p': page,
    }
    url = 'https://www.qichacha.com/search_index?' + urlencode(params)
    try:
        # A timeout keeps a stalled connection from hanging the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請(qǐng)求索引頁url:{0}出錯(cuò)".format(url))
        return None

# Parse the search-result page into (url, company) records.
def parse_page_index(html):
    """Yield one dict per result row of a search-result page.

    Each dict has 'detail_url' (path of the company detail page) and
    'company' (company name with the <em> highlight tags stripped).
    """
    row_pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)
    em_tags = re.compile(r'<em>|</em>')
    for detail_url, raw_name in row_pattern.findall(html):
        # raw_name looks like "<em>广州</em>侨<em>合</em>..." — drop the tags.
        yield {
            'detail_url': detail_url,
            'company': em_tags.sub('', raw_name),
        }

# Fetch a company detail page.
def get_page_detail(company, detailurl):
    """Fetch the detail page for one company.

    Args:
        company: company name (used only for progress/error messages).
        detailurl: detail-page path, appended to the site root.

    Returns:
        Response body text on HTTP 200, otherwise None (also on request
        failure, which is logged to stdout).
    """
    url = 'https://www.qichacha.com' + detailurl
    print('開始爬取:{0},網(wǎng)址:{1}'.format(company, url))
    try:
        # A timeout keeps a stalled connection from hanging the worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("請(qǐng)求詳情頁公司名{0},url:{1}出錯(cuò)!".format(company, url))
        return None  # explicit, consistent with get_page_index

# Parse a company detail page into a flat record.
def parse_page_detail(html, detailurl):
    """Extract company-profile fields from a detail page with pyquery.

    Args:
        html: detail-page HTML text.
        detailurl: detail-page path, appended to the site root for 'url'.

    Returns:
        dict mapping field name -> scraped value, in a fixed order; a selector
        that matches nothing yields '' (text) or None (attr), per pyquery.
    """
    doc = pq(html)
    # (field name, extraction mode, CSS selector) in output order.
    # Mode 'text' reads the node text, 'attr' reads the href attribute.
    field_specs = (
        ('company', 'text', '.container.p-t > #company-top > div.row > div.content > div.row.title > h1'),  # company name
        ('state', 'text', '.container.p-t >#company-top > div.row > div.content > div.row.title > span'),  # business status
        ('phone', 'text', '.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.fc > span.cvlu > span'),  # contact phone
        ('official_website', 'text', '.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.cvlu > a:nth-child(1)'),  # official website
        ('email', 'text', '.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.fc > span.cvlu > a'),  # e-mail
        ('address', 'text', '.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.cvlu > a:nth-child(1)'),  # address
        ('boss', 'text', '#Cominfo > table:nth-child(3) > tr:nth-child(2) > td.ma_left > div > div.clearfix > div:nth-child(2) > a.bname > h2'),  # legal representative
        ('business_relations', 'attr', '#Cominfo > table:nth-child(3) > tr:nth-child(2) > td:nth-child(2) > div.ba-table-base > a'),  # relation-graph link
        ('registered_capital', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)'),  # registered capital
        ('paid_capital', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(4)'),  # paid-in capital
        ('create_date', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(2) > td:nth-child(4)'),  # founding date
        ('credit_code', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(3) > td:nth-child(2)'),  # unified social credit code
        ('registration_number', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(2)'),  # registration number
        ('organization_code', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(4)'),  # organization code
        ('company_type', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(2)'),  # company type
        ('industry_involved', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(4)'),  # industry
        ('approval_date', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(2)'),  # approval date
        ('registration_authority', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(4)'),  # registration authority
        ('area', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(2)'),  # region
        ('english_name', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(4)'),  # English name
        ('former_name', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(2)'),  # former name
        ('insured_number', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(4)'),  # insured headcount
        ('staff_size', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(2)'),  # staff size
        ('business_term', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(4)'),  # business term
        ('business_scope', 'text', '#Cominfo > table:nth-child(4) > tr:nth-child(11) > td:nth-child(2)'),  # business scope
        ('equity_through', 'attr', '#guquanIframeTool > a:nth-child(1)'),  # equity-penetration chart link
    )
    record = {'url': 'https://www.qichacha.com' + detailurl}
    for field, mode, selector in field_specs:
        node = doc(selector)
        record[field] = node.text() if mode == 'text' else node.attr('href')
    return record

def write_to_file(company, result):
    """Append *result* to result.txt as one JSON line (UTF-8, CJK kept as-is)."""
    serialized = json.dumps(result, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf8') as out:
        print('{0},保存成功'.format(company))
        out.write(serialized + '\n')

def save_to_mongo(result):
    """Insert one scraped record into the configured MongoDB collection.

    Args:
        result: field dict produced by parse_page_detail (pymongo adds an
            '_id' key to it as a side effect of the insert).

    Returns:
        True when the insert returned a result, False otherwise.
    """
    # Collection.insert() was deprecated in pymongo 3.x and removed in 4.x;
    # insert_one() is the supported replacement and returns a truthy
    # InsertOneResult, so the success check still works.
    if db[MONGO_TABLE].insert_one(result):
        print('存儲(chǔ)到MongoDB成功', result['company'])
        return True
    return False

def main(i):
    """Crawl one search-result page: fetch the index, then every detail page.

    Args:
        i: 1-based index page number to crawl.
    """
    html = get_page_index(i, KEYWORD)
    if html is None:
        # Index request failed or returned non-200; the original code would
        # crash in re.findall(None) here — skip this page instead.
        return
    for item in parse_page_index(html):
        text = get_page_detail(item['company'], item['detail_url'])  # detail-page HTML
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)

if __name__=="__main__":
    #main(1)
    pool = Pool()
    pool.map(main,[i for i in range(1,11)])
    print('爬取結(jié)束!')

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

  • 1.前期準(zhǔn)備 具體請(qǐng)查看上一篇 2.準(zhǔn)備庫requests,BeautifulSoup,xlwt,lxml 3.具...
    XuJiaxin_閱讀 7,162評(píng)論 0 3
  • 由于是第一次寫作可能代碼風(fēng)格比較丑而且語言表達(dá)不好,各位看官請(qǐng)見諒. 下面進(jìn)入正題臨時(shí)接到一個(gè)任務(wù)爬取企查查的網(wǎng)絡(luò)...
    阿包_26f6閱讀 961評(píng)論 0 0
  • 還等什么,Scrapy啟動(dòng),爬取開始! 目標(biāo)網(wǎng)站 這里我們選取的目標(biāo)網(wǎng)站是我常逛的干貨集中營,而要爬取的就是她: ...
    1s的消失閱讀 867評(píng)論 0 1
  • 王維的詩句“大漠孤煙直,長河落日?qǐng)A” 讓我對(duì)沙漠一直有著美的暇想,而金庸小說里也有諸多大漠里故事,更讓沙漠在我心...
    小漓閱讀 460評(píng)論 0 0
  • 今天我高高興興地買來了顏料、畫紙和畫筆。 今天是第一次畫,藍(lán)色和黑色之間的過渡一點(diǎn)也不自然,總感覺哪里怪怪的。 ...
    花兮蔻閱讀 194評(píng)論 0 0

友情鏈接更多精彩內(nèi)容