豆瓣top250電影爬蟲

import requests
import pandas as pd
from lxml import html
# Shared accumulator: spider_douban() appends one dict per scraped movie to
# this list and re-sorts it on every call.
movie_list = []

def _first(nodes, default=''):
    """Return the first XPath result with whitespace stripped, or *default* if empty."""
    return nodes[0].strip() if nodes else default


def spider_douban(page):
    """Scrape one page of the Douban Top 250 list and refresh ``douban250.csv``.

    For every movie on the page the poster is saved to ``./img/<name>.png``
    and a record is appended to the module-level ``movie_list``. The list is
    then sorted by vote count (descending), printed, and written to CSV.

    Args:
        page: Value for the ``start`` query parameter, i.e. the offset of the
            first movie on the requested page (0, 25, 50, ...).

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
    """
    import os
    import re

    # Fetch the HTML of the target page.
    url = 'https://movie.douban.com/top250?start={}&filter='.format(page)
    # Pretend to be a regular browser.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the script forever.
    timeout = 10

    response = requests.get(url, headers=headers, timeout=timeout)
    print(response.status_code)

    # Parse the page and select every movie entry.
    selector = html.fromstring(response.text)
    ul_list = selector.xpath('//ol[@class="grid_view"]/li')
    print(len(ul_list))

    # The poster directory must exist before open() can create files in it.
    os.makedirs('./img', exist_ok=True)

    for li in ul_list:
        # Rank of the movie; stored as a scalar like the other fields.
        number = _first(li.xpath('.//div[1]/div[1]/em/text()'))
        print(number)

        # Poster URL.
        picture = _first(li.xpath('.//div[1]/div[1]/a/img/@src'))
        print(picture)

        # Movie title.
        name = _first(li.xpath('.//div[1]/div[2]/div[1]/a/span[1]/text()'))
        print(name)

        # Save the poster under ./img, named after the movie.
        if picture and name:
            image = requests.get(picture, headers=headers, timeout=timeout)
            if image.ok:
                # Replace characters that are not valid in file names.
                safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
                with open(os.path.join('./img', safe_name + '.png'), 'wb') as f:
                    f.write(image.content)
            else:
                # Do not store an error page as if it were an image.
                print('image download failed:', image.status_code, picture)

        # Descriptive text fragments (director, cast, year, ...), kept as-is.
        information = li.xpath('.//div[1]/div[2]/div[2]/p/text()')
        print(information)

        # Number of ratings: keep only the digits so the conversion does not
        # depend on the exact wording of the suffix that follows the number.
        people_text = _first(li.xpath('.//div[1]/div[2]/div[2]/div[1]/span[4]/text()'))
        digits = re.sub(r'\D', '', people_text)
        people = int(digits) if digits else 0
        print(people)

        movie_list.append({
            'name': name,
            'number': number,
            'picture': picture,
            'information': information,
            'people': people
        })

    # Order everything collected so far by popularity, most rated first.
    movie_list.sort(key=lambda x: x['people'], reverse=True)
    for movie in movie_list:
        print(movie)

    # Persist the accumulated result; the file is rewritten on every call.
    df = pd.DataFrame(movie_list)
    df.to_csv('douban250.csv')
# Pagination: request the list in steps of 25 (start = 0, 25, ..., 225).
for page in (25 * index for index in range(10)):
    spider_douban(page)
# To fetch only the first page instead: spider_douban(0)

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

Python第三天（spider_豆瓣）

Python第三天（spider_豆瓣）

豆瓣top250電影爬蟲

未完待續/...

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

Python第三天（spider_豆瓣）

豆瓣top250電影爬蟲

未完待續/...

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av