豆瓣電影 top250 爬蟲

沒什么好說的,主要是 pyquery 這個庫比較好用,能像操縱 DOM 一樣解析網(wǎng)頁。
主要功能:

  1. 將爬取的網(wǎng)頁先保存到本地,然后解析,避免重復(fù)請求。
  2. 將解析的結(jié)果保存到 MongoDB。
import os

import pymongo
import requests
from pyquery import PyQuery as pq


class Model(object):
    """
    Base class that gives every subclass a readable debug
    representation listing all of its instance attributes.
    """

    def __repr__(self):
        class_name = self.__class__.__name__
        attr_lines = '\n  '.join(
            '{}=({})'.format(key, value)
            for key, value in self.__dict__.items()
        )
        return '\n<{} \n  {}>'.format(class_name, attr_lines)


class Movie(Model):
    """
    Plain data holder for one entry of the douban top250 list.
    """

    def __init__(self):
        # display title of the movie
        self.name = ''
        # douban rating (parsed as text from the page)
        self.score = 0
        # one-line famous quote / tagline
        self.quote = ''
        # URL of the poster image
        self.cover_url = ''
        # position in the top250 ranking
        self.ranking = 0


def cached_url(url):
    """
    Fetch *url* with an on-disk cache so repeated runs skip the network.

    The page bytes are stored under ``cached/`` using the part of the
    URL after the first ``=`` (the ``start`` offset) as the file name.

    Returns the page content as ``bytes``.
    """
    folder = 'cached'
    # e.g. 'https://movie.douban.com/top250?start=25' -> '25.html'
    filename = url.split('=', 1)[-1] + '.html'
    path = os.path.join(folder, filename)

    if os.path.exists(path):
        with open(path, 'rb') as f:
            return f.read()

    os.makedirs(folder, exist_ok=True)

    # BUG fix: the original `requests.get(url, headers)` passed the dict
    # as the `params` positional argument, so the custom user-agent was
    # never sent. It must go through the headers= keyword. The Accept
    # line was also fused into the user-agent value; split it out.
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/57.0.2987.98 Safari/537.36'),
        'accept': ('text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,image/webp,*/*;q=0.8'),
    }
    r = requests.get(url, headers=headers)
    # write the response to the cache before returning it
    with open(path, 'wb') as f:
        f.write(r.content)
    return r.content


def movie_from_div(div):
    """
    Build one Movie object from a single ``.item`` div of the page.
    """
    element = pq(div)

    movie = Movie()
    movie.name = element('.title').text()
    movie.score = element('.rating_num').text()
    movie.quote = element('.inq').text()
    movie.cover_url = element('img').attr('src')
    # the ranking number lives in an <em> inside the .pic block
    movie.ranking = element('.pic').find('em').text()
    return movie


def movies_from_url(url):
    """
    Download (via the cache) the page at *url* and parse every movie
    listed on it.

    Returns a list of Movie objects.
    """
    content = cached_url(url)
    document = pq(content)
    # each '.item' div holds exactly one movie entry
    return [movie_from_div(div) for div in document('.item')]


def download_image(url, file):
    """
    Download the cover image at *url* into ``img/<name>.jpg``.

    *file* is the movie title; for titles like '中文名 / English name'
    only the part before the first '/' is kept as the file name.
    Already-downloaded images are skipped.
    """
    folder = 'img'
    name = file.split('/')[0] + '.jpg'
    path = os.path.join(folder, name)

    os.makedirs(folder, exist_ok=True)

    # skip images that are already on disk
    if os.path.exists(path):
        return

    # BUG fix: the original `requests.get(url, headers)` sent the dict
    # as query parameters instead of HTTP headers; use headers=. The
    # Accept line fused into the user-agent value is split out too.
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/57.0.2987.98 Safari/537.36'),
        'accept': ('text/html,application/xhtml+xml,application/xml;'
                   'q=0.9,image/webp,*/*;q=0.8'),
    }
    r = requests.get(url, headers=headers)
    with open(path, 'wb') as f:
        f.write(r.content)


def savemovies(movies):
    """
    Save Movie objects into the ``DoubanMovies_db.movies`` MongoDB
    collection (one document per movie).
    """
    connection = pymongo.MongoClient()
    try:
        table = connection.DoubanMovies_db.movies
        documents = [
            {
                'name': m.name,
                'score': m.score,
                'quote': m.quote,
                'ranking': m.ranking,
                'cover_url': m.cover_url,
            }
            for m in movies
        ]
        if documents:
            # one bulk round-trip instead of one insert_one per movie
            table.insert_many(documents)
    finally:
        # BUG fix: the original leaked one MongoClient per call
        connection.close()


def main():
    """
    Crawl all 10 pages of douban top250, save each page's movies to
    MongoDB and download every cover image.
    """
    # the list paginates 25 movies per page via the ?start= offset
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}'.format(start)
        movies = movies_from_url(url)
        savemovies(movies)
        print('top250 movies', movies)
        # plain loop instead of the original side-effect-only list
        # comprehension, which built a throwaway list of Nones
        for m in movies:
            download_image(m.cover_url, str(m.name))


if __name__ == '__main__':
    main()
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容