webdriver

import requests
import os
import pymysql
import uuid
import re
from selenium import webdriver
from bs4 import BeautifulSoup


def getHeaders():
    """Return a fresh dict of HTTP headers carrying the fixed User-Agent."""
    # The literal is split only for readability; the pieces concatenate to
    # the exact same User-Agent value.
    user_agent = (
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; '
        '.NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)'
    )
    return {'User-Agent': user_agent}

def fillList(infos, i):
    """Fill ``infos`` with one feed entry's title, URL, body HTML and image.

    The entry's detail page is downloaded with ``requests`` and its cover
    image is saved under ``<parent of cwd>/image/`` with a UUID-based name.

    Args:
        infos: Dict that receives the keys ``title``, ``url``, ``html`` and
            ``img``. Values for those keys left over from an earlier call
            are removed first, and new values are written only when every
            step succeeds, so a failed extraction never exposes stale or
            partial data to the caller.
        i: Selenium element for one feed entry; the XPaths below are
            evaluated relative to it.

    Returns:
        The same ``infos`` dict. After a failure it contains none of the
        four keys listed above.
    """
    request_timeout = 10  # seconds; without a timeout requests can block forever

    # Drop the previous item's fields so they cannot leak into this one.
    for key in ('title', 'url', 'html', 'img'):
        infos.pop(key, None)

    record = {}
    try:
        # The headline link carries both the title and the detail-page URL,
        # so look it up once. find_element(by, value) works on Selenium 3
        # and 4; the find_element_by_* helpers were removed in Selenium 4.
        link = i.find_element('xpath', './div[1]/div/div/a')
        record['title'] = link.text
        record['url'] = link.get_attribute("href")
        print('----', record['url'])

        # Fetch the detail page and keep its <p> content without <img> tags.
        resp_detail = requests.get(url=record['url'], headers=getHeaders(),
                                   timeout=request_timeout)
        resp_detail.raise_for_status()
        content_detail = resp_detail.content.decode('utf-8')
        paragraphs = re.findall(r'<p>.*</p>', content_detail)
        if not paragraphs:
            raise ValueError('no <p> content found in detail page')
        # Join every match (the original computed the join but discarded it
        # and stored only the first match).
        record['html'] = re.sub(r'<img.*?>', '', ''.join(paragraphs))

        # Download the cover image and store it next to the working directory.
        img_url = i.find_element('xpath', './div[2]/a/img').get_attribute("src")
        resp_img = requests.get(url=img_url, timeout=request_timeout)
        # Avoid saving an HTTP error page as a .jpg file.
        resp_img.raise_for_status()
        parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        folder_path = os.path.join(parent_dir, "image")
        os.makedirs(folder_path, exist_ok=True)
        img_name = str(uuid.uuid1()) + '.jpg'
        with open(os.path.join(folder_path, img_name), 'wb') as f:
            f.write(resp_img.content)
        record['img'] = img_name
    except Exception as e:
        # Best-effort scraping: report the cause and let the caller move on.
        print('提取信息失敗', e)
    else:
        infos.update(record)
        print('提取信息成功')
    return infos


def printInfo(infos, inf, limit=7):
    """Store scraped feed entries in the MySQL ``news`` table.

    Creates the table when it does not exist yet, then runs ``fillList`` on
    each entry and inserts the resulting title, image name, URL and HTML.

    Args:
        infos: Dict reused as the per-entry record; its ``title``, ``img``,
            ``url`` and ``html`` keys are cleared before every entry.
        inf: Iterable of feed-entry elements passed on to ``fillList``.
        limit: Maximum number of entries to process (default 7, the value
            that used to be hard-coded).
    """
    fields = ('title', 'img', 'url', 'html')
    # NOTE(review): connection settings, including the password, are
    # hard-coded here; consider moving them to configuration.
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='root', db='jh_project01', charset='utf8')

    # "if not exists" keeps repeated runs from failing on table creation.
    sqlc = '''
                    create table if not exists news(
                    id int primary key auto_increment,
                    title varchar(60),
                    img varchar(60),
                    url varchar(100),
                    html longtext)DEFAULT CHARSET=utf8;
                    '''
    sqla = '''
                insert into news(title,img,url,html)
                values(%s,%s,%s,%s);
               '''
    try:
        with conn.cursor() as cur:
            try:
                cur.execute(sqlc)
                conn.commit()
                print("成功")
            except Exception as exc:
                print("錯誤", exc)

            for index, element in enumerate(inf):
                if index >= limit:
                    break
                # Clear the previous entry's values so a failed extraction
                # cannot be inserted as a duplicate of the last row.
                for key in fields:
                    infos.pop(key, None)
                infos = fillList(infos, element)
                if any(key not in infos for key in fields):
                    # Extraction failed or was incomplete; skip the insert.
                    print("失敗")
                    continue
                try:
                    cur.execute(sqla, tuple(infos[key] for key in fields))
                    conn.commit()
                    print("成功")
                except Exception as exc:
                    conn.rollback()
                    print("失敗", exc)
    finally:
        # Always release the connection, even if something above raised.
        conn.close()

def main():
    """Open the Toutiao tech channel in Chrome, collect entries and store them.

    The browser is always shut down, even when scraping or the database
    step raises.
    """
    infos = {}
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.toutiao.com/ch/news_tech/')
        # Scroll down a little before collecting the feed entries.
        js = "var q=document.documentElement.scrollTop=500"
        driver.execute_script(js)
        # find_elements(by, value) works on Selenium 3 and 4; the
        # find_elements_by_* helpers were removed in Selenium 4.
        inf = driver.find_elements('xpath', '//div[@class="wcommonFeed"]/ul/li[@class="item    "]/div[@class="item-inner y-box"]')
        # Skip the first entry, as the original did; slicing (unlike
        # ``del inf[0]``) does not raise when nothing matched.
        # NOTE(review): presumably the first entry is not a regular
        # article -- confirm.
        inf = inf[1:]
        print(inf)
        printInfo(infos, inf)
    finally:
        # quit() ends the whole browser session and the driver process;
        # close() only closes the current window.
        driver.quit()

# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容