#author: Jingke
from bs4 import BeautifulSoup
import ssl
from urllib.request import Request, urlopen
import urllib.request
class Scrape_news():
    """Scrape article links from a news page."""

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Download *url* and return absolute URLs of its article links.

        Args:
            url: Address of the page to download. It is also used as the
                base against which relative hrefs are resolved.
            *args: Positional arguments forwarded unchanged to
                ``BeautifulSoup.find_all`` to select the elements to inspect.
            **kwargs: Keyword arguments forwarded unchanged to
                ``BeautifulSoup.find_all``.

        Returns:
            A list of absolute URLs, in document order, one for every
            selected element whose ``href`` contains ``"articles"``.
            Duplicates are kept. Elements without an ``href`` are skipped.

        Side effects:
            Prints the resulting list. Also disables HTTPS certificate
            verification and installs a urllib opener for the whole process
            (see the notes below).
        """
        # Imported locally so this block does not depend on a file-level import.
        from urllib.parse import urljoin

        # NOTE(security): this turns off certificate verification for every
        # later HTTPS request made through urllib in this process.
        ssl._create_default_https_context = ssl._create_unverified_context

        # install_opener() is process-wide: later urlopen() calls anywhere in
        # the process go through this opener and its default headers.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            bsObj = BeautifulSoup(web, "html.parser")

        hrefs = [tag.get('href') for tag in bsObj.find_all(*args, **kwargs)]
        # urljoin avoids the doubled slash that plain concatenation produced
        # for hrefs starting with "/", and leaves absolute hrefs intact.
        links = [
            urljoin(url, href)
            for href in hrefs
            if href and "articles" in href
        ]
        print(links)
        return links
if __name__ == "__main__":
    # Demo run: inspect every <a> element of the qdaily.com page.
    # An alternative selector that was left disabled here:
    #   Scrape_news.url_link('http://www.qdaily.com', "h3", {"class": "smart-dotdotdot"})
    Scrape_news.url_link('http://www.qdaily.com', "a")
result:
['http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64935.html', 'http://www.qdaily.com//articles/64924.html', 'http://www.qdaily.com//articles/64933.html', 'http://www.qdaily.com//articles/64934.html', 'http://www.qdaily.com//articles/64923.html', 'http://www.qdaily.com//articles/64921.html', 'http://www.qdaily.com//articles/64930.html', 'http://www.qdaily.com//articles/64931.html', 'http://www.qdaily.com//articles/64927.html', 'http://www.qdaily.com//articles/64922.html', 'http://www.qdaily.com//articles/64929.html', 'http://www.qdaily.com//articles/64928.html', 'http://www.qdaily.com//articles/64925.html', 'http://www.qdaily.com//articles/64926.html', 'http://www.qdaily.com//articles/64919.html', 'http://www.qdaily.com//articles/64920.html', 'http://www.qdaily.com//articles/64904.html']
------------------------------------------------------------------------------------------------------------------#
#author: Jingke
class Scrape_news():
    """Scrape the text of selected elements from a news page."""

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Download *url* and return the text of every matching element.

        Args:
            url: Address of the page to download.
            *args: Positional arguments forwarded unchanged to
                ``BeautifulSoup.find_all`` to select the elements.
            **kwargs: Keyword arguments forwarded unchanged to
                ``BeautifulSoup.find_all``.

        Returns:
            A list with the ``get_text()`` result of each selected element,
            in document order.

        Side effects:
            Prints the resulting list. Also disables HTTPS certificate
            verification and installs a urllib opener for the whole process
            (see the notes below).
        """
        # NOTE(security): this turns off certificate verification for every
        # later HTTPS request made through urllib in this process.
        ssl._create_default_https_context = ssl._create_unverified_context

        # install_opener() is process-wide: later urlopen() calls anywhere in
        # the process go through this opener and its default headers.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            bsObj = BeautifulSoup(web, "html.parser")

        # A descriptive local name instead of shadowing the builtin ``list``.
        texts = [tag.get_text() for tag in bsObj.find_all(*args, **kwargs)]
        print(texts)
        return texts
if __name__ == "__main__":
    # Demo run: select the <h3> elements carrying the "smart-dotdotdot" class.
    Scrape_news.url_link('http://www.qdaily.com', "h3", {"class": "smart-dotdotdot"})
result:
['重新認識人性的可能,如何看待 18 世紀英國平民文化?',
'兩次世界大戰之間的日本陸軍,他們如何走向戰爭?',
'艾滋病如何在美國被發現,又怎樣展現人性的復雜?',
'盧梭研究經典,我們該如何理解盧梭的孤獨?',
'社交媒體和數字技術的發展,如何改變傳統人際關系?',
'如果愛情讓人自身和自身保持同一,那它可能是什么?',
'130 幅城市復原圖,如何重現古地中海文明?',
'從 1931 到 1945 年,日本人的思想發生了什么轉變?',
'百年以來,什么是中國文人論政的報國情懷?']