# 該PDF下載器支持從本地txt文件中讀取和數據庫讀取兩種方式
# 數據庫ip我亂寫的,這里集成了三種情況的數據庫(本地數據庫,需要驗證的數據庫,不需要驗證的數據庫)
# -*- coding: utf-8 -*-
"""
此程序用來多線程下載PDF,提供本地txt文件讀取下載和數據庫讀取下載兩種方式
"""
import hashlib
import os
import queue
import sys
from multiprocessing import Queue
from threading import Thread, Lock

import pymongo
import requests
g_num = 0 # Module-wide counter of queue items the worker threads have picked up so far
lock = Lock() # Module-wide mutex shared by the worker threads (guards g_num updates)
class DownloadPDF(object):
    """Multi-threaded PDF downloader.

    The list of PDFs comes either from a local txt file (one URL per line)
    or from a MongoDB collection.  Every PDF is stored as
    ``PDF/<md5-of-name>.pdf``; failed items are appended to ``error.txt``.
    Optionally, a record for each successful download is inserted into
    another MongoDB collection.

    The class cooperates with module-level globals: ``g_num`` (progress
    counter) and ``lock`` (mutex) must exist at module level, while
    ``is_txt``, ``is_insert`` and ``new_collection`` are created by
    :meth:`choose_way`.
    """

    # Hosts whose MongoDB instance requires authentication (placeholder IP).
    _AUTH_HOSTS = ('192.168.0.123',)

    def __init__(self):
        # Work queue shared by the worker threads.  A thread-level
        # queue.Queue is enough here because no other process is involved.
        self.q = queue.Queue()
        # Target paths already picked up by some worker during this run, so
        # two threads never write the same file at the same time.
        self._claimed = set()

    def run(self, thread_count=10):
        """Prepare the output locations, fill the queue and run the workers.

        :param thread_count: number of download threads to start (default 10)
        :return: None
        """
        self.mkdir()
        self.get_pdf_msg()
        pdf_size = self.q.qsize()  # total number of PDFs to process
        workers = [Thread(target=self.download, args=(pdf_size,))
                   for _ in range(thread_count)]
        for th in workers:
            th.start()
        for th in workers:
            th.join()
        print('\n' + '下載完成!')

    def download(self, pdf_size):
        """Worker loop: take items off the queue until it is empty.

        :param pdf_size: total number of queued PDFs, used for the progress line
        :return: None
        """
        global g_num
        while True:
            try:
                # The queue is completely filled before the workers start,
                # so an empty queue means there is nothing left to do.
                each_pdf = self.q.get_nowait()
            except queue.Empty:
                return
            pdf_name, pdf_url, md5_later, path = self._resolve_target(each_pdf)
            # Hold the lock only for the shared counter, the progress line
            # and the claim check; the slow network transfer runs without
            # it so that the threads really download in parallel.
            with lock:
                g_num += 1
                sys.stdout.write('\r' + '正在下載:%s / %s' % (g_num, pdf_size))
                sys.stdout.flush()
                if path in self._claimed:
                    continue
                self._claimed.add(path)
            # Skip files that were already downloaded (exist and non-empty).
            if self._already_downloaded(path):
                continue
            self.download_pdf(pdf_name, each_pdf, path, pdf_url, md5_later)

    def _resolve_target(self, each_pdf):
        """Work out name, URL, md5 name and target path for one queue item.

        :param each_pdf: either a URL string (txt mode, or database mode
            without a name field) or a dict with the keys ``pdf_url`` and
            ``pdf_name``
        :return: tuple ``(pdf_name, pdf_url, md5_later, path)``; ``pdf_name``
            is ``''`` when the item carries no usable name
        """
        if isinstance(each_pdf, dict):
            pdf_url = each_pdf['pdf_url']
            pdf_name = each_pdf.get('pdf_name') or ''
        else:
            pdf_url = each_pdf.strip()
            pdf_name = ''
        if pdf_name:
            md5_later = self.pdf_name_md5(pdf_name)
        else:
            # No name available: use the part after the last '/' of the URL.
            href_name = pdf_url.split('/')[-1]
            if '.pdf' in href_name:
                # NOTE(review): this keeps only the text before the FIRST
                # dot, so "a.v1.pdf" and "a.v2.pdf" map to the same file.
                # Kept as-is so files from earlier runs are still recognised.
                href_name = href_name.split('.')[0]
            md5_later = self.pdf_name_md5(href_name)
        path = 'PDF/' + md5_later + '.pdf'
        return pdf_name, pdf_url, md5_later, path

    @staticmethod
    def _already_downloaded(path):
        """Return True when ``path`` exists and is not empty."""
        try:
            return os.path.getsize(path) > 0
        except OSError:
            return False

    @staticmethod
    def download_pdf(pdf_name, each_pdf, path, pdf_url, md5_later):
        """Download one PDF and optionally record it in the database.

        On any failure the item is reported on stdout and appended to
        ``error.txt`` (once); no exception leaves this method.

        :param pdf_name: original PDF name, or ``''`` when unknown
        :param each_pdf: the raw queue item (only used for error reporting)
        :param path: relative path the PDF is written to
        :param pdf_url: URL to download
        :param md5_later: md5 hex digest used as file name
        :return: None
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/76.0.3809.100 Safari/537.36'}
        try:
            # connect timeout 2 minutes, read timeout 10 minutes
            r = requests.get(url=pdf_url, headers=headers, timeout=(120, 600))
            try:
                # Do not store an HTTP error page (404, 500, ...) as a PDF.
                r.raise_for_status()
                with open(path, 'wb') as f:
                    f.write(r.content)
            finally:
                r.close()
            if is_insert == 'y':
                # Download succeeded: insert a record into the database.
                if not pdf_name:
                    msg = {'url': pdf_url, 'md5_name': md5_later, 'relative_path': path}
                else:
                    msg = {'url': pdf_url, 'origin_name': pdf_name, 'md5_name': md5_later, 'relative_path': path}
                new_collection.insert_one(msg)
        except Exception:
            # Best effort: a failed item must never kill the worker thread.
            record = str(each_pdf).strip()
            # Several threads may fail at the same time; serialise the
            # read-check-append cycle on the shared error file.
            with lock:
                print('\r' + '下載失敗:%s' % str(each_pdf))
                with open('error.txt', 'a+') as fe:
                    fe.seek(0)
                    logged = set(line.strip() for line in fe)
                    if record not in logged:
                        fe.write(record + '\n')

    @staticmethod
    def mkdir():
        """Create the ``error.txt`` failure log and the ``PDF`` folder if missing.

        :return: None
        """
        if not os.path.exists('error.txt'):
            with open('error.txt', 'w') as fr:
                fr.write('')
        if not os.path.exists('PDF'):
            os.mkdir('PDF')

    @classmethod
    def _connect(cls, ip):
        """Open a MongoDB client for ``ip``, authenticating where required.

        :param ip: host address as returned by :meth:`choose_ip`
        :return: a ``pymongo.MongoClient``
        """
        uri = 'mongodb://%s:27017' % ip
        if ip in cls._AUTH_HOSTS:
            # Placeholder credentials - replace with the real ones.
            return pymongo.MongoClient(uri, username='用戶名', password='密碼',
                                       authSource='admin')
        return pymongo.MongoClient(uri)

    def client_dbs(self):
        """Connect to MongoDB and let the user pick a database and collection.

        :return: tuple ``(collection, db)``
        """
        ip = self.choose_ip()
        client = self._connect(ip)
        all_db_names = client.list_database_names()
        # Print the database names as a numbered table.
        self.pr_datas(all_db_names)
        db_name_num = input('\r' + '請選擇數據庫對應序號:')
        db_name = self.choose_db_or_col(all_db_names, db_name_num, '數據庫')
        while True:
            db = client[db_name]
            all_col = db.list_collection_names()
            if all_col:
                break
            # A database without collections is treated as non-existent.
            db_name = input('沒有這個數據庫,請重新輸入:')
        # Print the collection names as a numbered table.
        self.pr_datas(all_col)
        col_num = input('\r' + '請選擇集合對應的序號:')
        col_name = self.choose_db_or_col(all_col, col_num, '集合')
        collection = db[col_name]
        return collection, db

    @staticmethod
    def choose_ip():
        """Ask the user which of the three configured MongoDB hosts to use.

        :return: the chosen host IP as a string
        """
        hosts = {'1': '127.0.0.1', '2': '192.168.0.123', '3': '192.168.0.200'}
        choice = input('請選擇數據庫:' + '\n' + '1. 本地數據庫 2. 123數據庫 3. 200數據庫' + '\n')
        while choice not in hosts:
            choice = input('請正確輸入1 或者 2 或者 3:')
        return hosts[choice]

    @staticmethod
    def choose_db_or_col(name, name_num, d_or_c):
        """Map a 1-based number typed by the user to an entry of ``name``.

        Keeps prompting until the input is an integer between 1 and
        ``len(name)``.

        :param name: list of database or collection names
        :param name_num: the user's first answer (string)
        :param d_or_c: label used in the re-prompt message
        :return: the selected entry of ``name``
        :raises ValueError: if ``name`` is empty (no valid answer exists)
        """
        if not name:
            raise ValueError('no %s available to choose from' % d_or_c)
        while True:
            try:
                index = int(name_num)
            except (TypeError, ValueError):
                index = 0  # not a number: force a re-prompt
            if 1 <= index <= len(name):
                return name[index - 1]
            name_num = input('請輸入正確的%s序號:' % d_or_c)

    @staticmethod
    def pr_datas(names):
        """Print ``names`` as a numbered table with three entries per row.

        :param names: list of database or collection names
        :return: None
        """
        for i, each_name in enumerate(names, 1):
            label = str(i) + '. ' + each_name
            # Each cell is padded to 42 characters.
            print(label.ljust(42), end='')
            if i % 3 == 0:
                print('\n')
        # Terminate an incomplete last row; otherwise the next prompt,
        # which starts with '\r', would overwrite it.
        if len(names) % 3 != 0:
            print('\n')

    def get_pdf_msg(self):
        """Put every PDF item returned by :meth:`choose_way` on the queue.

        :return: None
        """
        for item in self.choose_way():
            self.q.put(item)

    def choose_way(self):
        """Ask where the PDF URLs come from and collect them.

        Sets the module globals ``is_txt`` (items are plain URL strings),
        ``is_insert`` (``'y'`` when download records are to be stored) and,
        if requested, ``new_collection`` (target collection for records).

        :return: list of URL strings, or of ``{'pdf_url', 'pdf_name'}``
            dicts when a name field was given in database mode
        """
        global is_txt, is_insert, new_collection
        print('*' * 100)
        print(
            '注意事項:' + '\n' + '1. 如果選擇了txt文件方式下載,txt內容必須為一行一個url格式' + '\n' +
            '2. 如果選擇了從數據庫中讀取PDF信息下載,則數據庫中包含兩個字段(a. 存放url的字段,b. 該PDF的名字字段(可以沒有)),其中存放url的字段不能嵌套,只能有一個url' + '\n' +
            '3. txt文件只有鏈接,默認取鏈接最后 "/" 后的值作為PDF名' + '\n' +
            '4. 數據庫方式如果沒有指定PDF名字,則默認以鏈接最后一個 / 后的內容作為PDF名')
        print('*' * 100)
        the_way = input('請選擇PDF的來源(輸入1 / 2):' + '\n' + '1. txt文件 2. 從數據庫中讀取' + '\n')
        while the_way not in ('1', '2'):
            the_way = input('請正確輸入1 或者 2:')
        urls = []
        if the_way == '1':
            # txt file: one URL per line
            is_txt = True
            print('說明:txt文件中必須為一行一個url!')
            txt_name = input('請輸入與該程序同級目錄下的txt文件名稱(如:123.txt):')
            while True:
                try:
                    with open(txt_name, 'r') as f:
                        # Drop surrounding whitespace and skip blank lines
                        # so no empty URL ends up on the queue.
                        urls = [line.strip() for line in f if line.strip()]
                    break
                except FileNotFoundError:
                    txt_name = input('沒有這個txt文件,請重新輸入:')
        else:
            # database
            collection, db = self.client_dbs()
            url_field = input('請輸入PDF的url字段名:')
            name_field = input('請輸入PDF的name字段名(如果沒有則不輸入):')
            if not name_field:
                # Without a name field the items are plain URLs, exactly
                # as in txt mode.
                is_txt = True
                for d in collection.find():
                    if d.get(url_field):
                        urls.append(d[url_field])
            else:
                is_txt = False
                for d in collection.find():
                    if d.get(url_field):
                        urls.append({'pdf_url': d[url_field],
                                     'pdf_name': d.get(name_field)})
        # Ask whether the download records should be written to a database.
        is_insert = input('是否要將下載的PDF信息寫入到新的數據庫(輸入y / n,不輸入或輸入其他默認不寫入):')
        if is_insert == 'y':
            print('選擇插入數據庫,字段包含(pdf_url, md5_name, relative_path)')
            ip = self.choose_ip()
            client = self._connect(ip)
            db_name = input('請輸入要存入的數據庫名(可新建):')
            new_col = input('請輸入要插入的集合名稱:')
            new_collection = client[db_name][new_col]
        return urls

    @staticmethod
    def pdf_name_md5(pdf):
        """Return the md5 hex digest of the UTF-8 encoded PDF name.

        :param pdf: PDF name (string)
        :return: 32-character hexadecimal digest
        """
        return hashlib.md5(pdf.encode('utf-8')).hexdigest()
# Script entry point: build the downloader and run the interactive session.
if __name__ == '__main__':
    download = DownloadPDF()
    download.run()