一句話讓豆.包幫我將json文件提取數(shù)據(jù)到excel
創(chuàng)建一個excel表格為baby.xlsx,標題頭含有"id", "picture", "video", "create_at"標題頭。
打開附件中每一個json文件,全文搜索"list"列表下,將對應(yīng)的"id", "picture", "video", "create_at"數(shù)據(jù)填到對應(yīng)的表格,以"id"為主鍵,遇到空數(shù)據(jù)填"None",保存文件并下載
直接生成
import os
import json
import pandas as pd
from typing import List, Dict
def read_json_files(folder_path: str) -> List[Dict]:
"""
讀取指定文件夾下所有JSON文件,提取list列表中的目標字段
"""
all_data = []
# 遍歷文件夾下所有文件
for filename in os.listdir(folder_path):
if filename.endswith(".json"):
file_path = os.path.join(folder_path, filename)
try:
with open(file_path, 'r', encoding='utf-8') as f:
json_data = json.load(f)
# 遞歸查找所有"list"鍵對應(yīng)的列表(兼容不同層級的list)
def find_list(obj):
if isinstance(obj, dict):
for k, v in obj.items():
if k == "list" and isinstance(v, list):
return v
else:
res = find_list(v)
if res:
return res
elif isinstance(obj, list):
for item in obj:
res = find_list(item)
if res:
return res
return None
list_data = find_list(json_data)
if list_data:
# 提取目標字段,空值填充None
for item in list_data:
row = {
"id": item.get("id", "None"),
"picture": item.get("picture", "None"),
"video": item.get("video", "None"),
"create_at": item.get("create_at", "None")
}
all_data.append(row)
except Exception as e:
print(f"處理文件 {filename} 出錯: {str(e)}")
return all_data
def create_excel(data: List[Dict], output_file: str = "baby.xlsx"):
"""
將數(shù)據(jù)寫入Excel,以id為主鍵去重
"""
# 轉(zhuǎn)換為DataFrame
df = pd.DataFrame(data)
# 以id為主鍵去重(保留第一個出現(xiàn)的id)
df = df.drop_duplicates(subset=["id"], keep="first")
# 確保列順序正確
df = df[["id", "picture", "video", "create_at"]]
# 保存為Excel文件
df.to_excel(output_file, index=False)
print(f"Excel文件已保存至: {os.path.abspath(output_file)}")
if __name__ == "__main__":
# 請?zhí)鎿Q為你的JSON文件所在文件夾路徑
json_folder_path = "/Users/leo/Documents/baby_young/album" # 示例路徑,需自行修改
# 讀取所有JSON文件的目標數(shù)據(jù)
data_list = read_json_files(json_folder_path)
# 創(chuàng)建并保存Excel文件
create_excel(data_list, "baby.xlsx")
然后將數(shù)據(jù)用requests下載,告訴豆.包
寫一個python3腳本:提取excel中列名"create_at",將":"改為"-"作為文件名,循環(huán)遍歷每一行,如果"video"字段不為空,則下載視頻保存,否則下載"picture"保存。每一行下載時間隔3秒。
import pandas as pd
import requests
import os
import time
from urllib.parse import urlparse
def download_media_from_excel(excel_path, save_dir="downloaded_media"):
"""
從Excel文件下載視頻或圖片
Args:
excel_path (str): Excel文件路徑
save_dir (str): 媒體文件保存目錄,默認創(chuàng)建"downloaded_media"文件夾
"""
# 1. 創(chuàng)建保存目錄(不存在則自動創(chuàng)建)
if not os.path.exists(save_dir):
os.makedirs(save_dir)
print(f"已創(chuàng)建保存目錄:{save_dir}")
# 2. 讀取Excel文件
try:
df = pd.read_excel(excel_path)
print(f"成功讀取Excel文件,共{len(df)}行數(shù)據(jù)")
except Exception as e:
print(f"讀取Excel失?。簕str(e)}")
return
# 3. 檢查必要列是否存在
required_cols = ["create_at", "video", "picture"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
print(f"Excel文件缺少必要列:{', '.join(missing_cols)}")
return
# 4. 循環(huán)遍歷每一行數(shù)據(jù)
for index, row in df.iterrows():
try:
# 4.1 處理文件名(替換":"為"-")
create_at = str(row["create_at"]).strip()
file_name = create_at.replace(":", "-")
if not file_name:
file_name = f"unknown_{index}" # 處理空值情況
print(f"\n正在處理第{index+1}行,文件名為:{file_name}")
# 4.2 優(yōu)先處理視頻下載
video_url = str(row["video"]).strip()
if video_url and video_url.lower() != "nan":
# 獲取視頻文件后綴(默認mp4)
parsed_url = urlparse(video_url)
file_ext = os.path.splitext(parsed_url.path)[1] or ".mp4"
full_path = os.path.join(save_dir, f"{file_name}{file_ext}")
# 下載視頻
download_file(video_url, full_path)
continue # 視頻下載成功,跳過圖片處理
# 4.3 處理圖片下載(視頻為空時)
picture_url = str(row["picture"]).strip()
if picture_url and picture_url.lower() != "nan":
# 獲取圖片文件后綴(默認jpg)
parsed_url = urlparse(picture_url)
file_ext = os.path.splitext(parsed_url.path)[1] or ".jpg"
full_path = os.path.join(save_dir, f"{file_name}{file_ext}")
# 下載圖片
download_file(picture_url, full_path)
else:
print("該行video和picture字段均為空,跳過下載")
# 4.4 每行下載后間隔3秒
print("等待3秒后處理下一行...")
time.sleep(3)
except Exception as e:
print(f"處理第{index+1}行時出錯:{str(e)}")
time.sleep(3) # 出錯后仍保持間隔,避免影響后續(xù)行
def download_file(url, save_path):
"""
通用文件下載函數(shù)
Args:
url (str): 待下載文件的URL
save_path (str): 保存路徑(含文件名)
"""
try:
# 發(fā)送請求(設(shè)置超時10秒,添加用戶代理模擬瀏覽器)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10, stream=True)
response.raise_for_status() # 若狀態(tài)碼非200,拋出異常
# 寫入文件(分塊下載,支持大文件)
with open(save_path, "wb") as f:
for chunk in response.iter_content(chunk_size=1024*1024): # 1MB分塊
if chunk:
f.write(chunk)
print(f"成功保存文件:{os.path.basename(save_path)}")
except Exception as e:
# 若下載失敗,刪除空文件
if os.path.exists(save_path):
os.remove(save_path)
raise Exception(f"下載失?。簕str(e)}")
# ------------------- 腳本執(zhí)行入口 -------------------
if __name__ == "__main__":
# 請根據(jù)實際情況修改Excel文件路徑
EXCEL_FILE_PATH = "/Users/leo/Documents/baby_young/album/baby.xlsx" # Excel文件路徑
SAVE_DIRECTORY = "/Users/leo/Documents/baby_young/album/baby_media" # 自定義保存目錄
# 啟動下載
print("開始執(zhí)行媒體文件下載任務(wù)...")
download_media_from_excel(EXCEL_FILE_PATH, SAVE_DIRECTORY)
print("\n所有行處理完畢!")