2.1 基礎Python與pandas
讀寫csv文件
基礎Python,不使用csv
import sys
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as filereader:
with open(output_file, 'w', newline='') as filewriter:
header = filereader.readline()
header = header.strip()
header_list = header.split(',')
print(header_list)
filewriter.write(','.join(map(str, header_list))+'\n')
for row in filereader:
row = row.strip()
row_list = row.split(',')
print(row_list)
filewriter.write(','.join(map(str, row_list))+'\n')
-
join()方法將列表元素以指定字符串相連。
seq = ['a', 'b', 'c', 'd']
return(','.join(seq))
>>>a,b,c,d
-
map()函數根據提供函數對指定序列做映射。
def square(x):
return x ** 2
return(map(square, [1, 2, 3, 4, 5]))
>>>[1, 4, 9, 16, 25]
-
lambda匿名函數。
f = lambda a,b,c: a+b+c
return(f(1, 2, 3))
>>>6
使用pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
print(data_frame)
data_frame.to_csv(output_file, index=False)
- 在
pandas中,使用數據框保留了“表格”的形式。
基礎Python,使用csv模塊
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
for row_list in filereader:
print(row_list)
filewriter.writerow(row_list)
- 使用csv模塊時候,盡管在csv的數字內增加了 ‘ , ’ 以表示數字位數,但csv會自動使用 “ ” 將內容括起。
2.2 篩選特定的行
行中的值滿足某個條件
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
supplier = str(row[0]).strip()
cost = str(row[3]).strip('$').replace(',', '')
if supplier.lower() == 'supplier Z'.lower() or float(cost) > 600.0:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame['Cost'] = data_frame['Cost'].str.strip('$').astype(float)
data_frame_value_meets_condition = data_frame.loc[(data_frame['Supplier Name']
.str.contains('Z')) | (data_frame['Cost'] > 600.0), :]
data_frame_value_meets_condition.to_csv(output_file, index=False)
-
contains()函數,搜索包含參數的字符串。 -
loc()函數接受兩個參數,第一個為行第二個為列。 - DataFrame中,如果需要設定多個條件,將需要的邏輯順序用"$"和"|"連接,分別為兩個都必須為真和一個為真就可以。
行中的值屬于某個集合
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
important_dates = ['1/20/14', '1/30/14']
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
a_date = row[4]
if a_date in important_dates:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
important_dates = ['1/20/14', '1/30/14']
data_frame_value_in_set = data_frame.loc[(data_frame['Purchase Date'].isin(important_dates)), :]
data_frame_value_in_set.to_csv(output_file, index=False)
-
isin()函數,搜索在集合內的參數。
行中的值匹配于某個模式/正則表達式
基礎Python
import sys
import csv
import re
input_file = sys.argv[1]
output_file = sys.argv[2]
my_pattern = re.compile(r'(?P<pattern_group>^001-.*)', re.I)
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv, delimiter=',')
filewriter = csv.writer(output_csv, delimiter=',')
header = next(filereader)
filewriter.writerow(header)
for row in filereader:
invoice_number = row[1]
if my_pattern.search(invoice_number):
filewriter.writerow(row)
- 正則表達式
(r'(?P<pattern_group>^001-.*)', re.I)內,^為插入符號,代表只在開頭搜索;.句點表示可匹配除換行符外的任何字符;*表示重復前面字符0次或更多次,.*組合在一起表示除換行符外的任意字符可以在“001-”后面出現任意次。
Pandas
import sys
import re
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_value_matches_pattern = data_frame.loc[data_frame['Invoice Number'].
str.startswith('001-'), :]
data_frame_value_matches_pattern.to_csv(output_file, index=False)
-
startswith()函數,搜索以參數為開頭的內容。
2.3 選取特定的列
列索引值
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
my_column = [0, 3]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
for row in filereader:
output_row = []
for index_value in my_column:
output_row.append(row[index_value])
filewriter.writerow(output_row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_column_by_index = data_frame.iloc[:, [0,3]]
data_frame_column_by_index.to_csv(output_file, index=False)
-
iloc()函數只能根據行號和列號進行索引(0、1、-1),loc()函數可以根據切片、名稱等,但不能使用不存在的索引如-1。
列標題
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
my_columns = ['Invoice Number', 'Purchase Date']
my_columns_index = []
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
header = next(filereader, None)
for index_value in range(len(header)):
if header[index_value] in my_columns:
my_columns_index.append(index_value)
filewriter.writerow(my_columns)
for row in filereader:
row_list_output = []
for index_value in my_columns_index:
row_list_output.append(row[index_value])
filewriter.writerow(row_list_output)
- 使用
next()讀取文件第一行,獲取對應名稱的索引值。
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file)
data_frame_column_by_name = data_frame.loc[:, ['Invoice Number', 'Purchase Date']]
data_frame_column_by_name.to_csv(output_file, index=False)
2.4 選取連續(xù)的行
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
row_count = 0
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
for row in filereader:
if row_count >= 3 and row_count <= 15:
filewriter.writerow(row)
row_count += 1
- 使用
row_count變量進行計數,以選取索引值為3-15(表格中4-16)行的內容。
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
data_frame = pd.read_csv(input_file, header=None)
data_frame = data_frame.drop([0, 1, 2, 16, 17, 18])
data_frame.columns = data_frame.iloc[0]
data_frame = data_frame.reindex(data_frame.index.drop(3))
data_frame.to_csv(output_file, index=False)
難點在于頭行被注釋占據,因此直接使用
iloc()和loc()函數選取4-16行的內容會出現IndexError: positional indexers are out-of-bounds的錯誤內容。導入csv時給予參數
header=None,使用drop()函數丟棄前三行和后三行。data_frame.columns = data_frame.iloc[0]根據此時索引值為0的行作為列索引,最后使用data_frame = data_frame.reindex(data_frame.index.drop(3))將標簽為3的行丟棄并重新生成索引。
2.5 添加標題行
基礎Python
import sys
import csv
input_file = sys.argv[1]
output_file = sys.argv[2]
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'w', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
headers = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
filewriter.writerow(headers)
for row in filereader:
filewriter.writerow(row)
Pandas
import sys
import pandas as pd
input_file = sys.argv[1]
output_file = sys.argv[2]
headers = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
data_frame = pd.read_csv(input_file, header=None, names=headers)
data_frame.to_csv(output_file, index=False)
-
data_frame = pd.read_csv(input_file, header=None, names=headers)直接將現有列表設置為列標題行。
2.6 讀取多個csv文件
文件計數與文件中的行列計算
import sys
import glob
import csv
import os
input_path = sys.argv[1]
file_counter = 0
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
row_counter = 1
with open(input_file, 'r', newline='') as csv_file:
filereader = csv.reader(csv_file)
header = next(filereader, None)
for row in filereader:
row_counter += 1
print(f"{os.path.basename(input_file)}:\t{row_counter} rows \t{len(header)} columns")
file_counter += 1
print(f"Number of files: {file_counter}.")
- 使用
file_counter變量計算文件數,使用row_counter計算文件內行數。 -
os模塊包含一系列路徑名函數,os.path.join函數將這個文件夾路徑和所有符合特定模式的文件名連接起來。代碼中是以“sales_*"為開頭的文件,實際情況中要獲取路徑內所有csv文件則為“*.csv"。打印(os.path.join(input_path, 'sales_*')的內容,結果為字符串:>>>input_files\sales_*。 -
glob模塊可以找出與特定模式匹配的所有路徑名,glob.glob()函數擴展os.path.join函數獲取的內容,生成一個包含每個文件的列表。 -
os.path.basename()函數從文件的完整路徑名中抽出基本文件名。
2.7 從多個文件中連接數據
基礎Python
import sys
import glob
import csv
import os
input_path = sys.argv[1]
output_file = sys.argv[2]
first_file = True
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
print(os.path.basename(input_file))
with open(input_file, 'r', newline='') as input_csv:
with open(output_file, 'a', newline='') as output_csv:
filereader = csv.reader(input_csv)
filewriter = csv.writer(output_csv)
if first_file:
for row in filereader:
filewriter.writerow(row)
first_file = False
else:
header = next(filereader, None)
for row in filereader:
filewriter.writerow(row)
- 使用
first_file布爾值變量,僅僅獲取第一個文件的第一行,后續(xù)文件使用next()函數處理掉文件首行并追加進文件。
Pandas
import sys
import glob
import os
import pandas as pd
input_path = sys.argv[1]
output_file = sys.argv[2]
all_files = glob.glob(os.path.join(input_path, 'sales_*'))
all_data_frame = []
for file in all_files:
data_frame = pd.read_csv(file, index_col=None)
all_data_frame.append(data_frame)
data_frame_concat = pd.concat(all_data_frame, axis=0, ignore_index=True)
data_frame_concat.to_csv(output_file, index=False)
concat()方法用于連接兩個或多個數組。該方法不會改變現有的數組,而僅僅會返回被連接數組的一個副本。代碼中將所有數據框連接成為一個數據框,axis參數用于設置方法,0表示垂直堆疊,1表示并排平行堆疊NumPy模塊也提供了一些函數用于連接數據,通常導入為np。垂直連接使用np.concatenate([array1, array2], axis=0),np.vstack((array1, array2))或np.r_[array1, array2]實現;平行連接使用np.concatenate([array1, array2], axis=1),np.hstack((array1, array2))或np.c_[array1, array2]。==Pandas提供了類似SQL join操作的
merge()函數:pd.merge(DataFrame1, DataFrame2, on='key', how='inner')。==
2.8 計算每個文件中值的總和與均值
基礎Python
import sys
import glob
import csv
import os
input_path = sys.argv[1]
output_file = sys.argv[2]
output_headers = ['file_name', 'total_sales', 'average_sales']
with open(output_file, 'a', newline='') as output_csv:
filewriter = csv.writer(output_csv)
filewriter.writerow(output_headers)
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
with open(input_file, 'r', newline='') as input_csv:
filereader = csv.reader(input_csv)
header = next(filereader, None)
amounts = []
number_of_sales = 0.0
for row in filereader:
amount = float(str(row[3]).strip('$').replace(',', ''))
amounts.append(amount)
number_of_sales += 1.0
total_amount = sum(amounts)
print(type(total_amount))
print(type(number_of_sales))
average_amount = total_amount / number_of_sales
filewriter.writerow([os.path.basename(input_file), total_amount, average_amount])
- 主要是將
').replace(',', ''))``轉化為浮點數進行運算。
Pandas
import sys
import glob
import os
import pandas as pd
input_path = sys.argv[1]
output_file = sys.argv[2]
all_files = glob.glob(os.path.join(input_path, 'sales_*'))
all_data_frames = []
for input_file in all_files:
data_frame = pd.read_csv(input_file, index_col=None)
total_sales = pd.DataFrame([float(str(value).strip('$').replace(',', ''))\
for value in data_frame.loc[:, 'Sale Amount']]).sum()
average_sales = pd.DataFrame([float(str(value).strip('$').replace(',', ''\
)) for value in data_frame.loc[:, 'Sale Amount']]).mean()
data = {'file_name': os.path.basename(input_file),
'total_sales': total_sales,
'average_sales': average_sales}
all_data_frames.append(pd.DataFrame(data,\
columns=['file_name', 'total_sales', 'average_sales']))
data_frames_concat = pd.concat(all_data_frames, axis=0, ignore_index=True)
data_frames_concat.to_csv(output_file, index=False)
- 注意17-21行的建立DataFrame的代碼,通過字典的形式。