
數據處理入門
1 簡介
NumPy 底層由 C 語言直接編寫,在 TensorFlow 等應用中使用更方便、快速。pandas 是基於 NumPy 的升級版。
2 安裝
3 基本屬性
import numpy as np

# Build a 2x3 integer matrix. The comma between the two rows is required:
# without it Python reads `[1,2,3][2,5,8]` as indexing a list with a tuple
# and raises TypeError.
array = np.array([[1, 2, 3],
                  [2, 5, 8]], dtype=int)
print(array.ndim)   # number of dimensions
print(array.shape)  # (rows, columns)
print(array.size)   # total number of elements
4 創建數組array
import numpy as np

# 3x4 matrix filled with zeros
zero_array = np.zeros(shape=(3, 4))
# 2x4 integer matrix where every element is 1
one_array = np.ones(shape=(2, 4), dtype=int)
# 3x2 array whose contents are left uninitialised; the result is discarded
np.empty(shape=(3, 2))
# the integers 0..11 in order, laid out as 2 rows by 6 columns
range_array = np.reshape(np.arange(12), (2, 6))
# 5 evenly spaced numbers covering the interval from 0 to 10
line_arr = np.linspace(start=0, stop=10, num=5)
5 基礎運算
import numpy as np

a = np.array([[10, 20], [1, 0]])
b = np.arange(4).reshape(2, 2)
# element-wise product
c = np.multiply(a, b)
# matrix product
c_dot = a.dot(b)
print(c)
print(c_dot)

# `a` is rebound to a 2x4 array of random samples
a = np.random.random((2, 4))
print(a)
print(a.sum())        # sum over all elements
print(a.max(axis=1))  # maximum of each row
print(a.min(axis=0))  # minimum of each column
6 基礎運算2
# 14 down to 3 in steps of -1, arranged as 3 rows by 4 columns
a = np.reshape(np.arange(14, 2, -1), (3, 4))
print(a)
# values below 5 become 5, values above 9 become 9
print(a.clip(5, 9))
# mean of each column
print(a.mean(axis=0))
# mean of each row
print(a.mean(axis=1))
7 numpy索引
# 1-D array holding 3..14
a = np.arange(3, 15)
print('a', a)
print(a[3])

# same values viewed as 3 rows by 4 columns
b = a.reshape((3, 4))
print(b)
print(b[2, 1])   # row 2, column 1 (indices start at 0)
print(b[:, 2])   # every element of column 2
print(b[0])      # every element of row 0
print(b[:2, 0])  # column 0, rows 0 and 1 (the stop index 2 is excluded)
# Iterating over an array with for loops. The loop bodies must be indented,
# otherwise Python rejects the snippet with an IndentationError.
a = np.arange(3, 15).reshape((3, 4))
print(a)
print('row:')
for row in a:  # iterating a 2-D array yields one row at a time
    print(row)
print('column:')
for column in a.T:  # rows of the transpose are the columns of `a`
    print(column)
print('flat:')
print(a.flatten())  # all elements as a 1-D array
for item in a.flat:  # a.flat visits every element one by one
    print(item)
8 numpy array合并
a = np.array([1, 1, 1])
b = np.array([2, 2, 2])
# vertical stack: `b` is placed underneath `a`
c = np.vstack([a, b])
# horizontal stack: `b` is placed to the right of `a`
d = np.hstack([a, b])
print(c.shape, d.shape)
print(c)
print(d)
# insert a new leading (row) axis
print(a[None, :])
# insert a new trailing (column) axis
print(a[:, None])
# join several arrays end to end along axis 0
d = np.concatenate([a, b, b, a], axis=0)
print(d)
9 array分割
a = np.reshape(np.arange(12), (3, 4))
print(a)
# cut along the column axis into 2 equal arrays
column_halves = np.split(a, 2, axis=1)
print(column_halves)
# cut along the row axis into 3 equal arrays
row_parts = np.split(a, 3, axis=0)
print(row_parts)
# array_split allows unequal pieces: 4 columns divided into 3 arrays
uneven_parts = np.array_split(a, 3, axis=1)
print(uneven_parts)
10 numpy copy & deep copy
a = np.arange(4)
print(a)
# b, c and d are just additional names for the same array object as a
b = c = d = a
a[0] = 11
print(a)
# a, b, c and d are linked: a change through one name is visible through all
print(b is a)
d[1:3] = [22, 33]
print(c is a)
# copy() produces an independent array, so later changes to a do not reach b
b = a.copy()
a[3] = 15
print(a is b)
11 pandas基本介紹
import pandas as pd
import numpy as np

# A Series is a list-like column that is displayed together with its index.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive daily timestamps starting at 2018-03-04.
dates = pd.date_range('20180304', periods=6)
print(dates)

# A DataFrame is a labelled 2-D table: here the row index is `dates`
# and the column labels are a, b, c, d.
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list('abcd'))
print(df)

# Without explicit labels, rows and columns get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)
# Build a DataFrame from a dict: each key becomes a column, and scalar
# values are repeated for every row.
df2_columns = {
    'A': 1,
    'B': pd.Series(1, index=list(range(4)), dtype='float32'),
    'C': pd.Timestamp('20180102'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_columns)
print(df2)
print(df2.dtypes)      # dtype of each column
print(df2.index)       # row index
print(df2.columns)     # column labels
print(df2.values)      # underlying values
print(df2.describe())  # summary statistics
print(df2.T)           # transpose
# sort by column labels (axis=1) in descending order
print(df2.sort_index(axis=1, ascending=False))
# sort rows by the values in column E
print(df2.sort_values('E'))
12 pandas選擇數據
dates = pd.date_range('20180308', periods=6)
values = np.arange(24).reshape((6, 4))
df = pd.DataFrame(values, index=dates, columns=['a', 'b', 'c', 'd'])

# slicing with [] selects rows: by position, or by a range of index labels
first_rows = df[0:3]
rows_by_date = df['20180309':'20180311']
print(first_rows, rows_by_date)
# a column name (as key or as attribute) selects a column
print(df['a'], df.a)
# select one row by its index label
print(df.loc['20180309'])
# select all rows of the columns labelled b and c
print(df.loc[:, ['b', 'c']])
# select by integer position
print(df.iloc[3:5, 1:3])
# print(df.ix[:3,['a','d']])  # mixed label/position selection, deprecated
# boolean mask: keep the rows where column a is greater than 8
print(df[df.a > 8])
13 pandas設置值
# Overwrite single cells: first by integer position, then by labels.
df.iloc[2, 2] = 1111
df.loc['20180309', 'a'] = 2222
# Set column b to 0 in every row where column a is greater than 8.
# A single .loc call is used instead of chained indexing (df.b[mask] = 0),
# because chained assignment may write to a temporary object and leave df
# unchanged.
df.loc[df.a > 8, 'b'] = 0
14 如何處理丟失數據
dates = pd.date_range('20180308', periods=6)
# The values are created as floats so that NaN can be stored in any cell
# without relying on an implicit int -> float upcast during assignment.
df = pd.DataFrame(np.arange(24, dtype=float).reshape((6, 4)),
                  index=dates, columns=['a', 'b', 'c', 'd'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
# Drop rows containing NaN. how='any' drops a row with at least one NaN;
# how='all' drops a row only when every value in it is NaN.
print(df.dropna(axis=0, how='any'))
# Replace NaN with 0.
print(df.fillna(value=0))
# Boolean mask marking the missing cells. isnull must be called; printing
# the bare attribute would only show the bound method.
missing_mask = df.isnull()
print(missing_mask)
# True when at least one cell in the whole frame is NaN.
has_missing = bool(missing_mask.values.any())
print(has_missing)
15 導入導出數據
# Import: read a CSV file into a DataFrame
# ('filepath' is presumably a placeholder for a real path).
data = pd.read_csv(filepath_or_buffer='filepath')
# Export: write the DataFrame to disk in pickle format.
data.to_pickle(path='filepath')
16 合并concatenating
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# Vertical concatenation example (left disabled, as in the original notes):
# print(df1)
# print(df2)
# print(df3)
# result = pd.concat([df1, df2, df3], axis=0, ignore_index=True)  # axis=0 stacks rows
# print(result)

# join: one of 'inner' or 'outer'. 'inner' keeps only the columns present in
# both frames; the default is 'outer'.
df4 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['c', 'd', 'e', 'f'])
df5 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
res = pd.concat([df4, df5], axis=0, join='inner', ignore_index=True)
print(res)

# Side-by-side concatenation aligned on df4's index. pd.concat no longer
# accepts `join_axes`, so df5 is reindexed to df4's index beforehand.
res2 = pd.concat([df4, df5.reindex(df4.index)], axis=1)
print(res2)

# Appending rows. DataFrame.append is no longer available; pd.concat is the
# replacement.
res3 = pd.concat([df1, df2], ignore_index=True)
print(res3)
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# A Series is appended as one row by first turning it into a one-row frame.
res4 = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res4)
17 合并merge
# Merge two DataFrames on a single shared key column.
left = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'A': ['b0', 'b1', 'b2', 'b3'],
    'B': ['a0', 'a1', 'a2', 'a3'],
})
right = pd.DataFrame({
    'key': ['k0', 'k1', 'k2', 'k3'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
res = left.merge(right, on='key')
# print(res)

# Merge on two key columns at once.
left2 = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['b0', 'b1', 'b2', 'b3'],
    'B': ['a0', 'a1', 'a2', 'a3'],
})
right2 = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['c0', 'c1', 'c2', 'c3'],
    'D': ['d0', 'd1', 'd2', 'd3'],
})
res2 = left2.merge(right2, on=['key1', 'key2'], how='inner')
# print(res2)

# Merge side by side using the row index of both frames as the key.
res3 = left2.merge(right2, left_index=True, right_index=True, how='outer')
print(res3)
# When both tables have a column with the same name but different values and
# both must be kept, the suffixes parameter tells the two columns apart.
boys = pd.DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['k0', 'k0', 'k3'], 'age': [4, 5, 6]})
res4 = boys.merge(girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res4)
18 plot圖表
import matplotlib.pyplot as plt
# Line-style data: a Series of 1000 random samples turned into a running sum.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
# data.plot()
# plt.show()  # display the figure

# Matrix-style data: a DataFrame with 4 columns of random samples,
# accumulated column by column.
data2 = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
).cumsum()
print(data2.head())
# data2.plot()

# Two scatter series drawn on the same axes: the first call creates the
# axes, the second one reuses them through ax=.
ax = data2.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data2.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=ax)
plt.show()  # display the figure
# plot methods: 'bar','hist','box','kde','area','scatter','hexbin','pie'
感謝: