import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the transactions from "creditcard.csv" in the current working
# directory. The rest of the script reads its "Class", "Amount" and "Time"
# columns.
data = pd.read_csv("creditcard.csv")

# Rows per class label, most frequent label first.  The top-level
# ``pd.value_counts`` helper is deprecated, so the Series method is used.
a = data["Class"].value_counts()

# The same per-class counts, ordered by class label rather than by frequency.
count_classes = data["Class"].value_counts(sort=True).sort_index()
from sklearn.preprocessing import StandardScaler

# StandardScaler performs z-score standardisation (zero mean, unit variance).
# It expects a 2-D input, so the "Amount" values are reshaped into a single
# feature column first.  The scaled values are stored in a new "normAmount"
# column; the raw "Amount" column is left untouched at this point.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
)

# Drop "Time" and the raw "Amount" column; ``drop`` returns a new frame, which
# replaces ``data``.
data = data.drop(['Time', 'Amount'], axis=1)
# Split the frame into features and label.  ``DataFrame.ix`` was removed in
# pandas 1.0; ``.loc`` with a boolean column mask selects the same columns.
# X: every column except "Class".
X = data.loc[:, data.columns != 'Class']
# y: the "Class" column only, kept as a one-column DataFrame (not a Series).
y = data.loc[:, data.columns == 'Class']
# Random under-sampling: build a set of index labels that holds every fraud
# row (Class == 1) plus an equally sized random draw of normal rows
# (Class == 0).

# Index labels of the minority (fraud) class, as a NumPy array.
fraud_indices = np.array(data.loc[data["Class"] == 1].index)

# Number of data points in the minority class.
number_records_fraud = len(fraud_indices)

# Index labels of the majority (normal) class.
normal_indices = data.loc[data["Class"] == 0].index

# Draw as many normal labels as there are fraud rows, without replacement, so
# no normal row is picked twice.  ``np.random.choice`` already returns a NumPy
# array.
random_normal_indices = np.random.choice(
    normal_indices,
    size=number_records_fraud,
    replace=False,
)

# Join both 1-D label arrays end to end: fraud labels first, then the sampled
# normal labels.
under_sample_indices = np.concatenate((fraud_indices, random_normal_indices))
# Under-sampled dataset.
# ``under_sample_indices`` holds index *labels* (taken from ``.index``), so
# the rows must be selected with the label-based ``.loc``.  The positional
# ``.iloc`` only gives the same rows when the index happens to be 0..n-1.
under_sample_data = data.loc[under_sample_indices, :]

# Split the under-sampled frame into features and label.  ``DataFrame.ix`` was
# removed in pandas 1.0; ``.loc`` with a boolean column mask is equivalent.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
# Report the class balance of the under-sampled frame.
resampled_total = len(under_sample_data)
normal_rows = under_sample_data[under_sample_data["Class"] == 0]
fraud_rows = under_sample_data[under_sample_data["Class"] == 1]

# Each share is the class row count divided by the total row count.
print("Percentage of normal transactions: ", len(normal_rows) / resampled_total)
print("Percentage of fraud transactions: ", len(fraud_rows) / resampled_total)
print("Total number of transactions in resampled data: ", resampled_total)
# Train/test splitting helper from scikit-learn.
from sklearn.model_selection import train_test_split

# Whole dataset.
# Randomly split X (all columns except "Class") and y (the "Class" column)
# into train and test parts: test_size=0.3 sends 30% of the rows to the test
# set, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Undersampled dataset.
# Split the balanced (under-sampled) features and labels with the same
# test_size and random_state as the whole-dataset split.
(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_undersample,
    y_undersample,
    test_size=0.3,
    random_state=0,
)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))