# Install the data-analysis libraries (the author uses macOS with Python 2.7).
# Run the following `pip install` commands in a terminal to install each package.
sudo pip install numpy
sudo pip install scipy
sudo pip install matplotlib
sudo pip install scikit-learn
# pandas is imported by the script (as pd), so it has to be installed as well.
sudo pip install pandas
# Import the packages
import math
import pandas as pd
import numpy as np
import scipy
import matplotlib
import sklearn
# Column names for the headerless data file: an ID column, nine feature
# columns (indices 1-9) and the class label (index 10).
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the CSV file (it has no header row, so the names are supplied explicitly).
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

# Replace the '?' placeholder with the standard missing-value marker.
data = data.replace(to_replace='?', value=np.nan)

# Drop every row that has a missing value in at least one column.
data = data.dropna(how='any')

# Split the data into a 25% test set and a 75% training set.
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy module so older installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)

# Check the sample counts and class distribution of both splits.
# print(...) with a single argument behaves the same on Python 2.7 and 3.
print(y_train.value_counts())
print(y_test.value_counts())