# Import the pandas package under the alias pd.
import pandas as pd
# Read the training split of the breast-cancer data set into df_train.
df_train = pd.read_csv(
    '../Datasets/Breast-Cancer/breast-cancer-train.csv'
)
# Read the test split of the breast-cancer data set into df_test.
df_test = pd.read_csv(
    '../Datasets/Breast-Cancer/breast-cancer-test.csv'
)
# Keep only the 'Clump Thickness' and 'Cell Size' features and split the test
# set into negative (Type == 0) and positive (Type == 1) samples.
df_test_negative = df_test.loc[
    df_test['Type'] == 0, ['Clump Thickness', 'Cell Size']
]
df_test_positive = df_test.loc[
    df_test['Type'] == 1, ['Clump Thickness', 'Cell Size']
]
# Import pyplot from the matplotlib package under the alias plt.
import matplotlib.pyplot as plt
# Figure 1-2: benign tumour samples (the negative class) as red circles.
plt.scatter(
    x=df_test_negative['Clump Thickness'],
    y=df_test_negative['Cell Size'],
    s=200,
    c='red',
    marker='o',
)
# Figure 1-2: malignant tumour samples (the positive class) as black crosses.
plt.scatter(
    x=df_test_positive['Clump Thickness'],
    y=df_test_positive['Cell Size'],
    s=150,
    c='black',
    marker='x',
)
# Label the x and y axes with the feature names.
plt.xlabel(xlabel='Clump Thickness')
plt.ylabel(ylabel='Cell Size')
# Display Figure 1-2.
plt.show()
# Import the numpy package under the alias np.
import numpy as np
# Sample a random intercept and two random coefficients for a straight line
# using numpy's random() function.
intercept = np.random.random([1])
coef = np.random.random([2])
# x positions at which the line is evaluated: the integers 0..11.
lx = np.arange(0, 12)
# The line is lx * coef[0] + ly * coef[1] + intercept = 0. Solving for ly
# divides the WHOLE numerator by coef[1]; the parenthesis must therefore close
# before the division (the same form used for the fitted classifiers below).
ly = (-intercept - lx * coef[0]) / coef[1]
# Draw the random line.
plt.plot(lx, ly, c='yellow')
# Figure 1-3: the random line on top of the test samples.
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
# Import the logistic regression classifier from sklearn.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# Learn the line's coefficients and intercept from the first 10 training
# samples only.
lr.fit(df_train[['Clump Thickness', 'Cell Size']][:10], df_train['Type'][:10])
print('Testing accuracy(10 training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
intercept = lr.intercept_
coef = lr.coef_[0, :]
# The decision boundary is lx * coef[0] + ly * coef[1] + intercept = 0;
# expressed in the 2-D feature plane it becomes the line below.
ly = (-intercept - lx * coef[0]) / coef[1]
# Figure 1-4: decision line learned from 10 samples, over the test samples.
plt.plot(lx, ly, c='green')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
# Marker size 150 matches the malignant-sample markers in the other figures.
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()
lr = LogisticRegression()
# Learn the line's coefficients and intercept from all training samples.
lr.fit(df_train[['Clump Thickness', 'Cell Size']], df_train['Type'])
# The label now closes its parenthesis, matching the 10-sample message.
print('Testing accuracy(all training samples):', lr.score(df_test[['Clump Thickness', 'Cell Size']], df_test['Type']))
intercept = lr.intercept_
coef = lr.coef_[0, :]
# Decision boundary lx * coef[0] + ly * coef[1] + intercept = 0, solved for ly.
ly = (-intercept - lx * coef[0]) / coef[1]
# Figure 1-5: decision line learned from all samples, over the test samples.
# The line is drawn once; a second identical plt.plot call was redundant.
plt.plot(lx, ly, c='blue')
plt.scatter(df_test_negative['Clump Thickness'], df_test_negative['Cell Size'], marker='o', s=200, c='red')
plt.scatter(df_test_positive['Clump Thickness'], df_test_positive['Cell Size'], marker='x', s=150, c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
plt.show()

# Figure 1-5