一、泰坦尼克號數據準備
import pandas as pd
import numpy as np
# SelectKBest: selects the K features that most influence the result
from sklearn.feature_selection import SelectKBest
# chi2: chi-squared test, passed to SelectKBest as its score function
from sklearn.feature_selection import chi2
# Load the Titanic training CSV, then keep an independent copy of only the
# identifier, label and candidate feature columns used by the rest of the script.
df = pd.read_csv(r"D:\node\nd\Pandas_study\pandas_test\titanic_train.csv")
df = df.loc[
    :,
    ["PassengerId", "Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"],
].copy()
print(df.head())

image.png
二、數據清理和轉換
1、查看是否有空值的列
# DataFrame.info() writes its summary (non-null counts and dtypes per column)
# to stdout itself and returns None, so wrapping it in print() only appended
# a stray "None" line after the report.
df.info()

image.png
2、給Age列的空值填充中位數
# Age has missing values: fill them with the column median (not the mean).
median_age = df["Age"].median()
df["Age"] = df["Age"].fillna(median_age)
3、將性別列變成數字
# Encode Sex as integers: male -> 0, female -> 1.
# Series.map rebuilds the whole column with a numeric dtype in one pass.
# Assigning ints into the string column through .loc left it as object dtype,
# and that kind of assignment does not work when the column uses pandas'
# dedicated string dtype.
# Any label other than "male"/"female" becomes NaN instead of staying a string.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
print(df.head())

image.png
4、給Embarked列填充空值,字符串轉換成數字
# Show the distinct raw Embarked values (any NaN included) before encoding.
a = df["Embarked"].unique()
print(a)
# Encode the port codes as integers: S -> 1, C -> 2, Q -> 3, missing -> 0.
# Mapping first and filling afterwards keeps every step on a numeric column:
# the original filled the string column with the int 0 and then overwrote
# strings with ints via .loc, which left an object-dtype column and does not
# work with pandas' dedicated string dtype.
# NOTE: a code other than S/C/Q is also mapped to 0, together with missing values.
df["Embarked"] = df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).fillna(0).astype(int)
print(df.head())

image.png
四、將特征列和結果列拆分開
# Target column: pop() removes "Survived" from df in place and returns it.
y = df.pop("Survived")
# Feature matrix: everything that is left. X is the same object as df, not a copy.
# NOTE(review): PassengerId is still among these columns, so it will be scored
# as a feature below — confirm that is intended.
X = df
for part in (y, X):
    print(part.head())

image.png
五、使用卡方檢驗選擇topK的特征
# Score the features with the chi-squared test. k is set to the number of
# columns in X, so no feature is filtered out; the aim here is only to obtain
# a score per column that can be ranked afterwards.
bestfeatures = SelectKBest(chi2, k=X.shape[1])
# `fit` holds whatever bestfeatures.fit() returns; its scores_ attribute is read later.
fit = bestfeatures.fit(X, y)
print(fit)

image.png
六、按照重要性順序打印特征列表
1、返回每一列分數
# The fitted selector exposes the per-column scores as `scores_`;
# wrap them in a single-column DataFrame.
df_scores = pd.DataFrame(data=fit.scores_)
print(df_scores)

image.png
2、把每一列的列名變成新的df
# Turn the feature names of X into a one-column DataFrame, in column order.
feature_names = X.columns
df_columns = pd.DataFrame(feature_names)
print("X這個df的列是:",feature_names)
print(df_columns)

image.png
3、合并兩個df
# Place the scores and the feature names side by side (column-wise concat
# aligned on the row index).
df_feature_scores = pd.concat([df_scores, df_columns], axis="columns")
print(df_feature_scores)

image.png
4、設置合并后的df列名,查看最終結果
# Give the two columns of the combined frame descriptive names:
# first the score, then the feature it belongs to.
df_feature_scores.columns = ["Score", "feature_name"]
print(df_feature_scores)

image.png
5、對數據進行降序排列,可以得知哪些因素對結果有影響
# Rank the features from highest to lowest score.
df_sort = df_feature_scores.sort_values("Score", ascending=False)
print(df_sort)

image.png