實現功能:
Python數據分析實戰-數值型特征和類別型特征歸一化編碼操作
實現代碼:
import warnings

import pandas as pd

# Suppress every warning so the printed tables are not interleaved with them.
warnings.filterwarnings("ignore")

# Raw string: the Windows path contains backslashes followed by ordinary
# characters (\d, \k), which are invalid escape sequences in a normal literal.
# The resulting string value is unchanged.
# NOTE(review): the directory name looks garbled by a text conversion step --
# confirm the real path on the target machine.
df = pd.read_csv(r"E:\數(shù)據(jù)雜壇\datasets\kidney_disease.csv")
df = pd.DataFrame(df)

# Show all rows and do not wrap wide frames when printing.
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# The row identifier is dropped before any further processing.
df.drop("id", axis=1, inplace=True)
print(df.head())

# Collapse the target to two labels: anything that is not exactly "notckd"
# (including missing values and variant spellings) becomes "ckd".
df["classification"] = df["classification"].apply(
    lambda x: x if x == "notckd" else "ckd"
)

# Names of the numeric columns (every column whose dtype is not object).
num_cols = [col for col in df.columns if df[col].dtype != "object"]
# Names of the categorical columns (object dtype).
cat_cols = [col for col in df.columns if df[col].dtype == "object"]

# Missing-value count per column, largest first.
print(df.isnull().sum().sort_values(ascending=False))
# ====================== Missing-value handling ============================
def random_value_imputate(col, data=None):
    """Fill the missing values of one column with randomly sampled observed values.

    Intended for columns with many missing values. The frame is modified
    in place; nothing is returned.

    Args:
        col: Name of the column to impute.
        data: DataFrame to modify. Defaults to the module-level ``df`` so
            existing one-argument calls keep working.

    Raises:
        ValueError: If the column has missing values but no observed value
            to sample from.
    """
    frame = df if data is None else data

    missing_mask = frame[col].isna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return

    observed = frame[col].dropna()
    if observed.empty:
        raise ValueError(
            f"cannot impute column {col!r}: it has no non-missing values"
        )

    # 1. Draw as many observed values as there are gaps. Sampling without
    #    replacement fails when gaps outnumber observed values, so fall back
    #    to sampling with replacement only in that case.
    random_sample = observed.sample(n_missing, replace=n_missing > len(observed))
    # 2. Re-label the drawn values with the index of the missing rows so the
    #    assignment below aligns them onto exactly those rows.
    random_sample.index = frame.index[missing_mask.to_numpy()]
    # 3. Write the drawn values into the gaps.
    frame.loc[missing_mask, col] = random_sample
def mode_impute(col, data=None):
    """Fill the missing values of one column with the column's mode.

    The frame is modified in place; nothing is returned.

    Args:
        col: Name of the column to impute.
        data: DataFrame to modify. Defaults to the module-level ``df`` so
            existing one-argument calls keep working.
    """
    frame = df if data is None else data

    # 1. Determine the mode; when several values tie, the first one
    #    returned by ``Series.mode`` is used.
    mode = frame[col].mode()[0]
    # 2. Replace every missing entry with that value.
    frame[col] = frame[col].fillna(mode)
# Numeric columns: fill gaps with values sampled at random from the column.
for col in num_cols:
    random_value_imputate(col)

# Categorical columns: 'rbc' and 'pc' are filled by random sampling, every
# other categorical column with its mode.
for col in cat_cols:
    if col in ('rbc', 'pc'):
        # Impute only the column being visited; each of the two columns is
        # handled when the loop reaches it.
        random_value_imputate(col)
    else:
        mode_impute(col)

# Re-check the missing-value counts and preview the imputed frame.
print(df.isnull().sum().sort_values(ascending=False))
print(df.head())
# ====================== Feature encoding ============================
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Rescale all numeric columns together with a min-max scaler
# (default MinMaxScaler settings).
mms = MinMaxScaler()
df[num_cols] = mms.fit_transform(df[num_cols])

# Replace each categorical column with integer codes. The encoder is refitted
# on every column, so after the loop `led` only holds the mapping of the last
# column processed.
led = LabelEncoder()
for col in cat_cols:
    df[col] = led.fit_transform(df[col])

print(df.head())
實現效果: