日本久久黄色,99Re热这里在,韩国在线一二三区

%matplotlib inline
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
# Notebook display settings.
pd.set_option("display.max_columns",101)
pd.set_option('display.float_format', lambda x: '%.5f' % x) # show floats with 5 decimals instead of scientific notation, for readability
pd.options.display.max_rows = 15 # display at most 15 rows
import warnings
warnings.filterwarnings('ignore') # suppress warning pop-ups to keep the output tidy

import pandas as pd

# Read the training set, discard its first column, then keep only the rows
# whose age is at least 18.
df = pd.read_csv('cs-training.csv')
df = df.drop(columns=df.columns[0])

df = df.loc[df['age'] >= 18]

在債務違約預測之一：數據探索中，按各個屬性對借貸者分組，再分析其違約率。現在換一個角度，分為違約者和未違約者兩類，再查看兩組人群中各個屬性的分布。

# Treat every column after the first as a feature; the first column
# (SeriousDlqin2yrs in the output listings) is the class label used to split
# the two groups in the plotting cells.
features=df.columns[1:]

features

Index(['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents'],
      dtype='object')

# Draw one subplot per feature, stacked vertically, overlaying the histogram
# of defaulters (SeriousDlqin2yrs == 1, red) on that of non-defaulters
# (SeriousDlqin2yrs == 0, blue).
# The grid and the figure height are sized from the number of features
# (4 inches per subplot) instead of a fixed 28 rows, so no empty rows are
# reserved.
# NOTE: as the text following this cell explains, the call fails with
# 'max must be larger than min in range parameter.' while columns still
# contain nulls.
plt.figure(figsize=(12, len(features) * 4))
gs = gridspec.GridSpec(len(features), 1)
for i, cn in enumerate(features):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.SeriousDlqin2yrs == 1], bins=50,color='red')
    sns.distplot(df[cn][df.SeriousDlqin2yrs == 0], bins=50,color='blue')
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()

出現 'max must be larger than min in range parameter.' 是因為有的列存在空值。

df.isnull().sum()

MonthlyIncome為空的記錄較多，為了保持數據的完整，沒有刪掉，用平均值填充。

# Fill missing MonthlyIncome values with the column mean. The result is
# assigned back to the column: calling fillna(inplace=True) on a selected
# column operates on an intermediate object and is not guaranteed to update df.
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())

# First attempt: fill NumberOfDependents with df['NumberOfDependents'].mode().
# NOTE: this attempt is kept on purpose; the null count printed afterwards
# shows the nulls are still present, because mode() returns a Series rather
# than a single value.
df['NumberOfDependents'].fillna(df['NumberOfDependents'].mode(), inplace=True)

df.isnull().sum() # the nulls are still there -- why?

SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                        0
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                              0
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      3924
dtype: int64

type(df['NumberOfDependents'].mode()) 
    pandas.core.series.Series
 #mode()返回的是一個Series，而不是單一的值,要取其中的元素來填充

# Fill missing NumberOfDependents with the most frequent value. mode() returns
# a Series, so its first element is used as the scalar fill value. The result
# is assigned back to the column because fillna(inplace=True) on a selected
# column is not guaranteed to update df.
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].mode()[0])

# Overlay the RevolvingUtilizationOfUnsecuredLines histograms of the two
# classes (red: SeriousDlqin2yrs == 1, blue: SeriousDlqin2yrs == 0), 20 bins each.
# NOTE(review): the second mask term is the raw column with no comparison --
# presumably a leftover from a range filter; confirm whether a threshold was intended.
sns.distplot(df['RevolvingUtilizationOfUnsecuredLines'][(df.SeriousDlqin2yrs == 1) & (df.RevolvingUtilizationOfUnsecuredLines)], bins=20,color='red')
sns.distplot(df['RevolvingUtilizationOfUnsecuredLines'][(df.SeriousDlqin2yrs == 0) & (df.RevolvingUtilizationOfUnsecuredLines)], bins=20,color='blue')

<matplotlib.axes._subplots.AxesSubplot at 0x10229a58>

output_15_1.png

圖形縮成小小的一條，因為取值范圍是0到50000多，x軸的范圍太大了，而大部分值都在0附近，所以無法清晰顯示。

df['RevolvingUtilizationOfUnsecuredLines'].describe() # summary statistics of this attribute

count   149999.00000
mean         6.04847
std        249.75620
min          0.00000
25%          0.02987
50%          0.15418
75%          0.55904
max      50708.00000
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64

df[['RevolvingUtilizationOfUnsecuredLines']].boxplot(sym='r*') # box plot to inspect the outliers

<matplotlib.axes._subplots.AxesSubplot at 0x100f1828>

output_18_1.png

# Box plot again, this time keeping the returned dict so the outliers can be read back.
p=df[['RevolvingUtilizationOfUnsecuredLines']].boxplot(return_type='dict')
# with return_type='dict' the outliers of the data set are available under the 'fliers' key
outliers=p['fliers'][0].get_xydata() # get_xydata() returns the outliers as a 2-D array
outliers.shape

(763, 2)

outliers[:,1:].min() # smallest outlier value

1.3534146969999998

# RevolvingUtilizationOfUnsecuredLines restricted to values below 1.4:
# defaulters in red, non-defaulters in blue, 20 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['RevolvingUtilizationOfUnsecuredLines'] < 1.4),
        'RevolvingUtilizationOfUnsecuredLines',
    ],
    bins=20,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['RevolvingUtilizationOfUnsecuredLines'] < 1.4),
        'RevolvingUtilizationOfUnsecuredLines',
    ],
    bins=20,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0x1027f5f8>

output_21_1.png

未違約人群，RevolvingUtilizationOfUnsecuredLines屬性的最高頻率在0附近；違約人群，RevolvingUtilizationOfUnsecuredLines的最高頻率在1附近。

# For each feature, count the outliers flagged by its box plot and record the
# smallest outlier value, stored as col_min[feature] = [count, smallest].
col_min={}
for feature in features:
    # return_type='dict' gives access to the plot's 'fliers' (outlier markers).
    p=df[[feature]].boxplot(return_type='dict')
    outliers=p['fliers'][0].get_xydata()
    # A feature without outliers yields an empty array, and min() of an empty
    # array raises ValueError, so NaN is recorded for it instead.
    pmin=outliers[:,1:].min() if outliers.size else np.nan
    col_min[feature]=[outliers.shape[0],pmin]

output_23_0.png

col_min

{'DebtRatio': [31311, 1.9080459769999998],
 'MonthlyIncome': [9149, 12646.0],
 'NumberOfDependents': [13336, 3.0],
 'NumberOfOpenCreditLinesAndLoans': [3980, 21.0],
 'NumberOfTime30-59DaysPastDueNotWorse': [23981, 1.0],
 'NumberOfTime60-89DaysPastDueNotWorse': [7604, 1.0],
 'NumberOfTimes90DaysLate': [8338, 1.0],
 'NumberRealEstateLoansOrLines': [793, 6.0],
 'RevolvingUtilizationOfUnsecuredLines': [763, 1.3534146969999998],
 'age': [45, 97.0]}

# Choose each plotting range from the outliers and the value distribution of
# the attribute. The range and the bin count differ per attribute, so the
# plots are drawn one by one rather than in a single loop.
# DebtRatio below 5: defaulters in red, non-defaulters in blue, 20 bins each.
sns.distplot(
    df.loc[(df['SeriousDlqin2yrs'] == 1) & (df['DebtRatio'] < 5), 'DebtRatio'],
    bins=20,
    color='red',
)
sns.distplot(
    df.loc[(df['SeriousDlqin2yrs'] == 0) & (df['DebtRatio'] < 5), 'DebtRatio'],
    bins=20,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0xa7cfef0>

output_25_1.png

兩組人群在DebtRatio屬性上的分布相似，最高頻率在0附近，后逐漸降低

# NumberOfOpenCreditLinesAndLoans below 30: defaulters in red,
# non-defaulters in blue, 30 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['NumberOfOpenCreditLinesAndLoans'] < 30),
        'NumberOfOpenCreditLinesAndLoans',
    ],
    bins=30,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['NumberOfOpenCreditLinesAndLoans'] < 30),
        'NumberOfOpenCreditLinesAndLoans',
    ],
    bins=30,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0x11eca3c8>

output_27_1.png

在NumberOfOpenCreditLinesAndLoans屬性上，兩組人群分布相似，最高頻率都是5-8之間

# Age over its full range: defaulters in red, non-defaulters in blue,
# 50 bins each.
sns.distplot(df.loc[df['SeriousDlqin2yrs'] == 1, 'age'], bins=50, color='red')
sns.distplot(df.loc[df['SeriousDlqin2yrs'] == 0, 'age'], bins=50, color='blue')

<matplotlib.axes._subplots.AxesSubplot at 0x137bbfd0>

output_29_1.png

# NumberOfTime30-59DaysPastDueNotWorse below 10: defaulters in red,
# non-defaulters in blue, 10 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['NumberOfTime30-59DaysPastDueNotWorse'] < 10),
        'NumberOfTime30-59DaysPastDueNotWorse',
    ],
    bins=10,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['NumberOfTime30-59DaysPastDueNotWorse'] < 10),
        'NumberOfTime30-59DaysPastDueNotWorse',
    ],
    bins=10,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0xbb45908>

output_30_1.png

# NumberOfTime60-89DaysPastDueNotWorse below 10: defaulters in red,
# non-defaulters in blue, 10 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['NumberOfTime60-89DaysPastDueNotWorse'] < 10),
        'NumberOfTime60-89DaysPastDueNotWorse',
    ],
    bins=10,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['NumberOfTime60-89DaysPastDueNotWorse'] < 10),
        'NumberOfTime60-89DaysPastDueNotWorse',
    ],
    bins=10,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0xbf33940>

output_31_1.png

# NumberOfTimes90DaysLate below 10: defaulters in red, non-defaulters in
# blue, 10 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['NumberOfTimes90DaysLate'] < 10),
        'NumberOfTimes90DaysLate',
    ],
    bins=10,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['NumberOfTimes90DaysLate'] < 10),
        'NumberOfTimes90DaysLate',
    ],
    bins=10,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0xa5992b0>

output_32_1.png

# NumberRealEstateLoansOrLines below 10: defaulters in red, non-defaulters in
# blue, 10 bins each.
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 1) & (df['NumberRealEstateLoansOrLines'] < 10),
        'NumberRealEstateLoansOrLines',
    ],
    bins=10,
    color='red',
)
sns.distplot(
    df.loc[
        (df['SeriousDlqin2yrs'] == 0) & (df['NumberRealEstateLoansOrLines'] < 10),
        'NumberRealEstateLoansOrLines',
    ],
    bins=10,
    color='blue',
)

<matplotlib.axes._subplots.AxesSubplot at 0xa7aa2e8>

output_33_1.png

其余幾個屬性上，兩類人群的分布都是相近的，不再贅述。《債務違約預測之一：數據探索》和本文采用的是不同的分析方法，
前者按各個屬性對借貸者分組，查看不同類別在每一組的分布。本文是先進行分類，再查看兩個類別中各個屬性的分布。
第一種方法使用數字，能看出更多信息。

參考
python用箱型圖進行異常值檢測

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

債務違約預測之二：圖形探索

債務違約預測之二：圖形探索

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

債務違約預測之二：圖形探索

相關閱讀更多精彩內容

友情鏈接更多精彩內容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av