導入庫
# Notebook setup: imports, plotting theme, and global matplotlib settings.
import warnings

import matplotlib
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Apply the style sheet and the seaborn theme FIRST. The seaborn theme writes
# its own "font.sans-serif" list into rcParams, so any font chosen before this
# call is silently replaced.
plt.style.use("ggplot")
sns.set(
    context="notebook",
    style="darkgrid",
    palette="colorblind",
    font="sans-serif",
    font_scale=1,
    rc=None,
)

# Global figure defaults.
matplotlib.rcParams["figure.figsize"] = [8, 8]
matplotlib.rcParams.update({"font.size": 15})
matplotlib.rcParams["font.family"] = "sans-serif"

# Font overrides go last so they survive the theme setup above.
plt.rcParams["font.sans-serif"] = ["SimHei"]  # set the font (SimHei, for Chinese text)
plt.rcParams["axes.unicode_minus"] = False  # render the minus sign correctly

# Suppress warnings in the notebook output.
warnings.filterwarnings("ignore")
數據基本信息
# Load the training data from train.csv (path relative to the working
# directory) and preview the first rows.
df1 = pd.read_csv("train.csv")
df1.head()

image.png
# Dimensions of the frame as a (rows, columns) tuple.
df1.shape

image.png
# Keep the column labels of df1 in `columns` and display them.
columns = df1.columns
columns

image.png
# Data type of every column.
df1.dtypes

image.png
# Share of each column dtype in the dataset, drawn as a pie chart.
dtype_counts = df1.dtypes.value_counts()
# Build one explode offset per wedge. A fixed three-element list makes
# matplotlib raise ValueError whenever the frame does not contain exactly
# three distinct dtypes.
dtype_counts.plot.pie(
    explode=[0.1] * len(dtype_counts),
    autopct="%1.2f%%",
    shadow=True,
)
plt.title("type of our data")
plt.show()

image.png
# Basic column information: print the frame's summary via DataFrame.info().
df1.info()

image.png
# Missing values: number of null entries in each column.
df1.isnull().sum()

image.png
統計與可視化分析
# Gender analysis: tabulate how many rows carry each Gender value and turn
# the result into a regular DataFrame for display.
df2 = df1["Gender"].value_counts().reset_index()
df2

image.png
# Number of records for each gender, drawn as a bar chart.
colors = ["red", "blue"]
# Name the column with x=...: recent seaborn releases accept only `data`
# positionally, so passing "Gender" as the first positional argument
# collides with the data= keyword and raises TypeError.
sns.countplot(x="Gender", data=df1, palette=colors)
plt.title("Gender Count")
plt.show()

image.png
# Percentage share of each gender, drawn as a pie chart.
size = df1["Gender"].value_counts()
# value_counts() orders categories by descending frequency, so the labels
# must come from its index; a hard-coded ["Male", "Female"] list mislabels
# the wedges whenever "Female" is the more frequent value.
labels = size.index.tolist()
# NOTE(review): colors and explode assume exactly two gender categories.
colors = ["#C4061D", "green"]
explode = [0, 0.1]
plt.rcParams["figure.figsize"] = (10, 10)
plt.pie(
    size,
    colors=colors,
    labels=labels,
    shadow=True,
    explode=explode,
    autopct="%.2f%%",
)
plt.title("Gender Percent", fontsize=20)
plt.axis("off")
plt.legend()
plt.show()

image.png
來源:尤而小屋