# 數據清洗
# 將csv、excel等行式數據轉換成二維excel數據
import pandas as pd
# Load the CSV, replacing its header row with the column names A-F, and dump
# column "F" to a UTF-8 text file with one record per line.
data = pd.read_csv("lipstick.csv", header=0, names=["A", "B", "C", "D", "E", "F"])
print(data["F"])

# The context manager guarantees the output file is flushed and closed even if
# a write fails (the original handle was never closed).
with open('lipstick1.txt', 'w', encoding='utf-8') as result:
    for i in data["F"]:
        # Embedded newlines are replaced with a double quote so that each
        # record stays on a single output line.
        result.write(str(i).replace('\n', '"'))
        result.write('\n')
# 數據處理
import pandas as pd
df=pd.read_excel("lip.xlsx")
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts.charts import Bar, Pie,Map
from pyecharts import options as opts
import numpy as np
# Bar chart of the 30 shops with the largest summed `number_pay`.
# The font settings let matplotlib render CJK labels and the minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

buyers_per_shop = df.groupby('shop')['number_pay'].sum()
top_30_shops = buyers_per_shop.sort_values(ascending=False).iloc[:30]
bar_colors = ['r', 'g', 'b', 'g', 'b', 'tan', 'c']
s = top_30_shops.plot(kind='bar', color=bar_colors)

# NOTE(review): the y-label and file-name literals below contain parenthesised
# romanisation fragments (e.g. "(gòu)") that look like text-conversion
# residue - confirm the intended wording before changing them.
plt.xlabel('店鋪')
plt.ylabel('購(gòu)買人數(shù)')
plt.savefig('購(gòu)買人數(shù)最多的前30店鋪.png', bbox_inches='tight')
plt.show()
# Rose (area) pie chart of the 20 shops with the largest summed `amount`.
amount_per_shop = df.groupby('shop')['amount'].sum()
y1 = amount_per_shop.sort_values(ascending=False).iloc[:20]
y_amount = pd.DataFrame(y1)
print(y1.index, y1.values)

color_series = [
    '#FAE927', '#E9E416', '#C9DA36', '#9ECB3C', '#6DBC49',
    '#37B44E', '#3DBA78', '#14ADCF', '#209AC9', '#1E91CA',
    '#2C6BA0', '#2B55A1', '#2D3D8E', '#44388E', '#6A368B',
    '#7D3990', '#A63F98', '#C31C88', '#D52178', '#D5225B',
]

# The plotted value is the rounded square root of each shop's amount, not the
# raw amount.
rose_values = np.round(np.sqrt(y1.values), 0)
rose_pairs = [[shop, value] for shop, value in zip(y1.index, rose_values)]

pie1 = Pie(init_opts=opts.InitOpts(width='1350px', height='750px'))
pie1.set_colors(color_series)
pie1.add(
    "",
    rose_pairs,
    radius=["20%", "100%"],
    center=["30%", "65%"],
    rosetype="area",
)

# Global options: title, hidden legend, default toolbox.
pie1.set_global_opts(
    title_opts=opts.TitleOpts(title='玫瑰圖示例'),
    legend_opts=opts.LegendOpts(is_show=False),
    toolbox_opts=opts.ToolboxOpts(),
)

# Series options: label styling for every slice.
slice_labels = opts.LabelOpts(
    is_show=True,
    position="inside",
    font_size=12,
    formatter=":{c}",
    font_style="italic",
    font_weight="bold",
    font_family="Microsoft YaHei",
)
pie1.set_series_opts(label_opts=slice_labels)
pie1.render('銷售額前20店鋪.html')

image.png
# ---------------- Regional distribution of shops ----------------
# Number of shops per `location1`, most frequent first.
y2 = df.groupby('location1')['shop'].count().sort_values(ascending=False)
y_amount = pd.DataFrame(y2)

# Hard-coded (region, shop count) pairs fed to the map; they are not derived
# from `y2` at runtime.
# NOTE(review): a few region names carry parenthesised romanisation fragments
# that look like text-conversion residue - confirm they match the names the
# 'china' map expects.
data = [
    ('廣東', 991), ('上海', 925), ('浙江', 633), ('北京', 362),
    ('江蘇', 353), ('山東', 217), ('遼寧', 103), ('香港', 103),
    ('四川', 95), ('福建', 85), ('安徽', 65), ('湖北', 63),
    ('湖南', 59), ('河北', 57), ('黑龍江', 54), ('天津', 45),
    ('河南', 36), ('江西', 19), ('重慶', 16), ('吉林', 15),
    ('陜西', 14), ('山西', 12), ('廣西', 4), ('海南', 4),
    ('云南', 3), ('貴州', 1), ('臺(tái)灣', 1), ('內(nèi)蒙古', 1),
    ('甘肅', 1),
]

# Piecewise colour buckets for the visual map, darkest for the largest counts.
count_buckets = [
    {"max": 999, "min": 500, "label": "500-999", "color": "#B40404"},
    {"max": 499, "min": 100, "label": "100-499", "color": "#DF0101"},
    {"max": 99, "min": 60, "label": "60-99", "color": "#F78181"},
    {"max": 59, "min": 10, "label": "10-59", "color": "#F5A9A9"},
    {"max": 9, "min": 0, "label": "1-9", "color": "#FFFFCC"},
]

china_map = Map(init_opts=opts.InitOpts(theme='dark'))
china_map.add("", data, 'china', is_map_symbol_show=False, is_roam=False)
china_map.set_series_opts(
    label_opts=opts.LabelOpts(is_show=True, color='#ffffff'),
)
china_map.set_global_opts(
    title_opts=opts.TitleOpts(title="店鋪所在地區(qū)分布地圖"),
    legend_opts=opts.LegendOpts(is_show=False),
    visualmap_opts=opts.VisualMapOpts(
        max_=2000,
        is_piecewise=True,
        pieces=count_buckets,
    ),
)
china_map.render("店鋪所在地區(qū)分布.html")

image.png
# ---------------- Lipstick unit price per shop ----------------
# Mean `price` per shop: printed in descending order, then plotted as a
# normalised histogram with a dashed red density curve.
shop_mean_price = df.groupby('shop')['price'].mean()
y4 = shop_mean_price.sort_values(ascending=False)
print(y4)

histogram_style = {'color': 'g'}
density_style = {'linestyle': '--', 'color': 'red'}
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases -
# confirm the installed version still provides it.
sns.distplot(
    shop_mean_price,
    color="g",
    norm_hist=True,
    hist_kws=histogram_style,
    kde_kws=density_style,
)
plt.xlim(0, 1000)
plt.title('口紅單價(jià)')
plt.show()
import jieba
from wordcloud import WordCloud
from PIL import Image
# Build a lip-shaped word cloud from the text in lip_word.txt.
filename = "lip_word.txt"
with open(filename, encoding='UTF-8') as f:
    mytext = f.read()

# Segment the text with jieba and join the tokens with spaces so WordCloud
# can split on whitespace.
mytext = " ".join(jieba.cut(mytext))

# str.replace returns a new string; the original code discarded the result,
# so the separators were never removed. Assign the result back.
mytext = mytext.replace("/", '').replace("|", '')

# The image is used as the mask that shapes the cloud.
alice_mask = np.array(Image.open("lips.png"))
wordcloud = WordCloud(
    font_path="simsun.ttf",
    background_color="white",
    collocations=False,  # collocations disabled (original note: avoid duplicate counting)
    width=800,
    height=600,
    mask=alice_mask,
    max_words=500,
).generate(mytext)

# First figure: the generated word cloud.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

# Second figure: the mask image itself, for comparison.
plt.figure()
plt.imshow(alice_mask, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis("off")
plt.show()

image.png
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
# Cluster the data-mining sheet into two groups with KMeans and visualise the
# clusters on a t-SNE embedding.
df = pd.read_excel("lip_sum_dataming.xlsx", index_col='index')

# Select the columns of interest.
df1 = df[['price', 'number_pay', 'adress', 'if_company', 'if_off_shop']]
# Drop rows whose `number_pay` is missing. In the original this statement had
# been swallowed into the trailing comment of the previous line, so `df1nona`
# was undefined and the next line raised NameError.
df1nona = df1[df1['number_pay'].notna()]
print(np.isnan(df1nona).any())  # report which columns still contain NaN

# Min-max scaling of the full frame; `df_scale` is not used further below.
scale = MinMaxScaler().fit(df)
df_scale = scale.transform(df)

# Z-score standardisation; this is the input actually used for clustering.
# NOTE(review): clustering runs on every column of `df`, not on `df1nona` -
# confirm that is intended and that `df` contains no NaN.
data_zs = 1.0 * (df - df.mean()) / df.std()

# Fit a two-cluster KMeans model.
kmeans = KMeans(n_clusters=2).fit(data_zs)
inertia = kmeans.inertia_  # sum of squared distances to the nearest centre
ssa = kmeans.inertia_  # same value, kept under the original second name
y_kmeans2 = kmeans.predict(data_zs)

from sklearn.manifold import TSNE

# Reduce the standardised data with t-SNE (default settings) and keep the
# original index so rows can be matched to their cluster labels.
tsne_model = TSNE()
embedding = tsne_model.fit_transform(data_zs)
tsne = pd.DataFrame(embedding, index=data_zs.index)

r1 = pd.Series(kmeans.labels_).value_counts()  # number of samples per cluster
r2 = pd.DataFrame(kmeans.cluster_centers_)  # cluster centres
# Centres side by side with the per-cluster counts (axis=1 concatenates
# columns); note that `r` is rebound on the next statement.
r = pd.concat([r2, r1], axis=1)
# Per-sample table: standardised features plus the assigned cluster label.
r = pd.concat([data_zs, pd.Series(kmeans.labels_, index=data_zs.index)], axis=1)
r.columns = list(df.columns) + [u'聚類類別']  # name the appended label column
# NOTE(review): writing the legacy .xls format needs an engine that newer
# pandas releases no longer ship - confirm this still works in the target
# environment.
r.to_excel('data_type_2.xls')

# Plot the embedding, one marker style per cluster.
d = tsne[r[u'聚類類別'] == 0]
plt.plot(d[0], d[1], 'r.')
d = tsne[r[u'聚類類別'] == 1]
plt.plot(d[0], d[1], 'go')
plt.show()

image.png