Customers Profile

Background

  • The company is a domestic e-commerce of daily necessities, electronic products, fast food.
  • Through user consumption records in the past 30 days, analyze user consumption behavior, establish RFM model for user stratification, discover high-value users to achieve delicacy management.

Mind Map

User profile-web-1642391282193.jpeg

1.Data Overview

1.1 Import modules and data
#Import modules
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
# The specified font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  
matplotlib.rcParams['font.family']='sans-serif'  
#Fixed the issue where the minus sign '-' appears as a square  
matplotlib.rcParams['axes.unicode_minus'] = False

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine

import gc
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

# Import data set
df = pd.read_excel("order_data.xlsx")
# Import the user details data set
df_user = pd.read_excel("user_data.xlsx")
1.2 Orders data
image.png

The number of unique values for an order features

df_count=pd.DataFrame()

for i in ['item_id','behavior_type','item_category','time']:
    df_count.loc[i,0]=df[i].nunique()
from matplotlib.font_manager import FontProperties
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())

plt.figure(figsize=(8,6),dpi=100)
g=sns.barplot(data=df_count,x=df_count.index,y=0)
plt.title('Orders Data',fontsize=15)
x=df_count.index
y=df_count[0]
plt.yticks(np.arange(0,100001,20000),fontsize=10)
plt.ylabel('How many Unique Value',fontsize=13)
for i,row in enumerate(df_count[0]):
    g.text(i,row+3000,str(row)[:-2],ha='center',va='center',fontsize=10)
image.png
1.3 User data
image.png

The number of unique values for user featuures

df_user_count=pd.DataFrame()

for i in df_user.drop('user_id',axis=1):
    df_user_count.loc[i,0]=df_user[i].nunique()


plt.figure(figsize=(8,6),dpi=100)
g=sns.barplot(data=df_user_count,x=df_user_count.index,y=0)
plt.title('User Data',fontsize=15)
x=df_user_count.index
y=df_user_count[0]
plt.yticks(np.arange(0,41,5),fontsize=10)
plt.ylabel('How many Unique Value',fontsize=13)
for i,row in enumerate(df_user_count[0]):
    g.text(i,row+1,str(row)[:-2],ha='center',va='center',fontsize=10)
image.png
df=labels
user_sex_df = df.groupby("gender")["user_id"].count()
## Pd. cut is used for data discretization and segmentation. Note that the group labels and the number of groups must be consistent
bins = [0,20,25,30,35,40,45,50,55,1000]
labels = ["(0-20]","(20-25]","(25-30]","(30-35]","(35-40]","(40-45]","(45-50]","(50-55]","55歲以上"]
df["age_group"] = pd.cut(df["age"],bins=bins,labels=labels)
# Draw a bar chart,rot=0 indicates rotation Angle, 0-360
age_group_df = df.groupby("age_group").user_id.count().reset_index()
# Group by province
pro_user_df = df.groupby("province").user_id.count().reset_index().sort_values(by="user_id",ascending=True)
pro_user_df=pro_user_df.sort_values('user_id',ascending=False)
# Group by city
city_user_df = df.groupby("city").user_id.count().reset_index().sort_values(by="user_id",ascending=True)
# Group by married
marry_df = df.groupby("marriage").user_id.count()
# Group by education
eduction_df = df.groupby("eduction").user_id.count().reset_index()
eduction_df=eduction_df.sort_values('user_id',ascending=False)
# Group by job
job_df = df.groupby("job").user_id.count().reset_index()
job_df=job_df.sort_values('user_id',ascending=False)

myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(style='darkgrid',font=myfont.get_name())

plt.figure(figsize=(50,70),dpi=100)

plt.subplot(711)
sns.barplot(data=job_df,x='job',y='user_id',palette='Blues_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,81,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Occupation Distribution',fontsize=60)
for x,y in enumerate(job_df['user_id']):
    plt.text(x,y+5,y,fontsize=50,ha='center')

plt.subplot(712)
sns.barplot(data=eduction_df,x='eduction',y='user_id')
plt.xticks(fontsize=50)
plt.yticks(range(0,201,50),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Education Distribution',fontsize=60)
for x,y in enumerate(eduction_df['user_id']):
    plt.text(x,y+5,y,fontsize=50,ha='center')
    
plt.subplot(713)
sns.barplot(data=marry_df,x='marriage',y='user_id')
plt.xticks(fontsize=50)
plt.yticks(range(0,181,90),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Marital Distribution',fontsize=60)
for x,y in enumerate(marry_df['user_id']):
    plt.text(x,y+5,y,fontsize=50,ha='center')
    
plt.subplot(714)
sns.barplot(data=city_user_df,x='city',y='user_id',palette='Greens_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,61,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Distribution of Customers in cities',fontsize=60)
for x,y in enumerate(city_user_df['user_id']):
    plt.text(x,y+2,y,fontsize=50,ha='center')
    
plt.subplot(715)
sns.barplot(data=pro_user_df,x='province',y='user_id',palette='Oranges_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,61,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Distribution of Customers in Provinces',fontsize=60)
for x,y in enumerate(pro_user_df['user_id']):
    plt.text(x,y+2,y,fontsize=50,ha='center')
    
plt.subplot(716)
sns.barplot(data=user_sex_df1,x=user_sex_df1.gender,y=user_sex_df1.user_id)
plt.xticks(fontsize=50)
plt.yticks(range(0,121,60),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Customers Gender Distribution',fontsize=60)
for x,y in enumerate(user_sex_df1.user_id):
    plt.text(x,y+2,y,fontsize=50,ha='center')
    
plt.subplot(717)
sns.barplot(data=age_group_df,x='age_group',y='user_id',palette='Reds_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,71,20),fontsize=50)
plt.xlabel('Age',fontsize=55)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Customers Age Distribution',fontsize=60)
for x,y in enumerate(age_group_df['user_id']):
    plt.text(x,y+2,y,fontsize=50,ha='center')
    
plt.tight_layout()
plt.subplots_adjust(wspace=0.5,hspace=0.5)
plt.savefig('711.png')
image.png

2.Date and time processing

#Split the time feature into date and period
df['date'] = df['time'].str[0:10]
df['date'] = pd.to_datetime(df['date'],format='%Y-%m-%d')
df['time'] = df['time'].str[11:]
df['time'] = df['time'].astype(int)
# Divide the time period into 'early morning', 'morning', 'noon', 'afternoon' and 'evening'
df['hour'] = pd.cut(df['time'],bins=[-1,5,10,13,18,24],labels=['凌晨','上午','中午','下午','晚上'])

myfont=FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf',size=8)
sns.set(font=myfont.get_name())

hour_order=['Morning','Noon','Afternoon','Evening','Early morning']
ax1=plt.figure(figsize=(8,6),dpi=100)
plt.title('Active Periods',fontsize=15)
sns.countplot(data=df,x='hour',order=hour_order,hue='behavior_type')
plt.legend(['Browse','Collect','Add to shopping cart','Purchase'],fontsize=10)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)

ax2=plt.figure(figsize=(8,6),dpi=100)
ax2=sns.countplot(df['date'],order=sorted(df['date'].unique()))
plt.xticks(rotation=90,fontsize=15)
ax2.set_title('Active Date',fontsize=15,ha='center')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Year-Month-Day',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
  • In a day, users are most active in the evening, followed by afternoon, and less frequent in the early morning, which belongs to a reasonable category.

  • December 12 was the most active date in the past 30 days, mainly due to the Double 12 activity.


    image.png
image.png
action_hour=pd.DataFrame(df['time'].value_counts()).sort_index(ascending=True)
plt.figure(figsize=(10,3),dpi=100)
plt.title('Active Time',fontsize=15)
plt.plot(action_hour.index,action_hour['time'],'-o')
plt.xticks(range(0,24,1),fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('(Time)',fontsize=13)
plt.ylabel('Number of People',fontsize=13)

Throughout the day, users are active from 18:00 to 22:00 in the evening.


image.png

3.Create a user profile

3.1 User active time
3.1.1 Active browsing period

processing:

  • a.Extract the browsing data of the user behavior_type=1.
  • b.Then group count according to user ID + time period, and calculate the maximum value.
  • c.Get the user ID + the most active time period. If there are multiple most active time ranges, splice with commas.

The type of the behavior_type

  • Browse: Behavior_type =1
  • Collection: Behavior_type =2
  • Add purchase: Behavior_type =3
  • Purchase: Behavior_type =4
#Group users and time periods and count the number of browse
time_browse = df[df['behavior_type']==1].groupby(['user_id','hour']).item_id.count().reset_index()
time_browse.rename(columns={'item_id':'hour_counts'},inplace=True)
#which periods dose users usually browse
time_browse_max = time_browse.groupby('user_id').hour_counts.max().reset_index()
time_browse_max.rename(columns={'hour_counts':'read_counts_max'},inplace=True)
time_browse = pd.merge(time_browse,time_browse_max,how='left',on='user_id')
#Select the time period that each user browses the most. If there are the same time periods, connect them with commas
time_browse_hour = time_browse.loc[time_browse['hour_counts']==time_browse['read_counts_max'],'hour'].groupby(time_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
count_browse_hour=time_browse_hour.groupby('hour').count()
count_browse_hour2=count_browse_hour.reset_index()
count_browse_hour2['hour']=count_browse_hour2['hour'].astype('category')
count_browse_hour2['hour'].cat.reorder_categories(['Morning','Noon','Afternoon','Evening','Early morning','Afternoon,Evening'], inplace=True)
count_browse_hour2.sort_values('hour',inplace=True)
#draw a picture
hour_order=['Morning','Noon','Afternoon','Evening','Early morning','Afternoon,Evening']
ax1=plt.figure(figsize=(10,6),dpi=100)
plt.title('Browse Time Distribution',fontsize=15)
g=sns.barplot(data=count_browse_hour2,x='hour',y='user_id',order=hour_order)
plt.xticks(fontsize=10)
plt.yticks(range(0,200,50),fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
for x,y in enumerate(count_browse_hour2['user_id']):
    g.text(x,y+5,str(y),ha='center',fontsize=10)
  • A total of 121 people like to browse and buy goods in the evening, while only a few people like to browse e-commerce platforms in the early morning and noon.


    image.png
3.1.2 Active purchasing period

processing:

  • a. Extract the purchase data of the user behavior_type=4
  • b. Groups by user ID + time and obtain the maximum value in the group
  • c. Obtain the user ID + the most active period. If there are more than 2 the most active period, combine the user ID with commas.

time_buy = df[df['behavior_type']==4].groupby(['user_id','hour']).item_id.count().reset_index()
time_buy.rename(columns={'item_id':'hour_counts'},inplace=True)
time_buy_max = time_buy.groupby('user_id').hour_counts.max().reset_index()
time_buy_max.rename(columns={'hour_counts':'buy_counts_max'},inplace=True)
time_buy = pd.merge(time_buy,time_buy_max,how='left',on='user_id')
time_buy_hour = time_buy.loc[time_buy['hour_counts']==time_buy['buy_counts_max'],'hour'].groupby(time_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()

#Adding user usually purchase periods to the user tag table
labels = pd.merge(labels,time_buy_hour,how='left',on='user_id')
labels.rename(columns={'hour':'time_buy'},inplace=True)
count_time_buy=labels.groupby('time_buy').agg({'time_buy':'count'})

#draw
ax1=plt.figure(figsize=(10,3),dpi=100)
plt.title('Purchase Periods Distribution',fontsize=15)
g=sns.barplot(data=count_time_buy,x=count_time_buy.index,y='time_buy')
plt.xticks(fontsize=10,rotation=90)
plt.yticks(range(0,81,20),fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
for x,y in enumerate(count_time_buy['time_buy']):
    g.text(x,y+5,str(y),ha='center',fontsize=10)
  • People prefer to shop in the afternoon or evening, 113 people in total, accounting for 50.82%.


    image.png
3.2 User behavior about products
3.2.1 Most viewed categories

processing:

  • a. Extract the purchase data of the user behavior_type=1
  • b. Groups by user ID + products' categories and obtain the maximum value in the group
  • c. Obtain the user ID + the most viewed products' categories. If there are more than 2 most viewed products' categories, combine the user ID with commas.
df_cate_most_browse = df_browse.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_browse.rename(columns={'item_id':'item_category_counts'},inplace=True)
#Collects statistics on the categories most viewed by each user
df_cate_most_browse_max = df_cate_most_browse.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_browse_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_browse = pd.merge(df_cate_most_browse,df_cate_most_browse_max,how='left',on='user_id')
df_cate_most_browse['item_category'] = df_cate_most_browse['item_category'].astype(str)
#Select the category that each user browses the most. If there is a category that is tied the most, connect it with a comma
df_cate_browse = df_cate_most_browse.loc[df_cate_most_browse['item_category_counts']==df_cate_most_browse['item_category_counts_max'],'item_category'].groupby(df_cate_most_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
#Add the most viewed categories to the user tag table
labels = pd.merge(labels,df_cate_browse,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_browse'},inplace=True)
3.2.2 Most collected categories
  • a. Extract the purchase data of the user behavior_type=2
  • b. Groups by user ID + products' categories and obtain the maximum value in the group
  • c. Obtain the user ID + the Most collected categories. If there are more than 2 most collected categories, combine the user ID with commas.
df_cate_most_collect = df_collect.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_collect.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_collect_max = df_cate_most_collect.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_collect_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_collect = pd.merge(df_cate_most_collect,df_cate_most_collect_max,how='left',on='user_id')
df_cate_most_collect['item_category'] = df_cate_most_collect['item_category'].astype(str)
df_cate_collect = df_cate_most_collect.loc[df_cate_most_collect['item_category_counts']==df_cate_most_collect['item_category_counts_max'],'item_category'].groupby(df_cate_most_collect['user_id']).aggregate(lambda x:','.join(x)).reset_index()

labels = pd.merge(labels,df_cate_collect,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_collect'},inplace=True)
3.2.3 Most added to shopping cart categories
  • a. Extract the purchase data of the user behavior_type=3
  • b. Groups by user ID + products' categories and obtain the maximum value in the group
  • c. Obtain the user ID + the most added to shopping cart categories. If there are more than 2 most added to shopping cart categories, combine the user ID with commas.
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_cart_max = df_cate_most_cart.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_cart_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_cart = pd.merge(df_cate_most_cart,df_cate_most_cart_max,how='left',on='user_id')
df_cate_most_cart['item_category'] = df_cate_most_cart['item_category'].astype(str)
df_cate_cart = df_cate_most_cart.loc[df_cate_most_cart['item_category_counts']==df_cate_most_cart['item_category_counts_max'],'item_category'].groupby(df_cate_most_cart['user_id']).aggregate(lambda x:','.join(x)).reset_index()

labels = pd.merge(labels,df_cate_cart,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_cart'},inplace=True)
3.2.4 Most purchased categories
  • a. Extract the purchase data of the user behavior_type=4
  • b. Groups by user ID + products' categories and obtain the maximum value in the group
  • c. Obtain the user ID + the most purchased categories. If there are more than 2 most purchased categories, combine the user ID with commas.
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_buy_max = df_cate_most_buy.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_buy_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_buy = pd.merge(df_cate_most_buy,df_cate_most_buy_max,how='left',on='user_id')
df_cate_most_buy['item_category'] = df_cate_most_buy['item_category'].astype(str)
df_cate_buy = df_cate_most_buy.loc[df_cate_most_buy['item_category_counts']==df_cate_most_buy['item_category_counts_max'],'item_category'].groupby(df_cate_most_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()

labels = pd.merge(labels,df_cate_buy,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_buy'},inplace=True)
  • The top 10 categories of products Browsed, collected, added to shopping cart and purchased
# First, obtain the required data
df_browse = df.loc[df['behavior_type']==1,['user_id','item_id','item_category']]
df_collect = df.loc[df['behavior_type']==2,['user_id','item_id','item_category']]
df_cart = df.loc[df['behavior_type']==3,['user_id','item_id','item_category']]
df_buy = df.loc[df['behavior_type']==4,['user_id','item_id','item_category']]

def top10_purchase(df,action):
    df_top10=df.groupby('item_id').count().sort_values('user_id',ascending=False)
    df_top10['item_category']=round(df_top10['user_id']/df_top10['user_id'].sum()*100,3)
    df_top10['item_category']=df_top10['item_category'].apply(lambda x:str(x)+'%')
    df_top10=df_top10.head(10)


    ax=plt.figure(figsize=(10,3),dpi=100)
    plt.title('TOP10 Product '+'User '+action+'(Proportion)',fontsize=15)
    g=sns.barplot(data=df_top10,x=df_top10.index,y='user_id',order=df_top10.index)
    plt.xticks(fontsize=10,rotation=45)
    plt.yticks(range(0,max(df_top10['user_id'].max()//10*13+1,df_top10['user_id'].max()+2),max(df_top10['user_id'].max()//10*10//5,1)),fontsize=10)
    plt.xlabel('Item ID',fontsize=13)
    plt.ylabel(action+' times',fontsize=13)
    for x,y in enumerate(df_top10['user_id']):
        g.text(x,y+y*0.05,str(y),ha='center',fontsize=10)
    for x,y in enumerate(df_top10['item_category']):
        g.text(x,df_top10['user_id'].min()/2,y,ha='center',fontsize=10)

top10_purchase(df_browse,'Browse')
top10_purchase(df_collect,'Collect')
top10_purchase(df_buy,'Purchase')
top10_purchase(df_cart,'Add to shopping cart')
image.png
image.png
image.png
image.png
3.3 User behavior in 30 days
3.3.1 Times of purchase in recent 30 days
  • a. Extract the purchase data of the user behavior_type=4
  • b. Count user id in groups
#group by purchase and count it 
df_counts_30_buy = df[df['behavior_type']==4].groupby('user_id').item_id.count().reset_index()

labels = pd.merge(labels,df_counts_30_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_buy'},inplace=True)
3.3.2 Times of add to shopping cart in recent 30 days
  • a. Extract the purchase data of the user behavior_type=3
  • b. Count user id in groups
#group by add to shopping cart and count it 
df_counts_30_cart = df[df['behavior_type']==3].groupby('user_id').item_id.count().reset_index()

labels = pd.merge(labels,df_counts_30_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_cart'},inplace=True)

labels=labels.iloc[:,~labels.columns.duplicated()]
3.3.3 Active days in recent 30 days
  • a. Extract the purchase data of all users
  • b. Count the number of days on different dates based on user ID groups
#Group users and count active days, including browsing, favorites, add and purchase
counts_30_active = df.groupby('user_id')['date'].nunique()

labels = pd.merge(labels,counts_30_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_30_active'},inplace=True)
3.4 User behavior in 7 days
#The last date in the dataset is December 18, and user behavior after December 11 is counted
df_near_7 = df[df['date']>'2020-12-11']
3.4.1 Times of purchase in recent 7 days
  • a. Extract the purchase data of the user behavior_type=4
  • b. Count user id in groups
df_counts_7_buy = df_near_7[df_near_7['behavior_type']==4].groupby('user_id').item_id.count().reset_index()

labels = pd.merge(labels,df_counts_7_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_buy'},inplace=True)
3.4.2 Times of add to shopping cart in recent 7 days
  • a. Extract the purchase data of the user behavior_type=3
  • b. Count user id in groups
df_counts_7_cart = df_near_7[df_near_7['behavior_type']==3].groupby('user_id').item_id.count().reset_index()

labels = pd.merge(labels,df_counts_7_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_cart'},inplace=True)
3.4.3 Active days in recent 7 days
  • a. Extract the purchase data of all users
  • b. Count the number of days on different dates based on user ID groups
counts_7_active = df_near_7.groupby('user_id')['date'].nunique()
labels = pd.merge(labels,counts_7_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_7_active'},inplace=True)
3.5 Days since last activity
  • 5.1 Days since last browsing
  • 5.2 Days since last add to shopping cart
  • 5.3 Days since last purchase
3.5.1 Days since last browsing
  • a.Extract the browsing data of the user behavior_type=1.
  • b.obtain the days since last browsing
  • c. Add it to the label
days_browse = df[df['behavior_type']==1].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)
labels = pd.merge(labels,days_browse,how='left',on='user_id')
labels.rename(columns={'date':'days_browse'},inplace=True)
3.5.2 Days since last add to shopping cart
  • a.Extract the adding to shopping cart data of the user behavior_type=3.
  • b.obtain the days since last adding to shopping cart
  • c. Add it to the label
days_cart = df[df['behavior_type']==3].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)

labels = pd.merge(labels,days_cart,how='left',on='user_id')
labels.rename(columns={'date':'days_cart'},inplace=True)
3.5.3 Days since last purchase
  • a.Extract the purchase data of the user behavior_type=4.
  • b.obtain the days since last purchase
  • c. Add it to the label
days_buy = df[df['behavior_type']==4].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)

labels = pd.merge(labels,days_buy,how='left',on='user_id')
labels.rename(columns={'date':'days_buy'},inplace=True)
3.6 Days between the last two purchases
  • a.Extract the purchase data of the user behavior_type=4
  • b. Count user id+date in groups
  • c.Use diff function to obtain the days between the latest two purchases
  • d.Add it to the label
df_interval_buy = df[df['behavior_type']==4].groupby(['user_id','date']).item_id.count().reset_index()
df_interval_buy['date']=df_interval_buy['date'].apply(lambda x:datetime.strptime(x,'%Y-%m-%d'))
interval_buy = df_interval_buy.groupby('user_id')['date'].apply(lambda x:x.sort_values().diff(1).dropna().head(1)).reset_index()
interval_buy['date'] = interval_buy['date'].apply(lambda x : x.days)
interval_buy.drop('level_1',axis=1,inplace=True)
interval_buy.rename(columns={'date':'interval_buy'},inplace=True)
labels = pd.merge(labels,interval_buy,how='left',on='user_id')
3.7 Browse but not buy
  • a.Extract browsing data and purchase data.
  • b.Groups by user ID + product id + users' activity and,obtain statistics of each user's browsing and purchase of each product.
  • c.Determine whether each user browsing but not purchasing behavior for each item. If there is, the value of a new field is marked as 1; if there is no, it is marked as 0
  • d.Then group users by user ID and count the number of times that each user browses but does not purchase. If the number is greater than 0, the value is yes; otherwise, the value is no
df_browse_buy = df.loc[(df['behavior_type']==1) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]\
browse_not_buy.fillna(0,inplace=True)
browse_not_buy['browse_not_buy'] = 0
browse_not_buy.loc[(browse_not_buy['browse']>0) & (browse_not_buy['buy']==0),'browse_not_buy'] = 1
browse_not_buy = browse_not_buy.groupby('user_id')['browse_not_buy'].sum().reset_index()
labels = pd.merge(labels,browse_not_buy,how='left',on='user_id')
labels['browse_not_buy'] = labels['browse_not_buy'].apply(lambda x: '是' if x>0 else '否')
3.8 Add to shopping cart but not buy
  • a.Extract add to shopping cart data and purchase data.
  • b.Groups by user ID + product id + users' activity and,obtain statistics of each user's adding to shopping cart and purchase of each product.
  • c.Determine whether each user adding to shopping cart but not.purchasing behavior for each item. If there is, the value of a new field is marked as 1; if there is no, it is marked as 0.
  • d、Then group users by user ID and count the number of times that each user's adding to shopping cart but does not purchase. If the number is greater than 0, the value is yes; otherwise, the value is no.
df_cart_buy = df.loc[(df['behavior_type']==3) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]
cart_not_buy = pd.pivot_table(df_cart_buy,index=['user_id','item_id'],columns=['behavior_type'],values=['time'],aggfunc=['count'])
cart_not_buy.columns = ['cart','buy']
cart_not_buy.fillna(0,inplace=True)
cart_not_buy['cart_not_buy'] = 0
cart_not_buy.loc[(cart_not_buy['cart']>0) & (cart_not_buy['buy']==0),'cart_not_buy'] = 1
cart_not_buy = cart_not_buy.groupby('user_id')['cart_not_buy'].sum().reset_index()
labels = pd.merge(labels,cart_not_buy,how='left',on='user_id')
3.9 User tag
3.9.1 Repurchaser or not
  • a、Extract the purchase data of the user behavior_type=4
  • b、If the purchase is greater than 1, there is a repurchase, otherwise there is no repurchase
buy_again = df[df['behavior_type']==4].groupby('user_id')['item_id'].count().reset_index()
buy_again.rename(columns={'item_id':'buy_again'},inplace=True)
labels = pd.merge(labels,buy_again,how='left',on='user_id')
labels['buy_again'].fillna(-1,inplace=True)
#未購買的用戶標(biāo)記為‘未購買’,有購買未復(fù)購的用戶標(biāo)記為‘否’,有復(fù)購的用戶標(biāo)記為‘是’
labels['buy_again'] = labels['buy_again'].apply(lambda x: '是' if x>1 else  '否' if x==1 else '未購買')
3.9.2 Page view
  • a.Number of active users in the last 30 days
  • b.Access times and access number
  • c.Plot to get a trend chart
  • d.Defined as "high" if access activity exceeds 20, "low" otherwise
user_active_level = labels['counts_30_active'].value_counts().sort_index(ascending=False)

plt.figure(figsize=(10,3),dpi=100)
plt.plot(user_active_level.index,user_active_level.values,'--o')
plt.title('Visiting Days in recent 30 Days and Number of Visitors',fontsize=15)
plt.xticks(range(0,32,5),fontsize=10)
plt.yticks(range(0,21,5),fontsize=10)
plt.xlabel('Visiting Days in recent 30 Days',fontsize=13)
plt.ylabel('Number of Visitors',fontsize=13)
for x,y in enumerate(list(reversed(user_active_level.values))):
    plt.text(x+1,y+1,y,ha='center',fontsize=10)
image.png
  • In general, the number of visitors with more visits is more than that with fewer visits, and the inflection point is about 20 visits. Therefore, the number of visits less than 20 times is defined as low activity, and the number of visits more than or equal to 20 times is defined as high activity. This definition is only from the perspective of the distribution of users, and the definition is active from the perspective of the business at work.
labels['buy_active_level'] = '高'
labels.loc[labels['counts_30_buy']<=19,'buy_active_level'] = '低'
3.9.3 Purchased only one kind of product or not
  • a. Extract the purchase data of the user behavior_type=4
  • b. Group users by USER ID and count the number of categories
  • c. Check whether the count is greater than 1
buy_single = df[df['behavior_type']==4].groupby('user_id').item_category.nunique().reset_index()
buy_single.rename(columns={'item_category':'buy_single'},inplace=True)
labels = pd.merge(labels,buy_single,how='left',on='user_id')
labels['buy_single'].fillna(-1,inplace=True)
labels['buy_single'] = labels['buy_single'].apply(lambda x: '否' if x>1 else  '是' if x==1 else '未購買' )

ax1=plt.figure(figsize=(8,6),dpi=100)
plt.title('Purchase only one time?',fontsize=15)
g=sns.countplot(data=labels,x='buy_single')
plt.xticks(fontsize=13)
plt.yticks(range(0,201,20),fontsize=10)
plt.xlabel(' ')
plt.ylabel('Number of People',fontsize=13,rotation=90)
for x,y in enumerate([180,29,13]):
    g.text(x,y+5,str(y)+'人',ha='center',fontsize=10)
image.png
3.9.4 User label(RFM)
  • a.Obtain the days since the last purchase as label 'days_buy'
  • b.Count the purchase days as label 'buy_days_level'
  • c.Draw the graph trend of buy_DAYs_level and the number of buyers
  • d.Defined as "high" if days_buy exceeds 8, "low" otherwise
    RFM:
    Consumption in recency
    Consumption frequency
    Consumption monetary
last_buy_days = labels['days_buy'].value_counts().sort_index()

from matplotlib.font_manager import FontProperties
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())

plt.figure(figsize=(10,3),dpi=100)
plt.plot(last_buy_days.index,last_buy_days.values,'--o')
plt.title('Days since Last Purchase and Number of Purchasers',fontsize=15)
plt.xticks(range(0,32,5),fontsize=10)
plt.yticks(range(0,41,5),fontsize=10)
plt.xlabel('Days since Last Purchase',fontsize=13)
plt.ylabel('Number of Purchasers',fontsize=13)
for x,y in enumerate(last_buy_days.values):
    plt.text(x+1,y+1,y,ha='center',fontsize=10)
image.png
labels['buy_days_level'] = '高'
labels.loc[labels['days_buy']>8,'buy_days_level'] = '低'

labels['rfm_value'] = labels['buy_active_level'].str.cat(labels['buy_days_level'])

def trans_value(x):
    if x == 'HighHigh':
        return 'Import Valuable Customres'
    elif x == 'LowHigh':
        return 'Import Developable Customres'
    elif x == 'HighLow':
        return 'Import Recall Customres'
    else: 
        return 'About to Lose'
labels['rfm'] = labels['rfm_value'].apply(trans_value)
labels.drop(['buy_days_level','rfm_value'],axis=1,inplace=True)
RFM=labels['rfm'].value_counts()

ax1=plt.figure(figsize=(10,3),dpi=100)
plt.title('RFM Model',fontsize=15)
g=sns.barplot(y=labels['rfm'].value_counts().index,x=labels['rfm'].value_counts().values)
plt.xticks(range(0,141,20),fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Number of Customers',fontsize=13)
plt.ylabel('Customers Label',fontsize=13,rotation=90)
for x,y in enumerate(labels['rfm'].value_counts().values):
    g.text(y+5,x,str(y),ha='center',fontsize=10)
image.png
# import order data
order_df = pd.read_excel("order_data.xlsx")
order_df['time']=order_df['time'].apply(lambda x:'2020'+x[4:])
# Convert order date to time
order_df["time"]=pd.to_datetime(order_df["time"])
# Extraction time
order_df["time_hour"]=order_df["time"].dt.hour
# In order to count by week, you need to convert the time to the day of the week. By default, Monday counts from 0
order_df["time_week"]=order_df["time"].dt.dayofweek+1
#purchase in the weeks
week_order_df = order_df.groupby("time_week").user_id.count().reset_index()

myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())

plt.figure(figsize=(10,3),dpi=100)
plt.plot(week_order_df['time_week'],week_order_df['user_id'],'--o')
plt.title('Purchase weeks',fontsize=15)
plt.xticks(range(1,8,1),fontsize=10)
plt.yticks(range(30000,42001,2000),fontsize=10)
plt.xlabel('Weeks',fontsize=13)
plt.ylabel('How many Purchases in a week',fontsize=13)
for x,y in enumerate(week_order_df['user_id']):
    plt.text(x+1,y+500,y,ha='center',fontsize=10)
image.png
# Weekly shopping
hour_order_df = order_df.groupby("time_hour").user_id.count().reset_index()

myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())

plt.figure(figsize=(10,3),dpi=100)
plt.plot(hour_order_df['time_hour'],hour_order_df['user_id'],'--o')
plt.title('Purchase Time Distribuution',fontsize=15)
plt.xticks(range(0,24,1),fontsize=10)
plt.yticks(range(0,25001,5000),fontsize=10)
plt.xlabel('24 hours',fontsize=13)
plt.ylabel('how many purchase',fontsize=13)
for x,y in enumerate(hour_order_df['user_id']):
    plt.text(x,y+1000,y,ha='center',fontsize=10)
image.png
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請結(jié)合常識與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

  • rljs by sennchi Timeline of History Part One The Cognitiv...
    sennchi閱讀 7,872評論 0 10
  • **2014真題Directions:Read the following text. Choose the be...
    又是夜半驚坐起閱讀 11,220評論 0 23
  • ![Flask](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAW...
    極客學(xué)院Wiki閱讀 7,850評論 0 3
  • 不知不覺易趣客已經(jīng)在路上走了快一年了,感覺也該讓更多朋友認(rèn)識知道易趣客,所以就謝了這篇簡介,已做創(chuàng)業(yè)記事。 易趣客...
    Physher閱讀 3,838評論 1 2
  • 雙胎妊娠有家族遺傳傾向,隨母系遺傳。有研究表明,如果孕婦本人是雙胎之一,她生雙胎的機(jī)率為1/58;若孕婦的父親或母...
    鄴水芙蓉hibiscus閱讀 3,934評論 0 2

友情鏈接更多精彩內(nèi)容