Background
- The company is a domestic e-commerce of daily necessities, electronic products, fast food.
- Through user consumption records in the past 30 days, analyze user consumption behavior, establish RFM model for user stratification, discover high-value users to achieve delicacy management.
Mind Map

User profile-web-1642391282193.jpeg
1.Data Overview
1.1 Import modules and data
#Import modules
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
# The specified font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
#Fixed the issue where the minus sign '-' appears as a square
matplotlib.rcParams['axes.unicode_minus'] = False
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
import gc
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
# Import data set
df = pd.read_excel("order_data.xlsx")
# Import the user details data set
df_user = pd.read_excel("user_data.xlsx")
1.2 Orders data

image.png
The number of unique values for an order features
df_count=pd.DataFrame()
for i in ['item_id','behavior_type','item_category','time']:
df_count.loc[i,0]=df[i].nunique()
from matplotlib.font_manager import FontProperties
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())
plt.figure(figsize=(8,6),dpi=100)
g=sns.barplot(data=df_count,x=df_count.index,y=0)
plt.title('Orders Data',fontsize=15)
x=df_count.index
y=df_count[0]
plt.yticks(np.arange(0,100001,20000),fontsize=10)
plt.ylabel('How many Unique Value',fontsize=13)
for i,row in enumerate(df_count[0]):
g.text(i,row+3000,str(row)[:-2],ha='center',va='center',fontsize=10)

image.png
1.3 User data

image.png
The number of unique values for user featuures
df_user_count=pd.DataFrame()
for i in df_user.drop('user_id',axis=1):
df_user_count.loc[i,0]=df_user[i].nunique()
plt.figure(figsize=(8,6),dpi=100)
g=sns.barplot(data=df_user_count,x=df_user_count.index,y=0)
plt.title('User Data',fontsize=15)
x=df_user_count.index
y=df_user_count[0]
plt.yticks(np.arange(0,41,5),fontsize=10)
plt.ylabel('How many Unique Value',fontsize=13)
for i,row in enumerate(df_user_count[0]):
g.text(i,row+1,str(row)[:-2],ha='center',va='center',fontsize=10)

image.png
df=labels
user_sex_df = df.groupby("gender")["user_id"].count()
## Pd. cut is used for data discretization and segmentation. Note that the group labels and the number of groups must be consistent
bins = [0,20,25,30,35,40,45,50,55,1000]
labels = ["(0-20]","(20-25]","(25-30]","(30-35]","(35-40]","(40-45]","(45-50]","(50-55]","55歲以上"]
df["age_group"] = pd.cut(df["age"],bins=bins,labels=labels)
# Draw a bar chart,rot=0 indicates rotation Angle, 0-360
age_group_df = df.groupby("age_group").user_id.count().reset_index()
# Group by province
pro_user_df = df.groupby("province").user_id.count().reset_index().sort_values(by="user_id",ascending=True)
pro_user_df=pro_user_df.sort_values('user_id',ascending=False)
# Group by city
city_user_df = df.groupby("city").user_id.count().reset_index().sort_values(by="user_id",ascending=True)
# Group by married
marry_df = df.groupby("marriage").user_id.count()
# Group by education
eduction_df = df.groupby("eduction").user_id.count().reset_index()
eduction_df=eduction_df.sort_values('user_id',ascending=False)
# Group by job
job_df = df.groupby("job").user_id.count().reset_index()
job_df=job_df.sort_values('user_id',ascending=False)
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(style='darkgrid',font=myfont.get_name())
plt.figure(figsize=(50,70),dpi=100)
plt.subplot(711)
sns.barplot(data=job_df,x='job',y='user_id',palette='Blues_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,81,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Occupation Distribution',fontsize=60)
for x,y in enumerate(job_df['user_id']):
plt.text(x,y+5,y,fontsize=50,ha='center')
plt.subplot(712)
sns.barplot(data=eduction_df,x='eduction',y='user_id')
plt.xticks(fontsize=50)
plt.yticks(range(0,201,50),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Education Distribution',fontsize=60)
for x,y in enumerate(eduction_df['user_id']):
plt.text(x,y+5,y,fontsize=50,ha='center')
plt.subplot(713)
sns.barplot(data=marry_df,x='marriage',y='user_id')
plt.xticks(fontsize=50)
plt.yticks(range(0,181,90),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Marital Distribution',fontsize=60)
for x,y in enumerate(marry_df['user_id']):
plt.text(x,y+5,y,fontsize=50,ha='center')
plt.subplot(714)
sns.barplot(data=city_user_df,x='city',y='user_id',palette='Greens_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,61,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Distribution of Customers in cities',fontsize=60)
for x,y in enumerate(city_user_df['user_id']):
plt.text(x,y+2,y,fontsize=50,ha='center')
plt.subplot(715)
sns.barplot(data=pro_user_df,x='province',y='user_id',palette='Oranges_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,61,20),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Distribution of Customers in Provinces',fontsize=60)
for x,y in enumerate(pro_user_df['user_id']):
plt.text(x,y+2,y,fontsize=50,ha='center')
plt.subplot(716)
sns.barplot(data=user_sex_df1,x=user_sex_df1.gender,y=user_sex_df1.user_id)
plt.xticks(fontsize=50)
plt.yticks(range(0,121,60),fontsize=50)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Customers Gender Distribution',fontsize=60)
for x,y in enumerate(user_sex_df1.user_id):
plt.text(x,y+2,y,fontsize=50,ha='center')
plt.subplot(717)
sns.barplot(data=age_group_df,x='age_group',y='user_id',palette='Reds_r')
plt.xticks(fontsize=50)
plt.yticks(range(0,71,20),fontsize=50)
plt.xlabel('Age',fontsize=55)
plt.ylabel('Number of Customers',fontsize=55)
plt.title('Customers Age Distribution',fontsize=60)
for x,y in enumerate(age_group_df['user_id']):
plt.text(x,y+2,y,fontsize=50,ha='center')
plt.tight_layout()
plt.subplots_adjust(wspace=0.5,hspace=0.5)
plt.savefig('711.png')

image.png
2.Date and time processing
#Split the time feature into date and period
df['date'] = df['time'].str[0:10]
df['date'] = pd.to_datetime(df['date'],format='%Y-%m-%d')
df['time'] = df['time'].str[11:]
df['time'] = df['time'].astype(int)
# Divide the time period into 'early morning', 'morning', 'noon', 'afternoon' and 'evening'
df['hour'] = pd.cut(df['time'],bins=[-1,5,10,13,18,24],labels=['凌晨','上午','中午','下午','晚上'])
myfont=FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf',size=8)
sns.set(font=myfont.get_name())
hour_order=['Morning','Noon','Afternoon','Evening','Early morning']
ax1=plt.figure(figsize=(8,6),dpi=100)
plt.title('Active Periods',fontsize=15)
sns.countplot(data=df,x='hour',order=hour_order,hue='behavior_type')
plt.legend(['Browse','Collect','Add to shopping cart','Purchase'],fontsize=10)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
ax2=plt.figure(figsize=(8,6),dpi=100)
ax2=sns.countplot(df['date'],order=sorted(df['date'].unique()))
plt.xticks(rotation=90,fontsize=15)
ax2.set_title('Active Date',fontsize=15,ha='center')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Year-Month-Day',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
In a day, users are most active in the evening, followed by afternoon, and less frequent in the early morning, which belongs to a reasonable category.
-
December 12 was the most active date in the past 30 days, mainly due to the Double 12 activity.
image.png

image.png
action_hour=pd.DataFrame(df['time'].value_counts()).sort_index(ascending=True)
plt.figure(figsize=(10,3),dpi=100)
plt.title('Active Time',fontsize=15)
plt.plot(action_hour.index,action_hour['time'],'-o')
plt.xticks(range(0,24,1),fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('(Time)',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
Throughout the day, users are active from 18:00 to 22:00 in the evening.

image.png
3.Create a user profile
3.1 User active time
3.1.1 Active browsing period
processing:
- a.Extract the browsing data of the user behavior_type=1.
- b.Then group count according to user ID + time period, and calculate the maximum value.
- c.Get the user ID + the most active time period. If there are multiple most active time ranges, splice with commas.
The type of the behavior_type
- Browse: Behavior_type =1
- Collection: Behavior_type =2
- Add purchase: Behavior_type =3
- Purchase: Behavior_type =4
#Group users and time periods and count the number of browse
time_browse = df[df['behavior_type']==1].groupby(['user_id','hour']).item_id.count().reset_index()
time_browse.rename(columns={'item_id':'hour_counts'},inplace=True)
#which periods dose users usually browse
time_browse_max = time_browse.groupby('user_id').hour_counts.max().reset_index()
time_browse_max.rename(columns={'hour_counts':'read_counts_max'},inplace=True)
time_browse = pd.merge(time_browse,time_browse_max,how='left',on='user_id')
#Select the time period that each user browses the most. If there are the same time periods, connect them with commas
time_browse_hour = time_browse.loc[time_browse['hour_counts']==time_browse['read_counts_max'],'hour'].groupby(time_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
count_browse_hour=time_browse_hour.groupby('hour').count()
count_browse_hour2=count_browse_hour.reset_index()
count_browse_hour2['hour']=count_browse_hour2['hour'].astype('category')
count_browse_hour2['hour'].cat.reorder_categories(['Morning','Noon','Afternoon','Evening','Early morning','Afternoon,Evening'], inplace=True)
count_browse_hour2.sort_values('hour',inplace=True)
#draw a picture
hour_order=['Morning','Noon','Afternoon','Evening','Early morning','Afternoon,Evening']
ax1=plt.figure(figsize=(10,6),dpi=100)
plt.title('Browse Time Distribution',fontsize=15)
g=sns.barplot(data=count_browse_hour2,x='hour',y='user_id',order=hour_order)
plt.xticks(fontsize=10)
plt.yticks(range(0,200,50),fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
for x,y in enumerate(count_browse_hour2['user_id']):
g.text(x,y+5,str(y),ha='center',fontsize=10)
-
A total of 121 people like to browse and buy goods in the evening, while only a few people like to browse e-commerce platforms in the early morning and noon.
image.png
3.1.2 Active purchasing period
processing:
- a. Extract the purchase data of the user behavior_type=4
- b. Groups by user ID + time and obtain the maximum value in the group
- c. Obtain the user ID + the most active period. If there are more than 2 the most active period, combine the user ID with commas.
time_buy = df[df['behavior_type']==4].groupby(['user_id','hour']).item_id.count().reset_index()
time_buy.rename(columns={'item_id':'hour_counts'},inplace=True)
time_buy_max = time_buy.groupby('user_id').hour_counts.max().reset_index()
time_buy_max.rename(columns={'hour_counts':'buy_counts_max'},inplace=True)
time_buy = pd.merge(time_buy,time_buy_max,how='left',on='user_id')
time_buy_hour = time_buy.loc[time_buy['hour_counts']==time_buy['buy_counts_max'],'hour'].groupby(time_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
#Adding user usually purchase periods to the user tag table
labels = pd.merge(labels,time_buy_hour,how='left',on='user_id')
labels.rename(columns={'hour':'time_buy'},inplace=True)
count_time_buy=labels.groupby('time_buy').agg({'time_buy':'count'})
#draw
ax1=plt.figure(figsize=(10,3),dpi=100)
plt.title('Purchase Periods Distribution',fontsize=15)
g=sns.barplot(data=count_time_buy,x=count_time_buy.index,y='time_buy')
plt.xticks(fontsize=10,rotation=90)
plt.yticks(range(0,81,20),fontsize=10)
plt.xlabel('Periods',fontsize=13)
plt.ylabel('Number of People',fontsize=13)
for x,y in enumerate(count_time_buy['time_buy']):
g.text(x,y+5,str(y),ha='center',fontsize=10)
-
People prefer to shop in the afternoon or evening, 113 people in total, accounting for 50.82%.
image.png
3.2 User behavior about products
3.2.1 Most viewed categories
processing:
- a. Extract the purchase data of the user behavior_type=1
- b. Groups by user ID + products' categories and obtain the maximum value in the group
- c. Obtain the user ID + the most viewed products' categories. If there are more than 2 most viewed products' categories, combine the user ID with commas.
df_cate_most_browse = df_browse.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_browse.rename(columns={'item_id':'item_category_counts'},inplace=True)
#Collects statistics on the categories most viewed by each user
df_cate_most_browse_max = df_cate_most_browse.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_browse_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_browse = pd.merge(df_cate_most_browse,df_cate_most_browse_max,how='left',on='user_id')
df_cate_most_browse['item_category'] = df_cate_most_browse['item_category'].astype(str)
#Select the category that each user browses the most. If there is a category that is tied the most, connect it with a comma
df_cate_browse = df_cate_most_browse.loc[df_cate_most_browse['item_category_counts']==df_cate_most_browse['item_category_counts_max'],'item_category'].groupby(df_cate_most_browse['user_id']).aggregate(lambda x:','.join(x)).reset_index()
#Add the most viewed categories to the user tag table
labels = pd.merge(labels,df_cate_browse,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_browse'},inplace=True)
3.2.2 Most collected categories
- a. Extract the purchase data of the user behavior_type=2
- b. Groups by user ID + products' categories and obtain the maximum value in the group
- c. Obtain the user ID + the Most collected categories. If there are more than 2 most collected categories, combine the user ID with commas.
df_cate_most_collect = df_collect.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_collect.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_collect_max = df_cate_most_collect.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_collect_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_collect = pd.merge(df_cate_most_collect,df_cate_most_collect_max,how='left',on='user_id')
df_cate_most_collect['item_category'] = df_cate_most_collect['item_category'].astype(str)
df_cate_collect = df_cate_most_collect.loc[df_cate_most_collect['item_category_counts']==df_cate_most_collect['item_category_counts_max'],'item_category'].groupby(df_cate_most_collect['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_collect,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_collect'},inplace=True)
3.2.3 Most added to shopping cart categories
- a. Extract the purchase data of the user behavior_type=3
- b. Groups by user ID + products' categories and obtain the maximum value in the group
- c. Obtain the user ID + the most added to shopping cart categories. If there are more than 2 most added to shopping cart categories, combine the user ID with commas.
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart = df_cart.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_cart.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_cart_max = df_cate_most_cart.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_cart_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_cart = pd.merge(df_cate_most_cart,df_cate_most_cart_max,how='left',on='user_id')
df_cate_most_cart['item_category'] = df_cate_most_cart['item_category'].astype(str)
df_cate_cart = df_cate_most_cart.loc[df_cate_most_cart['item_category_counts']==df_cate_most_cart['item_category_counts_max'],'item_category'].groupby(df_cate_most_cart['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_cart,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_cart'},inplace=True)
3.2.4 Most purchased categories
- a. Extract the purchase data of the user behavior_type=4
- b. Groups by user ID + products' categories and obtain the maximum value in the group
- c. Obtain the user ID + the most purchased categories. If there are more than 2 most purchased categories, combine the user ID with commas.
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy = df_buy.groupby(['user_id','item_category']).item_id.count().reset_index()
df_cate_most_buy.rename(columns={'item_id':'item_category_counts'},inplace=True)
df_cate_most_buy_max = df_cate_most_buy.groupby('user_id').item_category_counts.max().reset_index()
df_cate_most_buy_max.rename(columns={'item_category_counts':'item_category_counts_max'},inplace=True)
df_cate_most_buy = pd.merge(df_cate_most_buy,df_cate_most_buy_max,how='left',on='user_id')
df_cate_most_buy['item_category'] = df_cate_most_buy['item_category'].astype(str)
df_cate_buy = df_cate_most_buy.loc[df_cate_most_buy['item_category_counts']==df_cate_most_buy['item_category_counts_max'],'item_category'].groupby(df_cate_most_buy['user_id']).aggregate(lambda x:','.join(x)).reset_index()
labels = pd.merge(labels,df_cate_buy,how='left',on='user_id')
labels.rename(columns={'item_category':'cate_most_buy'},inplace=True)
- The top 10 categories of products Browsed, collected, added to shopping cart and purchased
# First, obtain the required data
df_browse = df.loc[df['behavior_type']==1,['user_id','item_id','item_category']]
df_collect = df.loc[df['behavior_type']==2,['user_id','item_id','item_category']]
df_cart = df.loc[df['behavior_type']==3,['user_id','item_id','item_category']]
df_buy = df.loc[df['behavior_type']==4,['user_id','item_id','item_category']]
def top10_purchase(df,action):
df_top10=df.groupby('item_id').count().sort_values('user_id',ascending=False)
df_top10['item_category']=round(df_top10['user_id']/df_top10['user_id'].sum()*100,3)
df_top10['item_category']=df_top10['item_category'].apply(lambda x:str(x)+'%')
df_top10=df_top10.head(10)
ax=plt.figure(figsize=(10,3),dpi=100)
plt.title('TOP10 Product '+'User '+action+'(Proportion)',fontsize=15)
g=sns.barplot(data=df_top10,x=df_top10.index,y='user_id',order=df_top10.index)
plt.xticks(fontsize=10,rotation=45)
plt.yticks(range(0,max(df_top10['user_id'].max()//10*13+1,df_top10['user_id'].max()+2),max(df_top10['user_id'].max()//10*10//5,1)),fontsize=10)
plt.xlabel('Item ID',fontsize=13)
plt.ylabel(action+' times',fontsize=13)
for x,y in enumerate(df_top10['user_id']):
g.text(x,y+y*0.05,str(y),ha='center',fontsize=10)
for x,y in enumerate(df_top10['item_category']):
g.text(x,df_top10['user_id'].min()/2,y,ha='center',fontsize=10)
top10_purchase(df_browse,'Browse')
top10_purchase(df_collect,'Collect')
top10_purchase(df_buy,'Purchase')
top10_purchase(df_cart,'Add to shopping cart')

image.png

image.png

image.png

image.png
3.3 User behavior in 30 days
3.3.1 Times of purchase in recent 30 days
- a. Extract the purchase data of the user behavior_type=4
- b. Count user id in groups
#group by purchase and count it
df_counts_30_buy = df[df['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_30_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_buy'},inplace=True)
3.3.2 Times of add to shopping cart in recent 30 days
- a. Extract the purchase data of the user behavior_type=3
- b. Count user id in groups
#group by add to shopping cart and count it
df_counts_30_cart = df[df['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_30_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_30_cart'},inplace=True)
labels=labels.iloc[:,~labels.columns.duplicated()]
3.3.3 Active days in recent 30 days
- a. Extract the purchase data of all users
- b. Count the number of days on different dates based on user ID groups
#Group users and count active days, including browsing, favorites, add and purchase
counts_30_active = df.groupby('user_id')['date'].nunique()
labels = pd.merge(labels,counts_30_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_30_active'},inplace=True)
3.4 User behavior in 7 days
#The last date in the dataset is December 18, and user behavior after December 11 is counted
df_near_7 = df[df['date']>'2020-12-11']
3.4.1 Times of purchase in recent 7 days
- a. Extract the purchase data of the user behavior_type=4
- b. Count user id in groups
df_counts_7_buy = df_near_7[df_near_7['behavior_type']==4].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_7_buy,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_buy'},inplace=True)
3.4.2 Times of add to shopping cart in recent 7 days
- a. Extract the purchase data of the user behavior_type=3
- b. Count user id in groups
df_counts_7_cart = df_near_7[df_near_7['behavior_type']==3].groupby('user_id').item_id.count().reset_index()
labels = pd.merge(labels,df_counts_7_cart,how='left',on='user_id')
labels.rename(columns={'item_id':'counts_7_cart'},inplace=True)
3.4.3 Active days in recent 7 days
- a. Extract the purchase data of all users
- b. Count the number of days on different dates based on user ID groups
counts_7_active = df_near_7.groupby('user_id')['date'].nunique()
labels = pd.merge(labels,counts_7_active,how='left',on='user_id')
labels.rename(columns={'date':'counts_7_active'},inplace=True)
3.5 Days since last activity
- 5.1 Days since last browsing
- 5.2 Days since last add to shopping cart
- 5.3 Days since last purchase
3.5.1 Days since last browsing
- a.Extract the browsing data of the user behavior_type=1.
- b.obtain the days since last browsing
- c. Add it to the label
days_browse = df[df['behavior_type']==1].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)
labels = pd.merge(labels,days_browse,how='left',on='user_id')
labels.rename(columns={'date':'days_browse'},inplace=True)
3.5.2 Days since last add to shopping cart
- a.Extract the adding to shopping cart data of the user behavior_type=3.
- b.obtain the days since last adding to shopping cart
- c. Add it to the label
days_cart = df[df['behavior_type']==3].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)
labels = pd.merge(labels,days_cart,how='left',on='user_id')
labels.rename(columns={'date':'days_cart'},inplace=True)
3.5.3 Days since last purchase
- a.Extract the purchase data of the user behavior_type=4.
- b.obtain the days since last purchase
- c. Add it to the label
days_buy = df[df['behavior_type']==4].groupby('user_id')['date'].max().apply(lambda x:(datetime.strptime('2020-12-19','%Y-%m-%d')-datetime.strptime(x,'%Y-%m-%d')).days)
labels = pd.merge(labels,days_buy,how='left',on='user_id')
labels.rename(columns={'date':'days_buy'},inplace=True)
3.6 Days between the last two purchases
- a.Extract the purchase data of the user behavior_type=4
- b. Count user id+date in groups
- c.Use diff function to obtain the days between the latest two purchases
- d.Add it to the label
df_interval_buy = df[df['behavior_type']==4].groupby(['user_id','date']).item_id.count().reset_index()
df_interval_buy['date']=df_interval_buy['date'].apply(lambda x:datetime.strptime(x,'%Y-%m-%d'))
interval_buy = df_interval_buy.groupby('user_id')['date'].apply(lambda x:x.sort_values().diff(1).dropna().head(1)).reset_index()
interval_buy['date'] = interval_buy['date'].apply(lambda x : x.days)
interval_buy.drop('level_1',axis=1,inplace=True)
interval_buy.rename(columns={'date':'interval_buy'},inplace=True)
labels = pd.merge(labels,interval_buy,how='left',on='user_id')
3.7 Browse but not buy
- a.Extract browsing data and purchase data.
- b.Groups by user ID + product id + users' activity and,obtain statistics of each user's browsing and purchase of each product.
- c.Determine whether each user browsing but not purchasing behavior for each item. If there is, the value of a new field is marked as 1; if there is no, it is marked as 0
- d.Then group users by user ID and count the number of times that each user browses but does not purchase. If the number is greater than 0, the value is yes; otherwise, the value is no
df_browse_buy = df.loc[(df['behavior_type']==1) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]\
browse_not_buy.fillna(0,inplace=True)
browse_not_buy['browse_not_buy'] = 0
browse_not_buy.loc[(browse_not_buy['browse']>0) & (browse_not_buy['buy']==0),'browse_not_buy'] = 1
browse_not_buy = browse_not_buy.groupby('user_id')['browse_not_buy'].sum().reset_index()
labels = pd.merge(labels,browse_not_buy,how='left',on='user_id')
labels['browse_not_buy'] = labels['browse_not_buy'].apply(lambda x: '是' if x>0 else '否')
3.8 Add to shopping cart but not buy
- a.Extract add to shopping cart data and purchase data.
- b.Groups by user ID + product id + users' activity and,obtain statistics of each user's adding to shopping cart and purchase of each product.
- c.Determine whether each user adding to shopping cart but not.purchasing behavior for each item. If there is, the value of a new field is marked as 1; if there is no, it is marked as 0.
- d、Then group users by user ID and count the number of times that each user's adding to shopping cart but does not purchase. If the number is greater than 0, the value is yes; otherwise, the value is no.
df_cart_buy = df.loc[(df['behavior_type']==3) | (df['behavior_type']==4),['user_id','item_id','behavior_type','time']]
cart_not_buy = pd.pivot_table(df_cart_buy,index=['user_id','item_id'],columns=['behavior_type'],values=['time'],aggfunc=['count'])
cart_not_buy.columns = ['cart','buy']
cart_not_buy.fillna(0,inplace=True)
cart_not_buy['cart_not_buy'] = 0
cart_not_buy.loc[(cart_not_buy['cart']>0) & (cart_not_buy['buy']==0),'cart_not_buy'] = 1
cart_not_buy = cart_not_buy.groupby('user_id')['cart_not_buy'].sum().reset_index()
labels = pd.merge(labels,cart_not_buy,how='left',on='user_id')
3.9 User tag
3.9.1 Repurchaser or not
- a、Extract the purchase data of the user behavior_type=4
- b、If the purchase is greater than 1, there is a repurchase, otherwise there is no repurchase
buy_again = df[df['behavior_type']==4].groupby('user_id')['item_id'].count().reset_index()
buy_again.rename(columns={'item_id':'buy_again'},inplace=True)
labels = pd.merge(labels,buy_again,how='left',on='user_id')
labels['buy_again'].fillna(-1,inplace=True)
#未購買的用戶標(biāo)記為‘未購買’,有購買未復(fù)購的用戶標(biāo)記為‘否’,有復(fù)購的用戶標(biāo)記為‘是’
labels['buy_again'] = labels['buy_again'].apply(lambda x: '是' if x>1 else '否' if x==1 else '未購買')
3.9.2 Page view
- a.Number of active users in the last 30 days
- b.Access times and access number
- c.Plot to get a trend chart
- d.Defined as "high" if access activity exceeds 20, "low" otherwise
user_active_level = labels['counts_30_active'].value_counts().sort_index(ascending=False)
plt.figure(figsize=(10,3),dpi=100)
plt.plot(user_active_level.index,user_active_level.values,'--o')
plt.title('Visiting Days in recent 30 Days and Number of Visitors',fontsize=15)
plt.xticks(range(0,32,5),fontsize=10)
plt.yticks(range(0,21,5),fontsize=10)
plt.xlabel('Visiting Days in recent 30 Days',fontsize=13)
plt.ylabel('Number of Visitors',fontsize=13)
for x,y in enumerate(list(reversed(user_active_level.values))):
plt.text(x+1,y+1,y,ha='center',fontsize=10)

image.png
- In general, the number of visitors with more visits is more than that with fewer visits, and the inflection point is about 20 visits. Therefore, the number of visits less than 20 times is defined as low activity, and the number of visits more than or equal to 20 times is defined as high activity. This definition is only from the perspective of the distribution of users, and the definition is active from the perspective of the business at work.
labels['buy_active_level'] = '高'
labels.loc[labels['counts_30_buy']<=19,'buy_active_level'] = '低'
3.9.3 Purchased only one kind of product or not
- a. Extract the purchase data of the user behavior_type=4
- b. Group users by USER ID and count the number of categories
- c. Check whether the count is greater than 1
buy_single = df[df['behavior_type']==4].groupby('user_id').item_category.nunique().reset_index()
buy_single.rename(columns={'item_category':'buy_single'},inplace=True)
labels = pd.merge(labels,buy_single,how='left',on='user_id')
labels['buy_single'].fillna(-1,inplace=True)
labels['buy_single'] = labels['buy_single'].apply(lambda x: '否' if x>1 else '是' if x==1 else '未購買' )
ax1=plt.figure(figsize=(8,6),dpi=100)
plt.title('Purchase only one time?',fontsize=15)
g=sns.countplot(data=labels,x='buy_single')
plt.xticks(fontsize=13)
plt.yticks(range(0,201,20),fontsize=10)
plt.xlabel(' ')
plt.ylabel('Number of People',fontsize=13,rotation=90)
for x,y in enumerate([180,29,13]):
g.text(x,y+5,str(y)+'人',ha='center',fontsize=10)

image.png
3.9.4 User label(RFM)
- a.Obtain the days since the last purchase as label 'days_buy'
- b.Count the purchase days as label 'buy_days_level'
- c.Draw the graph trend of buy_DAYs_level and the number of buyers
- d.Defined as "high" if days_buy exceeds 8, "low" otherwise
RFM:
Consumption in recency
Consumption frequency
Consumption monetary
last_buy_days = labels['days_buy'].value_counts().sort_index()
from matplotlib.font_manager import FontProperties
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())
plt.figure(figsize=(10,3),dpi=100)
plt.plot(last_buy_days.index,last_buy_days.values,'--o')
plt.title('Days since Last Purchase and Number of Purchasers',fontsize=15)
plt.xticks(range(0,32,5),fontsize=10)
plt.yticks(range(0,41,5),fontsize=10)
plt.xlabel('Days since Last Purchase',fontsize=13)
plt.ylabel('Number of Purchasers',fontsize=13)
for x,y in enumerate(last_buy_days.values):
plt.text(x+1,y+1,y,ha='center',fontsize=10)

image.png
labels['buy_days_level'] = '高'
labels.loc[labels['days_buy']>8,'buy_days_level'] = '低'
labels['rfm_value'] = labels['buy_active_level'].str.cat(labels['buy_days_level'])
def trans_value(x):
if x == 'HighHigh':
return 'Import Valuable Customres'
elif x == 'LowHigh':
return 'Import Developable Customres'
elif x == 'HighLow':
return 'Import Recall Customres'
else:
return 'About to Lose'
labels['rfm'] = labels['rfm_value'].apply(trans_value)
labels.drop(['buy_days_level','rfm_value'],axis=1,inplace=True)
RFM=labels['rfm'].value_counts()
ax1=plt.figure(figsize=(10,3),dpi=100)
plt.title('RFM Model',fontsize=15)
g=sns.barplot(y=labels['rfm'].value_counts().index,x=labels['rfm'].value_counts().values)
plt.xticks(range(0,141,20),fontsize=10)
plt.yticks(fontsize=10)
plt.xlabel('Number of Customers',fontsize=13)
plt.ylabel('Customers Label',fontsize=13,rotation=90)
for x,y in enumerate(labels['rfm'].value_counts().values):
g.text(y+5,x,str(y),ha='center',fontsize=10)

image.png
# import order data
order_df = pd.read_excel("order_data.xlsx")
order_df['time']=order_df['time'].apply(lambda x:'2020'+x[4:])
# Convert order date to time
order_df["time"]=pd.to_datetime(order_df["time"])
# Extraction time
order_df["time_hour"]=order_df["time"].dt.hour
# In order to count by week, you need to convert the time to the day of the week. By default, Monday counts from 0
order_df["time_week"]=order_df["time"].dt.dayofweek+1
#purchase in the weeks
week_order_df = order_df.groupby("time_week").user_id.count().reset_index()
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())
plt.figure(figsize=(10,3),dpi=100)
plt.plot(week_order_df['time_week'],week_order_df['user_id'],'--o')
plt.title('Purchase weeks',fontsize=15)
plt.xticks(range(1,8,1),fontsize=10)
plt.yticks(range(30000,42001,2000),fontsize=10)
plt.xlabel('Weeks',fontsize=13)
plt.ylabel('How many Purchases in a week',fontsize=13)
for x,y in enumerate(week_order_df['user_id']):
plt.text(x+1,y+500,y,ha='center',fontsize=10)

image.png
# Weekly shopping
hour_order_df = order_df.groupby("time_hour").user_id.count().reset_index()
myfont=FontProperties(fname=r'C:\\Windows\\Fonts\\simhei.ttf',size=8)
sns.set(font=myfont.get_name())
plt.figure(figsize=(10,3),dpi=100)
plt.plot(hour_order_df['time_hour'],hour_order_df['user_id'],'--o')
plt.title('Purchase Time Distribuution',fontsize=15)
plt.xticks(range(0,24,1),fontsize=10)
plt.yticks(range(0,25001,5000),fontsize=10)
plt.xlabel('24 hours',fontsize=13)
plt.ylabel('how many purchase',fontsize=13)
for x,y in enumerate(hour_order_df['user_id']):
plt.text(x,y+1000,y,ha='center',fontsize=10)

image.png


