筆者從事大數(shù)據(jù)行業(yè)快三年時間,在用戶行為分析方向也有所沉淀。現(xiàn)在整理一些常用的 SQL ,希望本文對用戶行為分析的初學者的 SQL 學習有所幫助。
行業(yè)內(nèi)用的數(shù)據(jù)系統(tǒng)各式各樣,在 《華為數(shù)據(jù)之道》一書中描述的 “華為” 的數(shù)據(jù)中臺的五種數(shù)據(jù)中臺聯(lián)接方式:以業(yè)務流(事件)為中心聯(lián)接、以對象(主體)為中心聯(lián)接、智能標簽、報告數(shù)據(jù)、算法模型。大部分企業(yè)的最基礎的數(shù)據(jù)模型就是 “用戶行為” 和 "用戶主體"。
現(xiàn)在假設大家的數(shù)據(jù)系統(tǒng)里都有兩張大寬表,一張記錄 “用戶行為”,我們稱 events 表,一張記錄 “用戶主體”,我們稱 users 表。表的結構大致是:
Events 表
| user_id | event | date | time |
|---|---|---|---|
| 1 | visit | 2021-01-01 | 2021-01-01 01:31:07.474 |
| 2 | visit | 2021-01-01 | 2021-01-01 01:32:03.674 |
| 3 | payorder | 2021-01-01 | 2021-01-01 01:33:12.444 |
Users 表
| id | first_id | second_id | age | sex | city |
|---|---|---|---|---|---|
| 1 | android_1 | red | 13 | 男 | 合肥,淮北 |
| 2 | android_1 | green | 14 | 男 | 北京,上海 |
| 3 | ios_1 | blue | 22 | 女 | 深圳 |
日訪問量( PV ):
SELECT
COUNT(*) AS "今日總訪問次數(shù)"
FROM
EVENTS
WHERE
date=CURRENT_DATE() AND event='visit'
日活躍用戶數(shù)( UV ):
SELECT
COUNT(DISTINCT user_id) AS "今日獨立用戶數(shù)"
FROM
EVENTS
WHERE
date=CURRENT_DATE() AND event='visit'
最近七天日活:
SELECT
date,COUNT(DISTINCT user_id) AS "今日獨立用戶數(shù)"
FROM
EVENTS
WHERE
event='visit' AND date BETWEEN CURRENT_DATE() - INTERVAL '7' DAY AND CURRENT_DATE()
GROUP BY date
分時活躍:
SELECT
HOUR(time) AS "小時數(shù)",
COUNT(DISTINCT user_id) AS "獨立用戶數(shù)"
FROM
EVENTS
WHERE
event='visit' AND date=CURRENT_DATE()
GROUP BY HOUR(time)
查詢每天上午 10 點至 11 點的下單用戶數(shù)
SELECT
COUNT(*) AS "獨立用戶數(shù)"
FROM
EVENTS
WHERE
EXTRACT(HOUR FROM time) IN (10, 11) AND event = 'payorder'
GROUP BY 1
List 類型的查詢,包含 xx 的 List 有哪些:
SELECT
city AS "城市"
FROM
users
WHERE
CONTAINS('合肥', city);
List 里的元素個數(shù):
SELECT
city,
length(city)-length(replace(city, '\n',''))+1 AS "元素個數(shù)"
FROM
users
WHERE
city IS NOT NULL
根據(jù)生日得到年齡
SELECT
id,
YEAR(NOW())-YEAR(TO_TIMESTAMP(birthday)) AS "年齡"
FROM
users
漏斗用戶:
visit(訪問)—addtocart(加購)—payorder(支付)(窗口期 48 小時且嚴格滿足事件先后順序)
SELECT
COUNT(DISTINCT s3.user_id) AS "漏斗獨立用戶數(shù)"
FROM
events s3
INNER JOIN (
SELECT
s2.user_id as user_id,
s2.time as time,
s1.endtime
FROM
EVENTS s2//s2 得到第二步用戶、加購時間、窗口期時間
INNER JOIN (
SELECT user_id,time,HOURS_ADD(time, 48) as endtime //窗口期 2 天(+ 48 小時) FROM EVENTS
WHERE event = 'visit' AND date between '2021-01-01' and '2021-01-02' //時間范圍是 1 號到 3 號
) s1 //s1 得到用戶、訪問時間、訪問時間+48 小時
ON s2.user_id = s1.user_id AND s2.time > s1.time AND s2.time <= endtime
WHERE
s2.event = 'addtocart'
) a
ON
s3.user_id = a.user_id AND s3.time > a.time AND s3.time <= a.endtime
WHERE
s3.event = 'payorder'
連續(xù) n 天訪問的用戶(n=1,2,3,4,...):
SELECT
COUNT(DISTINCT user_id) AS "連續(xù) n 天訪問的用戶"
FROM
(SELECT
user_id,
date,
TO_DATE(DAY_TO_DATE(LEAD(date,n-1,null)OVER(PARTITION BY user_id ORDER BY date ASC))) AS newdate
FROM
(SELECT user_id, date FROM EVENTS WHERE event='visit' GROUP BY user_id,date) a
) b
WHERE DATEDIFF(newdate,date)=n-1
PS:注意把 SQL 里的 n 替換為具體的天數(shù)
用戶連續(xù) n 天做某事(主要不同用戶得 n)--感謝同事提供:
SELECT
user_id,
continuous_days AS "連續(xù)訪問天數(shù)"
FROM
(-- 原理:得到用戶 ymd1-ROW_NUMBER1=ymd2-ROW_NUMBER2 的數(shù)量,則為連續(xù)天數(shù)
SELECT
user_id,
continuous_group,
COUNT(1) AS continuous_days
FROM
(-- 原理:ROW_NUMBER 和日期天一樣是步長為 1 的數(shù)列
-- 如果用戶是連續(xù)日期做這件事,日期減去ROW_NUMBER一定是相等的,以此構建分組
SELECT
ymd - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY ymd ASC) AS continuous_group,
user_id,
event
FROM
(-- 把用戶做這個事件的日期取出來
SELECT
DISTINCT
-- 由于數(shù)字格式在跨月時步長不相等,如 20191101-20191031>1,需要把日期換算成連續(xù)天數(shù)列
-- FROM_UNIXTIME(UNIX_TIMESTAMP(date),'yyyyMMdd') AS ymd,
DATEDIFF(date,'1970-01-01') AS ymd,
user_id,
event
FROM
events
WHERE
event='visit'
) event_log
) event_group
GROUP BY
user_id,
continuous_group
) continuous_groups
WHERE
continuous_days>1
查詢一段時間內(nèi)的用戶下單次數(shù)分布情況
SELECT
CASE
WHEN c < 10 THEN '<10'
WHEN c < 20 THEN '<20'
WHEN c < 100 THEN '<100'
ELSE '>100'
END,
COUNT(*)
FROM (
SELECT user_id, COUNT(*) AS c FROM events
WHERE date BETWEEN '2015-09-01' AND '2015-09-20' AND event = 'payorderr'
GROUP BY 1
)a
GROUP BY 1
做過 addtocart (加購) 且沒有 payorder (支付)的用戶:
SELECT
COUNT(DISTINCT a.user_id) AS "做過 addtocart (加購) 且沒有 payorder (支付)的用戶"
FROM
(SELECT user_id FROM EVENTS WHERE event='addtocart') a
LEFT JOIN
(SELECT user_id FROM EVENTS WHERE event='payorder') b
ON
a.user_id=b.user_id
WHERE
b.user_id IS NULL
做過 addtocart (加購) 且沒有 payorder (支付)的用戶和商品:
SELECT
COUNT(DISTINCT addcart.mapid) AS "做過 addtocart (加購) 且沒有 payorder (支付)的用戶和商品"
FROM
(SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'addtocart') addcart
LEFT JOIN
(SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'payorder') pay
ON
addcart.mapid = pay.mapid
WHERE
pay.user_id is null
不同支付方式的支付金額總和(微信支付+支付寶支付):
SELECT
user_id,
SUM(value) AS "支付金額總和"
FROM
(
-- 支付寶支付金額
SELECT
user_id,
SUM(events.alipay_amount) AS value
FROM
events
WHERE
date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
AND events.event='alipay_detail'
GROUP BY
user_id
UNION ALL
-- 微信支付金額
SELECT
user_id,
SUM(events.wechat_amount) AS value
FROM
events
WHERE
date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
AND events.event='wechat_pay_detail'
GROUP BY
user_id
) t
GROUP BY
user_id
正則表達式之使用 QQ 郵箱為郵件的用戶數(shù)
SELECT
COUNT(*) AS "獨立用戶數(shù)"
FROM
users
WHERE
regexp_like(email, '@qq.com$')
正則表達式之匹配 ID 規(guī)則
-- users 表的 first_id 記錄用戶的設備 id,second_id 來記錄用戶的登錄 id
-- 匹配 first_id 為 Android_id,Android_id 一般是 16 位字母和數(shù)字的組合
SELECT
id,
first_id AS "安卓設備 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^([0-9a-z]{1,16})$')
-- 匹配 first_id 為 IDFA/IDFV,一般是 32 位字母和數(shù)字的組合
SELECT
id,
first_id AS "?? 設備 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^([0-9A-Z]{8})(([/\s-][0-9A-Z]{4}){3})([/\s-][0-9A-Z]{12})$')
-- 匹配 first_id 為小程序的 open_id
SELECT
id,
first_id AS "小程序設備 ID"
FROM
users
WHERE
REGEXP_LIKE(first_id, '^o[0-9a-zA-Z_-]{27}$')
--這里不再一一列舉,只列舉幾個常用的
^\d{n}$ (驗證 n 位數(shù)字,n 輸入具體的值)
^[0-9a-zA-Z]{n,m}$ (n ~m 個數(shù)字、字母組成的字符串)
^[a-zA-Z]{n,m}$(n ~m 個字母組成的字符串)
^([A-Za-z0-9_\-\.])+\@[a-zA-Z0-9_\-]+([a-zA-Z0-9_\-\.])+$(驗證是否是郵箱)
間隔計算,計算兩個事件的間隔時間(超過 10 分鐘則不計算)
SELECT
user_id,
SUM(
CASE WHEN
end_time - begin_time < 600
THEN
end_time - begin_time
ELSE
0
END
) AS "間隔時長(秒)"
FROM (
SELECT
user_id,
EXTRACT(EPOCH FROM time) AS end_time,
LAG(EXTRACT(EPOCH FROM time), 1, NULL) OVER (PARTITION BY user_id ORDER BY time ASC) AS begin_time
FROM events ) a
GROUP BY 1
計算支付行為間隔天數(shù)
SELECT
user_id,
datex,
DATEDIFF(datex, LAG(datex,1,NULL) OVER(PARTITION BY user_id ORDER BY datex ASC)) AS diff
FROM
(SELECT user_id,trunc(time, 'DD') AS datex FROM events WHERE event = 'payorder' GROUP BY user_id,datex) a
超級微笑曲線
SELECT
visit_days AS "訪問天數(shù)",
COUNT(user_id) AS "獨立用戶數(shù)"
FROM
(SELECT user_id, COUNT(DISTINCT date) AS visit_days FROM events WHERE date BETWEEN CURRENT_DATE() - INTERVAL '30' DAY AND CURRENT_DATE() - INTERVAL '1' DAY GROUP BY 1) a
GROUP BY 1
用戶首個購買和第二次購買的日期間隔
SELECT
user_id,
DATEDIFF(first_time_value(time,next_time),MIN(time)) AS "首次購買和第二次購買的時間差"
FROM
(SELECT user_id,time,LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time FROM events WHERE event='payorder' ) a
GROUP BY
user_id
PS:first_time_value 是自定義的函數(shù),使用 first_time_value(time, 其他屬性) 聚合函數(shù)來獲取第一次發(fā)生某行為時的相關屬性,建議把這個函數(shù)內(nèi)置。如果沒有的話,需要參考:
SELECT
user_id,
time,
next_time,
DATEDIFF(next_time,time)
FROM
(SELECT user_id,time,next_time,ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY time ASC) AS rank FROM (SELECT
user_id,time,distinct_id,
LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time
FROM events WHERE event='payOrder') a
WHERE next_time IS NOT NULL)b
WHERE rank=1
尋找流失用戶:最近兩次訪問時間間隔在 30 天以上
SELECT
COUNT(user_id) AS "獨立用戶數(shù)"
FROM (
SELECT user_id,(UNIX_TIMESTAMP(MAX(t2)) - UNIX_TIMESTAMP(MAX(t1))) / 86400 AS d1
FROM (
SELECT user_id,LAG(time,1) OVER(PARTITION BY user_id ORDER BY time ASC) AS t1,time AS t2 FROM events
WHERE event='visit' ) t
GROUP BY user_id ) r
過去 7 天瀏覽偏好的商品類型(前 3)
/* 假設當前基準時間為 2019-06-19 */
/* 集合類型標簽*/
SELECT id, MAX(distinct_id) AS distinct_id,
GROUP_CONCAT(product_type, '\n') AS value
FROM (
SELECT id, distinct_id, product_type,
RANK() OVER (PARTITION BY id ORDER BY cnt DESC) AS rank_num
FROM (
SELECT user_id AS id, product_type,
MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
AND event = 'payorder'
GROUP BY 1, 2
) a
) b
WHERE rank_num <= 3
GROUP BY 1
/* 其中 group_concat(product_type, '\n') 表示用戶前三的商品類型。 */
/* 返回值是 list 類型,需要創(chuàng)建為集合類型的標簽 */
過去 7 天中用戶最近一次訪問距今時間(天)
/* 假設當前基準時間為 2019-06-19 */
/* 數(shù)值類型標簽 */
SELECT id, distinct_id, DATEDIFF(now(), time) AS value
FROM (
SELECT user_id AS id, MAX(distinct_id) AS distinct_id, MAX(time) AS time
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
AND event = 'View'
GROUP BY 1
) a
/* 其中 View 為用戶訪事件,datediff(now(), time) as value 表示事件發(fā)生的距今天數(shù) */
過去 7 天瀏覽最多的商品類型
/* 字符串類型標簽 */
SELECT id, distinct_id, product_type AS value
FROM (
SELECT id, distinct_id, product_type,
ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) AS row_num
FROM (
SELECT user_id AS id, product_type, MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
FROM events
WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
AND event = 'ProductDetails'
GROUP BY 1, 2
) a
) b
WHERE row_num <= 1
暫時就這么多,我會不定期補充的~