用戶行為分析 SQL 整理

筆者從事大數(shù)據(jù)行業(yè)快三年時間,在用戶行為分析方向也有所沉淀。現(xiàn)在整理一些常用的 SQL ,希望本文對用戶行為分析的初學者的 SQL 學習有所幫助。

行業(yè)內(nèi)用的數(shù)據(jù)系統(tǒng)各式各樣,在 《華為數(shù)據(jù)之道》一書中描述的 “華為” 的數(shù)據(jù)中臺的五種數(shù)據(jù)中臺聯(lián)接方式:以業(yè)務流(事件)為中心聯(lián)接、以對象(主體)為中心聯(lián)接、智能標簽、報告數(shù)據(jù)、算法模型。大部分企業(yè)的最基礎的數(shù)據(jù)模型就是 “用戶行為” 和 "用戶主體"。

現(xiàn)在假設大家的數(shù)據(jù)系統(tǒng)里都有兩張大寬表,一張記錄 “用戶行為”,我們稱 events 表,一張記錄 “用戶主體”,我們稱 users 表。表的結構大致是:

Events 表

user_id event date time
1 visit 2021-01-01 2021-01-01 01:31:07.474
2 visit 2021-01-01 2021-01-01 01:32:03.674
3 payorder 2021-01-01 2021-01-01 01:33:12.444

Users 表

id first_id second_id age sex city
1 android_1 red 13 合肥,淮北
2 android_1 green 14 北京,上海
3 ios_1 blue 22 深圳

日訪問量( PV ):

SELECT  
    COUNT(*) AS "今日總訪問次數(shù)"
FROM  
    EVENTS 
WHERE 
    date=CURRENT_DATE() AND event='visit'

日活躍用戶數(shù)( UV ):

SELECT
    COUNT(DISTINCT user_id) AS "今日獨立用戶數(shù)"
FROM  
    EVENTS 
WHERE 
    date=CURRENT_DATE() AND event='visit'

最近七天日活:

SELECT  
     date,COUNT(DISTINCT user_id) AS "今日獨立用戶數(shù)"
FROM  
    EVENTS 
WHERE 
    event='visit'  AND date  BETWEEN CURRENT_DATE() - INTERVAL '7' DAY AND CURRENT_DATE() 
GROUP BY date

分時活躍:

SELECT
    HOUR(time) AS "小時數(shù)",
    COUNT(DISTINCT user_id) AS "獨立用戶數(shù)"
FROM  
    EVENTS 
WHERE 
    event='visit' AND date=CURRENT_DATE()
GROUP BY HOUR(time)

查詢每天上午 10 點至 11 點的下單用戶數(shù)

SELECT 
    COUNT(*) AS "獨立用戶數(shù)"
FROM 
    EVENTS
WHERE 
    EXTRACT(HOUR FROM time) IN (10, 11) AND event = 'payorder'
GROUP BY 1

List 類型的查詢,包含 xx 的 List 有哪些:

SELECT 
    city  AS "城市"
FROM
    users 
WHERE
    CONTAINS('合肥', city);

List 里的元素個數(shù):

SELECT 
    city,
    length(city)-length(replace(city, '\n',''))+1 AS "元素個數(shù)"
FROM
    users  
WHERE
    city IS NOT NULL

根據(jù)生日得到年齡

SELECT 
    id, 
    YEAR(NOW())-YEAR(TO_TIMESTAMP(birthday)) AS "年齡"
FROM 
    users

漏斗用戶:

visit(訪問)—addtocart(加購)—payorder(支付)(窗口期 48 小時且嚴格滿足事件先后順序)

SELECT
  COUNT(DISTINCT s3.user_id) AS "漏斗獨立用戶數(shù)"
FROM
  events s3
INNER JOIN (
    SELECT
      s2.user_id as user_id,
      s2.time as time,
      s1.endtime
    FROM
      EVENTS s2//s2 得到第二步用戶、加購時間、窗口期時間
    INNER JOIN (
        SELECT user_id,time,HOURS_ADD(time, 48) as endtime //窗口期 2 天(+ 48 小時) FROM EVENTS
        WHERE event = 'visit' AND date between '2021-01-01' and '2021-01-02' //時間范圍是 1 號到 3 號
        ) s1 //s1 得到用戶、訪問時間、訪問時間+48 小時
    ON s2.user_id = s1.user_id AND s2.time > s1.time AND s2.time <= endtime
    WHERE
      s2.event = 'addtocart'
  ) a 
ON
  s3.user_id = a.user_id AND s3.time > a.time AND s3.time <= a.endtime
WHERE
  s3.event = 'payorder' 

連續(xù) n 天訪問的用戶(n=1,2,3,4,...):


SELECT 
    COUNT(DISTINCT user_id) AS "連續(xù) n 天訪問的用戶"
FROM 
    (SELECT 
         user_id,
         date,
         TO_DATE(DAY_TO_DATE(LEAD(date,n-1,null)OVER(PARTITION BY user_id ORDER BY date ASC))) AS newdate
     FROM 
        (SELECT  user_id, date FROM  EVENTS  WHERE event='visit' GROUP BY user_id,date) a
     ) b
WHERE DATEDIFF(newdate,date)=n-1

PS:注意把 SQL 里的 n 替換為具體的天數(shù)

用戶連續(xù) n 天做某事(主要不同用戶得 n)--感謝同事提供:

SELECT 
  user_id,
  continuous_days  AS "連續(xù)訪問天數(shù)"
FROM
  (-- 原理:得到用戶 ymd1-ROW_NUMBER1=ymd2-ROW_NUMBER2 的數(shù)量,則為連續(xù)天數(shù)
   SELECT
     user_id,
     continuous_group,
     COUNT(1) AS continuous_days
   FROM
    (-- 原理:ROW_NUMBER 和日期天一樣是步長為 1 的數(shù)列
     -- 如果用戶是連續(xù)日期做這件事,日期減去ROW_NUMBER一定是相等的,以此構建分組
     SELECT
        ymd - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY ymd ASC) AS continuous_group,
        user_id,
        event
     FROM
        (-- 把用戶做這個事件的日期取出來
         SELECT
           DISTINCT
           -- 由于數(shù)字格式在跨月時步長不相等,如 20191101-20191031>1,需要把日期換算成連續(xù)天數(shù)列
           -- FROM_UNIXTIME(UNIX_TIMESTAMP(date),'yyyyMMdd') AS ymd,
           DATEDIFF(date,'1970-01-01') AS ymd,
           user_id,
           event
         FROM
           events
         WHERE
           event='visit'
        ) event_log
   ) event_group
   GROUP BY
     user_id,
     continuous_group
) continuous_groups
WHERE  
  continuous_days>1

查詢一段時間內(nèi)的用戶下單次數(shù)分布情況

SELECT 
    CASE
        WHEN c < 10 THEN '<10'
        WHEN c < 20 THEN '<20'
        WHEN c < 100 THEN '<100'
        ELSE '>100'
    END,
    COUNT(*)
FROM (
    SELECT user_id, COUNT(*) AS c FROM events
    WHERE date BETWEEN '2015-09-01' AND '2015-09-20' AND event = 'payorderr'
    GROUP BY 1
)a 
GROUP BY 1

做過 addtocart (加購) 且沒有 payorder (支付)的用戶:

SELECT 
    COUNT(DISTINCT a.user_id) AS "做過 addtocart (加購) 且沒有 payorder (支付)的用戶"
FROM 
    (SELECT user_id FROM EVENTS WHERE event='addtocart') a
LEFT JOIN 
    (SELECT user_id FROM EVENTS WHERE event='payorder') b
ON 
    a.user_id=b.user_id 
WHERE 
    b.user_id IS NULL

做過 addtocart (加購) 且沒有 payorder (支付)的用戶和商品:

SELECT  
    COUNT(DISTINCT addcart.mapid)  AS "做過 addtocart (加購) 且沒有 payorder (支付)的用戶和商品"
FROM 
    (SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM EVENTS WHERE event = 'addtocart')  addcart
LEFT JOIN
    (SELECT user_id, CONCAT(CAST(user_id AS STRING), commodityID) AS mapid FROM  EVENTS WHERE  event = 'payorder')  pay 
ON 
    addcart.mapid = pay.mapid
WHERE  
    pay.user_id is null

不同支付方式的支付金額總和(微信支付+支付寶支付):

SELECT
    user_id,
    SUM(value) AS "支付金額總和"
FROM
    (
    -- 支付寶支付金額
    SELECT
        user_id,
        SUM(events.alipay_amount) AS value
    FROM
        events 
    WHERE
        date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
        AND events.event='alipay_detail'
    GROUP BY
        user_id

    UNION ALL
   -- 微信支付金額
    SELECT
        user_id,
        SUM(events.wechat_amount) AS value
    FROM
        events 
    WHERE
        date BETWEEN '[baseTime]'-INTERVAL 30 DAY AND '[baseTime]'
        AND events.event='wechat_pay_detail'
    GROUP BY
        user_id
    ) t
GROUP BY
    user_id

正則表達式之使用 QQ 郵箱為郵件的用戶數(shù)

SELECT
    COUNT(*)  AS "獨立用戶數(shù)"
FROM 
    users
WHERE 
    regexp_like(email, '@qq.com$')

正則表達式之匹配 ID 規(guī)則

-- users 表的 first_id 記錄用戶的設備 id,second_id 來記錄用戶的登錄 id
-- 匹配 first_id 為 Android_id,Android_id 一般是 16 位字母和數(shù)字的組合
SELECT
    id,
    first_id AS "安卓設備 ID"
FROM 
    users
WHERE 
    REGEXP_LIKE(first_id, '^([0-9a-z]{1,16})$')

-- 匹配 first_id 為 IDFA/IDFV,一般是 32 位字母和數(shù)字的組合
SELECT
    id,
    first_id AS "?? 設備 ID"
FROM 
    users
WHERE 
    REGEXP_LIKE(first_id, '^([0-9A-Z]{8})(([/\s-][0-9A-Z]{4}){3})([/\s-][0-9A-Z]{12})$')

-- 匹配 first_id 為小程序的 open_id
SELECT
    id,
    first_id AS "小程序設備 ID"
FROM 
    users
WHERE 
    REGEXP_LIKE(first_id, '^o[0-9a-zA-Z_-]{27}$')

--這里不再一一列舉,只列舉幾個常用的
^\d{n}$   (驗證 n 位數(shù)字,n 輸入具體的值)
^[0-9a-zA-Z]{n,m}$ (n ~m 個數(shù)字、字母組成的字符串)
^[a-zA-Z]{n,m}$(n ~m 個字母組成的字符串)
^([A-Za-z0-9_\-\.])+\@[a-zA-Z0-9_\-]+([a-zA-Z0-9_\-\.])+$(驗證是否是郵箱)

間隔計算,計算兩個事件的間隔時間(超過 10 分鐘則不計算)

SELECT
    user_id,
    SUM(
      CASE WHEN
       end_time - begin_time < 600
      THEN
       end_time - begin_time
      ELSE
       0
      END
    ) AS "間隔時長(秒)"
FROM (
    SELECT
        user_id,
        EXTRACT(EPOCH FROM time) AS end_time,
        LAG(EXTRACT(EPOCH FROM time), 1, NULL) OVER (PARTITION BY user_id ORDER BY time ASC) AS begin_time 
    FROM events ) a
GROUP BY 1

計算支付行為間隔天數(shù)

SELECT 
    user_id,
    datex,
    DATEDIFF(datex, LAG(datex,1,NULL) OVER(PARTITION BY user_id ORDER BY datex ASC)) AS diff
FROM 
    (SELECT user_id,trunc(time, 'DD') AS datex FROM events WHERE event = 'payorder' GROUP BY user_id,datex) a

超級微笑曲線

SELECT 
    visit_days AS "訪問天數(shù)", 
    COUNT(user_id) AS "獨立用戶數(shù)"
FROM
   (SELECT user_id, COUNT(DISTINCT date) AS visit_days FROM events WHERE date BETWEEN CURRENT_DATE() - INTERVAL '30' DAY AND CURRENT_DATE() - INTERVAL '1' DAY GROUP BY 1) a
GROUP BY 1

用戶首個購買和第二次購買的日期間隔

SELECT 
    user_id,
    DATEDIFF(first_time_value(time,next_time),MIN(time)) AS "首次購買和第二次購買的時間差"
FROM  
    (SELECT user_id,time,LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time FROM events WHERE  event='payorder' ) a
GROUP BY 
    user_id

PS:first_time_value 是自定義的函數(shù),使用 first_time_value(time, 其他屬性) 聚合函數(shù)來獲取第一次發(fā)生某行為時的相關屬性,建議把這個函數(shù)內(nèi)置。如果沒有的話,需要參考:

SELECT 
    user_id,
    time,
    next_time,
    DATEDIFF(next_time,time) 
FROM
    (SELECT user_id,time,next_time,ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY time ASC)     AS rank FROM  (SELECT
    user_id,time,distinct_id,
    LEAD(time, 1, NULL) OVER (PARTITION BY user_id ORDER BY time asc) AS next_time
    FROM events WHERE  event='payOrder') a
    WHERE  next_time IS NOT NULL)b
WHERE rank=1

尋找流失用戶:最近兩次訪問時間間隔在 30 天以上

SELECT
    COUNT(user_id) AS "獨立用戶數(shù)"
FROM (
SELECT user_id,(UNIX_TIMESTAMP(MAX(t2)) - UNIX_TIMESTAMP(MAX(t1))) / 86400 AS d1
FROM (
SELECT user_id,LAG(time,1) OVER(PARTITION BY user_id ORDER BY time ASC) AS t1,time AS t2 FROM events
WHERE  event='visit' ) t
GROUP  BY user_id ) r

過去 7 天瀏覽偏好的商品類型(前 3)

/* 假設當前基準時間為 2019-06-19 */
/* 集合類型標簽*/ 

SELECT id, MAX(distinct_id) AS distinct_id,
    GROUP_CONCAT(product_type, '\n') AS value
FROM (
    SELECT id, distinct_id, product_type, 
        RANK() OVER (PARTITION BY id ORDER BY cnt DESC) AS rank_num
    FROM (
        SELECT user_id AS id, product_type,
            MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
        FROM events
        WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1'DAY
            AND event = 'payorder'
        GROUP BY 1, 2
    ) a
) b
WHERE rank_num <= 3
GROUP BY 1

/* 其中 group_concat(product_type, '\n') 表示用戶前三的商品類型。 */
/* 返回值是 list 類型,需要創(chuàng)建為集合類型的標簽 */

過去 7 天中用戶最近一次訪問距今時間(天)

/* 假設當前基準時間為 2019-06-19 */
/* 數(shù)值類型標簽 */

SELECT id, distinct_id, DATEDIFF(now(), time) AS value
FROM (
    SELECT user_id AS id, MAX(distinct_id) AS distinct_id, MAX(time) AS time
    FROM events
    WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
        AND event = 'View'
    GROUP BY 1
) a

/* 其中 View 為用戶訪事件,datediff(now(), time) as value 表示事件發(fā)生的距今天數(shù) */

過去 7 天瀏覽最多的商品類型

/* 字符串類型標簽 */

SELECT id, distinct_id, product_type AS value
FROM (
    SELECT id, distinct_id, product_type, 
        ROW_NUMBER() OVER (PARTITION BY id ORDER BY cnt DESC) AS row_num
    FROM (
        SELECT user_id AS id, product_type, MAX(distinct_id) AS distinct_id, COUNT(*) AS cnt
        FROM events
        WHERE date BETWEEN '[baseTime]' - INTERVAL '7' DAY AND '[baseTime]' - INTERVAL '1' DAY
            AND event = 'ProductDetails'
        GROUP BY 1, 2
    ) a
) b
WHERE row_num <= 1

暫時就這么多,我會不定期補充的~

?著作權歸作者所有,轉載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容