dpvs學習筆記: 12 TOA 實現原理

在 full-nat two-arm 模式下,後端 real server 獲取到請求的來源都是 dpvs local ip, 如何獲取真實的 client ip 呢?這就需要 toa 模塊,原理都說是修改了 rs 機器獲取 ip 的函數,具體如何實現呢?

tcp option 字段

關於 tcp header 可以參考 wiki, 我把截圖貼上來

tcp header

我們知道 ip header 裡 src address 肯定是 dpvs local ip, 否則數據包無法發送。那麼 client ip 放哪裡呢?就是在 tcp header 的 option 字段中。

option 字段最長 40 bytes. 每填充一個選項由三部分構成:op-kind, op-length, op-data. 最常用的 mss 字段就是放在 option 裡。只要構建一個不衝突的 op-kind 就可以把 client ip 填充進去。ipv4 地址的長度是 4 bytes, ipv6 是 16 bytes. 看來整個 option 字段在不久就會不夠用。

dpvs 寫 tcp option address

DPVS fullnat 在調用 tcp_fnat_in_handler 時會調用 tcp_in_add_toa 將 toa 選項寫到 mbuf.

static inline int tcp_in_add_toa(struct dp_vs_conn *conn, struct rte_mbuf *mbuf,
                          struct tcphdr *tcph)
{
    uint32_t mtu;
    struct tcpopt_addr *toa;
    uint32_t tcp_opt_len;

    uint8_t *p, *q, *tail;
    struct route_entry *rt;

    if (unlikely(conn->af != AF_INET && conn->af != AF_INET6))
        return EDPVS_NOTSUPP;

    tcp_opt_len = conn->af == AF_INET ? TCP_OLEN_IP4_ADDR : TCP_OLEN_IP6_ADDR;
    /*
     * check if we can add the new option
     */
    /* skb length and tcp option length checking */
    if ((rt = mbuf->userdata) != NULL) {
        mtu = rt->mtu;
    } else if (conn->in_dev) { /* no route for fast-xmit */
        mtu = conn->in_dev->mtu;
    } else {
        RTE_LOG(DEBUG, IPVS, "add toa: MTU unknown.\n");
        return EDPVS_NOROUTE;
    }

    if (unlikely(mbuf->pkt_len > (mtu - tcp_opt_len))) {
        RTE_LOG(DEBUG, IPVS, "add toa: need fragment, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_FRAG;
    }

    /* maximum TCP header is 60, and 40 for options */
    if (unlikely((60 - (tcph->doff << 2)) < tcp_opt_len)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no TCP header room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /* check tail room and expand mbuf.
     * have to pull all bits in segments for later operation. */
    if (unlikely(mbuf_may_pull(mbuf, mbuf->pkt_len) != 0))
        return EDPVS_INVPKT;
    tail = (uint8_t *)rte_pktmbuf_append(mbuf, tcp_opt_len);
    if (unlikely(!tail)) {
        RTE_LOG(DEBUG, IPVS, "add toa: no mbuf tail room, tcp opt len : %u.\n",
                tcp_opt_len);
        return EDPVS_NOROOM;
    }

    /*
     * now add address option
     */

    /* move data down, including existing tcp options
     * @p is last data byte,
     * @q is new position of last data byte */
    p = tail - 1;
    q = p + tcp_opt_len;
    while (p >= ((uint8_t *)tcph + sizeof(struct tcphdr))) {
        *q = *p;
        p--, q--;
    }

    /* insert toa right after TCP basic header */
    toa = (struct tcpopt_addr *)(tcph + 1);
    toa->opcode = TCP_OPT_ADDR;
    toa->opsize = tcp_opt_len;
    toa->port = conn->cport;

    if (conn->af == AF_INET) {
        struct tcpopt_ip4_addr *toa_ip4 = (struct tcpopt_ip4_addr *)(tcph + 1);
        toa_ip4->addr = conn->caddr.in;
    }
    else {
        struct tcpopt_ip6_addr *toa_ip6 = (struct tcpopt_ip6_addr *)(tcph + 1);
        toa_ip6->addr = conn->caddr.in6;
    }


    /* reset tcp header length */
    tcph->doff += tcp_opt_len >> 2;

    /* reset ip header total length */
    if (conn->af == AF_INET)
        ip4_hdr(mbuf)->total_length =
            htons(ntohs(ip4_hdr(mbuf)->total_length) + tcp_opt_len);
    else
        ip6_hdr(mbuf)->ip6_plen =
            htons(ntohs(ip6_hdr(mbuf)->ip6_plen) + tcp_opt_len);

    /* tcp csum will be recalc later, 
     * so as IP hdr csum since iph.tot_len has been chagned. */
    return EDPVS_OK;
}
  1. 根據 ipv4 ipv6 來確定 toa 需要的長度:1 byte op-kind, 1 byte op-length, 2 bytes 端口,再加上地址長度。所以 ipv4 共需 8 bytes, ipv6 共需 20 bytes
  2. TCP header 最大長度 60,option 最大長度 40,確保不會超過
  3. rte_pktmbuf_append 將 mbuf 擴展空間,能容納 toa
  4. 填充 tcpopt_addr 結構體,op-kind TCP_OPT_ADDR 是 254,非官方 tcp/ip 認可的值。端口值是 conn->cport, 最後填充 conn->caddr.in 或 conn->caddr.in6 地址。

real server 安裝 toa

很簡單,make 編譯後生成 toa.ko 驅動,然後 insmod toa.ko 即可。所有 real server 都需要安裝。先看下 module_init 函數 toa_init

/*
 * Module init: allocate the per-cpu statistics, register the "toa_stats"
 * proc entry, resolve the kernel symbols that are needed and install the
 * TOA hooks.
 *
 * Returns 0 on success or a negative errno on failure.  The original code
 * returned 1 on failure; a positive value from a module init function is
 * not treated as an error by the module loader, so the module would stay
 * loaded after its statistics had already been freed.
 */
static int __init
toa_init(void)
{
    int ret;

    TOA_INFO("TOA " TOA_VERSION " by pukong.wjm\n");

    /* alloc statistics array for toa */
    ext_stats = alloc_percpu(struct toa_stat_mib);
    if (NULL == ext_stats)
        return -ENOMEM;

    /* NOTE(review): the result of proc_net_fops_create() is not checked,
     * same as in the original code - confirm whether a missing stats entry
     * should be fatal. */
    proc_net_fops_create(&init_net, "toa_stats", 0, &toa_stats_fops);

    /* get the address of function sock_def_readable
     * so later we can know whether the sock is for rpc, tux or others
     */
    sk_data_ready_addr = kallsyms_lookup_name("sock_def_readable");
    TOA_INFO("CPU [%u] sk_data_ready_addr = "
        "kallsyms_lookup_name(sock_def_readable) = %lu\n",
         smp_processor_id(), sk_data_ready_addr);
    if (0 == sk_data_ready_addr) {
        TOA_INFO("cannot find sock_def_readable.\n");
        ret = -ENOENT;
        goto err;
    }

#ifdef TOA_IPV6_ENABLE
    if (0 != get_kernel_ipv6_symbol()) {
        TOA_INFO("get ipv6 struct from kernel fail.\n");
        ret = -ENOENT;
        goto err;
    }
#endif

    /* hook funcs for parse and get toa */
    hook_toa_functions();

    TOA_INFO("toa loaded\n");
    return 0;

err:
    /* undo what was set up above, in reverse order */
    proc_net_remove(&init_net, "toa_stats");
    if (NULL != ext_stats) {
        free_percpu(ext_stats);
        ext_stats = NULL;
    }

    return ret;
}
  1. proc_net_fops_create 在 /proc 文件系統下注冊 /proc/net/toa_stats 用於查看統計信息
  2. kallsyms_lookup_name 根據名稱來獲取 sock_def_readable 地址
  3. get_kernel_ipv6_symbol 如果支持 ipv6, 獲取相應的回調函數地址
  4. hook_toa_functions 將 toa 功能 hook 進內核
    proc_net_fops_create
/* replace the functions with our functions */
static inline int
hook_toa_functions(void)
{
    /* hook inet_getname for ipv4 */
    struct proto_ops *inet_stream_ops_p =
            (struct proto_ops *)&inet_stream_ops;
    /* hook tcp_v4_syn_recv_sock for ipv4 */
    struct inet_connection_sock_af_ops *ipv4_specific_p =
            (struct inet_connection_sock_af_ops *)&ipv4_specific;

    inet_stream_ops_p->getname = inet_getname_toa;
    TOA_INFO("CPU [%u] hooked inet_getname <%p> --> <%p>\n",
        smp_processor_id(), inet_getname, inet_stream_ops_p->getname);

    ipv4_specific_p->syn_recv_sock = tcp_v4_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v4_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v4_syn_recv_sock,
        ipv4_specific_p->syn_recv_sock);

#ifdef TOA_IPV6_ENABLE
    inet6_stream_ops_p->getname = inet6_getname_toa;
    TOA_INFO("CPU [%u] hooked inet6_getname <%p> --> <%p>\n",
        smp_processor_id(), inet6_getname, inet6_stream_ops_p->getname);

    ipv6_specific_p->syn_recv_sock = tcp_v6_syn_recv_sock_toa;
    TOA_INFO("CPU [%u] hooked tcp_v6_syn_recv_sock <%p> --> <%p>\n",
        smp_processor_id(), tcp_v6_syn_recv_sock_org_pt,
        ipv6_specific_p->syn_recv_sock);
#endif

    return 0;
}

仔細看看也不難,就是將 inet ops 回調函數 getname 替換為 toa 的。但是我有問題,如果請求不來自 dpvs,普通的請求會不會也受影響?

可以看到 hook 了兩個函數 tcp_v4_syn_recv_sock_toa 和 inet_getname_toa

real server 獲取 client ip

當完成三次握手時調用 tcp_v4_syn_recv_sock_toa

/*
 * Replacement for the syn_recv_sock hook of ipv4_specific.
 *
 * Calls the original tcp_v4_syn_recv_sock() and, when that yields a socket
 * whose sk_user_data is still NULL, stores the result of
 * get_toa_data(AF_INET, skb) in it and bumps the matching statistics
 * counter.
 *
 * Returns whatever tcp_v4_syn_recv_sock() returned (possibly NULL).
 */
static struct sock *
tcp_v4_syn_recv_sock_toa(struct sock *sk, struct sk_buff *skb,
            struct request_sock *req, struct dst_entry *dst)
{
    struct sock *child;

    TOA_DBG("tcp_v4_syn_recv_sock_toa called\n");

    /* let the original handler create the socket first */
    child = tcp_v4_syn_recv_sock(sk, skb, req, dst);

    /* nothing to do without a socket, or when sk_user_data is already set */
    if (NULL == child || NULL != child->sk_user_data)
        return child;

    child->sk_user_data = get_toa_data(AF_INET, skb);
    if (NULL == child->sk_user_data) {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_NO_TOA_CNT);
    } else {
        TOA_INC_STATS(ext_stats, SYN_RECV_SOCK_TOA_CNT);
    }

    TOA_DBG("tcp_v4_syn_recv_sock_toa: set "
        "sk->sk_user_data to %p\n",
        child->sk_user_data);

    return child;
}
  1. 調用原有函數 tcp_v4_syn_recv_sock 處理,也就是這裡兼容了原有邏輯,普通非 toa 請求也會正常獲取到 ip
  2. 額外調用 get_toa_data 生成地址,可以看到地址放到了新 socket 的 sk_user_data 字段 (newsock->sk_user_data)。
/*
 * Walk the TCP options of @skb looking for a TOA option (TCPOPT_TOA).
 *
 * @af:  address family of the socket the data is meant for.
 * @skb: packet whose TCP header is parsed; may be NULL.
 *
 * Return value:
 *   - IPv4-sized TOA option, af == AF_INET: no allocation is made; the
 *     first sizeof(void *) bytes of the option (opcode, opsize, port, ip)
 *     are copied into the returned pointer value itself.
 *   - IPv4-sized TOA option, af == AF_INET6 (TOA_IPV6_ENABLE only): a
 *     kmalloc'ed struct toa_ip6_data holding the client address as an
 *     IPv4-mapped IPv6 address (::ffff:a.b.c.d).
 *   - IPv6-sized TOA option, af == AF_INET6 (TOA_IPV6_ENABLE only): a
 *     kmalloc'ed copy of the option.
 *   - NULL when no usable TOA option exists, the options are malformed,
 *     or the allocation fails.
 *
 * Who frees the kmalloc'ed results is not visible in this function.
 */
static void *get_toa_data(int af, struct sk_buff *skb)
{
    struct tcphdr *th;
    int length;
    unsigned char *ptr;

    TOA_DBG("get_toa_data called\n");

    if (NULL == skb)
        return NULL;

    th = tcp_hdr(skb);
    /* number of option bytes behind the basic TCP header */
    length = (int)(th->doff * 4) - (int)sizeof(struct tcphdr);
    ptr = (unsigned char *) (th + 1);

    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return NULL;
        case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            /* a kind byte without a length byte: stop here instead of
             * reading the length from beyond the options area */
            if (length < 2)
                return NULL;
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return NULL;
            if (opsize > length)
                /* don't parse partial options */
                return NULL;
            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP4_TOA == opsize) {

                struct toa_ip4_data tdata;
                void *ret_ptr = NULL;

                /* ptr is 2 bytes past the option start by now */
                memcpy(&tdata, ptr - 2, sizeof(tdata));
                TOA_DBG("af = %d, find toa data: ip = "
                    TOA_NIPQUAD_FMT", port = %u\n",
                    af,
                    TOA_NIPQUAD(tdata.ip),
                    ntohs(tdata.port));
                if (af == AF_INET) {
                    /* pack the option bytes into the pointer value */
                    memcpy(&ret_ptr, &tdata,
                        sizeof(ret_ptr));
                    TOA_DBG("coded ip4 toa data: %p\n",
                        ret_ptr);
                    return ret_ptr;
                }
#ifdef TOA_IPV6_ENABLE
                else if (af == AF_INET6) {
                    struct toa_ip6_data *ptr_toa_ip6 =
                        kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                    if (!ptr_toa_ip6) {
                        return NULL;
                    }
                    ptr_toa_ip6->opcode = opcode;
                    ptr_toa_ip6->opsize = TCPOLEN_IP6_TOA;
                    /* carry the client port over as well; the original
                     * left this field of the fresh allocation unset */
                    ptr_toa_ip6->port = tdata.port;
                    ipv6_addr_set(&ptr_toa_ip6->in6_addr, 0, 0,
                        htonl(0x0000FFFF), tdata.ip);
                    TOA_DBG("coded ip6 toa data: %p\n",
                        ptr_toa_ip6);
                    TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                    return ptr_toa_ip6;
                }
#endif
            }

#ifdef TOA_IPV6_ENABLE
            if (TCPOPT_TOA == opcode &&
                TCPOLEN_IP6_TOA == opsize &&
                af == AF_INET6) {
                struct toa_ip6_data *ptr_toa_ip6 =
                    kmalloc(sizeof(struct toa_ip6_data), GFP_ATOMIC);
                if (!ptr_toa_ip6) {
                    return NULL;
                }
                memcpy(ptr_toa_ip6, ptr - 2, sizeof(struct toa_ip6_data));

                /* port is kept in network byte order, convert for the log
                 * like the IPv4 message above does */
                TOA_DBG("find toa_v6 data : ip = "
                    TOA_NIP6_FMT", port = %u,"
                    " coded ip6 toa data: %p\n",
                    TOA_NIP6(ptr_toa_ip6->in6_addr),
                    ntohs(ptr_toa_ip6->port),
                    ptr_toa_ip6);
                TOA_INC_STATS(ext_stats, IP6_ADDR_ALLOC_CNT);
                return ptr_toa_ip6;
            }
#endif
            /* not ours: skip the rest of this option */
            ptr += opsize - 2;
            length -= opsize;
        }
    }

    return NULL;
}
  1. 遍歷所有 option, 找到 opcode 為 TCPOPT_TOA 的選項,再根據 opsize 區分 ipv4 或是 ipv6
  2. 將 toa 數據復制一份然後返回:ipv4 時直接把 toa 數據編碼進返回的指針值,ipv6 時 kmalloc 一個 toa_ip6_data 結構體

然後當 real server 調用 getpeername 或是 getsockname 時調用 inet_getname_toa 來獲取 ip,如果是 ipv6 則調用 inet6_getname_toa

/*
 * Replacement for the getname hook of inet_stream_ops.
 *
 * Calls the original inet_getname() and, for peer lookups (@peer != 0) on a
 * socket whose sk_user_data holds IPv4 TOA data, overwrites the port and
 * address in @uaddr with the client port/address taken from that data.
 *
 * The TOA data is not pointed to by sk_user_data: the bytes of the pointer
 * value itself are reinterpreted as a struct toa_ip4_data (hence the memcpy
 * from &sk->sk_user_data).  The data is only trusted when sk_data_ready is
 * still sock_def_readable and the opcode/opsize fields match.
 *
 * Returns the result of inet_getname().
 */
static int
inet_getname_toa(struct socket *sock, struct sockaddr *uaddr,
        int *uaddr_len, int peer)
{
    int retval = 0;
    struct sock *sk = sock->sk;
    struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
    struct toa_ip4_data tdata;

    TOA_DBG("inet_getname_toa called, sk->sk_user_data is %p\n",
        sk->sk_user_data);

    /* call orginal one */
    retval = inet_getname(sock, uaddr, uaddr_len, peer);

    /* set our value if need */
    if (retval == 0 && NULL != sk->sk_user_data && peer) {
        if (sk_data_ready_addr == (unsigned long) sk->sk_data_ready) {
            /* decode the option bytes stored in the pointer value */
            memcpy(&tdata, &sk->sk_user_data, sizeof(tdata));
            if (TCPOPT_TOA == tdata.opcode &&
                TCPOLEN_IP4_TOA == tdata.opsize) {
                TOA_INC_STATS(ext_stats, GETNAME_TOA_OK_CNT);
                TOA_DBG("inet_getname_toa: set new sockaddr, ip "
                    TOA_NIPQUAD_FMT" -> "TOA_NIPQUAD_FMT
                    ", port %u -> %u\n",
                    TOA_NIPQUAD(sin->sin_addr.s_addr),
                    TOA_NIPQUAD(tdata.ip), ntohs(sin->sin_port),
                    ntohs(tdata.port));
                sin->sin_port = tdata.port;
                sin->sin_addr.s_addr = tdata.ip;
            } else { /* sk_user_data doesn't belong to us */
                TOA_INC_STATS(ext_stats,
                        GETNAME_TOA_MISMATCH_CNT);
                TOA_DBG("inet_getname_toa: invalid toa data, "
                    "ip "TOA_NIPQUAD_FMT" port %u opcode %u "
                    "opsize %u\n",
                    TOA_NIPQUAD(tdata.ip), ntohs(tdata.port),
                    tdata.opcode, tdata.opsize);
            }
        } else {
            /* sk_data_ready was replaced by someone else: leave the
             * address alone */
            TOA_INC_STATS(ext_stats, GETNAME_TOA_BYPASS_CNT);
        }
    } else { /* no need to get client ip */
        TOA_INC_STATS(ext_stats, GETNAME_TOA_EMPTY_CNT);
    }

    return retval;
}
  1. 調用原有 inet_getname 函數,獲取 ip,兼容原有內核邏輯
  2. 判斷 sk_user_data 不為空,並且結構體 op-kind op-length 與 ipv4 toa 的相等,獲取 ip port ,並填充 sin

小結

實現原理還真簡單,只不過有兩個隱患。

  1. 如果 option 以後擴充其它內容,長度不夠咋辦?資源本身就不多
  2. op-kind 254 現在不被 tcp/ip 官方認可,以後會不會被占用?
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容