From `sk_buff` to conntrack and NAT

On 2018/10/02 at 23:30

The entry point for assigning skb->_nfct is nf_conntrack_in, which runs at NF_INET_PRE_ROUTING through NF_HOOK. After some checks against the layer 3 and layer 4 protocols (such as tcp_error), resolve_normal_ct looks up the associated conntrack, or builds a new one, and assigns it to skb->_nfct.

resolve_normal_ct builds the nf_conntrack_tuple from the skb, using the skb's l3proto and l4proto handlers for the layer 3 and layer 4 parts respectively.

/* Start from an all-zero tuple so any field not set below stays 0. */
memset(tuple, 0, sizeof(*tuple));

tuple->src.l3num = l3num;
/* Layer 3 handler fills in its part from the skb; 0 means it failed. */
if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
    return false;

tuple->dst.protonum = protonum;
/* A tuple built from a packet is tagged as the original direction. */
tuple->dst.dir = IP_CT_DIR_ORIGINAL;

/* Layer 4 handler finishes the tuple; its result is returned unchanged. */
return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);

Next, netfilter searches the tuple hash for an existing conntrack. If none is found, netfilter builds a new one in init_conntrack.

In init_conntrack, netfilter initializes the reply tuple by inverting the original tuple with nf_ct_invert_tuple.

// inverse is the reply tuple and orig is the original tuple

/* Start from an all-zero tuple so any field not set below stays 0. */
memset(inverse, 0, sizeof(*inverse));

/* The layer 3 protocol number is the same in both directions. */
inverse->src.l3num = orig->src.l3num;
/* Layer 3 handler derives its part of inverse from orig; 0 means it failed. */
if (l3proto->invert_tuple(inverse, orig) == 0)
    return false;

/* The inverted tuple belongs to the opposite direction. */
inverse->dst.dir = !orig->dst.dir;

/* The layer 4 protocol number is copied as-is. */
inverse->dst.protonum = orig->dst.protonum;
/* Layer 4 handler finishes the inversion; its result is returned unchanged. */
return l4proto->invert_tuple(inverse, orig);

Finally, the newly initialized conntrack is assigned the original and reply tuples. Each subsequent skb can then build a tuple from its own 5-tuple and use it to find the conntrack in the tuple hash table.

When NAT comes in

When doing NAT, the reply tuple will be modified based on the altered IP/port.

Before NAT

orig tuple: src=192.168.1.14 dst=13.94.40.40 sport=37611 dport=443
repl tuple: src=13.94.40.40 dst=192.168.1.14 sport=443 dport=37611

After NAT

orig tuple: src=192.168.1.14 dst=13.94.40.40 sport=37611 dport=443
repl tuple: src=13.94.40.40 dst=192.168.1.1 sport=443 dport=37611

The reply tuple's destination will be changed to the router's ip address during nf_nat_setup_info.

/*
 * Invert the conntrack's current reply tuple into curr_tuple, i.e. the
 * tuple as seen from the original direction.
 */
nf_ct_invert_tuplepr(&curr_tuple,
             &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

/* Fill new_tuple from curr_tuple, range and maniptype (presumably the
 * translated tuple to use; helper body not shown here). */
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

/* Only touch the conntrack when the new tuple differs from the current one. */
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
    struct nf_conntrack_tuple reply;

    /* Alter conntrack table so will recognize replies. */
    nf_ct_invert_tuplepr(&reply, &new_tuple);
    nf_conntrack_alter_reply(ct, &reply);

    /* Non-atomic: we own this at the moment. */
    if (maniptype == NF_NAT_MANIP_SRC)
        ct->status |= IPS_SRC_NAT;
    else
        ct->status |= IPS_DST_NAT;

    /* If nfct_help(ct) is set and nfct_seqadj(ct) is not, try to add the
     * seqadj extension; drop the packet if that fails. */
    if (nfct_help(ct) && !nfct_seqadj(ct))
        if (!nfct_seqadj_ext_add(ct))
            return NF_DROP;
}

Finally, the skb itself is NATed in nf_nat_manip_pkt, using the conntrack tuples directly.

static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
                     enum nf_nat_manip_type mtype,
                     enum ip_conntrack_dir dir)
{
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct nf_conntrack_tuple target;

    /* We are aiming to look like inverse of other direction. */
    nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

    l3proto = __nf_nat_l3proto_find(target.src.l3num);
    l4proto = __nf_nat_l4proto_find(target.src.l3num,
                    target.dst.protonum);
    if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
        return NF_DROP;

    return NF_ACCEPT;
}

In nf_nat_ipv4_manip_pkt and tcp_manip_pkt, the skb's IP address and port are rewritten from the target tuple.

/*
 * nf_nat_ipv4_manip_pkt: rewrite the IPv4 source or destination address.
 *
 * csum_replace4() is given the old and the new address, so it has to run
 * while the header still holds the old one. Assigning the address first
 * would pass the same value twice and leave the header checksum stale.
 */
if (maniptype == NF_NAT_MANIP_SRC) {
    csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
    iph->saddr = target->src.u3.ip;
} else {
    csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
    iph->daddr = target->dst.u3.ip;
}
/* tcp_manip_pkt: rewrite the TCP source or destination port. */
/* The TCP header is located hdroff bytes into the skb data. */
hdr = (struct tcphdr *)(skb->data + hdroff);

/* Select the replacement port and the header field it goes into. */
if (maniptype == NF_NAT_MANIP_SRC) {
    /* Get rid of src port */
    newport = tuple->src.u.tcp.port;
    portptr = &hdr->source;
} else {
    /* Get rid of dst port */
    newport = tuple->dst.u.tcp.port;
    portptr = &hdr->dest;
}

/* Save the previous port in oldport before overwriting the field. */
oldport = *portptr;
*portptr = newport;

Comments