强网杯2024 ez_vm 手撕VM + DFA Attack Whitebox AES
2024-12-27 09:59:0 Author: mp.weixin.qq.com(查看原文) 阅读量:0 收藏


题目思路

首先这道题是一个栈虚拟机+JIT,在栈虚拟机中完成白盒AES加密过程。所以做出这道题需要:

  1. 逆向虚拟机的handler

    1. 写出对应的parser,将handler转换成等价的x64汇编指令

    2. 将x64汇编指令汇编交给ida分析(二次逆向)

    3. 为了更好分析,为第2步得到的分析文件添加VM的结构体信息

    4. 其中有一个极为特殊的handler,为JIT的实现。

  2. 对白盒AES进行DFA攻击

    1. 确定最后一次列混淆的时机,多patch几次,恢复last round key

    2. 根据last round key恢复初始密钥

  3. aes解密

总之,要工程量有工程量,要难度有工程量。


题目背景

chal.exe 3766323862633565396633663134393532356365646630626636363036636630
:)

一个exe程序 接收一个参数(flag)

flag正确输出:)

flag错误输出:(


剖析bytecode格式

首先在sub_14009A760中,发现了一个巨大的switch case,在调用sub_14009A760之前有寄存器保存,之后有寄存器恢复,显然sub_14009A760就是vm_dispatcher,switch中的各个case就是不同的handler了。

bytecode就相当于VM中的二进制代码一样,而handler就是bytecode的解释器,可以理解为vcpu,

下面拿case 0来剖析bytecode的格式。

case 0 - push imm8/16/32/64

switch ( (char)_RAX )
{
case 0:
__asm { tzcnt eax, ebx; jumptable 000000014009A7F8 case 0 }
v86 = _RAX;
switch ( v86 )
{
case 0LL:
v87 = *(unsigned __int8 *)v10;
v88 = a1;
goto LABEL_260;
case 1LL:
v87 = *(_WORD *)v10;
v88 = a1;
LABEL_260:
push_16(v88, v87);
break;
case 2LL:
push_32(a1, *v10);
break;
case 3LL:
push_64(a1, *(_QWORD *)v10);
break;
case 4LL:
JUMPOUT(0x14009A190LL);
}

可以看到大switch中还有个小switch,细看小switch中每个case的逻辑都相同只不过操作数的大小不同,其中push_16的代码为:

// push_16: pushes the 16-bit value a2 onto the VM stack described by the
// context at a1, and returns the stack pointer value from before the push.
__int64 __fastcall sub_14009A6B0(__int64 a1, __int16 a2)
{
__int64 result; // rax

result = *(_QWORD *)(a1 + 8); // current virtual stack pointer (vrsp), kept at context offset 8
// Context offset 536 holds a bound that vrsp must not equal before a push
// (presumably the stack limit - TODO confirm); hitting it is treated as fatal.
if ( result == *(_QWORD *)(a1 + 536) )
BUG();
*(_QWORD *)(a1 + 8) = result - 2; // the stack grows downward by 2 bytes
*(_WORD *)(result - 2) = a2; // store the value in the newly reserved slot
return result;
}

可以看出来是内存模拟push的操作,最后写入内存的是2字节的数据,所以是push_16其他以此类推。

你可能会疑问,为什么没有push8,这里我的理解为:本来x86_64架构中,push只有16位/32位/64位,即使在16位下,push 1,实际上也是压入2个字节,即压栈时会根据CPU模式扩展到对应位数

调试查看case 0 + case 3 bytecode的内容为:

00 08 20 00 00 00 00 00 00 00

op opsize [operand]

| 寄存器指向 | bytecode格式 | 内存大小 | 注释 |
| --- | --- | --- | --- |
| rdi→ | op | 1 byte | 大switch |
| | opsize | 1 byte | 小switch |
| [rsi]→ | operand | 取决于op和opsize | if exist |
| | operand2 | operand决定 | if exist |

在这个例子中 就存在operand 为 20 00 00 00 00 00 00 00,长度为8 byte,与opsize=08对应。

存在operand2的例子会在后面讲到,这里先埋个坑。


handler逆向

case 0

根据前面的分析。总之,这就是一个push imm的指令,注意这是在栈虚拟机的实现,在虚拟机中有大量的push,pop,这些数push进去后面又pop出来拿来用,翻译x64等效指令就是:

mov reg, imm

为什么等效?mov reg, imm 其实就是占用一个reg存放imm(reg_index+1),对应于栈虚拟机的栈空间。相应的pop时候,释放栈空间,翻译时reg_index需要减1。(reg_index的动态加减其实就像模拟了rsp的移动)

case 1 : load

case 1:
v89 = (__int16 *)pop_64(a1);
__asm { tzcnt ecx, ebx }
v91 = _RCX;
switch ( v91 )
{
case 0LL:
v13 = *(unsigned __int8 *)v89;
goto LABEL_5;
case 1LL:
v13 = *v89;
v8 = a1;
goto LABEL_6;
case 2LL:
v58 = *(_DWORD *)v89;
goto LABEL_289;
case 3LL:
v138 = *(_QWORD *)v89;
goto LABEL_317;
}

可以看出是load8/16/32/64的操作,对应的操作可以简化为push(*pop()),所以对应的x64指令为:

mov reg, [reg]

case 3,5,6 :store

case 5:
_RAX = (_QWORD *)pop_64(a1);
v75 = _RAX;
if ( (_DWORD)_RBX == 4 )
{
*_RAX = (unsigned int)pop_32(a1);
}
else
{
__asm { tzcnt eax, ebx }
v130 = _RAX;
switch ( (unsigned __int64)v130 )
{
case 0uLL:
LABEL_155:
*v75 = pop_16(a1);
break;
case 1uLL:
LABEL_158:
*(_WORD *)v75 = pop_16(a1);
break;
case 2uLL:
LABEL_156:
*(_DWORD *)v75 = pop_32(a1);
break;
case 3uLL:
LABEL_157:
*(_QWORD *)v75 = pop_64(a1);
break;
}
}

case 3,5,6都为store操作,只有细微的差别,这里拿case 5来分析。

操作可以简化为*pop() = pop()(先pop出目标地址,再pop出要写入的值),翻译为x64指令:

mov [reg],reg

case 7,8,0xb,0xd,0xe,0xf

case 7:
__asm { tzcnt eax, ebx; jumptable 000000014009A7F8 case 7 }
v77 = _RAX;
switch ( v77 )
{
case 0LL:
v78 = pop_16(a1);
v79 = pop_16(a1);
v80 = v79 + v78;
v81 = *(_QWORD *)(a1 + 0x210) & 0x3F77D7LL;
if ( v79 < 0 )
{
if ( v80 < 0 || v78 >= 0 )
goto LABEL_103;
}
else if ( v80 >= 0 || v78 < 0 )
{
LABEL_103:
v82 = v81 & 0x3F7F17;
v83 = v81 | 0x80; //SF
if ( v80 >= 0 )
v83 = v82;
v84 = v83 & 0x3F7F92;
v85 = v83 | 0x40;// ZF
if ( v80 )
v85 = v84;
*(_QWORD *)(a1 + 0x210) = v85 & 0x3F7FD2 | (unsigned __int64)(unsigned __int8)((4 * !__SETP__(v80, 0)) | ((unsigned __int8)v80 < (unsigned __int8)v79));//PF
v13 = (unsigned __int8)v80;
goto LABEL_5;
}
LODWORD(v81) = v81 | 0x800; //OF
goto LABEL_103;
case 1LL:
//.....

case 7,8,0xb,0xd,0xe,0xf 分别对应add,sub,div,mul,and,or,逻辑类似

这里拿case 7分析,可以看到除了push(pop() + pop())的操作外,还有一系列其他的操作,其实这里是在设置eflags, *(_QWORD *)(a1 + 0x210)存储了虚拟机的eflags。翻译为:

add reg, reg

sub,div,mul,and,or 以此类推。

case 0x12: cmp

case 18:
__asm { tzcnt eax, ebx; jumptable 000000014009A7F8 case 18 }
v102 = _RAX;
switch ( v102 )
{
case 0LL:
v103 = pop_16(a1);
v104 = pop_16(a1);
v105 = v104 - v103;
v106 = *(_QWORD *)(a1 + 528) & 0x3F77D7LL;
if ( v104 >= 0 )
{
if ( v105 >= 0 || v103 < 0 )
goto LABEL_134;
LABEL_133:
LODWORD(v106) = v106 | 0x800;
goto LABEL_134;
}
if ( v105 >= 0 && v103 < 0 )
goto LABEL_133;
LABEL_134:
v107 = v106 & 0x3F7F17 | v105 & 0x80;
v108 = v107 + 64;
if ( v104 != v103 )
v108 = v107;
*(_QWORD *)(a1 + 528) = v108 & 0x3F7FD2 | (unsigned __int64)(unsigned __int8)(((unsigned __int8)v104 < (unsigned __int8)v103) | (4 * !__SETP__(v104, v103)));
goto LABEL_7;
case 1LL:
//.....

乍一看和sub没什么区别,其实不然:在case 0x12中,v105 = v104 - v103之后并没有push(v105),但是保存了改变的标志位,符合cmp的特征,翻译为:

cmp reg,reg

case 0x15: jmp

case 21:
v224 = *(_QWORD *)(a1 + 0x210);
switch ( *(_BYTE *)v11 )
{
case 0:
goto ture_jmp; // jmp
case 1:
goto LABEL_370;
case 2:
goto LABEL_363;
case 3:
if ( (v224 & 0x40) == 0 && (v224 & 1) == 0 )// ja
goto false_jmp; // jbe
goto ture_jmp;
case 4:
if ( (v224 & 0x40) != 0 || (((unsigned __int8)v224 ^ 1) & 1) == 0 )// jbe
goto false_jmp; // ja
goto ture_jmp;
case 5:
if ( (((unsigned __int8)v224 ^ 1) & 1) == 0 )// jb
goto false_jmp; // jae
goto ture_jmp;
case 6:
if ( ((v224 & 0x80u) != 0LL) != ((v224 & 0x800) != 0) )// jl
goto ture_jmp; // jge
goto LABEL_370;
case 7:
if ( ((v224 & 0x80u) != 0LL) != ((v224 & 0x800) == 0) )// jg
goto LABEL_363; // jle
goto false_jmp;
case 8:
if ( ((v224 & 0x80u) != 0LL) != ((v224 & 0x800) == 0) )// jg
goto ture_jmp;
LABEL_370:
if ( (v224 & 0x40) != 0 ) // jz
goto ture_jmp; // jnz
goto false_jmp;
case 9:
if ( ((v224 & 0x80u) != 0LL) == ((v224 & 0x800) != 0) )
goto false_jmp;
LABEL_363:
if ( (v224 & 0x40) != 0 ) // jz
false_jmp:
*(_QWORD *)a1 = v4 + 11;
else
ture_jmp:
*(_QWORD *)a1 = &v4[-*(_QWORD *)(v4 + 3)];
break;
default:
goto LABEL_418;
}

可以看到这里会判断不同标志位,进行不同的跳转,这里分析下jmp 的bytecode格式:

| bytecode format | value | length |
| --- | --- | --- |
| op | 0x15 | 1 byte |
| opsize | 8 | 1 byte |
| jmp_condition (operand) | 0-9 | 1 byte |
| offset (operand2) | unknown | 8 byte |

所以在ture_jmp:中,就是在计算当前的bytecode 的pc与offset的运算,即newpc = pc-offset

在false_jmp:中,直接就是更新pc到下一个bytecode的位置,相当于fall through。

ture_jmp:
*(_QWORD *)a1 = &v4[-*(_QWORD *)(v4 + 3)];
false_jmp:
*(_QWORD *)a1 = v4 + 11;(next bytecode)

case 0x1b: JIT

case 27:
v15 = (unsigned __int8)v4[2];
v16 = v4 + 3;
*(_QWORD *)a1 = v16;
v2 = 0LL;
memset(v443, 0, sizeof(v443));
v444 = 0LL;
v17 = 0LL;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0x9000000101uLL, 0);
LOBYTE(v18) = 1;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0xA000000101uLL, v18);
LOBYTE(v19) = 2;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0xB000000101uLL, v19);
LOBYTE(v20) = 3;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0xC000000101uLL, v20);
LOBYTE(v21) = 4;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0xD000000101uLL, v21);
LOBYTE(v22) = 5;
sub_7FF68C2DCEF0(v7, byte_7FF68C2DA000, 0xE000000101uLL, v22);
LOBYTE(v23) = 6;
// ....

到这里会发现代码特别长,有很多陌生函数call,点进去还有一堆call,人工分析难度太大了。这里我是猜出来的,首先看下case 0x1b的bytecode:

1B 08 03 C1 E1 02

然后观察前两行:

case 27:
v15 = (unsigned __int8)v4[2];// v15 = 3
v16 = v4 + 3; // v16 -〉c1 e1 02

结合bytecode的内容,v15 = 3,v16指向了C1所在的地址,然后往后看这两个变量的引用:

memcpy_1(v87 + *(_QWORD *)(a1 + 0x230), v16, v15);

它们在memcpy中被使用,所以可以断定v15指定了v16所指数据的长度,这段数据随后被复制到了内存的某个位置。这里我尝试使用capstone反汇编了一下,发现可以反汇编出来,所以就断定是JIT了。

case 0x1c: ret

case 28:
return a1;

ret一目了然。


虚拟机parser编写

到这里所有handler都知道如何翻译成x64汇编指令,所以可以着手编写parser了,下面是我的parser实现:

from capstone import *

# Capstone handle in x86-64 mode.
md = Cs(CS_ARCH_X86, CS_MODE_64)

# Pull the VM bytecode out of the executable: 0x15b8c bytes starting at
# file offset 0x97200.
with open('chal.exe','rb') as exe:
    exe.seek(0x97200)
    vm_opcode = exe.read(0x15b8c)

print(vm_opcode[:16])

# Translation cursor and its upper bound.
pc = 0
pc_max = len(vm_opcode)
# Index of the register standing in for the top VM stack slot; -1 = empty stack.
reg_index = -1

def get_reg():
    """Return the 64-bit register that stands for the current top-of-stack slot.

    The slot is selected by the module-level ``reg_index``. rax, rbx, rcx,
    rdx and rsp are deliberately left out of the pool.
    """
    pool = (
        'rdi', 'rsi', 'rbp',
        'r8', 'r9', 'r10', 'r11',
        'r12', 'r13', 'r14', 'r15',
    )
    assert reg_index >= 0 , "reg_index_error"
    assert reg_index < len(pool) , "reg_index_error"
    return pool[reg_index]

def get_reg_size():
    """Return the top-of-stack register narrowed to the current operand size.

    Uses the module-level ``reg_index`` to pick the register family and the
    module-level ``opsize`` (1, 2, 4 or 8 bytes) to pick the 8/16/32/64-bit
    alias. rax, rbx, rcx, rdx and rsp are deliberately left out of the pool.
    """
    families = (
        ('dil', 'di', 'edi', 'rdi'),
        ('sil', 'si', 'esi', 'rsi'),
        ('bpl', 'bp', 'ebp', 'rbp'),
        ('r8b', 'r8w', 'r8d', 'r8'),
        ('r9b', 'r9w', 'r9d', 'r9'),
        ('r10b', 'r10w', 'r10d', 'r10'),
        ('r11b', 'r11w', 'r11d', 'r11'),
        ('r12b', 'r12w', 'r12d', 'r12'),
        ('r13b', 'r13w', 'r13d', 'r13'),
        ('r14b', 'r14w', 'r14d', 'r14'),
        ('r15b', 'r15w', 'r15d', 'r15'),
    )
    assert reg_index >= 0 , "reg_index_error"
    assert reg_index < len(families) , "reg_index_error"
    # opsize 1/2/4/8 -> alias column 0/1/2/3
    column = opsize.bit_length()-1
    return families[reg_index][column]

# ---------------------------------------------------------------------------
# Bytecode -> x64 translation.
#
# Walks vm_opcode from `pc` and appends one entry per VM instruction to
# x64_asm (Intel syntax). Every instruction is preceded by a `lable_<pc>:`
# line so that translated jumps can target any bytecode offset. The VM stack
# is modelled by `reg_index`: a push claims the next register, a pop
# releases it. get_reg()/get_reg_size() read the module-level `reg_index`
# and `opsize`, so both must stay module-level names.
#
# Fixes relative to the previous version:
#   * `opcode == 0x13 | opcode == 0x14` never matched (operator precedence),
#     so those opcodes - and any opcode above 0x1c - left `pc` unchanged and
#     the loop never terminated. Every unhandled opcode now stops translation.
#   * jmp_condition 8 is `jge` (the handler jumps when SF == OF or ZF is
#     set), not `jg`.
#   * jmp_condition 9 (`jl`: SF != OF) was missing and re-emitted the
#     previous instruction's text.
# ---------------------------------------------------------------------------
opsize_arr = [1,2,4,8]
x64_asm = []
need_label = set()
pc_infor = []

# Bit-width suffix printed in the debug trace for each operand size (bytes).
# One-byte operands are reported as 16, exactly as the trace always did.
trace_width = {1: 16, 2: 16, 4: 32, 8: 64}

# Trace text for the three store opcodes; opcode 6 only reports sizes 1 and 2.
store_trace = {
    3: {1: 'store16', 2: 'store16', 4: 'store32', 8: 'store64'},
    5: {1: 'store8', 2: 'store16', 4: 'store32u', 8: 'store64'},
    6: {1: 'store8', 2: 'store16'},
}

# Binary operators: consume the two topmost slots, leave the result in the
# lower one (reg_index drops by one).
sized_binops = {0x07: 'add', 0x08: 'sub'}                            # operand-sized register aliases
wide_binops = {0x0d: 'imul', 0x0e: 'and', 0x0f: 'or', 0x10: 'xor'}   # always full 64-bit registers
fixed64_binops = {0x17: 'add', 0x18: 'imul', 0x19: 'sub'}            # 64-bit-only opcodes

# jmp_condition byte -> x64 branch mnemonic, derived from the flag tests in
# the VM's case 0x15 handler.
jcc_mnemonic = {
    0: 'jmp', 1: 'jz', 2: 'jnz',
    3: 'jbe', 4: 'ja', 5: 'jae',
    6: 'jle', 7: 'jg', 8: 'jge', 9: 'jl',
}

while pc < pc_max:
    opcode = vm_opcode[pc]
    opsize = vm_opcode[pc+1]
    pc_infor.append(pc)
    x64_asm.append(f'lable_{hex(pc)}:')
    assert opsize in opsize_arr, "opsize error"
    width = trace_width[opsize]

    if opcode == 0:
        # push imm: claim a new register and load the immediate into it.
        imm = int.from_bytes(vm_opcode[pc+2:pc+2+opsize],'little')
        print(f"push{width} {imm}")
        pc += 2+opsize
        reg_index += 1
        dst_reg = get_reg()
        x64_asm.append(f'mov {dst_reg}, {imm}')

    elif opcode == 1:
        # load: push(*pop()) - the top slot is replaced in place.
        print(f"load{width}")
        pc += 2
        src_reg = get_reg()
        dst_reg = get_reg_size()
        asm = 'mov %s, [%s]' % (dst_reg, src_reg)
        if opsize < 4:
            # 8/16-bit moves leave the upper bits untouched; a 32-bit move
            # already zero-extends, so only the narrow sizes need movzx.
            asm += '\nmovzx %s, %s' % (src_reg, dst_reg)
        x64_asm.append(asm)

    elif opcode in store_trace:
        # store: *pop() = pop() - address on top, value underneath.
        trace = store_trace[opcode].get(opsize)
        if trace is not None:
            print(trace)
        pc += 2
        dst_reg = get_reg()
        reg_index -= 1
        src_reg = get_reg_size()
        reg_index -= 1
        x64_asm.append("mov [%s], %s" % (dst_reg, src_reg))

    elif opcode in sized_binops:
        mnemonic = sized_binops[opcode]
        print(f"{mnemonic}{width}")
        pc += 2
        src_reg = get_reg_size()
        reg_index -= 1
        dst_reg = get_reg_size()
        x64_asm.append('%s %s, %s' % (mnemonic, dst_reg, src_reg))

    elif opcode == 0x0b:
        # div: staged through rax/rcx/rdx, which are kept out of the
        # stack-slot register pool for this purpose.
        print(f"div{width}")
        pc += 2
        src_reg = get_reg()
        reg_index -= 1
        dst_reg = get_reg()
        asm = 'xor rdx, rdx'
        asm += '\nmov rax, %s' % dst_reg
        asm += '\nmov rcx, %s' % src_reg
        asm += '\ndiv rcx'
        asm += '\nmov %s, rax'% dst_reg
        x64_asm.append(asm)

    elif opcode in wide_binops:
        mnemonic = wide_binops[opcode]
        print(f"{mnemonic}{width}")
        pc += 2
        src_reg = get_reg()
        reg_index -= 1
        dst_reg = get_reg()
        x64_asm.append('%s %s, %s' % (mnemonic, dst_reg, src_reg))

    elif opcode == 0x11:
        # bitwise not on the top slot, in place.
        print(f"~{width}")
        pc += 2
        src_reg = get_reg()
        x64_asm.append('not %s' % (src_reg))

    elif opcode == 0x12:
        # cmp pops both operands and pushes nothing: only the flags survive,
        # so reg_index drops by two.
        print(f"CMP{width}")
        pc += 2
        src_reg = get_reg()
        reg_index -= 1
        dst_reg = get_reg()
        reg_index -= 1
        x64_asm.append('cmp %s, %s' % (dst_reg, src_reg))

    elif opcode == 0x15:
        # jmp/jcc layout: op, opsize, 1-byte condition, 8-byte offset.
        # The taken target is pc - offset, wrapped to 64 bits.
        jmp_condition = vm_opcode[pc+2]
        offset = int.from_bytes(vm_opcode[pc+3:pc+3+8],'little')
        jmp_pc = pc - offset & 2**64 - 1
        target = hex(jmp_pc)
        lable = "lable_%s" % target
        mnemonic = jcc_mnemonic.get(jmp_condition)
        if mnemonic is None:
            # The VM handler itself bails out on conditions above 9.
            print(f"jmp_condition {jmp_condition} not impl")
            break
        print(mnemonic)
        need_label.add(jmp_pc)
        pc += 11
        x64_asm.append(f'{mnemonic} {lable}')

    elif opcode == 0x16:
        # Push the VM context pointer, which the translated code keeps in rbx.
        print("pushVM")
        reg_index += 1
        dst_reg = get_reg()
        asm = "mov %s ,rbx" % dst_reg
        pc += 2
        x64_asm.append(asm)

    elif opcode in fixed64_binops:
        mnemonic = fixed64_binops[opcode]
        print(f"{mnemonic}64")
        pc += 2
        src_reg = get_reg()
        reg_index -= 1
        dst_reg = get_reg()
        x64_asm.append('%s %s, %s' % (mnemonic, dst_reg, src_reg))

    elif opcode == 0x1a:
        # rebase: carries an opsize-byte operand; translated as a nop.
        base = hex(int.from_bytes(vm_opcode[pc+2:pc+2+opsize],'little'))
        print(f"rebase {base}")
        pc += 2 + opsize
        x64_asm.append('nop')

    elif opcode == 0x1b:
        # JIT: op, opsize, 1-byte length, then that many raw x86-64 bytes,
        # disassembled and inlined verbatim.
        shellcode_len = vm_opcode[pc+2]
        print("JIT")
        print(f"shellcode {shellcode_len}")
        shellcode_byte = vm_opcode[pc+3:pc+3+shellcode_len]
        asm = f'JIT_{pc}:\n'
        for i in md.disasm(shellcode_byte,0):
            asm += f"{i.mnemonic} {i.op_str}\n"
        print(asm)
        pc += 3 + shellcode_len
        x64_asm.append(asm)

    elif opcode == 0x1c:
        # ret: hand the context pointer back in rax and stop translating.
        print('return')
        asm = 'mov rax, rbx\nret'
        x64_asm.append(asm)
        break

    else:
        # Opcodes 2, 4, 9, 0xa, 0xc, 0x13, 0x14 and anything above 0x1c have
        # no translation. Stop here instead of looping forever on the same pc.
        print(f"{opcode} not impl")
        break

# Write the translated program to parse.s as GNU-as source in Intel syntax.
# No label pass is needed here: x64_asm already carries a `lable_<pc>:`
# line for every translated instruction.
asm_header = '''
.intel_syntax noprefix
.code64
.section .text
.global _start
_start:
'''

with open("parse.s",'w') as out:
    out.writelines([
        asm_header,
        # The translated code keeps the VM context in rbx; it arrives in rcx
        # (presumably as the first argument - TODO confirm against the caller).
        'mov rbx, rcx\n',
        '\n'.join(x64_asm),
    ])

有一些要点需要注意:

1.注意reg_index加减要和handler中的push,pop对应;

2.这算个小trick,每个翻译指令前面都有个代表当前bytecode的pc的lable,这主要是为了jmp/jne等跳转指令服务;

3.在JIT翻译中,直接使用capstone翻译生成X64汇编。


初探WhiteBox 逆向

将产生的parse.s汇编产生的目标文件放入ida中分析:

// Decompilation of the translated VM program. All guest registers live in
// the VmContext pointed to by a1; the function returns a1.
// Frame layout, derived from the prologue below (S = guest rsp on entry):
//   [S+8]  = rcx at entry -> later read as *((_QWORD *)rbp + 22), the state buffer
//   [S+16] = rdx at entry -> later read as *((_QWORD *)rbp + 23), the output buffer
//   rbp    = S - 168
VmContext *__fastcall start(VmContext *a1)
{
// Prologue: spill rdx/rcx above the stack pointer (this looks like Win64
// home space - TODO confirm), push rbp, reserve 192 bytes, set the frame pointer.
*((_QWORD *)a1->rsp + 2) = a1->rdx;
*((_QWORD *)a1->rsp + 1) = a1->rcx;
a1->rsp = (char *)a1->rsp - 8;
*(_QWORD *)a1->rsp = a1->rbp;
a1->rsp = (char *)a1->rsp - 192;
a1->rbp = (char *)a1->rsp + 32;
// Call into the host image at ImageBase + 12048 with rcx = 0x140097017;
// what the callee does is not visible here.
a1->rcx = (void *)0x140097017LL;
((void (*)(void))((char *)NtCurrentPeb()->ImageBaseAddress + 12048))();
// ShiftRows permutation table, 16 bytes at rbp+16 .. rbp+31
*((_BYTE *)a1->rbp + 16) = 0;
*((_BYTE *)a1->rbp + 17) = 5;
*((_BYTE *)a1->rbp + 18) = 10;
*((_BYTE *)a1->rbp + 19) = 15;
*((_BYTE *)a1->rbp + 20) = 4;
*((_BYTE *)a1->rbp + 21) = 9;
*((_BYTE *)a1->rbp + 22) = 14;
*((_BYTE *)a1->rbp + 23) = 3;
*((_BYTE *)a1->rbp + 24) = 8;
*((_BYTE *)a1->rbp + 25) = 13;
*((_BYTE *)a1->rbp + 26) = 2;
*((_BYTE *)a1->rbp + 27) = 7;
*((_BYTE *)a1->rbp + 28) = 12;
*((_BYTE *)a1->rbp + 29) = 1;
*((_BYTE *)a1->rbp + 30) = 6;
*((_BYTE *)a1->rbp + 31) = 11;
// Round loop: nine iterations, counter is the DWORD at rbp+64.
for ( *((_DWORD *)a1->rbp + 16) = 0; *((unsigned int *)a1->rbp + 16) < 9uLL; *((_DWORD *)a1->rbp + 16) = a1->rax )
{
// ShiftRows, step 1: gather the permuted state into a 16-byte temporary at rbp+0.
for ( *((_DWORD *)a1->rbp + 17) = 0; *((unsigned int *)a1->rbp + 17) < 0x10uLL; *((_DWORD *)a1->rbp + 17) = a1->rax )
{
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 17);
LOBYTE(a1->rax) = *((_BYTE *)a1->rax + (unsigned __int64)a1->rbp + 16);
LODWORD(a1->rcx) = *((_DWORD *)a1->rbp + 17);
a1->rdx = (void *)*((_QWORD *)a1->rbp + 22);
LOBYTE(a1->rax) = *((_BYTE *)a1->rax + (unsigned __int64)a1->rdx);
*((_BYTE *)a1->rcx + (unsigned __int64)a1->rbp) = a1->rax;
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 17);
++LODWORD(a1->rax);
// net effect: tmp[i] = state[order[i]]
}
// ShiftRows, step 2: copy the temporary back over the state.
for ( *((_DWORD *)a1->rbp + 18) = 0; *((unsigned int *)a1->rbp + 18) < 0x10uLL; *((_DWORD *)a1->rbp + 18) = a1->rax )
{
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 18);
LODWORD(a1->rcx) = *((_DWORD *)a1->rbp + 18);
a1->rdx = (void *)*((_QWORD *)a1->rbp + 22);
LOBYTE(a1->rax) = *((_BYTE *)a1->rax + (unsigned __int64)a1->rbp);
*((_BYTE *)a1->rcx + (unsigned __int64)a1->rdx) = a1->rax;
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 18);
++LODWORD(a1->rax);
// net effect: state[i] = tmp[i]
}
// Four iterations, counter at rbp+76 (presumably one per state column - TODO confirm).
for ( *((_DWORD *)a1->rbp + 19) = 0; *((unsigned int *)a1->rbp + 19) < 4uLL; *((_DWORD *)a1->rbp + 19) = a1->rax )
{
// body omitted in the write-up (too long to show)
}
}
// Copy the 16-byte state to the output buffer, byte by byte.
for ( *((_DWORD *)a1->rbp + 23) = 0; *((unsigned int *)a1->rbp + 23) < 0x10uLL; *((_DWORD *)a1->rbp + 23) = a1->rax )
{
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 23);
LODWORD(a1->rcx) = *((_DWORD *)a1->rbp + 23);
a1->rdx = (void *)*((_QWORD *)a1->rbp + 23);
a1->r8 = (void *)*((_QWORD *)a1->rbp + 22);
LOBYTE(a1->rax) = *((_BYTE *)a1->rax + (unsigned __int64)a1->r8);
*((_BYTE *)a1->rcx + (unsigned __int64)a1->rdx) = a1->rax;
LODWORD(a1->rax) = *((_DWORD *)a1->rbp + 23);
++LODWORD(a1->rax);
}
// Epilogue: drop the frame and restore the saved rbp.
a1->rsp = (char *)a1->rbp + 160;
a1->rbp = *(void **)a1->rsp;
a1->rsp = (char *)a1->rsp + 8;
return a1;
}

这里可以看到经典的AES shift_rows tables(查表法),并且还有9轮大运算,最后是一个循环将密文拷贝到buf2上。


DFA攻击

该部分需要你会AES白盒攻击的基础知识

where to patch

我们需要找到在VM中state[]所在的内存地址,这里shiftRows我们已经逆出来了,定位哪个是shiftRows其实不难,可以发现这个load操作:

a1->rdx = (void *)*((_QWORD *)a1->rbp + 22); // state[] = a1->rbp + 22

但是光知道state[] = a1->rbp + 22没用,因为我们最终还是要在虚拟机中修改,我们需要知道a1->rbp + 22在虚拟机中的地址。

这里我采取的方法是找到这一行load对应的汇编:

lable_0x7ed:
mov rdi, [rdi]

所以只需要在load处下断点,但是并不是所有load都需要断下,所以在ida中加入条件断点,就在bytecode存放地址的0x7ed处,经过多次调试的经验,发现bytecode的地址是固定,所以索性将condition写死:

rdi == 0x00007FF68C2DE7ed

最后state的地址就是rdi中的值。

patch时机选取

最佳的patch时机应该在第九次列混淆之前,所以我们需要找到这个时机,完成patch。

同样的思路,我们注意到了九次循环,如果我们能断在每个循环上,那么我就可以在循环中寻找patch时机,所以找到这个for循环的汇编的cmp指令,作为每次for循环的检查点:

lable_0x427:
cmp rdi, rsi

写入ida 断点的condition:

rdi == 0x00007FF68C2DE427

这样知道patch地址和patch时机就可以开始patch了。如果patch之后发现密文只改变了1个字节,那就是晚了一轮;如果改变的字节远多于4个(例如16个字节全变),那就是早了一轮。总之在倒数几轮里反复尝试来确定时机。我没记错的话,是在第八次到达断点时patch,此时正好在第九次列混淆之前(密文恰好改变4个字节),patch之后的结果简直完美:4种group各取两条错误密文,就可以恢复出last round key。

212717A58241E17212C9926E0D67F45C
232717A58241E1A312C9956E0DFBF45C // 1 8 11 14
E92717A58241E18C12C9F76E0D69F45C
212717EE82412C7212FA926E9C67F45C // 4 7 10 13
212717898241DC721220926E1D67F45C
212721A582CEE1722FC9926E0D67F422 // 3 6 9 16
212772A5823CE1727FC9926E0D67F484
21BA17A57241E17212C992350D67A05C // 2 5 12 15
21FB17A53A41E17212C992A00D67C65C

DFA attack

接下来,写脚本来进行DFA 攻击:

import phoenixAES

# One ciphertext per line. The first line is the unpatched run; the other
# eight come from patched runs, two for each group of four affected bytes.
traces = """212717A58241E17212C9926E0D67F45C
232717A58241E1A312C9956E0DFBF45C
E92717A58241E18C12C9F76E0D69F45C
212717EE82412C7212FA926E9C67F45C
212717898241DC721220926E1D67F45C
212721A582CEE1722FC9926E0D67F422
212772A5823CE1727FC9926E0D67F484
21BA17A57241E17212C992350D67A05C
21FB17A53A41E17212C992A00D67C65C
"""

# phoenixAES reads its traces from a file, so write them out first.
with open('crackfile','wb') as trace_file:
    trace_file.write(bytes(traces, 'utf-8'))

phoenixAES.crack_file('crackfile',[],True,False,verbose=3)
# Output:
#   Last round key #N found:
#   BF2256727EF09577C7F720C7D84D697A

recover key

根据上一步恢复出来的last round key,恢复初始密钥:

from aeskeyschedule import reverse_key_schedule

# Round key 10, as recovered by the DFA step.
last_round_key = bytes.fromhex('BF2256727EF09577C7F720C7D84D697A')

# Walk the key schedule backwards from round 10 to the original key.
base_key = reverse_key_schedule(last_round_key, 10)
print(base_key)
# b'welcometoqwb2024'


AES 解密

最后提取密文解密:

from Crypto.Cipher import AES

# Ciphertext extracted from the challenge binary (two 16-byte blocks).
enc = bytes.fromhex('C40CC020FC48F6D26CD2FC2B5CA72E6541FE0E64056ED59CCC411D10BEA0F509')

# Key recovered by reversing the key schedule from the last round key.
key = b'welcometoqwb2024'

aes = AES.new(key=key,mode=AES.MODE_ECB)

# Print the plaintext as hex. bytes.hex() keeps every byte, whereas
# hex(int.from_bytes(...))[2:] silently drops leading zero nibbles.
flag = aes.decrypt(enc).hex()

print(flag)
# print(aes.decrypt(enc))
#3766323862633565396633663134393532356365646630626636363036636630

此题得解。

写在后面

用了整整一个星期,终于把这道题硬生生啃下来了,其中遇到了很多的问题,如理解reg_index和stack的关系,实现过程中cmp的reg_index少减了一个1导致汇编出来的放入ida中的逻辑对不上,reg中一开始也使用rsp加入parse中但是到ida分析中导致一些信息丢失,因为这个问题出现时cmp的错误还没修复,所以我不是很确定是rsp的问题还是我cmp的问题。汇编出来放入ida反编译提示too big function,一度心灰意冷,搜索解决思路还好容易解决,不然卡在最后一步太难受了。还有就是DFA那里,还算顺利,总之就是耐心,细心。

看雪ID:SleepAlone

https://bbs.kanxue.com/user-home-950548.htm

*本文为看雪论坛优秀文章,由 SleepAlone 原创,转载请注明来自看雪社区

# 往期推荐

1、Frida 逆向一个 APP

2、强网杯S8 Rust Pwn chat-with-me出题思路分享

3、浅析libc2.38版本及以前tcache安全机制演进过程与绕过手法

4、购物APP设备风控SDK-mtop简单分析

5、PWN入门:偷吃特权-SetUID

球分享

球点赞

球在看

点击阅读原文查看更多


文章来源: https://mp.weixin.qq.com/s?__biz=MjM5NTc2MDYxMw==&mid=2458587796&idx=1&sn=5de696fc1e5824f4f7ab11824bc841b9&chksm=b18c221e86fbab080ca78c02ab1916310c9ec38c8e6955f6eadb9d2d3f83739db17910f49f90&scene=58&subscene=0#rd
如有侵权请联系:admin#unsafe.sh