Released in 2011.
Released in 2014/2015.
; Calls the function whose address is in r0 on a different stack, then
; switches back and returns to the caller. The new stack pointer is read
; from a slot whose address is derived from the current sp.
bic r1, sp, #0xff       ; r1 = sp with the low 8 bits cleared
orr r1, r1, #0xf00      ; ... and bits 8-11 set
add r1, r1, #0x28       ; r1 = address of the slot holding the new stack pointer
ldr r2, [r1]            ; r2 = new stack pointer
stmdb r2!, {sp, lr}     ; push the caller's sp and lr onto the new stack
mov sp, r2              ; switch to the new stack
blx r0                  ; call the function passed in r0
pop {r0, r1}            ; r0 = saved sp, r1 = saved lr
mov sp, r0              ; restore the caller's stack
bx r1                   ; return to the caller
|
|
|---|
; ROP chain: each gadget pops its argument register(s) and the next pc from
; the stack, ending in a jump to MEMCPY with r0 = dst, r1 = src, r2 = size.
.word ROP_POP_R0PC ; pop {r0, pc}
.word dst ; r0
.word ROP_POP_R1PC ; pop {r1, pc}
.word src ; r1
.word ROP_POP_R2R3R4R5R6PC ; pop {r2, r3, r4, r5, r6, pc}
.word size ; r2
.word 0xDEADBABE ; r3 (garbage)
.word 0xDEADBABE ; r4 (garbage)
.word 0xDEADBABE ; r5 (garbage)
.word 0xDEADBABE ; r6 (garbage)
.word MEMCPY ; pc
; Fake object followed by its manufactured vtable. The vtable doubles as the
; first ROP words, which load r4 and then run the stack pivot gadget.
object:
    .word OBJECT_LOC + vtable - object ; pointer to manufactured vtable, and new sp
    .word ROP_POP_PC ; pc (pop {pc} to jump to ROP)
vtable: ; also initial ROP
    .word ROP_POP_R4R5PC ; pop {r4, r5, pc} : skip pivot
    ; +0x1c: the pivot's ldmda reads 8 words ending at r4, i.e. starting at r4 - 0x1c
    .word OBJECT_LOC + ropload_stackpivot - object + 0x1c ; r4
    .word ROP_STACK_PIVOT ; stack pivot ; also r5 (garbage)
rop:
    .word ROP_STACK_PIVOT ; ldmdavc r4, {r4, r5, r8, sl, fp, ip, sp, pc}
; The 8 words loaded by the pivot, in register order r4, r5, r8, sl, fp, ip, sp, pc.
ropload_stackpivot:
    .word 0, 0, 0, 0, 0, 0
    .word LOADEDROP_BUFADR ; sp
    .word ROP_POP_PC ; pc


SVC 0x15: Result CreateSemaphore(Handle* semaphore, s32 initialCount, s32 maxCount)
SVC 0x21: Result CreateAddressArbiter(Handle* arbiter)
SVC 0x22: Result ArbitrateAddress(Handle arbiter, ...)
VA d8000000..d8600000 -> PA 18000000..18600000 [ XN ] [ Priv: RW, User: -- ]
VA dff00000..e0000000 -> PA 1ff00000..20000000 [ XN ] [ Priv: RW, User: -- ]
VA e0000000..e8000000 -> PA 20000000..28000000 [ XN ] [ Priv: RW, User: -- ]
...
VA fff00000..fff20000 -> PA 1ff80000..1ffa0000 [ X ] [ Priv: R-, User: -- ]
VA fff20000..fff2c000 -> PA 1ffde000..1ffea000 [ X ] [ Priv: R-, User: -- ]
...
VA ffff0000..ffff1000 -> PA 1fff4000..1fff5000 [ X ] [ Priv: R-, User: -- ]
VA d8000000..d8600000 -> PA 18000000..18600000 [ XN ] [ Priv: RW, User: -- ]
VA dff00000..e0000000 -> PA 1ff00000..20000000 [ XN ] [ Priv: RW, User: -- ]
VA e0000000..e8000000 -> PA 20000000..28000000 [ XN ] [ Priv: RW, User: -- ]
...
VA fff00000..fff20000 -> PA 1ff80000..1ffa0000 [ X ] [ Priv: R-, User: -- ]
VA fff20000..fff2c000 -> PA 1ffde000..1ffea000 [ X ] [ Priv: R-, User: -- ]
...
VA ffff0000..ffff1000 -> PA 1fff4000..1fff5000 [ X ] [ Priv: R-, User: -- ]
Much to mess up.
We have unchecked DMA access!
/*
 * Header of one heap memory chunk. Chunks are linked to their neighbours
 * through next/prev. The mapping loop that consumes this header uses the
 * header's own address as the start of the chunk's memory and walks `next`
 * until it is NULL.
 */
struct MemchunkHdr
{
    u32 size;          // in pages
    MemchunkHdr *next; // following chunk in the list (NULL ends the walk)
    MemchunkHdr *prev; // preceding chunk in the list
};
In theory, everything has been fixed: invalid pointers ➜ kernel panic.
In theory, everything has been fixed: invalid pointers ➜ kernel panic.
1. Calls the memory allocator function ➜ MemchunkHdr ptr.
2. Goes through the allocated memory chunks and maps them to userspace, without any check.
3. Adds Memory Block Information to KProcess.
// ...
// Excerpt: obtain a list of memory chunks from the allocator, then map each
// chunk into the process's address space and zero it.
MemchunkHdr *memchunk;
// Allocate memory
memchunk = heap_alloc_regular(region_descriptor, usr_size);
if(!memchunk) {
// out of memory ...
}
/* Map and clear memory */
// Walks the chunk list through the headers' `next` links until NULL.
do {
u32 pages = memchunk->size; // chunk length in pages (converted to bytes with << 12 below)
// The 32-bit add wraps: e.g. VA 0xE0000000 + 0x40000000 -> PA 0x20000000.
// NOTE(review): slide pseudo-code; in real C, pointer + integer scales by
// sizeof(MemchunkHdr), so this is meant as a plain byte-address addition.
u32 paddr = memchunk + 0x40000000; // convert vaddr -> paddr
// map memchunk into userland
// Bit 31 of the return value is tested; presumably it marks a failure Result (confirm).
if(mem_map(process, usr_vaddr, pages, paddr, access_rights)>>31) {
// error ...
}
MemchunkHdr *current = memchunk;
// The header's `next` field is read here, i.e. after the mem_map call above.
memchunk = memchunk->next;
usr_vaddr += pages << 12; // advance the user address by the chunk's size in bytes
memclear(current, pages << 12); // clear mapped pages
}while(memchunk);
// ...
What's wrong?
/* Map and clear memory */
// Walks the chunk list through the headers' `next` links until NULL.
do {
u32 pages = memchunk->size; // chunk length in pages (converted to bytes with << 12 below)
// NOTE(review): slide pseudo-code; in real C, pointer + integer scales by
// sizeof(MemchunkHdr), so this is meant as a plain byte-address addition.
u32 paddr = memchunk + 0x40000000; // convert vaddr -> paddr
// map memchunk into userland
// Bit 31 of the return value is tested; presumably it marks a failure Result (confirm).
if(mem_map(process, usr_vaddr, pages, paddr, access_rights)>>31) {
// error ...
}
MemchunkHdr *current = memchunk;
// The header's `next` field is read here, i.e. after the mem_map call above.
memchunk = memchunk->next;
usr_vaddr += pages << 12; // advance the user address by the chunk's size in bytes
memclear(current, pages << 12); // clear mapped pages
} while(memchunk);
// ...
What's wrong?
They're reading from the memchunk header (its `next` pointer) after the chunk has already been mapped into userspace!
if(mem_map(process, usr_vaddr, pages, paddr, access_rights)>>31) {
// error ...
}
MemchunkHdr *current = memchunk;
memchunk = memchunk->next;
SVC 0x22: Result ArbitrateAddress(Handle arbiter, u32 addr, ...)
Let's use the SlabHeap. Because...
...we can overwrite vtable pointers.
LDR R0, =0x10164000 ; R0 = base address of the register block accessed below
...
LDR R2, =0x883F1FFF ; *NTRCARD_ROMCTRL = \
LSLS R1, R6, #0x1D ; 0x883F1FFF | ((R6&7)<<24);
LSRS R1, R1, #5 ; shift up by 29 then down by 5: R6's low 3 bits land in bits 24-26
ADDS R1, R1, R2 ; the add acts as an OR: bits 24-26 of 0x883F1FFF are zero
STR R1, [R0,#4] ; This initiates a 0x200-byte transfer
... and then a read loop:
...
loop: LDR R3, [R0,#4] ; do {
LSLS R3, R3, #8 ; move bit 23 (READY) into the sign bit
BPL loop ; while(!(*NTRCARD_CTRL & READY));
LDR R0, [R0,#0x1C] ; R0 = next data word (base address is reloaded below)
LSLS R3, R1, #2 ; byte offset = i * 4
ADDS R1, R1, #1
STR R0, [R2,R3] ; buf[i++] = *NTRCARD_IN;
LDR R0, =0x10164000 ; ^ No range check
LDR R3, [R0,#4]
CMP R3, #0 ; negative as signed <=> bit 31 (BUSY) is set
BLT loop ; } while(*NTRCARD_CTRL & BUSY);
...
So to summarize:
We have a nice buffer overrun.
Can we control the data?
The data comes from the DS cartridge slot.
We need to make our own DS cartridge.
This gives us ARM9 code execution, but...
We want something better.
Once we have ARM9 code-execution, we can just regenerate all keys by using keyslot 0x11.
EPIC FAIL
We can change the key #2 in NAND.
arm9loader will decrypt the ARM9 binary to garbage...
... and jump to it.
If we try a lot of key #2's...
... eventually we'll find some garbage that decodes to a branch instruction.
We install our key #2 onto the NAND key sector.
We install the largest firm binary we have on firm0.
We put our payload on top of firm0.
ARM9 bootrom is executed.
Bootrom loads firm0 into arm9mem.
Bootrom decrypts it.
Bootrom hash-check fails.
Bootrom loads (smaller) firm1 on top.
Bootrom decrypts it.
Since firm1 is valid, bootrom will jump to it.
arm9loader will decrypt using our supplied key.
arm9loader jumps to garbage.
garbage jumps to our code.
ARM9 code-execution.. ✓
.. from cold boot ✓
.. early. ✓
Gives us the 6.x save key and the 7.x NCCH key. ✓
Write-only keys: A key written to a keyslot cannot be read back
Keyscrambler: The actual key used is calculated in hardware and never exposed to the CPU
normal_key = F(keyX, keyY)
F is an unknown function implemented in hardware.
encrypt(zeroes, keyX=1 << n, keyY=0)
==
encrypt(zeroes, keyX=0, keyY=1 << (n+2))
F(x, y) = G( (x <<< 2) ^ y )
where G(t) is an unknown function implemented in hardware.
| Name | Keyslot | KeyX set by | KeyY set by |
| --- | --- | --- | --- |
| UDS | 0x2D | Bootrom | Bootrom |
| Mii QR-codes | 0x31 | Bootrom | Firmware |
| DLP | 0x39 | Bootrom | Firmware |
G(t) = (t + C) <<< 87, then:
F(x, y) = (((x <<< 2) ^ y) + C) <<< 87
C because it's set in silicon.
normal_key = (((x <<< 2) ^ y) + C) <<< 87
(normal_key0 >>> 87) < (normal_key1 >>> 87)
((x <<< 2) ^ y0) + C < ((x <<< 2) ^ y1) + C
(x <<< 2) ^ y0 < (x <<< 2) ^ y1
C.