36.2. C busy loop
The hard part is how to prevent the compiler from optimizing it away: https://stackoverflow.com/questions/7083482/how-to-prevent-gcc-from-optimizing-out-a-busy-wait-loop/58758133#58758133
Disassembly analysis:
./disas --arch aarch64 --userland userland/gcc/busy_loop.out busy_loop
which contains at LKMC eb22fd3b6e7fff7e9ef946a88b208debf5b419d5:
10      ) {
   0x0000000000400700 <+0>:     ff 83 00 d1     sub     sp, sp, #0x20
   0x0000000000400704 <+4>:     e0 07 00 f9     str     x0, [sp, #8]
   0x0000000000400708 <+8>:     e1 03 00 f9     str     x1, [sp]

11          for (unsigned long long i = 0; i < max2; i++) {
   0x000000000040070c <+12>:    ff 0f 00 f9     str     xzr, [sp, #24]
   0x0000000000400710 <+16>:    11 00 00 14     b       0x400754 <busy_loop+84>

12              for (unsigned long long j = 0; j < max; j++) {
   0x0000000000400714 <+20>:    ff 0b 00 f9     str     xzr, [sp, #16]
   0x0000000000400718 <+24>:    08 00 00 14     b       0x400738 <busy_loop+56>

13                  __asm__ __volatile__ ("" : "+g" (i), "+g" (j) : :);
   0x000000000040071c <+28>:    e1 0f 40 f9     ldr     x1, [sp, #24]
   0x0000000000400720 <+32>:    e0 0b 40 f9     ldr     x0, [sp, #16]
   0x0000000000400724 <+36>:    e1 0f 00 f9     str     x1, [sp, #24]
   0x0000000000400728 <+40>:    e0 0b 00 f9     str     x0, [sp, #16]

12              for (unsigned long long j = 0; j < max; j++) {
   0x000000000040072c <+44>:    e0 0b 40 f9     ldr     x0, [sp, #16]
   0x0000000000400730 <+48>:    00 04 00 91     add     x0, x0, #0x1
   0x0000000000400734 <+52>:    e0 0b 00 f9     str     x0, [sp, #16]
   0x0000000000400738 <+56>:    e1 0b 40 f9     ldr     x1, [sp, #16]
   0x000000000040073c <+60>:    e0 07 40 f9     ldr     x0, [sp, #8]
   0x0000000000400740 <+64>:    3f 00 00 eb     cmp     x1, x0
   0x0000000000400744 <+68>:    c3 fe ff 54     b.cc    0x40071c <busy_loop+28>  // b.lo, b.ul, b.last

11          for (unsigned long long i = 0; i < max2; i++) {
   0x0000000000400748 <+72>:    e0 0f 40 f9     ldr     x0, [sp, #24]
   0x000000000040074c <+76>:    00 04 00 91     add     x0, x0, #0x1
   0x0000000000400750 <+80>:    e0 0f 00 f9     str     x0, [sp, #24]
   0x0000000000400754 <+84>:    e1 0f 40 f9     ldr     x1, [sp, #24]
   0x0000000000400758 <+88>:    e0 03 40 f9     ldr     x0, [sp]
   0x000000000040075c <+92>:    3f 00 00 eb     cmp     x1, x0
   0x0000000000400760 <+96>:    a3 fd ff 54     b.cc    0x400714 <busy_loop+20>  // b.lo, b.ul, b.last

14              }
15          }
16      }
   0x0000000000400764 <+100>:   1f 20 03 d5     nop
   0x0000000000400768 <+104>:   ff 83 00 91     add     sp, sp, #0x20
   0x000000000040076c <+108>:   c0 03 5f d6     ret
We look for the internal backwards jumps, and we find two:
0x0000000000400744 <+68>: c3 fe ff 54 b.cc 0x40071c <busy_loop+28> // b.lo, b.ul, b.last 0x0000000000400760 <+96>: a3 fd ff 54 b.cc 0x400714 <busy_loop+20> // b.lo, b.ul, b.last
and so clearly the one at 0x400744 happens first and jumps to a larger address than the other one, so the inner loop must be between 0x40071c and 0x400744 (inclusive), which contains exactly 11 instructions.
Oh my God, unoptimized code is so horrendously inefficient, even I can’t stand all those useless loads and stores to memory variables!!!