36.2. C busy loop

Disassembly analysis:

./disas --arch aarch64 --userland userland/gcc/busy_loop.out busy_loop

which, at LKMC eb22fd3b6e7fff7e9ef946a88b208debf5b419d5, contains:

10      ) {
   0x0000000000400700 <+0>:     ff 83 00 d1     sub     sp, sp, #0x20
   0x0000000000400704 <+4>:     e0 07 00 f9     str     x0, [sp, #8]
   0x0000000000400708 <+8>:     e1 03 00 f9     str     x1, [sp]

11          for (unsigned long long i = 0; i < max2; i++) {
   0x000000000040070c <+12>:    ff 0f 00 f9     str     xzr, [sp, #24]
   0x0000000000400710 <+16>:    11 00 00 14     b       0x400754 <busy_loop+84>

12              for (unsigned long long j = 0; j < max; j++) {
   0x0000000000400714 <+20>:    ff 0b 00 f9     str     xzr, [sp, #16]
   0x0000000000400718 <+24>:    08 00 00 14     b       0x400738 <busy_loop+56>

13                  __asm__ __volatile__ ("" : "+g" (i), "+g" (j) : :);
   0x000000000040071c <+28>:    e1 0f 40 f9     ldr     x1, [sp, #24]
   0x0000000000400720 <+32>:    e0 0b 40 f9     ldr     x0, [sp, #16]
   0x0000000000400724 <+36>:    e1 0f 00 f9     str     x1, [sp, #24]
   0x0000000000400728 <+40>:    e0 0b 00 f9     str     x0, [sp, #16]

12              for (unsigned long long j = 0; j < max; j++) {
   0x000000000040072c <+44>:    e0 0b 40 f9     ldr     x0, [sp, #16]
   0x0000000000400730 <+48>:    00 04 00 91     add     x0, x0, #0x1
   0x0000000000400734 <+52>:    e0 0b 00 f9     str     x0, [sp, #16]
   0x0000000000400738 <+56>:    e1 0b 40 f9     ldr     x1, [sp, #16]
   0x000000000040073c <+60>:    e0 07 40 f9     ldr     x0, [sp, #8]
   0x0000000000400740 <+64>:    3f 00 00 eb     cmp     x1, x0
   0x0000000000400744 <+68>:    c3 fe ff 54     b.cc    0x40071c <busy_loop+28>  // b.lo, b.ul, b.last

11          for (unsigned long long i = 0; i < max2; i++) {
   0x0000000000400748 <+72>:    e0 0f 40 f9     ldr     x0, [sp, #24]
   0x000000000040074c <+76>:    00 04 00 91     add     x0, x0, #0x1
   0x0000000000400750 <+80>:    e0 0f 00 f9     str     x0, [sp, #24]
   0x0000000000400754 <+84>:    e1 0f 40 f9     ldr     x1, [sp, #24]
   0x0000000000400758 <+88>:    e0 03 40 f9     ldr     x0, [sp]
   0x000000000040075c <+92>:    3f 00 00 eb     cmp     x1, x0
   0x0000000000400760 <+96>:    a3 fd ff 54     b.cc    0x400714 <busy_loop+20>  // b.lo, b.ul, b.last

14              }
15          }
16      }
   0x0000000000400764 <+100>:   1f 20 03 d5     nop
   0x0000000000400768 <+104>:   ff 83 00 91     add     sp, sp, #0x20
   0x000000000040076c <+108>:   c0 03 5f d6     ret

We look for the internal backwards jumps, and we find two:

   0x0000000000400744 <+68>:    c3 fe ff 54     b.cc    0x40071c <busy_loop+28>  // b.lo, b.ul, b.last
   0x0000000000400760 <+96>:    a3 fd ff 54     b.cc    0x400714 <busy_loop+20>  // b.lo, b.ul, b.last

and so clearly the one at 0x400744 happens first and jumps to a larger address than the other one, so the internal loop must be between 0x40071c and 0x400744 (inclusive), which contains exactly 11 instructions.

Oh my God, unoptimized code is so horrendously inefficient, even I can’t stand all those useless loads and stores to memory variables!!!