24.22.4.8.7. gem5 event queue DerivO3CPU syscall emulation freestanding example analysis: speculative
Now let’s try to see some Speculative execution in action with userland/arch/aarch64/freestanding/linux/speculative.S.
That program is set up such that the branch is not taken if an extra CLI argument is passed with --cli-args.
We purposefully set things up so that speculation will be running from the icache so we can see what is going on more clearly without ifetch stalls.
Without an extra CLI argument (the branch is taken):
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire timeline tick pc.upc disasm seq_num [.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1] [.ic.............................................................................]-( 80000) ... [................................r...............................................]-( 120000) ... [.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2] [.ic.............................................................................]-( 80000) ... [................................r...............................................]-( 120000) ... [....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3] [....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4] [....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5] [....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6] [....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7] [....................fdn.p.....ic..r.............................................]-( 120000) 0x00400094.0 subs x0, #2 [ 8] [....................fdn.ic........r.............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9] [....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10] [=====================fdn=ic=====================================================]-( 120000) 0x004000a0.0 -----movz x10, #10, #0 [ 11] [=====================fdn=ic=====================================================]-( 120000) 
0x004000a4.0 -----movz x11, #11, #0 [ 12] [=====================fdn=ic=====================================================]-( 120000) 0x004000a8.0 -----movz x12, #12, #0 [ 13] [=====================fdn=ic=====================================================]-( 120000) 0x004000ac.0 -----movz x13, #13, #0 [ 14] [=====================fdn=ic=====================================================]-( 120000) 0x004000b0.0 -----movz x14, #14, #0 [ 15] [=====================fdn=ic=====================================================]-( 120000) 0x004000b4.0 -----movz x15, #15, #0 [ 16] [=====================fdn=pic====================================================]-( 120000) 0x004000b8.0 -----movz x16, #16, #0 [ 17] [=====================fdn=pic====================================================]-( 120000) 0x004000bc.0 -----movz x17, #17, #0 [ 18] [.....................................fdn.ic.r...................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 19] [.....................................fdn.ic.r...................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 20] [.....................................fdn.ic.r...................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 21] [.....................................fdn.ic.r...................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 22] [.....................................fdn.ic.r...................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 23] [.....................................fdn.pic.r..................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 25] [.....................................fdn.pic.r..................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 26] [......................................fdn.ic.r..................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 27] [......................................fdn.ic.r..................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 
[ 28] [......................................fdn.ic.r..................................]-( 120000) 0x004000a8.0 movz x12, #12, #0 [ 29] [......................................fdn.ic.r..................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 30] [......................................fdn.pic.r.................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 31] [......................................fdn.pic.r.................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 32] [......................................fdn.pic.r.................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 33] [......................................fdn.pic.r.................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 34] [.............................................fdn.ic.r...........................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 35] [.............................................fdn.ic.r...........................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 36]
So here we see that the CPU mispredicted! After the B.LT instruction, the CPU continued to run movz x10, assuming that the branch would not be taken.
Then, at time 120000, the LDR data came back, after the wrong prediction had already been fully executed.
The CPU then noticed that it had mispredicted, and so it started again from the correct branch target movz x2, and the instructions that were thrown away are marked as ===== in the timeline.
We can also see some Branch predictor log lines in the O3CPUAll log:
130000: Fetch: system.cpu.fetch: [tid:0] [sn:10] Branch at PC 0x40009c predicted to be not taken 130000: Fetch: system.cpu.fetch: [tid:0] [sn:10] Branch at PC 0x40009c predicted to go to (0x4000a0=>0x4000a4).(0=>1) 131500: Commit: system.cpu.commit: [tid:0] [sn:10] Inserting PC (0x40009c=>0x4000a0).(0=>1) into ROB. 131500: ROB: system.cpu.rob: Adding inst PC (0x40009c=>0x4000a0).(0=>1) to the ROB. 131500: ROB: system.cpu.rob: [tid:0] Now has 10 instructions. 132000: IEW: system.cpu.iew: [tid:0] Issue: Adding PC (0x40009c=>0x4000a0).(0=>1) [sn:10] [tid:0] to IQ. 132000: IQ: system.cpu.iq: Adding instruction [sn:10] PC (0x40009c=>0x4000a0).(0=>1) to the IQ. 132000: IQ: system.cpu.iq: Instruction PC (0x40009c=>0x4000a0).(0=>1) has src reg 6 (CCRegClass) that is being added to the dependency chain. 132000: IQ: system.cpu.iq: Instruction PC (0x40009c=>0x4000a0).(0=>1) has src reg 8 (CCRegClass) that is being added to the dependency chain. 132000: IQ: system.cpu.iq: Instruction PC (0x40009c=>0x4000a0).(0=>1) has src reg 7 (CCRegClass) that is being added to the dependency chain. 135500: IQ: system.cpu.iq: Waking up a dependent instruction, [sn:10] PC (0x40009c=>0x4000a0).(0=>1). 135500: IQ: global: [sn:10] has 1 ready out of 3 sources. RTI 0) 135500: IQ: system.cpu.iq: Waking any dependents on register 7 (CCRegClass). 135500: IQ: system.cpu.iq: Waking up a dependent instruction, [sn:10] PC (0x40009c=>0x4000a0).(0=>1). 135500: IQ: global: [sn:10] has 2 ready out of 3 sources. RTI 0) 135500: IQ: system.cpu.iq: Waking any dependents on register 8 (CCRegClass). 135500: IQ: system.cpu.iq: Waking up a dependent instruction, [sn:10] PC (0x40009c=>0x4000a0).(0=>1). 135500: IQ: global: [sn:10] has 3 ready out of 3 sources. RTI 0) 135500: IQ: system.cpu.iq: Instruction is ready to issue, putting it onto the ready list, PC (0x40009c=>0x4000a0).(0=>1) opclass:1 [sn:10]. 
135500: IEW: system.cpu.iew: Setting Destination Register 6 (CCRegClass) 135500: Scoreboard: system.cpu.scoreboard: Setting reg 6 (CCRegClass) as ready 135500: IEW: system.cpu.iew: Setting Destination Register 7 (CCRegClass) 135500: Scoreboard: system.cpu.scoreboard: Setting reg 7 (CCRegClass) as ready 135500: IEW: system.cpu.iew: Setting Destination Register 8 (CCRegClass) 135500: Scoreboard: system.cpu.scoreboard: Setting reg 8 (CCRegClass) as ready 135500: IQ: system.cpu.iq: Attempting to schedule ready instructions from the IQ. 135500: IQ: system.cpu.iq: Thread 0: Issuing instruction PC (0x40009c=>0x4000a0).(0=>1) [sn:10] 136000: IEW: system.cpu.iew: Execute: Processing PC (0x40009c=>0x4000a0).(0=>1), [tid:0] [sn:10]. 136000: IEW: global: RegFile: Access to cc register 6, has data 0x2 136000: IEW: global: RegFile: Access to cc register 8, has data 0 136000: IEW: global: RegFile: Access to cc register 7, has data 0 136000: IEW: system.cpu.iew: Current wb cycle: 0, width: 8, numInst: 0 wbActual:0 136000: IEW: system.cpu.iew: [tid:0] [sn:10] Execute: Branch mispredict detected. 136000: IEW: system.cpu.iew: [tid:0] [sn:10] Predicted target was PC: (0x4000a0=>0x4000a4).(0=>1) 136000: IEW: system.cpu.iew: [tid:0] [sn:10] Execute: Redirecting fetch to PC: (0x40009c=>0x400080).(0=>1) 136000: IEW: system.cpu.iew: [tid:0] [sn:10] Squashing from a specific instruction, PC: (0x40009c=>0x400080).(0=>1) 136500: Commit: system.cpu.commit: [tid:0] Squashing due to branch mispred PC:0x40009c [sn:10] 136500: Commit: system.cpu.commit: [tid:0] Redirecting to PC 0x400084 136500: ROB: system.cpu.rob: Starting to squash within the ROB. 136500: ROB: system.cpu.rob: [tid:0] Squashing instructions until [sn:10]. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000bc=>0x4000c0).(0=>1), seq num 18. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000b8=>0x4000bc).(0=>1), seq num 17. 
136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000b4=>0x4000b8).(0=>1), seq num 16. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000b0=>0x4000b4).(0=>1), seq num 15. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000ac=>0x4000b0).(0=>1), seq num 14. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000a8=>0x4000ac).(0=>1), seq num 13. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000a4=>0x4000a8).(0=>1), seq num 12. 136500: ROB: system.cpu.rob: [tid:0] Squashing instruction PC (0x4000a0=>0x4000a4).(0=>1), seq num 11. 136500: ROB: system.cpu.rob: [tid:0] Done squashing instructions. 136500: Commit: system.cpu.commit: [tid:0] Marking PC (0x40009c=>0x400080).(0=>1), [sn:10] ready within ROB. 137000: Commit: system.cpu.commit: [tid:0] [sn:10] Committing instruction with PC (0x40009c=>0x400080).(0=>1) 130000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+36 : b.lt 0x400080 : IntAlu : FetchSeq=10 CPSeq=10 flags=(IsControl|IsDirectControl|IsCondControl) 137000: ROB: system.cpu.rob: [tid:0] Retiring head instruction, instruction PC (0x40009c=>0x400080).(0=>1), [sn:10] 137000: O3CPU: system.cpu: Removing committed instruction [tid:0] PC (0x40009c=>0x400080).(0=>1) [sn:10] 137000: Commit: system.cpu.commit: Trying to commit head instruction, [tid:0] [sn:11] 137000: Commit: system.cpu.commit: Retiring squashed instruction from ROB. 
137000: Commit: system.cpu.commit: Trying to commit head instruction, [tid:0] [sn:10] 137000: Commit: system.cpu.commit: [tid:0] [sn:10] Committing instruction with PC (0x40009c=>0x400080).(0=>1) 130000: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+36 : b.lt 0x400080 : IntAlu : FetchSeq=10 CPSeq=10 flags=(IsControl|IsDirectControl|IsCondControl) 138500: Fetch: system.cpu.fetch: [tid:0] [sn:26] Branch at PC 0x40009c predicted to be not taken 138500: Fetch: system.cpu.fetch: [tid:0] [sn:26] Branch at PC 0x40009c predicted to go to (0x4000a0=>0x4000a4).(0=>1) 142500: Commit: system.cpu.commit: [tid:0] [sn:26] Committing instruction with PC (0x40009c=>0x4000a0).(0=>1) 138500: ExecEnable: system.cpu: A0 T0 : @asm_main_after_prologue+36 : b.lt 0x400080 : IntAlu : FetchSeq=26 CPSeq=18 flags=(IsControl|IsDirectControl|IsCondControl) 142500: ROB: system.cpu.rob: [tid:0] Retiring head instruction, instruction PC (0x40009c=>0x4000a0).(0=>1), [sn:26]
With an extra CLI argument (the branch is not taken):
// f = fetch, d = decode, n = rename, p = dispatch, i = issue, c = complete, r = retire timeline tick pc.upc disasm seq_num [.............................................................................fdn]-( 40000) 0x00400078.0 ldr x0, [sp] [ 1] [.ic.............................................................................]-( 80000) ... [................................r...............................................]-( 120000) ... [.............................................................................fdn]-( 40000) 0x0040007c.0 movz x1, #1, #0 [ 2] [.ic.............................................................................]-( 80000) ... [................................r...............................................]-( 120000) ... [....................fdn.ic......r...............................................]-( 120000) 0x00400080.0 movz x2, #2, #0 [ 3] [....................fdn.ic......r...............................................]-( 120000) 0x00400084.0 movz x3, #3, #0 [ 4] [....................fdn.ic......r...............................................]-( 120000) 0x00400088.0 movz x4, #4, #0 [ 5] [....................fdn.ic......r...............................................]-( 120000) 0x0040008c.0 movz x5, #5, #0 [ 6] [....................fdn.ic......r...............................................]-( 120000) 0x00400090.0 movz x6, #6, #0 [ 7] [....................fdn.ic.......r..............................................]-( 120000) 0x00400098.0 movz x0, #3, #0 [ 9] [....................fdn.p......ic.r.............................................]-( 120000) 0x0040009c.0 b.lt 0x400080 [ 10] [.....................fdn.ic.......r.............................................]-( 120000) 0x004000a0.0 movz x10, #10, #0 [ 11] [.....................fdn.ic.......r.............................................]-( 120000) 0x004000a4.0 movz x11, #11, #0 [ 12] [.....................fdn.ic.......r.............................................]-( 120000) 
0x004000a8.0 movz x12, #12, #0 [ 13] [.....................fdn.ic.......r.............................................]-( 120000) 0x004000ac.0 movz x13, #13, #0 [ 14] [.....................fdn.ic.......r.............................................]-( 120000) 0x004000b0.0 movz x14, #14, #0 [ 15] [.....................fdn.ic.......r.............................................]-( 120000) 0x004000b4.0 movz x15, #15, #0 [ 16] [.....................fdn.pic......r.............................................]-( 120000) 0x004000b8.0 movz x16, #16, #0 [ 17] [.....................fdn.pic.......r............................................]-( 120000) 0x004000bc.0 movz x17, #17, #0 [ 18] [............................................fdn.ic.r............................]-( 160000) 0x004000c0.0 movz x0, #0, #0 [ 19] [............................................fdn.ic.r............................]-( 160000) 0x004000c4.0 movz x8, #93, #0 [ 20]
So this time the prediction was correct. Retire was delayed until the memory came back, but otherwise the CPU just kept running forward until it hit the next ifetch cache line.