Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, sxtw, 64-bit)

Test 1: uops

Code:

  ldr x0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005389311548810137421218162510001000100014774038938921232491000100020003897111100110001000100039103500391039613543731161139110641000394397396390395
100439420045010237921218162510001000100014838139439421732521000100020003947111100110001000100039103900391039613543731161139110641000395395395395395
1004394300450002379212121625100011301000149890394394216324710001000200039471111001100010001000391039003510396135437311611391101021000395390395395395
1004394300450102379212181625100010001000149890394394217325210001000200038971111001100010001000391039003910396135437311611391101041000395390395395396
1004394300410101379212181625100010001000148380394394217325210001000200039471111001100010001000391039003910396135437311611391101041000395395395395395
1004389210450002379212181625100010001000149890394394217324710001000200039471111001100010001000431039004010396139437311611391101041000395395395395395
100439430045000237921218222510001000100014989039438921232471000100020003947111100110001000100039103900391039613543731161139210641000395390395395395
100439430045000237921218162510001000100014989139439421732471000100020003947111100110001000100039103900351039613543731161139110641000395395395395395
1004394300450002379212181625100010001000149890394395217324710001000200039472111001100010001000391039003910396136437311611386101041000395395395395395
1004394200450102379212181125100010001000149890394394224325210001000200039471111001100010001000391033003910396139437311611386101041000395390395395392

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700535250000001010700206973559695254010430103100013010010000616175334220604966967700507005064631364953401003020010000602002000070050351140201100991001000030100100000100100000110000000100001010261027111698133000306610000301007003670051700367005170048
40204700505250000001000700326976459706424010430103100013010010000616132334206204966970700507005064646364938401003020010000602002000070035351140201100991001000030100100000100100000010000000100001000261017111698523000300910000301007003670051700517003670051
40204700355240000001000700326976459709254010430103100013010010000616005334220614966967700507004764646364953401003020010000602002000070035351140201100991001000030100100000100100000110000000100001010261017111698113000096610000301007004870051700487003670051
40204700355250000001010700356978159706254010430103100013010010000616005334220614966955700357005064646364953401003020010000602002000070050351140201100991001000030100100000100100000110000000100000000261017111698163000099910000301007004870048700367004870051
40205701035240000000000700326976459695254010030100100013010010000616175334220614966967700357005064646364953401003020010000602002000070050351140201100991001000030100100000100100000110000000100001010261017111701013000366010000301007004870051700517005170048
40204700355240000001000700206978459695254010430100100013010010000616005334220614966970700507005064631364953401003020010000602002000070050351140201100991001000030100100000100100000110000000100001010261017111698173000366010000301007003670051700517004870083
40204700355250000000001700356978159709254010430100100013010010000616175334206204966970700507005064646364953401003020010000602002000070050351140201100991001000030100100000100100000110000100100001010261017111698113000090910000301007003670036700517005270036
40204700505240000001000700356976459709254011230103100013010010000616015334473414966955700477003564631364938401003020010000602002000070047351140201100991001000030100100000100100000110000200100001010261017111698113000099010000301007003670036700367005170036
40204700475250000001000700326976459695254010430103100013010010000616175334220604966970700507004764646364956401003020010000602002000070050351140201100991001000030100100000100100000110000000100001000261017111698453000009910000301007005170036700517005170036
402047005052400000010017002069781596952540104301031000130100100006160053342206149669897006170047646463649504010030200100006020020000700473511402011009910010000301001000001001000001100000001000010002610171116983630000100910000301007003670036700517005170051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700535251010111020117003869702597122540018300161000230010100006170093342350014966973700417004164671036497840010300201000060020200007005335114002110910100003001010000010100012110001000110000111100252037155698163000606610000300107005470054700547004270054
40024700535251011011070117003869777597122540018300131000230010100006170093342350004966973700537004164671036497840010300201000060020200007005335114002110910100003001010000010100022010003002110000111100252057145698163000300010000300107005470054700547005470054
40024700535251011111020117003869777597122540018300161000230010100006170093342350004966973700537005364671036496640010300201000060020200007004135114002110910100003001010000010100012110002000110000111110252037134698163000666010000300107005470054700547005470054
40024700535241001111020017002669777597122540018300161000130010100006170093342350004966973700537005364671036497840010300201000060020200007005335114002110910100003001010000010100011110001000110000111110252047145698163000666610000300107005470042700547005470054
40024700535251000110020117003869777597122540018300161000230010100006170093342350004966973700557004164671036497840010300201000060020200007005335114002110910100003001010000010100011110003002110000111120252047153698163000666610000300107005470054700547005470054
40024700535241010110020017003869777597122540018300161000230010100006169953342350004967028700537005764671036497840010300201000060020200007005335114002110910100003001010000010100011110003000110000111110252047154698163000666010000300107004270054700547005470054
40024700535251010010010117003869777597122540018300161000130010100006169953342350004966973700537005364671036497840010300201000060020200007005335114002110910100003001010000010100023110001000110000111120252047153698163000366610000300107005470054700427005470054
40024700535251011011010117003869702597122540031300161000230010100006170093341769004966973700537005364671036497840010300201000060020201347005335114002110910100003001010000010100022110001002410000111110252047144698163000606610000300107005470054700547005470054
400247005352510011100201070038697775971225400183001610002300101000061700933423500049669737005370053646710364978400103002010000600202000070041351140021109101000030010100000101000321100010021100001111102520471446981630006623610000300107005670056700477005570054
40024700415251001110020117003869777597122540014300161000230010100006170093341769004966975700537005364659036497840010300201000060020200007005335114002110910100003001010000110100023110003000110000111100252047145698163000666610000300107005470054700547005470129

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr x0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700575251110039700700366978559695254010430103100013010010000616175334239814966955700547003564748364957401003020010000602002000070054351140201100991001000030100100000100100001100001541000011126112712369814300031010010000301007003670055700367003670052
4020470054525000103400700396978559695254010430103100013010010000616041334147014966971700547005164647364938401003020010000602002000070051351140201100991001000030100100000100100001100000010000100261137123698173000310131310000301007005270055700557003670055
4020470054524000014200700206978559710254010430103100013010010000616041334239814966971700547005464647364938401003020010000602002000070035351140201100991001000030100100000100100000100000010000110261137132698173000313101310000301007005270055700557005270052
402047003552500000580070039697855969525401003010310001301001000061617533414701496695570054700546464736495440100302001000060200200007005435114020110099100100003010010000010010000010000001000011026113712369817300000131310000301007005570055700557005570055
4020470035524000004010700396978559713254010030103100003010010000616175334239814966955700547005464647364954401003020010000602002000070035351140201100991001000030100100000100100001100001010000110261127123697983000310131310000301007005570036700567003670055
40204700355250000037007003669764597132540104301001000130100100006160413342254149669557005470035646473649574010030200100006020020000700543511402011009910010000301001000001001000001000000100001102611371336981730003001010000301007005270052700557003670055
402047009752500000401070020697855971025401043010310001301001000061604133422541496697470054700546476136495740100302001000060200200007005435114020110099100100003010010000010010000010000001000011026112713369798300001001310000301007005570036700557003670055
4020470054524000002471070039697855971325401003010310000301001000061604133414701496697170054700546464736495740100302001000060200200007005435114020110099100100003010010000010010000010000101000010026113712369814300030101010000301007005270036700367005770052
4020470035524000005810700966978559695254010430100100003010010000616041334239814966973700357005464765364938401003020010000602002000070054351140201100991001000030100100001100100000100000010000100261137132697983000013131310000301007005570036700367005570055
402047005452400011521070036697855971325401043010310001301001000061604133423981496695570054700356464736495440100302001000060200200007005135114020110099100100003010010000110010000110000001000011026112713369798300031301310000301007005270055700557005570055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257004752501000000700206976059709254001430010100023001010000616982334147051496697070050700506466836496040010300201000060020200007005035114002110910100003001010000101000011000010100001000252001571156698103000396910000300107005170051700517009370048
4002470050525000010017003569760597092540014300131000130010100006169823342206404966970700477016164668364975400103002010000600202000070096351140021109101000030010100001010000110000001000001002520013711315698103000390910000300107005170036700517003670051
4002470050524000010007003569743597092540010300131000130010100006169823342062304966955700357004764668364973400103002010000600202000070035351140021109101000030010100001010000110000001000011002520014711513697983000069610000300107005170051700367004870036
4002470050524000011017003569728596952540014300131000030010100006169523342062204966971700507003564668364975400103002010000600202000070050351140021109101000030010100001010000110000131000011002520016711716698103000360910000300107010570053700557005170051
4002470050525000000107004069743597062540014300131000130010100006169823342062304966955700507005064668364960400103002010000600202000070050351140021109101000030010100001010000110000001000001002520016711515698103000306010000300107005170048700517003670051
4002470065525000011107003569728597092540014300131000030010100006169823342206404966970700507005064668364975400103002010000600202000070050351140021109101000030010100001010000010000031000000002520016711316698133000309010000300107003670051700487004870036
40024700475240000001170020697605970925400143001010001300101000061706833422063049669707005070035646683649604001030020100006002020000700473511400211091010000300101000010100000100000010000000025202771167697983000096910000300107005170036700367003670051
4002470050525000000007003269743596952540010300131000130010100006169823341470304966955700477005064668364972400103002010000600202000070035351140021109101000030010100001010000110000001000011002520216711315697983000060910000300107003670051700517008870051
400247003552500000000700356974359695254001430010100013001010000617068334206230496695570050700356466836497540010300201000060020200007003535114002110910100003001010000101000011000000100000100252001671168698133000390010000300107003670051700517005170051
4002470055525000010117003569760597142540014300131000030010100006169823341470314966970700507005064653364975400103002010000600202000070047351140021109101000030010100001010000010000001000011002520015711617698133000066010000300107003670051700517005170036

Test 4: throughput

Count: 8

Code:

  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  ldr x0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052673620012012101670052672237720258010010080000103800165001168603149236562673726736166426166668011520080024200160048267146411802011009901008000010080000110080021194308005810121800396059431921115122111610102673313135800001002673726715267372673726737
802042673620012012100211032672137720258010010080000100800165001167300149236342673626737166646166928011520080024200160048267368511802011009901008000010080000010080019194308005900021800400059431901115122101610102671113135800001002673826715267372673726737
8020426736200120121006600426721377202580100100800001008001550011673001492365626737267361664261666680113200800242001600482671464118020110099010080000100800000100800191943080058000618004060194319011151221016105267331300800001002673726715267372671526737
802042671420012012011660052672237720258010010080000100800155001167377149236562673726736166646166888011520080024200160048267148511802011009901008000010080000010080020200080059000618003960584319211151221016109267330130800001002671526737267372673726737
8020426714200120020002100226721377202580100100800001008001650011673001492365626736267361666361668880115200800242001600482673686118020110099010080000100800000100800191900800191016080040605901921115122416410267111305800001002673826738267382671626715
802042673620012012101670052669927722580100100800001008001550011665901492366826736267141666461668880115200800242001600482671485118020110099010080000100800000100800191900800600006380040011943190111512291610102671113130800001002673726737267372673726715
8020426736200120121006700326721377125801001008000010080015500116762814923656267142673616663616688801152008002420016004826714851180201100990100800001008000001008001919430800591006180000601943191111512251610102673313135800001002673726716267372673826737
802042671420112002111660042672137020258010010080000100800165001167377149236562673626736166636166878011520080024200160048267368511802011009901008000010080000010080019214308005900060800406159431921115122416109267330135800001002673726738267382673826715
80204267372001201211066104266990071825801001008000010080014546117620014923656267362673716666616687801152008002420016004826744631180201100990100800001008000001008001920008005801121800000158431921115122416109267331305800001002671526737267372673726715
8020426737200120120002100426721377192580100100800001008019850011672151492363426736267361664261668880115200800242001600482673664118020110099010080000100800000100800202043080059000618000060580191111512291610102673313130800001002671526737267152671526738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252673720100000450132670020719258001010800001080000501173975314923656267362673716681316717800102080000201600002673685118002110910800001080000010800000430800000398000060580191502005164426734130580000102673726737267372673726716
80024267372001010021000267130121216258001010800001080000501166886614923656267212670816672316707800102080000201600002672777118002110910800001080000010800000430800390398003901390005020041643267241010480000102670926709267282672826731
8002426727200000104501126712212002580010108000010800005011688437149236512670826727166723167118001020800002016000026728561180021109108000010800000108000004308000003980000613943005020031634267051414480000102672826728267282672926729
800242672819900000001126693012116258001010800001080000501167501614923647267282670816676316708800102080000201600002672877118002110910800001080000010800000008000003980039613943005020041644267251010480000102672826728267282672826729
800242672820000000450112671621212162580010108000010800005011688436149236282672726727166723166888001020800002016000026708771180021109108000010800000108000000080039038800396104400502004164326724100480000102670926709267282673226709
8002426708200000004401126712012121625800101080000108000050116884361492365126728267281665231670780010208000020160000267287711800211091080000108000001080000000800391398003901394300502004164326705100080000102672926709267322672926728
8002426727199000114500126712201216258001010800001080000501168843614923647267282673116672316688800102080000201600002672877118002110910800001080000010800000430800390080039000000502003163426724010480000102672926729267292672926728
8002426708200000004501126712212121625800101080000108000050116884371492362826727267271667231670780010208000020160000267287711800211091080000108000001080000043080000042800000104300502004164326727010480000102672826729267312672926729
80024267272000000045001267132121216258001010800001080000501166750814923648267272670816672316708800102080000201600002672777118002110910800001080000010800000008003903980039013943005020041643267241010080000102672926729267292670926728
800242672820000000450112671221212025800101080000108000050116884361492362826728267281667231670780010208000020160000267317711800211091080000108000001080000000800391428003960394300502004164426725100780000102670926729267292670926728