Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (register)

Test 1: uops

Code:

  ldrsw x0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005402311111121001366277192510001000100015577040340322532611000100020003828511100101000100001019204310591012110396158431927321611379131351000404383404404403
100440331111002100338837719251000100010001445614024032253261100010002000403851110010100010000102019431019101611000605801917311611400131351000405404404404404
100440231011006610338837720251000100010001552614024032043261100010002000402851110010100010000101920431062102611040615943190731161140001301000404404404403404
100440331111002110338827718251000100010001554614034032253261100010002000403851110010100010000101919431060102611039615743192731161140013051000404404383382404
100440331111006700238807720251000100010001550114034032253261100010002000403851110010100010000102120431019202611039015901907311611416131351000404404404405404
100438131110002110138837702510001000100015480138140320432611000100020004048511100101000100001021214310590002110400019431917311611378131351000406404383404383
1004403311110067102366077222510001000100015480140340322532611000100020004038511100101000100001020204310591026410400019431917311611400131351000404404403383404
10044033111000671033883771925100010001000155060403403225324010001000200040385111001010001000010202043105910161104101594319073116114000051000382404404404382
10044033111100671003880771925100010001000155240402403225326110001000200040364111001010001000010192001060101601040015843191731161139901351000404382403382404
100440331010002110338830719251000100010001552414034022253239100010002000382861110010100010001101921010191006110400158451907311611400131351000383404404404404

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570047525110110700356978159784254010430103100013010010000616005334220649670220700507004764646364953401003020010000602002000070035351140201100991001000030100100001001000001100000010000110261017111698133000390610000301007003670036700517003670051
4020470050524000110700356978159709254010430103100003010010000616015334206249669670700507005064646364953401003020010000602002000070050351140201100991001000030100100001001000001100050010000110261017111698003000090910000301007005170051700517005170051
4020470050524000100700356978159706254010430100100013010010000616015334147049669700700357003564646364953401003020010000602002000070047351140201100991001000030100100001001000001100002010000100261017211697983000006910000301007005170152700527013170224
40204703225250001810700326978159695254010430103100013010010000616005334147049669550700507005064645364953401003020010000602002000070050351140201100991001000030100100001001000001100001242610000110261017111698143000360910000301007005170051700547005170048
4020470051524000600700336978559711254010030103100013010010000616005334147049669700700357003564646364953401003020010000602002000070050351140201100991001000030100100001001000001100000010000100261017111698103000300610000301007004870051700517003670036
40204700355250111007003569781597092540104301031000130100100006161753342206496695507005070050646313649534010030200100006020020000700353511402011009910010000301001000010010000001000001510000110261017111698133000096010000301007003670051700367005170051
4020470047524000100700206976459709254010030103100003010010000616015334220649669700700507004764646364938401003020010000602002000070050351140201100991001000030100100001001000001100001310000010261017111697983000309010000301007005170051700367005170036
4020470050525000010700356976459709254010430103100013010010000616015334206249669550700507005064643364950401003020010000602002000070050351140201100991001000030100100001001000000100000010000010261017111698133000306910000301007003670036700517003670036
4020470050525000100700206978159709254010430103100013010010000616005334206249669670700507003564631364953401003020010000602002000070035351140201100991001000030100100001001000000100000010000100261017111697983000360610000301007003670048700517005170051
4020470035524000110700326976459695254010030103100013010010000616175334147049669700700507004764715364949401003020010000602002000070050351140201100991001000030100100001001000001100000010000100261017111698103000366910000301007005170048700517003670036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570047525000011001010070020697605970925400143001310002300101000061698233420620496695507005070035646683649754001030020100006002020000700473511400211091010000300101000001010000001000000001000010100252017112697983000306010000300107005170051700517004870051
4002470050525000000001010070035697605970925400143001010001300101000061698233420620496697007005070050646593649754001030020100006002020000700353511400211091010000300101000001010000011000000001000000100252027111698133000396910000300107004870051700517005170051
4002470035525000000000010070035697605969525400183001310001300101000061706833414700496697007005070035646683649754001030020100006002020000700503511400211091010000300101000001010000001000000001000010100252017111697983000396910000300107003670051700517005170051
4002470035524000000001000070020697435970625400143001310000300101000061698233414700496697007005070050646683649604001030020100006002020000700353511400211091010000300101000001010000001000000001000010000252017111698133000396010000300107003670051700487003670051
4002470091525000000000000070035697435970625400103001310000300101000061698233422061496697007005070050646683649754001030020100006002020000700503511400211091010000300101000011010000011000000001000000100252017111697983000066010000300107005170048700517005170051
4002470050525000000001000070035697605970625400143001310001300101000061698233420620496697007003570035646533649754001030020100006002020000700883511400211091010000300101000011010000001000000001000010100252017111698133000000910000300107004870048700367004870036
4002470050525000000006000070032697605970925400103001310001300101000061698233422060496697007005070047646683649754001030020100006002020000700503511400211091010000300101000001010000011000001001000110100252017111698103000300010000300107003670048700377003670051
4002470035525000000001010070032697605970925400143001010000300101000061706833420620496695507003570035646653649604001030020100006002020000700353511400211091010000300101000001010000011000000001000000100252027111698103000006910000300107003670051700517005170051
4002470035525000000006000070035697435970925400103001310000300101000061695233420620496697007005570050646533649754001030020100006002020000700473511400211091010000300101000001010000011000000001000010000252017111698133000396910000300107005170051700517005170051
4002470050525000000000000070020697285970925400143001010000300101000061698233422060496697007003570047646653649754001030020100006002020000700353511400211091010000300101000011010000011000000001000010000252017111698103000366910000300107005170048700517005170036

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0056

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005352511100011017003869870597122540108301061000230100100006160783342494049669617007370052646373649584010030200100006020020000700413511402021009910010000301001000011001000221100030011000011011261017111698043000696610000301007004270057700577012870057
4020470056524111000201007004169702597012540108301061000230100100006160593341769049669617008370057646493649444010030200100006020020000700413511402011009910010000301001000001001000121100010111000011010261017111698193000606610000301007005770042700577005770042
402047004152511100020007004169784597157440104301061000230250100006161053342350049669777006470053646953649564010030200100006020020000700573511402011009910010000301001000001001000411100021211000011111261017111698163000360610000301007005770057700547005470042
402047005652410100020007004169787597012540108301031000130100100006160593342494049669737009070053646493649564010030200100006020020000700533511402011009910010000301001000001001000731100030211000011111261017111698193000696610000301007005770054700577004270062
402047004152411000020007004169787597152540108301091000230100100006160593341769049669737007670064646373649594010030200100006020020000700563511402011009910010000301001000001001000121100020011000011111261017111698193000669910000301007004270058700427005770045
4020470056525110000380007004169702597152540104301091000230100100006160323341769049669767006170057646493649594010030200100006020020000700563511402011009910010000301001000001001000221100020111000011111261017111698043000366610000301007004270057700427005470057
402047004152511100011017002669702597012540104301031000230100100006160323342494049669617005370054646493649444010030200100006020020000700563521402011009910010000301001000001001000141100020111000011111261017111698163000606010000301007004270057700577005470054
4020570056525101000110070038697845970125401083010610002301001000061607833424940496697370062700646464936495940100302001000060200200007005635114020110099100100003010010000010010002111000211191000011111261017111698193000600910000301007004270042700427005770057
402047005652411100020007004169787597012540108301061000230100100006160593342494049669617005770056646373649594010030200100006020020000700563511402011009910010000301001000001001000210100010171000001112261017111698043000696010000301007004270054700547005770054
402047005352410000020007004169784597152540108301031000230100100006160783342494049669767007470060646523649594010030200100006020020000700413511402011009910010000301001000001001000110100011211000011011261017111698043000690910000301007004270042700577005770057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570057525110111011070026697845971925400183001610005300101000061699133422544966974700357005464672364960400103002010000600202000070054801140021109101000030010100001101000000100000010000100000252017112698173000313101010000300107005270052700557005570052
400247005152500000000007002069743597132540010300101000130010100006170683342254496697470059700546467236497940010300201000060020200007003535114002110910100003001010000010100000110000001000000100025201711269814300031001010000300107003670052700557005570052
40024700515240000000110700396977859710254001430013100013001010000616991334225449669747005470051646693649764001030020100006002020000700513511400211091010000300101000001010000011000000100001010002520171116981430003010010000300107005570036700557005270055
4002470035525000000000070040697435969525400103001310000300101000061706833414704966974700357005464672364979400103002010000600202000070054352140021109101000030010100000101000000100000010000101000252017121698173000313101310000300107005270055700557005570052
4002470051524000000070070020697435971025400143001310001300101000061706833422544966971700357005464653364976400103002010000600202000070035351140021109101000030010100000101000001100010010000001000252017112698173000313101310000300107003670036700557009670052
400247005452500000001007003969743596952540010300131000130010100006170183342398496697470054700516466936497640010300201000060020200007005435114002110910100003001010000010100000110000291000011010025201711169804300031001310000300107005570052700367003670052
400247003552400000001007003669743597132540010300131000130010100006170183342398496695570035700516467236496040010300201000060020200007005435114002110910100003001010000010100000110000001000010100025202712169798300001013010000300107005570052700557005270036
4002470054525000000010070036697785971325400143001310001300101000061701833414704966974700547005164672364976400103002010000600202000070051351140021109101000030010100000101000001100000010000101000252017111698223000313131310000300107003670055700557005570036
400247003552500000000007003969743597132540014300131000130010100006170183342254496695570051700356465336497940010300201000060020200007003535114002110910100003001010000010100000110000001000010100025201712169814300031310010000300107005570055700557003670055
40024700545240000000000700206974359710254001430010100003001010000616991334147049669747005470054646723649794001030020100656002020000700543511400211091010000300101000011010000011000000100001010002520171216981730000001310000300107005570055700557005570055

Test 4: throughput

Count: 8

Code:

  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  ldrsw x0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672320001000004101022678421212162580100100800001008001550011658560492364226727267271665561665980116200800242001600482672272118020110099100800001008000001008000003908003900398000060000111511811600267041662800001002672826708267282672326728
8020426722200000000045000226722212120258010010080000100800155001166596049236472670726707166356166798011420080024200160048267277111802011009910080000100800000100800000390800000051800006135430111511801600267240604800001002672826728267082672826708
8020426727200000000045000226717001202580100100800001008001550011665960492364726727267271665561667980115200800242001600482672771118020110099100800001008000001008000003908003900080039613543011151180160026719010104800001002670826708267232670826708
802042672720000000000000226704012121625801001008000010080016500116585604923647267272670716655616674801152008002420016004826727561180201100991008000010080000010080000039080000003880039603543011151180160026724010100800001002670826708267232672326728
8020426722200000000045000226698012016258010010080000100800155001177116149236472670726727166356166598011420080024200160048267277111802011009910080000100800000100800000390800390008003561350011151180160026704010100800001002672826708267282670826728
802042672720000000004501012674830120258010010080000100800145001167875049236422670726707166556166598011520080024200160048267277111802011009910080000100800001100800000390800350035800000104301115118016002671201064800001002672826789267082672526708
8020426707200000000045000126715212120258010010080000100800155001167303049236272672726727166356166798011620080024200160048267277211802011009910080000100800000100800000390800390039800396136390111511801600267190662800001002672826728267082670826728
8020426707200000000045000126721218016258010010080000100800155001167875149236422672726727166356166598011520080024200160048267075611802011009910080000100800000100800000008004000398000061354301115118016002672401000800001002670826708267282672826723
80204267272000000000410102267242181216258010010080000100800155001167875049236472672726722166356166598011420080024200160048267225611802011009910080000100800000100800000390800000008003960354301115118016002672401060800001002670826708267082672826728
802042670720000001004500022670120181225801001008000010080014500117711614923642267272672716655616674801152008002420016004826707711180201100991008000010080000010080000039080035003980039613500111511801600267240664800001002672826728267282670826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267272001006010126712201216258001010800001080000501166896004923647267272670816676316708800102080000201600002673277118002110910800001080000010800004380000000398003961404305020150916088267051010080000102673226729267322672926731
80024267312000004500026713212002580010108000010800005011688430049236472673126728166723167078001020800002016000026815771180021109108000010800000108000008000000039800386139440502012091608626710100780000102673226728267092672826709
800242670820100001002671221212192580010108000010800005011688430049236512672726728166723167118001020800002016000026727771180021109108000010800000108000043800390000800396139430502012091606726724100480000102672926729267292672926732
800242673120000044001267160012192580010108000010800005011668960049236482672726708166723166888001020800002016000026728771180021109108000010800000108000043800390000800390139430502012091609926705100480000102672826728267292672926709
8002426708200000440012671620121625800101080000108000050116688600492364826708267271667231670780010208000020160000268357711800211091080000108000001080000438003900008003960000502012071607726724250480000102689426729267112672826728
80024267272000014510126713212120258001010800001080000501168843004923647267282670816672316707800102080000201600002672977118002110910800001080000010800004380038000398003961043050201207160782672800780000102673226728267282672826729
8002426728201000451002669320102580010108000010800005011688430049236512673126731166723167118001020800002016000026732771180021109108000010800000108000008003900038800006104405020120916099267251014480000102673226732267092672826732
800242673120000045101267132121219258001010800001080000501166886114923647267312670816672316708800102080000201600002672777118002110910800001080000010800004380039000398003961394405020120101608826724140480000102672926729267282672826732
80024267312001004500026716211216258001010800001080000501166750004923647267282673116672316708800102080000201600002673277118002110910800001080000010800004380039000388003801394305020120816089267241014480000102672926732267322673226728
800242673120001044001266932001625800101080000108000050116720110492364726728267081665231670780010208000020160000267357711800211091080000108000001080000438003900039800390104405020120916086267051410080000102672826728267282670926732