Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005399311410013742181812251000100010001477438938921232471000100010003897111100110001000010003901035351035613539730116113866621000390390390391390
100438930041001377218181125100010001000148383893892123247100010001000389711110011000100001000430103939103961354373011611391101041000395395395392390
100439430045002379212121625100010001000149893943942173252100010001000394711110011000100001000390103539103961354373011611391101021000395395395395395
10043943004500137921218112510001000100014989389389217325210001000100039472111001100010000100039010353910356135397301161139110641000395390406390390
100439430045001379218181625100010001000149893943942173252100010001000389711110011000100001000430103939103961353973011611391101041000395395407397390
100439430045002379212121725100010001000150183943942163252100010001000394711110011000100001000430103939103961354373011611391101041000395395395395395
100439420045001379212121625100010001000150183943942163252100010001000391711110011000100001000430103939103961354373011611391101041000395395395395395
1004394300450023792187162510001000100015018394389212325210001000100039471111001100010000100039010353910356139397301161139110641000396395395395395
10043893004500237921212112510001000100015037389394217325210001000100039471111001100010000100039010353910396135397301161138610641000395395395395395
10043943004500237921212162510001000100014989394394217325210001000100039171111001100010000100039010353510356135397301161138610641000395395395395395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570051525110000601007003969785597132540108301001000130100100006160413342398049669550700357003564631364957401003020010000602001000070051351140201100991001000030100100001100100001100000001000011261147142698163000310101310000301007005570052700557005570055
402047005152501000010000700206978259695254010430103100003010010000616014334225419866971070051700516463136495440100302001000060200100007003535114020110099100100003010010000010010000010000000100001126112712269823300031313010000301007003670055700557005570036
402047003552500000000000700396978559713254010430103100013010010000616041334239804966956070057700566463136493840100302001000060200100007005435114020110099100100003010010000010010000110000000100001126462712269817300031313010000301007005570055700367005570055
402047005152400000010100700206978559695254010430103100013010010000616041334147004966971070054700546465036495740100302001000060200100007005435114020110099100100003010010000010010000110000003100001126112712269856300001310010000301007005570036700367005570055
402047005152400000010011700366976459713254010430103100013010010000616175334239814966974070057700356465036495740100302001000060200100007005435114020110099100100003010010000010010000110000000100001126112712269816300031313010000301007005270036700387005570038
402047003552500100000010700366978559695254010430103100003010010000616041334239804966974070051700546465036495740100302001000060200100007005435114020110099100100003010010000110010000110000000100001026112712269818300030131310000301007005570055700557005570055
402047003552500000010010700396978559755254010430103100003010010000616175334239814966974070054700546465036501440100302001000060200100007005435114020110099100100003010010000010010000110000000100001126112712269814300031313010000301007005570036700527005570036
4020470051524000000100007003969785597132540104301001000130100100006161753342398049669740700357005464650364957401003020010000602001000070035351140201100991001000030100100001100100000100001001000001261127132698003000013131010000301007005570055700377005570055
4020470054525000000100107003669785597134440104301001000030100100006169223342974049669740700547003564655364938401003020010000602001000070054351140201100991001000030100100000100100001100000001000011264527122698003000013131310000301007005570055700987003670055
402047003552501110010010700396978259713254010430100100013010010000616041334147014966974070157700526465036495740100302001000060200100007005435214020110099100100003010010000110010000110000003100001126112712269815300030131310000301007003670036700527005570055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352500001010007003569728597092540010300131000230010100006170683342206049669707005070035646683649754001030020100006002010000700503511400211091010000300101000011010000011000000010000101002520571021698013000090910000300107003670036700517004870036
400247004752400000011007003569760597092540010300131000130010100006170683342206049669707005070050646533649754001030020100006002010000700353511400211091010000300101000001010000011000000010000001002520171021698103000096910000300107004870051700367005170051
400247005052400000010007002069743597062540014300131000130010100006169823342206049669677005070035646533649604001030020100556002010000700503511400211091010000300101000001010000011000000010000100002520171011698113000306910000300107005170036700517005170051
400247004952400000010007003569760596952540014300131000130010100006170363342494049669707005070050646533649604001030020100006002010000700503511400211091010000300101000001010000011000000310000101002520171011698193000390910000300107005170048700487003670054
400247005052500000011007003269760597092540014300131000030010100006169823342206049669677003570050646683649724001030020100006002010000700473511400211091010000300101000001010000011000000010000101002520171011698543000366610000300107005170036700487005170051
400247004752500000010007003269760597092540010300101000130010100006170683342062049669677004770049646683649724001030020100006002010000700503511400211091010000300101000001010000011000000310000101002520171021698143000360910000300107003670048700367004870036
400247004752400000000007003569760597092540010300101000030010100006169823342206049669557005070050646683649754001030020100006041010000700503511400211091010000300101000001010000011000000010000101002520171031698443000306010000300107003670051700517003670051
400247003552500000001007002069728597092540010300131000030010100006169823342206049670287004770047646533649604001030020100006002010000700383511400211091010000300101000001010000011000010010000101002520117011698133000396910000300107004870048700367004870048
400247004752400000011007003569743596952540014300101000130010100006169823342206049669707003570035646683649604001030020100006002010000700503511400211091010000300101000001010000011000000010000000002520171011698163000099910000300107005170036700517003670036
400247004752400000111007003569760597092540014300131000130010100006170683342206049669677005070050646683649754001030020100006002010000700473511400211091010000300101000011010000011000000010000101002520171011698173000300910000300107005170056700517005170036

Test 3: throughput

Count: 8

Code:

  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  ldrsb x0, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052672320000010451022671221801225801001008000010080000500116578900492364226727267271663031668580100200800002008000026707561180201100991008000010080000110080000000800391042800006035000511021611267341064800001002670826728267282672326728
80204267272000000050100267072018122580100100800001008000050011678080049236272672226707166453166658010020080000200800002672256118020110099100800001008000001008000000080035003580035610390051101161126723062800001002672326728267082672326728
80204267272000000041101267072181812258010010080000100800005001167808004923642267222672216645316680801002008000020080000267277111802011009910080000100800000100800000390800350008003561353900511011611267241004800001002672826728267282670826728
80204267222000000045102267122121212258010010080000100800005001166525004923647267272672716650316665801002008000020080000267225611802011009910080000100800000100800000390800390036800350039430051101161126733060800001002672826728267282670826728
8020426707200000004510126712001212258010010080000100800005001167231004923647267272672716630316665801002008000020080000267077111802011009910080000100800000100800000393180132000800350100005110116112670910104800001002672826708267282672326730
802042672220000000000026707018181225801001008000010080000500116780810492364226822267271663031668080100200800002008000026707561180201100991008000010080000010080000039080035000800350135390051101161126721660800001002670826723267232672326723
8020426707200000004210026707218181225801001008000010080000500116723100492364226707267271665031666580100200800002008000026727561180201100991008000010080000010080000039080035003580035613500051101161126719662800001002672326708267232670826723
80204267222000000000002671631801225801001008000010080000500116780810492364226722267221663031668080100200800002008000026722561180201100991008000010080000010080000039080035000800350035390051101160126719064800001002670826723267232672326723
80204267072000000041001267122181216258010010080000100800005001165789004923627267272672716645316685801002008000020080000267227111802011009910080000100800000100800000390800350035800006135430051101161126719060800001002672326723267232670826708
8020426722200000000101266922181802580100100800001008000050011665250049236272672226722166453166838010020080000200800002672271118020110099100800001008000001008000003908003500080035610390051101161126763062800001002672526723267232672326723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220001411012670701801125800101080000108000050116760514923647267222672216667316702800102080000208076926734561180021109108000010800000108000039800350080035610395020816010526705106080000102670926709267092672826728
8002426722200004510026712201212258001010800001080000501168843149236472670826727166673167228001020800002080000267277111800211091080000108000001080000398000003800356135050201216012626719106080000102672326709267232672926709
80024267082000045002267072120162580010108000010800005011676051492362826728267271665231670880010208000020800002672771118002110910800001080000010800003980039008000060354350201316011726705106080000102670926723267092670926728
800242670820000410002670720181225800101080000108000050116688614923647267282672816672316707800102080000208000026727561180021109108000010800000108000039800390358003961354350206160562672406480000102670926723267232670926729
80024267282000000022669320121625800101080000108000050116675014923628267082672216667316688800102080000208000026708711180021109108000010800001108000039800390398003961353950206160542672500480000102672926709267292672826728
80024267282000045102267070121802580010108000010800005011688431492364726708267081667231670880010208057920800002674056118002110910800001080000110800003980039139800396136435020716061226724610080000102670926709267292672926729
80024267272000045101267072181812258001010800001080000501167605149236422672226722166523166888001020800002080000267087111800211091080000108000001080000398003904580000603539502071607726705106480000102672326723267282670926729
8002426708200004100226712201216258001010800001080000501168843149236282672826727166723167088001020800002080000267085611800211091080000108000001080000438003513980000613543502051601272672506480000102672926728267282672926709
800242672720000010226707001802580010108000010800005011667501492364726727267081666731668880010208000020800002672856118002110910800001080000010800003980000039800390135435020101606626724100480000102672326709267232672326728
80024267272000145002267122012162580010108000010800005011688431492362826728267221666731668880010208000020800002672771118002110910800001080000010800003980039038800356035435020616075267241010080000102672326820267092672826729