Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRB (register, uxtw)

Test 1: uops

Code:

  ldrb w0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053943100440013832111925100010001000152670394394221325610001000200039877111001100010000100043103803810396139447311611395101041000404402399399399
100439830004400138321121925100010001000152080398398221325610001000200039877111001100010000100043103803810386139437311611391101071000399395395399395
10043943000441013832111925100010001000150181394398221325210001000200039877111001100010000100043103803910386138447311611391101071000399399399399399
100439830004510138321211925100010001000152740394398221325610001000200039877111001100010000100043103803810386139447311611391101041000395395395395395
1004394300045101379212121625100010001000150370398394279325610001000200039877111001100010001100043103803810386138447311611395141471000399399395399400
10043983000440013832111925100010001000150181398398221325610001000200039877111001100010001100043103803810396139447311611395101471000395399399399399
10043983011450013792111925100010001000152741398398221325610001000200039877111001100010000100043103803810386139447311611395101041000399395395395399
1004394300045101383212121925100010001000153381398398217325610001000200039477111001100010000100043103803810386138447311611395101041000399399399399399
10043983000440013832111925100010001000152740398394221325610001000200039877111001100010001100043103803810386139447311611391101041000395395395395395
100439430004400137921211625100010001000152741398398221325610001000200039477111001100010001100043103803810386139447311611395101471000399399399399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005152521000110700366979659740254010430103100013010010000616175334225414966974700547005164650036495740100302001000060200200007005235114020210099100100003010010000010010000011000000310000110026103711169817300001301310000301007008870088700987005570052
40204700545250000001070036697855980025401043010310000301001000061604133414701496695570054700546463103649384010030200100006020020000700353511402011009910010000301001000001001000001100000001000011002610171116981730003010010000301007006270056700527005570036
4020470035525010001107003969784597102540100301031000130100100006160143342398149669717005470054646500364938401003020010000602002000070035351140201100991001000030100100000100100002010000100100000000261017111697983000300010000301007005670053700557005570036
4020470054524001001007003669785597132540100301031000130100100006160413341470149639467005870054646310364954401003020010000602002000070035351140201100991001000030100100000100100000110000000100001100261017111697983000013101310000301007005570038700367003670055
4020470054525000001107002069788596952540104301031000130100100006160413342398149669747005470035646500364954401003020010000602002000070051351140201100991001000030100100000100100000110000000100001100261017111698143000310131310000301007011070057700527005570055
4020470035525000001007002069785597132540100301031000130100100006160143342398149669747005470035646500364954401003039810000602002000070051351140201100991001000030100100000100100000110000000100000110261017111697983000010101310000301007009170052700547005270055
402047005152500000010700206979359713254010030100100013010010000616175334239814966955700357003564650036493840100302001000060200200007005435114020110099100100003010010000010010000011000010010000110026101711169817300031010010000301007008970057700367005570055
402047005452500000600700396978659710254010030103100003010010000616041334147014966955700357005464650036495740100302001000060200200007005435114020110099100100003010010000010010000401000000010000000026101711169798300001013010000301007005770059700527003670055
402047005452500000010700366978659695254010430103100013010010000616014334239804966974700357005464650036495740100302001000060200200007005435114020110099100100003010010000010010000001000010010000110126101711169817300001313010000301007009270039700537005270055
40205700355240000011070039697855971425401003010310001301001000061601433423981496697470035700546465003649384010030200100006020020000700353511402011009910010000301001000011001000001100000001000011002610171116981730000001310000301007007570088700527005270096

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005752510110100007000017004269781597162540018300161000230010100006170453341769149669777005770057646753649824001030020100006002020000700573511400211091010000300101000001010003111000200011000011112252015710135698203000610101010000300107005870058700587005870058
4002470057524100101000020100270042697815971625400183001610002300101000061699533425421496697770057700576467536496640010300201000060020200007005735114002110910100003001010000010100021110001000110000111102520147101414698203000610101010000300107005870058700587005870058
4002470057525100101000070000170042697815974825400183001610002300101000061704533425421496697770057700576467536498240010300201000060020200007005735114002110910100003001010000010100022110002010110000111102520147101414698203000610101010000300107005870058700587005870058
40024700575251011001100201001700806978159716254001830016100023001010000617045334254214966977700577005764675364982400103002010000600202000070057351140021109101000030010100000101003711100430133619410048111102912202000816710803016610101010000300107160471626709657161271505
400247159753110222200111721381496100171576702656044546940216301801005332459105106542553385980149684767100471606652901486600642411329521066965920219827115135191400211091010000300101000001010058411004001338526100571111330202523301911713163018910101010000300107196671774718927193871962
4002472046539140202001215132711441002713207033160150396401693012110035315941081664788733709551496817470319700576467536496640010300201000060020200007005735114002110910100003001010000010100013110001000110000111112520147101414698203000610101010000300107005870058700587005870058
400247005752510110100002010017004269781597162540018300161000230010100006170453342542149639447005770057646753649824001030020100006002020000700573511400211091010000300101000001010001111000100011000011110252014710145698203000610101010000300107005870058700587005870058
4002470057525101001000020100170042697815971625400183001610002300101000061704533425421496697770057700576467536498240010300201000060020200007005735114002110910100003001010000010100021110002010110000111102520147101414698203000610101010000300107005870058700587004270058
400247005752510110100007010017004269702597162540018300161000230010100006170453342542149669777005770057646753649824001030020100006002020000700573511400211091010000300101000001010001111000100111000011111252014710145698203000610101010000300107005870058700587005870058
40024700575251011010000201001700426978159716254001830016100023001010000617045334254214966977700577005764675364984400103002010000600202000070057351140021109101000030010100000101000121100010021100001111225201471014469820300060101010000300107005870058700617005870058

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrb w0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057004752500010007003269781596952540104301031000130100100006160153342206496697070041700506464636495040100302001000060200200007005035114020110099100100003010010000110010000110000000100001100261017111698103000396910000301007004870048700487004870048
402047005052400010107003269781597062540104301031000030100100006160053342062496697070047700506464636495040100302001000060200200007005035114020110099100100003010010000010010000110000000100000100261017111698103000090910000301007005170051700367005170051
402047003552400010007008269735597092540104301031000130100100006160053342206496697070050700356464636495340100302001000060200200007005035114020110099100100003010010000010010000110000000100001100261017111698553000366910000301007005170048700487004870048
402047005052500010007003269735597062540104301031000030100100006160053342206496697070047700506464636495340100302001000061492200007006335114020110099100100003010010000010010000110000000100001100261017111698103000396610000301007005170051700487004870051
402047004752400010107003269735597092540104301001000130100100006160053342206496697070035700476464636493840100302001000060200200007005035114020110099100100003010010000110010000110000000100001100261017111698103000090610000301007004870036700367003670048
402047005052500010107003569735597062540104301001000030100100006160053342206496697070050700506464636495340100302001000060200200007005035114020110099100100003010010000010010000110000000100001100261017111698103000396610000301007004870051700517005170051
402047003552400000107003269735597062540104301031000130100100006160053342206496697070035700476464636495040100302001000060200200007005035114020110099100100003010010000010010000110000000100001100261017111698103000006910000301007004870048700487003670048
402047004752400000107002069781597092540104301031000130100100006161753342206496697070050700506464636495340100302001000060200200007005035114020110099100100003010010000010010000110000000100001000261017111698133000390910000301007005170036700517005170051
402047004752501010007003269735596952540104301031000130100100006160053341470496696770050700476464636495040100302001000061522200007005035114020110099100100003010010000010010000010000003100031100261017111698103000396910000301007005170048700487004870048
402047005052401010007002069781597062540104301031000130100100006160053342206496697070050700356464336493840100302001000060200200007005035114020110099100100003010010000010010000110000000100001100261017111698133000009910000301007004870051700517005170048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352511110021070041697705971025400143001310001300101000061698233420621049669550700507005064668036497540205300201000060020200007004735114002110910100003001010000010100000110000000100001010000252000271216981330003962810000300107005170051700487005170051
400247005652511110020070041697805971525400183001610002300101000061703633424941549669760700567005664720036497240010300201000060020200007004735114002110910100003001010000010100000110000000100001010000252051171116981330003962710000300107005170051700517003670036
400247005052500000010070035697605970925400143001310001300101000061698233422061549669700700507005064665036497540010300201000060020200007005035114002110910100003001010000010100000110000000100001010000252051171126981330003962710000300107011070053700487004870036
400247005052500001000070032697285970925400143001310001300101000061695233414701549669670700507004764653036497240010300201000060020200007004735114002110910100003001010000010100000010000000100000010000252052171126981030003962710000300107005170051700517004870051
400247005052400000011070020697605970225400143001310001300101000061699133422541549669700700507003564668036496140010300201000060020200007005035114002110910100003001010000110100000110000000100001000000252052271126979830003962010000300107005170051700517005170036
400247003552400000010070035697605970925400103001310001300101000061695233420621549669703700507003564665036497240010300201000060020200007004735114002110910100003001010000010100000110000000100001010000252052271116981330000992710000300107005170051700487005170051
400247005052500000010070035697605970925400143001310001300101000061698233414701549669670700477004764668036497240010300201000060020200007003535114002110910100003001010000010100000010000000100001010000252052271126981330003962710000300107005170051700517003670048
400247005052500000010070035697605970925400143002210001300101000061698233420621549669703700477004764668036497540010300201000060020200007005035114002110910100003001010000010100000110000000100001010000252052171126981330003962710000300107005170036700367004870036
400247005052500000011070035697605970925400143001310001300101000061698233422061549669670700507005064668036496040010300201000060020200007005035114002110910100003001010000110100000110003000100001010000252052171116981330003962710000300107005170051700487004870051
400247005052500000110070035697605970925400103001310001300101000061698233422061549669670700477005064665036497540010300201000060020200007017735114002110910100003001010000110100000110000000100001000000252052171116981630000992710000300107005170036700517005170051

Test 4: throughput

Count: 8

Code:

  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  ldrb w0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267232001145102266922181812258010010080000100800155001177116149236272670726727166556167538011520080024200160048267277211802011009910080000100800001100800003908003503580039610011151180160026724064800001002672326708267232672326723
80204267222000045102267122121216258010010080000100800155001167303149236272672726707166556166798011420080024200160048267277111802011009910080000100800000100800003908000003580039613543111511801600267040104800001002672326723267232670826723
8020426727200000000267122181802580100100800001008001550011665961492362726722267221665561667480115200800242001600482670772118020110099100800001008000001008000000800000080000613543111511801600267241064800001002670826708267232670826728
8020426722200000102267120121202580100100800001008001550011771161492364726727267271665561667480114200800242001600482670756118020110099100800001008000001008000039080035008000060043111511801600267191004800001002672826708267232670826708
8020426727200004100026692212016258010010080000100800145001167303149236272672726727166356166748011520280216200160048267117111802011009910080000100800000100800000080039008003900354311151180160026719060800001002674326723267082673026726
80204267272000045002266920001625801001008000010080014500116730314923647267272672716655616864801162008002420016004826728711180201100991008000010080000010080000008003903800396100111511801600267241004800001002670826708267082672326728
80204267072000045002267120181202580100100800001008001550011673031492364726722267271663561667480115200800242001600482672271118020110099100800001008000001008000000800350398003960350111511801600267041064800001002672326728267282672826708
8020426727200004110026692012181625801001008000010080015500116659614923642267272672216650616674801142008002420016004826727561180201100991008000010080000010080000008000003980000003543111511801600267240102800001002670826723267282670826728
80204267272000000002671221201625801001008000010080016500116659614923647267272672716655616679801162008002420016004826727711180201100991008000010080000010080000390800390398003960350111511801600267041004800001002672826728267082672826708
80204267272000041100267122121216258010010080000100800155001167303149236472672226727166556166748011520280024200160048267277111802011009910080000100800000100800003908003503980039010011151180160026704664800001002672326708267282672326723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526736200000004511267212121219258001010800001080000501168843104923647267082672816672316688800102080000201600002672856118002110910800001080000010800004380039042800396039445020041647267241010480000102672826709267292672826728
800242670820000000451126717212121625800101080000108000050116884300492362826727267081667231668880010208000020160000267287711800211091080000108000011080000438000013980039603943502006166626724010480000102672926729267282672926728
800242672720000000451126719301216258001010800001080000501168843004923628267272672716672316707800102080000201600002672777118002110910800001080000110800004380039039800396139435020071643267241010480000102670926729267282672926728
8002426728200000006112672221212162580010108000010800005011688430049236282670826708166723167078001020800002016000026728771180021109108000010800000108000043800390398003901390502004164326725010080000102672926728267282672826729
8002426728200000004511267182121216258001010800001080000501166750004923647267272672816672316707800102080000201600002672877118002110910800001080000010800004380039039800006139435020041658267251010480000102672826729267292672826728
80024267282000001045112670000016258001010800001080000501166750004923628267282672816652316688800102080000201600002672877118002110910800001080000010800004380039039800396139435020041666267241010480000102672926729267282672826729
8002426728201000004510267232121216258001010800001080000501168843004923648267282672716672316707800102080000201600002672856118002110910800001080000110800004380000039800396039435020071676267241010480000102672826729267292672826728
80024267282000000045112672131212192580010108000010800005011668960049236472672826728166723167088001020800002016000026727771180021109108000010800000108000008000003980039613943502003166426725010480000102672926728267292670926728
8002426708200000004511266982121202580010108000010800005011688430049236482672726728166523167078001020800002016000026731561180021109108000010800000108000008003903980039613943502004164326724010480000102672826728267092672926728
800242672820000100450126693012121625800101080000108000050116689600492364726728267281667231668880010208000020160000267287711800211091080000108000001080000080039039800396139435020071634267251010080000102671326728267292670926709