Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (register, sxtw)

Test 1: uops

Code:

  ldrh w0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005394310044101383211192510001000100015274398394221325210001000200039877111001100010000100043103903810386139437311611391141071000399395399395395
1004398300044101383211192510001000100015208398394221325610001000200039877111001100010000100043103803810386139447311611391101071000399395395399395
1004398300044001383211192510001000100015267398398221325610001000200039877111001100010000100043103803810386139447311611391101041000399399399399399
1004398300044001383211192510001000100015267398398221325610001000200039877111001100010000100044103803810386139447311611395141471000399399399399399
10043983000441013832112192510001000100015018394398216325210001000200039877111001100010000100043103903810386139447321611395141071000399399399399399
1004398300044001383211192510001000100015370398398221325610001000200039877111001100010000100044103803810386139437311611395141071000399399395399399
1004398300044001383211162510001000100015267398398221325210001000200039477111001100010000100043103903910396139437311611391101041000395395395395395
10043943000441013832121192510001000100015274398398221325610001000200039877111001100010000100043103803910396139447311611391141041000399399399399399
1004398300044001383211192510001000100015208398398221325610001000200039877111001100010001100044103803810386138447311611391141471000399395399395399
10043983010450013832121192510001000100014989398398221325210001000200039877111001100010000100043103903810386139447311611395141041000399399395395399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057525110002000700266970259719254010830106100023010010000616087334287814966980070060700606465636496340100302001000060200200007006035114020110099100100003010010000010010001111000120110000111120026101711169823300060101310000301007006170061700617006170055
40204700575251000010007004569791597192540104301031000230100100006160683341769149669800700607006064656364944401003020010000602002000070060351140201100991001000030100100001100100022110001400110000110110026101711169804300030131310000301007004270061700617005870108
40204700625591000010007004269791597012540104301061000230100100006160953342686149640500700607006064728364963401003020010000602002000070060351140201100991001000030100100000100100021110001531110000110110026101711169804300031301310000301007005870061700617015470089
40204700415251110021007004569722597012540104301031000230100100006160953342686149669610700577005764664364963401003020010000602002000070041351140201100991001000030100100000100100052110001130110000111110026101711169804300061301010000301007008970058700587005870058
4020470041525111002001700426990059719254010830103100023010010000616095334254214966980070041700606465336496340100302001000060200200007006035114020110099100100003010010000010010001111000201110000111100026101711169823300061301310000301007006170042700587014470042
402047006052410000200070045697885975325401043010610002301001000061607833426861496698007006070041646613649444010030200100006020020000700573511402011009910010000301001000001001000231100040011000011110002610171116982330006001310000301007005870058700587005870059
402047004152410100210070028697885970125401083010610002301001000061609533426861496698007005770041646373649634010030200100006020020000700603511402011009910010000301001000001001000221100010011000011110002610171116982330006013010000301007004270042700587014070061
402047005752511100110070045698665975225401083010310002301001000061609533426861496696107005770041646563649634010030200100006020020000700603511402011009910010000301001000001001000220100030211000011111002610171116980430006100010000301007004270061700427018070061
4020470041524100001100700426978859778254010430103100023010010000616095334268614966985070060700416465636494440100302001000060596200007006035114020110099100100003010010000010010001111000201110000111100026101711169823300061313010000301007006170061700587015170058
40204700605241000120007004569874597172540104301031000230100100006160953342686149669800700607004164637364960401003020010000602002000070041351140201100991001000030100100000100100022110001001100001111100261017111698233000613131310000301007006170061701407012270058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570035525011101107003269728596952540014300131000230010100006170683342062049669670700847004764653036497240010300201000060020200007004735114002110910100003001010000110100001100000010000111002520027111698133000396910000300107005170036700367003670036
4002470047525000001007003269728597062540014300131000130010100006169523341470149669750701477007064669036497540010300201000060020200007003535114002110910100003001010000010100001100000010000101002520027122697983000360910000300107005170048700517005170048
4002470047525000001007003269728597062540014300131000030010100006169523342062149669670700507004764665036497240010300201000060020200007003535114002110910100003001010000010100000100000010000101002520017122698133000096910000300107003670036700367004870048
4002470050524000001107003269728597062540014300131000130010100006169823342062049669550700477004764665036496340010300201000060020200007004735114002210910100003001010000010100001100000010000101002520017112698103000366610000300107005170036700517004870048
4002470047525100011107003269728597092540014300101000130010100006169823342206149670190700477004864665036497240010300201000060020200007003535114002110910100003001010000010100001100000010000101002520027121698133000366610000300107004870048700487003670048
4002470035524000001007003569728596952540014300131000130010100006169523342062049669600700477003564665036497540010300201000060020200007004735114002110910100003001010000010100001100000010000101002520027121697983000366910000300107003670051700517005170036
4002470035525000101107003269743597062540010300131000130010100006169523341470049669670700477004764665036497740010300201000060020200007003535114002110910100003001010000010100000100000010000101002520017111698103000366010000300107005170051700487005070036
4002470047525000001107002069760597062540014300131000130010100006169523342446149669730700357005064668036497240010300201000060020200007003535114002110910100003001010000010100000100000010000101002520027121698133000309910000300107004870048700517004870048
40024700475240000011070035697605970925400183001310001300101000061695233422061496697207004770047646650364972400103002010000604082000070047351140021109101000030010100000101000011000000100001010025200171116986130003661710000300107004970051700487003670048
4002470050525000101007003269728597062540014300131000130010100006170683341470049669730700477004764668036497540010300201000060020200007005035114002110910100003001010000010100001100000010000001002520027111698133000090610000300107004870048700487004870051

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700475241010107003269784597162540104301031000130100100006160153342062049669677004770047646433649504010030200100006020020000700473511402011009910010000301001000011001000011000000100001100261017111698103000366610000301007004870048700487011970048
40204700475250010107003269735597412540104301031000130100100006160153342062049669677003570047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111698103000366610000301007010970048700487014470048
40204700475240010107003269735597582540104301031000130100100006183973341470049640077004970047646433649504010030200100676020020000700473511402011009910010000301001000001001000001000000100001100261017111698103000366610000301007004870048700487013170048
40204700475250060107003269735597452540104301031000130100100006160153342062049669677004770047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111700923000366610000301007004870393700487004870048
40204700475240010017003269735597952540104301031000130100100006206223342158049669677004770047646433649504010030526100006020020000703213511402011009910010000301001000001001000011000010100001100261017111698103000006610000301007004870048701307005870051
40204700475240000017003269820597072540104301031000130100100006160153342062049669677004770047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111698103000366610000301007004870048700487012670048
40204700475240060017003269816597082540104301031000130100100006160153342062049669677004770047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100000100261017111698103000366610000301007004870048700487013770048
40204700475240010017003269735597292540104301141000130100100006160153342062049669677004770047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000010100001100264617111698103000366610000301007016070038700487004870052
40204700475250010017003269884597302540104301031000130100100006160153342062049669677004770047646433649534010030200100006020020000700473511402011009910010000301001000001001000811000312273100000100261017111698103000366010000301007005570230700657019270048
40204700475250010017003269797597612540104301031000130100100006160153342062049669677003570047646433649504010030200100006020020000700473511402011009910010000301001000001001000011000000100001100261017111698103000366610000301007004870048700497004870080

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0059

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 0e | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 43 | 49 | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257006552400000014010700441169784598872540018300131000230010100006169123341867149669990700597004364677364984400103002010000600202000070043351140021109101000030010100000101000000100010011000111102520671346982230006010010000300107006470060700607004470060
40024700595240000042010700441169783597032540018300161000230010100006169123342686149669790700437004364661364984400103002010000600202000070059351140021109101000030010100000101000902100010011000111002520371426982230006100010000300107006070044700607006070044
400247006152440000020007004911697045971825400183001610002300101000061691233418671496698907009970064646613649844001030020100006002020440700593511400211091010000300101000001010000021000100710001110025205713369806300061010010000300107004470060700607006070066
400247040552500000020107002811697835970325400183001310001300101000061706333426381496697907005970043646773649684001030020100006002020000700593511400211091010000300101000001010000001000100110001110025203713269822300060101010000300107006070060700607006070060
4002470043524000000200070028116970859913254001430013100023001010000617063334263814966966070059700606466136498640010300201000060020200007004335114002110910100003001010000010100000210001004100130110252047123698223000610101010000300107006070060700607004470060
40024700595250000001000700441169785597032540018300131000230010100006169123341867149669890700657006664677364984400103002010000600202000070043351140021109101000030010100000101000002100010041000100102520271426982230006100010000300107006670424700607004470060
4002470059524010003700070028106978359718254001430016100023001010000617063334273414967007070043700596466136498440010300201000061332200007004335114002110910100003001010000010100000210001601100010110252027122698223000610101010000300107006070044700607006070060
400247005952500000010107004411697835971825400143001610002300101000061706333426381496697907006870043646613649684001030020100006002020000700433511400211091010000300101000001010000021000100110001110025203712269806300061010010000300107006070060700607006070044
400247005952400000020007002810697835970325400143001610002300101000061706333426381496701507005970059646613649684001030020100006002020000700593511400211091010000300101000001010000221000100110001111025202714469822300030101010000300107006070060700607006070060
400247004352500000010107004411697895972325400183001610001300101000061708133426381496699007005970044646613649844001030020100006002020000700593511400211091010000300101000001010014001000100110001011026202714369825300061010010000300107042670061700607006070044

Test 4: throughput

Count: 8

Code:

  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  ldrh w0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267282001110440012671521212162580100100800001008001550011673030492364726727267271663561667980115200800242001600482672777118020110099100800001008000001008000043800380388000061394411151181162672410104800001002672826718267282670826708
802052670720000004400126722211192580100100800001008001550011686920492362726731267311665961667980114200800242001600482670756118020110099100800001008000001008000043800390080039600431115118016267240104800001002673226708267322673226732
802042673120000004400126697001216258010010080000100800155001177116049236272673126707166596166838011420080024200160048267317711802011009910080000100800000100800004380000038800386139441115118016267280147800001002673226708267322673226732
8020426707200000044001267172111925801001008000010080016500116659604923627267312673116659616679801152008002420016004826707771180201100991008000010080000010080000438000003980000610431115118016267041004800001002672826728267282670826972
802042672720000004500126695001202580100100800001008001450011665960492366526711267101663561665980116200800242001600482672777118020110099100800001008000001008000043801300080038013944111511801626704004800001002672826732267322670826732
802042670720000004400026719200025801001008000010080013500116659604923651267072673316667616683801142008002420016004826707771180201100991008000010080000110080000080038039800006139431115118016267240104800001002672826728267282672826728
802042672720000000001266920121192580100100800001008001550011683801492364726731267311665961665980114200800242001600482673177118020110099100800001008000011008000043800000080039013801115118016267240147800001002673226732267082673226732
802042673120000004400126719211182580100100800001008001750011671980492362726707267311663561668380114200800242001600482672777118020110099100800001008000001008000043800380398003961001115118016267240100800001002670826708267282670826728
802042672720000004600026725212016258010010080000100800145001167303149236512673126731166356166838011520080024200160048267277711802011009910080000100800000100800004380038008000061384311151180162672814104800001002673226732267082672826732
8020426731200011000012671720119258010010080000100800155001168380149236512670726707166556166838011420080024200160048267315611802011009910080000100800000100800000800000398000001394311151180162672414147800001002673226732267322673226728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252673320011101100220103267172180162580010108000010800005011653044923658267342671416677316694800102080000201600002671481118002110910800001080000110800191942080057000218003861194219150200516532672909280000102671526715267152673326734
80024267152011111010065010226717201815258001010800001080000501168280492365726733267151667731671280010208000020160000267328111800211091080000108000001080020200080019001598013061574219150200316352671299280000102673326715267152673326716
8002426714200111101002101022671721818162580010108000010800005011676604923771267402673216680316712800102080000201600002673281118002110910800001080000010800201942080058101598003861574219150200516352672999280000102673426733267152673326716
80024267322001110000021010126700001802580010108000010800005011702044923656273162673616684316712800102080000201600002673364118002110910800001080000010800191942080057000598000061574219150200516532672990280000102673326734267342671626733
800242671420010110000210001267002181815258001010800001080000501167599492366526733267321667731671280010208000020160000267338111800211091080000108000001080020190080019000598003861574219150200516532671209080000102673326733267332673326734
8002426732200101000006500032669901818152580010108000010800005011675994923652267402673216677316712800102080000201600002673364118002110910800001080000010800202042080055000218003860584219150200516552672999280000102673326733267332673326715
80024267322001111000065000026717201812580010108000010800005011702044923650267322671516660316694800102080000201600002673281118002110910800001080000010800212142080019101598000001574119050200316552672909280000102673326734267162671626733
8002426732200100100002101032670020181625800101080000108000050116729849236652673226715166603167128001020800002016000026715821180021109108000010800000108002020008005700059800000157019150200516452671299280000102673426733267332671526716
800242673220011000000640101267002018152580010108000010800005011668154923652267342672316671316694800102080000201600002673581118002110910800001080000010800212040080019201588003801194219150200516752672990280000102673326733267332673326832
8002426715200111100006500012669920181258001010800001080000501168695492366826732267321667731669580010208000020160000267328211800211091080000108000001080021204208005700221800396057019150200516352672909080000102673326733267332673326716