Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRH (register, lsl)

Test 1: uops

Code:

  ldrh w0, [x6, x7, lsl #1]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)191e1f223a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)616d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd0d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
1005389300100450123842181816251000100010001536203983982213257100010002000399811110011000100001020194201058101591038615542190730216223959921000400400400400400
100439931101065003384218181625100010001000154470399399222325610001000200039471111001100010000100003901035000391035613543007302162239110641000395395392395390
100439420000045012379212121625100010001000150180394394217325210001000200039471111001100010000100003901039000391039613943007302162239110641000395395395395390
1004394200000450023792121212251000100010001501803943942173252100010002000394711110011000100001000039010390003910396139430073021622391101041000395395395395395
1004394300000480023792121216251000100010001498903943942163252100010002000394711110011000100001000039010390003910396135430073021622391101041000395395395395395
100439430000045002379212121625100010001000147740394394217325210001000200039471111001100010000100003901039000391039613543007302162239110641000395395390390395
100438930000045002379212121625100010001000150670394394217325210001000200039471111001100010000100004301039000391039613943007302162239110641000395395395395395
1004394300000450123742181812251000100010001498903943942173252100010002000394711110011000100001000043010390003910396139430073021622391101041000395395395395395
100439430000045002379212121625100010001000150180389389217325210001000200039471111001100010000100003901035000391039613943007352162239110641000395395395395395
100439430000045002379212121625100010001000149890396394217325210001000200039471111001100010000100004301039000391035613943007352162239110641000390395390390395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)0e0f1e223a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
402057005562611112137002669702597122540104301031000230100100006160783342350149669737005370041646373649564010030200100006020020000700533511402011009910010000301001000001001000311100020111000011111261697149698193000360610000301007004270054700547005470054
402047005352410002137002069764597062540100301031000130100100006160153341470149669557004770048646433649504010030200100006020020000700493521402011009910010000301001000001001000000100000001000010000261697199698103000366010000301007004870048700487003670048
402047003552400001137003269764597062540104301001000130100100006161753342062149669677004770035646433649504010030200100006020020000700473511402011009910010000301001000001001000001100000001000010100261677194697983000000910000301007004870048700367003670048
402047003552400000137002069735596952540104301001000030100100006160153342062149669557004770035646433649504010030200100006020020000700473511402011009910010000301001000001001000001100000001000010100261697194698103000366010000301007003670048700487003670048
402047005154300001137002069735596952540104301001000030100100006160153342062149669557003570047646433649384010030200100006020020000700353511402011009910010000301001000001001000001100000001000000000261677199698103000360610000301007003670036700487004870036
4020470048525000001370032697355969525401043010310000302631000061601533414701496695570047700356463136495040100302001000060200200007004735114020110099100100003010010000010010000001000010010000001002616971109698103000366610000301007003670036700487004870048
402047004752500001137003269735597062540100301031000130100100006160153342062149669557003570035646433649384010030200100006020020000700473511402011009910010000301001000001001000001100000001000000000265197199698163000000010000301007003670036700487004870048
402047004752400100137003269735597062540104301031000130100100006160153342062149669557004770047646313649504010030200100006020020000700353511402011009910010000301001000001001000001100000001000000000261647199697983000300010000301007003670048700487004870048
4020470047525000011370032697355969525401003010010001301001000061601533414701496695570047700476463136495040100302001000060200200007004735114020110099100100003010010000010010000011000000010000100002616107194698103000306010000301007004870091700487004870048
40204700485240000113700206973559706254010430103100003010010000616015334147014966973700477004764643364940401003020010000602002000070035351140201100991001000030100100000100100000110000000100001010026161071109698103000360610000301007004870036700487004870048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f191e22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)5f60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40025700515250010110110070036697435969525400143001310002300101000061701833422540149669747003570035646693649764001030020100006002020000700513511400211091010000300101000011010000011000000001000011110025201571145698833000613101010000300107047370058700587005870058
40024704035251110110210070045697525971825400183001310002300101000061704533425420149669737005170035646693649794001030020100006002020000700513511400211091010000300101000001010000011000000001000010100025201471115698153001210101010000300107005370036700527005570036
400247003552500001101100700396977559710424001430013100013001010000617027334225400496695570054700546466936497640010300201000060020200007005435114002110910100003001010000010100000010000000010000101000252057161469814300030101010000300107005570052700557005670036
4002470051524000000000007003669743596952540014300131000030010100006170683342398014966974700517003564669364976400103002010000600202000070051351140021109101000030010100000101000001100000000100001111102520147114146982030006010010000300107005870058700587005870058
400247004152411110002100700266970259716254001430016100053001010000617045334254201496699870051700516466936497640010300201000060020200007005435114002110910100003001010000010100000110000000010000101000252067151469817300031010010000300107003670052700527005570036
40024700515250000000010070039697435969525400143001310001300101000061699133423980049669747005170035646693649764001030020100006002020000700513511400211091010000300101000001010000011000002012100001111102520147114569820300061010010000300107005870042700427005870058
40024700415251111000210170026697815971625400183001610002300101000061699533417690049669747005470051646723649604001030020100006002020000700373511400211091010000300101000001010000001000000001000000000025201417145698143000310101010000300107005270052700527003670052
4002470051524000000021007004269784597192540018300131000230010100006170453341769004966955700517005164653364979400103002010000600202000070051351140021109101000030010100000101000001100000000100001010002520147114146980330003100010000300107005570052700527005570052
4002470051524000000011017002669781597192540014300131000230010100006170453342542014966977700577005764675364982400103002010000600202000070057351140021109101000030010100001101000221100030011100001010002520147161469814300031013010000300107005270052700527005270052
400247005152400000001100700206974359695254001430010100013001010000616991334225401496697470054700516465336497940010300201000060020200007005135114002110910100003001010000010100000010000000010000001000252014711414697983000310101010000300107003670055700527005570036

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrh w0, [x6, x7, lsl #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0057

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f18191e1f22233a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4020570057525111000020101700426978859716254010430106100023010010000616078334254214966977700577005764653364944401003020010000602002000070041351140201100991001000030100100000100100031110002011100001111026101711169804300061010010000301007004270042700587005870058
402047005752510100002010170026697025970125401043010310002301001000061606833417690496696170041700416465336494440100302001000060200200007005735114020110099100100003010010000010010001111000200110000111112610171116980430006010010000301007005870058700427005970042
4020470041525110000020101700266978859716254010830106100023010010000616068334254214966962700657005864637364960401003020010000602002000070057351140201100991001000030100100000100100021110001011100001111026101711169820300061001010000301007005870042700587005870058
402047004152511100002010170042697025970125401043010310001301001000061606833425421496697770083700516463736496040100302001000060200200007005735114020110099100100003010010000010010002211000101110000011112610171116982030006100010000301007005870058700587004270042
40204700415241110000201007004269788597162540108301061000130100100006160683341769149669777005770057646533649444010030200100006020020000700413511402011009910010000301001000001001000211100020111000011110261017111698203000610101010000301007005870058700587005870042
40204700575251000000101017004269788597162540108301061000230100100006160683341769149669617004170041646373649604010030200100006020020000700573511402011009910010000301001000011001000110100010111000001011261017111698203000610101010000301007006170042700587005870042
40204700415251000000201007004269702597012540104301031000230100100006160683342542049669777004170057646533649634010030200100006020020000700573511402011009910010000301001000001001000111100010211000011111261017111698043000610101010000301007005870042700587005870058
402047004152510000001000170042697885970125401083010610001301001000061606833425420496697770057700416463736494440100302001000060200200007005735114020110099100100003010010000010010002111000201110000011102610171116982030003010010000301007005870058700427005870058
4020470041524110000010001700426978859701254010830106100013010010000616078334254204966980700417005764637364960401003020010000602002000070057351140201100991001000030100100001100100022110002001100000101126101711169804300061010010000301007005870042700587004270058
402047005752411000002001170026697885971625401043010610001301001000061607833425421496697770041700606465336496040100302001000060200200007004135114020110099100100003010010000110010002201000201110000111122610171116982030003100010000301007004270042700587004270058

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)0309l2 tlb miss data (0b)0e0f191e22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)cfd0d5map dispatch bubble (d6)d9ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40025700475251000011070035697285970925400103001010001300101000061695233420621496696770035700476466536497240010300201000060020200007005035114002110910100003001010000010100000110000001000011252007710145698133000390610000300107003670036700517005170048
400247003552500000110700326976059709254001430013100003001010000617068334220604966970700357003564665364960400103002010000600202000070116351140021109101000030010100000101000001100000310003002520014710106698133000066010000300107003670036700487005170048
4002470050524000001007002069760597092540014300131000130010100516169823342206149669557005070050646533649754001030020100006002020000700503511400211091010000300101000001010000011000000100001125200147101414698133000396910000300107003670036700517005170036
400247003552400010100700356976059695254001430013100013001010000617068334220614966970700357005064668364972400103002010000600202000070035351140021109101000030010100000101000001100000010000112520014710146698133000360910000300107008570051700367004870051
4002470050524001101107003269760597092540014300131000130010100006169823342206049669557005070050646533649754001030020100006002020000700473511400211091010000300101000001010000011000000100001025203147101514698133000396910000300107004870051700487004870048
400247003552400100000700326976059709254001030013100003001010000616982334220604966970700507005064668364960400103002010000600202000070050351140021109101000030010100000101000001100006010000112520014710145697983000000010000300107005170048700517005170051
400247005052500000010700206974359695254001430010100013001010000616952334147014966970700357005064668364960400103002010000600202000070050351140021109101000030010100000101000000100000010000112520014710514697983000396910000300107005170051700517004870051
400247005052410000000700376976059695254001430010100013001010000616952334220614966970700507003564668364972400103002010000600202000070035351140021109101000030010100000101000001100000010000012520013710614697983000390910000300107005170048700517005170036
400247003552400000100700206976059709254001430010100003001010000616982334220614966955700507005064653364975400103002010000600202000070035351140021109101000030010100000101000000100000010000102520047101414697983000390010000300107004870051700517005170048
40024700355251011060070020697605970925400143001010000300101000061698233422061496697070050700506466536497540010300201000060020200007004735114002110910100003001010000010100000110000001000010252005710613698133000396910000300107005170051700487005670051

Test 4: throughput

Count: 8

Code:

  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  ldrh w0, [x6, x7, lsl #1]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01)cycle (02)03l2 tlb miss data (0b)0e0f18191e2223243a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)c2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80205267272000100001000267222121162580100100800001008001550011683800492362726735267311665961692280115200800242001600482673156118020110099100800001008000001008000043800390041800006139440111511811600267240107800001002670826733267322673226708
8020426728200000000100026702011192580100100800001008001550011671980492365126727267311665961668380115200800242001600482672756118020110099100800001008000001008000043800390008000060394401115118016012673110104800001002673226732267082673226708
8020426731200000004410012672201018258010010080000100800155001166596049236512673126731166596166598011620080024200160048267315611802011009910080000100800000100800000800391208000001394401115118016002670410104800001002672826728267362673226739
80204267312000000044100126719211219258010010080000100800155001167198149236272673126731166596166598011520080024200160048267317711802011009910080000100800000100800004380000003880038600430111511801600267510147800001002673226708267282672426732
802042670720000000861001266922000258010010080000100800155001168380049236272670726736166356166838011320080024200160048267275611802011009910080000100800000100800004480038003880000613800111511801600267241407800001002673226728267082673226732
8020426727200000000100126714200162580100100800001008001450011683800492365126731267311665961668380115200800242001600482673177118020110099100800001008000001008000043800390038800006138001115118016002672410107800001002673226708267322673226728
8020426727201000000100026716011192580100100800001008001550011665961492365126731267071663561668380115200800242001600482673177118020110099100800001008000001008000043800390038800386039440111511801600267040104800001002672826708267282670826732
80204267312010010044100126772211202580100100800001008001550011665961492365126707267311665961667980115200800242001600482673177118020110099100800001008000001008000043800381008003860384401115118016002672714107800001002670826728267322673226728
8020426707200000004500002671421120258010010080000100800135001167198049236272672726707166596166798011520080024200160048267077711802011009910080000100800000100800000800380039800386138430111511801600267040147800001002673226732267322673226732
80204267312000000012000126716201192580100100800001008001550011665960492365126731267311665961668380115200800242001600482670781118020110099100800001008000011008000043800380008003860384401115118016002673210107800001002673826732267082670826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e0f18191e1f22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)a5ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80025267232000000110045000026693218121125800101080000108000050116766014923652026732267151667731671380010208000020160000267326411800211091080000108000011080020204208005702800800396135000050207161082672460280000102670926723267232672326811
800242672220000000000450001266930181216258001010800001080000501173354149236520267142673216678316695800102080000201600002671481118002110910800001080000010800202000800540001568000061000005020716892671906080000102672326709267292672826815
80024267272000000000047000026707212121625800101080000108000050117303614923635026714267321667731671280010208000020160000267326511800211091080000108000001080020194208005803303880000610430005020716862686960480000102672326729267292672926840
80024267342001100000021010326717018181625800101080000108000050116760504923628026727267081666731670880010208000020160000267287111800211091080000108000001080000039080056000177800006139430005020616882672400480000102670926728267292670926709
800242676320000000000450000267120121242580010108000010800005011676050492364202672226722166523167028001020800002016000026728561180021109108000010800000108000000080057131062800376119421900502010166102673009080000102673326734267342671626813
80024267312000000000045000026712212121625800101080000108000050116646404923652026732267321667731671280010208000020160000267328211800211091080000108000001080019190080057025038000060353900050208161072672406280000102672826709267232672326847
800242674020011100000650100266992181812580010108000010800005011667761492364702673126719166523167058018820800002016000026728711180021109108000010800000108000003908005700018680039603943000502061668267051010480000102672926709267292672326828
800242671520011100000650003267173018162580010108000010800005011669931492364802672826722166673167078001020800002016000026728711180021109108000010800000108000003908005701104480039610430005020916782670566080000102672826732267322670926733
800242690820110100000300002267171001525802701080000108017850117447404923648026708267271667231670880010208000020160000267277111800211091080000108000001080000039080057010144800356139390005020916992671906080000102670926876267232672726907
800242679120011111102658800126717218181625800101080000108000050116713204923642026729267101666731668880010208000020160000267227111800211091080000108000011080000000800571232218003861194219105020716692672990280000102673326733267332673326891