Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDNP (signed offset, S)

Test 1: uops

Code:

  ldnp s0, s1, [x6, #0x10]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 0e | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200540231610003871741920251000100010001545613774024028031351000200010004023821110011000100001000591025551025615544731161139910551010610001000403403403403384
2004402206110138718019202510001000100015546135740238310031161000200010004023821110011000100001000591055551055615544731161137910551010610001000384403384384403
200440231610013871741932510001000100014532137738240210031351000200010004024021110011000100011000591055551055615544731161138010551010010001000403384403383403
200438331611003871701920251000100010001445313774023838331351000200010004024021110011000100001000010255510556155073116113991055100610001000403384384403403
2004382302510136817419325100010001000146771377383402803135100020001000402383111001100010000100059105555102561550731161139910251010610001000403403384403403
20044023025101387170193251000100010001548113774024028031351000200010004023821110011000100001000591056251055615544731161137910551010610001000403383403403403
20044023062101367170022510001000100014532136040240210031351000200010004024021110011000100001000591055551055612544731161139910551010610001000403403403403403
20044023061101387174192025100010001000154561377402383100311610002000100040240211100110001000010005910255510566055073116113991055010610001000403403403403383
200440230611013681741920251000100010001445313774024021003135100020001000382383111001100010000100059105555105561554473116113991055100610001000403403403384403
200440230610013871740202510001000100015499137738340210031351000200010004023831110011000100001000591055551055615544731161138010551010610001000384403403403403

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldnp s0, s1, [x6, #0x10]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205120047899000110010100120020119494109458256011440100100001000030100100001000010795395736035611788211200230120050120050111881311237450100302002000010000602001000010000120050120035115020110099100401001000010000010010000010000101210001100032101108111196461000040002668100001000040100120051120048120048120048120051
6020412004789900000000000012002011949410945825601004010210001100003010010000100001079539573529361178820120023012003512005011188131124165010030200200001000060200100001000012004712003511502011009910040100100001000001001000011000000010000100032101108111196551000040000968100001000040100120036120051120036120051120051
6020412005089900000000000012003511949410944325601034010210000100003010010000100001079539573603561175990120026012005012003511189531123745034430200200001000060200100001000012005012004711502011009910040100100001000001001000011000000010000110032101108111196461000040000608100001000040100120048120048120051120051120048
6020412005089900011000010012002011950310945825601004010210000100003010010000100001079539573529361175990120011012003512003511190331124215010030200200001000060200100001000012004712004711502011009910040100100001000001001000011000000010000110032101100111196461000040000608100001000040100120036120048120036120051120051
6020412003589900010006010012003211950310944325601034010010001100003010010000100001079348573588861178820120011012005012003511189531124215010030200200001000060200100001000012003512004711502011009910040100100001000001001000011000000010001110032101108111196551000040002908100001000040100120051120036120048120036120051
6020412003589900000001000012002011949410945825601034010210001100003010010000100001079566573529361175990120026012005012004711188131124215010030200200001000060200100001000012005012003511502011009910040100100001000001001000011000000010000100032101135111196461000040000960100001000040100120051120036120051120051120051
6020412003589900000001000012003511949410945825601034010010001100003010010000100001079539573529361178820120011012005012003511189531124165010030200200001000060200100001000012005012004711502011009910040100100001000001001000011000000010000010032101108011196571000040000960100001000040100120036120036120036120048120051
6020412003589900000001000012003511949410944325601034010210001100003010010000100001079539573603561178820120026012003512004711189531124215010030200200001000060200100001005312005012004711502011009910040100100001000001001000011000000010000110032101135111196551000040002068100001000040100120048120036120048120036120051
6020412003589900000000010012003511949410945825601034010210001100003010010000100001079126573603561178821120011012003512005011189531124165010030200200001000060200100001006412008312012011502011009910040100100001000001001000001000000010000110032101108111196551000040000968100001000040100120036120036120051120051120036
6020412005089900010001010012003511949410945825601034010010001100003010010000100001079539573603561175991120011012005012005011190331124215010030200201261000060200100001000012005012003511502011009910040100100001000011001000011000000010000110032101100111196551000040002008100001000040100120051120051120048120051120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600251200578991100000020101120042119490109464256001640014100011000030010100001000010798705736518612605100120036120057120041111928311245450010300202000010000600201000010063120057120041115002110910400101000010000010100013110002001110000111110314018990613119675100034000210109100001000040010120061120050120058120058120042
600241200579001001000020100120026119484109449256001640014100021000030010100001000010798705736518612350001120033120057120057111925311245450010300202000010065600201000010000120057120057115002110910400101000010000010100022110003001110000111100314017990176119675100014000210109100001000040010120061120058120042120058120058
6002412004189910010000200011200421194841094642560016400121000210000300101000010000107991557363746126051001200361200571200411119253112454500103002020000100006002010000100001200411200571150021109104001010000100001101000321100030111100001111203140139901717119675100014000410100100001000040010120042120058120042120058120058
600241200578991001010014000012002611949010944925600134001410002100003001010000100001079870573637461260510112003312005712005711192531124385001030020200001010860344100001000012005712004111500211091040010100001000001010002111000200141000011011031401899715611967510001400041009100001000040010120058120058120058120058120058
60024120057899110100001010012004211948410946425600164001410002100003001010000100001079915573559361260510112003312005712005711192831124545001030020200001000060020100001000012004112004111500211091040010100001000001010003311000100111000011110031406990717119675100014000210109100001000040010120058120058120042120042120058
60024120041899110100002000012004211949010946725600134001410002100003001010000100001079870573637461260510112003312005712006011192531124545001030020200001000060020100001000012005712005711500211091040010100001000001010002201000100241000011011031401799071711967510001400040109100001000040010120058120042120058120058120045
600241200578991101000010100120026119491109464256001640014100021000030010100001000010799155736374612605101120036120041120057111928311243850010300202000010000600201000010000120060120057115002110910400101000010000010100023110002000110000011120317614990715119675100014000410100100001000040010120058120042120059120061120058
600241200609001100000020100120042119490109464256001640012100021000030010100001000010798705736374612360501120033120057120057111925311245450010300202000010000600201000010000120041120057115002110910400101000010000010100033110002001110000010100314017990171711965910001400041009100001000040010120042120058120042120061120058
600241200579001101000020100120042119490109464866002840014100041000030152102101000010917205748386612605101120039120057120057111909311245450010301802000010000603401000010000120057120057115002110910400101000010000010100022110001001110000011110330417990187119675100014000410109100001000040010120802120564120632120584120058
600241200579001100000020100120042119490109464256001640012100021000030010100001000010798705736374612605101120020120041120057111925311245750010300202000010000600201000010000120041120057115002110910400101000010000010100012010002000110000111100314069901751196751000140004101012100001000040010120058120042120058120042120042

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldnp s0, s1, [x6, #0x10]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0056

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6020512005390010014110020100120142119511109453256010640104100031000030100100611000010794145738870611938001201001201581201431119523112417501003020020000100006020010000100001200561200561150201100991004010010000100001100100011110001001100001000032115110551196651000140004968100001000040100120057120044120045120044120057
6020412005889910110100420101120040119511109450556011540143100041000430100101061005010869475750029612245601200171200411200561118973112409501003020020000100006020010000100001200561200531150201100991004010010000100001100100012010003024100001111032105121551196681000140002968100001000040100120042120054120056120057120441
6020412005689910100101058352101120418119511109466256010640102100021000030100100001000010794145735737612195301200171200531200541118993112415501003020020000100006020010000100001200531200531150201100991004010010000100000100100012010001011100001111032104121551196511000140004668100001000040100120057120057120054120042120042
6020412004189910010100070101120041119497109463256010340104100021000030100100001000010793685736182612119201200171200571204461119023112417501003020020000100006020010000100001200561200531150201100991004010010000100001100100022010002101100001111132104121551196621000140004000100001000040100120057120042120058120057120057
6020412004189910110111020100120038119530109463256010340104100021000030100100001000010793685736326612040401200291200561200411119023112415501003020020000100006020010000100001200411200411150201100991004010010000100000100100011010001001100001101132105121551196681000140002008100001000040100120057120057120054120042120042
6020512005690010010010020000120041119497109461256010640126100011000030100100001000010794145736182612005001200171200561200531118973112417501003020020000100006020010000100001200561200531150201100991004010010000100000100100032110001004100000111132105100441196681000140004065100001000040100120057120057120057120057120222
6020412005989910010100050100120026119511109463256010640102100011000030100100001000010793685739686611942201200291200561200561118973112415501003020020000100006020010000100001200561200531150201100991004010010000100000100100022110001021100000111132105110551196511000140002005100001000040100120042120057120042120057120042
6020412005690011100110020000120038119511109463256010640104100021000030100100001000010793685739254611943201200321200561200561118973112415501003020020000100006020010000100001200411200411150201100991004010010000100000100100021010001001100001101132105121551196511000140002960100001000040100120057120102120057120042120057
6020412005689910100101020010120026119530109449256010640104100021000030100100001000010794145736182612056101200321200411200531118993112417501003020020000100006020010000100001200411200531150201100991004010010000100000100100011110001001100001111232105100541196681000140004968100001000040100120057120042120042120057120042
602041200629001010111004130010120041119511109463256010640104100021000030100100001000010794145736182612082401200291200561200531119023112409501003020020000100006020010000100001200411201061150201100991004010010000100000100100000110000000100001101032104100551196681000140004608100001000040100120058120054120054120057120042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 07 | 08 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600251200518990110001000012002011948910944325600134001210000100003001010000100001079888573901261265180120030120054120054111922311243750010300202000010000600201000010000120051120051115002110910400101000010000110100000110000402710000110003140059943119672100004000213109100001000040010120036120055120057120061120055
60024120054899000000100001200201194871094612560013400121000010000300101000010000107988857360846128433012003012005412005411192231124485001030020200001000060020100001000012005112005111500211091040010100001000001010000111000000010000000013140049934119672100004000613012100001000040010120052120052120055120055120055
6002412005489900100000100120039119487109443256001340010100001000030010100001005510798585741744612718601200301201101200841119033112437500103002020000100006002010063100001200621200351150021109104001010000100000101000001100000024310000110003140049443119650100004000213012100001000040010120055120036120055120055120055
60024120078899000000000001200201194871094612560013400101000110000300101000010000107988557360846125746012001112005412005411190331124375001030020200001000060020100001000012003512005111500211091040010100001000001010000011000000204100001000031400599431196731000040002101312100001000040010120055120055120055120055120052
6002412008289900000010100120020119489109443256001340012100011000030010100001000010798945735293612574601200271200541200541119223112437500103002020000100006002010000100001200541200511150021109104001010000100000101000001100000023710000010003140049935119672100004000213139100001000040010120036120036120055120036120055
6002412012090200000000000120039119487109443256001340012100011000030010100001000010798585736230612434401200111200541200541119033112451500103002020000100006002010000100001200541200511150021109104001010000100000101000001100000020710000110003140059443119672100004000013100100001000040010120056120052120052120055120055
6002412012589900000040000120036119489109461256001340012100011000030010100001000010798975735293612574601200271200541200511119033112437500103002020000100006002010000100001200541200351150021109104001010000100000101000001100000025510000110003140039954119672100004000013109100001000040010120055120055120036120055120036
600241200749000000242433182200000122400120578110148792602864018910048100523354011419112861141461579113561804250120401122469122426112469365112586500103002020000100006002010000100001200541200511150021109104001010000100000101002801100361291911100360100039430634056121880100404023901312100001000040010123798123244123959123335123257
600241237749562100014000012003911948410945925600134001210003100003001010000100001079769574581761795020121905121693120054111922311245150010340732288811554722561192211559123026122612321500211091040010100001000011010046201004000898071000011000336307345481220721004040002171312100001000040010120801120058120143120061120036
600241200479300010001000012003911948910944325600134001210000100003001010000100001079769573623061257460120030120054120054111903311245150010300202012410000600201000010000120055120051115002110910400101000010000010100000110000001681000011000314004994311967210000400001009100001000040010120036120036120055120055120055

Test 4: throughput

Count: 8

Code:

  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  ldnp s0, s1, [x6, #0x10]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160205267342011100045101267330112025801001008000010080000500116814002668526707267076650366858010020016000020080000267312672711802011009910010080000800001100800004380000039800396104451101161126704800391010080000800001002687626814267412673426870
1602042673120000022450012671221218025801001008000010080178500117634412670226731267076650366898010020016000020080000267272672711802011009910010080000800001100800000800381388003961394351101161126728800001414780000800001002672026844267182675226737
160204267312000000000002669200120258010010080000100800005001174628126682267312673166303666580100200160000200800002672726727118020110099100100800008000001008000044800380388003861044511011611267048003800780000800001002686826804268772674426732
16020426731200000000000267122112725801001008000010080000500117488712670626707267316654366898010020016000020080000267072670711802011009910010080000800000100800004380038008003800394451101161126728800391014080000800001002687826817267922674126732
1602042672720100000440012671621042580100100800001008000050011687541267062673126727665436685801002001600002008000026727267071180201100991001008000080000010080000080000038800386004351101161126724800381414780000800001002678226859267132673626732
160204267072000000044101267162121232580100100800001008000050011688801267062673126707665436689801002001600002008000026731267071180201100991001008000080000010080000080038038800380138445110116112672880038014780000800001002687026796267212737326749
1602042670720000000440002671621131258010010080000100800005001174887026706267312673166543668980100200160000200800002673126727118020110099100100800008000001008000043800380388000061384351101161126728800001410480000800001002688526782268332671726732
1602042673120000000440012671201136258010010080000100800005001174887026682267312670766543668980100200160000200800002670726727118020110099100100800008000001008000043800380388000000394451101161126728800381414080000800001002688826755267432670826740
1602042673220000000441002671921215258010010080000100800005001169085026702267312673166303666580100200160000200800002673126727118020110099100100800008000001008000043800000418003861394351101161126728800381414080000800001002689726788268022674026728
1602042673120000000010026716211102580100100800001008000050011748871267062673126731665436689801002001600002008000026707267271180201100991001008000080000010080000438000013880038603905110116112672880038014780000800001002682026738267282675126714

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025271552011220360002713837012618001010800001080000501169187267502674126741669136722800102016000020800002671926739118002110941010800008000011080020195880073102618003961420191502051655267338007410058000080000102672526992267642674826742
1600242674120011118100226728277242580010108000010800005011720042671726722267416667367258001020160000208000026741267361180021109010108000080000010800191908007300175800396059451905020416442673880034101008000080000102673326749267252691026750
16002426740200110081003267272720625800101080000108000050116912626694267412671966823669980010201600002080000267362673611800211094101080000800000108001919588007310175800396160451935020516552674080073111058000080000102676026750267432674226743
1600242671920011006600326727237719258001010800001080000501183608267112671926736666436716800102016000020800002671926719118002110901010800008000001080020200800731017580021615901925020316532673880073101008000080000102674326984267242672826743
160024267362001111810022670537723258001010800001080000501167722267112674126724666436699800102016000020800002673626736118002110901010800008000001080019200800411016080015617345191502051633267338006101308000080000102675526720267372673726742
1600242673620011006700226721157952580010108000010800005011701602669426722267226687367168001020160000208000026719267411180021109010108000080000010800201958800580016480015617401915020316552671880074101058000080000102675126751267462675626743
1600242673620011006600326726150714025800101080000108000050116886526695267192674166653672180010201600002080000267392673611800211090101080000800000108001920588003300144800546073431905020416442673880034101358000080000102672426724267252679626737
160024267422001110360022672637072580010108000010800005011687132671626719267366665366998001020160000208000026736267191180021109010108000080000010800212008005920175800396059451905020516552673880058131358000080000102674326737267232674326720
160024267422001010671022672116720252580010108000010800005011692952671126736267366681367168001020160000208000026736267361180021109010108000080000010800211958800340013680015017345190502041623267398003401358000080000102674326729267372675626737
16002426736200101036102267263720222580010108000010800005011721872671126736267196664367228001020160000208000026736267411180021109410108000080000110800191958800730016080040017443191502041634267398006001058000080000102680226744267532674226722