Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LD1R (8B)

Test 1: uops

Code:

  ld1r { v0.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.003

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
620052856121311510151004810525727939001162922001100410001000100050001193820022710283062833931020001000100010001000282012821011610011000100001003131002021100013100138481016572103452955196003366382510464027904147011245013288100010002809128142281652825228307
62004283002120150016000386105263280231111646620001004100010001000500011924500226942804128194310200010001000100010002835228082116100110001000010000210030011003013001351710084704233971237196713240381710404327959138971279013301100010002821128509284242833828066
6200428390213012001710030310488327970110164472002100410001000100050001194830022680280732836931020001000100010001000281792808511610011000100001000031003003100313200135441011271493450838194843339381712343227813146791225113315100010002812828240283232834628236
620042825721201300150001510508327988111163652004100410001000100050001194970022685280922846831020001000100010001000282672817511610011000100001000001003001100101200138461028572443341936196293399381911383627901139521253313319100010002819028222281962804828434
62004284022110170019000311051912803311116206200210021000100010005000119407002267528131281543102000100010001000100028306283521161001100010000100003100300110031130013627985570613397114019489349138149373528027153191214113427100010002833728335283272839428293
620042809621301800180110104965282570101628220041002100010001000500011928100022699283772821631020001000100010001000280762838211610011000100011000031001000100003300139621032769933414838197993341381514394327808145131252413342100010002803328233282272822928237
6200428500213019001300026104991279510111612620001004100010001000500011943900226522826228594310200010001000100010002839328217116100110001000010000210000001000133001388810168720234621038196233387380814404427998147701256713610100010002815728292283442822528267
620042832221201300160004105090280420111623120041004100010001000500011947120022664284822836831020001000100010001000281572820511610011000100001000031000003100310300138731039172633230123619817342438089364227991145271227413119100010002820028180281612821128316
620042823021201600200002321051502822911116198200010041000100010005000119078002272428240285103102000100010001000100028225283131161001100010000100003100100310012330013920105367198348183119704331938128423827971143711241313323100010002831428325284662822128281
620042837621101700160002900518728068110162182004100410001000100050001190090022708281382822831020001000100010001000283092837711610011000100001000031000003100020300139861036873033425732195263414381812443927883140461238113172100010002822828269282552816228490

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.8b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205140094104900110042401014003513941012936225701004010020003100003024220000100001264045669368514308701140026014003514003513183231323996010030200100002000060200100002000014003514003511502011009910040100100001000001001000001000023710910000101000321011221113954640000069100001000040100140051140052140157140051140036
602051400791058000000229901014005914064013014088970483403242010010037343272303411477133523767571461445052814221901428951431761324344741338776952635152115992276869762116382309514290514270431150201100991004010010000100000100100451100210310001000200321011221013954640000909100001000040100140051140379140051140051140048
6020414005010521100310916440001400321394101293592570102401002000210000301002000010000126404566935381431063314002301400351400501317923132402601003020010000200006020010000200001400501400471150201100991004010010000100000100100001100000010000101000321011271113955240000906100001000040100140051140036140048140051140036
60204140035104900000010011400351394071293472570100401002000210000301002000010000126380366929471431063314002601400351400471318253132402601003020010000200006020010000200001400501400471150201100991004010010000100000100100001100003010000001000321011271113954640006609100001000040100140051140051140036140036140048
602041400351049000000250011400321394071293622570102401002000210000301002000010000126413666929471431112814001101400471400361318513132382601003020010000200006020010000200001400501400471150201100991004010010000100001100100000100000010000101000321011391113954640000906100001000040100140036140036140048140036140048
60204140035104900000010011400201394101293622570102401002000010002301002000010000126404566935381431073714002601400501400501317923132403601003020010000200006020010000200001400351400351150201100991004010010000100000100100001100000010000101000321011221113955240000909100001000040100140036140051140051140051140036
60204140047104901000000011400201394101293592570102401002000210000301002000010000126413666929951430870114001101400351400501317923132382601003039410000200006020010000200001400471400471150201100991004010010000100000100100001100000010000000000321011221113954640000660100001000040100140051140051140051140051140051
60204140047104900000010001400321394101293622570102401002000010000301002000010000126413666936851431112814001101400471400501317923132402601003020010000200006020010000200001400501400471150201100991004010010000100000100100001100000010000101000321011221113965240000660100001000040100140048140051140051140051140048
60204140050104900000000001400321394061293592570102401002000010000301002000010000126404566935381430870114002301400351400471318373132382601003038910000200006020010000200001400501400471150201100991004010010000100000100100000100000010000101000321011271113955540000669100001000040100140051140051140051140036140051
60204140050104900000010001400201394101294142570102401002000210001301002000010000126404566941141431094514002301400471400521318573132402601003020010000200006020010000200001400351400351150201100991004010010000100000100100001100000010000100000321011271113954640000960100001000040100140048140048140051140036140036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002514004910481007511014080313979812937025700124001020002100003001020000100001264443669541014328846014002314004714005013181503132431600103002010000200006002010000200001400501400351150021109104001010000100001101000001100000010000110314061134213955440000960100001000040010140051140036140051140051140051
6002414004710481000010014002013939412936225700124001020002100003001020000100001264443669627414326216014002614005014005013181803132420600103002010000200006002010000200001400351400351150021109104001010000100000101000001100001010000100314021134513956940000009100001000040010140036140036140048140051140051
6002414003510490000000014003513939712935925700124001020002100003001020000100001264443669496314325931014002314003514004713180303132420600103020710000200006002010000200001400501400471150021109104001010000100000101000001100000010000010314041134313956640000969100001000040010140051140036140051140051140036
6002414005010490000001014003213939412936225700104001020002100003001020000100001264443669733014325829014002614005014004713181803132498600103002010000200006002010000200001400471400351150021109104001010000100000101000001100000010000101314021132413956940000999100001000040010140148140048140051140051140051
6002414005010490010011014003513939412935925700124001020000100003001020000100001264974669358614321442014001114005014003513181803132433600103002010000200006002010000200001400501400471150021109104001010000100000101000001100000010000110314021113413956940000069100001000040010140051140048140051140036140036
6002414003510490000011014002013939712935952700104001020000100003001020000100001264888669353814321442014002614005014005113181803132430600103002010000200006002010000200001400501400471150021109104001010000100000101000001100000010000110314051133513956940000000100001000040010140036140036140048140051140048
6002414004710491000001014002013939712936225700124001020000100003001020000100001264443669661014326445014002614004714005013181803132430600103002010000200006002010000200001400501400351150021109104001010000100000101000001100000010000110314041134413959140000660100001000040010140051140036140051140048140051
6002414005010491000010014003513939712936225700124001020002100003001020000100001264443669718614325829014002614005014005013181503132433600103002010000200006002010000200001400501400351150021109104001010000100000101000001100000010000010314041114213955840000999100001000040010140051140048140036140048140036
60024140050104900000131014003513939712934725700124001020000100003001020000100001264429669569814326112014001114004714005013181803132430600103002010000200006002010000200001400351400471150021109104001010000100000101000000100000010000110314031113513956640000666100001000040010140036140036140036140048140048
6002414003510490000011014003513939412936225700124001020000100003001020000100001264443669549114326345014001114003514005013180303132433600103002010000200006002010000200001400351400471150021109104001010000100000101000001100000010000010314041134213955440000009100001000040010140051140051140051140051140048

Test 3: throughput

Count: 8

Code:

  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  ld1r { v0.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602052672720010102430101267062121219251601411008004580000100800208001550011701791884588026709267282672866520666771601352008002080020200800208002026708267231180201100991001008000080000110080000390800390358003960354311151170161026725106080000800001002670926724267242672926729
16020426728200001041010126719001216251601001008004580000100800208001550011759111887539026709267082670866320666771601342008002080020200800208002026831267081180201100991001008000080000010080000390800390080039613543111511701600267251010080000800001002672926709267292672926729
16020426728200000141000126693212011251601411008004180000100800008000050011731831884444026704267282670866300366661601002008000080000200800008000026723267231180201100991001008000080000010080000008000000800390104300051101161126725010080000800001002670926709267292672926729
160204267282000000001012671920121625160145100800458000010080000800005001168754188401112668926708267286650036686160100200800008000020080000800002670826708118020110099100100800008000001008000039080035039800390135390005110116112672006080000800001002672926729267292670926729
160204267282000000450001267082120162516010010080045800001008000080000500116888018840111267042672326728663003668616010020080000800002008000080000267282672311802011009910010080000800000100800003908003900800396139430005110116112672006080000800001002672926729267292672926709
16020426728200000045000126797212121625160100100800438000010080000800005001168880188401102670926728267286630036666160100200800008000020080000800002672326723118020110099100100800008000001008000039080035039800356139390005110116112670566080000800001002670926709267292670926729
160204267282000000000012671901212025160141100800458000010080000800005001174628188732812670926728267286630036686160100200800008000020080000800002672326723118020110099100100800008000001008000039080000039800396035000051101161126705610080000800001002670926729267242672426729
16020426728200000045000126699212120251601451008004180000100800008000050011731831884444126689267282672866500366661601002008000080000200800008000026723267081180201100991001008000080000010080000008000000800006039000051101161126726610080000800001002670926729267292670926729
16020426728200000000001267172181216251601411008004580000100800008000050011687541884011026709267082670866500366861601002008000080000200800008000026708267231180201100991001008000080000010080000390800390080039613539000511011611267251010080000800001002674126729267092672426729
16020426708200000045000126713201216251601001008004580000100800008000050011688801884011026709267232672366300366811601002008000080000200800008000026708267081180201100991001008000080000110080000390800390398000001353900051101161126725106080000800001002672926724267292672426709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d0 | itlb miss (d4) | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600252673820010000016610032673207922251600291080064800001080000800005011673711886603126696267152673766813669516001020800008000020800008000026715267371180021109010108000080000110800202045800191016380169615943191050200041602226734131328000080000102673826738267162673826738
1600242673720010101006610032670307720251600291080065800001080000800005011701791887750126723267372673966813669516001020800008000020800008000026715267151180021109010108000080000010800202043800190016180000601943191050200021602226734131328000080000102671826738267382673826716
160024267372001010100671003267090771925160074108006580000108000080000501167371187893712671826715268406805367171600102080000800002080000800002673826737118002110901010800008000001080020204380059100618004061584319005020002160332673413018000080000102673826716267162673826738
160024267372001000100211003267353002125160074108006580000108000080000501168291188319002671826715267156682366951600102080000800002080000800002673726737118002110901010800008000001080020204380060100218004001590190050200031603326734131308000080000102671626738267382673826738
1600242673720010000006700032673127920251600291080063800001080000800005011677221886406026718267372673766823671716001020800008000020800008000026737267371180021109010108000080000010800211943800191016480040605843191050200021602226713131328000080000102673826716267382671626716
1600242673720011000006600032737807020251600291080019800001080000800005011692351878296126718267372673766813671716001020800008000020800008000026737267151180021109010108000080000010800201943801931016080040605943192050200021602326712131328000080000102673826738267382671626738
160024267372001000000661003267313700251600731080019800001080000800005011677221886409126718267372671566813671716001020800008000020800008000026737267371180021109010108000080000010800192043800591216180039611943190050200021602226734131328000080000102673826738267382671626716
16002426737200101000167000326709307125160075108006580000108000080000501169774187893712671826715267376681367171600102080000800002080000800002673726737118002110901010800008000001080020200800580006180039611944190050209021602226734131308000080000102673926738267382673826738
160024267372011000000210003267313772025160029108006580000108000080000501178590187893712671826737267376682367171600102080000800002080000800002671526737118002110901010800008000001080020204380058002218004260580192050200021602226734131308000080000102673826738267192673826738
16002426715200100000021000326730370025160073108006380000108000080000501168291187716012671826737267376681367171600102080000800002080000800002673726715118002110901010800008000011080019200800190006180040011943190050200021602226737131318000080000102673826738267162673826738