Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (4S)

Test 1: uops

Code:

  ld1r { v0.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6200528428212510000121005046283591116452200210031000100010005000119281002267128108281203272000100010001000100028058281361161001100010000100001001001000113140921039170143457052199823462381212414127761149881290612830100010002863628110280562809928015
6200427953213200100020005180278740116664200010031000100010005003119224102261627922281093102000100010001000100028004280651161001100010000100021001001001102133221021573203556031198553228381111343227777151091242212795100010002805028520281962825728490
6200428064211000000021005317278880015933200210021000100010005010119225002265928055280583102000100010001000100028488280611161001100010001100021000011001102135151040270723521036197083486382110463527983149351208613733100010002863428046280682862328015
620042808421200000002000530327908111599120021002100010001000501211924210227242808628531310200010001000100010002811028041116100110001000010000100001100111213495106717218352203719943353038168383127807138841211112554100010002853727950284052797927995
6200427994212100000021005293279581116072200010001000100010005005119074082268328364280203102000100010001000100028077279991161001100010000100021001011002100139381038973003473039198933221381315394127873137951233713154100010002861428139279462808228516
6200428032212200101021005295278400116066200010021000100010005000119316082270727963280263102000100010001000100028122280321161001100010000100021002001000122138451025573463522033194603497381610333427747139551219312924100010002846628137284712818227979
620042842821211010103100521628250101599920021002100010001000500611928710226882832528110310200010001000100010002811227948116100110001000010002100100100021214160974172663442038194373423381613413827727137711222912842100010002806527999280132844628393
6200427975213310000020004855278611116300200210021000100010005004119315182268927988280003102000100010001000100027946278721161001100010000100021001011000112141751045473073458037193753557381512343827777137111216312737100010002794527983280182815428526
6200428098213200000020005308279361016308200210021000100010005010119271082261728026280803102000100010001000100027949281091161001100010000100001001041000012141461038872283455039193743212381211293827749139981192912656100010002797828449281962853927916
6200428543212200000101005309279601016416200210021000100010005006119002002269228012280883102000100010001000100028479283311161001100010001100021001001000012141161048671053535037198783520382011413627798156341268014366100010002844028064280862817727929

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.4s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400471049000000010000140035139410129347257010240100200001000030100200001000012638396693778143106330140026140047140047131795313238260100302001000020000602001000020000140035140047115020110099100401001000010000010010000001000000001000010000321021221113955240000009100001000040100140036140036140051140051140051
60204140047104901000004030000140020139410129362257010240100200021000030100200001000012640546692947143106331140026140050140050131793313238260100302001000020000602001000020000140050140047115020110099100401001000010000010010000001000000001000000000321011221113955240000960100001000040100140036140036140036140051140051
6020414004710500000000101001400351394101293622570102401002000210000302502000010000126405466935381431063311400231400501400501317953132402601003084210000200006020010211200001400361404251150201100991004010010000100000100100070110009034121000010100321021271113955740000669100001000040100140051140036140036140048140051
60204140105104900010005940100140037139407129348257010240100200141000030100200001000012641546692947143087011140026140050140047131797611324026010030200100002000060200100002000014043514004711502011009910040100100001000001001000000100000002911000000140321011271113955640000669100001000040100140051140038140051140048140051
60204140048104900000001010014003513941012934725701004010020002100063010020000100001264415669353814310633014032914005014003513179531324026010030200100002000060200100002000014003514004711502011009910040100100001000001001000001100000002731000000100321011221113955540000009100001000040100140058140051140048140036140052
602041400921049001000010000140020139406129347757011640100200021000030100200001000012641806701741143108370140025140050140050131792313239960100302001000020000602001000020426140054140035115020110099100401001000010000010010000011000100031000000000321011271113954640000906100001000040100140051140036140051140402140051
602041401191052000100060100140020139410129347257010240100200001000030100200001000012642986693733143088051140026140050140050131795313238260100302001000020000602001006520000140047140047115020110099100401001000010000010010000011000001001000010000321011221113955540000600100001000040100140051140036140051140051140051
602041401041049000110010100140035139407129500257010040100200021000030100200001000012640456694177143087011140026140035140047131795313240260100302001000020000602001000020000140050140047115020110099100401001000010000010010000011000000001000010000321011271113954640000666100001000040100140051140051140094140056140036
602041400501049000000010000140035139410129362257010040100200021000030662200001000012638036693685143111281140011140035140047131795313240260100302001000020000602001000020000140050140035115020110099100401001000010000010010000011000000091000000100321011271113954640000999100001000040100140054140036140051140036140036
6020414004710480000004520100140035139597129362257010240100200001000030100200001000012640456693634143298971140023140050140412131793313238260100302001000020000602001000020000140035140047115020110099100401001000010000110010004011000004201000010100321011221113955540000069100001000040100140064140050140048140439140051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025140057104901001010100140036139398129363257001240010200021000030010200001000012645156693734143261850140027014005114003513180903132440600103002010000200006002010000200001400511400511150021109104001010000100001101000001100000000100001010031405113104513963040010101010100001000040010140405140052140052140052142770
600241401691071000002188102143163139398129363257001240010200001000030010200001000012645246693734143261850140027014005314005113181906132434603023002010000202136002010000200001406321400521150021109104001010000100000101000001100000000100001010031404113003713957040000101010100001000040010140168140263140036140036140052
60024140051104900000000100140036139398129363257001040010200021000030010200001000012644296692947143261850140027014005114005113181903132420600103002010000200006002010000200001400511400511150021109104001010000100000101000001100000000100001000031404111007713957040000101010100001000040010140036140036140052140052140052
60024140051104900000010100140036139398129347257001240010200021000030010200001000012644886693734143261850140027014003514005113181903132434600103002010000200006002010000200001400511400511150021109104001010000100000101000001100000000100001000031403111004413957040000101010100001000040010140052140052140052140052140052
600241400511048000000101001400361393981293632570012400102000210000300102000010000126448866929471432618501400270140051140051131819031324346001030020100002000060020100002000014005114005111500211091040010100001000001010000011000000001000010000314071130061013957040000101010100001000040010140052140052140052140052140036
60024140051104900000010100140036139398129363257001040010200001000030010200001000012644886693734143261850140011014005114003513181903132420600103002010000200006002010000200001400511400511150021109104001010000100000101000111100020011100001111231404113004413957040000000100001000040010140052140052140052140052140052
60024140051104900000025000014002013939812936325700124001020000100003001020000100001264488669373414326185114002701400351400511318190313243560010300201000020000600201000020000140051140035115002110910400101000010000010100000110000000010000101003140411300761395704000010010100001000040010140036140157140052140052140036
60024140051104900000000001140036139398129363257001040010200021000030010200001000012644886692947143214420140011014005114005113181903132434600103002010000200006002010000200001400511400511150021109104001010000100000101000001100000003100001010031404113005513957040000101010100001000040010140052140052140052140052140052
60024140051104900000010100140036139398129363257001240010200001000030010200001000012644296693734143261850140027014005114005113180303132420600103002010000200006002010000200001400511400511150021109104001010000100000101000000100000000100001000031407111005413955440000101010100001000040010140052140036140052140052140052
600241400511049000000100001400201393981293632570012400102000010000300102000010000126448866929471432144201400270140035140051131819031324206001030020100002000060020100002000014005114005111500211091040010100001000011010000011000000120110000101003140411100851395704000001010100001000040010140052140052140052140036140052

Test 3: throughput

Count: 8

Code:

  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  ld1r { v0.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160205267412001010670022672227720251601641008006380000100800008000050011692951878294026721267372674166613669516010020080000800002008000080000267372673711802011009901001008000080000010080019190800601002180000613943190511031633267341313080000800001002673826738267382673826738
160204267372001010671032672200720251601631008006580000100800008000050011692951878294126699267422671666593669516010020080000800002008000080000267152673711802011009901001008000080000010080019194380058000218000060394319151103163326734013180000800001002673826738267382673826738
160204267372001110671002672227720251601631008001980000100800008000050011692951886401126721267472674466593670416010020080000800002008000080000267372673711802011009901001008000080000010080019194380058001618003901194319251102163326734130180000800001002673826738267162673826738
1602042690520010106710126722077202516011910080019800001008000080000500116929518782941266992674726737665936695160100200800008000020080000800002673726737118020110099010010080000800000100800211944800581012180000605843192511031623267341313180000800001002673826738267382673826738
1602042673720011102110326722077202516016410080065800001008000080000500116845018794151267182674526739665836695160100200800008000020080000800002673726737118020110099010010080000800000100800202043800591006180041613943191511031633267341313180000800001002673826738267382673826738
1602042673720011006600326700007192516016510080065800001008000080000500116988318794151267202674226737666136695160100200800008000020080000800002673726737118020110099010010080000800000100800192144800590006380039615843192511031633267341313080000800001002673826738267382673826716
1602042671520011107500326722377202516016510080063800001008000080000500116772218801901266962674326878666736695160100200800008000020080000800002671526737118020110099010010080000800000100800192044800580006380039015943190511031633267341313080000800001002673926716267382673826741
160204267372001000211032670030719251601651008006680000100800008000050011701791879415126724267412673766703669816010020080000800002008000080000267422673711802011009901001008000080000010080020190800590006180040603943192511031633267381313080000800001002673826738267382673826738
1602042673720011101031032672237719251601651008006580000100800008000050011738841879415126718267212674766593669516010020080000800002008000080000267372671911802011009901001008000080000010080152204380058100618004061604319151103163326734013180000800001002673826738267382673826738
16020426737201111121102267003771925160119100800658000010080000800005001167371188256212672126752267376659366951601002008000080000200800008000026737267371180201100990100100800008000001008002020080058000618000001594319151103163326712130180000800001002673826716267382671726738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600252674220111100082000326722077192516007410800658000010800008000050116929518838010267182681326724666003671716001020800008000020800008000026741267371180021109010108000080000010800191943080058100648003961043191050209160782674101308000080000102671626716267162671626742
1600242673720111110088000326722377182516007310800638000010800008000050116772218782030267092671526739668803672216001020801898000020800008000026721267411180021109010108000080000010800192043080019101618003901190191050206160882671201308000080000102673826738267382671626738
1600242673720011100067000026722070125160073108001980000108000080000501168291187716002672726715267376682036695160010208000080000208000080000267372671511800211090101080000800000108001919430800591012280000010431900502051607626712131318000080000102673826738267382673826716
160024267372001000006600012672200719251600751080019800001080000800005011701791885460027247267462674866590366951600102080000800002080000800002673726715118002110901010800008000001080020190318005916060800400158431910502051607626712131308000080000102671626717267162673826738
1600242671520010101021000326722377125160075108006580000108000080000501169295188380112670726737267156682036717160010208000080000208000080000267152671511800211090101080000800000108002119430800580002180039615943190050206160562671201328000080000102673826738267382682026716
160024267372001010006710012672230719251600751080065800001080000800005011695441885456126707267152673766810367171600102080000800002080000800002673726737118002110901010800008000001080019194308005710061800400159431900502071606726712131318000080000102673826738267382673826738
160024267372001100008200002670007720251600751080065800001080000800005011701791883190026726267422673766810366951600102080000800002080000800002673726737118002110901010800008000001080019204308007200121800406159431900502051607726712131308000080000102671626738267382671626716
16002426737200100100660002267220771925160029108001980000108000080000501170179188546002672026720267376681036695160010208000080000208000080000267372671611800211090101080000800000108002019008005810021800396139431900502061606526712131318000080000102671926716267382673826716
16002426715200111100210003267222001251600291080065800001080000800005011701791883190026721267412673766830367171600102080000800002080000800002673726737118002110901010800008000001080020194308006001121800406019431900502051605626734131318000080000102673926738267382671626738
160024267372001101116600022672220720251600291080019800001080000800005011701791883190026822267502671566810367171600102080000800002080000800002673726737118002110901010800008000001080019190080074101608000061394313015020516088267170028000080000102673826716267382673826716