Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (8H)

Test 1: uops

Code:

  ld1r { v0.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 0e | 0f | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
62005293212191211400111462428828011723420021002100010001000500011929322630290462923831020001000100010001000291652911211610011000100001000210000010022231282590626832303654120717305338159263128345163991372615257100010002926129279293052930729237
62004292582191501300260469128806001724020021002100010001000500011934522618291412935531020001000100010001000292242917511610011000100001000210000210002031279791316890307463020601307038139374128344162591394514940100010002924929174292182934029329
620042923521911090021455028881001728720021004100010001000500011941222616290762924831020001000100010001000291582912811610011000100001000210000010002031281991366859311642820617308338159262728388164351411515068100010002926729294293952931129345
6200429226219100110031452528879001725520011002100010001000500011926122590290062925231020001000100010001000291542923011610011000100001000210000010002021303691896898313973320687304638135243128409163761382515037100010002922929261293172922529322
6200429260220130130001454628788001727920021001100010001000500011938222601291042931631020001000100010001000290822917311610011000100001000210000010002021290593316855305562620623304038135292728471161431388415091100010002915629223293442931429313
6200429269219801400181455928812001725620021003100010001000500011919022594291242929431020001000100010001000291672913211610011000100001000210000010001031294492086838307643020623306538209302928474165951406915087100010002932529299293652932129248
62004293242191201100271458228882001734920031002100010001000500011938322649290152930531020001000100010001000291292910511610011000100001000210000010002021282991886955308363220642315338167352628389164281385514940100010002924029277292952930429326
6200429267219130150020458728807001730320021002100010001000500011915222610291092933631020001000100010001000291572912711610011000100001000210000010002021297992696928310142220635308938205273428374162251395114892100010002929429267292672922829297
6200429270219901100214534287810017206200210021000100010005000119412225942908529333310200010001000100010002911029159116100110001000010002100000100020212871919468633053527206823095381413323128479163741411115133100010002932629264293352935529284
62004292592191101400204550287140017285200110021000100010005000119360226802903029178310200010001000100010002919629170116100110001000010002100000100020212939945269213013533206313041382010312928371163381394315190100010002930129327293002930129270

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.8h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400591049000000101001400361394061293632570102401002000210000301002000010000126402066937341431093911400111400511400881317973132399601003020010000200006020010000200001400511400351150201100991004010010000100001100100000110000000100001100032101126111395594000010100100001000040100140052140052140052140036140036
6020414005110490000002200001400381394111293632570102401002000010000301002012310000126402066937341431093911400301400731400421317983132382601003020010053200006020010000200001401661405591150201100991004010010000100000100100000110000003100000000032101126111395594000001013100001000040100140052140052140052140052140052
6020414005110490000001010014003613940612936325701004010020002100003010020000100001264020669294714308701114002714005114005113179731323826010030200100002000060200100002000014005114005111502011009910040100100001000001001000001100000001000011000321011261113956540000101013100001000040100140052140055140052140052140052
6020414005110490000001000014003613941112936325701024010020002100003010020000100001264020669373414310939114001114003514005113179731323826010030200100002000060200100002000014005114003511502011009910040100100001000001001000001100000001000001000321011391113955940000131010100001000040100140036140036140036140055140055
6020414005110490000001000014002013941112936525701004010020002100003010020000100001264020669387814310939114002714005114003513179731323996010030200100002000060200100002000014005114005111502011009910040100100001000001001000001100000001000011000321011271113955940000101310100001000040100140036140052140052140036140036
602041400511049000000001001400361394061293472570100401002000210000301002000010000126402066937341430870111400301400521400511317973132399601003020010000200006020010000200001400511400511150201100991004010010000100000100100000110000000100001101032101127111395594000001010100001000040100140052140036140052140052140036
6020414005110480000000000014002013940612934725701024010020002100003010020000100001264020669373414310939114003014007714005113179334132460601003020010000200006020010000200001400511400511150201100991004010010000100000100100000110000000100001100032101126111395654000001010100001000040100140055140052140052140052140055
6020514005110490000009101001400201394271293632570102401002000010000301002000010000126380366937341431093901400271400351400511317973132399601003020010000200006020010000200001400511400511150201100991004010010000100000100100000110000000100001100032101126111395594000010010100001000040100140036140052140052140036140055
602041400541049000000101001400201394111293634470102401002000210000301002000010000126395866929471431093911400271400521400511317973132382601003020010000200006020010000200001400511400511150201100991004010010000100000100100000010000003100001100032101127111395654000010010100001000040100140036140055140052140036140052
6020414005110490000000000014008413942912936325701024010020002100003010020000100001263958669373414308701114001114005114005113180031323826010030200100002000060200100002000014005114005111502011009910040100100001000001001000000100000001000011000321011261113954640000101010100001000040100140052140052140036140052140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0055

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3f | 43 | 49 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600251400561050010002000140028111393901293662570014400102000410000300102000010000126499366953181432661211400310140055140055131811313242660010300201000020000600201000020000140443140057115002210910400101000010000010100000210001011000111100314051132313957440000606100001000040010140056140044140056140044140056
6002414005510490000010101400401013940212936611770014400102000410000300102000010000126495766946941432671711400190140055140055131825313244161185300201000020000600201000020000140055140043115002110910400101000010000010100000210008011000111000314021133413957440000666100001000040010140044140056140056140056140057
600241400551049000001010140063111394021293552570014400102000210000300102000010000127423066939261432715711400190140055140043131823313243860010300201000020000600201000020000140055140055115002110910400101000010000010100000210001011000111100314021132313957440000006100001000040010140428140044140056140426140056
600241400551049000003817610140040111394021293662570014400102000410000300102000010000126499366948301432681911400210140057140055131823313243860010300201000020000600201000020000140043140055115002110910400101000010000010100000210001511000111100314041134313957440000606100001000040010140044140056140056140056140044
600241400551049001002010140040111394021293552570012400102000210000300102000010000126453466940701432661211400310140055140055131823313242660010300201000020000600201000020000140043140043115002110910400101000010000010100000010001011000111000314041133313957440000066100001000040010140056140056140060140056140056
600241400551049010001010140028111394021293662570014400102000410000300102000010000126457666939741432539611400310140043140055131811313242660010300201000020000600201000020000140055140055115002110910400101000010000010100000210007211000101100314031132313957440000606100001000040010140044140056140044140044140044
600241400431049000002000140040101394021293662570014400102000410000300102000010000126493966939181432661211400190140055140055131811313242660010300201000020000600201027520000140048140056115002110910400101000010000010100000010001411000111020314021133413957440000660100001000040010140432140044140056140056140044
600241400581049000112000140040111393901293662570014400102000410000300102000010000126675867021451433142011400310140055140055131823313243860010300201000020000600201000020000140043140047115002110910400101000010000110100000010001011000111100314031133213957440000606100001000040010140056140056140044140044140044
600241400561050000002010140028101394021293552570012400102000410000300102000010000126493666949821432702501400310140043140043131823313243860010300201000020000600201000020000140055140043115002110910400101000010000010100000210001011000101100314031134213963140000660100001000040010140044140056140044140056140044
600241400431050000001010140040101394021293662570014400102000410000300102000010000126496666943581432681611400310140055140055131823313242660010300201000020000600201000020000140055140055115002110910400101000010000110100040210001011000111100314031133213957440000660100001000040010140044140056140056140056140056

Test 3: throughput

Count: 8

Code:

  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  ld1r { v0.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602052673420001000004501002671721212162516014510080045800001008000080000500117462818873871267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800001100800000430800390004280039613943511011611267330101080000800001002672926729267292672926729
16020426728200000000000101267132121216251601451008004580000100800008000050011758561883573126709267302672866500366881601002008000080000200800008000026728267281180201100991001008000080000010080000000800390003980039613943511011611267330101080000800001002672926729267292672926729
1602042672820000000004500012671321212162516014510080045800001008000080000500117462818835731267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800000100800000430800390003980039613943511011601267250101080000800001002672926729267292672926729
1602042672820100000004500012671321212162516014510080045800001008000080000500117318318835731267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800000100800000430800390003980039613943511011611267270101080000800001002672926729267292672926729
1602042672820000000004501012671321212162516014510080045800001008000080000500117462818873871267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800000100800000430800390003980039613943511011611268160101080000800001002672926729267292672926729
160204267282000100000450001267132121216251601451008004580000100800008000050011746281887334126709267282672866500366661601002008000080000200800008000026728267281180201100991001008000080000010080000043080000000398003961390511011611267250101080000800001002672926729267092672926729
160204267282000100000001012671321212162516014510080045800001008000080000500116875418835731267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800001100800000430800390003980039613943511011611267330101080000800001002672926729267292672926729
1602042672820000000004500012671321212162516014510080045800001008000080000500117462818873911267092672826728665003668616010020080000800002008000080000267282672811802011009910010080000800001100800000430800390003980039613943511011611267330101080000800001002672926729267292672926729
1602042672820000000004500012671421212162516014510080045800001008000080000500117462818873341266892672826728665003668616010020080000800002008000080000267082672811802011009910010080000800000100800000430800000003980039613943511011611267250101080000800001002672926729267292672926729
160204267282000001100450001267132121216251601451008004580000100800008000050011746281887334126709267282672866500366861601002008000080000200800008000026728267281180201100991001008000080000010080000043080039000080039013943511011611267250101080000800001002672926729267292672926729

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | d9 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600252673920110001100021010326700070025160029108001980000108000080000501170179188847502671826715267376660036717160010208000080000208000080000267372673711800211090101080000800000108001919008005810061800406159431915020716029726745013028000080000102673826717267382673826738
1600242673720010001010067000326700270192516007510800658000010800008000050116737118860960267182673726737668203671716001020800008000020800008000026715267371180021109010108000080000010800212000800581012280040605901905020101600111026762001328000080000102673826716267162671626716
160024267152001000000006600032672230720251600731080018800001080000800005011677221877160026696267152673766820366951600102080000800002080000800002673826737118002110901010800008000001080020194308005910060800406159431915020616009926823001328000080000102673826749267212673826738
1600242671520010100100066010026722377202516002910800658000010800008000050117017918831901267182671526716668203671816001020800008000020800008000026737267371180021109010108000080000010800202043080019101218004061194319150209160069267340131328000080000102673826738267382671626738
16002426737200100000000670000267223770251600291080063800001080000800005011677221877331126718267372673966810367171600102080000800002080000800002673826739118002110901010800008000001080019194308001901060800006158019050201016009626736013028000080000102673826738267382671626738
160024267372001010100006700022672237712516007510800658000010800008000050117017918854601267182673726715668203671716001020800008000020800008000026737267151180021109010108000080000010800202043080059000608004000194319050209160079267120131328000080000102673826716267382673826738
160024267152001010110009700032672239719251600751080065800001080000800005011692951878937126718267372673766820367171600102080000800002080000800002673726737118002110901010800008000001080019200080059100338004060190190502071600107267490131308000080000102673826716267382673826716
1600242673720110101000066000226722377192516007410800638000010800008000050116946118771601267182673726715668103669516001020800008000020800008000026737267371180021109010108000080000010800202043080059101618004061604319050206160010626742001318000080000102673826738267382673926738
16002426715201100010000670103267000771925160073108006580000108000080000501167371187829612671826716267376662036717160010208000080000208000080000267372671511800211090101080000800000108001921433180019110525280949015943196517768000141427928113018000080000102772227925278862784827741
1600242674220812100007711266160032784301574732671616351080929807801081301812465011842201932091127677278642788471161501307469162191208132381332208133481140278342786671800211090101080000800000108079919432338096901063028091001194319651791279001015280250131328000080000102788127861278832787827896