Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single structure, B)

Test 1: uops

Code:

  ld1 { v0.b }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6200528445212023030011410105009281250011641120001002100010001000500011901622668281402820231020001000100010002000283912823811610011000100001000021000000100001200140109892715134191067198323450381618566327897144481313413409100010002835228426283442822328201
620042833521401902101030105147281510101626120011001100010001000500011926622655282632823831020001000100010002000280472825311610011000100001000001001000100121300138139764729333631143197563366381611585928049140661252513351100010002846828239282912832028439
620042826621302702900000005008280960001617120011000100010001000500011926522694281812825131020001000100010002000282622821311610011000100001000021000000100001000138169825728334571152196093454381914556928038143261245013176100010002826028223283432817928119
6200428211215021023011435010513027959000162842000100110001000100050001192752270028057282193102000100010001000200028173282401161001100010000100002100010010002120013581995670803365855196243321382225575627849149131268113281100010002828428394284552830928253
6200428289212021019000001050792811901016286200010011000100010005000119044227162827328192310200010001000100020002813428213116100110001000010000010000001000202001368210176724134451056197103434382120565927899144791243013976100010002851128320284742827728243
62004282822130250280001780004919281060101628120011001100010001000500011929222657284752854431020001000100010002000284202826611610011000100001000021000000100000300135551023072123270953196153382382013575528000148251234113049100010002819928143285232834028326
62004283902120240250002701049242816600016252200110011000100010005000119320226522841828296310200010001000100020002813128080116100210001000010000010000001001200001387310409723833871259196223350381912555527888146571236213108100010002849928546284532837028485
6200428233211025016000310004976280510001650620011001100010001000500011922322729282012857231020001000100010002000282452827611610011000100001000021000000100020000135669842727434191253194963392382314515127929148871242013346100010002842928314281512815728187
620042837421202502400030005080283100001642520001000100010001000500011926322696281782825431020001000100010002000283692820611610011000100001000001001000100021200135561012971623423961197963378381716555227870145281241813294100010002859228152281452832528237
620042814821102302301130004885280290101634420011001100010001000500011929122722284412822731020001000100010002000281402813611610011000100001000021000000100120200140979797721433361253196083318381617555728029142411253313322100010002830428234282062849428235

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.b }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0053

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400531049100000042501011400431394561293672570102401002000410000301002000010000126397666939741431222910140032014004114005613181031323916010030200100002000060200100003000014042914004511502011009910040100100001000001001000121100020011100001111000321011391113956640000966100001000040100140057140054140054140054140042
6020414005310481100000700001400261394561293642570102401002000410000301002000010000126381466938301431222910140017014005614005313179931323916010030200100002000060200100003000014004114004111502011009910040100100001000001001000211100020001100001101100321011391113955040000600100001000040100140042140057140057140057140042
60204140056104910100006500001400261394881293642570104401002000410000301002000010000126397666932441431222910140032014005614004313179931323916010030200100002000060200100003000014004114005311502011009910040100100001000001001000111100020001100000101010321011391113955040000999100001000040100140057140057140057140054140042
6020414005610491110004201011400261394291293642570104401002000210000301002000010000126397666932441430966710140029014004114005613187631324016010030200100002000060200100003000014005614004111502011009910040100100001000001001000111100030011100001111000321011141113956740000069100001000040100140057140042140057140042140042
60204140041104910010005601001400381394291293642570104401002000410000301002000010000126400366932441430966710140017014005614005613179931323916010030200100002000060200100633000014005614005311502011009910040100100001000001001000240100020011100001111000321011141113955040033090100001000040100140057140057140057140057140057
60204140056104910100003400011400381394561293642570104401002000410000301002000010000126381466946691430966710140032014005614005613179931323916010030200100002042360200100003000014005314005311502011009910040100100001000001001000221100020004100031101000321011391113956740000996100001000040100140054140057140042140042140054
6020414004110491010000893520001400381394131293532570104401002000210000301002041810000126381466932441431222910140029014044114005913187231324016010030200102152000060200100003000014005614004111502011009910040100100001000001001000111100020011100001111100321011391113956240000699100001000040100140042140042140057140045140055
6020414005610491110000101011400261394301293672570104401002000410000301002041810000126396366932441431222910140017014005614005313184031323916010030200100002000060200100003000014005314004111502011009910040100100001000001001000210100010101100000101000321011141113956740000990100001000040100140042140057140042140046140054
60204140041104910111002010014004113942912935325701044010020004100003010020000100001263963669324414312487001400320140042140059131907313240160100302001000020000602001000030000140056140053115020110099100401001000010000110010001311000110116505100001111100321011141113955040000966100001000040100140054140054140057140042140042
6020414005610491010000200001400261394291293672570104401002000210000301002021210000126397666932441431248710140017014007614005513179931323916010030200100002000060200100003000014004214004111502011009910040100100001000001001000231100020011100000101200321011391113956740000099100001000040100140042140057140054140057140057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0055

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002514004710490010110010000014002013939712935925700124001020002100003001020000100001264534669324414325254311400171400411400411318240313243960010300201000020000600201000030000140041140041115002110910400101000010000010100022010002011100001111003140081133313957540000069100001000040010140196140056140060140057140042
6002414005310491100010020000014004113938812936725700144001020006100003001020000100001264507669383014326437411400321400561400531318240313242460010300201000020000600201000030000140056140041115002110910400101000010000010100011110002021100000111103178031133713956040000969100001000040010140057140057140057140054140057
6002414005610491111000020000114003813940012936425700144001020002100003001020000100001264534669397414325254311400321400531400531318210313243960010300201000020000600201000030000140056140041115002110910400101000010000010100021110001021100001101003140031133713957240000960100001000040010140057140042140054140062140057
6002414005310491011000010000014002613940312936725700144001020004100003001020000100001264507669397414326437311400321400561400561318240313244460010300201000020000600201000030000140125140035115002110910400101000010000010100000010000000100001010013140041134713957540000990100001000040010140057140042140057140057140057
6002414004110491101000020100014002613940312936425700144001020004100003001020000100001264534669397414326716311400171400411400411318240313243960010300201000020000600201000030000140056140053115002110910400101000010000110100011110001001100001111103140041134713956040000999100001000040010140051140051140036140051140036
6002414004710690000000060000014003513939712936225700124001020002100003001020000100001264507669397414326716411400171400411400411318240313243960010300201000020000600201000030000140056140053115002110910400101000010000010100011010001001100001111003140031133313957540000900100001000040010140042140042140054140042140062
600241400561049110101001010001400201393971293622570010400102000010000300102000010054126478466939741432643741140029140041140056131809031324246001030020100002000060020100003000014005614004111500211091040010100001000001010000011000000258100000010003140031133313956640000969100001000040010140036140051140051140051140051
6002414003510480000000010000014003513939412936225700124001020004100003001020000100001264370669397414325254411400171400561400561318240313242460010300201000020000600201000030000140053140053115002110910400101000010000010100013110003000100000010203140031133313957540000900100001000040010140042140042140057140057140057
6002414004110491001110020000014004113940312935325700144001020004100003001020000100001264534669324414325254311400321400561400531318240313243960010300201000020000600201000030000140056140053115002110910400101000010000010100011110002001100001111003140031134313957540000090100001000040010140042140057140042140057140054
6002414004110491001100020000014003513939412935925700124001020004100003001020000100001264534669397414326437411400321400561400531318097313243660010300201000020000600201000030000140056140053115002110910400101000010000110100023110002012100001111003140071133313957540000699100001000040010140057140057140057140057140042

Test 3: throughput

Count: 8

Code:

  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  ld1 { v0.b }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 08 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020516006111990000000000016002521801596900251601001008000280000100800008000050042181022939240116003301600521600401396873140019160100200800008000020080000160000160052160052118020110099100100800008000001008000003908000000358003560353900051101161116004906628000080000100160041160053160041160053160053
16020416005211990000041010001600252001596860251601021008000080000100800008000050042128622939203116003301600401600401396753140019160100200800008000020080000160000160052160052118020110099100100800008000011008000003908016500080035000000051102431116004900008000080000100160041160041160053160041160053
160204160052119900000001001160037218181596861225160102100800028000010080000800005004210172293924011600330160040160052139675314001916010020080000800002008000016033616005216004011802011009910010080000800000100800000390800350035800006103900051101161116004900608000080000100160053160041160041160053160041
1602041600401199000004100100160041201815969011251601001008000080000100800008000050042181022939240116002101600521600521396873140019160100200800008000020080000160000160040160040118020110099100100800008000001008000003908000000080000003539003511601611160053010048000080000100160041160057160057160057160057
160204160056119800000450001116002501212159690162516010010080002800001008000080000500421998229395481160037016005616004013969131400231601002008000080000200800001600001600651600461180201100991001008000080000010080000043080039003980039010430005110116111600370101008000080000100160057160041160057160057160089
1602041600601199000000000101600412121215968402516010010080002800001008000080000500422041229395751160037016005616005613967531400141601002008000080000200800001600001600561600561180201100991001008000080000010080000043080039003980039013943000511011611160053010048000080000100160041160057160041160057160057
16020416005611990000045000111600413120159688025160102100800028000010080000800005004218102293924001600330160052160040139687314001916010020280000800002008000016000016004016005211802011009910010080000800000100800000008003500358003501350000511011611160049010608000080000100160041160053160053160041160041
160204160040119900000000001160037218015969002516010210080002800001008000080000500421810229381471160021016004016004013968731400101601002008000080000200800001600001600521600521180202100991001008000080000010080000000800350035800356003900051101161116004906628000080000100160053160041160041160053160041
1602041600521199000000000111600370181815969012251601021008000280000100800008000050042181022939240116003301600521600401396873140010160100200800008000020080000160000160052160052118020110099100100800008000001008000003908000000358003560353901051101161116004900008000080000100160041160053160041160041160053
1602041600521199011000001011600372181815968616251601021008000280000100800008000050042181522939240116003301600521600521396753140010160100200804948000020080000161314160052160052118020110099100100800008000011008000003908003510478003561353900051101161116004906028000080000100160041160041160053160053160053

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025160060119900004500116004121115968502516001210800028000010800008000050422600229395591160041016006016004013969731400401603222080000800002080000160000160040160040118002110910108000080000110800000438000003880039613944000502081655160037000800008000010160061160041160061160041160061
1600241600601199010100001600450101596840251600101080000800001080000800005042154522939547116004101600601600601397173140040160010208000080000208000016000016004016004011800211091010800008000001080000008003800800000039440005020616781600370140800008000010160061160090160061160061160061
1600251600601198011144100160025210159684192516001210800008000010800008000050422643229381471160021016006016006013969731400201600102080000800002080000160000160060160040118002110910108000080000010800000438003900804286139000050207165516005710140800008000010160041160041160061160057160057
16002416004011990100114001160045201159684025160012108000280000108000080000504210852293814711600410160060160040139717314004316001020800008000020800001600001600561600561180021109101080000800000108000000800380388000060384400050205165616005714140800008000010160061160061160061160041160061
160024160056119800000001160045001159685192516001210800028000010800008000050421564229381471160037016006016006013971731400401600102080000800002080000160000160060160040118002110910108000080000010800000080038138800000038430005020616651600371407800008000010160109160061160061160061160041
1600241600401199010044001160025200159684162516001210800028000010800008000050421638229381471160041016006016006013969731400401600102080000800002080000160000160040160056118002110910108000080000010800000438003800800006138440005020616751600571007800008000010160041160041160061160041160057
16002416005611980100440011600252111596901625160010108000280000108000080000504227412293954711600410160040160060139717314003616001020800008000020800001600001600561600401180021109101080000800000108000004380054039800006104400050208166516005710100800008000010160061160041160061160061160041
1600241600601199000001011600410101596840251600121080002800001080000800005042163022939975116002101600601600401397173140040160010208000080000208000016000016006016005611800211091010800008000001080000043800000428003960394300050208166516005310140800008000010160041160057160041160061160057
16002416006011980000561011600452101596901925160012108000080000108000080000504225172293997511600410160040160040139697314002016001020800008000020800001600001600601600561180021109101080000800000108000004380038138800006004300050205168816005310144800008000010160057160041160041160041160041
1600241600601199000044001160045211159684192516001210800028000010800008000050422540229396751160041016010916006013971731400361600102080000800002080000160000160060160056118002110910108000080000110801310438000000800396139000150207165616005310140800008000010160041160061160061160061160041