Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 8H)

Test 1: uops

Code:

  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.006

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 09 | 0a | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
660052935721991010009100459228977011686160003006300030003000150003573062294729095292973106000300030003000900029133292121161001100010000300009300400130045146012980907768473144078201053124382112565128348164941335115160300030002931429386293132921229259
660042934821940010009100456228921031691260063006300030003000150043558642296329181292333106000300030003000900029182291551161001100010000300006300000030045100012776902068393075060201813209381412505228624162931312415012300030002922929287291792927929339
660042920522050000009100454228991311697560093009300030003000150043570552299129072291983106000300030003000900029167291971161001100010000300000300400430045149012892909869823159156201163061381618656028371163971318914542300030002920629395293252928629234
660042932722040000000000456428850131683760093009300030003000150003573682298729248292453106000300030003000900029192291321161001100010000300009300400030015040012868904269653154249201733134382118595028549163351323214809300030002943729360292572921629398
66004292262202000000900046842891401170016009300930003000300015000356940230022917129205310600030003000300090002915429161116100110001000030000630041243001514901285592166943303415820026312838188565328547163831320915044300030002920529248292592926229280
6600429205220500000091004758288331016821600630083000300030001500035617102303629086292693106000300030003000900029064291231161001100010000300009300000130045119012841948567913227059201093103381618525228394160981294714820300030002935729190292412932829231
660042921521930000006100460828890001692360083000300030003000150003561732294229172292523106000300030003000900029290292191161001100010000300009300110030005146012741900869363037262201303097381815535928360163581324214939300030002930629318292852932929245
660042919821940000009100455928786101697660063006300030003000150003562352300529043292873106000300030003000900029029292001161001100010000300000300400030015146012884913068513072059200693013381613545828438162001323714851300030002929829318291972930329326
6600429341219400000010000457128934011699560063008300030003000150003577922308129051292803106000300030003000900029151291271161001100010001300006300000030045119012918914669243075053201033170381311515828705163671338114867300030002919529324292132925229250
660042923821930010016100458628733301681060093008300030003000150003569932303129180292573106000300030003000900029226290771161001100010000300029300100430015146012807920769013156049199903087382213585828391162621310014765300030002932129395292852924829364

Test 2: throughput

Count: 8

Code:

  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  ld3 { v0.8h, v1.8h, v2.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 1e | 22 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800855991100006512280051218181254801391002400212400001002400002400005003552190575116908004808007280054143544801002002400002400002002400007200008005480067118020110099100100800008000001002400181702400701013524003951320172510911711802290902400002400001008006880054800688006880055
48020480067602110000350328005231830125480139100240039240000100240000240000500350584057760870800483800678005514349480100200240000240000200240000720000800678005311802011009910010080000800000100240019174224005610235240039513242170510911711801870902400002400001008006880071800688007380068
4802048006760011000064122800422180132548012110024002124000010024000024000050035023975750782180053080067800541349480100200240000240000200240000720000800548005411802011009910010080000800000100240017184224007010059240039015642171510911711803690012400002400001008006880068800688006980054
4802048006759911100035044800391501812548012010024003224000010024000024000050035301365755891080051080072800721935448010020024000024000020024000072000080067800711180201100991001008000080000110024001817422400561003524003950560170510911711800500002400002400001008006880060800688006980073
4802048006660011100065112800522001325480139100240020240000100240000240000500350014057507131800530800678005414349480100200240000240000200240000720000800548005411802011009910010080000800000100240017185724003200235240053505644171510911711803429002400002400001008006880054800688006880068
4802048006760010000064043800573180125480139100240039240000100240000240000500354297957763330800470800718005315349480100200240000240000200240000720000800678007211802011009910010080000800000100240019184224005600159240040613242171510911711801646912400002400001008006880058800718006880071
48020480054600111000790428005215180142548013910024004224000010024000024000050035097225705547080048080072800671536648010020024000024000020024000072000080072800671180201100991001008000080000110024001918024005500135240039515742170510911711803369902400002400001008005580055800688005480073
4802048006759910100064101800673180142548013910024003924000010024000024000050035429795715080080048080067800531349480100200240000240000200240000720000800808006811802011009910010080000800000100240019174224003200173240015515642170510911711800509022400002400001008005780055800688005580068
4802048006760011100065041800522180142548012410024004624000010024000024000050035455515690539080034080054800721336480100200240000240000200240000720000800758007511802011009910010080000800000100240017204224003200277240040517044171510911711800649902400002400001008006880068800558006880055
48020480053600100100590128003930181725480139100240039240000100240000240000500350584057762850800480800538006719349480100200240000240000200240000720000800838007311802011009910010080000800000100240019184224003210135240054503243171510911711800519912400002400001008006880068800558006880068

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 1e | 22 | 23 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d0 | d2 | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480025800636000000004110228005231818144254800451024003024000010240000240000503524561566366200080030800678006614036848001020240000240000202400007200008006280044118002110910108000080000010240000000240037000372400375103900050190021702380064090240000240000108006880051800518006880068
48002480067600111000641021800423191950254800181024003924000010240000240000502392151576262900080038800578005740339480010202402132400002024000072000080184800571180021109101080000800000102400171842024005610120240039615742170050195031702380054660240000240000108005880058800578005880058
480024800575990000004200238005230013025480064102400472400001024000024000050349280857515400158004780067800501403494800102024000024000020240000720000800668006111800211091010800008000001024000003902400370003724003700373800050195031703280064990240000240000108006880051800688005080050
4800248006759911000064009080029319190025480044102400012400001024000024000050350913655369680158002580057800570032948001020240000240000202400007200008007880067118002110910108000080000010240018184202400580012024004001570171050195141703280054660240000240000108005880058800458005880058
480024800575990000000003280042200002548005610240036240000102400002400005035388094863673015800318006780066003314800102024000024000020240000720000800628007111800211091010800008000001024000003802400370006424003701373900050195151702380064990240000240000108006880068800688006780050
4800248006759910100064100180042319000254800491024000524000010240000240000503522822360334901580025800578005700339480010202400002400002024000072000080059800671180021109101080000800000102400171842024005700060240000005642171050195141703380041000240000240000108005880057800458005780058
480024800445990000004210263800532181817025480053102400562400001024000024000050353880955037980158004880067802451403314800102024000024000020240000720000800548004411800211091010800008000001024000003802400370003924003551373800050195141703380064990240000240000108006880068800688006880050
480024800675991110006410018002930195025480012102400392400001024000024000050167644136032390158002580056800444032648001020240000240000202400007200008007680066118002110910108000080000010240018184102400561015924000051580170050195131703280054660240000240000108005880058800588005880057
480024800575990000004210028005200014025480054102400462400001024000024000050353384957306860158004880067800670034948001020240000240000202400007200008005380063118002110910108000080000010240000000240037000024003751373900050195131702380167991240000240000108006880068800718007580054
480024800705991111006410108004230194025480048102400362400001024000024000050167644155341900158003880056800575033848001020240000240000202400007200008007980053118002110910108000080000110240018180024005610020240039015643170050195021703280054660240000240000108005880058800578005880059