Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4R (4S)

Test 1: uops

Code:

  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.008

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.008

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
65005292842201151114000400046072882100169445008401210004000100050004751042295029091293003105000100040001000400029138291721161001100010001100002100000010002020012974943268513046743201393202381211373728429160021384314647100040002922629316292512925129303
6500429228219016001300020104559287461016977500840081000400010005000475282229312907429256310500010004000100040002920629127116100110001000110000310000001000202001297993356977307833220157301838149333628399164091394914589100040002923629249292422930229240
6500429298219014001800030104549287791116970501240081000400010005000475612229462910329279310500010004000100040002903829130116100110001000010000210000001000212001299593886825309083420193312538137313428320163721381814973100040002929929389291552919029238
6500429237219015001200020004557288060016936501640161000400010005000475815229502909029285310500010004000100040002915329102116100110001000010021310020011000202001275194096845317573920202306138139373628380165001375514802100040002927729268292482929229284
6500429296219013001000020104602288190116980500840121000400010005000476250229172909329262310500010004000100040002911429136116100110001000010000210000001000203001302392546848302983520177306038124403228389162841385514774100040002931029261292332927829335
6500429219220016001400020104559288040017012501240081000400010005000475914229392913829241310500010004000100040002916229182116100110001000010000210010001001212001296794426870309744420190308637398413928414164141396714905100040002926929326292682927029316
650042927821901300200001650104568288301116967500840081000400010005000475081229182913029333310500010004000100040002913429057116100110001000010000210000001000202001287192386849308454220155311438127383728495158271400414678100040002926529300292522932429188
6500429316219013001201020004585288330116961501240161000400010005000476337229532913529306310500010004000100040002917029132116100110001000010000310000001000202001291391896864315293920195309338118393728407156111389714827100040002931429314292472920029334
65004292042190140017000200045252875310170325008401610004000100050004756322291529123292833105000100040001000400029209291921161001100010000100002100000110012030012843922568743010835203003080381310414128321163271402015167100040002920529231291662929529181
6500429187219016001800030004566288360016947500840121000400010005000475392229422910129312310500010004000100040002921629197116100110001000110022210010011000213101291492736827309574020105310038127424128368163201380515126100040002931029328292292921929219

Test 2: throughput

Count: 8

Code:

  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 08 | 0e | 0f | 19 | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400205800705990110200108004216002540011810032006480006100320032800085004000568640272080038080057800576284001402008000832003220080008320032800418004111802011009910010080000800000100800021880015014800026114181115116016008003801010800003200001008004280058800588004280058
4002048005760000002001080042006025400170100320012800061003200328000850040024686402721800380800418005762840014020080008320032200800083200328005780057118020110099100100800008000001008000218800020080015611401115117016008003801010800003200001008005880058800588005880058
4002048005759900002001080026106025400170100320064800061003200328000850040004986402721800380800578005762840014020080008320032200800083200328005780057118020110099100100800008000001008000218800160080016001401115117016008003811010800003200001008005880058800428005880042
4002048004159900001900080042166025400100100320000800001003200008000050040000038400001800380800578004133940010020080000320000200800003200008004180057118020110099100100800008000011008000018800130148001360018000510911711800380100800003200001008004280058800428004280058
4002048005759901101900080026066025400100100320000800001003200008000050040000086400201800220800418005734840010020080000320000200800003200008004180041118020110099100100800008000001008000008000001480013611418000510911711800540100800003200001008004280058800588005880042
400204800575990000190108004206002540016410032006480000100320000800005004000113840000180022080041800573484001002008000032000020080000320000800418005721802011009910010080000800000100800001880014117800006114180005109117118003801010800003200001008005880058800588004280058
400204800415990000000080026166025400100100320000800001003200008000050040001138400001800380800578005734840010020080000320000200800003200008005780057118020110099100100800008000001008000008000001780013010180005109117118005411010800003200001008004280042800588004780058
4002048005760000000000800261600254001641003200008000010032000080000500400011864002018003808005780041348400100200800003200002008000032000080057800571180201100991001008000080000010080000188001401480014611318000510911711800540100800003200001008005880042800588005880058
40020480057599000020010800421660254001001003200008000010032000080000500400011384000018002208005780057339400100200800003200002008000032000080057800571180201100991001008000080000010080000188001401480014611318000510911711800541010800003200001008010880042800588005880047
400204800416000000000080026100025400164100320064800001003200008000050040000986400201800220800418004133940010020080000320000200800003200008004480041118020110099100100800008000001008000008001300800140113180005109117118005411010800003200001008004280042800428005880042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40002580057599010013770108004216602540007410320064800001032000080000504000183840000080038800578005733940001020800003200002080000320000800418005711800211091010800008000001080000188001310800006114005019517033800540101080000320000108006180042800588005880042
4000248005760000001901080026060025400086103200648000010320000800005040000086400200800228004180057339400010208000032000020800003200008005780057118002110910108000080000010800001880014014800146114005019417022800541101080000320000108005880058800588005880042
400024800416000000200108004216602540001010320000800001032000080000504000093840000080022800418004133940001020800003200002080000320000800578005711800211091010800008000001080000188001401480014600180501921702380054001080000320000108005880171802758005880058
40002480057600000092801080042106025400074103200008000010320000800005040000038400000800228006080041339400010208000032000020800003200008005780057118002110910108000080000010800001880013125800130114005019217022800541101080000320000108005880058800588005880042
40002480057600000020010800261600254000101032000080000103200008000050400000864002018003880057800573394000102080000320000208000032000080041800571180021109101080000800000108000018800140148001401141805019217022800410101080000320000108010980058800588005880058
400024800575990000898010800421660254000741032006480000103200008000050400015384000008003880057800573394000102080000320000208000032000080041800571180021109101080000800000108000018800130148000061141805019217022800381101080000320000108005880058800588005880058
40002480057599000020010800420660254000101032006480000103200008000050400000864002408003880041800573394000102080000320000208000032000080057800571180021109101080000800000108000008001400800146101805019217023800541101080000320000108004280058801568005880058
40002480057600100019010800421660254000861032007680000103200008000050400018864002008003880057800413394000102080000320000208000032000080041800571180021109101080000800000108000018800130148000060018050193170228005410080000320000108005880155800458005880058
400024800416000000995010800421005025400074103200648000010320000800005040000986400200800388006080057342400010208000032000020800503200008005780142118002110910108000080000010800001880013014800130113180501921702280054101080000320000108005880042800588004280058
40002480041600001094701080026166025400074103200648000010320000800005040001138400000800388005780057339400010208000032000020800003200008005780057118002110910108000080000010800001880000017800140113005019217022800540101080000320000108005880058800428004280045