Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 4H)

Test 1: uops

Code:

  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005289582151191180100044149172817600234302000200020001000040016256282762870231020004000200028591286921161001100010000200240200201122000626211371810169700232691245198563334381644423928208150541243313974200020002878128496285802848828719
64004286942151161180000010150002815102234932000200020001000050016334283642888231020004000200028628285991161001100010001200336200400022002646211368310188719632871136195073369381639333128289146591263014078200020002873528592284942860528763
6400428656215112019100008049952842300236452000200020001000070016405281242857531020004000200028705284721161001100010000200450200400022004620221349810128713733661038196253414381341363228204144761226214133200020002877528788286492887128769
640042882921511911710000804967283902023370200020002000100016001641628367286933102000400020002871528600116100110001000020044620040022200042621138381020070613391939197073202381442393528284148281271613942200020002885928735287152844628731
6400528742215116117100003150942839000236092000200020001000050016429284062891331020004000200028552285061161001100010000200320200200222000040211402010076710633381039197043393381543403428155149441269413880200020002874528636286562856428673
640042875621411401400000304939283470023501200020002000100003091642128396288963102000400020002846228512116100110001000020043020070022200062022139029945711133601240196893417381541403528125147951267114029200020002879228646287082869428636
64004286902141101141000010148992841700236072000200020001000050016263284132865131020004000200028437285301161001100010000200336200200222000446211375810159706833391034198643342381340443228225147371289514080200020002861128448283642865528674
640042871121511411310000804950281560023319200020002000100004001641428252285353102000400020002839428597116100110001000020033020030022200042622137521016171133305740195813404381443323728201152961292914131200020002883228619285402875228824
640042859421511411410000605055282590023561200020002000100004001640528265286773102000400020002850728507116100110001000020044020040022200002620138421015871473330738196703327381048424128101144561243014145200020002869428517287162843828670
6400428723215112017111002050392828500234512000200020001000000016411282772860431020004000200028629286501161001100010000200336200301142000426211354610044705833971144196733294381441373528301152131285014345200020002871628700284752857928801

Test 2: throughput

Count: 8

Code:

  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067599111101580028002625532716011210016001210016001650080146308002280041800410612160116200320032200160016800428004111802011009910010080000800001100160019134316005602511600436113431311115116016008003913051600001600001008004280042800428004280042
320204800415991100116100280028255328160112100160012100160016500801448080022800418004106121601162003200322001600168004280041218020110099100100800008000001001600181443160018001316004361514313011151170160080038131351600001600001008004280042800428004280042
320204800415991001001210280027055027160112100160012100160016500800124080022800418004106121601162003200322001600168004180041118020110099100100800008000001001600181343160057005516004301524313011151170160080038131301600001600001008004380042800428004280042
3202048004159910110018002800262550271601121001600121001600165008001370800228004180041071216011620032003220016009080041800421180201100991001008000080000010016001712016005700521600436113013111151160160080038131351600001600001008004280042800428004380042
320204800415991011115800280026205327160100100160000100160000500800042080015800408004003221601002003200002001600008004080040118020110099100100800008000011001600141401600533152160000615201310005109117118003701351600001600001008004180041800418004180041
320204800405991000015710280025255325160100100160000100160000500801382080015800408004003351601002003200002001600008004080040118020110099100100800008000001001600131401600140251160000005243130000510911711800370051600001600001008004180041800418004180041
320204800406001110015700280090050025160100100160000100160000500801386080015800408004003221601002003200002001600008004080040118020110099100100800008000001001600121443160051105116000000124313000051091171180037131301600001600001008004180041800418004180041
3202048004059910101113000800252550251601001001600001001600005008013770800158004080040032216010020032000020016000080040800401180201100991001008000080000010016001314431600540013160039001301320005109117118003701351600001600001008004180041800418004180041
3202048004060011111172010280025250025160100100160000100160000500800043080015800408004023221601002003200002001600008004080040118020110099100100800008000001001600121401600520013160000001301310005109117118003701351600001600001008004180041800418004180041
320204800405991111005800280025255325160100100160000100160000500800048080015800408004003221601002003200002001600008004080040118020110099100100800008000001001600141343160052005216000001524312000051091171180037131301600001600001008004180041800418004180041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320025800546000001004201028002521212025160010101600001016000050800853418001580040800400322160010203200002016000080040801021180021109101080000800000101600003516003600361600006132400005019417238003701402160000160000108004180041800418004180041
32002480040600000000420102800252121202516001010160000101600005080085341800158004080040032216001020320000201600008004080040118002110910108000080000010160000401600000036160036610350005019417448003701002160000160000108004180041800418004180041
3200248004059900010042000280025012002516001010160000101600005080037581800158004080040032216001020320000201600008004080040118002110910108000080000010160000016000000361600000132000050194174380037014102160000160000108004180041800418004180041
320024800405990001104201028002520002516001010160000101600005080037481800158004080040032216001020320000201600008004080040118002110910108000080000010160000016003601016003601323500050194174380037014140160000160000108004180041800418004180041
320024800405990000003801028002521212025160010101600001016000050800377708001580040800400322160010203200002016000080040800401180021109101080000800000101600003516003600361600006104000050194173480037010142160000160000108004180041800418004180041
32002480040599000000000028002521212025160010101600001016000050800000718001580040800400322160010203200002016000080040800401180021109101080000800000101600003516000000321600320104000050194174480037014102160000160000108004180041800418004180041
320024800406000001103801008002521212025160010101600001016000050800000818001580040800400322160010203200002016000080040800401180021109101080000800000101600003516003600361600000104000050193174480037014140160000160000108004180041800418004180041
320024800405990000004200028002501212025160010101600001016000050800374718001580040800400322160010203200002016000080040800401180021109101080000800000101600003516000000391600326004000050193174480037014142160000160000108004180041800418004180041
32002480040599000000380000800252121202516001010160000101600005080037471800158004080040032216001020320000201600008004080040118002110910108000080000010160000351600320001600320136400005019417448003711402160000160000108004180041800418004180041
3200248004059900001042000280025201202516008610160000101600005080037281800158011780040032216001020320000201600008004080040118002110910108000080000110160000016003601361600000136400005019417348003700100160000160000108004180041800418004180041