Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 2S)

Test 1: uops

Code:

  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.009

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 3.009

retire (01) | cycle (02) | 03 | 04 | 07 | 08 | 0a | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
650052932922011400108104543287960021699050093012200030002000100003573362292102904129247310500020003000200060002914229129116100110001000020006200200420044061332690696874309438420213319538091662571028336163961327215024200030002938529327294052939829220
65004292672203100006004568288490221698750123009200030002000100033574632289202910529375310500020003000200060002916329066116100110001000020006200000220004241287391286853307226220188305838081559611028408163151332914986200030002927729376293022936029379
65004292362204000006004556288740221703350093006200030002000100003579632295402905029358310500020003000200060002921729111116100110001000020006200000220024261296892406820308036320269313138151352531128419165971333715025200030002928229291294182925329407
6500429286220300000310459728815020171065006301220003000200010000357708228970291632939431050002000300020006000292622919711610011000100002000620000022004404128009041690231310642023730313823236766928405163951344415220200030002928529318293102937429305
65004292402198100008004576288370021694450063009200030002000100003575092290602907529316310500020003000200060002913529100116100110001000020006200200020024261282891046913306306320158311538171968661228412163251325514961200030002930429303293652936329458
65004293042197000006104466287950221707750093012200030002000100003577332292702910629257310500020003000200060002919129172116100110001000020006200400220024241285891126928306707620161312138152063561028456162781335214941200030002929829238292622924529334
65004293472195000008104577288280001692050093006200030002000100033574932290002908929363310500020003000200060002918229185116100110001000020004200400220004261300790476955309116720243310038111865671028464159961321214948200030002917429279293412935729340
65004292802196000008004599288130011700450093012200030002000100003578352287202913429324310500020003000200060002911129167116100110001000020006200000020044261277190456895312316220176311538141970701228365162391325414926200030002923929192292752922629345
65004293092205000004004606287970101696150123009200030002000100003576652290102912229168310500020003000200060002913529158116100110001000020006200200220024061285691626858308636520277308538121260621128479164111337115023200030002936329337294402916629229
65004293632205000008104544287880201706350063009200030002000100003577922290602907429253310500020003000200060002912829226116100110001000020004200000420024261307992876937316846120203308038171771661028454164941325215240200030002933629359293152930129296

Test 2: throughput

Count: 8

Code:

  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  ld3 { v0.2s, v1.2s, v2.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400205800696000101103801008002621200254001571002400001600001002400001600005008003772881982180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516000000016000061323505109117118003810101600002400001008004280042800428004280042
4002048004160000000038010080026212120254001581002400581600001002400001600005008003772881975080022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600000160032003516003261035051091171180038110101600002400001008004280042800428004280042
4002048004160000000038010080026012120254001571002400571600001002400001600005008003742882019180022801288004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516003200016003200035051091171180038110101600002400001008004280042800428004280042
40020480041599000000380100800262121202540015710024011116015210024000016000050080037428819681800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000035160032003216000061323505109117118003810101600002400001008004280042800428004280042
4002048004160000000038010080026212120254001581002400581600001002400001600005008003772882023080022800418010503234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516003200016000061324005109117118003811001600002400001008004280042800428004280042
4002048004160000000038000080026212120254001571002400571600001002400001600005008003772880000080022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516003200321600006100051091171180038010101600002400001008004280042800428004280042
4002048004159900000031117610080026212120254001571002400581600001002400001600005008008532880000180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516000000016003200324005109117118003811401600002400001008004280042800428004280042
4002048004159900000000000800262121202540010010024000016000010024000016000050080000028800001800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000035160036000160032613240051091171180038110141600002400001008004280042800428004280042
4002048004160000000000000800260121204640010010024005716000010024000016000050080000028800000800228004180041032340010020016000024000020016000048000080041800411180201100991001008000080000010016000035160032014116003261035051091171180038114101600002400001008004280042800428004280042
400204800415990100004200108002601200254001571002400631600001002400001600005008003742881968180022800418004103234001002001600002400002001600004800008004180041118020110099100100800008000001001600003516003200016003261035051091171180038110101600002400001008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 04 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400025800556000000005401028002621212254000681024006316000010240000160000508000002881982018002280041800410032340001020160000240000201600004800008004180041118002110910108000080000110160000351600320351600366132000502231112217002318800381010160000240000108004280042800428004280042
4000248004159910100054010080026212122540001010240063160000102400001600005080096028897570080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600003516003603616003660035005020000123170018238003801410160000240000108004280042800428004280042
4000248004160010100000100800262121225400010102400631600001024000016000050800886288341600800228004180041003234000102016000024000020160000480000800418004111800211091010800008000001016000001600000016003661323500502030012317002212800381010160000240000108004280042800428004280042
400024800415991010004201028002621212254000731024006316000010240000160000508003772881982008002280041800410032340001020160000240000201600004800008004180041118002110910108000080000110160000016003600160036003200050223001131700231280038000160000240000108004280042800428004280042
400024800416001010003801028002620124740044110240000160000102400001600005080007528819770080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600003516003603616003261360005020000123170019238003801414160000240000108004280042800428004280042
400024800415991010004201028002621202540007310240000160000102400001600005080037728833270080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600003516003603616000060324000502000012317002322800380010160000240000108004280042800428004280042
4000248004160000000042010080026012122540006710240063160000102400001600005080000028820210080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600003516000003616003261000050203001231700182280038000160000240000108004280042800428004280042
400024800415991010004201008002601202540001010240063160000102400001600005080000028833270080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600003516000003616003261035005020300115170022138003801410160000240000108004280042800428004280042
40002480041600101000420100800260121225400010102400581600001024000016000050800000288197500800228004180041003234000102016000024000020160000480276800418004111800211091010800008000001016000035160000039160049610000502030012317002223800380140160000240000108004280042800428004280042
400024800416001010013801028002621202540007310240063160000102400001600005080375128819750080022800418004100323400010201600002400002016000048000080041800411180021109101080000800000101600000160000236160032613635005020000123170013238003801014160000240000108004280042800428004280042