Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 16B)

Test 1: uops

Code:

  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 07 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c9 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
6400529462220141100001200455229115000233784000400040002070561695728760293123104000400040002914229324116100110001000040000124007008400051812001265590176841315558420289309238132063592847716584134361526340002935529415294932938829437
6400429433220400100000045752913324023346400040004000207088169172876329506310400040004000291612926211610011000100004000004007001040000180001283890116814312626320423303138101560582841016348134301534440002941429381293462942629394
640042935621940010001310460329137000233844000400040002071361695728708293903104000400040002912929221116100110001000040000124006022400661612001280989346846312036220449311838121760592841716563133741521740002930729383293932942529356
64004294032206000000010447029315001233804000400040002071981694028676293811248400040004000292522922811610011000100004000012400700840076178401297491086861310326820395314838121060662840916315133801487340002941229383294022941729417
6400429276220600100014004562294890102341540004000400020714716936287142940131040004000400029313292062161001100010001400441240070247340005170901276490806834309937020446307838131765592848216363135971504640002946429541294452936829482
640042950222070010001200453929126000233894000400040002085961693828705294473104000400440002933429346316100110001000040000124002000400801812001274691736843311016420470307138071366612843416516134051502740002940529355293182937929396
64004294212324110010000466829372001236024000400040002070331693428884295173104000400040002920229188116100110001000040000124008109400760712001273390446880307666320222307138151559592846015852133151521540002933729338293802933729383
640042944122140000001900458629129200233634000400040002071831693328759294493104000400040002923329149116100110001000040000124007007400261312001277890596828306716920380308838081862642846916555133121527740002944229373294362945029316
6400429360220500000000044862910704023377400040004000207095169162865229341310400040004000292072920211610011000100004000012400600640066108001287291176847303337220347307338101460602836416342134591514240002943929410294642948229349
64004293802203100000000451829106030234184000400040002071031694628715293203104000400040002919829253116100110001000040000124007007400660012001280291846880303736420399306138141364612872416537134481519240002946529460293482926029511

Test 2: throughput

Count: 8

Code:

  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673180001000000010011067192121216253201001003200001003200175004673844010668201067271067072667762670032011720032002420032002410672710670711802011009910010080000800000100320000043032003900393200396039000005109011711106724010104320000100106728106728106728106728106938
3202041067277990001000550000110671221200253201001003200001003200005004709628010670201067271067272667532670932010020032000020032076810674510672711802011009910010080000800000100320000043032003900393200396139430000510901171110672401004320000100106708106728106728106708106708
320204106727799000000045000011067122120162532010010032000010032000050047281830106702010672710672726675326712320100200320000200320000106727106727118020110099100100800008000001003200000430320039004232003901394300005109011711106724010100320000100106728106728106708106728106728
3202041067278000000000540100110671220120253201001003200001003200005004709628010670401067271067272667532670932010020032000020032000010672710672711802011009910010080000800000100320000043032003900032003961404300005109021711106724010104320000100106728106708106708106728106728
32020410670779900000004501001106712212121625320100100320000100320000500470962801067020106707106727266753266893201002003200002003200001067271067271180201100991001008000080000010032000000032003910403200396139430000510901171110672800104320000100106731106728106728106728106728
320204106727800000000045000001067122120162532010010032000010032000050047096280106702010672710672726675326709320100200320000200320000106729106727118020110099100100800008000001003200000430320000003932000060394300005109011711106704010104320000100106728106728106728106708106728
320204106707799000000045010011066922121202532010010032000010032000050047281830106702010672710672726675326711320100200320000200320768106804106742118020110099100100800008000001003200000430320039003932003901394300005109011711106704010104320000100106728106728106728106728106728
3202041067077990000000450000110671201212162532010010032000010032000050046988801106702010672710672726655326689320100200320000200320000106727106727118020110099100100800008000011003200000430320039003932000061394300005110011711106724010100320000100106728106728106728106728106728
32020410672780000000004501001106712212121625320100100320000100320000500470962811066850106730106710266553267093201002003200002003200001067271067271180201100991001008000080000010032000004303200390003200006139000005109011711106724010100320000100106728106728106728106728106708
320204106727799000110045010011067122121216253201001003200001003200005004673754110670201067071067272667532670932010020032000020032000010670710672711802011009910010080000800000100320000043032003900393200000139000105110011711106724010104320000100106708106728106708106731106728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3340

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3200251067238300000411110669220181225320010103200001032000050468510711069031067221067222667032668932001020320000203200001067271067071180021109101080000800000103200000032003500035320039613539000502014170121410671960232000010106727106728106734106723106725
32002410670779900004101106692318181225320010103200001032000050468510711066971067221067222667032670432001020320000203200001067221067221180021109101080000800000103200000393200350003532003561039000502012170141510670466232000010106723106723106723106726106708
32002410672279900004111106707218181225320010103200001032000050468336011066821068031067412667032670432001020320000203200001067071067221180021109101080000800000103200000393200350009232003561353900050199170101410671966232000010106725106723106716106728106762
320024106725799000041001067070181812253200101032000010320000504698880110669710672210672226670326704320010203200002032000010672210672211800211091010800008000001032000003932003500035320035613539000502014170141410671966232000010106723106723106708106723106723
32002410672280000004111106692218181225320010103200001032000050470583711066971067221067222667032670432001020320000203200001067221067301180021109101080000800000103200000393200350003832003661039000501916170161310670406232000010106723106723106811106708106708
32002410670779900004111106692218181225320010103200001032000050468510711066971067221067072667032668932001020320000203200001067221067111180021109101080000800000103200000393200350000320036613539000502014170141510685666232000010106723106708106723106723106708
32002410672279900000101067102018122532001010320000103200005047058371106820106722106722266703266893200102032000020320000106722106722118002110910108000080000010320000039320000000032003561039000502012170131310671966232000010106723106723106723106727106723
32002410672680000001211106707218185483201401032013010320890504668429110669710674910672626674326708320010203200002032000010672210672211800211091010800008000001032000003932003601041320035613539000502012170151210671966232000010106723106723106727106727106708
32002410672679900004111106707218181425320010103200001032000050472291311070111067071067222667032670432001020320000203200001067221067221180021109101080000800000103200000393200350003532000061350000502014170121210671966232000010106723106708106723106723106708
32002410672279900004111106707218181125320010103200001032000050470299211066971067221067072667032668932001020320000203200001067071067221180021109101080000800000103200000393200000003532000061350001502012170151110671960232000010106723106723106708106740106723