Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 2D)

Test 1: uops

Code:

  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.006

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.006

retire (01) | cycle (02) | 03 | 04 | 05 | 07 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
660052919421910100001614593287380016841600630063000300030001500035721132303429102292003106000300030003000900029101291121161001100010000300003300000030045149001288491276856306552200383034380611354328408159931322014933300030002930029220292182923029241
6600429252218000000006145292879210167966009300930003000300015000357145230222906929258310600030003000300090002916429204116100110001000030000630000013001500600129239238684530423320028308338105404628379162321316814941300030002925029234292842934029184
66004292382200000000001455328704001680260093003300030003000150003576652301629115291773106000300030003000900029111291741161001100010000300006300100030045116001292791586924307737200653048381211384428305163751307514905300030002927629246292722927429253
6600429215219000000000045262878500169506000300030003000300015000357490230312915729204310600030003000300090002918129128116100110001000030000030000073000514900127649144681730733320040311438117393328415164491335315014300030002917829251292552926329279
66004292182180000000060450728819001690160003009300030003000150003578562302229138292143106000300030003000900029181291221161001100010000300556301102530033146001276692426824309637200773026381312353228419163291325915019300030002916729270292452924828848
6600429245218000000009046002879300169206006300630003000300015000357815229952905029233310600030003000300090002917428992116100110001000030000030010073001511600127138994691630443520087307338189333028454161581317614964300030002923329290292232925829339
66004292212190000000001450028879301691360093000300030003000150003573942298129171292563106000300030003000900029167290781161001100010000300006300100430015119001273691336812308336199763084381010383228364165661353015095300030002929429334291262930029243
6600429286219000000006146642883700168626006300030003000300015000357280230072910229207310600030003000300090002919529156116100110001000030000930040013004510900128799089689030613720053308838174373028365164611327614917300030002928629323292532928529193
6600429313220000001009144692875900169316006300030003000300015000357603230262918329269310600030003000300090002921229140116100110001000030000630010013001001600128919133693530973320067305338177362928358164361337414910300030002925829305292632935029267
66004292292180000000001466828838001687060003006300030003000150003583232296929168293133106000300030003000900029169290771161001100010001300000300110430015106001286891326879305935200533044381411323628396164521327814854300030002922728803293112928728986

Test 2: throughput

Count: 8

Code:

  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  ld3 { v0.2d, v1.2d, v2.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
48020580098600000011420100280047001952548013610024000824000010024002424002050035015574908111080041800578005740728480145200240024240024200240024720072800628005711802011009901001008000080000010024001718422400560016024004051560172111511701600800639902400002400001008006880067800688005080068
4802048005060011000042000918003031905254801341002400452400001002400242400225003501859576338818002580057800570072748014720024002424002420024002472007280057800571180201100990100100800008000001002400010392400370103724003750373900111511701600800596602400002400001008005880045800638004680063
480204800626000000000000018003031519025480137100240035240000100240024240024500351984757380761800258004480057406334801452002400242400242002400247200728006280057118020110099110010080000800000100240000039240042000372400425100001115116017008005410602400002400001008005880058800588005880058
480204800576000000000000618004730155254801361002400072400001002400242400225003522640566775208003880056800451006334801472002400242400242002400247200728005680053118020110099010010080000800000100240000039240037000024004201373900111511701600800540002400002400001008006580045800458004780045
4802048004460210111164000938022531515525480142100240043240000100240000240000500325316655934311800378006280044503444801002002400002400002002400007200008006280044118020110099110010080000800000100240000038240042010422400420137000000510911711800546602400002400001008006380063800578005880058
48020480062600000000990101080029001510254801061002400382400001002402012400005003508328565490608004380057800571003444801002002400002400002002400007200008005680057118020110099010010080000800000100240000038240037000024004001370000005109117118004110002400002400001008004580063802508006580066
480204803776001111006501012800293019025480102100240039240000100240000240000500355357743467291800388005680059100344480100200240000240000200240000720000800578005711802011009911001008000080000010024000003924003700062240038515642172000510911711800679902400002400001008006880068800508006880068
480204800796001001004501000800290150102548014210024003724000010024000024000050035243295534190080043800448004400344480100200240000240000200240000720000800568005711802011009901001008000080000010024000003924000000048240000504238000005109117118004101002400002400001008004580058800578005880063
4802048006960000001042010208004201505254801061002400402400001002400002400005003524458576262918004380062800620032648010020024000024000020024000072000080057800571180201100990100100800008000001002400000024003700061240037514145000005109117118005410002400002400001008004580045800588004580058
480204807135990000004201071800423150525480143100240034240000100240000240000500349494958713960800438004480057503264801002002400002400002002400007200008004480056118020110099010010080000800000100240000002400000004024004251045000005109117118005961002400002400001008006380058800588006380102

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 18 | 1e | 1f | 22 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
48002580062599010004201042800420151942548004310240010240000102400002400005035244585762629180038080057800574033848001020240000240000202400007200008005780057118002110901010800008000011024000039024003703724004251039050191117121380054106240000240000108007380068800458004580063
4800248005759900010420000080047319010254803151024003424000010240000240000501676441554234108002508006280062003494800102024000024000020240000720000800628006011800211091101080000800001102400003902400370422400000137005019121710118005906240000240000108007080063800638004580063
480024800475990001042000118002901915025480017102400362400001024000024000050349406555329751800430800628006210034448001020240000240000202400007200008004480056118002110901010800008000001024000039024003604224004251374505019121710680054106240000240000108008080058800578006380059
4800248005759900000420105180029319190254800181024004324000010240000240000503553577553419018004308006280062003444800102024000024000020240000720000800628004411800211090101080000800001102400003802400371422400005137390501911177128005406240000240000108007080073800588006080045
48002480062599000004201082800473191952548005410240042240000102400002400005034949493603255180043080044800574034448001020240000240000202400007200008006280057118002110901010800008000001024000000240042037240036010450501910177128005966240000240000108039280181800728006380045
4800248006259900000471321022800293191910254800491024000424000010240000240000503529181360325018003808006280057403394800102024000024000020240000720000800448005711800211091101080000800000102400003802400370422400425142390501911177108005906240000240000108005980045800588005880063
48002480057600000004200010800293151802548004710240042240000102400002400005035535775586488180025080062800574034448001020240000240000202400007200008004480057118002110901010800008000001024000039024003703724003751374505019717101080059100240000240000108005780063800638005880045
48002480057599000000000228004731515102548001610240033240000102400002400005016764415577915180025080064800480033948001020240000240000202400007206398028080044118002110901010800008000001024000039024000000240037513645050191117111480054100240000240000108006380057800588005880045
48002480058599000004200000800470191910254800471024000124000010240000240000503524329555604508004308004480062003394800102024000024000020240000720000800628005711800211090101080000800000102400003902400360412400425142005019131712780054100240000240000108016780065800458006380045
4800248005660000010000001800293191502548005210240040240000102400002400005016764413603287180043080057800574033948001020240000240000202400007200008004480057118002110901010800008000001024000000240042002400425042005019121711138005966240000240000108006280146802158007180045