Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple structures, 8H)

Test 1: uops

Code:

  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 12.014

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.014

retire (01) | cycle (02) | 03 | 07 | 0a | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
720052844221118220080105114280594214234120148014400080004000203379827780024785280352817831012000400080004000200002846528213116100110001000040008400402400001480137931028072583508750181033485381416454827902140851173811930400080002825628271283812821028265
72004285352111214008000521528127401421312000800040008000400020336983219002477928044283073101200040008000400020000282332813911610011000100004000840040040000000013450961772583508838180063401381314444327878151751211911780400080002813328446282462803128087
7200428113210191400100105114281370414084120008014400080004000203419823230024720282392808331012000400080004000200002820328426116100110001000140008400400400441400133361027671663433741181093421381013424427870139981172512046400080002850228645283812840028125
72004286842122117008000516728127041417212014800040008000400020343980764002471928135280703101200040008000400020000281502829411610011000100004000840040440024048013918105157208324294018410351738059424327875141871204412299400080002814028155281682813228091
7200428193211161210801050502812104140851201480144000800040002034098247120024726280882805731012000400080004000200002831928185116100110001000040000400406400440480139541025971553501946181253577380411494427988151071177012255400080002812728213282372834128098
72004281542101619000000526527928401406212014801440008000400020347983877002471128103281823101200040008000400020000280972814711610011000100004000840000240004140013927989370373497944181983437381117455627975142031155212025400080002810428276281362817728055
7200428157211131901000051862806340140291200080144000800040002033098295600247152815328533310120004000800040002000028075280431161001100010001400084004044004404801380710298723834551137181293407380913444227958139741169612087400080002827428099284802815928290
720042854721215180080005185281894414243120148000400080004000203359828190024712281372834931012000400080004000200002808828176116100110001000040008400407400400080137801023970373337639181093402381615444627939145541190312051400080002819528292283912844328099
72004285722121916009000508428050441426512014801440008000400020340983093002470028024281543101200040008000400020000281862841111610011000100004000840030440024028014077102687008346654218011342938127434027777141671173811858400080002811528192282042807928064
720042805021116130080105237279994413992120008000400080004000203439829180024742280652809931012000400080004000200002819228063116100110001000140008400002400441480138811014872223435844181303362381712454627873140301174612223400080002804228188284342823028060

Test 2: throughput

Count: 8

Code:

  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
960205160072119910100100006301031600542660025960541100640068320000100640000320000500168123611520028116003001600691600694035196054420032000064000020032000016000001600691600491180201100991001008000080000010032001413440320053000533200006153441300051094174416006613130320000640000100160070160070160070160050160070
960204160049119910100100001301031600542665025960156100640060320000100640000320000500168123611520028016005001600491600503035196010020032000064000020032000016000001600711600491180201100991001008000080000010032001314440320053001533200406113441310051094174416006613132320000640000100160070160070160050160070160164
9602041600691199100001000013000216005400000259601681006400163200001006400003200005001680098217607520160050016006916004830331960100200320000640000200320000160000016006916006911802011009910010080000800000100320014134403200530015332003900534413200510941744160045000320000640000100160050160070160050160070160070
96020416006911991010010000130103160054260502596016010064006832013010064000032000050016815652176075201600500160049160049660351960100200320000640000200320000160000016006916004911802011009910010080000800000100320013140032005302056320000605344130005109417441600681302320000640000100160050160050160070160070160070
96020416006912021000000000590000160034206502596011610064005632000010064000032000050016812362176075201600300160049160069303319601002003200006400002003201241600000160069160069118020110099100100800008000001003200141343032005300056320040605344130005129417431600660132320000640000100160070160070160070160070160070
9602041600691199100000000059000316005426660259601681006400683200001006400003200005001681238217607521160050016004916006930273196010020032000064000020032000016000001600691600691180201100991001008000080000010032001414440320053101143200406013441310051094174416006613134320000640000100160070160052160050160070160070
96020416006911991010000000130000160054166002596015610064006032000010064000032000050016812492176075201600300160069160071003519601002003200006400002003200001600000160049160069118020110099100100800008000001003200131400320054000133200406153441310051094176416004613132320000640000100160072160050160050160070160050
96020416006911991000010000590003160034266002596011610064006832000010064000032000050016812442176075201600300160049160049303319601002003200006400002003200001600000160069160049118020110099100100800008000001003200141344032005410153320040615344131005113517421600660132320000640000100160070160070160071160070160070
960204160069119910100000005900001600542065025960156100640016320000100640000320000500168123621760752116005001600491600690033196010020032000064000020032000016000001600491600691180201100991001008000080000010032001513440320053111563200006112013000510941731116006613132320000640000100160050160070160050160070160204
960204160069119910000000001300041600542665025960168100640016320000100640000320000500168009821760752016005001600691600490035196010020032000064000020032000016000001600691600691180201100991001008000080000010032001513440320013100533200406113441320051094174416016613130320000640000100160070160070160070160070160050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 05 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d0 | icache miss (d3) | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
960025160059119900057400016004931200259600741064006432000010640000320000501680881185600120160029016004316006400346960010203200006400002032000016000001600431600591180021109101080000800000103200003532003710320037610400005019003170331600561010032000064000010160065160065160066160065160065
9600241600641199010981021600493012025960074106400003200001064000032000050168088121759996016004001600591600640034196001020320000640000203200001600000160043160043118002110910108000080000010320000353200321503200326132350005019003170321600611410132000064000010160060160065160065160060160044
96002416004311990001041001600493120025960074106400643200001064000032000050168041318560012016004501600641600640034696001020320000640000203200001600000160064160059118002110910108000080000010320000353200370032003761370000501901317023160061010132000064000010160065160044160921161648160044
96002416006411990001251001600283121202596006610640000320000106400003200005016800018160000116004001600641600590034196001020320000640000203200001600000160059160059118002110910108000080000010320000032003700320037603200005019003170231600611010132000064000010160044160065160065160060160060
960024160064119800043102160049312120259600101064000032000010640000320000501680890185600120160045016006416006400346960010203200006400002032000016000001600641600591180021109101080000800000103200003532003200320037013240000501900317033160061140032000064000010160044160065160066160065160142
96002416004311990006910016004431212025960449106400563200001064000032000050168088181600000160024016005916006400346960010203200006400002032000016000001600431600591180022109101080000800000103200000320037037320037610400005019003170231600611010032000064000010160065160065160060160044160060
960024160043119900043102160028301202596006610640000320000106400003200005016808812175999601600460160064160154003419600102032000064000020320000160000016004316005911800211091010800008000001032000003200006440320000613200005019003170331600561410132000064000010160065160065160065160044160044
960024160064119900038100160028312120259600741064006432000010640000320000501680881185600120160024016006416004300348960010203200006400002032000016000001600641600431180021109101080000800001103200003532003200320037613700005019403170331600561010132000064000010160067160065160065160065160065
9600241600591199000720021600283121202596006610640056320000106400003200005016808811856001201600450160064160064003259600102032000064000020320000160000016005916004311800211091010800008000001032000035320037032320037600350005019003170331600401414132000064000010160060160044160065160060160044
96002416005911980001090021600283121202596007410640056320000106400003200005016808811856001201600240160059160064003419600102032000064000020320000160000016006416005911800211091010800008000011032000003200000323200370137400005019003170331600611410132000064000010160065160065160044160061160065