Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (single structure, H)

Test 1: uops

Code:

  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.003

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 3.003

retire (01) | cycle (02) | 03 | 04 | 07 | 09 | 0a | 0e | 0f | 18 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
64005287912138112000215035283481101620540063006100030001000500035712722899284012833331040001000300010006000283092821911610011000100001000001002011000122130391024071703378074197233190381210485028118144361247713469100030002873228760283702831028314
6400428493212900000001518828425111165264000300610003000100050003571122291128212284343104000100030001000600028330281881161001100010001100002100202100010213098932272183213152192533171381517535428073155661256313034100030002842728782283152876028361
64004287912169000000015001281591101612440033006100030001000500035599322904282332880531040001000300010006000287182837911610011000100001000021002021001022130931001268813383250193153132381916555327967157411250113493100030002840928395287982874228158
640042841821340000003151232808811016612400630031000300010005000357041222921283582873731040001000300010006000282492866011610011000100001000021001011001112136291008470133221450194173094382115585427913154631263713454100030002839028339282892824328423
640042848021560000002147892809611016582400330031000300010005000356253229052863828475310400010003000100060002821928350116100110001000010000210020510021201384394677217342015519338312738168495027896147361237313423100030002836528325283722874728365
64004282982153010000315030280971111615340063003100030031000500035709522891286332844031040001000300010006000283552825111610011000100011000021001041000022138071009970733333158198433164380513605927936157861282013590100030002843728341283982836728292
6400428383213300000020510628034000165324006300610003000100050003560822287428607288003104000100030001000600028343282201161001100010000100000100101100210013862985171093360260198623193381820545728079155401329113580100030002833628324288202880228472
64004284172136000000014689285390001625140063006100030001000500035716322833285502845636340001000300010006000283362828111610011000100001000021002021000100131141015769403399150198033284381415625228059145781264314026100030002879328424284182854428700
6400428706215600000021505128047011163484003300610003000100050003574302286228626283673104000100030001000600028267286391161001100010000100002100002100112013075942368843175151193133392381718535227904156151334013539100030002877228355288032833728376
640042884221340100007147732817201116202400330031000300010005000356983228642829128259310400010003000100060002856728688116100110001000010000210010210001221380493667143338625919720340338199575527977157311323513334100030002842228302284692837528581

Test 2: throughput

Count: 8

Code:

  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  ld3 { v0.h, v1.h, v2.h }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320205160062119800000000017010016003906615986125320106100240000800001002400008000050040002022792366016003501600541601057992238003632010020080000240000200800004800001600541600621180201100991001008000080000010080006923800260122980020617070005110021722160059199280000240000100160063160063160047160063160063
320204160062119810111110060801011600471661598817132030010024004880174102240504801745004000332279400111600430160062160062799303800443201002008000024000020080000480000160062160062118020110099100100800008000001008006472381228002298002061252360005110021722160059099280000240000100160063160063160063160063160047
3202041601591199102200065320101160047166159881253201121002400128000010024000080000500400043227907631160043016006216004679930380044320100200800002400002008000048000016006216006211802011009910010080000800001100800077238002701226800206125061005110021742160059099280000240000100160063160063160063160047160063
32020416006211991010100003201001600310661598812532011210024001280000100240000800005004000392279285711600270160062160062799303800283201002008000024000020080000480000160062160062118020110099100100800008000001008000762480027001268001861262470005110021722160043190280000240000100160047160063160063160063160063
3202041600621199100101100320101160047166159881253202061002400128005810024000080000500400033227940010160043016006216006279930380044320100200800002400002008000048000016004616006211802011009910010080000800001100800088248002600026800186172372005110021722160059099280000240000100160063160063160063160063160047
32020416006211991001100006010116004716615988125320112100240012800001002400008000050040003322794001016004301600621600627993038002832372020080000240000200800004800001600461600621180201100991001008000080000010080007824800260517800206125060005130121722160059009080000240000100160047160063160047160047160047
32020416004611991011100007010116004710615988125320106100240012800001002400008006150040000322794001016002701600621600627991438004432010020080000240000200800004800001600461600621180201100991001008000080000010080008824800270017800196126070005110021722160059109280000240000100160047160063160063160063160063
3202041600621199100010000320001160031166159881253201121002400128000010024000080000500400033227940010160043016006216006279914380044320100200800002400002008000048000016004616006211802011009910010080000800000100800077248002800026800206172361005110021722160043199280000240000100160047160063160063160063160063
32020416006211991011000005200111600470061598532532011210024001280000100240000800005004000332279400101600430160062160062799303800443201002008000024000020080000480000160062160046118020110099100100800008000001008000792380026000258001960262471005110021722160059009280000240000100160063160063160063160063160063
320204160062119910111100070001160047166159853253201061002400128000010024000080000500400000227940040160027016006216006279930380044320100200800002400002008000048000016004616006211802011009910010080000800000100800097238000801025800206172362005110021722160059099080000240000100160063160063160047160063160047

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d2 | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002516006211991001112141100160047166159869253200221024001280000102400008000050400034227925971116004316006216005579930380044320010208000024000020800004800001600621600661180021109101080000800000108000782480027002428001861272370050200121700141016005200908000024000010160063160056160063160056160056
320024160055119910000012150001600401001615988125320028102400188000010240000800005040033822791228011600431600621600627993038004432001020800002400002080000480000160062160055118002110910108000080000010800076248002700126800180125071050200171700141416005909928000024000010160056160056160063160063160063
32002416006211991101001367000160040106159890253200221024001280000102400008000050400013227940010116240216006216006279923380044320010208000024000020800004800001600621601131180021109101080000800000108000782480018000188001860172460050200131700141616005200908000024000010160056160067160067160067160063
3200241600621199101100142000160051106615988125320022102400128000010240000800005040003822792597011600361600551600627993038004432001020800002400002080000480000160062160062118002110910108000080000010800089238002600337808001960182970050200131700131316006306628000024000010160063160067160063160063160063
32002416006611991111008410016004716171598812532002210240012800001024000080000504000392279400101160043160062160055799303800443200102080000240000208000048000016006616006211800211091010800008000001080007808001800126800106126071050200141700111116005909928000024000010160063160063160056160056160067
3200241601061199101100113500016004010617159881253201161024001280000102400008000050400033227925970116003616006216006279930380044320010208000024000020800004800001600621600551180021109101080000800000108000882480017000268001801182461150200171700181416005910028000024000010160067160056160063160056160063
320024160062119911110113760001600511106159869253200221024001880000102400008000050400038227925970116004316005516006279930380044320010208000024000020800004800001600661600621180021109101080000800000108000782480027001208001961262972050200131700131116005209908000024000010160063160063160063160063160063
3200241600551199111100713100160051106159881253200221024001280000102400008000050400004227940010116003616005516006279923380048320010208000024000020800634800001600621600621180021109101080000800000108000782380027000398001161262371050200121700161616005919028000024000010160063160063160063160067160063
320024160055119911110011030001600471116159881253200221024001280000102400008000050400013228232800116004716039916005579934228003432001020800002400002080000480000160066160062118002110910108000080000010800079080025000268001060262400050200141700121216005909928000024000010160056160063160056160056160063
3200241600621199111111124510016005111014159890253200221024001280000102400008000050400062227948700116004316005516006279930380044320010208000024000020800004800001600621600552180021109101080000800000108000882380018001278002061172361050200131700111216005910928000024000010160063160063160063160063160067