Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, four registers, 1D)

Test 1: uops

Code:

  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
640052926821902500170001701004544288750122415420002000200010000601625128594293063102000400020002909629046116100110001000020000620020000200040460012952929968483051849202893267381512514728346164601359015538200020002929329223293252928929275
640042924621901700140007010046572881222224096200020002000100005016255288432930431020004000200029119291781161001100010000200006200000042000400600138299387691530841241202863082381810454428333163861359015467200020002934629243292992928129306
640042927821902500230007000045932886902024072200020002000100005016255286412928331020004000200029141291601161001100010000200004200400002002400600130459246696031071348203463086381617484428512163961361515403200020002928829216292792924629277
640042925521901900150007000046122905702224096200020002000100000016230285562926031020004000200029074291501161001100010000200006200001022000402400128489815686331001448203813081381111474628401162681362114868200020002927729336292772931129391
6400429261220017002200070000463828937200241132000200020001000030162422854529247310200040002000291062911811610011000100002000042004000220004046001283992596886305064020284305338177414828337161821354915419200020002932229287293022932229331
6400429515220023002200080100468428872022240512000200020001000050162472862529343310200040002000291122910711610011000100002000042002000020004026001304292156925310874520259302738148484928304162501364615341200020002926829247293252930529253
6400429269218023001800032010045982889320224043200020002002100140016251285402918031020004000200029047291721161001100010000200006200200022004403600130399411690931441038202873088381413484728719163251380215617200020002927029326292732920329270
640042923621802300190008000046232881220224080200020002000100004016234285772940631020004000200029213291521161001100010000200004200400022004402600130169221688431091038202423072381014424828416153951363815532200020002922229335293672938729280
64004292132190200017000800004640288262212407720002000200010000501625228679292273102000400020002916829043116100110001000020000420020000200240460013095929768733118848202873065381216484528392162521380315557200020002932429321293112940929250
6400429318219020001701180000461428796022241122000200020001000050162382858129437310200040002000291272899951610011000100012000042004000020024004001366693346877305374820359309438166494428454164651369415596200020002928829371292832932629301

Test 2: throughput

Count: 8

Code:

  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067599000110380100800262121228160112100160012100160312500800929180083800418004206121601162003200322001600168004180042118020110099100100800008000001001600043501600360351600046132011151160160080039110021600001600001008004280043800428004380042
320204800416000000003800028002720122816011210016001210016001650080092918002280041800410612160116200320032200160016800418004111802011009910010080000800000100160004350160036036160036610401115116016008003800021600001600001008004280043800428038780042
3202048004159900000000000800260121228160112100160012100160016500800076180022800418004106121601162003200322001600168004180042118020110099100100800008000001001600043501600040016003661324211151160160080177014021600001600001008004280043800428038680042
3202048004159900001038000280026212122716011210016001210016001650080063318002280041800410231216011620032003220016001680042800411180201100991001008000080000010016000435016003650160040603240111511601600800380141401600001600001008004280042800428004280042
320204800416000001004201008002720027160112100160012100160016500800929180096800418004144612160116200320032200160016800418004111802011009910010080000800000100160004350160036032160004613640111511601600800380141421600001600001008004280042800428004280043
32020480041600100100420100800262121227160112100160012100160016500800453080022800418004106121601162003200322001600168004180041118020110099100100800008000001001600043501600364321600406000111511601600800381101001600001600001008004280043800428004280042
3202048004160000001042000280025012122516010010016000010016000050080037718001580040800400322160100200320000200160000800408004011802011009910010080000800000100160000350160000032160032610000051091171180037101001600001600001008004180041800418004180041
3202048004059900000038000280025212025160100100160000100160000500800000080015800408004003221601002003200002001600008004080040118020110099100100800008000001001600003501600000316003260364000051091171180122014021600001600001008004180041800418004180041
3202048004060000000042000280025012152516010010016000010016000050080000008001580040800400322160100200320000200160000800408004011802011009910010080000800000100160000350160032039160036613240000510911711800371141421600001600001008004180041800418004180041
320204800406000000003801028002520122516010010016000010016000050080000018001580040800400318016010020032000020016000080040800401180201100991001008000080000010016000035016003200160000613640000510911711800371101021600001600001008004180041800418012780041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320025800425990001101008002525532516001010160000101600005080138308001580040800403221600102032000020160000800408004011800211091010800008000011016001314016001300151160000615143130050199170758003713135160000160000108004180041800418004180041
32002480040599111005800080025255325160010101600001016000050801383080015800408004032216001020320000201600008004080040118002110910108000080000010160013144316001200151160000015001310501951707580037005160000160000108004180041800418004180041
32002480040600110105810280025250325160010101600001016000050800048180015800408004032216001020320000201600008004080040118002110910108000080000010160014140160051000121600390113431300501951707580100005160000160000108004180103800418004180041
320024800406001110057102800252553251600101016000010160000508013831800158004080040322160010203200002016000080040800401180021109101080000800001101600141243160053001121600396151013005019517057800370135160000160000108004180041800418004180041
3200248004059910100130028002525532516001010160000101600005080138318001580040800403221600102032000020160000800408004011800211091010800008000001016001414016005400052160000605243130050198170578003713135160000160000108004180041800418004180041
3200248004060011000590008002505532516001010160000101600005080138608001580040800403221600102032000020160000800408004011800211091010800008000011016001515016005200113160039011243130050196170758003713135160000160000108004180041800418004180041
3200248004060011100590008002505002516001010160000101600005080138008001580040800403221600102032000020160000800408004011800211091010800008000001016001514431600130001316004061524312205019517057800370130160000160000108004180041800418004180041
320024800405991110057000800250553251600101016000010160000508000420800158004080040322160010203200002016000080040800401180021109101080000800000101600141243160052000191600406013431200501951707580037005160000160000108004180041800418004180041
32002480040599111106600280025205325160010101600001016000050801383080015800408004032216001020320000201600008004080040118002110910108000080000010160012134316005100113160039601343121050195170578003713130160000160000108004180041800418004180041
32002480040599110115700080025250325160010101600001016000050801377080015800408004032216001020320000201600008004080040118002110910108000080000010160014124316005300113160039005243120050197170578003713135160000160000108004180041800418004180041