Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 1D)

Test 1: uops

Code:

  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
640052926821902500170001701004544288750122415420002000200010000601625128594293063102000400020002909629046116100110001000020000620020000200040460012952929968483051849202893267381512514728346164601359015538200020002929329223293252928929275
640042924621901700140007010046572881222224096200020002000100005016255288432930431020004000200029119291781161001100010000200006200000042000400600138299387691530841241202863082381810454428333163861359015467200020002934629243292992928129306
640042927821902500230007000045932886902024072200020002000100005016255286412928331020004000200029141291601161001100010000200004200400002002400600130459246696031071348203463086381617484428512163961361515403200020002928829216292792924629277
640042925521901900150007000046122905702224096200020002000100000016230285562926031020004000200029074291501161001100010000200006200001022000402400128489815686331001448203813081381111474628401162681362114868200020002927729336292772931129391
6400429261220017002200070000463828937200241132000200020001000030162422854529247310200040002000291062911811610011000100002000042004000220004046001283992596886305064020284305338177414828337161821354915419200020002932229287293022932229331
6400429515220023002200080100468428872022240512000200020001000050162472862529343310200040002000291122910711610011000100002000042002000020004026001304292156925310874520259302738148484928304162501364615341200020002926829247293252930529253
6400429269218023001800032010045982889320224043200020002002100140016251285402918031020004000200029047291721161001100010000200006200200022004403600130399411690931441038202873088381413484728719163251380215617200020002927029326292732920329270
640042923621802300190008000046232881220224080200020002000100004016234285772940631020004000200029213291521161001100010000200004200400022004402600130169221688431091038202423072381014424828416153951363815532200020002922229335293672938729280
64004292132190200017000800004640288262212407720002000200010000501625228679292273102000400020002916829043116100110001000020000420020000200240460013095929768733118848202873065381216484528392162521380315557200020002932429321293112940929250
6400429318219020001701180000461428796022241122000200020001000050162382858129437310200040002000291272899951610011000100012000042004000020024004001366693346877305374820359309438166494428454164651369415596200020002928829371292832932629301

Test 2: throughput

Count: 8

Code:

  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020580067599000110380100800262121228160112100160012100160312500800929180083800418004206121601162003200322001600168004180042118020110099100100800008000001001600043501600360351600046132011151160160080039110021600001600001008004280043800428004380042
320204800416000000003800028002720122816011210016001210016001650080092918002280041800410612160116200320032200160016800418004111802011009910010080000800000100160004350160036036160036610401115116016008003800021600001600001008004280043800428038780042
3202048004159900000000000800260121228160112100160012100160016500800076180022800418004106121601162003200322001600168004180042118020110099100100800008000001001600043501600040016003661324211151160160080177014021600001600001008004280043800428038680042
3202048004159900001038000280026212122716011210016001210016001650080063318002280041800410231216011620032003220016001680042800411180201100991001008000080000010016000435016003650160040603240111511601600800380141401600001600001008004280042800428004280042
320204800416000001004201008002720027160112100160012100160016500800929180096800418004144612160116200320032200160016800418004111802011009910010080000800000100160004350160036032160004613640111511601600800380141421600001600001008004280042800428004280043
32020480041600100100420100800262121227160112100160012100160016500800453080022800418004106121601162003200322001600168004180041118020110099100100800008000001001600043501600364321600406000111511601600800381101001600001600001008004280043800428004280042
3202048004160000001042000280025012122516010010016000010016000050080037718001580040800400322160100200320000200160000800408004011802011009910010080000800000100160000350160000032160032610000051091171180037101001600001600001008004180041800418004180041
3202048004059900000038000280025212025160100100160000100160000500800000080015800408004003221601002003200002001600008004080040118020110099100100800008000001001600003501600000316003260364000051091171180122014021600001600001008004180041800418004180041
3202048004060000000042000280025012152516010010016000010016000050080000008001580040800400322160100200320000200160000800408004011802011009910010080000800000100160000350160032039160036613240000510911711800371141421600001600001008004180041800418004180041
320204800406000000003801028002520122516010010016000010016000050080000018001580040800400318016010020032000020016000080040800401180201100991001008000080000010016000035016003200160000613640000510911711800371101021600001600001008004180041800418012780041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
320025800425990001101008002525532516001010160000101600005080138308001580040800403221600102032000020160000800408004011800211091010800008000011016001314016001300151160000615143130050199170758003713135160000160000108004180041800418004180041
32002480040599111005800080025255325160010101600001016000050801383080015800408004032216001020320000201600008004080040118002110910108000080000010160013144316001200151160000015001310501951707580037005160000160000108004180041800418004180041
32002480040600110105810280025250325160010101600001016000050800048180015800408004032216001020320000201600008004080040118002110910108000080000010160014140160051000121600390113431300501951707580100005160000160000108004180103800418004180041
320024800406001110057102800252553251600101016000010160000508013831800158004080040322160010203200002016000080040800401180021109101080000800001101600141243160053001121600396151013005019517057800370135160000160000108004180041800418004180041
3200248004059910100130028002525532516001010160000101600005080138318001580040800403221600102032000020160000800408004011800211091010800008000001016001414016005400052160000605243130050198170578003713135160000160000108004180041800418004180041
3200248004060011000590008002505532516001010160000101600005080138608001580040800403221600102032000020160000800408004011800211091010800008000011016001515016005200113160039011243130050196170758003713135160000160000108004180041800418004180041
3200248004060011100590008002505002516001010160000101600005080138008001580040800403221600102032000020160000800408004011800211091010800008000001016001514431600130001316004061524312205019517057800370130160000160000108004180041800418004180041
320024800405991110057000800250553251600101016000010160000508000420800158004080040322160010203200002016000080040800401180021109101080000800000101600141243160052000191600406013431200501951707580037005160000160000108004180041800418004180041
32002480040599111106600280025205325160010101600001016000050801383080015800408004032216001020320000201600008004080040118002110910108000080000010160012134316005100113160039601343121050195170578003713130160000160000108004180041800418004180041
32002480040599110115700080025250325160010101600001016000050801377080015800408004032216001020320000201600008004080040118002110910108000080000010160014124316005300113160039005243120050197170578003713135160000160000108004180041800418004180041