Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 2D)

Test 1: uops

Code:

  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6300529446221252500331469728849000240363000300030001500001626228702292523103000300030002908029166116100110001000030009300101300451490132459125696431634772062432253825855522839815362137031551830002937029263292742919029371
6300429402219161501614558289490032417130003000300015001316184287352926131030003000300029166291231161001100010000300063001013001514901281194796858303555520739333038191753552838816239136981569830002929029373292512929929306
6300429302220151700904557289480002400730003000300015000016173285902921331030003000300029090291191161001100010000300063001013004504601276595146827303564820602311838211655602831216398135491519330002925929274293332926429308
6300429238219221900614773289360002408630003000300015000016240285652928931030003000300029143290961161001100010000300033004013001511601294190286805307384520545312238221953512833116360137131587030002928729219292772919629254
63004292172191519009146032893000024044300030003000150010161762866229292310300030003000291672919311610011000100003000630010130055116012778908968073157115320698321238231054542875816458136241550330002931429275294122936829290
63004293202192124009150022885500024169300030003000150000162672865729170310300030003000291262909911610011000100003000630010430045119012872907568192988105620687306038231153492840316384136171573630002929329398294502931929443
6300429342219182300104878288740002406330003000300015003116232288492936531030003000300029191292431161001100010000300063003013002514901287994056791305675520588306538281457472840216195136661545530002929529219293142916629416
6300429458220211600904555288570002415230003000300015002316156286232923331030003000300029220291941161001100010000300093003043003511601311791046830315475320599305338231554552841916467137481542930002927029352292342929529349
6300429328220182000714557288490022412230003000300015000016277287052929531030003000300029214291021161001100010000300063001013001501901286691006816302685020706304138201453542835116482135341548430002920629259292382933029257
6300429451219212600104664288930002419730003000300015000016153286762932931030003000300029081292381161001100010000300093005013001501601286892326839308285220613313738151156572843016396136421553030002925729299292752922429394

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  ld1 { v0.2d, v1.2d, v2.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0009

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240205800715991110010680102800553771925240100100240000100240000500350436108015008004880048499921550005240100200240000200240000800618006111802011009901001008000080000010024000004302400400003924003951043000511021622800620101072400001008006680066800418007280066
2402048006460000000005300018005021020252401001002400001002400005003537804080042080065800654998835001924010020024000020024000080040800631180201100990100100800008000001002400000002400000003824003860044000511021622800620141072400001008006680066800668006280041
240204800655990000000440000800252012025240100100240000100240000500349543618003708004080040499883499982401002002400002002400008006480040118020110099010010080000800000100240000043024000001038240038503944000511021622800632101002400001008006680062800668004180062
240204800645990000000000018004601120252401001002400001002400005003519717080061080070800704997135002824010020024000020024021680069800481180201100990100100800008000001002400201943024001910121240000015943190051102162280067001302400001008007080070800718007080049
24020480070600110000067000080055207202524010010024000010024000050035034290800530800488004849970350028240100200240000200240000800698006921802011009901001008000080000010024002020430240059010632400410159431900511021622800660131352400001008007080050800718004980070
24020480069600101000021010380055300202524010010024000010024000050034862891801760800508004849992350006240100200240000200240000800478007011802011009901001008000080000010024002119430240060001622400410160431920511021622800440131352400001008004980071800708004980049
2402048006960010010002100128005500720252401001002400001002400005003486289180059080069800484997035002824010020024000020024000080069800481180201100990100100800008000001002400192043024001900161240041515943191051102162280067001352400001008007180070800498007180070
2402048004859910110006700038005420720252401001002400001002400005003501095180058080069800694999235002724010020024000020024000080048800701180201100990100100800008000011002400201943024001900162240040511943191151102162280045001352400001008004980049800718007180079
2402048004859911110006600038005407019252401001002400001002400005003503429180057080069800694997035000624010020024000020024000080070800701180201100990100100800008000001002400191900240059100622400005160431800511021622800660131352400001008007080070800718007180049
240204800696001100010670000800333971925240100100240000100240000500352095318005708004880048499923500272401002002400002002400008006980069118020110099010010080000800000100240021190024006010262240000506043190051102162280067001352400001008004880070800718007080070

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire uop (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24002580398599000001008006921212195124001010240000102400005035384940800360800688007150119350021240010202400002024000080061800411180021109101080000800000102400004302400401062240040514043502000516468005801404240000108004280066800628006280061
2400248007060000004500080026212002524001010240000102400005035286090800360800618006150006350021240010202400002024000080061800611180021109101080000800001102400004302400400039240040514005020006162480058010104240000108004280065800428006280042
2400248006159900004510080046201216252400101024000010240000503523026080016080061800615000635004024001020240192202400008004180061118002110910108000080000010240000002400001040240039510435020005165280038010104240000108006280042800428006680066
2400248006860001005410180046212122025240010102400001024000050352178508003608006180065500063500412400102024000020240000800618006111800221091010800008000001024000043024000000402400395140435020002162480038014104240000108006580062800428004280042
240024803626000000460018002620116252400101024000010240000503522117080036080065800655001035004124001020240000202400008006180041118002110910108000080000010240000430240040004024003951400502000216468005801004240000108007380064800428006280042
24002480347601000045101800552121202524001010240000102400005035102270800360800618006150006350041240010202400002024000080061800411180021109101080000800000102400004302400400040240039510445020004164380057010104240000108006680042800628006280042
2400248006160000004510080046211025240010102400001024000050352238008004038006580065499863500212400102024000020240000800618006111800211091010800008000001024000043024000000402400400140435020002163480058010100240000108006280042800428006280062
2400248035860000000001800462121202524001010240000112400005035112630800160800658004150006350041240010202400002024000080061800611180021109101080000800000102400004302400390040240040614005020004165580057010100240000108006180062800628006280066
2400248037160111004500180046201216252400101024000010240000503490485080036080061800614998635004124001020240000202400008006180061118002110910108000080000010240000002400000028624004050005036006166480057010104240000108004280042800428004280062
2400248035660100010000800532121216252400101024000010240000503531036080016080061800654998635004524001020240000202400008004180041118002110910108000080000010240000431312400390040240039014043502000416428005800104240000108006280062800628006680063