Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 4 regs, 16B)

Test 1: uops

Code:

  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.000

Integer unit issues: 0.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
6400529462220141100001200455229115000233784000400040002070561695728760293123104000400040002914229324116100110001000040000124007008400051812001265590176841315558420289309238132063592847716584134361526340002935529415294932938829437
6400429433220400100000045752913324023346400040004000207088169172876329506310400040004000291612926211610011000100004000004007001040000180001283890116814312626320423303138101560582841016348134301534440002941429381293462942629394
640042935621940010001310460329137000233844000400040002071361695728708293903104000400040002912929221116100110001000040000124006022400661612001280989346846312036220449311838121760592841716563133741521740002930729383293932942529356
64004294032206000000010447029315001233804000400040002071981694028676293811248400040004000292522922811610011000100004000012400700840076178401297491086861310326820395314838121060662840916315133801487340002941229383294022941729417
6400429276220600100014004562294890102341540004000400020714716936287142940131040004000400029313292062161001100010001400441240070247340005170901276490806834309937020446307838131765592848216363135971504640002946429541294452936829482
640042950222070010001200453929126000233894000400040002085961693828705294473104000400440002933429346316100110001000040000124002000400801812001274691736843311016420470307138071366612843416516134051502740002940529355293182937929396
64004294212324110010000466829372001236024000400040002070331693428884295173104000400040002920229188116100110001000040000124008109400760712001273390446880307666320222307138151559592846015852133151521540002933729338293802933729383
640042944122140000001900458629129200233634000400040002071831693328759294493104000400040002923329149116100110001000040000124007007400261312001277890596828306716920380308838081862642846916555133121527740002944229373294362945029316
6400429360220500000000044862910704023377400040004000207095169162865229341310400040004000292072920211610011000100004000012400600640066108001287291176847303337220347307338101460602836416342134591514240002943929410294642948229349
64004293802203100000000451829106030234184000400040002071031694628715293203104000400040002919829253116100110001000040000124007007400660012001280291846880303736420399306138141364612872416537134481519240002946529460293482926029511

Test 2: throughput

Count: 8

Code:

  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
32020510673180001000000010011067192121216253201001003200001003200175004673844010668201067271067072667762670032011720032002420032002410672710670711802011009910010080000800000100320000043032003900393200396039000005109011711106724010104320000100106728106728106728106728106938
3202041067277990001000550000110671221200253201001003200001003200005004709628010670201067271067272667532670932010020032000020032076810674510672711802011009910010080000800000100320000043032003900393200396139430000510901171110672401004320000100106708106728106728106708106708
320204106727799000000045000011067122120162532010010032000010032000050047281830106702010672710672726675326712320100200320000200320000106727106727118020110099100100800008000001003200000430320039004232003901394300005109011711106724010100320000100106728106728106708106728106728
3202041067278000000000540100110671220120253201001003200001003200005004709628010670401067271067272667532670932010020032000020032000010672710672711802011009910010080000800000100320000043032003900032003961404300005109021711106724010104320000100106728106708106708106728106728
32020410670779900000004501001106712212121625320100100320000100320000500470962801067020106707106727266753266893201002003200002003200001067271067271180201100991001008000080000010032000000032003910403200396139430000510901171110672800104320000100106731106728106728106728106728
320204106727800000000045000001067122120162532010010032000010032000050047096280106702010672710672726675326709320100200320000200320000106729106727118020110099100100800008000001003200000430320000003932000060394300005109011711106704010104320000100106728106728106728106708106728
320204106707799000000045010011066922121202532010010032000010032000050047281830106702010672710672726675326711320100200320000200320768106804106742118020110099100100800008000001003200000430320039003932003901394300005109011711106704010104320000100106728106728106728106728106728
3202041067077990000000450000110671201212162532010010032000010032000050046988801106702010672710672726655326689320100200320000200320000106727106727118020110099100100800008000011003200000430320039003932000061394300005110011711106724010100320000100106728106728106728106728106728
32020410672780000000004501001106712212121625320100100320000100320000500470962811066850106730106710266553267093201002003200002003200001067271067271180201100991001008000080000010032000004303200390003200006139000005109011711106724010100320000100106728106728106728106728106708
320204106727799000110045010011067122121216253201001003200001003200005004673754110670201067071067272667532670932010020032000020032000010670710672711802011009910010080000800000100320000043032003900393200000139000105110011711106724010104320000100106708106728106708106731106728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3200251067238300000411110669220181225320010103200001032000050468510711069031067221067222667032668932001020320000203200001067271067071180021109101080000800000103200000032003500035320039613539000502014170121410671960232000010106727106728106734106723106725
32002410670779900004101106692318181225320010103200001032000050468510711066971067221067222667032670432001020320000203200001067221067221180021109101080000800000103200000393200350003532003561039000502012170141510670466232000010106723106723106723106726106708
32002410672279900004111106707218181225320010103200001032000050468336011066821068031067412667032670432001020320000203200001067071067221180021109101080000800000103200000393200350009232003561353900050199170101410671966232000010106725106723106716106728106762
320024106725799000041001067070181812253200101032000010320000504698880110669710672210672226670326704320010203200002032000010672210672211800211091010800008000001032000003932003500035320035613539000502014170141410671966232000010106723106723106708106723106723
32002410672280000004111106692218181225320010103200001032000050470583711066971067221067222667032670432001020320000203200001067221067301180021109101080000800000103200000393200350003832003661039000501916170161310670406232000010106723106723106811106708106708
32002410670779900004111106692218181225320010103200001032000050468510711066971067221067072667032668932001020320000203200001067221067111180021109101080000800000103200000393200350000320036613539000502014170141510685666232000010106723106708106723106723106708
32002410672279900000101067102018122532001010320000103200005047058371106820106722106722266703266893200102032000020320000106722106722118002110910108000080000010320000039320000000032003561039000502012170131310671966232000010106723106723106723106727106723
32002410672680000001211106707218185483201401032013010320890504668429110669710674910672626674326708320010203200002032000010672210672211800211091010800008000001032000003932003601041320035613539000502012170151210671966232000010106723106723106727106727106708
32002410672679900004111106707218181425320010103200001032000050472291311070111067071067222667032670432001020320000203200001067221067221180021109101080000800000103200000393200350003532000061350000502014170121210671966232000010106723106708106723106723106708
32002410672279900004111106707218181125320010103200001032000050470299211066971067221067072667032668932001020320000203200001067071067221180021109101080000800000103200000393200000003532000061350001502012170151110671960232000010106723106723106708106740106723