Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple, post-index, 16B)

Test 1: uops

Code:

  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 13.022

Integer unit issues: 1.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.026

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
72005292722191131111109004714291280415285130321000802840001000800040005000203099815910024773292522934731013000400080005000200002924429201116100110001000040054040070044006616042128879106700030891511928430773812747482852810001621613042133244000800010002920629255292862922329350
720042927521901010100015004630290440415326130281000802240001000800040005000203409837012002473929199291613101300040008000500020000291642927311610011000100004007612401102940020170401279590736916309604819246299738141142422851810001646613081132754000800010002918529231293462930029299
7200429273220010101000120046002904400153061302610008008400010008000400050002030998505100024805291392932231013000400080005000200002924729320116100110001000140044840080164002501012421281192046849307603919184308738111540442852310001635213004133084000800010002924429253292712930629245
72004292012190101010001600461329060001532513022100080224000100080004000500020340985341002479829136292713101300040008000500020000292732926111610011000100014006684011004400561612411284591226851308913919244308538171148442847610001631612969134284000800010002926029335292832930429219
7200429264219010101000400457929009001530213022100080264000100080004000500020309985251100247712909329110310130004000800050002000029267292121161001100010000400668400712440036090421285391146883306004119283306838061241352848910001635213117132054000800010002923529294293312924329314
72004293252190101110001200453529069001527913022100080224000100080004000500020309983172002480529216293073101300040008000500020000292582927411610011000100004005504010026400061411421316793106848306504619274303438101042452848110031612713083131084000800010002931829359293322927929181
72004293552200101100004004668290530015311130221000802640001000800040005000203019809100024767292252934131013000400080005000200002916529202116100110001000040066124012026400201411411294594026848306204019253305338091542412846110001630613035133234000800010002921229319293422923929279
72004292422190101010001200459629131001529313022100080264000100080004000500020326984720002476429086292743101300040008000500020000292462917411610011000100004005411400800440066148421295191186858313604919235308738121342382849110001645513086133854000800010002933429315292462930629204
72004292802190101010009004576290670015363130281000802840001000800040005000203159841660024743292042932331013000400080005000200002922129226116100110001000040066040110194000611011411300592046838311904119317306338141342482862410001631012965135184000800010002933029285293632939129223
7200429323219010001000130045602912300152691302210008022400010008000400050002032098516300248072918829350310130004000800050002000029163291361161001100010001400668400912640060060421274190426858305504019342314738131743462850710001632613136133504000800010002926329345293392925429262

Test 2: throughput

Count: 8

Code:

  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
960205160072119911000000059000216005426600251040160801006400603200008010064000032000047960116812432176075200160024016004516005900326104010020032081264231220040350516093041622831624441618020110099100100800008000011003200000350320032010583200376132350000510911611160040800001010032000064000080100160060160044160044160060160065
96020416006411990000000006700001600283121200251040164801006400643200008010064000032000047960116804081856001210160024016004316006600326104010020032000064000020040000016000001600431600591180201100991001008000080000010032000003503200000000320000613240000051091161116005680000010032000064000080100160060160067160065160065160065
9602041600431199000000000430100160054266602510401688010064006832000080100640000320000479601168124411520028101600300160069160049303541040100200320000640000200400000160000016007116006911802011009910010080000800000100320013144403200531015332017260534413100510911610160051800001410032000064000080100160060160060160049160044160065
96020416004311980000000003800021600490000025104016480100640064320000801006400003200004796011680408816000010160024016006416006400347104010020032000064000020040000016000001600431600431180201100991001008000080000010032000000032000000037320037600400000510911611160041800001410032000064000080100160044160067160065160044160125
9602041600591199000000000000001600490121200251040156801006400643200008010064000032000047960116804081856001210160024016006416006400326104010020032000064000020040000016000001600641600621180201100991001008000080000010032000003503200370003732003700000000510911611160056800001010032000064000080100160065160044160065160044160065
960204160064119900000000000002160044312000251040100801006400003200008010064000032000047960116800011920001610160045016006416004300331104010020032000064000020040000016000001600591600601180201100991001008000080000010032000003503200320003320037003200000510911611160056800001410032000064000080100160065160067160065160065160065
960204160064119800000000000102160028312121025104010080100640064320000801006400003200004796011680881816000010160024016004816004300326104010020032000064000020040000016000001600641600591180201100991001008000080000010032000000032003700003200006032400000510911611160061800001410032000064000080100160224160065160067160060160044
960204160043119900000000043000016004931200025104010080100640260320000801006400003200004796011680932816000010160024016006416004300347104010020032000064000020040000016000001600431600431180201100991001008000080000010032000000032000000037320000603235000051091161116005680000010032000064000080100160044160044160067160044160060
96020416004311990000011004301021600443121200251040100801006400563200008010064000032000047960116804151856001200160024016004316006600342104010020032000064000020040000016000001600641600431180201100991001008000080000010032000003503200370003232000061360000051091161116004380000010032000064000080100160060160044160065160065160067
960204160064119900000000000102160044312120025104016480100640056320000801006400003200004796011680001185600161016004501600591600590034210401002003200006400002004000001600000160064160043118020110099100100800008000001003200000003200000003532000061374000005109116111600638000000032000064000080100160062160065160065160065160065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
96002516006811990001800101160053128194025104006680010640024320000800106400003200004720411681228217606080116004901600511600682351104001020320000640000204000001600000160068160068118002110910108000080000010320000553200500503200506150430050190181630061716006580000010132000064000080010160069160052160069160051160052
9600241600511199000056010016005318819012510400348001064002432000080010640000320000472041168122521760608011600313160068160068035310400102032000064000020400000160000016006816006811800211091010800008000001032000055320050050320018611700050190171620081716006580000010032000064000080010160052160071160052160051160069
96002416006811990100590101160053170190025104007080010640056320000800106400003200004720411681225217606080116004901600681600680351104001020320000640000204000001600000160050160068118002110910108000080000010320000032001801732001760504400501901616200717160065800001010132000064000080010160069160069160416160226160069
9600241600681199000017010016005314004025104006680010640056320000800106400003200004720411681221217606080116004901600681600512334104001020320000640000204000001600000160068160068118002110910108000080000010320000032001805032018760504400501901716100516160065800001010132000064000080010160069160051160070160069160051
9600241600681199000056000116005413000025104006680010640056320000800106400003200004720411681228217606040116005101600681600682353104001020320000640000204000001600000160050160068118002110910108000080000010320000543200500173200500150440050190171620017816004880000010132000064000080010160069160069160052160069160052
960024160068119900001800011600531380102510400668001064005632000080010640000320000472041168009921760608011600490160145160051235110400102032000064000020400000160000016005016006811800211091010800008000001032000003200180533200506150000501901716200171516006780000100132000064000080010160069160069160052160051165158
960024160068119900001700001600531280012510400348001064006032000080010640000320000472041168122512800028011600490160068160050035110400102032000064000020400000160000016006816005311800211091010800008000001032000054320050218320050001700050190171620014616006580000010132000064000080010160052160069160052160051160069
9600241600681199000068010116005313800025104006680010640024320000800106400003200004720411681194217606080116004901600681600680613410400102032000064000020400000160000016006816063451800211091010800008000001032000055320018150320050611743005019017162001717160065800001010032000064000080010160052160051160070160069160051
9600241600681199001056010016005313001125104003480010640024320000800106400003200004720411681257134399960116004901600681600681343341040010203200006400002040000016000001600681600681180021109101080000800000103200000320050050320050601744005019017164101616160065800001010032000064000080010160052160051160052160069160069
960024160068119900001700001600361781901251040066800106400243200008001064000032000047204116812281280002801160049016006816006803341040010203200006400002040000016000001600681600681180021109101080000800001103200005432014111832005001504300501901716300616160065800001010032000064000080010160069160069160052160069160069