Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 3 regs, 8H)

Test 1: uops

Code:

  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
630052876522202003000010047632854800225953000300030001589901705628191286473103000300060002853328625116100110001000300006300000300306001320694287055306813520101329438191437342822415487121191433730002871928663286502873328688
63004287352220200200001004702285340022662300030003000159063170502824128719310300030006000285522865111610011000100030000030000465300006001346194877038312603919969326238181636412818515242124291409730002869228598287742864528649
630042865422301003000010046772854500226703000300030001590621704928081286863103000300060002869628620116100110001000300006300000300306001308395287025315604119961319338151639382816615302125011400030002868028748286762874428769
63004287792220200200001004724286370022662300030003000159003170332808728696310300330006000286542859211610011000100030002630000030000602561322395826966322403720038322038191935322816915262125501394830002869528661288062887228682
63004286072230000200016210047192862000226293000300030001590241704128131287653103000300060002859128713116100110001000300000300000300006001335295836931319414220081322138181540382820515481123111408530002869928733285722866928752
630042878422303012001001048292857210225923000300030001590431705828121287533103000300060002866928615116100110001000300020300000300006001326993527042320714020185320738171639422822615568125831416130002880428628287462887128814
630042870122302101001001047232863400226893000300030001590121701928219286983103000300060002868228675116100110001000300020300000300006201348394927039316404220002322038181838412813515601126281395830002869528686287252869128628
630042868422301003010011047852848200226213000300030001591271704728117288003103000300060062858328645116100110001000300006300000300006001344793936973315314520009329038121436342817715709126141396430002869628629287162868828712
630042872422201000000010047202853500226783000300030001590421703628096286973103000300060002853628668116100110001000300000300000300006001324495037041313513520040334938202035352814615623124041386930002876228800287422865628652
6300428798222010010009100469828617002260730003000300015908417069281712876331030003000600028598286301161001100010003000063000047830000600132909621695931831382009433093820935332816015389126241420130002872228726286642865228761

Test 2: throughput

Count: 8

Code:

  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  st1 { v0.8h, v1.8h, v2.8h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 3f | 46 | 49 | 4f | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240205120063930101004830120027161622252401001002400001052400005005519448012013701200431200488995939001124010020024000020048000012004212004231802011009910010080000800001002400000421260240002205240002042000511000116211200390240000100120043120043120044120045120043
240204120042930000001230120122016730252401001002400001002400005005519448012001701200421200428995539001224010020024000020048024012004212004211802011009910010080000800001002400002420024000010524000220000511000116111200390240000100120043120043120041120047120043
240204120184930000000301200251616002524010010024000010024000050055194480120017012004212004089956390000240100200240000200480000120042120042118020110099100100800008000010024000004200240002000240002242000511000116111200390240000100120043120043120052120045120044
2402041200429300000024301200271616002524010010024000010024000050055243320120018012004212004389957390003240100200240000200480000120042120207218020110099100100800008000010024000004200240002002240002242000511000116111200390240000100120043120047120053120043120044
2402041200429300000012301200281616102524010012024000010024000050055194720120017012004012004289955389998240100200240000200480000120042120042318020110099100100800008000010024000004200240002102240002242000511000125111200390240000100120043120183120054120041120043
2402041200429310000003012002801610252401001002400001002401085005519352012001701200421200428995539000124020820024000020048000012004212004011802011009910010080000800001002400000420024000200024000020000511000116111200390240000100120043120047120046120043120041
240204120040931000000001200271616102524018510024000010024000050055194720120017312004012004289953390001240100200240000200480000120042120184118020110099100100800008000010024000004200240002000240002042000511000116111200390240000100120185120043120047120043120043
2402041200429310000012301200271616102524010010024000010024000050055194480120017012004012004289955390000240100200240000200480000120042120043118020110099100100800008000010024000004200240062002240002242000511000116111200395240000100120043120050120047120184120044
2402041200429300000003012002816167702524018010024000010024000050055194480120017012004212004290304390000240317200240000200480000120042120043118020110099100100800008000010024000004200240002102240002242000511000116211200390240000100120043120043120041120051120043
240204120054931000000301200281600025240100100240000100240000500551947201200180120042120043900463900002401002002400002004802401200421200431180201100991001008000080000100240000242002400020062400002420005110002161112003723240000100120182120326120185120470120182

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240025120042931000000030001200271616125240010102400001024000050551944801200171200421200428997739002324001020240000204800001200421200431180021109101080000800001024000004200240002002240002242005020041638120039024000010120043120043120043120043120043
240024120040930000000930001200391616125240010102400001024000050551944801200181200421200428998939002224001020240000204800001200421200421180021109101080000800001024000004200240002002240002242005020051665120039024000010120043120043120043120043120043
240024120042931000000030001200271616125240010102400001024000050551944801200171200421200428997739002224001020240000204800001200421200421180021109101080000800001024000004400240002002240002042005020051654120039024000010120043120044120041120043120044
240024120042930000000001001200281616025240010102400001024000050551944801200211200421200438997739002224001020240000204800001200431200421180021109101080000800001024000004200240002002240002242005020041655120039024000010120044120043120043120044120043
240024120043930000000030001200271616025240010102400001024000050551947201200171200421200428997739002224001020240000204800001200441200421180021109101080000800001024000004200240002202240002242005020041633120040024000010120041120041120041120043120043
240024120043931000000030001200271616125240010102400001024000050551947201200211200431200428997839002324001020240000204800001200431200421180021109101080000800001024000004200240002002240002242005020051644120040024000010120044120044120043120043120044
240024120043930000000031001200271616125240010102400001024000050551944801200171200421200428997739002224001020240000204800001200421200421180021109101080000800001024000004200240002002240002242005020031634120040024000010120044120043120043120044120043
240024120043931000000030001200271616125240010102400001024000050552000001200171200421200428997839002224001020240000204800001200431200422180021109101080000800001024000004200240002002240002242005020021634120039024000010120043120043120045120043120600
240024120043964001000030001204230161842400101024000010240000505520000012001812004212004289977289018224001020240000204800001200431200421180021109101080000800001024000004200240002002240002042005020041644120051024000010120043120043120043120055120043
2400241200429300000003031001200271616125240010102400001024000050551944801216731204831200448997939002224001020240000204800001200421200431180021109101080000800001024006004200240062102240722242005020031643120039024000010120043120043120044120122120043