Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 3 regs, 16B)

Test 1: uops

Code:

  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
630052943323741100200061000466329452332352430000300330061601571712002888329751127130033003601229624296743161001100010003003292300406870300306001491324292966865314905221074339238181547492895816446132961481030002964429713296223022729596
63004296622380011101007559900046923001733233663000177683000300015898617048628803294523103000300060002932429340116100110001000300536230031063000393101302893886977315514920689341938113856532959316191136151481330002939129407293932943729419
630042929622700100111011710004682291260023169300003000300015904717044028573293493103000300060002923129179116100110001000300006130000003003000001333094366946321504720654330238171150492852815902132421465730002942429361293982926929249
63004294282270110100000890004607292670023222300003000300015905017050028530293713103000300060002918329257116100110001000300006030000033003060001310593146924310104820624329938141047502860316056129971480430002939629315293642925629251
63004294162270000011004261000474429136002310930000300030001590531704702867029364310300030006000292492916111610011000100030002903000000300009000130779456692331310462065232243814346462850216086132531498530002935629292294862942529331
630042941122700000000135110004689290503023297300003000300315900317044028615292483103000300060002918729164116100110001000300026030000033000090001318294746921320805120608326138191143492863916174131261471230002935029284293212932829225
63004293962270000000009911004564291123323264300003000300015903117064028547293413103000300060002915129214116100110001000300006030010003000090001339792946938312005020639339338141446522859916021130791458330002934429372293592938429488
630042919822700100200036301004718291560023239300003000300015925517054028645294656103000300060002928529259116100110001000300009030000003000060001337595176984312204920788337238131045442853516482131061467830002942429332294202936229317
630042931522700000101036310004709290340323239300003000300015902171702502818028745310300130006000285892873711610011000100030000623000000300028020129439139691031602552012532523815852482822415796127091431430002885028784286132884328827
630042886722300400200032410004651288442023135300003000300015897017037028042286023103000300060002884328859116100110001000300008030000023002080001347393017035324315519779321538141749572805615961127651504830002897128896291472895528646

Test 2: throughput

Count: 8

Code:

  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  st1 { v0.16b, v1.16b, v2.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020512005089900000123000120025016102524010010024000010024000050055194240120026120040120040899553900002401002002400002004800001200401200491180201100991001008000080000100240000036002400021022400022340005110116211200370240000100120041120043120043120052120041
24020412005193100000996000120036161612524010010024000010024000050055193520120017120042120051899553900002401002002400002004800001200421200401180201100991001008000080000100240000034002400001052400020340005110116111200390240000100120043120050120043120043120043
2402041200409300000012300012002716160252401001002400001002400005005519424012001712004212005089955390000240100200240000200480000120042120049118020110099100100800008000010024000000002400021052400002340005110216211200390240000100120043120050120043120041120052
240204120042931000001230001200251600252401001002400001002400005005519784012470812004212004289955389998240100200240000200480000120049120040118020110099100100800008000010024000003400240002102240002000005110116221200480240000100120041120043120043120050120051
2402041200409310000012000012002716160252401001002400001002400006005519760012001512004212004089955390000240100200240000200480000120051120042218020110099100100800008000010024000003400240002008240002200005110116211200470240000100120041120052120043120043120041
2402041200429310000006001120025161602524010010024000010024000050055194240120015120040120042899633900002401002002400002004800001200401200501180201100991001008000080000100240000034002400020022400022344005110116121200370240000100120043120043120041120041120043
24020412004993100000030001200271602252401001002400001002400005005519424012001712004212004089953390000240100200240136200480000120042120040118020110099100100800008000010024000000002400021022400022400005128116111200370240000100120041120043120043120052120182
2402041200519300000000000120036161602524010010024000010024000050055194241120017120042120042899623901162401002002400002004800001200421200491180201100991001008000080000100240000034002400020252400000340005110116111200370240000100120043120041120043120043120043
2402041200409310000000000120027160722524010010024000010024000050055194240120017120042120042900473900002401002002400002004802461200401200501180201100991001008000080000100240060034002400020052400002340015110216111200470240000100120051120043120043120051120043
2402041200409320000003000120027016056240100100240000100240000500551935201200251200421200408995539012324010020024000020048000012004212004011802011009910010080000800001002400000340024000200112400002340005110216211200390240000100120043120043120050120043120043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cd | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24002512004293000000000300120027161602524001010240000102400005055194240120015120042120051899843900222400102024000020480000120042120042118002110910108000080000102400000340024000200224000203400502043163312003900024000010120043120041120051120182120043
240024120042964110101339697001203111616141178224151011240060112402165055339320120137120180120461901624090255240335202403602048072012032312018931800211091010800008000010241202005802401820016272401222000507333434512015710024000010120043120051120043120043120051
24002412004296500000000300120027161602524001010240000102400005055194240120024120050120042899773900222400102024000020480000120042120042118002110910108000080000102400000340024000010224000223400502013163312003900024000010120043120043120052120043120043
24002412004296400000000610120027161602524001010240000102400005055194240120017120042120051899773900222401282024000020480000120053120040118002110910108000080000102400000340024000200224000223400502013164312003900024000010120043120043120050120043120043
24002412004296400000000300120027161602524001010240000102400005055194240120017120042120042899773900292400102024000020480000120042120050118002110910108000080000102400000340024000200524000203400502003163312003900024000010120043120043120050120043120043
240024120042964000000003001200271616282524001010240000102400005055193520120025120040120042899773900222400102024000020480000120042120042118002110910108000080000102400000340024000200224000243400502003163312003900024000010120043120043120043120043120043
2400241200409640000100030012002701622524001010240000102400005055198080120017120042120042899773900222400102024000020480000120042120042118002110910108000080000102400000340024000200524000223400502003163312003900024000010120050120043120043120052120041
24002412004096400000000010120035161602524001010240000102400005055194240120017120042120049899773900222400102024000020480000120049120042118002110910108000080000102400000340024000200524000223400502003163312003900024000010120041120043120043120043120043
240024120040964000000012300120025161622524001010240000102400005055198080120017120042120042899773900222400102024000020480000120051120042118002110910108000080000102400000340024000210224000203400502003163312003700024000010120043120050120043120043120055
2400241200499640000000123001200271600252400101024000010240000505519352012001712004212005189984390030240010202400002048000012004912004211800211091010800008000010240000034002400020022400024000502003163312003900024000010120043120043120052120041120043