Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 3 regs, 4S)

Test 1: uops

Code:

  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
630052948223531412310041004619290750323297300030003000158986101702628583293573103000300060002929029174116100110001000300446030030187300009000131949710694831460372059732393818844412869916363131051469730002930929375294202935129356
63004293402360030000001000464129144002309530003003300015900000170522852129343310300030006000291472908621610011000100030000003000001023000090001303491887026314713320667324438121134342854216128131261475330002933829236293832923029273
63004293872260010020001000465629225332315030003000300015904600170582857629330310300030006000290952911811610011000100030000903000004823000090001319193976940317814620576322838131533322849716132130461462230002925029290293412918229378
6300429296227001010009177000463329048332309930003000300015903600170242863029396330300030006000292012919411610011000100030000903000001203000090001321694696924319404120629318538131236422843916316132721462130002921829393293772925129350
63004293082260010120001000451429171332307630003000300015941700170342862029351328300030006000291102915511610011000100030000903000000300009000130319357691732011342068732513813934352854316301129691457130002927229311293442929329276
6300429208227001003010100046642907030231513000300030001590360017037285942924731030003000600029162291312161002100010003000090300000105300009000132999605696531401382069332563810936362851716354131131459530002925829225292852930129203
630042929022600300300011004709290453323166300030003000159046001702328557291703103000300060002913429208116100110001000300009030000010230000900207133089381700131161342067031293811733392850316010132211454330002939229228291392913029229
630042938022800200200010004647290300123164300030003000159071001705528474293633103000300060002914029059116100110001000300006030000015300006000131849404693932540382067032423820740422846216226128101458130002928529316293272924929325
6300429360227004002000100047072911931232283000300030001589970017039283892924331030003000600029054292721161001100010003000090300000120300009000131449349694431312372066432083814940332862116277130731465730002926629241292172931329215
63004292962270020020001100467229110002317930003000300015934600170402866129245310300030006000292002903511610011000100030000903000006300009000130149460693831871432061930723812836412842116189131821456730002933029238293222922129307

Test 2: throughput

Count: 8

Code:

  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  st1 { v0.4s, v1.4s, v2.4s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5006

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020512006093100001030012002716160252401001002400001002400005005519424120017120179120042899553900082401002002401192004800001200421200401180201100990100100800008000010024000003400240000002240002234005110116111200460240000100120051120043120043120043120043
2402041200429300000003001205841616025240100100240060100240108500554301612013712004212004289955390000240100200240000200480000120049120042118020110099010010080000800001002400000000240002002240002234005110116111200390240000100120043120041120052120043120051
2402041200519300000006001200361616025240100100240000100240000500551942412001712004212004289955390000240100200240000200480000120042120050118020110099010010080000800001002400620000240002502240002234005110116111201580240000100120043120043120043120043120052
240204120042930000000300120027161602524010010024000010024000050055197601200171200401200428995539000724010020024000020048000012004212004221802011009901001008000080000100240000034280240002005240002234005110116111200390240000100120191120043120050120043120043
240204120042931000006300120027161602524010010024000010024000050055198081200151200491200428995539005524020820024000020048000012005012004211802011009901001008000080000100240120434002400620015952400022340251342351112273524240000100120331120179120567120322120459
240204120322934010312641790012031416161488624017612424018011224010859055241641202581203261204569014241902262403322002401192004802381203221203214180201100990100100800008000010024000003400240002202240002234005110116111200390240000100120043120041120043120043120043
240204120042931000001830012059516160252401001002400001002400005655519424120032120042120042899553900002401002002400002004800001200511200421180201100990100100800008000010024000003400240002008240002240005110116111200470240000100120052120043120043120043120043
24020412004293000000129001200281616025240100100240000100240000500552019212001712004212004289953390000240100200240000200480000120040120051118020110099010010080000800001002400000340024000200224000220005110116111200390240000100120043120043120041120043120043
24020412004993100000090012002816160252401001002400001002400005005519424120017120042120042899553900072401002002400002004800001200421200491180201100990100100800008000010024000003400240002008240002234005110116111200390240000100120043120043120043120050120041
24020412004293100000123001200271616025240100100240000100240000500551935212001712004212004289955390000240100200240000200480000120042120042118020110099010010080000800001002400000000240002102240002234005110116111200480240000100120041120043120051120043120043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9e | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b8 | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24002512005293011001000140112003916167252400101024000010240000505520028012002912005412005489998390034240010202400002048000012006312004211800211091010800008000001024001504400240016001624000201644005020013160013111200510024000010120053120055120055120052120048
2400241200529301100100017011200401616025240010102400001024000050551944801200221200541200478998739003224001020240000204800001200421200541180021109101080000800000102400141542002400144180724000202441405020013160012101200500024000010120055120053120055120055120055
24002512005493001000001214011200321616225240010102400001024000050552007601200291200421200548999839003224001020240000204800001200541200421180021109101080000800000102400141444102400160022240002016014050200131600131312006101324000010120055120053120055120055120183
240024120047930100010002101120039161622524001010240000102400005055200280120038120063120054899891690022240010202400002048000012005212005411800211091010800008000001024001414500024001601524000202441405020061600131312005001024000010120055120043120053120055120055
2400241200549300000100129101120027161615624001010240000102400005055200280120026120052120054899893900342400102024000020480000120052120054218002110910108000080000010240015144400240002002024000201648140502001316001381200440024000010120044120055120053120048120044
24002412004793010001000170112017516167925240010102400001024000050552000401200261200521200548999639003424001020240000204800001201921200541180021109101080000800000102400001444310240016002124000201642140502001325006131200400024000010120055120044120204120044120055
24002412005493110000010301120027161612524001010240000102400005055199080120029125897126202899883900342400102024000020480240120054120051118002110910108000080000010240015044002400021016240002016440150200616006131200510024000010120056120055120043120055120055
2400241200429311000000121701120037161662524001010240000102400005055243080120029120054120051899893900342400102024000020480000120043120052118002110910108000080000010240000144201240016011924000201642140502006160013612003902024000010120055120056120053120044120055
240024120063930100000001901120039161622524001010240000102400005055204390120017120054120054899861690032240010202400002048000012005412004211800211091010800008000001024001504400240074001624000201448015020013160013131200510024000010120054120048120055120054120053
2400241200439311000000017001200391516125240010102400001024000050551990801200291200541201918998839003424001020240000204800001200431200521180021109101080000800000102400141444012400160022400620244140502001216001381200490024000010120055120053120064120055120055