Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (multiple, 3 regs, 2D)

Test 1: uops

Code:

  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 3.000

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
63005295582374030030069110458728975332327030003000300015903916995028309292553103000300060002912529168116100110001000300006030001063000060001281590196807311514820554303338051452492841316461133351500630002924029305293182927729244
6300429208219002002002401004560290403023168300030003000159016170600283102924722103000300360002919529089116100110001000300000030001063000090001292391596940310714520638303438061546502840116371133951506430002930929247292982931229181
6300429297219000001009100455229062302316130003000300015900517027028324293343103000300060002910729161116100110001000300000030000003000060001316993476904316314920770315338141352702855516241133301474730002937629374293352932929376
63004292382270020020091004644290830023284300030003000158957170360287952963832730003000600629421293601161001100010003000401300301330005930013758101697230340114819435332237991547522790814326118171342230002822728180283322843028154
6300428197212012001109300506028092402213130003000300015906817048027974282823103000300060002827728221116100110001000300009030001003000090001355399667243337005319511331938111340472788814498117321301530002824428339283962833428356
630042848321200100100375100513628149402213730003000300015907917055027895285333103000300060002815928366116100110001000300000030000003000000001374110302706035260471961433343812748522786414011118021303830002821828164282812831228679
6300428518223001001006091005076281260422140300030003000158994170460280552832631030003000600028205281711161001100010003000090300000030010600013871101877215347715019590330038131544512787314421120361364530002832227970283902831128190
6300428204213001002004294015122282404422119300030003000159021017052028081284873103000300060002810828267116100110001000300000030000003000000001372299697089331415019730331038121447472797214620120041291430002838128181283092840628235
6300428359212001000000100501128253442221130003000300015905917057027893281353103000300060002833728238116100110001000300009030000003000090001319795357015315804920043317838111346492799016122131331475130002919329224291882924729347
63004293392271112220327392004596289400323087300030003000159084170490287172945331030003000600029259291841161001100010003000062300000030003120001304993156898314405020901341938131647512824515263125031417630002860528727286462854428753

Test 2: throughput

Count: 8

Code:

  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  st1 { v0.2d, v1.2d, v2.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24020512019193100000012190001200271600252401001002400001142400075005519455012001712004212004089961790013240225200240016200480032120040120042118020110099100100800008000010024000003400240002025240002200011151170160012004700240000100120051120043120043120043120180
240204120042931000001264283000120307160735524028811324162011524043260155338600120375120181120464902392790115240217200240360200480476120182120603218020110099100100800008000010024144203669202410820079324012223400000513413411120277190240000100120464120321120043120041120052
240204120042931000000121900012002716160252401001002400001002400005005519424012001712004212005089955390009240100200240000200480000120042120042118020110099100100800008000010024000000002400021022400022340000051101161112003700240000100120043120123120052120043120043
24020412005193000000003000120027161602524010010024000010024000050055194240120025120042120051899633900002401002002400002004800001200421200421180201100991001008000080000100240000034002400000022400022340000051101161112003900240000100120043120043120043120043120052
24020412004293100000012300012002716160252401001002400001002400005005519352012001512004212005089963389998240100200240000200480000120042120042118020110099100100800008000010024000003400240002008240002200000051101160112004800240000100120041120043120043120043120043
2402041200429310000000300012002701672524010010024000010024000050055199080120033120059120059899743900172401002002400002004800001200501200521180201100991001008000080000100240015153601240016012224000218340000051101161112003900240000100120043120043120043120043120051
24020412004293000000001000012003501602524010010024000010024000050055194240120026120042120042899643900002401002002400002004800001200511200421180201100991001008000080000100240000034002400020022400022340010051101161112003700240000100120043120041120041120043120052
24020412004293000000030100120027161602524010010024000010024000050055194240120025120042120128899533900002401002002400002004800001200501200421180201100991001008000080000100240000034002400020022400022340000051101161112003900240000100120043120043120050120043120050
24020412005093100000036300012002701632524010010024000010024000050055194240120024120042120042899623900002401002002400002004800001200501200421180201100991001008000080000100240000036002400020022400022340000051101161112003900240000100120043120043120051120043120043
24020412005093100000012310012002716160252401001002400001002400005005519424012001712004012004089953390000240100200240000200480000120042120049118020110099100100800008000010024006003400240002102240002200000051101161112003900240000100120041120041120043120043120043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.5005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240025120049931100021512010012003616160252400101024000010240000505519424012001701200421201818998439002224001020240000204802421200511200421180021109101080000800001024000003400240000003240000234000050201116613120039024000010120043120181120041120041120050
24002412005193000000012900012002501602524001010240000102400005055193520120026012005112018089977390022240010202400002048000012005012004011800211091010800008000010240060034002400021022400022340000502013161313120046024000010121441120043120043120043120043
240024120042931000000123000120027161602524001010240000102400005055242600120025012004212004289977390030240010202400002048000012005012004011800211091010800008000010240000034002400025081224000223400005020616614120039024000010120041120058120043120043120043
240024120042931000000090001200271616025240010102400001024000050551942401200170120042120042899843900222400102024000020480000120042120042118002110910108000080000102400141436022400761120240002164214000502013161212120049124000010120059120069120054120048120053
24002412005993011200001900112004401652524001010240000102400005055202680120022012028112005989984390039240010202400002048000012006212005211800211091010800008000010240014143601240014101824006214381400050201316613120165024000010120051120043120043120043120043
2400241200429310000003300012002716160252400101024000010240000505519424012001701200421200508997739013524001020240000204800001200401200421180021109101080000800001024000003400240002102240002034000050201316613120039024000010120051120043120043120043120182
24002412004093100003003000120027161602524001010240120102400005055194240120017012004012005089977390137240010202400002048000012004212004011800211091010800008000010240000034002400020022400022340000502012161313120039024000010120041120043120043120055120043
24002412004093000000009000120027161602524001010240000102400005055242600120025012004212004289977390022240010202400002048000012004012004211800211091010800008000010240000034002400020082400022340000502014161313120047024000010120044120050120043120043120041
240024120042931000000123000120167161676252400101024000010240000505520000012001761200421200509007039002224001020240000204800001200421200401180021109101080000800001024000020002400021022400022340000502013161313120039024000010120043120043120055120043120050
2400241200429300000000300012002716160252400101024000010240000505519424012002601200421200408997739002224001020240000204800001200501200421180021109101080000800001024000003400240002100240002234000050201116814120160224000010120462120182120329120337120462