Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

AESE + AESMC

Test 1: uops

Code:

  aese v0.16b, v1.16b
  aesmc v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a0 | a6 | a8 | ac | cf | d6 | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
20043037251206125482510001000100039831303018303730372290327701000200030003037303711200120000000143162884200030383038303830383038
200430372401246125482510001000100039831303018303730372290327701000200030003037303711200120000000136162884200030383038303830383038
2004303725006125482510001000100039831303018303730372290327701000200030003037303711200120000000136162884200030383038303830383038
20043037240025125482510001000100039831303018303730372290327701000200030003037303711200120000003136162884200030383038303830383038
2004303724006125482510001000100039831313018340830372290327701000200030003037303711200120000000136162884200030383038303830383038
2004303725006125482510001000100039831313018303730372290327701000200030003037303711200120000000136162884200030383038303830383038
2004303724906625482510001170128339831303018303730732290327701000200030003037303711200120000000136162884200030383038303830383038
2004303725006125482510001000100039831303018303730372290327701000200030003037303711200120000000136162884200030383038303830383038
2004303722006125482510001000100039831303018303730372290327701000200030003037303711200120000000136162884200030383038303830383038
2004303722006125482510001000100039831303018303730372290327701000200030003037303711200120000000136162884200030383038303830383038

Test 2: Latency 1->1

Code:

  aese v0.16b, v1.16b
  aesmc v0.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2020434883256002002632954825101001001000010010000500427730003001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131114163329885200001003003830038300383003830038
202043003722500001642954825101001001000010010000500427730003001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131113163329885200001003003830038300383003830038
202043003722500001642954825101001001000010010000500427730013001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131113163229885200001003003830038300383003830038
202043003722500001872954825101001001000010010000500427730013001830037300372701532749510100200206832023000030037300371120201100991001002000010000000131012163329885200001003003830038300383003830038
202043003722400000612954825101001001000010010000500427730013001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131113163329885200001003003830038300383003830038
202043003722500001612954825101001001000010010000500427730013001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131113163229885200001003003830038300383003830038
202043003722500000612954825101001001000010010000500427730003001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131012163329885200001003003830038300383003830038
202043003722500001612954825101001001000010010000500427730013001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131013163230094200001003003830038300383003830038
202043003722500001642954825101001001000010010000500427730013001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131012163329885200001003003830038300383003830038
2020430037225000004192954825101001001000010010000500427730003001830037300372701532749510100200200002003000030037300371120201100991001002000010000000131013163229885200001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
20024335352360061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700216132988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116122988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116232988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116342988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116132988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700216132988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116132988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116112988420000103003830038300383003830038
20024300372250061295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116112988420000103003830038300383003830038
20024300372250084295482510010101000010100005042773133001830037300372703732751710010202000020300003003730037112002110910102000010012700116132988420000103003830038300383003830038

Test 3: Latency 1->2

Code:

  aese v0.16b, v0.16b
  aesmc v0.16b, v0.16b
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702262749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038
202043003722500632954625101001001000010010000500427714703001830037300372702162749110100200200082003001230037300371120201100991001002000010000111131701161129888200001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 0b | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200243003722511037129547251001010100001010000504277160130018300373003727036327517100102020000203000030037300371120021109101020000100000001276221612152988320000103003830038300383003830038
20024300372251103712954725100101010000101000050427851103001830037300372703632751710010202000020300003003730037112002110910102000010000000127614161382988320000103003830038300383003830038
2002430037225110314629547251001010100001010000504277160030018300373003727036327517100102020000203000030037300371120021109101020000100000001276131611132988320000103003830038300383003830038
200243003722511037129547251001010100001010000504277160030018300373003727036327517100102020000203000030037300371120021109101020000100100001276131610142988320000103003830038300383003830038
200243003722511037129511251001010100081010000504278511130018300373003727036327517100102020000203000030037300371120021109101020000100800001276131614132988320000103003830038300383003830038
200243003722511037129547251001010100001010000504277160130018300373003727036327517100102020000203000030037300371120021109101020000100000001276141612122988320000103003830038300383003830038
20024300372251103712954725100101010000101000050427716003001830037300372703632751710010202000020300003003730037112002110910102000010210000127613161482988320000103013430087300383003830038
200243003722611131132954725100101010000101000050427716013001830037300372703632751710010202000020300003003730037112002110910102000010000000127681613142988320000103003830038300383003830038
2002430037225110311329547251001010100001010000504277160030018300373003727036327517100102020000203026730037300371120021109101020000100103001276131613132988320000103003830038300383003830038
200243003722511037129547251001010100001010000504277160030018300373003727036327517100102020000203000030037301811120021109101020000100103001276141612132988320000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  aese v0.16b, v8.16b
  aesmc v0.16b, v0.16b
  movi v1.16b, 0
  aese v1.16b, v8.16b
  aesmc v1.16b, v1.16b
  movi v2.16b, 0
  aese v2.16b, v8.16b
  aesmc v2.16b, v2.16b
  movi v3.16b, 0
  aese v3.16b, v8.16b
  aesmc v3.16b, v3.16b
  movi v4.16b, 0
  aese v4.16b, v8.16b
  aesmc v4.16b, v4.16b
  movi v5.16b, 0
  aese v5.16b, v8.16b
  aesmc v5.16b, v5.16b
  movi v6.16b, 0
  aese v6.16b, v8.16b
  aesmc v6.16b, v6.16b
  movi v7.16b, 0
  aese v7.16b, v8.16b
  aesmc v7.16b, v7.16b
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3758

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
240204300912259630259011210090012100900215007502403004630066300666139012120016003720024005630066300661124020110099100100240000100001111511941600300632400001003006730067300673006730067
24020430066225030259011210090012100900215007502403004630066300666139012120016003720024005630066300661124020110099100100240000100001111511901600300632400001003006730067300673006730067
240204300662252430259011210090012100900215007502403004630066300666139012120016003720024005630066300661124020110099100100240000100001111511901600300632400001003006730067300673006730067
240204300662264830259011210090012100900215007502403004630066300666139012120016003720024005630066300661124020110099100100240000100001111511901600300632400001003006730067300673006730067
24020430066225030259011210090012100900215007502403004630066300666319012120016003720024005630066300661124020110099100100240000100001111511901600300632400001003006730067300673006730067
2402043006622528830259011210090012100900215007502403004630066300666139012120016003720024005630066300661124020110099100100240000100002221513112311300752400001003007930079300793007930080
24020430078225066289011410090014100900235007503963005730078300789139012320016004120024006230078300781124020110099100100240000100002221513112311300752400001003015530079300793008030079
240204300792253666289011410090014100900235007503963005730078300799139012320016004120024006230078300791124020110099100100240000100002221513112311300752400001003008030079300793007930079
240204300782250662790114100900141009002350075039630057300783007810139012320016004120024006230079300781124020110099100100240000100002221513112311300762400001003008030079300793007930080
240204300792252766279011410090014100900235007503963005730078300789139012320016004120024006230078300781124020110099100100240000100001111512122422300732400001003007730077300773007730077

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3763

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2400243019122500003330394389001010900011090000507528332153100830109301060323900102016000020241137301353008411240021109101024000010000150522713724748862521300931175717240000103010330100301033010030100
2400243010222600000603943890010109000010903835075309331530907300993008403249001020160000202400003010230181112400211091010240000102100150523513725729962222300951015717240000103010330100301033011030103
2400243008422500000390288389001010900001090000507566275110308973009930084032490010201600002024000030102301021124002110910102400001000015052491371872886921300931175417240000103010330308301003010030103
240024300842260000036038838900101090000109000050753093311030903300993008403133900102016000020240000301023009911240021109101024000010000150423215723748861222300951175417240000103010330100301003010330103
24002430084225000002703983890010109000010900005074838241103093330102300990324900102016000020240000301023010211240021109101024000010000150534515719728861923300931175417240000103010330100301003010030103
24002430084225000002703943890010109016010900005075309331103090830099301020324900102016018120240000301343010211240021109101024000010000150513213719728861911300931175719240000103010330100301003010030103
24002430102225000002705114389001010900001090000507530933110309113010230099032390010201600002024000030102301021124002110910102400001003015062572715281181616112814301391978110240000103010030100300853010030100
2400243008422600000003883890010109000010900005075309331103095130102300990324900102016000020240000300843010211240021109101024000010000150493213719728861023300931175417240000103010330100301003010030103
24002430102225000003303883890010109000010900005075309331103089030154301510324900102016000020240000301483013411240021109101024000010000150393213718728861919300931175417240000103010330109301003010030103
24002430102226000002404943890010109000010900005075309331103094230131300840323900102016000020240000301023010911240021109101024000010000150563513819721087229300931175417240000103010330100301003010030100

Test 5: throughput

Count: 16

Code:

  aese v0.16b, v16.16b
  aesmc v0.16b, v0.16b
  aese v1.16b, v16.16b
  aesmc v1.16b, v1.16b
  aese v2.16b, v16.16b
  aesmc v2.16b, v2.16b
  aese v3.16b, v16.16b
  aesmc v3.16b, v3.16b
  aese v4.16b, v16.16b
  aesmc v4.16b, v4.16b
  aese v5.16b, v16.16b
  aesmc v5.16b, v5.16b
  aese v6.16b, v16.16b
  aesmc v6.16b, v6.16b
  aese v7.16b, v16.16b
  aesmc v7.16b, v7.16b
  aese v8.16b, v16.16b
  aesmc v8.16b, v8.16b
  aese v9.16b, v16.16b
  aesmc v9.16b, v9.16b
  aese v10.16b, v16.16b
  aesmc v10.16b, v10.16b
  aese v11.16b, v16.16b
  aesmc v11.16b, v11.16b
  aese v12.16b, v16.16b
  aesmc v12.16b, v12.16b
  aese v13.16b, v16.16b
  aesmc v13.16b, v13.16b
  aese v14.16b, v16.16b
  aesmc v14.16b, v14.16b
  aese v15.16b, v16.16b
  aesmc v15.16b, v15.16b
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2504

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3202044009230000312516011610016001610016002850012801961040051040067400670614160128200320056202480084400674006711320201100991001003200001000010301112012600216334006403200001004006840068400684006840068
320204400673000031251601161001600161001600285001280196104004504006740067061416012820032005620048008440067400671132020110099100100320000100000072801112012200316554027103200001004006840068403104006840068
32020440067300300312516011610016001610016002850012801960540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012253316434006403200001004006840068400684006840068
3202044006730000312516011610016001610016002850012801961040045040067400670614160128200320056200480084400674006711320201100991001003200001002000001112012154316244016303200001004006840068400684006840068
3202044006730000312516011610016001610016002850012801961540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012454316324006403200001004006840068400684006840068
3202044006730000312516011610016001610016002850012801961540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012254316364006403200001004006840068400684006840068
32020440067300240312516011610016001610016002850012801961540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012054316534006403200001004006840068400684006840068
32020440067300360312516011610016001610016002850012801961540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012354316454006403200001004006840068400684006840068
32020440067300988312516011610016001610016002850012801961540052040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012454416134006403200001004006840068400684006840068
32020440067300606962516011610016001610016002850012801961540045040067400670614160128200320056200480084400674006711320201100991001003200001000000001112012254316444006403200001004006840068400684006840068

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2560

retire (01) | cycle (02) | 03 | 07 | 0a | 0b | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
320024412303086300008810192951600101016000010160000501280000315431080409764095403221600102032000020480000409314095211320021109101032000010480020365371186121236943147140682612408441302281330320000104098341021410294096940906
320024409443075300007194632616001010160000101600005012800007154294904096340944032216001020320000204800004094240942113200211091010320000100930204053801941242241017159163742813408601318295333320000104102741027410254102740961
320024409183064400007697229716001010160000101600005012800004154286334096440927032216001020320000204800004095940918113200211091010320000100002039134919111722489215613970249408191310283335320000104090740943409074090740949
32002440906306320000809492951600101016000010160000501280000415426990410874101603221600102032000020480000409584095811320021109101032000010000203863791911162111045180163752411409471320290335320000104095941046409334096141059
3200244101230634009089974294160010101600001016000050128000081542932040982411250322160010203200002048000040958409581132002110910103200001000020399359199117211904155142702512408721318282333320000104095940931409594095940937
32002540952307320000821000288160010101600001016000050128000001542733041065411160322160010203200002048000040930409301132002110910103200001000020386371199117225882149147722611408441318300333320000104093340931409324093240959
3200244095830632000082954289160010101600001016000050128000001542809041105410690322160010203200002048000040930409301132002110910103200001000020371359203123212932151147712611408441462356405320000104105941059410604103241059
32002441058308340000881031294160010101600001016000050128000001542922041018411080322160010203200002048000040958409581132002110910103200001000020375347315126226974175158722711408721305300333320000104093140959409314093141032
32002441083308430000881060295160010101600001016000050128000041542904041032410880322160010203200002048000041058410591132002110910103200001003020385371190121128934155148701422408521462356414320000104094140928409874095040959
3200244095830633000080949294160010101600001016000050128000041542778041070409250322160010203200002048000041040410071132002110910103200001000020384358202121127981171160742211409411399345387320000104093140931409414094340943