Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

AESD + AESIMC

Test 1: uops

Code:

  aesd v0.16b, v1.16b
  aesimc v0.16b, v0.16b

Initialization (not part of the measured sequence; the counts below report 2 retires per unroll):

  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a1 | a6 | a8 | a9 | ac | cf | d6 | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
2004303722061254825100010001000398313030183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303722061254825100010001000398313030183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303723061254825100010001000398313030183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303723061254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303723061254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303723061254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303723061254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303722061254825100010001000398313030183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303722061254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038
2004303722361254825100010001000398313130183037303722903277010002000300030373037112001200000000136162884200030383038303830383038

Test 2: Latency 1->1

Code:

  aesd v0.16b, v1.16b
  aesimc v0.16b, v0.16b

Initialization (not part of the measured sequence):

  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202043443324100061295482510100100100001001000050042773001300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
2020430037224210061295482510100100100001001000050042773001300183003730037270153274951010020820000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
2020430037225000536295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310131622298850200001003003830038300383003830038
2020430037225000348295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
2020430037225000726295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310131622298850200001003003830038300383003830038
202043003722500061295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
202043003722500061295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310131622299310200001003003830038300383003830038
202043003722500163295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
202043003722500061295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038
202043003722500061295482510100100100001001000050042773000300183003730037270153274951010020020000200300003003730037112020110099100100200001000001310121622298850200001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200243353323606129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
200243003722506129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
200243003722406129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161229884020000103003830038300383003830038
200243003722506129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001162229884020000103003830038300383003830038
200243003722506129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
200243003722506129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127002161129884020000103003830038300383003830038
2002430037225072629548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
200243003722506129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
2002430037225025129548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000127001161129884020000103003830038300383003830038
2002430037225023229548251001010100001010000504277313300183003730037270373275171001020200002030000300373003711200211091010200001000000129802161129884020000103003830038300383003830038

Test 3: Latency 1->2

Code:

  aesd v0.16b, v0.16b
  aesimc v0.16b, v0.16b

Initialization (not part of the measured sequence):

  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202043003722500000005362954625101001001000010010000500427714713001830037300372702162749110100200200082003001230084300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000612954625101001001000010010000500427714703001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011612298880200001003003830038300383003830038
20204300372250000000612954625101001021000010010000500427714703001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000612954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000642954625101001001000010010000500427714703001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000612954625101001001000010010000500427714703001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
202043003722500000005362954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
20204300372250000000612954625101001001000010010000500427714713012630037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038
202043003722500000007262954625101001001000010010000500427714713001830037300372702162749110100200200082003001230037300371120201100991001002000010000001111317011611298880200001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a9 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2002430037224061295472510010101000010100005042771601300183003730037270363275381001020200002030000300373003711200211091010200001001270116112988320000103003830038300853003830038
2002430037225061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037225061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037233661295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037225961295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037225061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037224061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037225061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037224061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038
2002430037225061295472510010101000010100005042771601300183003730037270363275171001020200002030000300373003711200211091010200001001270116112988320000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  aesd v0.16b, v8.16b
  aesimc v0.16b, v0.16b
  movi v1.16b, 0
  aesd v1.16b, v8.16b
  aesimc v1.16b, v1.16b
  movi v2.16b, 0
  aesd v2.16b, v8.16b
  aesimc v2.16b, v2.16b
  movi v3.16b, 0
  aesd v3.16b, v8.16b
  aesimc v3.16b, v3.16b
  movi v4.16b, 0
  aesd v4.16b, v8.16b
  aesimc v4.16b, v4.16b
  movi v5.16b, 0
  aesd v5.16b, v8.16b
  aesimc v5.16b, v5.16b
  movi v6.16b, 0
  aesd v6.16b, v8.16b
  aesimc v6.16b, v6.16b
  movi v7.16b, 0
  aesd v7.16b, v8.16b
  aesimc v7.16b, v7.16b
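
Initialization (not part of the measured sequence; "Count: 8" refers to the eight movi/aesd/aesimc groups above):

  movi v8.16b, 9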

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3758

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2402043009222593025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240030046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240030046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240030046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240030046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067
2402043006622503025901121009001210090021500750240130046300663006661390121200160037200240056300663006611240201100991001002400001000001111511916300632400001003006730067300673006730067

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3761

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2400243022822600000063392590011109000010900015075487851530067300893008603249001020160000202400003009030086112400211091010240000100000300015114140221325581212761530083120634240000103008730087300873008730087
240024300902250000004121259001010900001090000507548785153006730086300860324900102016000020240000300863008611240021109101024000010000060000150553820119581212715730083120634240000103008730087300873008730087
24002430086225000000412125900101090000109000050754878615300673008630086032490010201600002024000030086300861124002110910102400001000016000150483820117581212751430083120634240000103008730087300873008730087
24002430086226000000412125900101090000109000050754878515300673008630086032490010201600002024000030086300861124002110910102400001000006000150483820115581212771030083120634240000103010930087301093010930087
24002430086225000000412125900101090000109000050754878515300893010830108032490187201600002024000030086300861124002110910102400001000000000150463820118581212714630083120634240000103008730087300873008730087
2400243008622500000041212590010109000010900005075487851530067300863008603249001020160000202400003008630086112400211091010240000100000301015045381911858121279730083120634240000103008730087300873008730087
24002430086225000000412125900101090000109000050754878515300673008630086032490010201600002024000030086300861124002110910102400001000003000150443822111281121279930083120634240000103010930109300873008730109
24002430086226000000412125900101090000109000050754878615300673008630086032490010201600002024000030086300861124002110910102400001000009000150534422141081181210161130104176634240000103008730087300873008730087
24002430086226000000412125900101090000109000050754878515300673008630090032490010201600002024000030086300861124002110910102400001000006000150473819117581212771530083120634240000103008730087300873008730087
24002430086226000000412125900101090000109000050753948515300673008630086032490010201600002024000030086300861124002110910102400001000000000150533819118581212712730083135634240000103008730090300873008730087

Test 5: throughput

Count: 16

Code:

  aesd v0.16b, v16.16b
  aesimc v0.16b, v0.16b
  aesd v1.16b, v16.16b
  aesimc v1.16b, v1.16b
  aesd v2.16b, v16.16b
  aesimc v2.16b, v2.16b
  aesd v3.16b, v16.16b
  aesimc v3.16b, v3.16b
  aesd v4.16b, v16.16b
  aesimc v4.16b, v4.16b
  aesd v5.16b, v16.16b
  aesimc v5.16b, v5.16b
  aesd v6.16b, v16.16b
  aesimc v6.16b, v6.16b
  aesd v7.16b, v16.16b
  aesimc v7.16b, v7.16b
  aesd v8.16b, v16.16b
  aesimc v8.16b, v8.16b
  aesd v9.16b, v16.16b
  aesimc v9.16b, v9.16b
  aesd v10.16b, v16.16b
  aesimc v10.16b, v10.16b
  aesd v11.16b, v16.16b
  aesimc v11.16b, v11.16b
  aesd v12.16b, v16.16b
  aesimc v12.16b, v12.16b
  aesd v13.16b, v16.16b
  aesimc v13.16b, v13.16b
  aesd v14.16b, v16.16b
  aesimc v14.16b, v14.16b
  aesd v15.16b, v16.16b
  aesimc v15.16b, v15.16b
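
Initialization (not part of the measured sequence; "Count: 16" refers to the sixteen aesd/aesimc pairs above):

  movi v16.16b, 17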

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2504

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d6 | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020440091300074025160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300097825160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300016325160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300014025160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300014225160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300063625160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
3202044006730007325160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300016125160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300014025160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068
32020440067300077925160116100160016100160028500128019604004540067400670614160128200320056200480084400674006711320201100991001003200001000011120119160400643200001004006840068400684006840068

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2550

retire (01) | cycle (02) | 03 | 04 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32002541556306100000482164723016001010160000101600005012800001140776407804078732216001020320000204800004079640792113200211091010320000100002035734020211634771137141542421407651334253339320000104078640785408004110240781
32002440765306000000404115917616001010160000101600005012800001140764407804078132216001020320000204800004079640780113200211091010320000100002035934020011324755140141532421407481138186237320000104080040782408014078440802
32002440817305000000403164317716001010160000101600005012800001140761407854079932216001020320000204800004081840797113200211091010320000100002035733620011619738134141532020407471187210246320000104078140799407824079940783
32002440786305000090406120418116001010160000101600005012800000140896407794080032216001020320000204800004077740770113200211091010320000100302035434220211419760136142532220407481122188242320000104076740785408034079040781
32002440780306100000403122418316001010160000101600005012800000140766407884078632216001020320000204800004078040780113200211091010320000100002036434520211621769159158582223408591303253252320000104078640800407814079940801
3200244078230500000040197517616001010160000101600005012800000140762407814078332216001020320000204800004081840838113200211091010320000100002036534320211321757137138542420407481123189229320000104079640782407994079140781
32002440782306000000392105517716001010160000101600005012800000140766407804078232216001020320000204800004078440783113200221091010320000100002036034320211321756137160542320407651140197252320000104089840800408134080040910
32002440814305000000400110118216001010160000101600005012800000140765407974081632216001020320000204800004079540782113200211091010320000100002035334320211323773135142542020408371173204260320000104078240796407804079940783
32002440782305000000398127134416001010160000101600005012800000140892408004081532216001020320000204800004087740811113200211091010320000100002035635520611623769137142542024407611235194243320000104080040800408054078340801
32002440805305000000377108918016001010160000101600005012800000140781407824078032216001020320000204800004080040809113200211091010320000100002036033720211721773139142552316407551144188254320000104078340784408074081640832