Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (scalar, D)

Test 1: uops

Code:

  fsqrt d0, d0
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10041303797132061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037981226471115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037989061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037976061115402510001000100045795311301813037130371224831289510001000100013037130371110021000073116111243110001303813038130381303813038
1004130379724061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037980061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
1004130379700149115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037980061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037970061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038
100413037980061115402510001000100045795311301813037130371224831289510001000100013037130371110011000073116111243110001303813038130381303813038

Test 2: Latency 1->2

Code:

  fsqrt d0, d0
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 13.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020413003797400000025111954025101001001000010010000500466095311300181300371300371281057128740101002001000820010008130037130037111020110099100100100001000000011171801700129444010000100130038130038130038130038130038
102041300379740000174072611954025101001001000010010000500466095301300181300371300371281057128740101002001000820010008130037130037111020110099100100100001000000011171701600129444010000100130038130038130038130038130038
10204130037974000000168411954025101001001000010010000500466095311300181300371300371281056128741101002001000820010008130037130037211020110099100100100001000000011171801600129443010000100130038130038130038130038130038
1020413003797400000061119540251010010010000100100005984660953113001813003713003712810514128740101002001000820010008130037130037111020110099100100100001000000011171701600129443010000100130038130038130038130038130038
102041300379740000006111954025101001001000010010000500466095301300181300371300371281056128740101002001000820010008130037130037111020110099100100100001000000011171801600129444010000100130038130038130038130038130038
102041300379740000006111954025101001001000010010000500466095301300181300371300371281056128740101002001000820010008130037130037111020110099100100100001000000011171701600129444010000100130038130038130038130038130038
10204130037974000000611195322510100100100001001000050046609531130018130037130037128105712874110100200100082001000813003713003711102011009910010010000100000026711171801600129444010000100130038130038130038130038130038
10204130037974000000611195402510100100100001001000050046609531130018130037130037128105712874110100200100082001000813003713003711102011009910010010000100000030011171801600129444010000100130038130038130038130038130038
10204130037974000063061119540251010010010000100101525004660953113001813003713003712810561287411010020010008200100081300371300371110201100991001001000010000780311171701600129444010000100130038130038130038130038130038
1020413003797400000061119540251010010010000100100005004660953113001813003713003712810571287411010020010008200100081300371300371110201100991001001000010000820011171801600129443010000100130179130085130038130038130038

1000 unrolls and 10 iterations

Result (median cycles for code): 13.0037

retire (01) | cycle (02) | 03 | 18 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002413003797400006111954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100900640216221294311000010130038130038130038130038130038
10024130037974057009611954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
1002413003797400006111954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
1002413003797400006111954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
10024130037974000061119540251001010100081010000504660953113001813003713003712812003128767100102010000201000013003713003711100211091010100001085300640216221294311000010130038130038130038130038130038
1002413003797400006111954025100101010000101000050466095311300181300371300371281840312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
10024130037974000091611954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
10024130037974000061119540251001010100001010000504660953113001813003713003712812003128767100102010000201000013003713003711100211091010100001088000640216221294311000010130038130038130038130038130038
1002413003797400006111954025100101010000101000050466095311300181300371300371281200312876710010201000020100001300371300371110021109101010000100000640216221294311000010130038130038130038130038130038
10024130037974000061119540251001010100001010000504660953113001813003713003712813903128767100102010000201000013003713003711100211091010100001093000640216221294311000010130038130038130038130038130038

Test 3: throughput

Count: 8

Code:

  fsqrt d0, d8
  fsqrt d1, d8
  fsqrt d2, d8
  fsqrt d3, d8
  fsqrt d4, d8
  fsqrt d5, d8
  fsqrt d6, d8
  fsqrt d7, d8
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d0 | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020416004811980012757993725801001008003410080000500567798410160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117001160015998180000100160049160049160049160049160049
80204160048119900617993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100131115117000160015998180000100160049160049160049160049160049
802041600481198007267993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049
80204160048119900617993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160016005180000100160049160049160049160049160049
8020416004811990027807993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049
802041600481199002987993725801171008000010080000500567798410160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049
80204160048119900617993725801001008000010080000555567798410160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100131115117000160015998180000100160049160049160049160049160049
80204160106119800617993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049
802041600481198006317993725801001008000010080000500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049
802041600481199654021707993725801001008000010080049500567798400160029160048160048149895615000280100200800082008000816004816004811802011009910010080000100001115117000160015998180000100160049160049160049160049160049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | ac | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | db | dd | inst fetch restart (de) | e0 | eb | ec | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002416004811994206179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049
800241600481198006179937258001010800001080051505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049
800241600481198006179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100230050350117011159971008000010160049160049160049160049160049
800241600481199006179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049
800241600481199006179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000150200116011159971008000010160049160049160049160049160049
800241600481199006179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200216011159971008000010160049160049160049160049160049
800241600481199006179937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000105000050200116011159971008000010160049160049160049160049160049
8002416004811990043479937258002710800171080051505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049
8002416004811990072679937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049
800241600481199006679937258001010800001080000505677984011600291600481600481499100315002880010208000020800001600481600481180021109101080000100000050200116011159971008000010160049160049160049160049160049