Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (vector, 4S)

Test 1: uops

Code:

  fsqrt v0.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500618648251000100010003519410100181003710037939539895100010001000100371003711100110000757311611957810001003810038100381003810038
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500618648251000100010003519411100181003710037939539895100010001000100371003711100110000457311611957810001003810038100381003810038
100410037750961864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611958610001003810038100381003810038
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500618648251000100010003519411100181003710037939539895100010001000100371003711100110000607311611957810001003810038100381003810038
100410037750061864825100010001000351941110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038

Test 2: Latency 1->2

Code:

  fsqrt v0.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204100037749000618964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100086100038100038100038100038
1020410003774900061896482510100100100001001000050035829411100018100037100037982453987451010020010000200100001000371000371110201100991001001000010002040007101161199578010000100100038100038100038100038100038
102041000377490007268964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294111000181000371000379824539874510143200100002001000010003710003711102011009910010010000100000007101171199578010000100100038100038100038100038100038
102041000377490003468964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294111000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294111000181000371000379824539874510100200100002001000010003710003711102011009910010010000100000007101161199578010000100100038100038100038100038100038
10204100037749000618964825101001001000010010000500358294101000181000371000379824539874510100200100002001000010003710003711102021009910010010000100000007101161199578010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100241000377490000900618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100038100038
10024100037750000060900618964825100101010000101000050358294111000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640224229957801000010100038100038100038100038100038
10024100037749000000061896482510010101000010100005035829410100018100037100037982673987671001020100002010000100037100037111002110910101000010001036010640216229957801000010100038100038100038100038100038
100241000377500000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100038100038
100241000377490000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100081100038
100241000377490000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000372110021109101010000100003000640216229957801000010100038100038100038100038100038
100241000377490000600618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100070000640216229957801000010100038100038100038100038100038
100241000377490000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100038100038
100241000377500000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100038100038
100241000377490000000618964825100101010000101000050358294101000181000371000379826739876710010201000020100001000371000371110021109101010000100000000640216229957801000010100038100038100038100038100038

Test 3: throughput

Count: 8

Code:

  fsqrt v0.4s, v8.4s
  fsqrt v1.4s, v8.4s
  fsqrt v2.4s, v8.4s
  fsqrt v3.4s, v8.4s
  fsqrt v4.4s, v8.4s
  fsqrt v5.4s, v8.4s
  fsqrt v6.4s, v8.4s
  fsqrt v7.4s, v8.4s
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802041600451199006179937258010010080000100800005005677984116002601600451600451498950614999980100200800722008000816004516004511802011009910010080000100031000111511701600159981080000100160046160046160046160046160046
80204160045119900542799372580100100800001008000051156779841160026016004516004514989506149999801002008000820080008160045160045118020110099100100800001000123000111511701600159981080000100160046160046160046160046160046
802041600451198006179937258010010080000100800005005677984116002601600451600451498950614999980100200800082008000816004516004511802011009910010080000100040002111511701600160076080000100160046160046160046160046160046
80204160375119903010379937258010010080000100800005005677984116002601600451600451498950614999980100200800082008000816004516004511802011009910010080000100095000111511701600159981080000100160046160046160046160046160046
80204160045119900617993725801001008000010080000500567798411600260160045160045149895061499998010020080008200800081600451601001180201100991001008000010007000111511701600159981080000100160046160046160046160046160046
80204160045119900617993725801001008000010080000500567798411600260160045160045149927061499998010020080008200800081600451600451180201100991001008000010003030111511701700159981080000100160046160046160046160046160046
802041600451199007267993725801001008001610080000500567798411600260160045160045149895061499998015620080008200800081600451600451180201100991001008000010000000111511701600159981080000100160046160046160046160046160046
80204160045119900617993725801001008000010080000500567798411600260160045160045149895061499998010020080008200800081600451600451180201100991001008000010000090111511701600160037080000100160046160046160100160046160046
8020416004511980127267993725801001008000010080000500567798411600260160045160045149895061499998010020080008200800081600451600451180201100991001008000010000000111511701600159981080000100160046160046160046160046160046
802041600451199001247993725801001008000010080000500567798411600260160045160045149895061500458010020080008200800081600451600451180201100991001008000010000000111511701600159981080000100160046160046160046160046160046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80024160045119800617993725800101080000108014450567798416002616004516004514991031500258001020803582080000160045160045118002110910108000010222002005037211611615997210808000010160046160046160046160046160046
800241600451199006179937258001010800001080000505677984160026160045160045149910315002580010208000020800001600451600451180021109101080000100000000050205169101599729308000010160046160046160046160046160046
800241600451198006179937258001010800001080000505677984160026160045160045149910315002580010208000020800001600451600451180021109101080000100000000050201016581599727708000010160046160046160046160046160046
80024160045119900617993725800101080000108000050567798416002616004516004514991031500258001020800002080000160045160045118002110910108000010000000005020416861599727708000010160046160046160046160046160046
80024160045119900617993725800101080000108000050567798416002616004516010014991031500258001020800002080000160045160045118002110910108000010000000005020816761599729308000010160046160046160046160046160046
800241600451199006179937258001010800001080000505677984160026160045160045149910315002580010208000020800001600451600451180021109101080000100000000050201116641599729308000010160046160046160046160046160046
8002416004511980061799372580010108000010800005056779841600261600451600451499103150025800102080000208000016004516004511800211091010800001000002700050208167151599727708000010160046160046160046160046160046
8002416004511993006179937258001010800001080000505677984160026160045160045149910315002580010208000020800001600451600451180021109101080000100000000050205167101599727708000010160046160046160046160046160046
8002416004511990061799372580010108000010800005056779841600261600451600451499103150025800102080000208000016004516004511800211091010800001000002010005020916681599727708000010160046160046160046160046160046
8002416004511990061799372580010108000010800005056779841600261600451600451499103150025800102080000208000016004516004511800211091010800001000000000502071614111599729308000010160046160046160046160046160046