Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (scalar, S)

Test 1: uops

Code:

  fsqrt s0, s0

Setup code (not part of the measured sequence):

  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 08 | 0b | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
1004100377500006186482510001000100035194110018100371003793953989510001000100010037100371110011000007321611957810001003810038100381003810038
10041003776002706186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500606186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500606186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500606186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500006186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377600006186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
10041003775000053686482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500006186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038
1004100377500006186482510001000100035194110018100371003793953989510001000100010037100371110011000007311611957810001003810038100381003810038

Test 2: Latency 1->2

Code:

  fsqrt s0, s0

Setup code (not part of the measured sequence):

  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1020410003774900006189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000300007101161199578010000100100038100038100038100038100038
102041000377490005286189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000107101161199578010000100100038100038100038100038100038
1020410003774900006189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
1020410003775000006189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
10204100037750000019389648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
10204100037749000044189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
1020410003774900010861896482510100100100001001000050035829411100018100070100037982453987451010020010000200100001000371000371110201100991001001000010000000071025611996862410000100100038100038100038100038100038
10204100037749006006189648251010010010000100100005003582941010001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
10204100037750000072689648251010010010000100100005003582941110001810003710003798245398745101002001000020010000100037100037111020110099100100100001000000007101161199578010000100100038100038100038100038100038
1020410003774900006189648251010010010000100100005003582941110001810003710003798245398745101002001000020010000100037100037111020110099100100100001000200007101161199578010000100100038100038100038100038100038

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0037

retire (01) | cycle (02) | 03 | 04 | 07 | 0b | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002410003775000000061896482510010101000010100005035829411100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
1002410003774900000061896482510010101000010100005035829410100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
100241000717490000003721896482510010101000010100005035829410100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100081100038
1002410003774900000061896482510010101000010100005035829410100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
1002410003774900000061896482510010101000010100395035829411100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
10024100037750000000103896482510010101000010100005035829410100018010003710003798267039876710010201000020100001000701000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
1002410003774900000061896482510010101000010100005035829411100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640116229957801000010100038100038100038100038100038
1002410003774900000061896482510010101000010100005035829410100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
1002410003774900000061896485410010101000010100005035829411100018010003710003798267339876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038
1002410003774900000061896482510010101000010100005035829411100018010003710003798267039876710010201000020100001000371000371110021109101010000100000000000640216229957801000010100038100038100038100038100038

Test 3: throughput

Count: 8

Code:

  fsqrt s0, s8
  fsqrt s1, s8
  fsqrt s2, s8
  fsqrt s3, s8
  fsqrt s4, s8
  fsqrt s5, s8
  fsqrt s6, s8
  fsqrt s7, s8

Setup code (not part of the measured sequence):

  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802041600451199001000903497993725801001008000010080000500567798411600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100000111511721611159985080000100160046160046160046160046160046
802041600451198101000007267993725801001008000010080000500567798401600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100000111511711611159985080000100160046160046160046160046160046
802041600451199101000006179937258010010080000100800005005677984116002601600451600451498956149999801002008000820080008160045160045118020110099100100800001000001115117116111599852480000100160046160046160046160046160046
802041600451198201000009357993725801001008000010080000500567798411600260160045160045149895615015580100200800082008000816004516026611802011009910010080000100000111511711611160157080000100160046160046160046160046160046
80204160045119810100000617993725801001008000010080000500567798401600260160045160045149895614999980100200800082008000816004516004531802011009910010080000100013791111511714911159985080000100160046160101160046160046160046
80204160045119910100024352827993725801001008000010080000500567798411600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100000111511713211159985080000100160046160046160046160046160046
802041600451199101000007267993725801001008000010080000500567798401600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100010111511711611159985080000100160046160046160046160046160046
8020416004512001010000011147993725801001008000010080000500567798401600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100003111511711611159985080000100160046160046160046160046160046
80204160045119910100000617993725801001008000010080000500567798411600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100000111511711611160211080000100160046160046160046160046160046
80204160045119810100000617993725801001008000010080000500567798411600260160045160045149895614999980100200800082008000816004516004511802011009910010080000100000111511721611159985080000100160046160046160046160046160046

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | ac | c5 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002416004511980617993725800101080000108000050567798415160026160045160045149910031500258001020800002080000160045160045118002110910108000010000005020000316331599728000010160046160046160046160046160046
80024160045119807267993725800101080000108000050567798400160026160045160045149910031500258001020800002080000160045160045118002110910108000010010005020500216231599728000010160046160046160046160046160046
800241600451198123961799372580010108000010800005056779841516002616010016004514991003150025800102080000208000016004516004511800211091010800001040895005020000316421603658000010160046160046160434160376160046
80024160045119907687993725800101080000108000050567798415160026160045160045149910031500258001020800002080000160045160045218002110910108000010000005020000316431599728000010160046160046160046160046160046
8002416004511990617993725800101080000108000050567818505160026160045160045149910031500258001020800002080000160045160045118002110910108000010000005020000316231600138000010160046160046160046160046160046
800241600451199987617993725800101080000108000050567798400160026160045160045149910031500258001020800002080000160045160045118002110910108000010000005020002316431599728000010160046160046160046160046160046
80024160045119907267993725800101080000108000050567798410160026160045160045149910031500258001020800002080000160045160045118002110910108000010000005020540316331599728000010160046160046160046160046160046
8002416004511990617993725800101080000108004850567798415160026160045160045149910031500258001020800002080000160045160045118002110910108000010000005020540316331599728000010160046160046160046160046160046
80024160045119907267993725800101080000108000050567798415160081160045160045149910031500258001020800002080000160101160045118002110910108000010000005020540616341599728000010160046160046160046160046160046
800241600451198360617993725800101080000108000050567798415160026160045160045149910031500258001020800002080000160045160045118002110910108000010040005020540316321599728000010160101160046160046160046160046