Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (vector, 8H)

Test 1: uops

Code:

  fsqrt v0.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 0b | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10048037600006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037600006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037600006167202510001000100028126108018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
100480376001126167202510001000100028126108018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
100480376000061672025100010001000281261180188037803774933789510001000100080378037111001100005773116117676100080388038803880388038
10048037600006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037610006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037600006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037611006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038
10048037600006167202510001000100028126118018803780377493378951000100010008037803711100110000073116117676100080388038803880388038

Test 2: Latency 1->2

Code:

  fsqrt v0.8h, v0.8h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102048003760000004086169720251010010010000100100005002864261080018800378003778343378745101002001000020010000800378003711102011009910010010000100000071011611796760100001008003880038800388003880038
10204800376000000396169720251010010010000100100005002864261180018800378003778343378745101002001000020010000800378003711102011009910010010000100200071011611796760100001008003880038800388003880038
10204800376000000516169720251010010010000100100005002864261080018800378003778343378745101002001000020010000800378003711102011009910010010000100000071011611796760100001008003880038800388003880038
10204800375990000216169720251010010010000100100005002864261080018800378003778343378745101002001000020010000800378003711102011009910010010000100000071011611796760100001008003880038800388003880038
102048003759900001810369720251010010010000100100005002864261180018800378003778343378745101002001000020010000800378003711102011009910010010000100000371011611796760100001008003880038800388003880038
102048013160011224956169720251010010010000100100005002864261080018800378003778343378745101002001000020010000800378003711102021009910010010000100001071011611796760100001008003880038800388003880038
1020480037600000036012669720251010010010000100100005162864563080018800378003778343378745102222001000020010000800378003711102011009910010010000100020071011611796760100001008003880038800388003880038
10204800376000000846169720251010010010000100100005002864261080018800378003778343378745101002001000020010000800378003711102011009910010010000100000071011611796760100001008003880038800388003880038
102048003759900001926169720251010010010000100100005002864261080053800378003778343378745101002001000020010000800378003711102011009910010010000100000071011611796760100001008003880038800388003880038
102048003760000004688269720251010011110000100100005002864261080018800378003778343378745101002001000020010000800378003711102011009910010010000100000071001611796760100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100248003759905461697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000010640216227967610000108003880038800388003880038
100248003759905161697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003760003061697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003760003361697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003759903661697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
10024800375990061697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003759903661697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388007180038
100248003760003061697206410025121000010100005028642618001880037800837836537876710010201000020100008003780037111002110910101000010220000640216227967610000108003880038800388003880038
100248003759902761697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003760003961697202510010101000010100005028642618001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038

Test 3: throughput

Count: 8

Code:

  fsqrt v0.8h, v8.8h
  fsqrt v1.8h, v8.8h
  fsqrt v2.8h, v8.8h
  fsqrt v3.8h, v8.8h
  fsqrt v4.8h, v8.8h
  fsqrt v5.8h, v8.8h
  fsqrt v6.8h, v8.8h
  fsqrt v7.8h, v8.8h
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3f | 4e | 50 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80204160043119911061799370258010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151170161115998480000100160044160044160044160044160044
8020416004311991106179937160027258010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160097160044160044160044
802041600431199110726799370258010010080015100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
80204160043119811061799370258010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
802041600431199110617993702580100100800001008000050056779840160024016004316004314989502514999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
80204160043119811061799370258010011980000100800005005677984016002401600431600431498950614999780100200800082008000816004316009611802011009910010080000100000011151171161115998480000100160044160044160044160044160044
80204160043119911061799370258010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
80204160043119911061799370258010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
80204160043119911061799370408010010080000100800005005677984016002401600431600431498950614999780100200800082008000816004316004311802011009910010080000100000011151171161115998480000100160044160044160044160044160044
802041600431199110183799370258010010080000100800005005677984016002401600431600431498953614999780100200800082008000816004316004311802011009910010080000100000011151301161115998480000100160044160044160044160044160044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | cf | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800241600431198000061799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201016011151599698000010160044160044160044160044160044
8002416004312000000726799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201216014161599698000010160044160044160044160044160044
800241600431198000061799372580010108000010800005056779841160077016004316004314991003150023800102080000208006816004316004311800211091010800001000050201816016131599698000010160044160044160044160044160044
8002416004311990000726799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201116010151599698000010160044160044160044160044160044
80025160043119800001029799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201216014151599698000010160044160044160044160044160044
8002416004311990000726799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201416013131599698000010160044160044160044160044160044
800241600431199000094799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201216011131599698000010160044160044160044160044160044
8002416004311990000726799374380010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201416012101599698000010160044160044160044160044160044
800241600431220000061799372580040108004510800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201616015151599698000010160044160044160044160044160044
8002416004311990000631799372580010108000010800005056779841160024016004316004314991003150023800102080000208000016004316004311800211091010800001000050201316013131599698000010160044160044160044160044160044