Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMOV (D[1] to X)

Test 1: uops

Code:

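  fmov x0, v0.d[1]

Register setup (apparently not part of the measured code — the results below record about one instruction per unroll):

  mov x0, 1
  mov x1, 2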

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 07 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst integer (97) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100453841088138252000100010001081800015195385383703396100010001000538538111001100003732161153510001000539539539539539
1004538400043252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539
1004538400043252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539
1004538400043252000100010001000800015195385383703396100010001000538538111001100000731162253510001000539539539539539
1004538400043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045385057043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384015043252000100010001000800015195385383703396100010001000538538111001100000751161153510001000539539539539539
1004538400043252000100010001000800015195385383703396100010001000538538111001100010731161153510001000539539539539539
1004538400043252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539
1004538500043252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539

Test 2: Latency 1->2 roundtrip

Code:

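  fmov x0, v0.d[1]
  fmov d0, x0

Register setup (apparently not part of the measured code — the instruction counters below total about two instructions per unroll):

  mov x0, 1
  mov x1, 2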

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | c2 | c5 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | e4 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2020410003877500002040010002389561253010010100100001000010010000100005004778481558034901000130100044100379969003974982010020010000100002001000010000100038100038112020110099100101001000010001000000001310121622996370100001000010100100042100039100039100039100039
20204100379775000000010002389564253010010100100001000010010000100005004778481558034901000130100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010001000020001310121622996370100001000010100100039100039100039100039100225
202041000387750000120010002389562253010010100100001000010010056100005004778481558034901000130100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010001000030001310121622996370100001000010100100039100039100039100039100039
20204100038776000000010002389561253010010100100001000010010000100005004778481558034901000130100038100038969043974962010020010000100002001000010000100038100038112020110099100101001000010001000000001310121622996370100001000010100100039100039100039100039100039
202041000387760000000100023895612530100101001000010000100100001000050047784815580349010001301000381000389690039749620100200100001000020010000100001000381000381120201100991001010010000100010000015001310121622996370100001000010100100039100039100039100039100040
20204100038776000000010002389561253010010100100001000010010000100005004778481558034901000130100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010001000013001329121622996370100001000010100100039100041100041100039100131
20204100045776010100010002389561433010010133100001000410510179100505004784587558045901000130100122100041969033974962010020010000100002001000010000100038100038112020110099100101001000010001000000001310121622996370100001000010100100042100045100118100039100039
20204100038776000000010002389561253010010100100001000010010000100005004778481558034901000160100038100038969423974962010020010000100002001000010000100041100038112020110099100101001000010001000003001310121622996370100001000010100100039100039100039100039100039
2020410004177601135670010036989561253010010100100001000010010000100005004778481558040601000130100038100038969003974962010020010000100002001000010000100040100038112020110099100101001000010001000010001310121622996370100001000010100100039100042100039100039100039
20204100038775000000010002389561253010010100100001000010010000100005004778481558034901000130100039100038969003974962010020010000100002001000010059100039100038112020110099100101001000010021000009001311121622996370100001000010100100040100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200241000387750000001000248962125300101001010000100001010000100005047784815578954110001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000603012701161199637100001000010010100039100039100039100071100039
2002410003877500001201000268962825300101001010000100001010000100005047784815578954010001310003810003996922397520200102010000100002010000100001000381000381120021109101001010000100010000003012701161199637100001000010010100039100039100039100098100039
200241000387750000001000248959925300101001010000100001010000100005047784815579008010001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000100012981161199637100001000010010100039100039100039100039100039
200241000387760000001000238964725300101001010000100001010000100005047784815578954110001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000000012701161199637100001000010010100039100042100040100039100039
2002410003877510001801000238959125300101001010000100001010000100005047784815578954010001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000000012701161199637100001000010010100039100039100039100039100039
200241000387750000001000238963725300101001010000100001010000100005047784815578954010001310003810003896923397518200102010000100002010000100001000381000401120021109101001010000100010000000012701161199637100001000010010100039100039100043100039100039
2002410003877600001201000238958425300101001010000100001010234100005047784815578954010001610003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000000012701161199637100001000010010100039100039100039100104100039
2002410003877500001201000238962031300101001010000100001010000100005047784815578954110001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000000012701161199637100001000010010100040100039100039100039100039
200241000387750000001000238964625300101001010000100001010000100005047784815578954010001310003810003996944397518200102010000100002010000100001000381000381120021109101001010000100010000003012701161199637100001000010010100039100039100042100039100041
200241000387760000001000238959925300101001010000100001010000100005047784815578954010001310003810003896922397518200102010000100002010000100001000381000381120021109101001010000100010000003012701161199637100001000010010100039100039100040100093100039

Test 3: throughput

Count: 8

Code:

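  fmov x0, v8.d[1]
  fmov x1, v8.d[1]
  fmov x2, v8.d[1]
  fmov x3, v8.d[1]
  fmov x4, v8.d[1]
  fmov x5, v8.d[1]
  fmov x6, v8.d[1]
  fmov x7, v8.d[1]

Register setup (apparently not part of the measured code — Count is 8, matching the eight fmov instructions above):

  mov x8, 9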

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 09 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a1 | a6 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802044005131100000032251601008010080000100802665006407440401804003840303299766299918010420080016200800164003840038118020110099100801001000000301115167016004003580000801004003940039400394003940039
802044003831000000032251601008010080000100800045006400241400194003840038299766299918010420080016200800164003840038118020110099100801001000020001115117016004003580000801004003940039400394003940169
8020440038310000000322516010080100800001008000450064002414001940038400382997662999180104200800162008001640038400381180201100991008010010000320001115117016004003580000801004003940039400394003940039
802044003831010000032251601008010080000100800045006400241400194003840038299766299918010420080016200800164003840038118020110099100801001000000001115117016004003580000801004003940039400394003940039
802044003831000000032251601008010080000100800045006400241400194003840038300076299918010420080016200800164003840038118020110099100801001000000001115117016014003580000801004003940039400394003940039
802044003831000000032251601008010080000100800045006400241400194003840038299766299918010420080016200800164003840038118020110099100801001000010001115117016004003581741801004003940039400394003940039
802044003831000000032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000020001115117232004019980000801004010540039403074003940303
802044023331000333992644108016010080330803171008025458064205014023640497404323009725301718045020080294200806524023240038518020110099100801001004239175321115117040504003580000801004003940233400394003940303
80204400383130001144032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000000001115117016004003580000801004003940039400394003940039
802044003831100000032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840297118020110099100801001000000001115117016004003580000801004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 09 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a8 | ac | cf | d0 | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800244004831100432516001080010800001080000506400000040019040038400382999233001880010208000020800004003840038118002110910800101000502006160434003580000800104003940039400394003940039
800244003831000432516001080010800001080000506400000040019040038400382999233001880010208000020800004003840038118002110910800101010502003160344003580000800104003940039400394003940039
8002440038310012432516001080010800001080000506400001140019040038400382999233001880010208000020800004003840038118002110910800101003502003160434003580000800104003940039400394003940039
800244003831000432516001080010800001080000506400000040019040038400382999233001880010208000020800004003840038118002110910800101010502003160444003580000800104003940039400394003940039
8002440038310001272516001080010800001080000506400000040019040038400382999233001880010208000020800004003840038118002110910800101003502003160344003580000800104003940039400394003940039
800244003831000432516001080010800001080000506400000040019040038400382999233001880010208000020800004003840038118002110910800101000502004160434003580000800104003940039400394003940039
800244003831100432516001080010800001080000506400000140019040038400382999233001880010208000020800004003840038118002110910800101000502004160444003580000800104003940039400394003940039
800244003831100852516001080010800001080000506400000140019040038400382999233001880010208000020800004003840038118002110910800101000502003160444003580000800104003940039400394003940039
800244003831010432516001080010800001080000506400000140019040038400382999233001880010208000020800004003840038118002110910800101000502003160444003580000800104003940039400394003940039
800244003831000432516001080010800001080167506400000040019040038400382999233001880010208053820800004003840038118002110910800101000502004160444003580000800104003940039400394003940039