Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMOV (H to X)

Test 1: uops

Code:

  fmov x0, h0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst integer (97) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10045384085252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384043252000100010001000800015195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
100453851243252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
100453840138252000100010001000800005195385383703396100010001000538538111001100010731161153510001000539539539539539
10045384043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384343252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539
10045384043252000100010001000800005195385383703396100010001000538538111001100000731161153510001000539539539539539

Test 2: Latency 1->2 roundtrip

Code:

  fmov x0, h0
  fmov d0, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | d8 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
202041000387760000090001000258960625301001010010000100001211000010000531477862555803490100013010003810003896907697492201002021000410004200100041000410003810004111202011009910010100100001000010000000001111317021601199646100001000010100100042100039100039100039100039
20204100038776100000011000238956125301001010010000100021001000010000561477848155803490100013010013310003896997697492201002001000410004202100041000410004010003811202011009910010100100001000010000010001111317011601199646100221000010100100039100039100039100039100131
202041000447750000012001001128956225301001010010000100001001000010000500477848155803490100014010003910003896907697493201002001000410004200100041000410003810003811202011009910010100100001000010000000021111317011601199739100001000010100100121100129100039100127100041
202041001337751000012001001898961163301371011410000100041001000010000500477848155803490100013010003810003896907697492201002001000410247200100671000410004010003811202011009910010100100001000210000000001111317011601199646100001000010100100039100039100039100041100039
20204100038776000000001000238959825301001010010000100001001000010000500477848155805200100015010003810003896900397496201002001000010000200100001000010003810004111202011009910010100100001000010000010300001310122502399637100001000010100100039100039100041100121100122
20204100038776000006001002028956425301191012510000100041001023810000500477848155839880100013010005510003896900397496206682001000010000200100001000010004010003811202011009910010100100001000010000020000001328121602299640100001000010100100136100039100040100041100039
202041000417760000024001000238956125301001011610002100001231041110000500477848155803490100013010004010004196900797499201002001000010000200100611000010055110004061202011009910010100100001000010000020000001310121602299637100031000010100100531100125100039100039100039
202041000387761000042001003398956125301001010010000100021001000010000500477848155803490100013010003910043097187397496204532001000010000202100601000010030910003821202011009910010100100001004210002020349500001310121602299640100001000010100100039100039100118100088100039
20204100039775000010001000258962625301001010010000100001001000010000596477910955803490100013010003810003896902397498203222001000010124202100611000010004810004111202011009910010100100001000010005002000001310121602299638100331000010100100040100039100039100040100039
20204100124776000001710010002389561423012710100100001000010010000100005004780977558034901000130100038100039969001097498201002001000010000200100001000010003810013411202011009910010100100001000010000030300001310132502399640100001000010100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 07 | 08 | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2002410003877500002100100023895612530010100101000010002101000010000504778529558299011000130100038100038969223975182001020100001000020100001000010003810003811200211091010010100001001000020000127011163199637100001000010010100039100039100039100039100039
20024100038775110000010002389562253001010010100001000210100001000050477934555790641100013310008310003896922397518200102010000100002010000100001000381000381120021109101001010000100100000030012701161199637100001000010010100039100039100039100042100039
20024100038775000000010002389561253001010010100001000010100001000050478136155797881100013010003810003896922397518200102010000100002010000100001000381000381120021109101001010000100100000000012701161199637100001000010010100040100039100039100039100039
20024100038776000000010002389561253001010010100001000010100001000050478193755800590100013010003810003896922397518200102010000100002010000100001000381000381120021109101001010000100100000000012701162199637100001000010010100040100039100039100039100039
20024100038775000000010002389564253001010010100001000010100001000050478246556119051100162010003910003896922397518200102010000101212010000100001000391000381120021109101001010000100100000000012701162199637100001000010010100039100039100039100039100039
20024100038775000000010002389561253001010010100001000010100001000050478169755816251100013010003810003896922397518200102010066100002010000100001000381000381120021109101001010000100100000000012702162199640100001000010010100039100039100039100039100039
20024100038776000000010002389563253001010010100001000010100001000050478011355790621100013010003810003896922397518200102010000100002010000100001000381000381120021109101001010000100100000000012701161199637100001000010010100039100039100039100039100039
20024100038775000000010002389563253001010010100001000010100001000050477996955790091100013010003810003896922397518200102010000100002010000100001000381000381120021109101001010000100100001000012702162199637100001000010010100039100039100042100039100041
20024100038775000000010002389562253001010010100001000010100001000050478088155789540100013010003810004696922397518200102010000100002010000100001000381000381120021109101001010000100100000000012702161299640100001000010010100039100039100039100039100039
20024100038776000000010002389561253001010010100001000411100001000050478987755789541100013010003810003896922397518200102010000100002010000100001000381000381120021109101001010000100100001000012702161199637100001000010010100039100039100041100039100039

Test 3: throughput

Count: 8

Code:

  fmov x0, h8
  fmov x1, h8
  fmov x2, h8
  fmov x3, h8
  fmov x4, h8
  fmov x5, h8
  fmov x6, h8
  fmov x7, h8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 08 | 0b | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a0 | a1 | a6 | a8 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020440051312014144032251605768010080000102800045006420361401814003840238299762030130804402008039020480293402384003841802011009910080100100220116732111516814014020080000801004003940173402334003940039
802044003831100014417641925160276801008000010080004500642058140019400384003829976629991801042008001620080016402374003811802011009910080100100020102111511701604003580000801004003940232400394023640039
802044023231000240803225160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039
8020440038310000005725160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394023640039
8020440038310000003225160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039
8020440038311000003225160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039
8020440038310000003225160100801008000010080004500640024040019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039
80204400383100000013025160100801008000010080004500640024040019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039
8020440038310000003225160100801008000010080004500640024040019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039403044003940039
8020440038310000003225160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100000000111511701604003580000801004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 04 | 09 | 0a | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002440048310000000518251600108001080000108000050640000140019400384003829992330018800102080000208000040038400381180021109108001010000000270502001316100101640035800000800104003940039400394003940039
80024400383101100009125160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000000000502111616000171440035800000800104003940039400394003940039
80024400383101100009125160010800108000010800005064000014001940038400382999233001880010208000020800004003840038118002110910800101000000000502111616000161240035800000800104003940039400394003940039
80024400383111100009225160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000001030502111716001171740035800000800104003940039400394003940039
80024400383101100009125160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000000000502111816000181740035800000800104003940039400394003940039
80024400383111100009125160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000000000502111516001151540035800000800104003940039400394003940039
80024400383101100009125160010800108000010800005064000014001940038400382999233001880010208000020800004003840038118002110910800101000000000502111716011171440035800000800104003940039400394003940039
80024400383101100009125160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000001000502111716001171740035800000800104003940039400394003940039
800244003831011001209125160010800108000010800005064000004001940038400382999233001880010208000020800004003840038118002110910800101000000000502111716001151840035800000800104003940039400394003940039
8002440038311110000912516001080010800001080000506400000400194003840038299923300188001020800002080000400384003811800211091080010100000100050211141600016840035800000800104003940039400394003940039