Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMOV (H to W)

Test 1: uops

Code:

  fmov w0, h0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst integer (97) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10045384004325200010001000100080005195385383703396100010001000538538111001100000732162253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100003732162253510001000539539539539539
10045384006425200010001000100080005195385383703396100010001000538538111001100000732162253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100000732162253510001000539539539539539
100453850012825200010001000100080005195385383703396100010001000538538111001100000731162253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100000732161253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100010732162253510001000539539539539539
1004538401084325200010001000100080005195385383703396100010001000538538111001100000731161253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100000731162253510001000539539539539539
10045384004325200010001000100080005195385383703396100010001000538538111001100000732162253510001000539539539539539

Test 2: Latency 1->2 roundtrip

Code:

  fmov w0, h0
  fmov d0, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
20204100038776000000001000238956125301001010010000100001001000010000500477848155803490100013100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010000100000000131013162299638100001000010100101183101424100549100254100039
20204100038776000000001000238956125301001010010000100001001000010000500478001755808500100013100038100041969003974962010020010000100002001000010000100038100039112020110099100101001000010000100000000131012162299637100001000010100100039100039100039100074100039
2020410003877500000528001000238956125301001010010000100001001000010000500478044955805170100013100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010000100000000131012162299637100001000010100100039100039100039100039100039
202041000387760000012001000238956125301001010010000100001001000010000500478068955803490100013100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010000100001030131012162299637100001000010100100039100040100039100039100039
20204100038776000000001000238956225301001010010000100001001000010000500477992155806840100013100038100039969003974962010020010000100002001000010000100038100038112020110099100101001000010000100000000131012162299637100001000010100100039100039100039100039100039
20204100038776000000001000238956125301001010010000100001001000010000590477977755805200100013100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010000100001035080131012162299637100001000010100100039100039100039100039100039
202041000387770000000010002389561253010010100100001000010010000100005004779777558034901000141000381000389690039749620100200100001000020010000100001000381000381120201100991001010010000100001000014000131012162299637100001000010100100039100039100039100039100039
20204100038775000009001000658956125301481012710006100001191000010050500478410655803490100014100207100041969573976272045220010000100002001000010000100038100038112020110099100101001000010000100000060131012162299637100001000010100100039100039100039100039100039
20204100039776000000001000268956125301001010010000100001001000010000500478040155804060100013100038100038969003974962010020010000100002001000010000100038100038112020110099100101001000010000100001000131012162299637100001000010100100039100042100039100042100039
202041000387750000000010002389562253010010100100001000010010000100005004778769558034901000161001281000389699315974962032920010000100002001000010000100038100213112020110099100101001000010000100000000131012162299637100001000010100100039100039100039100039100039

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0038

retire (01) | cycle (02) | 03 | 08 | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst function returns (8f) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
200241000387750000000100023895612530010100101000810008101000010000504778481557895401000130100039100038969223975182001020100001000020100001000010003810003811200211009101001010000100010008000213432162299691100001000010010100372100317100286100316100265
200241003467770001000100501895612530020100131000010000101000010050614778481557895401000130100038100038969223975192001020100001000020100001000010003810003811200211009101001010000100010000000012701162199637100001000010010100039100039100039100040100039
200241000387760000000100023895612530010100101000010000101000010000504778481557911601000130100038100038969223975182001020100001000020100611000010003810003811200211009101001010000100010000000012702163299637100001000010010100039100129100039100039100131
200241000387751000000100023895612530010100101000010000101000010000504778481557895401000160100038100038969223975182001020100001000020100001000010003810003811200211009101001010000100010000000012701162299637100001000010010100039100039100039100039100039
200241000387750000000100023895612530010100101000010000101000010000504778577557895401000130100038100038969223975182001020100001000020100001000010003810003811200211009101001010000100010000003012701162299637100001000010010100040100039100039100040100042
2002410003877500001200100023895612530010100101000010000101000010000504778481557895401000850100038100038969223975182001020100001000020100001000010003810003811200211009101001010000100010000000012701162199637100001000010010100039100039100039100039100039
2002410003877600001200100023895632530010100101000010000101000010000504778481557895401000140100038100038969253975182001020100001000020100001000010003810003811200211009101001010000100010000000012702161199637100001000010010100039100039100039100039100041
200241000387750000600100023895682530010100101000010000101000010000504778481557895401000130100038100038969223975182001020100001000020100001000010003810003811200211009101001010000100010000000012701161299637100001000010010100039100039100039100039100039
200241000387760000000100023895622530010100101000010000101000010000504778481557895401000130100038100038969223975182001020100001000020100001000010003810003811200211009101001010000100010000003012701161299639100001000010010100039100039100039100039100043
200241000387750000000100023895622530010100101000010000101000010000504778481557895401000130100038100038969223975182001020100001000020100001000010007010003811200211009101001010000100010000820012701161199637100001000010010100039100039100039100039100039

Test 3: throughput

Count: 8

Code:

  fmov w0, h8
  fmov w1, h8
  fmov w2, h8
  fmov w3, h8
  fmov w4, h8
  fmov w5, h8
  fmov w6, h8
  fmov w7, h8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 07 | 0a | 18 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020440054300110032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000061115117116114003580000801004003940039400394003940039
80204400383001100718251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000091115117116114003580000801004003940039400394003940106
8020440038300110032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000001115117116114003580000801004003940039400394003940039
8020440038300110032251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000001115117116114003580000801004003940039400394003940039
80204400383251109932251601008010080000100800045006400240400194003840038299766299918010420080016200800164003840038118020110099100801001000001115117116114003580000801004003940039400394003940039
80204400383001100322516010080100800001008000450064002414001940038400382997662999180104200800162008001640038400381180201100991008010010000961115117116114003580000801004003940039400394003940039
80204400383001100322516010080100800001008000450064002414001940038400382997662999180104200800162008001640038400381180201100991008010010000121115117116114003580000801004003940039400394003940039
802044003830011003225160100801008000010080004500640024040019400384003829976629991801042008001620080016400384003811802011009910080100100001051115117116114003580000801004003940039400394003940039
802044003830011003225160100801008000010080004500640024140019400384003829976629991801042008001620080016400384003811802011009910080100100001471115117116114003580000801004003940039400394003940039
8020440038300110032251601008010080000100800045006400241400194003840038299766299918010420080016200800164003840038118020110099100801001000091115117116114003580000801004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0b | 0e | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 74 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002440047310000000000043251600108001080000108000050640000004001940038400382999203300188001020800002080088400384023831800211091080010100000000050202162334003580000800104003940039400394003940168
80024400383140000000000708621600108001080000108000050640000004001940038400382999203300188001020800002080000400384003811800211091080010104000000050203161334003580000800104010740039400394003940106
800244003831300000000120299251600108001080000108000050640000004001940038400382999203300188001020800002080000400384003811800211091080010100000100050202241224008980000800104003940039400394003940039
8002440038310000000710043251600108001080000108000050640000004001940038401692999203300188001020800002080000400384010411800211091080010100000003050202160224003580000800104003940039400394003940039
80024400383100000100000142225160010800108000010800005064000000400194003840038300230330018800102080000208063040038400382180021109108001010020000108450202241324008880000800104003940039400394010540039
800244003831100000070901092251600108001080000108000050640000004001940038400382999203300188001020800002080000400384003811800211091080010100000000050204162224003580000800104003940039400394003940039
800244003831000000000150951251600108001080000108000050640000004001940038400383002403302408001020806292080000400384003811800211091080010100000003050202162324003580000800104003940039400394003940039
8002440038313000000000043251600108001080000108000050640000004001940038400382999203300648001020800002080000400384003811800211091080010100000000050382161274025580079800104003940039400394003940104
8002440038310000000000043251600108001080000108016950641898004001940038400382999203300188001020800002080000400384003811800211091080010100000020050202162234003580000800104003940039401044003940039
8002440038311000000019043251600108001080000108000050640000004001940038400382999208303418001020800002080000400384003811800211091080010100000101253050373162334009080000800104003940039400394003940039