Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

MOVK (64-bit)

Test 1: uops

Code:

  movk x0, #0x1234, lsl 16
  mov x0, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004103570618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103580618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103580618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103570618622510001000100016916010351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103580618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103580618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103570618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103583618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103570618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036
1004103580618622510001000100016916110351035728386810001000100010354111100110000073141119371000100010361036103610361036

Test 2: Latency 1->1

Code:

  movk x0, #0x1234, lsl 16
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750002917609819893461017610152101009422749695510035100358545687261041110200102001012841111020110099100101001004122285011172001600996510023101001003610036100361003610036
102041003575000000110819877491017310179102579165249695510035100358607787341011710240102401003541211020110099100101001000003530011172001600996510000101001003610036100361003610036
10204100357500000016198866710150101511011787686496955100351003586077873410117102401024010035411110201100991001010010000000111720016011003710000101001003610036100361003610036
10204100357500001200619877461012510151101179163449700210082101288606787651011710240102401003541111020110099100101001000000011172001600996510000101001003610036100361003610036
10204100357500000006198772510100101001011787686497049100811003586041787341011710407102401003541111020110099100101001000000011172001600996510000101001003610036100361003610036
10204100827602021817607439877251010010100101178768649695510035100358607787341011710413102401003541111020110099100101001000000011172001600996510000101001003610036100361003610036
1020410035750000000619875461014910125101009395449695510035100358545686871010010200102001003541111020110099100101001000000211172226822992410000101001003610036100361003610036
10204100357600000881979870681010010173101009030049695510035100358545686871010010200102001003541111020110099100101001000000011176601600996510000101001003610036100361003610036
1020410035760000333006198791071010010100101178768649695510035100358607787341011710240102401003541111020110099100101001000000011172001600996510000101001003610082100361008210036
10204100817600001500619877251012410100101178768649695510035100358607787341011710408107661012741411020110099100101001000000011172001600996510000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575084061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064034122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010090064024122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
100241003575048061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357600061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100361003610036
10024100357500061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010030064024122994010000100101003610036100361003610036
1002410035750141061986325100101001010010887840496955100351003586023874010010100201002010035411110021109101001010000064024122994010000100101003610036100821003610036

Test 3: throughput

Count: 8

Code:

  movk x0, #0x1234, lsl 16
  movk x1, #0x1234, lsl 16
  movk x2, #0x1234, lsl 16
  movk x3, #0x1234, lsl 16
  movk x4, #0x1234, lsl 16
  movk x5, #0x1234, lsl 16
  movk x6, #0x1234, lsl 16
  movk x7, #0x1234, lsl 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.1674

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 18 | 1e | 3f | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020413396100000361337625801008010080100543180154910312133921339233283334780100802008020013392391180201100991008010010000051125319331338980000801001339313393133931339313393
8020413392100000361337625801008010080100543180104910312133921339233283334780100802008020013392391180201100991008010010000051125319331338980000801001339313393133931339313393
80204133921000003613376258010080100801005431801549103121339213392332833347801008020080200133923911802011009910080100100610651125319331338980000801001339313393133931339313393
80204133921000012360258010080100801005431801049103121339213392332833347801008020080200133923911802011009910080100100160051120319331338980000801001348313393134671339313393
802041339210000036025801008010080100543180054910312133921339233283334780100802008020013392391180201100991008010010000051125319331338980000801001339313393133931339313393
802041339210000036025801008010080100543180054910312133921339233283334780100802008020013392391180201100991008010010000051125319331338980000801001339313393133931339313393
802041339210000036025801008010080100543180154910312133921339233283334780100802008020013392391180201100991008010010030051120319331338980000801001339313393133931339313393
802041339210100036025801008010080100543180154910312133921339233283334780100802008020013392391180201100991008010010000051120319331338980000801001339313393133931339313458
8020413392101000360258010080100801005431801549103121339213392332833347801008020080200133923911802011009910080100100130351125319331338980000801001339313393133931339313393
802041339210000036025801008010080100543180154910312133921339233283334780100802008020013392391180201100991008010010000051120319331338980000801001339313393133931339313393

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.1671

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024133901000063625800108001080010492052049102921337213372333033349800108002080020133723911800211091080010100050251619781336980000800101337313373133731337313373
80024133721000015362580010800108001049205204910292133721337233303334980010800208002013372391180021109108001010005022719831336980000800101337313373133731337313373
8002413372100000362580010800108001049205204910292133721337233303334980010800208002013372391180021109108001010005022519641336980000800101337313373133731337313373
8002413372100000362580010800108001049205204910292133721337233303334980010800208002013372391180021109108001010005025519991336980000800101337313373133731337313373
800241337210000243625800108001080010492052049102921337213372333033349800108002080020133723911800211091080010100050221119551336980000800101337313373133731337313373
80024133721000003625800108001080010492052049102921337213372333033349800108002080020133723911800211091080010100050216195111336980000800101337313373133731337313373
80024133721000021362580010800108001049205204910292133721337233303334980010800208002013372391180021109108001010005043519331336980000800101337313373133731337313373
8002413372100000362580010800108001049205214910292133721337233303334980010800208002013372391180021109108001010005022619551336980000800101337313373133731337313373
80024133721000003625800108001080010492052049102921337213372333033349800108002080020133723911800211091080010100050211119641336980000800101337313373133731337313373
8002413372100000362580010800108001049205204910292133721337233303334980010800208002013372391180021109108001010005022619541336980000800101337313373133731337313373