Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ADD (immediate, 64-bit)

Test 1: uops

Code:

  add x0, x0, #3

Initialization (register setup, not counted in the measurements below):

  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10041035800618622510001000100016916110351035729386810001000100010354111100110000073141119371000100010361036103610361036
100410358112618622510001000100016916010351035728386810001000100010354111100110000073141119371000100010361036103610361036
10041035809618622510001000100016916010351035728386810001000100010354111100110000073241229371000100010361036103610361036
10041035800828622510001000100016916010351035728386810001000100010354311100110000075239119371000100010361036103610361036
10041035800828622510001000100016916110351081728386810001000100010354111100110000075141129361000100010361036103610361036
10041035810618622510001000100016916110351035728386810001000100010354311100110000075141229361000100010361036103610361036
100410357015848622510001000100016916010351035728386810001000100010354111100110000073141119371000100010361036103610361036
10041035800618622510001000100016916010351035728386810001000100010354111100110000073141229361000100010361036103610361036
10041035800618622510001000100016916010351035728386810001000100010354111100110000075141119361000100010361036103610361036
10041035810618622510001000100016916010351035728386810001000100010354111100110000073239229371000100010361036103610361036

Test 2: Latency 1->2

Code:

  add x0, x0, #3

Initialization (register setup, not counted in the measurements below):

  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575000004499877251010010100101008866404969551003510035858038722101001020010200100354111102011009910010100100000006071013711994110000101001003610036100361003610036
102041003575000001479877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000100071013711994110000101001003610036100361003610036
10204100357500000619877251010010100101008866404969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
10204100357500000829877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
102041003575000162642439877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
102041003575000003329877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
10204100357500000849877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
102041003576000004069877251010010100101008866404969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
10204100357500000619877251010010100101008866414969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036
10204100357500000619877251010010100101008866404969551003510035858038722101001020010200100354111102011009910010100100000000071013711994110000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357500000008098632510010100101001088784049695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357500000006198632510010100101001088784149695510035100358602387401001010020100201003541111002110910100101001064024122994010000100101003610036100361003610036
10024100357500000006198632510010100101001088784149695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575000000017098632510010100101001088784049695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357500000006198632510010100101001088784149695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
1002410035750000101087998632510010100101001088784049695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575000000014998632510010100101001088784049695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357500000006198632510010100101001088784049695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357500000006198632510010100101001088784149695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575000000053698632510010100101001088784149695510035100358602387401001010020100201003541111002110910100101000064024122994010000100101003610036100361003610036

Test 3: throughput

Count: 8

Code:

  add x0, x8, #3
  add x1, x8, #3
  add x2, x8, #3
  add x3, x8, #3
  add x4, x8, #3
  add x5, x8, #3
  add x6, x8, #3
  add x7, x8, #3

Initialization (register setup, not included in the count of 8 measured instructions):

  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.1674

retire uop (01) | cycle (02) | 03 | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020413425100282780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120216221338780036801001346013391133911339113391
8020413390100942780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120216331338780036801001339113391133911339113391
80204133901001392780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120316331338780036801001339113391133911339113391
8020413390101282780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120316321338780036801001339113391133911339113391
8020413390100282780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115121416331338780036801001339113391133911339113391
8020413390100702780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120416441338780036801001339113391133911339113391
8020413390100912780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120316421338780036801001339113391133911339113391
8020413390100282780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115121316431338780036801001339113391133911339113391
8020513390100282780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000001115120316221338780036801001339113391133911339113391
80204133901001142780136801368014840071049103101339013390332663336801488026480264133903911802011009910080100100000101115120416221338780036801001339113391133911339113391

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.1671

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800241337610000000007725800108001080010400050014910291133711337133303334880010800208002013371391180021109108001010000000000502151906613368800000800101337213372133721337213372
800241337110000000003525800108001080010400050014910291133711337133303334880010800208002013371391180021109108001010000000000502261906513368800000800101337213372133721337213372
8002413371100000000035258001080010800104000500049102911337113371333033348800108002080020133713911800211091080010100000000005021419065133688000021800101337213372133721337213372
800241337110000000003525800108001080010400050014910291133711337133283334880010800208002013371391180021109108001010000000000502161905513368800000800101337213372133721337213372
800241337110000000003525800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502251916513368800000800101337213372133721337213372
8002413371100000000022725800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502151906513368800000800101337213372133721337213372
8002413371100000000035155800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502251906513368800000800101337213372133721337213372
8002413371100000000031325800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502251906713368800000800101337213372133721337213372
800241337110000000009825800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502151905513368800000800101337213372133721337213372
800241337110000000003525800108001080010400050004910291133711337133303334880010800208002013371391180021109108001010000000000502141905513368800000800101337213372133721337213372