Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SUB (uxtx, 64-bit)

Test 1: uops

Code:

  sub x0, x0, x1, uxtx
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410357003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000679441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110002079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036
100410358003828622510001000100016916110351035728386810001000200010354111100110000079441449371000100010361036103610361036

Test 2: Latency 1->2

Code:

  sub x0, x0, x1, uxtx
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750619877251010010100101008866434969551003510035858038722101001020020200100354111102011009910010100100000710223722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710223722994110000101001003610036100361003610036
10204100357512619877251010010100101008866434969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036
1020410035760619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100010710023722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710043722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036
10204100357530619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036
1020410035750619877251010010100101008866404969551003510035858038722101001020020200100354111102011009910010100100000710023722994110000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100357506198632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101020664024123994010000100101003610036100361003610036
100241003575017098632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575082986325100101001010010887841496955100351003586023874010010100202002010035411110021109101001010005764024123994010000100101003610036100361003610036
10024100357606198632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101010064024122994010000100101003610036100361003610036
10024100357508498632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575061986325100101001010010887841496955100351003586023874010010100202002010035411110021109101001010007264024123994010000100101003610036100361003610036
10024100357506198632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
10024100357506198632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
10024100357508298634410010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
1002410035751810398632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036

Test 3: Latency 1->3

Code:

  sub x0, x1, x0, uxtx
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575061987725101001010010100886640496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003575961987725101001010010100886641496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003575061987725101001010010100886641496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003576961987725101001010010100886640496955100351003585803872210100102002020010035411110201100991001010010001290071023722994110000101001003610036100361003610036
1020410035750441987725101001010010100886640496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003575061987725101001010010100886641496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003576661987725101001010010100886640496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
10204100357539661987725101001010010100886641496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
10204100357591895987725101001010010100886640496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036
102041003575061987725101001010010100886641496955100351003585803872210100102002020010035411110201100991001010010000071023722994110000101001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575010398632510010100101001088784049695510035100358602387401001010020200201003541111002110910100101010064034123994010000100101003610036100361003610036
100241003575082986325100101001010010887841496955100351003586023874010010100202002010035411110021109101001010003364024123994010000100101003610036100361003610036
10024100357508298632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357506198632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
10024100357508298632510010100101001088784149695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
10024100357606198632510010100101001088784049695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036
10024100357506198632510010100101001088784049695510035100358602387401001010020200201003541111002110910100101000064024123994010000100101003610036100361003610036
10024100357506198632510010100101001088784049695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036
100241003575061986325100101001010010887840496955100351003586023874010010100202002010035411110021109101001010005764024123994010000100101003610036100361003610036
10024100357506198632510010100101001088784049695510035100358602387401001010020200201003541111002110910100101000064024122994010000100101003610036100361003610036

Test 4: throughput

Count: 8

Code:

  sub x0, x8, x9, uxtx
  sub x1, x8, x9, uxtx
  sub x2, x8, x9, uxtx
  sub x3, x8, x9, uxtx
  sub x4, x8, x9, uxtx
  sub x5, x8, x9, uxtx
  sub x6, x8, x9, uxtx
  sub x7, x8, x9, uxtx
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.1673

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204134171010000033035258010080100801004005001049103061338613386332333341801008020016020013386391180201100991008010010000000511100419231338380000801001338713387133871338716279
80204133861000000045035258010080100801004005001049103061338613386332333341801008020016020013386391180201100991008010010000000511100319321338380000801001338713387133871338713387
80204133861000000000227258010080100801004005001049103061338613386332333341801008020016020013386391180201100991008010010000000511000319331338380000801001338713387133871338713387
8020413386100000000035258010080100801004005001049103061338613386332333341801008020016020013386391180201100991008010010000000511100319341359280408801001357813577134541351313578
802041345210200003555176403898050480379805174113131049104971353213576332333341801008020016047413386391180201100991008010010000000511100119331338380000801001338713387133871338713387
8020413453100000012672644411228010080370803874134831049104321358013511332333361803858061016098013579392180201100991008010010022000511100319331338380000801001338713387133871338713387
8020413386101000103720792580100801008010040050010491030613386135113328183401801008046116097813386394180201100991008010010022113700511100419241350080418801001338713577135561338713387
8020413386100000000035258010080100801004005001049104331338613386332311340280386802001602001338639118020110099100801001000024682516900374421343380000801001359313528135761338713387
802041357510011114690882252580100801008010040050010491030613386133863323253421801008033616125813386391180201100991008010010000045505129003146331348480663801001357513759134511376213750
80204135731060000000510258010080100801004005001049103061338613386332333341801008020016020013386391180201100991008010010000000511100219321338380000801001338713387133871338713387

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.1671

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024133761000352580010800108001040005014910291133711337133303334880010800201600201337139118002110910800101000050203190331336880000800101337213372133721337213372
80024133711000352580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101000050203192331336880000800101337213372133721337213372
80024133711000352580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101021050204190331336880000800101337213372133721337213372
80024133711000352580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101000050203190331336880000800101337213372133721337213372
80024133711000352580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101000050203190321336880000800101337213372133721337213372
80024133711000352580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101000050203190331336880000800101337213372133721337213372
800241337110027562580010800108001040005004910291133711337133303334880010800201600201337139118002110910800101000050202190321336880000800101337213372133721337213372
80024133711000772580010800108001040005014910291133711337133303334880010800201600201337139118002110910800101000050203190331336880000800101337213372133721337213372
80024133711000772580010800108001040005004910291133711337133303334880010800201602921337139118002110910800101000050203190231336880000800101337213372133721337213372
800241337110001022580010800108001040005004910291133711337133303334880142800201600201337139118002110910800101000050203190321336880000800101337213372133721337213372