Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

BFXIL (64-bit)

Test 1: uops

Code:

  bfxil x0, x1, #3, #7
  mov x0, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10041036747251000100010005999010361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999010361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999010361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036747251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999010361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037
10041036847251000100010005999110361036864389410001000200010361641110011000731161110321000100010371037103710371037

Test 2: Latency 1->1

Code:

  bfxil x0, x1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003675036251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675036251010010100101006049914969561003610036872178740101001020820356100361621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675036251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675035251010010100101006049914969561003610036872168740101001020820216100681621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675036251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111717161003310000101001003710037100691003710037
102041003675958251010010100101006049914969561003610036872168739101001020820216100361621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675936251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111718161003310000101001003710037100371003710037
10204100367501970251010010100101006049914969561003610036872168739101001020820216100361621110201100991001010010000111717161003310000101001003710037100371003710037
102041003675036251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111718161003310000101001003710037100371003710037
102041003675035251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010000111718161003310000101001003710037100371003710037

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002410037750025725100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036760115425100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
100241003675006825100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216231003210000100101003710037100371003710037
100241003675004725100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750047251001010010100106004914969561003610036873638766100101002020020100361641110021109101001010018640216321003210000100101003710037100371003710037
1002410036750036025100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216321003210000100101003710037100371003710037
1002410036750015825100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216321003210000100101003710037100371003710037
1002410036750013325100101001010010600491496956100361003687363876610010100202002010036164111002110910100101000640216321003210000100101003710037100371003710037
1002410036750020225100101001010010600491496956100361003687363876610010100202002010036164111002110910100101003640216321003210000100101003710037100371003710037
1002410036750042025100101001010010600491496956100361003687363876610010100202002010036164111002110910100101003640216221003210000100101003710037100371003710037

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  bfxil x0, x1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302042003515000000005369951252010020100201061288657491695520035200351616071623620106202114022220035571130201100991003010010000000001111919116212000720000301002003620036200362003620036
30204200351500000000619951252010020100201061288657491695520035200351616071623620106202114022220035571130201100991003010010000000001111919116112000720000301002003620036200362003620036
30204200351490000000619951252010020100201061288657491695520035200351616071623620106204974022220035571130201100991003010010000000001111919116112000720000301002003620036200362003620036
30204200351500000000619951252010020100201061288657491695520035200351616071623720106202114022220035571130201100991003010010000000001111919116112000720000301002003620036200362003620036
30204200351500000000619951252010020100201061288657491695520035200351616071623620106202114022220035571130201100991003010010000000001111919116112000720112301002021620263202632026120262
3020420258152005566088112109914130202142023420519128995949170452026020262161612616328204212067941158202635761302011009910030100100022103858011120492101222006820111301002026320261202652026120264
3020420262152016267244001295987410820146202122050712898034917227203532012416177301631720106202114022220081571130201100991003010010000000001111919116112000720000301002008120036202722003620080
3020420125151005467244001322988513020212202102052312893744917181202632026216179231631820590206874116820396575130201100991003010010000212404821112019156112021120069301002026320127202182030720265
3020420260152005566044001344989819320307202832049712902364916955200352003516160716236201062021140222200355711302011009910030100100000250648501111978164112021320133301002030820081203522030820311
30204201711521066801176120299951252010020122201001288565491704620126200351615361631020511202004020020035571130201100991003010010002012320621112058324232020320000301002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | c2 | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024200351500000000001039951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018905160552000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018907160662000320000300102003620036200362003620036
3002420035149000000000849951252001020010200101287866049169552003520035161793162862008820020400202003557113002110910300101000000000018907161672000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018907160652000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018907160772000320000300102003620036200362003620036
30024200351500000000001919951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018907160582000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018908160552000320000300102003620036200362003620036
30024200351500000000001709951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018909160672000320000300102003620036200362003620036
30024200351500000000003469951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018906160552000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000000018907160562000320000300102003620036200362003620036

Test 4: throughput

Count: 8

Code:

  bfxil x0, x8, #3, #7
  bfxil x1, x8, #3, #7
  bfxil x2, x8, #3, #7
  bfxil x3, x8, #3, #7
  bfxil x4, x8, #3, #7
  bfxil x5, x8, #3, #7
  bfxil x6, x8, #3, #7
  bfxil x7, x8, #3, #7

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800355990015604625801008010080100400500149769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035599003004625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
802048003560000304625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035599003904625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010020305110216228003180000801008003680036800368003680036
8020480035599002404625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180021801008003680036800368003680036
8020480035599001204625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035599003604625801008010080100400500049769558007480035699643699938013180200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035600003304625801008010080100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035599006046258010080100801004005000497695580035800356996421699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036
8020480035599003604625801008013480100400500049769558003580035699643699938010080200160200800351641180201100991008010010000005110216228003180000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024800355990021046258001080010800104000501497695580035800356998637001580010800201600208003516411800211091080010100000502001416498003280000800108003680036800368003680036
80024800356000004625800108001080010400050049769558003580035699863700158001080020160020800351641180021109108001010000050200516558003280000800108003680036800368003680036
800248003559900046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000502004166108003280000800108003680036800368003680036
80024800356000004625800108001080010400050049769558003580035699863700158001080020160020800351641180021109108001010000050200716648003280000800108003680036800368003680036
80024800355990004625800108001080010400050049769558003580035699863700158001080020160020800351641180021109108001010000050200616678003280000800108003680036800368003680081
8002480035599011534625800108001080010400050049769558003580035699863700158001080020160020800351641180021109108001010000050200216368003280000800108003680036800368003680036
800248003559900046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000502001116558003280000800108003680036800368003680036
800248003559900183197258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000502004165118003280000800108003680036800368003680036
8002480035600000217258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000502005161098003280000800108003680036800368003680036
80024800356000004625800108001080010400050049769558003580035699863700158001080020160020800351641180021109108001010000150200616568003280000800108003680036800368003680036