Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

BFI (32-bit)

Test 1: uops

Code:

  bfi w0, w1, #3, #7
  mov x0, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100410368004725100010001000599911036103686438941000100020001036164111001100003731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100060731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100030731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100060731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100040731161110321000100010371037103710371037
100410367004725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
100410368004725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037

Test 2: Latency 1->1

Code:

  bfi w0, w1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204100367511810362510100101001010060499149695610036100368721787401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
1020410036751130352510100101001010060499149695610036100368721787391010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
1020410036751100352510100101001010060499149695610036100368721687391010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
10204100367511120362510100101001010060499149695610036100368721687401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
10204100367511120362510100101001010060499149695610036100368721687391010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
1020410036751100352510100101001010060499149695610036100368721687391010010208202161003616211102011009910010100100111718116111003310000101001003710037100371003710037
1020410036751100362510100101001010060499149695610036100368721787401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
10204100367511150362510100101001010060499149695610036100368721787401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
1020410036751190362510100101001010060499149695610036100368721787401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037
1020410036751190352510100101001010060499149695610036100368721687401010010208202161003616211102011009910010100100111717116111003310000101001003710037100371003710037

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | a9 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002410037753472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036759472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
10024100367501332510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750682510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036750472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
1002410036753472510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037
10024100367501422510010100101001060049496956100361003687363876610010100202002010036164111002110910100101000640216221003210000100101003710037100371003710037

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  bfi w0, w1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204200351500619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919216112000720000301002003620036200362003620036
30204200351500619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
302042003515048619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200351500619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200351500619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200801500619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200351509619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200351500619951252010020100201061288657049169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
3020420035150126619951252010020100201061288657149169550200352003516160716236201062021140222200355711302011009910030100100001111919116112000720000301002003620036200362003620036
30204200351500619951252010020100201061288657149169550200352003516160716236201822021140222200355711302011009910030100100001111919116112000720000301002003620036200362008120036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002420035150000000000619951252001020010200101287866149169552003520035161793162652001020020400202003557113002110910300101000050018900716642000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000010018900316352000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866149169552003520035161793162652001020020400202003557113002110910300101000040018900516542000320000300102003620036200362003620036
30024200351500000000004419951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000018900516532000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000010018900516562000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000020018900516532000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000010018900616752000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000018900516552000320000300102003620036200362003620036
300242003515000000000053699512520010200102001012878660491695520035200351617931626520010200204002020035571130021109103001010000005418900316352000320000300102003620036200362003620036
3002420035150000000000619951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000000018900616352000320000300102003620036200362003620036

Test 4: throughput

Count: 8

Code:

  bfi w0, w8, #3, #7
  bfi w1, w8, #3, #7
  bfi w2, w8, #3, #7
  bfi w3, w8, #3, #7
  bfi w4, w8, #3, #7
  bfi w5, w8, #3, #7
  bfi w6, w8, #3, #7
  bfi w7, w8, #3, #7

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020480082600046258010080100801004006824976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
802048003560063162258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
80204800355990521258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
8020480035600046258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800748003680036
8020480035599046258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
8020480035599046258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
802048003560012462580100801008010040050049769550800358003569964369993801008020016020080035164118020110099100801001000150005110216228003180000801008003680036800368003680036
80204800356000951258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
8020480035600046258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036
8020480035599046258010080100801004005004976955080035800356996436999380100802001602008003516411802011009910080100100000005110216228003180000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480035599000000046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000005020216428003280000800108003680036800368003680128
8002480035600000000088258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000005020416938003280000800108003680036800368003680036
800248003560000000240711258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000005020416428003280000800108003680036800368003680036
80024800356000000000711258001080010800104000500497695580035800356998637014880010800201600208003516411800211091080010100000005020216248003280000800108003680036800368003680036
80024800355990000000462580010800108010040005004976955800358003569986370015800108002016002080035164118002110910800101000052005020450248003280000800108003680036800368003680036
8002480035600000000046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100004005020316448003280000800108003680036802188003680036
8002480035599000000046258001080095800104000500497695580035800356998637001580010800201600208003516411800211091080010100000005020216438003280000800108003680036800368003680036
8002480035600000000046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100000005020216248003280000800108003680036800368003680036
8002480035599000000046258001080010800104000500497695580035800356998637001580010800201600208003516411800211091080010100004405020416428003280000800108003680036800368003680071
80024800355990000000462580010800108001040005004976955800358003569986370015800108002016002080035164118002110910800101000030005033316428003280000800108003680036800368003680036