Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

BFI (64-bit)

Test 1: uops

Code:

  bfi x0, x1, #3, #7
  mov x0, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036704725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036804725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037
10041036704725100010001000599911036103686438941000100020001036164111001100000731161110321000100010371037103710371037

Test 2: Latency 1->1

Code:

  bfi x0, x1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003775000059251010010100101006049914969561003610036872168739101001020820216100371621110201100991001010010000111717116111003310000101001003710037100371003710037
1020410036750000267251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010000111718116111003310000101001003710037100371003710037
1020410036750100362510100101001010060499149695610036100368721787401010010208202161003616211102011009910010100100140111718116111003310000101001003710037100371003710037
102041003675000036251010010100101006049914969561003610036872178740101001020820216100361621110201100991001010010000111717116111003310000101001003710037100371003710037
1020410036750000100251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111718116111003310000101001003710037100371003710037
102041003675000099251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010000111717116111003310000101001003710037100371003710037
1020410036750000203251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010000111718116111003310000101001003710037100371003710037
102041003675000035251010010100101006049914969561003610036872178739101001020820216100361621110201100991001010010010111718116111003310000101001003710037100371003710037
102041003675000036251010010100101006049914969561003610036872168739101001020820216100361621110201100991001010010000111718116111003310000101001003710037100371003710037
1020410036750000400251010010100101006049914969561003610036872168740101001020820216100361621110201100991001010010000111718116111003310000101001003710037100371003710037

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0036

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003775047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100002640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100260640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100130640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100100640216221003210000100101003710037100371003710037
100241003675047251001010010100106004949695610036100368736387661001010020200201003616411100211091010010100000640216221003210000100101003710037100371003710037

Test 3: Latency 1->2

Chain cycles: 1

Code:

  add x1, x0, x0
  mov x0, 0
  bfi x0, x1, #3, #7
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302042003515000276199512520100201002010612886570491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500096199512520100201002010612886571491695520035200351616071623720106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500006199512520100201002010612886571491695520080200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500006199512520100201002010612886571491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362006720036
30204200351500006199512520100201002010612886570491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500006199512520100201002010612886571491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500006199512520100201002010612886570491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036
30204200351500006199512520100201002010612888410491695520035200351616071623720106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362006720036
30204200351500006199512520100201002010612886570491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112001020000301002003620036200362003620036
30204200351500006199512520100201002010612886570491695520035200351616071623620106202114022220035571130201100991003010010000011119190116112000720000301002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024200351500000000061995125200102001020010128786604916955200352003516179316265200102002040020200355711300211091030010100000003018902160222000320000300102003620036200362003620036
300242003515000000000998995125200102001020010128786614916955200352003516179316265200102002040020200355711300211091030010100200000018902160222000320000300102003620036200362003620036
30024200351500000000061995125200102001020010128786604916955200352003516179316265200102002040020200355711300211091030010100000000018902160222000320000300102003620036200362003620036
3002420035150000000001126995125200102001020010128786604916955200352003516179316265200102002040020200355711300211091030010100000000018902160222000320000300102003620036200362003620036
30024200351500000000061995125200102001020010128786604916955200352003516179316265200102002040020200355711300211091030010100000000018902160222000320000300102003620036200362003620036
30024200351500000000061995125200102001020010128786614916955200352003516179316265200102002040020200355711300211091030010100000000018902160222000320000300102003620036200362003620036
3002420035150000000001123995125200102001020010128786604916955200352003516179316265200102002040020200355711300211091030010100000000018902160222000320000300102003620036200362003620036
300242003515000000000879951252001020010200101287866049169552003520035161793162652001020020400202003557113002110910300101000002000018902160222000320000300102003620036200362003620036
300242003515000000000979995125200102001020010128786614916955200352003516179316265200102002040020200355711300211091030010100000000018902160322000320000300102003620036200362003620036
30024200351500000000061995125200102001020010128786614916955200352003516179316265200102002040020200355711300211091030010100000000018902160232000320000300102003620036200362003620036

Test 4: throughput

Count: 8

Code:

  bfi x0, x8, #3, #7
  bfi x1, x8, #3, #7
  bfi x2, x8, #3, #7
  bfi x3, x8, #3, #7
  bfi x4, x8, #3, #7
  bfi x5, x8, #3, #7
  bfi x6, x8, #3, #7
  bfi x7, x8, #3, #7

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 1e | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048003559900004625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110316228003180000801008003680036800368003680036
8020480035599000092025801008010080100400500149769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
802048003559900004625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
802048003560000004625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
8020480035599000071125801008013480100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
80204800355990000228625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
802048003559900004625801008010080100400500149769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
802048003559900004625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000015110217228003180000801008003680036800368003680036
802048003559900004625801008010080100400500049769550800358003569964369993801008020016020080035164118020110099100801001000000005110216228003180000801008003680036800368003680036
80204800355990012046468010080100801004005001497695508003580079699641169993801008023216020080035164118020110099100801001000103005110216228003180000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002480035600000483258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020316248003280000800108003680036800368003680036
8002480035599000513258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020416428007180000800108003680036800368003680036
80024800355990001877258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020216448003280000800108003680036800368003680036
8002480035599000674258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020216428003280000800108003680036800368003680036
8002480035599000650258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020416428003280000800108003680036800368003680036
8002480035600000544258001080010800104000500497695538003580035699863700158001080064160020800351641180021109108001010000005020416428003280000800108003680036800368003680036
8002480035599000614258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010000005020216448003280000800108003680036800368003680036
800248003559900040525800108001080010400050049770000800358012470040770049800108005316017280035164118002110910800101002578005020416248003280000800108003680036800368003680036
800248003559901202073258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010100005047516458003280000800108008180218800368008180171
8002480035599000595258001080010800104000500497695508003580035699863700158001080020160020800351641180021109108001010100005020416448003280000800108003680036800368003680036