Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

NEGS (register, lsl, 64-bit)

Test 1: uops

Code:

  negs x0, x0, lsl #17
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 2.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 1f | 3a | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
100420351500021081000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
100420351500026610001862252000200010001262351203520351729318661000100010002035411110011000012774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036
10042035150002661000186225200020001000126235120352035172931866100010001000203541111001100000774434419202000100020362036203620362036

Test 2: Latency 1->2

Code:

  negs x0, x0, lsl #17
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121049169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121049169552003520035185813187201010010200102002003541111020110099100101001008200710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
102042003515004361000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036
10204200351500611000019862252010020100101001305121149169552003520035185813187201010010200102002003541111020110099100101001000000710139111992220000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200351500000003341000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000941000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035149000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
10024200351500000001051000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
100242003515000000028741000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036
1002420035150000000611000019862252001020010100101305229491695520035200351860331874010010100201002020035411110021109101001010000000000640341331993020000100102003620036200362003620036

Test 3: Latency 3->2

Chain cycles: 1

Code:

  negs x0, x1, lsl #17
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 2.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202043003522500611000029899253010030100201071956240149269553003530035273918274862010720224202243003585112020110099100201001010000001111320162998230000201003003630036300363003630036
202043003522500611000029899253010030100201071956240149269553003530035273918274862010720224202243003585112020110099100201001010000001111320162998230000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273917274862010720224202243003585112020110099100201001010000001111319162998330000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273917274862010720224202243003585112020110099100201001010000001111319162998330000201003003630036300363003630036
202043003522500611000029899253010030100201071956240149269553003530035273917274852010720224202243003585112020110099100201001010001201111319162998230000201003003630036300363003630036
202043003522500611000029899253010030100201071956240149269553003530035273918274862010720224202243003585112020110099100201001010000001111320162998230000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273918274862010720224202243003585112020110099100201001010000001111320162998230000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273917274862010720224202243003585112020110099100201001010000001111319162998330000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273917274852010720224202243003585112020110099100201001010000001111320162998330000201003003630036300363003630036
202043003522500611000029899253010030100201071956240049269553003530035273918274862010720224202243003585112020110099100201001010000001111319162998330000201003003630036300363003630036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 2.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200243003522500000061100002989125300103001020010195628914926955300353003527391327498200102002020020300358511200211091020010100100000001270233112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628914926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270233112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200952002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522400000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270233112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628914926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036
200243003522500000061100002989125300103001020010195628904926955300353003527391327498200102002020020300358511200211091020010100100000001270133112995930000200103003630036300363003630036

Test 4: throughput

Count: 8

Code:

  negs x0, x8, lsl #17
  negs x1, x8, lsl #17
  negs x2, x8, lsl #17
  negs x3, x8, lsl #17
  negs x4, x8, lsl #17
  negs x5, x8, lsl #17
  negs x6, x8, lsl #17
  negs x7, x8, lsl #17
  mov x8, 9
  mov x9, 10
  mov x10, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6676

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204534504010000061801614874125160100160100801003440005149503305341053410432982909343360801008020080200534103911802011009910080100100000000051101241153390160000801005341153411534115341153411
80204536394000000061800004874125160100160100801003440005149503305341053410432983024343360801008020080200536393911802011009910080100100000102971051101241153390160000801005341153411534115341153411
80204534104000002101898000048741251601001601008010034400051495033053410534104329829093433608010080200802005341039118020110099100801001000000012051101241153390160000801005341153411534115341153411
80204534104000000061800004874125160100160100805053440005149503305341053410432982909343360801008020080200534103911802011009910080100100003000051101241153390160000801005341153411534115341153411
802045341040000000103800004874125160100160100802023440005149503305341053410432983024343360801008020080200534103911802011009910080100100000102956051101241153390160000801005341153411534115341153411
80204534104000000061800004874125160100160100801003440005149503305341053410433602909343360801008020080200534103911802011009910080100100000500051101241153390160000801005341153411534115341153411
8020453410400000270618000048741251601001601008010034400051495033053410534104329829093843360801008020080200534103911802011009910080100100000003051101245153390160000801005341153411534115341153411
8020453410400000007268000048741251601001601008010034400051495033053410534104329830243843360801008020080200534103911802011009910080100100000300051101241153390160358801005341153411534115341153411
80204534104001000061800004874125160100160100801003440005149503305341053410432982909343360801008020080200534103911802011009910080100100000120051101241153390160000801005341153411534115341153411
802045341040000000103800004874125160100160100801003440005149503305341053410432982909343360801008020080200534103911802011009910080100100000600051101551153390160000801005341153411534115341153411

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6673

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002453401400008376180000479462516001016001080010343813004950300533805338043290325134335280010800208002053380391180021109108001010000050204244653360160000800105338153381533815338153381
8002453380400005526180000479462516001016001080010343813004950300533805338043290325134335280010800208002053380391180021109108001010000050207246753360160000800105338153381533815338153381
800245338040000072680000479462516001016001080010343813004950300533805338043290325134335280010800208002053380391180021109108001010040050203244753360160000800105338153381533815338153381
80024533804000006180000479462516001016001080010343813004950300533805338043290274934335280010800208002053380391180021109108001010000050204244653360160000800105338153381533815338153381
8002453380400018586180159479462516001016001080010343813004950300534355349343290325134335280010800208002053380391180021109108001010000050204244353360160000800105338153381533815338153381
80024533804000006180000479462516001016001080010343813009850300533805338043290325134335280010800208002053380391180021109108001010003050204244653360160000800105338153381533815338153381
80024533804000106180000479462516001016001080010343813004950300533805338043290274934335280010800208002053380393180021109108001010002918050207247653360160000800105338153381533815338153381
80024533804000006180000479462516001016001080010343813004950300533805338043290293634335280010800208002053380391180021109108001010000050204244353360160000800105338153381533815338153381
80024533803990006180000479462516001016001080010343813004950300533805338043290325134335280010800208002053380391180021109108001010000050203244453360160000800105338153381533815338153381
80024533803990006180000479462516001016001080010343813004950300533805338043290325134335280010800208002053380391180021109108001010000050206244753360160000800105338153381533815338153381