Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ORN (register, lsr, 64-bit)

Test 1: uops

Code:

  orn x0, x0, x1, lsr #17
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 2.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10042035150000611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
100420351600120611000173525200020001000325702035203515753184210001000200020354211100110000733672317812000100020362036203620362036
10042035150000611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
10042035160030611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
10042035160000611000173525200020001000325702035203515753184610001000200020354211100110000732673317812000100020362036203620362036
10042035160000611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
10042035160000611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
100420351500001391009173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
10042035160000611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036
10042035150030611000173525200020001000325702035203515753184210001000200020354211100110000733673317812000100020362036203620362036

Test 2: Latency 1->2

Code:

  orn x0, x0, x1, lsr #17
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200351550000611000019803252010020100101001853421491695520035200351842931870010100102002020020035421110201100991001010010000710259111979120000101002003620036200362003620036
10204200351550100611000019803252010020100101001853420491695520035200351842931870010100102002020020035421110201100991001010010010710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853420491695520035200351842931870010100102002020020035421110201100991001010010000710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853421491695520035200351842931870010100102002020020035421110201100991001010010000710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853420491695520035200351842931870010100102002020020035421110201100991001010010000710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853420491695520035200351842931870010100102002020020035421110201100991001010010000710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853420491695520035200351842931870010100102002020020035421110201100991001010010020710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853421491695520035200351842931870010100102002020020035421110201100991001010010010710159111979120000101002003620036200362003620036
10204200351550000611000019803252010020100101001853421491695520035200351842931870010100102002020020035421110201100991001010010000710159111979120000101002003620036200362003620036
102042003515500006110000198032520100201001010018534204916955200352003518429318700101001020020200200354211102011009910010100100027710159111979120000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002420035150003651000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
1002420035150001491000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
1002420035150001701000019743252001020010100101853100491695520035200351845131871810010100202002020035421110021109101001010301640263221979220000100102003620036200362003620036
1002420035150001491000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
100242003514900611000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
100242003515000611000019743252001020010100101853100491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
1002420035150001241000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
1002420035150001911000019743252001020010100101853100491695520035200351845131871810010100202002020035421110021109101001010100640263221979220000100102003620036200362003620036
1002420035150001241000019743252001020010100101853101491695520035200351849631871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036
100242003515000611000019743252001020010100101853101491695520035200351845131871810010100202002020035421110021109101001010000640263221979220000100102003620036200362003620036

Test 3: Latency 1->3

Code:

  orn x0, x1, x0, lsr #17
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | 09 | 1e | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204200351552010310000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035155006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100030710159111979120000101002003620036200362003620036
1020420035155006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035156006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035155006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035156006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100100710159111979120000101002003620036200362003620036
1020420035155006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
10204200351550126110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035155006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036
1020420035156006110000198032520100201001010018534214916955200352003518429318700101001020020200200354211102011009910010100100000710159111979120000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024200351550000600628100001974325200102001010010185310049169552003520035184513187181001010020200202003544111002110910100101000020000642763431979220000100102003620036200362017120173
10024200351570000001252100001974325200102001010012187039149169552003520035184513187181001010020203602003542111002110910100101000001000640363331979220000100102003620036200362003620036
1002420035156003090061100001974325200102001210012185310149169552003520035184533187181001210020200202003542111002110910100101000000000640363331979220068100102003620036200362003620036
1002420171156000300061100001974325200102001010010185310049169552003520035184533187181001010020200202003542111002110910100101000000000640363331979220000100102003620036200362003620036
10024200351560030120061100091974325200122001210010185310049169552003520035184513187181001010020200202006842111002110910100101000000000704363331979220000100102003620036200362003620036
1002420035156000000061100001974325200102001010010185310049169552003520035184533187221001210020200202003542111002110910100101000000000640363331979220000100102003620036200362003620036
100242003515600002700061100001974325200102001010010185310049169552003520035184513187181001210020200202003542111002110910100101020000000640359331979220000100102003620036201722003620036
1002420035155000015006110000197432520010200101001018531004916955200352003518451318718100101002020020200354211100211091010010100000001650640363331979220000100102008120036200362003620036
1002420035155000000061100361974325200102001010010185310049169552003520035184513187181001010020200202017142111002110910100101000000000640363631979220000100102003620036200362003620036
100242003515600000002330100001974125200102001010010185310049169552003520035184513187221001010020200202003542111002110910100101000000000642763331979220000100102003620036200812003620036

Test 4: throughput

Count: 8

Code:

  orn x0, x8, x9, lsr #17
  orn x1, x8, x9, lsr #17
  orn x2, x8, x9, lsr #17
  orn x3, x8, x9, lsr #17
  orn x4, x8, x9, lsr #17
  orn x5, x8, x9, lsr #17
  orn x6, x8, x9, lsr #17
  orn x7, x8, x9, lsr #17
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 1f | 3a | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802042676920811120369800002609425160100160100801001643181492364526725267251661531667780100802001602002672539118020110099100801001000005116112299267171600000801002672626726267262672626726
8020426725207110036980000260942516010016010080100164318149236452672526725166153166778010080200160200267253911802011009910080100100000511642299267171600000801002672626726267262672626726
8020426725207110036980000260942516010016010080100164318149236452672526725166153166778010080200160200267253911802011009910080100100100511692299267171600000801002672626726267262672626726
80204267252071102643698000026094251601001601008010016431814923645267252672516615316677801008020016020026725391180201100991008010010000951161022510267171600000801002672626726267262672626726
802042672520811003111800002609425160100160100801001643181492364526725267251661531667780100802001602002672539118020110099100801001000110511692249267171600000801002672626726267262672626726
80204267252071100311180000260942516010016010080100164318149236452672526725166153166778010080200160200267253911802011009910080100100000511692277267171600000801002672626726267262672626726
80204267252071112036980000260942516010016010080100164318049236452672526725166173166778010080200160200267253911802011009910080100100100511692294267171600000801002672626726267262672626726
8020426725207110036980000260942516010016010080100164318049236452672526725166153166778010080200160200267253911802011009910080100100000511692299267171600000801002672626726267262672626726
8020426725207110036980000260942516010016010080100164318149236452672526725166153166778010080200160200267253911802011009910080100100000511692299267171600000801002672626726267262672626726
802042672520711003698000026094251601001601008010016431804923645267252672516615316677801008020016020026725391180201100991008010010010051169221010267171600000801002672626726267262672626726

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3339

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4c | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 67 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | a9 | ac | branch cond mispred nonspec (c5) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800242671721500000005828000021280251600101600108001016314213492363126711267111662306166858001080020160020267113911800211091080010100000050200922617267041600000800102671226712267122671226712
80024267112000000024610816180000212802516001016001080010163142104923631267112671116623031668580010800201600202671139118002110910800101000000502001622918267041600000800102716327577273452752027513
80024273482192001511198061603224811591906319416257616202182108212818104924323275212752416641011517000831668196816651027569391218002110910800101000064800597631881208272141618150800102740527171274642729427291
800242735021344016131452968032028111517624315162406162774823242127891049242672763327520166420931702480010800201600202671139118002110910800101042289000519001622617267041623850800102671226884269432717527516
8002426998209021711924968022980240219302516001016240280642163142104923631267112671116627031668580010800201600202671139118002110910800101000000502001722178267041600000800102671226712267122671226764
8002426711200000000001918000021280251600101600108001016314210492363126711267111662303166858001080020160020267113911800211091080010100000050200622817267041600000800102671226712267122671226712
800242671120000000000618000021280251600101600108001016314210492363126711267111662303166858001080020160020267113911800211091080010100000050200172217172670416000017800102671226712267122671226712
800242671120000000000618000021280251600101600108001016314210492363126711267111662303166858001080020160020267113911800211091080010100000050200622617267041600000800102671226712267122671226712
800242671120000000000618000021280251600101600108001016314210492363126711267111662303166858001080020160020267113911800211091080010100000050200622817267041600000800102671226712267122671226712
8002426711200000000006180000212802516001016001080010163142104923631267112671116623031668580010800201600202671139118002110910800101000000502006221717267041600000800102671226712267122671226712