Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, uxtw, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01)cycle (02)030e0f1e22243a3f4346494f51schedule uop (52)schedule ldst uop (55)dispatch ldst uop (58)simd uops in schedulers (5a)606d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map ldst uop (7d)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst int load (95)inst ldst (9b)9dl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5b6bbl1d cache miss ld nonspec (bf)cfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)f5f6f7f8fd
100540320045100379212121625100010001000149891394394217325210001000200039477111001100010000100043103903910396139437311632391101041000395395395395395
100439420045001379212121625100010001000150371398398221325610001000200039481111001100010000100043103803810396139447311622395141471000399395399399399
1004398200441013833111925100010001000152671398398221325610001000200039877111001100010000100044103803910396138447311622396141441000399399401395399
1004398300440013832111925100010001000152741398398221325610001000200039877111001100010000100043103803810396139447311621397141471000400399399399399
10043983004400137921211925100010001000149891398394221325210001000200039877111001100010000100043103803810386139447311621398141471000402399399399399
1004398200440013832111925100010001000150371394394217325210001000200039877111001100010000100043103803810386139437311622395101071000401399399399399
1004398200440013832111925100010001000152081398398221325610001000200039877111001100010000100043103803810386139437311622395141041000395399399399399
1004394300440013832111925100010001000152671398398221325610001000200039877111001100010000100043103803810386138447311622391141071000395399399399395
1004398300440013832111925100010001000152081398398221325610001000200039877111001100010000100043103803810396139447311622391141471000417399399399399
1004398300440013902111925100010001000152081398398221325610001000200039477111001100010000100043103803910396140447311611391141471000399399399399395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0048

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)0e0f18191e1f223a3f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5l1d cache miss ld nonspec (bf)c2branch cond mispred nonspec (c5)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700475250000001010700206976459709254010430103100003010010000616005334220649669707005070050646313649504010030200100006020020000700473511402011009910010000301001000011001000001100001001000011000261017111697983000396010000301007003670051700367004870051
40204700475240000001010700356976459695254010030103100013010010000616005334220649669707005070035646433649534010030200100006020020000700503511402011009910010000301001000001001000301100000001000011000261017111698133000099910000301007005170051700487005170051
40204700505240000001010700356978159706254010430103100013010010000616175334220649669707005070050646463649534010030200100006020020000700503511402011009910010000301001000001001000001100002001000011000261017111698133000399910000301007005170051700367005170049
40204700355250000000010700356973559720254010430100100013010010000616175334220649669707005070047646463649384010030200100006020020000700473511402011009910010000301001000001001000000100001031000011000261017111698103000099010000301007005170048700517003670053
40204700885250000001000700206978159709254010430103100003010010000616005334147049669707005070050646463649534010030200100006020020000700353511402011009910010000301001000001001000000100001031000010000261017111697983000399010000301007003670036700517005170417
402047005052400000010007003569764597092540104301031000130100100006161753342206496696770050700356464636495040100302001000060200200007005035114020110099100100003010010000110010000001000020181000011000261017111698133000366010000301007004870051700517004870054
40204700355240000001000700356978159709254010030103100013010010000616005334220649669557005070050646463649384010030200100006020020000700503511402011009910010000301001000001001000001100001001000011000261017111698133000099910000301007005170036700487005170060
40204700505250000001000700356973559706254010430103100013010010000616005334206249669557005070047646433649534010030200100006020020000700473511402011009910010000301001000001001000001100001001000011000261017111697983000090910000301007005170051700367004870039
40204700355250000000010700356978159709254010430103100013010010000616005334206249669707005070050646463649384010030200100006020020000700353511402011009910010000301001000001001000000100003001000000000261017111698133000390010000301007005170051700517003670052
40204700475250000000010700326978159709254010030103100003010010000616175334220649669707005070050646433649384010030200100006020020000700353521402011009910010000301001000001001000001100001001000001000261017111698133000069010000301007005170036700517005170072

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)0309l2 tlb miss data (0b)0e0f1e1f22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)6067696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)acafb5l1d cache miss ld nonspec (bf)cdcfd5map dispatch bubble (d6)daddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570047525101100107003269728597092540014300101000130010100006169523342062004966955700357004764665364972400103002010000600202000070047351140021109101000030010100001101000011000000100001112520171011697983000366610000300107004870048700487005170051
4002470047524101110107003269743597062540014300131000130010100006169523342062104966970700477004764665364975400103002010000600202000070050351140021109101000030010100000101000011000000100001102520171011698103000396610000300107005170036700487003670048
4002470035525000110107003269760597092540014300131000130010100006169823342206004962799700357004764665364972400103002010000600202000070050351140021109101000030010100000101000011000030100001102520171011698103000366610000300107004870048700487004870049
4002470047524001110007003269760597062540014300131000130010100006169523342206104966970700477004764653364972400103002010000600202000070047351140021109101000030010100000101000011000000100001102520171011698103000366910000300107005170051700487005170051
4002470050525100010107003569728596952540014300131000130010100006169523342062004966955700477004764653364972400103002010000600202000070047351140021109101000030010100000101000011000000100001102520171011698103000066010000300107004870048700487004870036
4002470047525100010107003569728597062540014300101000030010100006170683341470104966967700477003564665364972400103002010000600202000070047351140021109101000030010100000101000011000000100001002520171011698133000396610000300107004870048700487004870048
4002470035525000010007003569760597092540014300131000130010100006169823342062004966970700357003564665364972400103002010000600202000070047821140021109101000030010100000101000011000000100001002520171011698103000066610000300107004870048700517004870048
4002470047525000010107002069728597062540014300131000030010100006181703342062004966967700357005064668364972400103002010000600202000070050351140021109101000030010100000101000011000000100001102520171011698103000390010000300107005570051700487004870051
4002470047524000000107002069734597073640014300131000130010100006170003342206004966955700477005064668364960400103002010000600202000070047351140021109101000030010100000101000011000000100001002520171022698103000366610000300107005170051700487005170051
4002470047525000010107003269760597172540014300101000130010100006169823342206004966967700507004764665364972400103002010000600202000070047351140021109101000030010100001101000011000000100001102520171011697983000366610000300107003670048700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, w7, uxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01)cycle (02)03090e0f18191e1f22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5l1d cache miss ld nonspec (bf)branch mispred nonspec (cb)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
40205700475251000010107003569772597182540104301031000130100100006160053342206149669700700497005064646364950401003020010000602002000070047351140201100991001000030100100000100100000110000000010000100261017111698133000396910000301007005170048701157006470049
40204700475240000010007002069741597092540104301031000130100100006160053342206049669550700507004764646364953401003020010000602002000070050351140201100991001000030100100000100100000110000000010000110261017111698103000399910000301007005170048701427011570036
40204700505250000010107010169867597122540104301031000130100100006160053342206149669550700507005064643364950401003020010000602002000070050351140201100991001000030100100000100100000110000000010000110261017111698133000396910000301007004870048701317005170051
40204700505250000010107003569769597092540108301031000130100100006160053342206149669700700507005064643364953401003020010000602002000070050351140201100991001000030100100000100100000110000000110000010261017111698133000399010000301007005170048701147007570048
40204700475250000000017003569865596962540104301031000130100100006160153342939149669700700507005064646364953401003020010000602002000070050351140201100991001000030100100000100100000110000000010000110261017111698133000366910000301007005170048701407009270051
40204700505240000000007003569802597102540104301031000130100100006160053342206149669700700507004764646364938401003020010000602002000070050351140201100991001000030100100000100100000110000000010000110261017111698133000360910000301007005170051701567006270056
40204700475250100010007003569794597072540104301031000130100100006160153342206149669670700507005064646364938401003020010000602002000070050351140201100991001000030100100000100100000110000000010000110261017111698103000396610000301007005170048701497008470051
40204700505250000010017003569887597112540104301031000030100100006160053342206149669700700507005064646364950401003020010000602002000070050351140201100991001000030100100000100100000110001000010000110261017111698133000090910000301007005170051700517010570051
4020470047525000005140017005069775597112540104301031000130100100006160053342206149669700700507004764646364953401003020010000602002000070050351140201100991001000030100100000100100000110000000010000010261017111698133000390910000301007005170048701277010870051
40204700505250000000007003569838597102540100301031000130100100006160153342206149669700700507003564646364953401003020010000602002000070047351140201100991001000030100100000100100000110000000010000110261017111697983000360910000301007005170048701287010670051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01)cycle (02)03mmu table walk instruction (07)mmu table walk data (08)09l2 tlb miss instruction (0a)l2 tlb miss data (0b)0e0f18191e1f22233f4d4f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)l1d cache writeback (a8)a9acafb5l1d cache miss ld nonspec (bf)c2cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
4002570047525001000000100070032697285970625400103001310002300101000061706833420624966967700477003564653364972400103002010000600202000070035351140021109101000030010100000101000001100006038656100481102920152111011709933016860610000300107150271590716037154471580
40024715365376110000181723111496107003269728597062540014300101000130010100006170683344722496696770328704996508813265759431293256310824652522098871494351814002110910100003001010000010100570110035103850310054112306771901117713883021700610000300107169871966718727186870036
40024700355240000000021198123210700206973059706254020030165100473187310663650087337562449669677003770047646651416586042739300201000060020204387003835114002110910100003001010000010100000110000580010000000252057157698983000360610000300107004870036700487004870052
40024700515250000001000000700326974359728254001030010100013001010000616952334206249669597003570035646533649604001030020100006002020000701403511400211091010000300101000011010000001000000010000110252077157699233000066010000300107004870048700367004870112
40024700475250000000001010700206972859706254001430013100013001010000616952334206249669707005070035646653649724001030020100006002020000700473511400211091010000300101000001010000011000020610000110252057157698013000006610000300107004870048700487003670079
400247004752400000000010107003269728597972540014300101000130010100006170683342062496696770047700476466536497240010300201000060020200007003535114002110910100003001010000010100000010000550010000110252057175698103000066610000300107013970048700517004870084
400247005152500000100010107003269728597062540014300101000130010100006169523341854496696770047700476466536496040206300201000060020200007004735114002110910100003001010000010100000110000350010000100252077157697983000360010000300107004870048700487004970105
400247003552500000000000007003269728597122540010300131000030010100006169523342062496696770035700556465336496040010300201000060020200007004735114002110910100003001010000010100000010006400010000110252077175697983000060610000300107003670048700367004870089
4002470047524000000000270007002069743597122540014300101000130010100006170683342062496695770053700476466636496040010300201000060020200007004735114002110910100003001010000010100000010000570010000010252057157697983000360610000300107004870036700367003670093
4002470035524000000100101070032697435971225400143001010001300101000061706833420624966955700477003564665364960400103002010000600202000070035351140021109101000030010100001101000001100003211110000100252077157697983000366610000300107004870048700367004870107

Test 4: throughput

Count: 8

Code:

  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  ldrsh x0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01)cycle (02)03l1d tlb fill (05)mmu table walk data (08)l2 tlb miss data (0b)0e0f1e223a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80205267272000000145132672127727258010010080000100800165001166596049236472672726727166556166938011520080024200160048267366411802011009910080000100800000100800191943800191926080040615943191115118016002673413135800001002673726737267372673726786
8020426739200100007013267212791925801001008000010080013500116822404923634267142673616664616688801162008002420016004826736861180201100991008000010080000010080000043800390180398003961404301115118016002672410104800001002672826728267932684326728
802042672720000000451026712212121625801001008000010080015500116754204923627267362672716660616659801152008002420016004826727137118020110099100800001008000001008000004380039010398003901394301115118016002672410104800001002672826728268292683926729
80204267072000000045112671221212182580100100800001008001550011771161492364726727267271663561668280114200800242001600482672777118020110099100800001008000001008000004380039000398003961394301115118016002670410104800001002670826728267962682226731
80204267072000000045112671221212125801001008000010080015500116585614923647267272670716655616679801152008002420016004826727771180201100991008000010080000010080000043800390000800396139430111511801600267241004800001002672826810267282674226784
80204267372000000045112671201212182580100100800001008001550011771161492364726727267271665561667980115200800242001600482672777118020110099100800001008000001008000004380039000380000613943011151180160026724000800001002672826728268062683926733
80204267272000000045012669220121825801001008000010080014500116769214923627267072670716635616682801142008002420016004826727771180201100991008000010080000010080000043800390003980039010430111511801600267240104800001002672826728267832684526733
8020426736200000004511267122121216258010010080000100800155001166596149236272670726727166556166798011420080024200160048267277711802011009910080000100800000100800000080039000398003961394301115118016002672410104800001002672826728268152682926708
8020426727207000004511267122121218258010010080000100800155001177116049236472672726727166556166798011420080024200160048267277711802011009910080000100800000100800000438003900008003961394301115118016002670410104800001002672826728267632681226731
8020426727200000004500266920120182580100100800001008001550011665961492362726727267271665561665980115200800242001600482672777118020110099100800001008000001008000004380039000398000061394301115118016002670410104800001002672826728268462681326708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3343

retire uop (01)cycle (02)03l1i tlb fill (04)l1d tlb fill (05)mmu table walk data (08)09l2 tlb miss data (0b)0e181e22233a3f4346494f51schedule uop (52)schedule int uop (53)schedule ldst uop (55)dispatch int uop (56)dispatch ldst uop (58)int uops in schedulers (59)simd uops in schedulers (5a)60696a6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map ldst uop (7d)map int uop inputs (7f)map ldst uop inputs (80)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst branch cond (94)inst int load (95)inst int alu (97)inst ldst (9b)9d9fl1d tlb access (a0)l1d tlb miss (a1)l1d cache miss ld (a3)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acafb5b6bbl1d cache miss ld nonspec (bf)l1d tlb miss nonspec (c1)c2cdcfl1i tlb miss demand (d4)d5map dispatch bubble (d6)ddfetch restart (de)e0eaebec? ldst retires (ed)? int retires (ef)f5f6f7f8fd
80025267272000000000451012671220016258001010800001080000501169222149236282672726708166723167088001020800002016000026727771180021109010800001080000010800000080039000244880039613943000502001716917267241010480000102672826728267102672826744
8002426743200100100045101267122121216258001010800001080000501166896149236482672726728166723167078001020800002016000026727771180021109010800001080000010800000080039010428003961390000502111317161426724010480000102672926729267292672926731
80024267462001001000450012671221212025800101080000108000050116729814923656267362673616660316695800102080000201600002673685118002110901080000108000001080020204380058021728004061194319005021117161718267331313580000102673726737267372673726745
800242674420011011006610326721377192580010108000010800005011677911492365626715267141668131671680010208000020160000267378611800211090108000010800000108002021438006001127800396158019205021116161716267341313080000102673726737267412673726737
8002426745200111110067103266992001925800101080000108000050116945114923635267372673616681316716800102080000201600002671564118002110901080000108000001080021214380019212248004061594319205020017161717267331313580000102673726715267372673826744
8002426741200111100067103267222771925800101080000108000050117397514923656267362673616681316717800102080000201600002673764118002110901080000108000001080019194380058110648004061594319005021117161417267331313080000102673726716267372673726737
8002426742201110110066103267210771925800101080000108000050116779114923656267362673716681316716800102080000201600002673686118002110901080000108000001080019194380061131668004061584319105021117161717267341313580000102671526737267372673726819
8002426749200111110066003267210072025800101080000108000050117440814923657267362673616660316695800102080000201600002671485118002110901080000108000001080020214380059100308004061584319105021117161017267341313580000102671526715267372673726738
80024267492011111100241022672137719258001010800001080000501167119149236572673626715166603167168001020800002016000026717851180021109010800001080000010800211943800581011248004061574319105021117161317267171313580000102673726737267372673726737
8002426744201111110067102267213092025800101080000108000050116721914923657267362673616682316695800102080000201600002673685118002110901080000108000011080021214380060002678004061594319105021117161417267331313080000102673726737267372673726737