Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSH (register, 64-bit)

Test 1: uops

Code:

  ldrsh x0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005403311045101379312121625100010001000152740394398217325210001000200039477111001100010000100043103803810396139437331633400131351000395395399395399
100439430004410138331211625100010001000176001398398217325210001000200039477111001100010000100043103903910386139437331633395101041000395399395399395
1004398300045101379212121925100010001000149890394394217325210001000200039477111001100010000100043103903910386138447331633395101441000395395395399399
1004394200045001383212121925100010001000150371394394221325210001000200039477111001100010000100043103903910396139437331633393141041000395395501395399
100439830004500138321121925100010001000149890394398216325210001000200039477111001100010001100043103803910396138437331633391141041000395395395395395
100439830004500137921211625100010001000150370398398221325210001000200039477111001100010000100043103903910396138447331633391101041000399395395395395
10043983000441013792111625100010001000152670394394216325210001000200039877111001100010000100042103813910396138447331633391141041000395399395395395
100439830004500137931181625100010001000152080394394217325210001000200039477111001100010001100039103803810386139437331633391101041000395395395395395
10043943000440013832111925100010001000152670394394221325610001000200039877111001100010001100043103803910386139447331633395141441000399399399395399
1004398200044001379212121925100010001000152670394395221325610001000200039477111001100010000100043103803810386138447331633395141441000399399399399399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700575241100010100070032697355970625401043010310001301001000061600533414704966967700477004764631364950401003020010000602002000070036351140201100991001000030100100001100100032110004001110000110120261017111698163000666610000301007005470057700547005470054
40204700565241010000210070038697845971225401083010610002301001000061603233423504966973700417005664652364944401003020010000602002000070190352140201100991001000030100100000100100022110002000110000110110261017111702253000606610000301007005470042700557005470057
40204700545251001000200070026697845971225401083010610002301001000061607833417694966973700537005364649364956401003020010000602002000070056351140201100991001000030100100000100100024110002001110000111100261017111698103000360910000301007004870048700367003670036
40204700475250000000600070038697845970225401083010610002301001000061603233423504966961700537005364649364944401003039810000602002000070048351140201100991001000030100100001100100000110000000010000101000261017111698223000306010000301007004870048700487004870036
40204700475240000000010170026697025971225401043010610002301001000061607833423504966973700417005364649364944401003020010000602002000070057351140201100991001000030100100000100100021110001001110000111110261017111698163000666010000301007005470042700547005770054
40204700535251011000110070041697845971225401083010610001301001000061603233417694966973700417005364649364956401003020010000602002000070056351140201100991001000030100100000100100021110001001110000010110261017111698193000600610000301007005470042700547005470095
40204700475250000000110070032697645969525401043010310001301001000061601533414704966970700357004764643364938401003020010000602002000070053351140201100991001000030100100000100100013110003001110000111110261017111698043000666610000301007005470042700547004270042
40204700535251100000110170038697025971225401043010310002301001000061603233417694966961700417004164649364956401003020010000602002000070053351140201100991001000030100100000100100022110002010110000101000261017111698103000360610000301007004870048700517003670048
40204700475250000000010070038697845970125401083010610001301001000061603233423504966976700417005364649364959401003020010000602002000070061351140201100991001000030100100001100100021010002001110000101000261017112698163000660610000301007005470054700547005470054
40204700415251111000010070038697845971225401043010610001301001000061607833423504966961700537004164649364956401003020010000602002000070053351140201100991001000030100100000100100021110003010110000111120261017111698163000660610000301007004270042700547005470054

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700535251000110107006369743597062540010300131000130010100006169523342062149669677004770047646653649724001030020100006002020000700473511400211091010000300101000011010000001000003100001002520271011698103000306010000300107005170036700487004870048
40024700475251000001107003569743597062540010300101000130010100006169523342062149669557003570035646653649724001030020100006002020000700503511400211091010000300101000001010000001000000100001102520171012698133000006910000300107003670036700367004870036
40024700355240101001107007369728597062540014300101000130010100006170683342062149669557009170118646553649724001030020100006002020000700353511400211091010000300101000001010000011000010100000122520171012697983000006010000300107005170051700367005170036
40024700475240000111107003569728596952540014300131000030010100006169523342062149669677003570047646653649724001030020100006002020000700353511400211091010000300101000001010000001000000100001102520171011698103000396610000300107004870036700367003670048
40024700355250000111007002069743597062540014300131000130010100006169523342062149669677003570047646653649604001030020100006002020000700473511400211091010000300101000001010000011000000100001102520171012698133000366610000300107009170069700487004870048
40024700475240000006107003269728597062540010300131000130010100006169523341470149669557005870037646653649604001030020100006002020000700503521400211091010000300101000001010000011000000100001102520171021698103000366010000300107004870048700517004870048
40024700475240000101107003569743597092540014300131000030010100006169823342062149669677004770035646653649724001030020100006002020000700473511400211091010000300101000001010000011000000100001102520171021698103000360610000300107008770066700487005170036
40024700475250010007007009169743597272540014300101000130010100006169523345374149669557004770035646723649724001030020100006002020000700473511400211091010000300101000001010000001000000100001102520171021698103000366610000300107005770048700487004870048
40024700355250000001007003569760597062540014300131000030010100006170683341470149669557003570050646653649724001030020100006002020000700353511400211091010000300101000001010000011000010100001102520271011698103000366910000300107005170051700487004870051
40024700475250000001107003569728597062540014300131000030010100006169823342062149669677004770047646533649724001030020100006002020000700473511400211091010000300101000001010000011000000100000002520171011698133000366610000300107004870036700487004870048

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsh x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057004752500100021000070035697355970925401043010310001301001000061601533420620496696770047700506464303649384010030200100006020020000700763511402011009910010000301001000001001001601100000001000010100261017111698133000396910000301007004870051700487005170051
402047005052400000001000070020697645970625401043010310001301001000061601533422060496697070035700506464603649504010030200100006020020000700703511402011009910010000301001000001001000331100011111000111110261017111697983000069910000301007005170036700517003670052
402047005052500001001000070032697815969525401043010310002302541000061600533422061496695570047700356464603649584010030200100006020020000700413511402011009910010000301001000011001000321100020011000011110261017111697983000360610000301007004870048700517003670036
402047003552500000000000070041697845971225401043010610002301001000061603233423500496696170041700566464903649444010030200100006020020000700613511402011009910010000301001000001001000000100010001000000100261017111697983000306910000301007003670051700487005170051
402047005552500000001010070026697845971225401083010610002301001000061603233423500496697670053700566463703649564010030200100006020020000700933511402011009910010000301001000001001000000100000001000010100261017111698133000300010000301007003670048700487003670036
402047005052400000001010070041699695971025401083010610002301001000061603233417690496697670056700566465203649444010030200100006020020000700473511402011009910010000301001000001001000001100000001000010100261017111698133000306910000301007005170048700517003670051
402047005052400000000000070026697875971225401083010610001301001000061607833423500496697370041700536465203649564010030200100006020020000700783511402011009910010000301001000001001000001100000001000000100261017111698193000606910000301007005470054700427005470042
402047005352511010002000070032697355970625401733010010001301001000061601533420620496697070047700506464303649534010030200100006020020000701173511402011009910010000301001000001001000001100000001000000100261017111698043000366610000301007005770042700547004270054
402047004152510000002000070032697645970925401043010310000301001000061600533422060496734070047700506464303649384010030200100006020020000700873511402011009910010000301001000001001000000100140001000010000261017111698103000366910000301007005170051700367003670036
402047003552500000000010170026697845971225401043010610002301001000061603233417690496697670056700416463703649594010030200100006020020000700513511402011009910010000301001000011001000000100000001000000100261017111698193000306610000301007005470140700577005970057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570047525010210070131697285969525400103001010002301631000061697033420620496695507003570050646533649724001030020100006002020000700473511400211091010000300101000011010002201000201110000111100252097186698163000366610000300107004870036700407003670048
4002470035524000110070129697435970925400103001010001300101000061698233422061496695507003570047646683649754001030020100006002020000700353511400211091010000300101000001010000011000111410000111120252077198698193000306910000300107004270057700427004270057
400247005652510010007012069780597172540018300221000130010100006170093342494049669730700567004164671364978400103002010000604082000070056351140021109101000030010100000101000310100031001000010100025206711010698103000306610000300107005170048700517003670051
40024700505240116000701056976059709254001030013100013001010000617068334220604966967070035700476465336496040010300201000060020200007003535114002110910100003001010000010100022110001011100001111002520871710698193000660610000300107005470054700577005770062
40024700565251001100701076976059706254001030013100013001010000616952334147004966973070041700416465936496040010300201000060020200007004735114002110910100003001010000110100000110003101100000010002520127186698133000060610000300107003670036700367004870048
4002470047525100110070045697635971125400143001310001300101000061706833420621496696707005070050646683649604001030020100006002020000700503511400211091010000300101000011010003211000100010000101000252067168697983000306910000300107005170051700517004870036
4002470035524000000170108697605970925400143001310001300101000061698233422061496696707005070035646683649754001030020100006002020000700503511400211091010000300101000001010001111000000010000010000252067168698133000396610000300107005170051700517003670055
4002470050525000000070117697605970925400143001310001300101005861706833422060496696707004770047646533649604001030020100006002020000700503511400211091010000300101000001010000001000010010000100000252077167698133000000910000300107003670036700517004870036
4002470035525000100070100697435969525400103001010001300101000061698233422061496696707005070050646533649604001030020100006002020000700353511400211091010000300101000001010000011000000010000101000252087197698133000396610000300107003670036700367004870051
4002470050525000100070059697435970925400143001310001300101000061695233414701496696707005070050646653649604001030020100006002020000700353511400211091010000300101000001010000011000000010000101000252057168698133000396610000300107003670051700517005170048

Test 4: throughput

Count: 8

Code:

  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  ldrsh x0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526728200000000441126712012120258010010080000100800155001167303149236512672726707166356166798011620080024200160048267467711802011009910080000100800000100801300438003800800386139441115118016122670410104800001002672826728267282673226728
80204267272000000004401266922111925801001008000010080014500116659604923651267072673116635616683801142008002420016004826732771180201100991008000010080000010080000043800390080000000431115118116212672814107800001002673226708267322670826732
802042670720000000044002669221201625801001008000010080015500116719804923651267072670716635616659801152008002420016004826707561180201100991008000010080000010080000043800380388003901394411151182161226728007800001002681226708267282672826728
802042672720000000044102669221121925801001008000010080014500116659604923627267072673116655616701801152008002420016004826730561180201100991008000010080000110080000043800390388000060001115118116212670414104800001002672826708267082673226732
802042673120000000044112671620121625801001008000010080015500116745904923651267312672716659616659801142008002420016004826716771180201100991008000010080000010080000043800000398000061044111511811622267041407800001002673226708267322673226708
80204267312000000004410267160011925801001008000010080014500116659604923651267272672716659616683801142008002420016004826708771180201100991008000010080000010080000044800380080038613843111511811612267280100800001002673226728267282673226732
8020426707200000100010267120001625801001008000010080014500116730304923651267072673116635616659801152008002420016004826740561180201100991008000010080000110080000043800000398003861043111511821621267241007800001002670826708267282672826728
80204267312000000000112671221212162580100100800001008001550011665960492364726731267311663561668480115200800242001600482673777118020110099100800001008000001008000000800380398003901390111511811612267281007800001002672826732267082673226728
802042673120000000045102671220019258010010080000100800155001167303049236272672726731166556166838011520080024200160048267167711802011009910080000100800000100800000438003903980038613801115118216222670410100800001002673226708267082673226728
8020426707200001000451126692010192580100100800001008001550011673030492364726727267271665961667980115200800242001600482673977118020110099100800001008000001008000004380000039800006104311151182162226704004800001002673226734267282673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672820000011000012674221213425800101080000108000050116720114923651267082673116676316688800102080000201600002673156118002110910800001080000110800000008003804180038610430050201116692672501410080000102673226732267092673226732
8002426708200000004410012676921212192580010108000010800005011668961492362826731267311665231670780010208000020160000267087711800211091080000108000001080000043080038038800386100005020616692672411010780000102673226728267322672826732
80024267312000000044000126756011202580010108000010800005011671241492365126731267311667631671180010208000020160000267315611800211091080000108000001080000000800390080038013843005020716992670501414080000102673226709267322673226709
80024267312000001044000126774010162580010108000010800005011672011492365126798267311665231671180010208000020160000267285611800211091080000108000001080000000800000388003801394400502091697267050010780000102673226729267322673226732
8002426731200000000000126716011192580010108000010800005011671241492364726731267311667631671180010208000020160000267085611800211091080000108000011080000000800380080039600430050201016972672801410080000102671926778267322670926732
800242673120100000440001267840011625800101080000108000050116688604923654267312673116676316688800102080000201600002673177118002110910800001080000010800000430800380080000000000502091669267080140780000102673226732267322670926732
800242673120000000010002681100119258001010800001080000501167124049236512673126731166763167118001020800002016000026731771180021109108000010800001108000004308003803980038613843005020716792672801014080000102673226729267092672826728
80024267272000000000001267673101925800101080000108000050116750114923651267312673116676316711800102080000201600002673177118002110910800001080000010800000430800000080039610000502091699267240010480000102673226709267322670926732
8002426731200000014400012677601201925800101080000108000050116712414923651267272672716672316707800102080000201600002673156118002110910800001080000110800000430800380388003861394300502091699267050100780000102670926732267292673226732
800242673120100000440000267700120025800101080000108000050116712414923628267312673116676316711800102080000201600002673177118002110910800001080000010800000430800381388003861044005020716692672801410480000102673726712268072673626732