Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (register, 64-bit)

Test 1: uops

Code:

  ldrsb x0, [x6, x7]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100540831010081102388167720251000100010001548003864032113266100010002000408851110014100010000102019581073101761015615943192731161140510001000387409404387404
10044083100008100338723074251000100010001552613893882253266100010002000386691110014100010000101919581034000601039617345191731161138313051000409410409404409
10043863100008200238830204251000100010001579904103862093261100010002000403691110014100010000102020431060001751023013443191731161140501351000387387403403409
10044033101003610239320204251000100010001484303864082093261100010002000408851110010100010000101919581034000601023615943190731161140501301000387387403410404
100438631010036101393157025251000100010001574104043872113266100010002000408851110010100010000102020581042100391054615943192731161138301351000409404404409403
100440331000066102388167025251000100010001579904033862313247100010002000408851110014100010000101920010421004410546034451907311611386101051000409409387409390
1004386311100821023883720725100010001000157880408403225324410001000200038668111001010001000010191945105800076105500594319173116114050001000409403404404387
10044033101006710237115720202510001000100015480038640822532611000100020004028511100141000100001019194310740006110546174019073116114050001000409387409404387
10044032100008100239337725251000100010001476903863892253266100010002000408911110010100010000101920431059102441040615801907311611386101051000387404403409404
100440831110082101389167025251000100010001552604084082253247100010002000403851110010100010000102121010731006010556073431907311611400131051000409409404387409

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 0e | 0f | 19 | 1e | 1f | 22 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005752511010170036697825971025401043010310001301001000061696733424460496697170051700516464736495440100302001005560200200007005135114020110099100100003010010000010010000110000001000011261137133698143000310101010000301007005270052700527005270052
40204700355251001017003669782597102540104301031000130100100006168783344414049669737005170051646313649544010030200100006020020000700513511402011009910010000301001000001001000011000000100001126113713369798300031010010000301007005270052700527005270052
402047005152500010070093697825971025401043010310001301001000061679733440300496697170051700516463136493840100302001000060200200007005135114020110099100100003010010000010010000110000001000011261127123698143000310101010000301007005270052700527005270052
402047005152500010070036697825971025401043010010001301001000061603233449420496731370051700516463136495440100302001000060200200007005135114020110099100100003010010000010010000110000001000001261137133698143000010101010000301007005270052700367005270052
402047005152400010170036697825971025401003010310001301001000061705733452780496697170051700516463136495440100302001000060200200007003535114020110099100100003010010000010010000110000001000011261137133698143000310101010000301007005270052700527005270052
40204700515240001007003669782597102540104301031000130100100006161753346366049669717005170051646473649544010030200100006020020000700513511402011009910010000301001000001001000011000000100001126113713269814300031010010000301007005270036700527005270036
40204700515250001017003769782597102540104301031000130100100006167903342926049669717005170035646473649544010030200100006020020000700513511402011009910010000301001000001001000001000000100001126113713369814300001010010000301007003670052700527005270052
402047003552400000170036697825971025401003010310001301001000061684233451340496695570051700516469336496140100302001000060200200007005135114020110099100100003010010000010010000110000531000000261137123697983000310101010000301007005270052700527005270036
40204700515250001017003669782596952540104301031000030100100006168603344558049669557005170051646473649544010030200100006020020000700353511402011009910010000301001000001001000011000000100001126113713269798300031001010000301007005270052700527005270036
402047005152500010170036697825971025401043010010001301001000061684233446060986697170035700516463136495440100302001000060200200007003535114020110099100100003010010000010010000110000001000011261127133698143000310101010000301007005270052700527005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cd | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570060525000100007003969780597132540014300131000130010100006170183342398049669747003570054646720364976400103002010000600202000070051351140021109101000030010100000101000000100000001000010000252000171011697983000313101310000300107003670036700557005570056
40024700355240001011070020697435971325400143001310004300101000061699133415661496697470054700356467203649794001030020100006002020000700543511400211091010000300101000001010000001000000010000100002520001710116981730000013010000300107005570036700557005570055
400247005452500000110703136975660098254001430013100013001010000617018334239814966974700357005564669036496040010300201000060354200007005435214002110910100003001010000110100000110000000100000000025200017101169817300031313010000300107005570052700367005570052
4002470035525000001007003969743596952540010300131000130010100006170183342398049669747003570035646727364979400103002010000600202000070054352140021109101000030010100000101000000100000001000011000252000171011698433000013101310000300107003670060700527005570055
40024700545250000010070039697785971325400103001310000300101000061706833414700496697470054700356467203649794001030020100006002020000700353511400211091010000300101000001010000001000000010000110002520001710116981930003100010000300107005570098701637005570055
400247005152500000100700206977559713254001430013100013001010000617018334239804966974700547005164653036496040010300201000060020200007003535114002110910100003001010000010100000110000000100001100025200017101269830300001001010000300107005570036700367005570036
40024700545250000000070039697785971325400143001010000300101000061701833423981496697470054700546467203649764001030020100006002020000700353511400211091010000300101000001010000011000000010000010002520001710116988930003001310000300107005570036700557005570052
4002470051525000001007003669743597132540010300131000130010100006170183342398049669747005470051646690364979400103002010000600202000070035351140021109101000030010100000101000001100000001000011000252000171021698273000310101310000300107005570055700557003670055
4002470054524000000007003969778597132540010300131000130010100006170183342398049669557005170054646530364979400103002010000600202000070035351140021109101000030010100000101000001100000001000011000252000171011698363000310131310000300107003670052700557003670052
400247005152400000110700206977859710254001430013100003001010000617018334239804966974700547005464672036497940010300201000060020200007005435114002110910100003001010000010100000110000000100001100025200017101169877300031001310000300107003670055700557003670055

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb x0, [x6, x7]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0056

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700535251001101401017005069784597902540104301031000230100100006161753342206149669550700357005064631364950401003020010000602002000070035351140201100991001000030100100000100100022110001111100001101000261017111698193000390010000301007005770054700577005970057
402047005652510100321081007004369878597172540148301651000730264101106160593342494049669760700417005664652364944401003020010000602002000070056351140201100991001000030100100000100100011010001001100000101100261017111698043000366910000301007005770042700547004470054
4020470056525110000101007004169787597742540108301061000230100100006160593342542049669760700417004164652364944401003020010000602002000070421355140201100991001000030100100000100100021010000200100001010000261017110698103000309910000301007005170036700547011670042
4020470041525100000100007004169787598712540141301061000230100100006160593342494049669760702887015664637364959401003020010000602002000070056351140201100991001000030100100000100100011110002101100001111000261017111698193000696010000301007005770057700547009970057
4020470056525111000200007002669847597672540108301061000230100100006160593341769049669760700567005664637364959401003039710000602002000070041351140201100991001000030100100000100100021010002121100001111100264517111698223000396010000301007005770042700427005770063
4020470041525101000100007002669877597172540108301061000130100100006160593341769049669760700417005664652364959401003020010000602002000070056351140201100991001000030100100000100100031010002001100001111200261017111698193000699910000301007005770042700547014570057
4020470056524111000700007002669784597482540108301061000130100100006160783342494049669760700567005664652364959401003020010000602002000070041351140201100991001000030100100000100100011110001011100001101000261017111698193000699910000301007005770054700577008570057
4020470061525101000200017004169787597152540108301061000230100100006160593341769049669760700567005664652364959401003020010000602002000070041351140201100991001000030100100000100100012110002001100001111000261017111698163000690910000301007004270042700547005770057
4020470056525100000701007010569787597152540108301061000230100100006160783342350049669730701097004164652364956401003020010000602002000070056351140201100991001000030100100000100100021110002111100001111000261017111698193000699910000301007004270057700577005770057
4020470056525100100200007004169787597152540108301061000430100100006160593342494049669730700567005364649364959401003020010000602002000070056351140201100991001000030100100000100100012110001027100001111000261017111698193000690910000301007004270057700427005770057

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257004152411110010100700456978459701254001430013100023001010000617063334254249669807006070062646783649824001030020100006002020000700573511400211091010000300101000001010000011000000010000101000025202712269820300031010010000300107006170061700427005870061
40024700415241000002010070042697845971925400183001610005301551000061707233426864966961700577006064659364967400103002010000600202000070060351140021109101000030010100000101000220100010011000001110012520571426982330003130010000300107006170061701287005570061
4002470057525100100701007004569784597192540018300161000130010100006170723342686496698070060700606465936504240010300201000060020200007006035114002110910100003001010000010100022110001011100001111200252057132698233000613111310000300107006170042700617006170061
400247006052410010020100700456978459716254001830016100013001010000616995334176949669807006070060646593650724001030020100006002020000700603511400211091010000300101000001010001111000100110000011100025204712269804300031313010000300107005870061700587004270058
40024700605251000102010070045697845971925400183001310002300101000061707233426864966980700417006064678365055400103002010000600202000070060351140021109101000030010100000101000111100020011000011010002520271326982330003010010000300107006170061700617006170061
4002470060525100000110000700456978459695254001830013100023001010000616995334268649669617004170060646783650174001030020100006002020000700573511400211091010000300101000001010001111000101110000011100025202713469804300061301310000300107004270058700617006170058
4002470060524111100101007004569789597195140018300131000230010100006170723342686496698070060700606465936501940010300201000060020200007006035114002110910100003001010000010100031110002001100000101100252027122698233000610131310000300107004270042700617006170061
4002470060525111000200007004569784597192540018300161000130010100006171443341817496696170060700576467836505840010300201000060020200007004135114002110910100003001010000010100021110001011100001111000252027122698233000610131010000300107004270061700617004270061
40024700415251011002010070045697025971925400183001610002300101000061699533417694966977700607005764678365066400103002010000600202000070057351140021109101000030010100000101000211100020111000011111002520271446982330006130010000300107006170061700617006170061
4002470057524110100200007004569784597012540014300161000230010100006170723341769496698070060700606467536502240010300201000060020200007006035114002110910100003001010000010100021110001011100001111100252037122698233000610131310000300107006170061700617006170042

Test 4: throughput

Count: 8

Code:

  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  ldrsb x0, [x6, x7]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052673320010111651022672421818152580100100800001008001650011695630492363426732267331665961668580114200800242001600482673281118020110099100800001008000011008002020428005710159800386156421911115118016026730992800001002673326733267332673426733
802042673220010101650022680821818152580100100800001008001650011697570492363426732267321666061668480115200800242001600482673282118020110099100800001008000001008001920428005610159800376157421911115118016026729992800001002673326733267342673326734
80204267142001100065102267242181802580100100800001008001650011671070492365226732267331666061668480114200800242001600482673281118020110099100800001008000001008002019428005710021800006157421921115118016026730992800001002673326733267332673326733
80204267322011110065002267062180162580100100800001008001650011695631492365226714267141666061668580118200800242001600482673381118020110099100800001008000001008001921428005710059800396057421921115118016026729992800001002671526715267332673326733
802042673220011100650022669921818172580100100800001008001550011668361492365226714267151665961668380116200800242001600482673264118020110099100800001008000011008001920418005710158801686156421921115118016026729992800001002673326733267332673326734
8020426733200101006500226809218180258010010080000100800165001167763049236522673226714166606166848011620080024200160048267328111802011009910080000100800000100800212042800571005980038615701921115118016026730902800001002673426733267332673326715
80204267322001100066102267122121216258010010080000100800155001167303049236472672726727166556166628011520080024202160048267367111802011009910080000100800000100800000398003900039800396139390011151181160267241064800001002672826728267282672826728
80204267272000000041101268032180025801001008000010080014500116787504923655267452673216660616685801122008002420016004826732811180201100991008000010080000110080020190800571016080038615601901115118016026730992800001002673326733267332673426733
802042673220011100651022672521818162580100100800001008001550011674621492365326732267321666061668480116200800242001600482673264118020110099100800001008000001008001920428005510162800386157421901115118016026730992800001002673326715267342673426733
80204267332001100165002267202018152580100100800001008001650011670951492365226732267321666061668580115200800242001600482671481118020110099100800001008000001008001920428001910059800386057421901115118016026729992800001002673326733267332673326734

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526736200010010006610326721397025800101080000108000050117397514923635267372673716681316784800102080000201600002673685118002110910800001080000110800201943080059101608003961584319150211151601782673300080000102673826737267152673726737
80024267152001112000066103266993771925800101080000108000050117397514923656267372673616681316823800102080000201600002673685118002110910800001080000010800192043080059101648000060594319050200171601617267331313580000102673726738267382673826737
80024267142010110000067102267003772025800101080000108000050116721914923657267372673616681316695800102080000201600002675385118002110910800001080000010800191943080059000648003960594319250200171601717267341313580000102673726715267372673726738
80024267372000110100067100267213772025800101080000108000050116710604923635267372673616681316803800102080000201600002673685118002110910800001080000010800191943080059102608004000584419250200171601517267341313080000102671526737267372673726737
8002426736200010000006710326721300025800101080000108000050116721914923656267362671516682316783800102080000201600002673685118002110910800001080000010800201945080058111218004161594319050200161601717267331313080000102673726716267372673726738
80024267372000111000067103267213770258001010800001080000501167834149236562673626736166603167208001020800002016000026736851180021109108000010800001108001919430800190106180000615943191502009160169267331313580000102673726738267372673726737
8002426736200011100006610326721377202580010108000010800005011671061492365626736267361666031675680010208000020160000267178511800211091080000108000001080021210080019101618004061594519150200171601816267331313580000102671626737267372673726737
8002426736200011000002110326721377222580010108000010800005011672191492365626736267361668231681580010208000020160000267368511800211091080000108000001080019204308005810260800406159019250200151601717267121313580000102673726737267372673726738
8002426737201011110002210226700377202580010108000010800005011672191492365726736267361668231679580010208000020160000267366411800211091080000108000001080019190080059101608000061594319150200171601717267331313580000102673726737267372671626738
8002426737200010000002110226722297202580010108000010800005011677911492365626737267371668131678480010208000020160000267368511800211091080000108000001080019204308005910061800406158431915020014160171426711130580000102673726737267162673726738