Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSB (register, sxtw, 32-bit)

Test 1: uops

Code:

  ldrsb w0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053993101168002384218181725100010001000153753983992213257100010002000399831110011000100010202242105700058103861574219173116113959921000399400400400399
10043993111165103383218181725100010001000153203993992213257100010002000399821110011000100010202041105710159103861574219173116113959921000400400399400400
10043993101065003383218181625100010001000153623993992213256100010002000399811110011000100010202042105710059103861574219173116113979921000400403400399400
10043992111165002383218181625100010001000153623993992213256100010002000399811110011000100010211942105711159103761574219073116113959921000399400400400400
10043993111165103384318181625100010001000153753993982223257100010002000398811110011000100010192142105710059103861574119273116113969921000400400401400399
10043983100177003383218181725100010001000153283993992213257100010002000399811110011000100010192042105700059103861544219173116113969921000400400399400400
10043993111165002384318181625100010001000153753983992213257100010002000399811110011000100010191942105710259103861574219273116113979921000400399400400399
10043983100165002383218181525100010001000159583993992223256100010002000399811110011000100010202042105700059103861564219073116113969921000400399400400399
10043983100165003385218181625100010001000153153983992233257100010002000399811110011000100010212142105810159103861584219173116113969921000400400400399400
10043993111165103385218181625100010001000153153994002293257100010002000399811110011000100010191942105700059103861574219173116113969921000400400399400400

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
402057005052500000101070035697815970625401043010310002301001000061668033453261496697007005070050646313649534010030200100006020020000700503511402011009910010000301001000001001000011000000100001010261037122698103000366910000301007003670036700367005170051
402047004752500000100070035697815980125401043010310001301001000061672633426861496696707004770035646433649534010030200100006020020000700473511402011009910010000301001000001001000011000000100001010261027122698133000396910000301007005170036701307006370059
402047005052500000100070032697355970625401043010310001301001000061689633430061496697007005070050646463649534010030200100006020020000700353511402011009910010000301001000001001000011000100100001010261027122698133000399910000301007005170051700517005170051
402047005052400010000070035697355970925401163010010001301001000061671733452781496697007005970050646523649504010030200100006053620000700503511402011009910010000301001000001001000011000000100001010261027122698133000309610000301007004870048700517004870048
402047004752400000100070035697825970925401043010310001301001000061659133425901496697007005070050646433649534010030200100006020020000700503511402011009910010000301001000001001000011000000100001010261027122698133000606010000301007003670051700367005170051
402047005052500000100170035697815970925401043010310001301001000061601533471981496697007003570050646463649534010030200100006020020000700353511402011009910010000301001000001001000001000000100001010261027122698133000396910000301007005170051700517005170051
402047005052500000100070020697645970925401043010310001301001000061601533460941496696707005070050646433649534010030200100006020020000700473511402011009910010000301001000001001000011000000100001000261027122698133000396910000301007005170036700367005170059
402047005052500010100070032697815970625401043010310001301001000061601533420621496695507005070050646463649534010030200100006020020000700503511402011009910010000301001000001001000011000000100001010261027122698103000399910000301007004870048700517005170048
402047005052400000100070035697815970925401043010310000301001000061600533465261496697007005070050646313649584010030200100006020020000700503511402011009910010000301001000001001000011000000100001000261027122698133000390910000301007005170051700517005170036
402047005052500010100170035697815970925401043010010002301001000061601533457581496697007005070047646433649534010030200100006020020000700503521402011009910010000301001000001001000011000000100001010261027122698133000396910000301007003670051700517004870051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
400257005352410111000200070026697805970125400143001610002300101000061699533417691496697607005670056646710364966400103002010000600202000070056351140021109101000030010100000101000221100020011000011110252397135698193000696910000300107004270042700547005470042
400247005652410000010100070041697025971525400143001610001300101000061699533424940496697307005370056646740364981400103002010000600202000070041351140021109101000030010100000101000120100010011000011110252367165698193000609010000300107005770042700547005770057
400247011152510010000200070026697805971525400183001610002300101000061699533417690496697707005370056646710364966400103002010000600202000070056351140021109101000030010100000101000311100030011000011111252547174698043000390910000300107005770042700427005770042
400247004152510011000210070041697805971525400143001310002300101000061703633424940496696107005670041646740364981400103002010000600202000070041351140021109101000030010100000101000211100020141000011110252337153698193000369610000300107005470042700547005470057
4002470056524100000002000700416977759701254001830016100043001010000617009334249404966976070056700566465903649814001030020100006002020000700533511400211091010000300101000001010001301000101101000011111252327165698043000696010000300107005770058700577005770057
400247005652610110000210070041697805971525400183001610001300101000061703633424940496697607005670053646740364966400103002010000600202000070056351140021109101000030010100000101000111100010011000011110252357134698193000690010000300107004270057700547005470057
400247005652510101000110070031697775971525400143001610001300101000061700933424940496697607005670056646740364966400103002010000600202000070056351140021109101000030010100000101000221100020111000011110252357134698203000399010000300107005770057700427005770057
400247005652410111000200070026697805971525400183001310002300101000061703633424940496697607004170056646740364981400103002010000600202000070041351140021109101000030010100000101000231100010111000011111252347154698193000696910000300107004270057700577004270057
400247005652510100000200070038697775971525400183001610002300101000061703633424940496697307005670041646710364978400103002010000600202000070041351140021109101000030010100000101000131100010111000011111252567146698193000699010000300107005770057700577005770042
400247005652510000000100170041697805971525400183001310001300101000061703633424940496697607005870056646850364981400103002010000600202000070041351140021109101000030010100000101000111100020111000011011252337145698193000396010000300107005770057700547005770057

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsb w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700475251101001037003569735597232540104301031000230100100006160153342206496695570035700356464603649384010030200100006020020000700508011402011009910010000301001000001001000001000000100001000261610711010697983000306910000301007005170051700487012370036
402047005052411100110370035697355973425401003010310001301001000061619333422064966970700507004764646036495340100302001000060200200007005035114020110099100100003010010000010010003010000412100001110261691799698133000366610000301007005170048700487013870051
40204700505251100010037003569735597342540104301031000130100100006160153342062496697070047700356464303649534010030200100006020020000700503511402011009910010000301001000001001000001000000100001100261697199698133000390610000301007005170051700487012170048
40204700505251100001037003269764597432540104301031000130100100006161753342206496697070050700506463103649534010030200100006020020000700353511402011009910010000301001000001001000001000003100001100261697159698133000399910000301007003670048700487014270052
40204700505251100010037003569735597012540104301031000030100100006160153341470496697070037700356463103649384010030200100006020020000700503511402011009910010000301001000001001000011000000100001100261697194698103000396910000301007003670036700747003670048
40204700505241100010037002069781597282540100301001000130100100006160053342206496697070050700506464303649534010030200100006020020000700473511402011009910010000301001000001001000011000000100000000261697199698133000096910000301007005170036700487011870051
40204700355251100000037003569764597602540104301031000130100100006161753342206496696770050700356464603649504010030200100006020020000700503511402011009910010000301001000001001000011000000100000100261697177697983000006910000301007005170051700487006270036
40204700505251100000137009869781597572540104301031000130100100006160053342206496697070095700406463303649404010030200100006020020000700503521402011009910010000301001000001001000001000000100000100261657199698133000306010000301007003670051700487013270036
4020470050525110000003700356973559743254010430103100013010010000616005334220649669707003570050646460364950401003020010000602002000070050351140201100991001000030100100000100100000100000010000010026161071910698133000060910000301007005170036700487013970052
4020470050524111001710037003569764597762540104301001000130100100006160153342062496697070050700506464303649534010030200100006020020000700503511402011009910010000301001000011001000011000000100001000261677199698103000366010000301007003670051700517012470036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570051525001000001100700396974359710254001430013100013001010000616991334239804966955700517005164672364983400103002010000600202000070051351140021109101000030010100000101000001100000001000011000252067111698143000310101010000300107005270052700527005270052
4002470051524000000006000700426977559713254001430013100043001010000617054334230204966971700357005164653364998400103002010000600202013270057351140021109101000030010100000101000211100020111000001112252011721698143000310101010000300107005270052700527003670052
4002470051524000001001101700426978159716254001830016100023001010000617045334254204966971700547005164672365024400103002010000600202000070054351140021109101000030010100000101000001100000001000000100252017111698203000610101010000300107005870058700427004270042
4002470057525110000002100700366974359710254001430013100013001010000617018334147004966971700517005164672365020400103002010000600202000070051351140021109101000030010100000101000001100000001000010100252017111698143000310101010000300107005270055700527005270052
4002470096525000000001100700266978159701254001830013100023001010000617072334268604966977700577004164675365053400103002010000600202000070054351140021109101000030010100000101000001100001001000010100252017111698203000610101010000300107005870061700587006170058
4002470057524101100007100700366977559710254001430013100013001010000616991334225404966971700517005164669365013400103002010000600202000070051351140021109101000030010100000101000001100000001000010100252027121698143000010101010000300107005270052700527005270052
4002470051525000000001000700366974359710254001430013100013016310000616991334225404966971700517005464669365005400103002010000600202000070051351140021109101000030010100000101000001100000001000000100252017111698143000313101010000300107005270052700527005570052
4002470051525000000001100700366977559710254001430013100013001010000616991334225404966971700547005164669365043400103002010000600202000070057351140021109101000030010100000101000001100000001000010100252017111698143000310101310000300107003670052700557005570052
4002470051524000000001000700366977559710254001430013100013001010000616991334225404966971700547005164669365015400103002010000600202000070051351140021109101000030010100001101000001100000001000010000252017111698203000310101010000300107005570052700527005270036
4002470051525000000001100700366977559710254001430013100013001010000616991334225404966983700517005564653365005400103002010000600202000070051351140021109101000030010100000101000001100000001000010100252017111698143000310101010000300107005270052700527003670052

Test 4: throughput

Count: 8

Code:

  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  ldrsb w0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802052673620010011116710326721077632580100100800001008001650011676280492365602673626736166636166898011520080024200160048267146411802011009910080000100800001100800192043800591002180040615943190111511801600267331305800001002673726808267372673926716
802042673620010111002110326722377252580100100800001008001550011672150492363402671426736166646166888011320080024200160048267378511802011009910080000100800001100800201908006100164800000059019111151180160026733005800001002673726750267462674226739
8020426736200100010067102267212072325801001008000010080013500118255104923634026736267361666461668880115200800242001600482671486118020110099100800001008000001008001919438005910161800396059019011151180160026711005800001002674026723267242673926737
8020426736200100110066001267210775258010010080000100800155001167628049236560267362671416663616689801162008002420016004826736861180201100991008000010080000110080019194380019001218000061580191111511801600267331300800001002671826743267252673726715
8020426736200100000021101267210072525801001008000010080015500116721504923634026737267361666461666680115200800242001600482671485118020110099100800001008000011008002019438006010061800396159431901115118016002673313135800001002674226737270192681926741
80204267192001011100661032672209021258010010080000100800155001167215049238100267362673616642616666801132008002420016004826736851180201100991008000010080000010080020190800191112180040011901901115118016002671110104800001002673626834267302670826737
802042672720000000006751002671220019258010010080000100800145001166596049236270267272672716635616659801152008002420016004826727771180201100991008000010080000010080000043800390000800006104300111511801600267041004800001002673126715267162672826728
80204267072000000000451012671221212525801001008000010080015500116730304923647026727267271663561665980116200800242001600482670777118020110099100800001008000001008000004380000000398003901394300111511801600267040100800001002673126734267162673126708
80204267072000000000450012671221212825801001008000010080016500116659604923647026731267311665561668280116200800242001600482670777118020110099100800001008000001008000004380039000398003960394300111511801600267040104800001002673826714267362673226728
80204267272000000000450012671201212212580100100800001008001550011673030492364702672726727166556166798011520080024200160048267277711802011009910080000100800000100800000438000000039800400039000111511801600267040100800001002671426711270262673326718

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526723200004500002267072012122580010108000010800005011688431492364202672826727166723168138001020800002016000026728711180021109108000010800000108000039800390080039613543502081604226725100480000102670926729267092672326723
8002426727200114501002267120121816258001010800001080000501166750149236420267272672716672316830800102080000201600002670871118002110910800001080000010800000800000080039613505043216044267241010480000102672326728267232672926709
8002426727200000010022671320121225800101080000108000050116884314923649026722267271666731682080010208000020160000267277211800211091080000108000011080000398003900800350039435020216042267051010480000102670926729267292672826729
8002426728200004501000267132120112580010108000010800005011667501492364802670826727166723167258001020800002016000026728711180021109108000010800000108000039800390398003961393950202160462672406280000102672326709267092670926709
80024267082000045010002669321212162580010108000010800005011688431492364802672826727166523167998001020800002016000026728711180021109108000010800000108000039800390398003960005020216024267241010480000102672326728267092672826729
8002426708200000000002687421212162580010108000010800005011667501492364702672726722166723168258001020800002016000026727711180021109108000010800000108000039800000358004061394350204160242671900280000102672326729267292672326723
800242672220000001002266930018162580010108000010800005011668861492364702672826728166523168218001020800002016000026727711180021109108000010800000108000008000003980039613543508951602426724100480000102672826728267282672826709
8002426728200004500002267072181216258001010800001080000501166896149236470267082672816667316791800102080000201600002672256118002110910800001080000010800003980039039801696000502041602426705010080000102672326723267292672826728
80024267282000057010012671221212025800101080000108000050116689614923628026727267281667231670780010208000020160000267287111800211091080000108000001080000080039039800006135435020316024267051810480000102672926729267092672826728
80024267222010045010022671221212162580010108000010800005011688431492362802672826722166523167998001020800002016000026727711180021109108000010800000108000008000003580039013543502041624226724010080000102670926723267282672826724