Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDURSB (32-bit)

Test 1: uops

Code:

  ldursb w0, [x6, #1]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005394300300138301211925100010001000152741399374197325610001000100037477111001100010001100043100003810396139447311611371141071000375375399399375
1004374211511013830112192510001000100015037137439422132561000100010003787711100110001000010004310380010386139447311611395101071000399399399375395
100437430000013830111925100010001000152741374398197325610001000100039877111001100010001100043103903810386138447311611391141471000375399399395399
1004398300440003592010251000100010001520813773982213256100010001000394561110011000100001000431038038103800043731161137101471000399375399375399
10043983114510138321212025100010001000152081403374221325610001000100039877111001100010000100001000038100061007311611395141471000399399399399375
100439830044101383201025100010001000150371398394221325610001000100039877111001100010000100043103803810390139447311611392141441000375399395395375
100439820001013872012192510001000100014060137439822132321000100010003987711100110001000010004410381010380039437311611395141471000399375399375399
100439820045100383210192510001000100015018139839422132521000100010003945711100110001000010004310381411000000447311611395141071000375399399399375
1004398300440013590120025100010001000152741398398221325210001000100039877111001100010000100043103803810386139447311611371141071000375399375399399
10043743004400138321019251000100010001498913983741973256100010001000398771110011000100001000010380381000013944731161139514041000375399399375399

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldursb w0, [x6, #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0089

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057524001107003669782596952540104301031000130100100006163653343022049669747005170051646310364954401003020010000602001000070051801140201100991001000030100100000100100001100000610000110261027111698143000310101010000301007005270052700527005570052
4020470088525001007003669785597102540104301031000130100100006160143342254149669717005170051646500364954401003020010000602001000070051351140201100991001000030100100000100100001100000310000010261017111698143000310101010000301007005270052700527005270052
402047009252400600700366978259710254010430103100013010010000616041334225414966971700517005164647036495440100302001000060200100007005135114020110099100100003010010000010010000110000013510000110261017111698203000310101010000301007005270036700527005270052
4020470051524001007003669782597132540104301031000130100102046160413342350149669747005470051646470364938401003020010000602001000070051351140201100991001000030100100000100100001100000610000110261017111698143000310101010000301007005270052700367005270055
40204700575241010070036697825971325401043010310001301001000061601433422541496697570051700516464703649544010030200100006020010000700353511402011009910010000301001000001001000011000001411000011026101711169814300031010010000301007005270052700527005270052
402047011652500100700396978259710254010430103100013010010000616068334225414966974700517005164631036495440100302001000060200100007005135114020110099100100003010010000010010000110000010210000110261017111698143000310101010000301007005270052700527005270052
402047008052500100700396978259710254010430103100013010010000616014334225414966971700547005164631036495440100302001000060200100007005135114020110099100100003010010000010010000010000011110000110261017111698143000310101010000301007005270036700527005270087
40204700625250010070039697855971025401003010310001301001000061601433423981496697170051700546464703649544010030200100006020010000700513511402011009910010000301001000001001000011000001441000011026101711169814300030101010000301007005570055700547005270052
402047009852500100700866978559710254010430103100013010010204616023334412614966971700517005164650036495440100302001000060200100007005435114020110099100100003010010000010010000110000011110000110261017111698143000310101010000301007005270052700557005270052
402047011752500100700366978259710254010030103100013010010000616023334225414966971700547003564647036495440100302001000060200100007005435114020110099100100003010010000010010000110000012610000010261017111698143000313131010000301007005570052700527005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570047525111107003269743597092540014300131000130010100006170683342206049669557005070050646533649754001030020100006002010000700533511400211091010000300101000001010000011000000100001100252027133698133000000910000300107004870051700517003670048
4002470050525010007003569728597062540010300101000030010100006170683341470149669677005070035646683649754001030020100006002010000700593511400211091010000300101000001010000011000000100000100252027133698133000300910000300107003670051700487005170051
40024700355250058007003269743597092540014300101000130010100006169823341470049669677005070047646683649724001030020100006002010000700873511400211091010000300101000001010000011000000100000000252017135698103000396010000300107004870048700487004870051
4002470047524000007002069728597062540014300131000130010100006169523342062049669557005070035646653649724001030020100006002010000700763511400211091010000300101000001010000011000000100001100252017155697983000390610000300107003670051700367005170036
4002470047525001007003569760597092540014300101000030010100006169823342062049669677005070050646683649754001030020100006002010000700923511400211091010000300101000001010000011000006100001100252027132698143000090010000300107004870051700367004870037
4002470047525019007003569760596952540010300131000030010100006170683342062049669677003570047646533649724001030020100006002010000701123511400211091010000300101000001010000011000000100001100252017133698103000300010000300107005170036700367005170051
4002470050525000007002069760596952540014300131000130010100006169823342206049669707003570050646683649604001030020100006002010000700653511400211091010000300101000001010000011000000100001000252027133697983001299910000300107003670051700517003670036
4002470047524001007003569760597062540010300131000130010100006169523341470049669557005070035646533649604001030020100666002010000700423511400211091010000300101000001010000011000000100000000252017165698133000369910000300107004870048700367004870048
4002470047524001007002069760596952540014300101000030010100006169523342206049669677005070050646683649724001030020100006002010000700503511400211091010000300101000001010000011000013100001100252027124698103000366910000300107003670036700517003670048
4002470035524001007003569760597062540014300131000130010100006170683341470049669707005070050646683649604001030020100006002010000700723511400211091010000300101000001010000011000010100000100252017137698133000306910000300107005170048700517005170048

Test 3: throughput

Count: 8

Code:

  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  ldursb w0, [x6, #1]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526723200011000450001267123121216258010010080000100800005001167231492364726727267271665031668080100200800002008000026722711180201100991008000010080000010080000390800390358000061354351102163226709664800001002672826728267282670826708
80204267072000000004500002671201218162580100100800001008000050011678084923627267312673116649316689801002008000020080000267077111802011009910080000100800000100800000080039039800396135435110216222673610100800001002670826708267282672826730
802042672720000010012000226722012181024880230100800001008017850011597474923647267072672716650316665801002008000020080000267077111802011009910080000100800000100800003908003900800396135051102162226724662800001002672826708267282670826723
8020426732200100001450002266922001625801001008000010080000500117723149236472672526728166503166858010020080000200800002672271118020110099100800001008000001008000039080039035800006100511021622267301002800001002672826723267282672826708
802042670720000000045000226712201218258010010080000100800005001166525492364726727267271663031666580100200800002008000026711711180201100991008000010080000110080000393680000079580035613539511031622267266104800001002672726731267272671226728
80204267072000000013601921022669701212033801001008000010080000500117703849236422670726713166533166858010020080000200800002693771118020110099100800001008000001008000039080039039800006035435110216222672710102800001002672826728267082672826709
802042672720000000042000126692012016258010010080000100800005001159747492362726731267321664931666980100200800002008000026722711180201100991008000010080000010080000390800390080039613905110216222677310100800001002673226712267282670826728
80204267072000000004501022671221212162580100100800001008000050011657894923627267072672716650316685801002008000020080000267227111802011009910080000100800000100800003908000003980000600051102162226719662800001002673426736267262673126739
802042672620000000000002267120121216258010010080000100800005001166525492362726727267271663031668580100200800002008000026722711180201100991008000010080000010080000390800350398003961394351102162226704004800001002672826723267082672826708
802042672220100000000002267120018025801001008000010080000500116652549236422672226707166303166858010020080000200800002672271118020110099100800001008000001008000000800390398003960354351102162226719664800001002672826723267282672826723

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220001104110126707218181225800101080000108000050116675014923628267222670816667316798800102080000208000026722711180021109108000010800001108000039800350358003561350050203162326729099280000102673326733267342672326723
80024267082000000411022669300181225800101080000108000050116760514923628267222672216652316774800102080000208000026728711180021109108000010800000108000039800350080035613539050203162426729000280000102670926709267232672326723
800242672220000004100126693018012258001010800001080000501172240149236422670826722166673167948001020800002080000267225611800211091080000108000001080000080000008003561039050203163226712099080000102672326723267092670926723
8002426708200000041101267072181812258001010800001080000501167605149236422670826722166673167848001020800002080000267227211800211091080000108000001080000398003500800356100050204163326712009080000102672326723267092672326723
800242672220000004500026707018181225800101080000108000050116760514923642267082670816667316715800102080000208000026722711180021109108000010800000108000008003603580039613539050203163226711000280000102672326723267232672326723
800242672220000004110026707018181325800101080000108000050116760514923642267222670816667316702800102080000208000026722711180021109108000010800000108000039800000358000061350050203163326712000280000102673926723267262670926709
80024267082000000411002670721818048800101080000108000050116760514923642267222672216667316806800102080000208000026722711180021109108000010800000108000039800351358003561039050203163326730099280000102673326733267232670926709
80024267082000010411012669321801225800101080000108000050116760514923642267222672216667316792800102080000208000026722561180021109108000010800000108000039800350358003561350050202162426729099280000102672326723267092670926723
800242672220000004210026707218180258001010800001080000501167605149236422672226722166673168008001020800002080000267227111800211091080000108000001080000398003503580035613539050203162326729099280000102682026723267092670926709
80024267222010000010126707218181225800101080000108000050117224014923642267222672216667316789800102080000208000026722711180021109108000010800001108000008003503580035613539050202164226729009280000102673326723267232672326723