Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDURH

Test 1: uops

Code:

  ldurh w0, [x6, #1]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
1005389300114110137421818122510001000100014838138939121232471000100010003897211100110001000100039103535103561353973116113876621000390390390390390
1004389300114100137421818112510001000100014844138938921232471000100010003897111100110001000100039103535103561353973116113866621000392390390390390
10043892001141001374218181125100010001000148381389392212324710001000100039171111001100010001000391039391039613943731161139110641000395395395395395
10043942001145002379212181625100010001000149890394394217325210001000100039471111001100010001000391039391039613939731161139110621000395395396390395
10043943001042002379212121625100010001000172190394394217325210001000100039471111001100010001000391039391039613543731161139110641000395395390390395
10043942001045001379212121625100010001000149890394394216325210001000100039471111001100010001000391039391039613643731161139110641000395395395395395
100439430010501023792121216251000100010001501803883892173252100010001000394721110011000100010003910393910396139437311611391101041000395395395395395
10043943001045002379212121625100010001000149890394394217325210001000100039471111001100010001000391039391035613539731161138610641000398395395395395
10043943001041002379218122225100010001000150370394394216325210001000100039471111001100010001000391039391039613543731161138610641000395395395395395
10043943001045101379212181625100010001000147740394394217325210001000100038971111001100010001000391039391035613943731161139110641000395395395395395

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldurh w0, [x6, #1]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700515241000010107003669764597132540104301031000130100100006161753342254049669977005170051646473649544010030200100006020010000700513511402011009910010000301001000001001000001100000010000111261037122698143000310101010000301007005270036700557003670055
40204700535250000012010700396978559713254010430103100013010010000616175334147004966957700587005164647364938401003020010000602001000070054351140201100991001000030100100000100100050110000001000011126102712269814300031313010000301007005570036700367005570036
40204700545250110010007003969764597132540100301031000130100100006160413342398049669557005470054646503649574010030200100006020010000700543511402011009910010000301001000001001000001100000310000110261027122698143000313101310000301007003670036700527005570055
40204700545250000010007003969785597102540100301031000130100100006160413342398049670067005470035646503649574010030200100006020010000700543511402011009910010000301001000001001000001100000010000111261027122697983000313131310000301007003670055700557003670055
40204700545240000010007003969785597102540104301001000130100100006160413342398049670227005470035646503649384010030200100006020010000700353511402011009910010000301001000001001000000100001010000011261027122698143000313131010000301007005570055700527005570036
4020470035524000000010700396989659716254010430103100003010010000616041334239804967033700357005464650364938401003020010000602001000070396351140201100991001000030100100000100100000110000001000001126102712269798300001301010000301007003670055700527005570055
40204700545250000010007002069782597132540104301031000030100100006160413342398049670697007570037646313649574087830200100006020010000700543511402011009910010000301001000001001000000100001910000111261027122701183000313131310000301007005570055700367005270038
40204700545240000010107003669782596952540100301031000130100100006160413342398049670067003570054646503649684010030200100006020010000700543511402011009910010000301001000001001000001100000010000110261027122697983000313131310000301007005570055700367003670036
4020470054525000006000700396979259695254010430103100013010010000616175334147004966958700547005464650364957401003020010000602001000070244351140201100991001000030100100000100100000110000001000011126103712269817300000131310000301007005570052700557005570055
402047003552500000100170039697645971325401043010310000301001000061601433423980496695670035700356465037649574010030200100006020010000700353511402011009910010000301001000011001000001100000010000111261027122698173000310101310000301007005570052700557003670055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700575251001020101700266978459716254001830016100023001010000616991334225401496697170051700516467236498240010300201000060020100007005735114002110910100003001010000010100022110003021100001101125200008710006869814300030101010000300107005270036700527003670052
4002470051524000000010070036697755971025400143001310001300101000061706833422540149669717005170051646693649764001030020100006002010000700353511400211091010000300101000001010001111000301110000111102520000571001656981730000100010000300107005270055700527005270052
4002470051525000001010070036697845971025400143001310001300101000061699133422540149669557005170051646693649764001030020100006002010000700513511400211091010000300101000001010002111000202110000111122520000671000107698893000310101010000300107005270052700527005270052
400247005152400000101007003669775596952540014300131000130010100006169913342254014966971700357003564669364976400103002010000600201000070051351140021109101000030010100000101000001100000001000010100252000057100066698143000310101310000300107005270052700367005270052
40024700515250000010101700426978159716254001830016100023001010000617045334254201496696170057700576467536498240010300201000060020100007005735114002110910100003001010000010100000010000000100001010025200006710006669820300031001310000300107005870058700587004270042
400247005852510110201007003669775597102540014300131000030010100006169913342254014966971700517005164653364976400103002010000600201000070035351140021109101000030010100000101000001100000031000010100252000057100066698143000310101010000300107005270052700367005270036
400247005152500000100017004269784597162540018300161000230010100006169953342542014966977700577005764675364982400103002010000600201000070057351140021109101000030010100000101000001100000001000010100252000067100056698143000310101010000300107005270064700527005270052
400247005152500000101017003669775597102540014300101000030010100006169913342254014966955700357005164672364976400103002010000600201000070051351140021109101000030010100000101000401100000001000010100252000077100056698203000610101010000300107005870042700427005870042
400247005752510010101007003669775596952540014300131000030010100006169913342302114966971700517005164669364976400103002010000600201000070051351140021109101000030010100000101000001100000001000010100252000067100056698203000610101010000300107005870058700427005870060
40024700575251000020100700366977559695254001430013100013001010000616991334225401496695570051700516466936497640010300201000060020100007005135114002110910100003001010000110100022110002021100001111125200006710005769814300030102410000300107005870058700587005870058

Test 3: throughput

Count: 8

Code:

  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  ldurh w0, [x6, #1]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267322000101010065010326721218181625801001008000010080015500116710704923653267362671416664617027801142008002420080024267328111802011009910080000100800001100800202042080019000598000060184219011151180160026729902800001002673326733267152671526715
8020426714201011110006501022672321818025801001008013010080178500118237104923652267332673216655316690801002008000020080000267338111802011009910080000100800000100800192042080057000598003761194219100051101161126729090800001002673326734267332673326733
80204267342000111100065000326731018180258010010080000100800005001166758049236522673226732166553166728010020080000200800002673281118020110099100800001008000001008002119008005700162800386157019100051101161126729992800001002671526715267332671526734
8020426732200011100008300022671700189225801001008000010080000500116675804923634267182673616659316672801002008000020080000267328211802011009910080000100800000100800191942080019001598000061574219100051101161126711992800001002673326733267152671526733
802042673220001011000210002267172018125801001008000010080000500116651904923653267142673216655316690801002008000020080000267328111802011009910080000100800000100800191942080057113658003801194219200051101161126711092800001002673427336267332671926734
80204267142000101000011300032669920012580100100800001008000050011696760492363426733267331665531669080100200800002008000026732811180201100991008000010080000010080020190080057002218003760564219100051101161126733002800001002673326715267332671526733
8020426714200011100006500032669921818152580100100800001008000050011591200492395526745267331663731669080100200800002008000026732821180201100991008000010080000010080020190080019001598000060574219000051101161126729902800001002671526733267342671526715
8020426732200011000002200022669901818162580100100800001008000050011694860492365226733267321663731669080100200800002008000026714641180201100991008000010080000010080019210080057100628000060194219000051101161126729002800001002673426733267332673326734
80204267332000111000021000226699018016258010010080000100800005001166519049236522671426732166373166908010020080000200800002671481118020110099100800001008000001008002119008005700121800386157019000051101161126729992800001002673326733267342671526733
8020426732200010111106500012672021818162580100100800001008000050011676851492363426714267141665531667280100200800002008000026715811180201100991008000010080000110080021204208005610259800006119019100051101161126729992800001002671526734267342671526734

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267282000010016610326721377202580010108000010800005011720090149236560267362673716659316716800102080000208000026736641180021109108000010800000108001920430800591002180040603943005020001164011267271010480000102672826728267292672926709
8002426728200000000450012669300116258001010800001080000501166750014923648026727267271667231670780010208000020800002672777118002110910800001080000010800191943080058012618000061394300502000116201126725100480000102672826709267292670926728
80024267082000000004410126712000162580010108000010800005011667500149236480267282670816672316688800102080000208000026731771180021109108000010800000108001919430800590006080040613944005020001162011267051010080000102672926732267092672826709
80024267282000000000100267122121216258001010800001080000501168843114923648026727267271667231670780010208000020800002672777118002110910800001080000010800202143080059100618000001394300502000116201126724010480000102670926732267322673226728
8002426729200000000000126713001216488001010801301080000501167501114923647026731267311667631671180010208000020800002673177118002110910800001080000010800192000800591006180039603943005020101160011267301010080000102672926709267092672826709
800242672820000000045001267162110258001010800001080000501166750004923669026727267281665231670880010208000020800002670856118002110910800001080000010800202000800590026180000013843005020001162011267241010480000102673226709267332672926729
800242672820000000045000267132112162580010108000010800005011667500049236480267272670816672316707800102080000208000026728771180021109108000010800000108002020008005810164800406104300502000116201126724010480000102672926729267282672826709
8002426708200000000440012671300120258001010800001080000501168843004923648026728267081667631670880010208000020800002673077118002110910800001080000010800191943080059002218004061394300502000116201126728010080000102672826729267092670926728
80024267272000000000000267162120162580010108000010800005011688431149236510267312673116652316688800102080000208000026731771180021109108000010800000108001920430800590006180000613943005020001162013267321010480000102672826728267282672826729
80024267282000000100001267130000258001010800001080000501166750114923647026708267271667231670880010208000020800002672877118002110910800001080000010800191943080058000608004001394400502000116201126724140780000102670926728267322673226742