Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDUR (S)

Test 1: uops

Code:

  ldur s0, [x6, #1]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053893000010411135920180251000100010001484436438937421232471000100010003913891110011000100001000039100000001035600390073216113766621000390390395392390
10043892000000410237921818162510001000100015018369389389197325210001000100038140311100110001000010192001057111621038615742190731161137110641000375392395375375
100438920000000123592018025100010001000149893693893942123232100010001000381402111001100010000101820421019111211000615742191731161137110081000375395375395395
100437430000000003592121216251000100010001483834939439421232521000100010003893891110011000100001000001039000351039600000731161137110041000375375375395395
1004394300000045023842181816251000100010001537537439939922132391000100010003744091110011000100001000039100000035103960043007311611371101021000390390375375395
100439420000004502384201816251000100010001536237339939822132571000100010004093891110011000100001000039103900039100061043007311611391101041000395395375375375
100439430000000003760012162510001000100014838364389389212325210001000100037438911100110001000010000391035000010396135430073116113710001000395375395375395
100438930000004502384018181525100010001000144563563823992213240100010001000398399111001100010000101921421057000211000015742190731161139110641000395395395395395
10043943000000002379212121625100010001000148383493743742173232100010001000399398111001100010000101919421019102211000615742190731161137110641000375395395392395
1004374300000045013792121212251000100010001498936937438921732471000100010003943741110011000100001000039103500001039610430073116113799921000399400400400382

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldur s0, [x6, #1]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512004789900001100931000112003811951510946625601034010410001100003010010000100001079097573633261364191120023012004712004711314331136705010030200100001000060200100001000012004112005311502011009910040100100001000001001000220100010011100001101103210310711119660400026001000040100120042120054120054120054120042
502041200568991001000067010012002011950910946125601034010210001100003010010000100001078862573545561338180120026012003512009511314531136675010030200100001000060200100001000012003512003511502011009910040100100001000001001000001100000000100000000003210113511119660400000651000040100120036120048120036120048120036
502041200478990000000080635210012003511950910946125601004010210001100003010010000100001078999573604461338180120063012005012005011314331136365010030200100001000060200100001000012004712004711502011009910040100100001000001001000001100000000100001010003210110111119657400026651000040100120048120036120036120037120036
5020412003589900000000521010012003211949310944925601034010210001100003010010000100001078999573604461344610120023012004712005011314331136395010030200100001000060200100001000012005012004711502011009910040100100001000001001000001100000000100001000003210113511119660400026601000040100120051120051120048120048120051
502041200508990000000084501001200321194931094492560103401021000110000301001000010000107886257354556133818112002301200471200351131433113639501003020010000100006020010000100001200471200351150201100991004010010000100000100100000010000000010000101000321011712119660400000601000040100120036120036120048120048120048
50204120050899000000007010012003211949310946125601034010010001100003010010000100001079200573618861338180120011012004712004711314331136365010030200100001000060200100001000012004712004711502011009910040100100001000001001000001100000000100001000003210110111119660400026681000040100120051120048120036120051120036
5020412005089900000000865010012003211949310946325601034010010001100003010010061100001078862573618861361660120023012004712004711314331136365010030200100001000060200100001000012004712003511502011009910040100100001000001001000001100000003100001010003210110111119657400000651000040100120051120038120036120036120048
502041200479000000000084835201012003211951910946125601034013410001100003010010000100001078862573604461342830120023012004712004711314131136365010030200100001000060200100001000012004712004711502011009910040100100001000001001000001100000100100000010003210110121119660400020051000040100120048120048120048120048120048
5020412004789900000000722001012002011951910946125601034010210000100003010010000100001079200573618861338180120023012004712004711314131136365010030200100001000060200100001000012004712004711502011009910040100100001000001001000000100000000100010000003210110711119660400026601000040100120051120048120048120048120048
5020412003589900001100532000012002011951910946125601034010210001100003010010000100001078862573642861338180120023012004712004711314331136365010030200100001000060200100641000012005012003511502011009910040100100001000001001000000100000000100001000003210113511119660400009081000040100120036120051120048120048120048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005189900000000701001200361195091094642560013400121000110000300101000010000107951757362366133662012001112005112005111316931136905001030020100001000060020100001000012008912006811500211091040010100001000001010000011000000010000110314012107641196504000201091000040010120036120052120052120052120052
50024120051899000000006010012003611964210946425600134001210001100003001010000100001079557573633261336620120027120051120035113169241136035027530020100001000060020100001000012006312005411500211091040010100001000001010000011000000010000110314031093411966840002101091000040010120052120052120052120052120052
5002412005189900000000101001200361195091094642560010400121000110000300101000010000107955757362366133662012002712003512005111324131136905001030020100001000060020100001000012008612007011500211091040010100001000001010000011000000010000110314031073411966640002101001000040010120052120052120052120052120052
5002412005189900000000101001200361195091094642560013400121000110000300101000010000107955757362366133662012002712005112005111316931136745001030020100001000060020100001000012005012005411500211091040010100001000001010000001000010010000110314031074411966640002101091000040010120036120052120052120052120052
5002512005193100000000601001200361195091094642560013400121000110000300101000010000107955757362366133662012002712006312005111316931136905001030020100001000060020100001000012009112005811500211091040010100001000011010000011000000310000110314041076711966640010101091000040010120036120052120036120052120052
5002412005189900001100295010012003611957510952813760013400281000110002300101005210098107951757362366133764212002912005112005111316931136905001030179100001011060020101601000012011212018521500211091040010100001000001010000011000000610000100314031074411966640002101091000040010120052120052120052120052120052
5002412005290000000000157881011202351195091094642560025400121000110000300101000010000107955757362366134783012002712015412005311316931136905001030020100001000060020100531000012006112006711500211091040010100001000001010000211000000310000110314061077611966640002101091000040010120243120053120053120147120156
5002412005189900000000001001200361195691094642560013400121000110000302931000010000107955757363326133662112002712005112005111316931136905001030020100001000060020100001005412008612008611500211091040010100001000001010000011000000010000110314031073411966640002101091000040010120052120052120052120052120052
500241200519000000000010100120020119509109483256001340012100011000030010100001000010795575736236613366201200111200511200511131703113690500103002010000100006002010000100001200511200511150021109104001010000100000101000001100000031000011031404107761196664000201091000040010120052120052120036120052120052
5002412005189900000001101001200361195691094642560013400121000110000300101000010000107955757362366133662012002712005112005111316917113776500103002010000100006002010000100001200511200521150021109104001010000100001101000001100000001000011031404107341196664000213091000040010120052120052120052120052120037

Test 3: throughput

Count: 8

Code:

  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  ldur s0, [x6, #1]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80205267362000000044001267120012162580100100800001008000050011728690267062670726727166503166658010020080000200800002672726727118020110099100100800008000001008000043080039000800396039435110116112672814107800001002670826708267282673226728
802042672720100010441002669231002580100100800001008000050011683170266822679727001166573166718010020080000200800002673126727118020110099100100800008000011008000043080039003980000013943511011611267280147800001002672926728267322673226728
802042673120000010450012669221102580100100800001008000050011728410267062670726731166303166658010020080000200800002673126727118020110099100100800008000011008000043080038003880038600435110116112672814147800001002673226732267322673226708
80204267272000000044101267122001925801001008000010080000500117323212668226727267071665031666580100200800002008000026707267271180201100991001008000080000010080000430800380038800006139445110116112670414144800001002670826708267282670826708
80204267072001100001002671600019258010010080000100800005001166525126702267072672716630316665801002008019320080000267072672711802011009910010080000800000100800000080038000800386139051101161126724007800001002670826732267282672826708
8020426707201000004500026712212002580100100800001008000050011733701267022670726707166303166898010020080000200800002673126707118020110099100100800008000001008000043080038103880039603905110116112672814140800001002672826728267082672826728
8020426727200000004400126692211162580100100800001008000050011702391267062673126707166303166658010020080000200800002670726727118020110099100100800008000001008000000800380010480000600445110116112670414147800001002670826732267082673226732
802042670720000100450012671221212162580100100800001008000050011699141267062671026730166543166658010020080000200800002670726727118020110099100100800008000001008000043080000003880038610435110116112670414147800001002672826732267282670826728
80204267072000000000012671521212162580100100800001008000050011678251267022672726727166303166858010020080000200800002670726707118020110099100100800008000001008000000800000008003961043511011611267280144800001002672826732267322672826728
80204267072001011045001267162011725801001008000010080000500116831212670626731267071665431668980100200800002008000026707267071180201100991001008000080000010080000430800000038800380139445110116112672810147800001002673226728267322673226732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220000011141010326718218180258001010800001080000501172370126697267222672216667316712800102080000208000026732267331180021109101080000800000108000003980035010358003561544219150201116262671906280000102672526723267232672326723
80024267222000000006500032671721818025800101080000108000050117007802670826732267151666031671280010208000020800002673226732118002110910108000080000010800192042800191016280039613539005020216222671966280000102672326723267232672326723
8002426722200000000410100266932180112580010108000010800005011662911266832671426722166543166888001020800002080000267222672211800211091010800008000011080000039800350003580035613539005020216222671960280000102672326723267232672326726
80024267222010000005001012670720181225800101080000108000050116760502670826732267331667731671280010208000020800002673226733118002110910108000080000010800000398003500035800356157421905020216222672999280000102672326812268862673426723
800242672320001000041000126707218011258001010800001080000501174614126697267222672216667316702800102080000208000026708267221180021109101080000800000108000003980035000358003561350005020216222671966280000102672326723267232672326723
800242672220000000000101267072181812258001010800001080000501173655026697267082670816652316702800102080000208000026722267081180021109101080000800000108000003980035000080000013539005020216222671966280000102672326723267092670926723
800242672220000000000101267070181812258001010800001080000501172941026697267222670816729316702800102080000208000026722267221180021109101080000800000108000000800350003580035613539005020216222671966280000102672326723267092672326723
800242672220000010065000126707018181225800101080000108000050117269402669726722267221665231668880010208000020800002672226722118002110910108000080000010800000080035000080035003539005020216262670566280000102672326709267232672326723
8002426708200000000410001266932181812258001010800001080000501166750026697267222670816667316833800102080000208000026722267221180021109101080000800000108000000800000003580035613539005020216622671966280000102672326723267232672326723
80024267222000001102101032671721818142580010108000010800005011730320267082672226708166673167028001020800002080000267222672211800211091010800008000001080000039800350003580035603539005020216622670566280000102672326723267092672326709