Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDP (32-bit)

Test 1: uops

Code:

  ldp w0, w1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20053943004510137921212162510001000100015014139439492312710002000100039435111001100010001000431039039103961394373316113911010410001000395396395395395
20043943004500137921212162510001000100015014039439492312710002000100039435111001100010001000431039039103961394373116113911010410001000395395395395395
20043943004500137921212162510001000100015014139439492312710002000100039435111001100010001000431039039103961394373116113911010410001000395395395395395
20043942004500137921212162510001000100015053139439492312710002000100039435111001100010001000431039039103961394373116113941010410001000395395395395395
20043943004500137921212162510001000100015065039439492312710002000100039435111001100010001000431039039103961394373116113911010410001000395395395395395
20043943004500137921212162510001000100015014039439492312710002000100039435111001100010001000431039039103961394373116113911010410001000399399399399399
200439430044001383211192510001000100015288139839896312710002000100039835111001100010001000431039238103861384473116113951414710001000399399399399399
200439830044001383211192510001000100015288139839896313110002000100039435111001100010001000431040039103861394373116113951414710001000399399399399399
200439820044101383211192510001000100015246039839896313110002000100039835111001100010001000431039039103861394373116223911410710001000395395399395395
200439431145001383211192510001000100015065039839896313110002000100039835111001100010001000431039039103961384473116113951414710001000399399399399399

Test 2: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020570047525111045310700356969859684254010030103100023010010000613915334205614966970700507005063406363710401003020020000602001000070050371140201100991001000030100100000100100000110000000100001000261036411697983000396610000401007003670051700487004870051
5020470050524010010300700206971459684254010430103100003010010000613652334205604966970700507003563388363710401003020020000602001000070050371140201100991001000030100100000100100000110000000100001100263616411697983000090910000401007008870036700517005170100
5020470050525000024410702206969859684254010430100100013010010000613756334190604966970700507003563400363695401003020020000605841000070047371140201100991001000030100100000100100000110000100100001000261016411698133000390910000401007004870036700517005170051
502047003552400007310700356971459700254010430100100013010010000613915334205614966970700357005063403363707401003020020000602001000070050372140201100991001000030100100000100100000110000000100000100261016411698103000360010000401007004870051700367005170051
502047003552400007310700206971459700254010430103100013010010000613915334130404966970700507003563502363695401003020020000602001000070035371140201100991001000030100100000100100000110000000100001100261016411697983000306910000401007005170051700517005170051
502047003552500006700700356969859700254010430100100003010010000613652334205604966970700357005063388363710401003020020000602001000070050371140201100991001000030100100000100100000010000000100001100261016411698133000096910000401007005170056700517005170051
5020470050524000031610700236971459700254010430103100013010010000613915334130404966970700507004763400363707401003020020000602001000070050373140201100991001000030100100000100100000110000126100001100261016411697983000096010000401007005270051700367005170036
502047003552500001010700366971559684254010430103100013010010000613924334205604966968700507004763418363707401003020020000602001000070035371140201100991001000030100100000100100000110000603100001000261016411698003000096610000401007004870051700577006170053
5020470035524001037500700356971459694254010430103100013010010000613652334190604966955700507005063403363707401003020020000602001000070050371140201100991001000030100100000100100000110000000100000000261016411698133000396910000401007005170051700517005170051
5020470047525010015310700326971159694254010030103100013010010000613756334130404966967700357004763400363707401003020020000602001000070035371140201100991001000030100100001100100000110000000100001100261016411698103000396910000401007003670051700517003670051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570117525000000401000700206971959689254001030013100013001010000615126334189904966955700507007763447363718400103002020000600201000070050371140021109101000030010100000101000001100000010000110252019911698513000396610000400107005170051700367005170051
5002470050524010000101000700326971459677254001430013100013001010000614994334204604966970701287012463438363718400103002020000600201000070050371140021109101000030010100000101000001100000010000010252017811698173000306010000400107005170051700517005170036
5002470050525000000001000700356971459692254001430010100013001010000615126334204604966970701467010563406363718400103002020000600201000070050371140021109101000030010100000101000001100000010000100252029921698173000390610000400107005170051700517015170036
5002470050525000000100000700206971959692254001030013100013001010000614994334189914966970700667004763403363682400103002020000600201000070035371140021109101000030010100000101000000100000010000110252017811698013000396010000400107003670036700517005170036
5002470050524000000101000700206971459692254001430013100013001010000614994334204614966955701437009163420363718400103002020000600201000070050371140021109101000030010100000101000000100000010000100252017811698173000360610000400107005170036700517005170051
5002470035525000000601000700206971459677254001430010100013001010000614994334129524966970701137011063404363718400103002020000600201000070051371140021109101000030010100000101000001100000310000000252017812698013000099910000400107005270048700367005170051
5002470035525000000100000700356971959692254001430013100013001010000615126334129504966970700707005163418363718400103002020000600201000070050371140021109101000030010100000101000000100000010000010252029911698183000390010000400107003670048700487005170036
5002470035525000000101000700356971459692254001030013100003001010000614994334204604966970700897004663415363718400103002020000600201000070050371140021109101000030010100000101000001100000310000000252019921698173000360610000400107005170048700517005170051
5002470050525000000100000700326971859693254001030013100013001010000615126334204614966970701367009963431363718400103002020000600201000070035371140021109101000030010100000101000000100000010000110252017811698173000399010000400107005170051700367003670051
5002470050524000000101000700356971959692254001030013100003001010000615120334189904966967701207008163419363682400103002020000600201000070073371140021109101000030010100000101000001100000010000110252019912698173000390910000400107005170051700517005170051

Test 3: Latency 2->3 (with chain penalty)

Chain cycles: 3

Code:

  ldp w0, w1, [x6]
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205700515241000001010700396971959700254010430103100023010010000613652334210404966955700367005163407363714401003020020000602001000070035371140201100991001000030100100000100100000100000001000011026102641169814300030101010000401007003670052700527005270052
50204700355250000001010700366971959703254010430103100013010010000613888334130404966981700547005163388363714401003020020000602001000070051371140201100991001000030100100000100100001100000001000010026101641169814300031010010000401007005270052700527005270052
50204700545250000000010700396969859703254010030100100013010010000613930334210414966971700517005163388363711401003020020000602001000070052371140201100991001000030100100000100100001100000001000000026101641169821300031310010000401007005570036700567005370055
50204700515250000001000700366969859703254010430100100013026210000613930334210404966972700357005163404363711401003020020000602001000070051371140201100991001000030100100000100100000100000001000011026101641169814300031301310000401007005270052700557005570055
5020470051524000000600070036697155972325401043010310000301001000061393033413041496695570056700546339336374340100302002000060200100007005137114020110099100100003010010000010010000110000000100000102610164116981430000001010000401007005270052700527003670052
50204700355250000000000700366971959684254010430103100013010010000613888334130404966986700547005463407363714401003020020000602001000070051371140201100991001000030100100000100100001100000001000011026101641169798300030101310000401007005570052700557005570055
50204700545250000000000700396969859700254010030103100003010010000613652334210404966972700547005163407363695401003020020000602001000070035371140201100991001000030100100000100100001100000001000010026101641169815300030131010000401007003670055700367005570055
50204700545240000001000700396969859703254010430103100013010010000613888334130414966976700547005463407363714401003020020000602001000070035371140201100991001000030100100000100100000100001001000011026101641169814300031313010000401007005270052700527005270055
50204700355240000001010700796970059684254010430103100013010010000615263334623204966974700357005463407363714401003020020000602001000070054371140201100991001000030100100000100100001100000001000011026101641169798300030101310000401007003670052700527005270052
502047003552400110010107003669719596842540104301031000030100100006136523342104149673487014870056633883637144010030200200006020010000700513711402011009910010000301001000001001000011000000010000010261016411698173000310101310000401007005570055700557005570036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5002570051525000011001010700366971959769254001430013100013001010000614994334225449669567005270036634193637194001030020200006002010000700513711400211091010000300101000001010000011001400001000011025201641169818300031310010000400107005270055700527005570036
50024700635270000100410007007669719596932540014300101000130010100006151453342095496697270051700516340336371940010300202000061324100007005137114002110910100003001010000010100002110000000010000110252019911698183000913131310000400107005570055700557003670036
5002470055525000010005235210703606971959707254001430013100013001010204615481334225449669717005170051634033637364001030020200006002010000700353711400211091010000300101000001010000011000000001000011025201642269818300031301310000400107003670055700577003670056
50024700515250000110000007003969715596782540014300131000130010100006154813341295496697170054700516342236373640010300202000060020100007005137114002110910100003001010000010100000010000000310000010252011001169817300131001010000400107005570036700527005570052
5002470127525000010010010700396971559693254001430013100013001010051614994334432249669717005170035634223637194001030020200006002010000700513711400211091010000300101000001010000001000000001000010025201641169818300031313010000400107005570052700367005570036
5002470054524000010001010700206971559693254001030013100003001010000615145334129549669777005770035634223636824001030020200006002010000700543711400211091010000300101000001010000011000000001000010025201991169818300031301310000400107005270055700557005570055
5002470054524000010000010700206996259707254001430013100013001010000615481334225449669727005470054634223637364001030020200006002010000700513711400211091010000300101000001010000011000005001000010025201641169803300030131310000400107005570052700527003670036
5002470035524000011001010700396972559709254001030010100013001010000615481334225449669717005170054634033637194001030020200006002010000700513711400211091010000300101000001010000111000000001000011025201781169817300031010010000400107006470055700367003670055
5002470056525000010001000700396971959707254001430013100003001010000615481334129549669747005470054634223636824078730020200006002010000700513711400211091010000300101000001010000001000303001000011025201642169817300001310010000400107005570055700557005570055
50024700395250000110000007023169719598512540014300131000330010100006154813341295496697170054700516342236371940010300202000060020100007005137114002110910100003001010000110100000010000030201000011025201641169817300000131310000400107003670055700367005570052

Test 4: throughput

Count: 8

Code:

  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  ldp w0, w1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602052672920011111166103267210070258010010080000100800005001169281149236272670726707665036685801002001600002008000026824351180201100990100800001008000001008000004308003900398003960580191511011611267111313580000801002673726737267372673726737
16020426736200110000661022672130720258010010080000100800005001173690149236562673626714663736694801002001600002008000026714351180201100990100800001008000001008002020430800190110178003961584319051101161126711130580000801002673726737267172673826715
160204267362021111002100326721077025801001008000010080000500116875314923634267362673666583669480100200160000200800002671435118020110099010080000100800000100800192043080058002180000015843191511011611267331313580000801002673726737267372673726737
16020426736201100100211032672100702580100100800001008000050011712511492365626736267366659366948010020016000020080000267373511802011009911008000010080000010080019200080019116180040601943192511011611267331313580000801002673726737267152673726715
1602042673620011110021002267210702025801001008000010080000500117518414923634267362673666583669480100200160000200800002671435118020110099010080000100800000100800191943080019102180039005943190511011611267111313080000801002673726737267152673726737
16020426714200100100670022669930719258010010080000100800005001174381149236342671426714665936672801002001600002008000026736351180201100990100800001008000001008001919430800591121800406119019151101161126711013580000801002673726715267372673726715
160204267362001111006710326721377202580100100800001008000050011684851492363526736267366659366948010020016000020080000267143511802011009901008000010080000010080020190080059116080040611945192511011611267331313580000801002673726737267372673726737
1602042671420010110067002267210772125801001008000010080000500116789214923656267362673666593669480100200160000200800002673635118020110099010080000100800000100800192043080149006180039005945190513811611267331313580000801002673726737269722681426737
16020426714201100100670032669937719258010010080000100800005001175341149236562671426736665936694801002001600002008000026726351180201100990100800001008000001008001919430800590121800396019019151101161126733130580000801002671526737267162673726737
160204267142001001006600126699070192580100100800001008000050011765421492365626736267366659366728010020016000020080000267363511802011009901008000010080000110080019204308005800618004061190191511011611267331313580000801002673726737267152673726737

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600252673520111100650003267172181817258001010800001080000501170443014923642267222672266683670280010201600002080000267223511800211091080000108000001080000398003513580035613539050200916171726719066080000800102672326723267232672326723
16002426727200000004100002669221818122580010108000010800005011689350149236422672226722666836702800102016000020800002670735118002110910800001080000010800003980035041800350100050200171661726719066280000800102670826723267082672326723
16002426734200000004101012670721818025800101080000108000050117540901492364226722267226668367028001020160000208000026722351180021109108000010800000108000039801300358003561353905020081661726704066280000800102672326723267232672326723
1600242672820100000001012670721818122580010108000010800005011733431149236312720026723666836702800102016000020800002672735118002110910800001080000010800003980000011080035613639050200171615626719060280000800102672326723267232672326708
16002426798201000004100012669221818122580010108000010800005011701070149236422670726722666836702800102016000020800002672235118002110910800001080000010800003980035138800350103905020161681726719066280000800102672326723267232670826723
160024267322000000000100267070181812258001010800001080000501170107004923642267222672266683670280010201600002080000267223511800211091080000108000001080000398003511288003501350050200171615626719066280000800102672326723267232672326723
1600242687320000000410101267072181812258001010800001080000501170107004923642267222670766683670280010201600002080000267223511800211091080000108000001080000398000011378003561039050200171681726704066280000800102672326723267232672326723
160024267322000000041010126707218181225800101080000108000050117881511492364226722267226668367028001020160000208000026707351180021109108000010800000108000039800350358003561350050200171661726719066280000800102672326723267082672326723
16002426732200000004101012669421818025800101080000108000050117039501492364226722267226668367028001020160000208000026722351180021109108000010800000108000039800000418000061353905020071617626719066280000800102672326723267232672326723
16002426727200000004101012670701801225800101080000108000050117049111492364226722267226668367028001020160000208000026722351180021109108000010800000108000039800000388003561363905020061617626719066280000800102672326723267232672326723