Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDRSW (register, lsl)

Test 1: uops

Code:

  ldrsw x0, [x6, x7, lsl #2]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100539431004500137921212162510001000100015037039437421732521000100020003947711100110001000010004310390391039613943732162237101041000395395395395395
10043742000010137900121625100010001000150371374394217325210001000200039456111001100010000100001000039103961394373216223910001000395395395375395
10043943000451003790000251000100010001503713943941973253100010002000394561110011000100001000431039039103900390732162237110041000375395395395395
1004394311045101379312121625100010001000149890394394217325210001000200037477111001100010000100043103900103961390732162239101041000395396395395395
100439430004510137921212162510001000100015037137437421732321000100020003945611100110001000010004310390010006139437321622391101001000395395375395375
1004374300000013792121216251000100010001498913943942173232100010002000394771110011000100001000431000039103961043732162239101041000395398395375395
1004394300045000379212121625100010001000150181394394216325210001000200039477111001100010001100043103903910396139437321622391101001000395375395395395
1004394300045101379212121625100010001000149891394394217325210001000200037477111001100010000100043100003910396139437321622394101041000395395395375375
10043942011451003592121202510001000100014989139439421732521000100020003947711100110001000010004310390391039010437321622391101041000395395395395395
100439430004500135920120251000100010001503713943942173232100010002000374771110011000100001000431000001000613943732162239101041000395395395375397

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, x7, lsl #2]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0053

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700475240000000021007003269759597062540104301031000230100100006160153342062149669677004770047646463649504010030200100546020020000700473511402011009910010000301001000011001000001100001001000010100261017111698103000366610000301007004970048700487004870048
40204700475250000000011007003269735597062540104301031000130100100006160153342062149669677004770047646433649504010030200100006020020000700473511402011009910010000301001000011001000000100000001000010100261017111698103000366610000301007005070048700487003670048
40204700475250000000011007003269735597062540104301001000130100100006160153342062149669677004770047646433649504010030200100006020020000700353511402011009910010000301001000011001000001100000001000010100261017111698163000666610000301007005470054700547005470054
402047005352510000000210170038697845971225401083010610002301001000061603233417691496697370053700536464936495640100302001000060200200007005335114020110099100100003010010000110010003111000101311000011110261017111698163000666610000301007005970061700547005470054
40204700535241001000020017003869784597122540108301061000230100100006160323342350149669617005370053646493649564010030200100006020020000700533511402011009910010000301001000011001000121100020111000011112261017111698163000666610000301007005970054700577005470054
40204700535241111000021017003869790597072540104301031000130100100006160153342062149669677004770047646433649504010030200100006020020000700473511402011009910010000301001000011001000001100000001000010100261017111698103000360610000301007006170056700547005470112
40204700545251011000021017003869784597122540108301061000230100100006160323342350149669737005370053646373649564010030200100006020020000700533511402011009910010000301001000011001000221100010011000011111261017111698163000666610000301007005870059700547005470054
40204700535251010000020017003869784597012540104301061000230100100006160323342350149669737005370041646493649564010030200100006020020132700533511402011009910010000301001000011001000221100020011000011112261017111698163000666610000301007006570055700547005470054
40204700535241111000021017003869786597122540108301061000230100100006160323342350049669737004770047646433649504010030200100676020020000700473511402011009910010000301001000011001000221100010011000011110261017111698163000666610000301007008670111700557004270054
40204700535251111100021017003869784597122540108301061000230100100006160323342350149669737005370053646493649564010030200100006020020000700533511402011009910010000301001000011001000221100020211000011111261017111698163000666610000301007005570055700557005470054

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570051525110012700370036697755971325400143001310001300101000061706833422544966974700547005164748364960400103002010000600202000070051351140021109101000030010100001101000001100000010000112526147114146979830000001310000300107005570055700557003670055
40024700515251100100370020697435969525400103001310001300101000061701833414704966974700357005464759364979400103002010000600202000070054351140021109101000030010100000101000011100000010000012526167117176981730003130010000300107005570055700367003670052
400247003552511001003700396977859713254001430013100013001010000617018334239849669747005470035647933649604001030020100006002020000700543511400211091010000300101000001010000011000010100001025261771141769814300001313010000300107005270055700557005270055
4002470054524110082103700396977559695254001030013100013001010000617018334239849669747005470054647673649764001030020100006002020000700513511400211091010000300101000001010000001000000100001125261671161669817300030131010000300107005570055700367003670036
40024700355241100211037003969778597132540014300101000030010100006170183342398496697470056700546475836497940010300201000060020200007003535114002110910100003001010000110100000110000001000011252619711586981730003010010000300107003670055700557005270055
4002470054525110055103700396974359695254001430013100003001010000617068334147049669557005470054647503649814001030020100006002020000700543511400211091010000300101000001010000001000000100001125269711315698143000013131010000300107005570036700527005270055
4002470035525110025103700396974359713254001030013100003001010000616991334225449669557005470054647553649604001030020100006002020000700543511400211091010000300101000001010000001000000100001025261371151569814300001313010000300107003670055700557005270055
4002470051525110052103700206977559695254001430013100013001010000617018334225449669717003570051647843649604001030020100006002020000700513511400211091010000300101000001010000001000000100001125261671141669814300000131310000300107005570036700597005570055
4002470054525110010037003969778597132540014300101000130010100006170183342398496697470054700546475436497640010300201000060020200007005135114002110910100003001010000010100000110000001000011252611711017697983000313101010000300107003670055700367003670055
4002470054524110011037003969775596952540014300101000030010100006170183342398496697470056700356475236496040010300201000060020200007005435114002110910100003001010000010100000010000001000000252697115136981730003001310000300107005570055700557005270055

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldrsw x0, [x6, x7, lsl #2]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4020570057524101111101700426978259710254010430103100023010010000616014334225414966971700517005164647036496040100302001000060200200007005735114020110099100100003010010000010010001211000100111000010100261017111698203000610101010000301007005270052700527005270052
4020470051525000001100701456978859716254010830106100023010010000616068334254214966977700577005764653036496040100302001000060200200007005735114020110099100100003010010000010010000011000003001000010100261017111698143000610101010000301007005270052700527005270052
4020470051524000001100700426978259710254010030103100013010010000616014334225404966971700517008864647036495440100302001000060200200007005135114020110099100100003010010000010010000011000000001000010100261017111698143000610101010000301007005270052700527005270052
4020470052525000001100700426978259710254010430103100013010010000616175334225414966971700517005164647036495440100302001000060200200007005135114020110099100100003010010000010010003111000300241000011111261017111698203000610101010000301007005270052700527005270052
4020470051524000101100700426978859716254010830106100023010010000616068334176914966961700577005764653036496040100302001000060200200007004135114020110099100100003010010000010010000011000001031000011111261017111698203000610101010000301007005270052700527005270052
4020470051524000001100700426978259710254010430103100013010010000616014334225414966971700517005164647036495440100302001000060200200007005135114020110099100100003010010000010010001311000600211000011110261017111698273000610101010000301007005270052700537005270052
4020470051524000001100700426978859716254010830106100023010010000616068334254214966977700577005764653036496040100302001000060200200007005735114020110099100100003010010000010010000011000000001000010100261017111698143001810101010000301007005270052700527005270052
4020470051526000001100700426978859716254010830106100023010010000616068334254214966977700577005764653036494440100302001000060200200007005735114020110099100100003010010000010010000011000000001000010100261017111698143000610101010000301007005870058700587005870045
4020470057524111002101700426976459710254010430103100003010010000616014334225414966971700517005164647036495440100302001000060200200007005135114020110099100100003010010000010010001211000200011000011111261017111699013000610101010000301007005270052700527005270052
4020470051524000101100700426978859716254010830106100023010010000616068334254214966977700577005764653036496040100302001000060200200007005779114020110099100100003010010000010010002111000200011000011110261017111698203000610101010000301007005270052701367005770052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570051525000000013910700366977559710254001430013100023001010000616991334225400496697170051700516474836498240010300201000060020200007005135114002110910100003001010000010100001100000010000110025201712169820300061010010000300107005870058700587005870058
400247005752510110005710700366977559710254001430013100013001010000616991334225400496697170051700356476436497640010300201000060020200007005135114002110910100003001010000010100001100000010000010025201711169820300030101010000300107005270052700527005270052
4002470051524000000073107003669775597102540014300101000130010100006169913342254004966971700517005164669364976400103002010000600202000070051351140021109101000030010100000101000011000000100001100252017132698043000610101010000300107005270052700527005270052
4002470051524000000052107003669775597102540014300131000130010100006169913342254004966971700517005164781364976400103002010000600202000070051351140021109101000030010100000101000011000000100001000252037122698203000610101010000300107005270036700527008970036
4002470051524000000061107003669775597102540014300131000130010100006169913342254004966971700517005164756364976400103002010000600202000070035351140021109101000030010100000101000011000000100001100252017111698043000610101010000300107005870058700527005270269
40024700515250000000110700366977559710254001430013100013001010000616991334225400496697170051700516472836497640010300201000060020200007005135114002110910100003001010000010100001100000010000110025204711169820300031010010000300107005870058700527005270052
400247005152400000003410700366977559695254001430013100013001010000616991334225400496697170051700516475936497640010300201000060020200007005135114002110910100003001010000010100001100000310000100025202711269820300031001010000300107005870058700527005270052
400247005152500000102510700366977559710254001430013100013001010000616991334225400496697170057700786471736497640010300201000060020200007005135114002110910100003001010000010100001100000010000110025201711169820300061001010000300107005970058700527003770052
4002470163524000000001070036697755969525400143001310001300101000061699133422540049669557005170051647393649764001030020100006041020000700893511400211091010000300101000001010000110000500100001101252017122698203000610101010000300107005870052700527005270052
400247005152400001000107003669775597102540014300131000130010100006169913341470004966971700517013464670364976400103002010000600202000070051351140022109101000030010100000101000011000010100001100252027121698203000610101010000300107005270036700527003670036

Test 4: throughput

Count: 8

Code:

  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  ldrsw x0, [x6, x7, lsl #2]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526734200101100006501326718218181725801001008000010080015500115923914923653267342671416642616684801162008002420016004826734831180201100991008000010080000010080020204208006000066800413160421901115118016002674610103800001002673426735267342673426735
802042673420011000000210122671831818172580100100800001008001550011678110492410926734267331666161668580116200800242001600482673382118020110099100800001008000001008001920420800601016280041316001931115118016002673010103800001002673426734267342673426734
802042673320010010000650132671921818162580100100800001008001650011676170492365426734267331666161668580117200800242001600482673382118020110099100800001008000001008002120420800601016280041316101911115118016002673010103800001002673526734267342673426734
80204267332001100000064012267230181816258010010080000100800165001167559049236532671426733166616166858011620080024200160048267338211802011009910080000100800000100804152142968019710162800423160421901115118016002673310103800001002673426734267342673426735
80204267142001000000065012267182181820625801001008000010080016500116980514923654267332673316661616686801162008002420016004826733831180201100991008000010080000010080019194208006000262800413160421911115117016002673010103800001002673426734267342673526734
802042673320010010000650132671831801725801001008000010080016500116781114923653267332673516660616684801162008002420016004826734821180201100991008000010080000010080020194208006010162800413160421901115118016002673610103800001002673426738267342673426734
802042673320111010000650122672521818025801001008000010080016500116961104923653267342673316661616755801162008002420016004826733821180201100991008000010080000010080021204208006010161800413161421922225128123112673010103800001002673826734267662673426738
802042673320010010000650132671821800258010010080000100800195001166770149236352673326733166363016672801222008003020016006026733821180201100991008000010080000010080020194208006100162800413060421802225128123112673610103800001002673426735267342673426734
8020426733200110010006501326718218181825801001008000010080020500116710214923653267332673316651916673801212008003020016006026733821180201100991008000010080000010080020204208006210165800433160421922225128123112671210103800001002673426734267342673426734
80204267332001101000065013267182211816258010010080000100800215001167322149236532673326733166511016673801212008003020016006026734831180201100991008000010080000110080020194208006010162800413060421912225128123112673510103800001002673426734267342673426734

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526728200000004410126714001217258001010800001080534501171382114923628267292672916652316709800102080000201600002670878118002110910800001080000010800000430800421548004231424321502021161313267251111480000102670926730267302673026791
80024267122000001100012669321212140258001010800001080000501168891014923648267292672816652316709800102080000201600002670856118002110910800001080000010800000008004204280000004200050201816101326725110480000102673026709267302670926710
80024267082000000000012671301201192580010108000010800005011667500149236482673226732166733167098001020800002016000026729781180021109108000010800000108000000080042008004201000050201816101326705110480000102672926729267292672926869
8002426729200000004510026714212121112580010108000010800005011667500149236282672826728166733166888001020800002016000026728561180021109108000010800000108000004308004264880042310430050201316128267051111480000102673026709267302670926732
800242673420000000451002669321212172580010108000010800005011688910149236492672826708166523167088001020800002016000026732781180021109108000010800000108000004308004206080042300430050201116181326705011480000102673026730267302672926711
80024267142000000001012671300121725800101080520108000050116828500492364826729267281665231670980010208000020160000267297811800211091080000108000001080000000800420380042314200050201316181726726011080000102670926709267292672926711
800242673320000001451012671321212025800101080000108000050116693401492364826728267281667331668880010208000020160000267085611800211091080000108000001080000000800001980042014200050201016131026726011080000102673026709267092673026714
800242674520000000000126716212121725800101080000108000050116889101492364926729267081667331668880010208000020160000267295611800211091080000108000001080000000800420578004230424300502017161313267251111080000102673026709267302673026737
80024267412010000045100267142001725800101080000108000050116889101492364926729267281665231668880010208000020160000267297811800211091080000108000001080000043080042151800423042000502013161013267051111080000102673026730267092670926730
800242672520000000010126713212020125800101080000108000050116889101492362826728267081665231670980010208000020160000267087811800211091080000108000011080000043080042012800000142430050201316912267251111480000102672926730267092673026729