Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, sxtw, 32-bit)

Test 1: uops

Code:

  ldr w0, [x6, w7, sxtw]

Initialization:

  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int load (95) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
10053992111012110238321818152510001000100015421139938420432571000100020003818111100110001000010202142105710059103760574219073216223969921000400400400400399
10043993101006500338400180251000100010001453303993982223240100010002000399811110011000100001020214210190005910380157019073216223799921000400418382400399
10043993111002110138420180251000100010001533403993982213257100010002000399811110011000100001020190105710258103861574219073216224009921000400382382400403
100439921100021102384218180251000100010001445613993982043239100010002000399821110011000100001019200105810162100000574219173216223969901000382382400399399
100439931110021103384218180251000100010001439913983992043260100010002000399641110011000100001020210105710021103961564219173216223969901000400400399399399
10043993101006510238411818162510001000100015349039838122132561000100020003996411100110001000010202042101900059100061574219073216223789921000399399399399400
10043823110006500338421818162510001000100015323139939922232571000100020003986411100110001000010212042105600021100061564219073216223790901000382383383383387
100439931000021002384018180251000100010001537503993992213257100010002000382821110011000100001019194210571016210386157019273216223780901000382399382400385
10044063111006510236600180251000100010001531803993992043256100010002000399811110011000100001020190101910121103861194219173216223789921000400382399383383
100439921010065101366218002510001000100014456139938222132571000100020003996411100110001000110202042105710059103860574219273216223780021000382400400400400

Test 2: Latency 1->2 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8

Initialization:

  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 67 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700475240111100700326976459709254010430103100013010010000616005334220600496695507005070047646463649504010030200100006020020000700473511402011009910010000301001000001001000001000000100001100261127132698133000396610000301007004870036700517005170048
40204700355250000100700326978159709254010430103100013010010000616005334220600496696707005070050646463649534010030200100006020020000700353511402011009910010000301001000001001000001000000100001100261137133698103000306910000301007004870051700517005170051
402047004752500001310700356976459695254010030103100013010010000616175334147000496696707005070050646463649504010030200100006020020000700353511402011009910010000301001000001001000011000000100001100261137123698133000306610000301007005170051700517005170048
40204700355250000100700326978159709254010430103100003010010000616175334220600496696707003570035646433649534010030200100006020020000700503511402011009910010000301001000001001000011000000100000100261137133698133000306010000301007005170036700517005170036
40204700505240000100700326976459709254010430103100013010010000616005334147000496696707004770047646463649384010030200100006020020000700473511402011009910010000301001000001001000001000000100001000261127123697983000306910000301007005170051700487003770090
40204700505250000100700326978159695254010030103100013010010000616005334206200496695507003570035646433649534010030200100006020020000700473511402011009910010000301001000001001000001000000100001100261127133697983000096610000301007004870048700487004870051
40204700475241000100700326978159709254010430100100003010010000616005334220600496695507003570050646463649504010030200100006020020000700503511402011009910010000301001000001001000001000000100001100261137133697983000006610000301007005170051700487003670051
40204700355240000100700206976459706254010030103100013010010000616175334220600496695507004770047646463649504031030200100006020020000700353511402011009910010000301001000001001000011000000100001100261137132698103000300610000301007004870051700517005170048
40204700495250011100700326976759709254010430103100003010010000616005334220600496695507003570047646433649504010030200100006020020000700473511402011009910010000301001000001001000001000000100001100261137133697983000099610000301007005170051700487005170051
40204700505250000001701176978159706254010430103100003010010000616005334206200496697007003570050646313649384010030200100006020020000700503511402011009910010000301001000001001000011000000100001100261127132698103000396910000301007005170036700517009270036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0050

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40025700475240000000010100700326976059709254001430010100013001010058616952334206204966970700477005064670364975400103002010000600202000070035351140021109101000030010100000101000000100000000100001100252027111698363000360910000300107005170048700367003670051
40024700505250000000000100700356976059709254001030013100013001010000616982334220604966970700477005064653364960400103021510000600202000070050351140021109101000030010100000101000001100000000100000100252017111698173000366910000300107005370048700517005170048
40024700505240000000010100700206976359709254001430013100003001010000617027334220604966970700357004764653364975400103002010000600202000070050351140021109101000030010100000101000001100000000100001100252017111698293000099010000300107005170051700367004870051
40024700475240000000010100700356976059709254001430013100013001010000617027334147004966970700357005064665364960400103002010000600202000070035351140021109101000030010100000101000001100000100100001000252017111698483000306010000300107005170048700367003670036
40024700355250000000010100700356973059709254001430010100013001010000616952334220604966967700477005064665364960400103002010000600202000070052351140021109101000030010100000101000001100000100100001100252007112698063000300910000300107003670036700517003670051
40024700505250000000000000700326976059706254001430013100013001010000616982334147004966967700507004764665364962400103002010000600202000070035351140021109101000030010100000101000000100000000100001100252017111698483000360010000300107004870051700567005170048
40024700505250000000060000700356974359706254001430010100003001010000617068334206204966975700507005064653364960400103002010000600202000070050351140021109101000030010100000101000001100000000100001001252017111698653000399910000300107005170048700517004870036
40024700475250000000010100700356974359706254001430010100013001010000616982334220604966970700507005064665364960400103002010000600202000070050351140021109101000030010100000101000000100000100100001000252017111698463000390910000300107003670036700367003670036
40024700505240000000010000700356974359695254001030010100013001010000616982334220609866955700507005064668364972400103002010000600202000070050351140021109101000030010100000101000001100000000100000100252017111698173000000910000300107005170051700517003670051
40024700505250000000060100700356977959709254001430013100003001010000616952334147004966970700357004764653364975400103002010000600202000070050351140021109101000030010100000101000001100000100100000100252017111698603000390010000300107005170051700517003670051

Test 3: Latency 1->3 (with chain penalty)

Chain cycles: 3

Code:

  ldr w0, [x6, w7, sxtw]
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8

Initialization:

  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0054

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40205700575250000000010070045697825971325401043010310001301001000061601433423981496697470054700546465036493840100302001000060200200007003535114020110099100100003010010000010010000011000000001000011026101711169798300031301310000301007005570055700527005270036
40204700515250000001000070049697825971325401043010310000301001000061601433423981496697470051700566465036495740100302001000060200200007012535114020110099100100003010010000010010000011000001001000011026101711169817300030101010000301007005570055700557005570088
402047005452500001200000700366978559713254010030103100013010010000616041334225414966974700357005464650364957401003020010000602002000070054351140201100991001000030100100000100100000110000000010000110261017111698173000313131310000301007003670055700527005270055
402047005452501000000000700396978259713254010430103100013010010000616014334239814966974700547005464650364957401003020010000602002000070054351140201100991001000030100100000100100000110000000010000110261017111697983000313131310000301007005570055700557005570036
402047005452500000010000700396978259713254010430103100013010010000616041334239814967009700547005964631364938401003020010000602002000070054351140201100991001000030100100000100100000110000000010000110261017111698173000310101010000301007005570055700527005570055
40204700515250000001000070043697645971325401043010310001301001000061604133426381496697470054700546465036495740100302001000060200200007005435114020110099100100003010010000010010000001000000001000001026101711169814300031310010000301007005570036700557005570055
402047005452500000010100700826978259695254010430103100013010010000616014334239814966974700547005464650364957401003020010000602002000070054351140201100991001000030100100000100100000110000000010000110261011711697983005910101310000301007005670055701457022970055
402047005452500000010000700396978559713254010430100100013010010000616014334239804966979700547005464650364957401003020010000602002000070054351140201100991001000030100100000100100000110000000310000110261017111698173000313131310000301007005570055700367005570055
402047005452400000010000700396978559714254010430103100013010010000616041334239814966974700517005464647364957401003020010000602002000070051351140201100991001000030100100000100100000110000000010000110261017111698733000310101010000301007003670100700627005270052
402047005152500000010010700206978559695254010430103100003010010000616014334239814966974700547005464650364957401003020010000602002000070054351140201100991001000030100100000100100000110000000010000110261017111698173000310101310000301007005570055700557005270052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 4.0051

retire uop (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4002570051525000010070036697785971025400103001310001300101000061701833423980049669710700547005464672364979400103002010000600202000070035351140021109101000030010100000101000011000000100001102520002711169798300001310010000300107003670052700557003670055
40024700545250000110700396974359713254001430013100013001010000617018334239800496697407003570035646723649604001030020100006002020000700543511400211091010000300101000001010000110000001000010025200017111698173000313131010000300107005270052700557005570052
400247003552500001107003969778597102540010300101000030010100006170183342398004966955070054700356466936497640010300201000060020200007005435114002110910100003001010000010100001100000232610000110252000171116979830003130010000300107005570058700387003670036
40024700355240000110700396977859713254001430013100013001010000617018334239800496697407005470054646723650224001030020100006002020000700593511400211091010000300101000001010000110000001000011025205127111698173000000010000300107005270036700557005570036
4002470035525000011070020697755971325400143002110001300101000061699133422540049669550700547005164653365037400103002010000600202000070054351140021109101000030010100000101000011000000100001102520001712169817300031401310000300107005570036700527005570036
40024700545240000010700396977859713254001430013100003001010000616991334147000496695507005470054646723649604001030020100006002020000700593511400211091010000300101000001010000110000001000011025200017111697983000310101310000300107005270052700527003670036
40024700515241000110700366977859713254001430013100013001010000617018334147000496695507005170051646723650174001030020100006002020000700353511400211091010000300101000001010000110000001000011025200017112700403000310131310000300107005270036700527003670036
4002470051525000010070020697755969525400103001310001300101000061706833423980049669740700547003564653364963402053002010000600202000070051351140021109101000030010100000101000011000300100001102520511711169817300031001310000300107005270052700367003670055
4002470035525000000070039697755969525400143001310001300101000061701833414701549669550700547005464672364960400103002010000600202000070054351140021109101000030010100000101000001000000100001002520511711169817300031301310000300107005270055700367003670052
40024700515240000100700206977559710254001430010100003001010000617068334239800496697107005470054646533649764001030020100006002020000700543511400211091010000300101000001010000010000001000010025200017111698173000013101310000300107003670036700557005570036

Test 4: throughput

Count: 8

Code:

  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]
  ldr w0, [x6, w7, sxtw]

Initialization:

  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526737200010100000210126722212016258010010080000100800155001167628049236342673626737166426166888011520080024200160048267366411802011009910080000100800000100800202043800591006180000616043191111511811611267331007800001002673226732267322673226732
8020426707200010100000440126877211192580100100800001008001550011605200492363426736267361666361668980116200800242001600482671485118020110099100800001008000001008000004380039000388003861394400111511811611267280147800001002673226732267282673226708
80204267312000101000004401268930101925801001008000010080013500116679504923656267142673616642616666801162008002420016004826737851180201100991008000010080000010080000043800380000800386100001115118116112672810100800001002671726739267342674126732
80204267072000101000000012669821119258010010080000100800155001167215049236342671426714166646166888011320080024200160048267148511802011009910080000100800000100800000438003801138800406004400111511811611267281407800001002673226732267282673226732
802042673120001010000044012687621002580100100800001008001550011673030492365126727267311663561665980115200800242001600482673477118020110099100800001008000001008000004380000000080038615444001115118116112670414144800001002673226708267082673226708
802042670720001010000000126716201202580100100800001008001550011699490492365626737267361666461668880115200800242001600482673785118020110099100800001008000011008000004380038000080039013844001115118116112670410104800001002670826732267082673226708
802042673120001010000069002688521212192580100100800001008001550011699490492365626714267361666461666680113200800242001600482673764118020110099100800001008000001008000000800000003980038013844001115118116112672814144800001002673226732267322670826732
802042673120001010000044002673721116258010010080000100800165001167377049236572673626736166426166658011220080024200160048267368611802011009910080000100800000100800000080039000080038013844001115118116112672810140800001002673226737267152673226732
8020426731200010100000440126848201192580100100800001008001550011672150492363426736267371666461668880115200800242001600482673664118020110099100800001008000011008000004380038000380039613844001115118116112672814104800001002672826728267282672826732
8020426731200010100000440126900201216258010010080000100800155001167215049236562673726736166646166888011520080024200160048267148511802011009910080000100800001100800000438003800038800380138000111511811611267040140800001002672826732267282672826732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267232000010315000226693212120258001010800001080000501172240149236282672726708166673167838001020800002016000026722721180021109108000010800001108000003980000003580039613539502001816014162670506080000102672826728267122672326728
800242670820000003000102267120121212258001010800001080000501168843149236422670826727166723167138001020800002016000026708711180021109108000010800000108000003980000000800356135395020017160171726724106480000102670926723267292672826709
80024267222000000381000026693212002580010108000010800005011668860492364226728267281667231670780010208000020160000267287111800211091080000108000001080000039800350008003961394350200171601716267251010480000102670926723267232672826723
800242672820000002730100267122018162580010108000010800005011722401492362826708267271667231670880010208000020160000267277111800211091080000108000001080000008003500398003961043502001716091726724010480000102670926728267232672826729
8002426728200000031201022671321212162580010108000010800005011668860982362826728267081667231678680010208000020160000267277111800211091080000108000001080000039800350035800006139050200141601714267051010480000102672926728267232670926729
8002426722200000028101022670720122258001010800001080000501166750049236282672726727166523167098001020800002016000026727561180021109108000010800000108000003980000003580000613539502001716017172670500080000102672926729267092672826728
800242670820000002820100266932120162580010108000010800005011667500492364226722267221667231671580010208000020160000267275611800211091080000108000001080000039800000008000061353950200916017142670566080000102672326723267232672326723
8002426708200000030601002671220121225800101080000108000050116675004923647267282670816672316693800102080000201600002672756118002110910800001080000010800000080000000800366135435020017160151726724106080000102670926728267092672926728
80024267272000010311000226712218121625800101080000108000050116884304923642267222672216667316712800102080000201600002672756118002110910800001080000010800000080039003980039600050200171601317267281010480000102672326728267092670926729
8002426708200000145010026712212121625800101080000108000050116675004923645267222672316667316702800102080000201600002672571118002110910800001080000010800000398003500080000613539502001716017172671966280000102672326723267092672326723