Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, lsl, D)

Test 1: uops

Code:

  ldr d0, [x6, x7, lsl #3]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | 92 | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
100540430000116710238937102125100010001000155721379404404226326210001000200038240411100101000100001000431042042100030614519731161140301461000404382382405382
1004403211110167001380000202510001000100015315137039937421832571000100020003993951110010100010000100043104104110003142440731161140314061000405383405405405
10044043101000670013590110251000100010001406013493993992183257100010002000399374111001010001000010004310410010413042440731161140101461000405405382405382
10044033100100670003840101725100010001000153251349374399222323210001000200039937411100101000100001000441042041104100424407311611401141461000406382405405382
10044033111000661013842112202510001000100015322137039939919732571000100020003743951110010100010000100001041041104101044073116113710071000375400400400400
10043743000000440013592012025100010001000153151370399374218325710001000200037439911100101000100001000431042141104131044073116114010061000382405383382405
1004404210110021001384200172510001000100015037137439940222232571000100020003743991110010100010000100044104104410000142440731161140114061000382404405387405
100438131101102100038421120251000100010001532213743993991973232100010002000399374111001010001000010004310000010000041007311611401141461000405405405382405
100440531111002100138907720251000100010001557213793813812043262100010002000381403111001010001000010000104104110410004307311611379141461000383405405405405
100440431111006601138501212202510001000100015037134939939919732571000100020003993951110010100010000100043104104110413142007311611401141461000405404383426405

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6, x7, lsl #3]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0055

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005289901100601012004011949310946525601034010210000100003010010000100001078862573642861344611120011012005512005511315031136725010030200100001000060200200001000012003512005211502011009910040100100001000001001000011000000010000110032100110711119662400021411131000040100120056120056120056120053120056
502041200558990000000101200201194931094682560103401001000410000301001000010000107904457354556136521112003101200351200521131503113658501003020010000100006020020000100001200551200521150201100991004010010000100000100100001100000001000011003240011071111966240000140131000040100120056120056120053120056120056
502041200528990000010101200201195141094492560100401021000010000301001000010000107901757364286134461112002801200351200551131413113669501003020010000100006020020000100001200551200521150201100991004010010000100000100100001100000001000011003210011011111965940002014131000040100120056120070120056120056120056
502041200558990000000101200401195111094652560103401021000010001301001000010000107901757362846136368112003101200551200351131503113669501003020010000100006020020000100001200551200521150201100991004010010000100000100100001100000001000011003210011011111965140002014131000040100120056120056120036120056120056
5020412003589900000101012004011949310946825601034010210001100003010010000100001078862573628461344611120031012005512005211315031136725010030200100001000060200200001000012003512005211502011009910040100100001000001001000001000000710000110032100210711119646400021411101000040100120056120056120053120053120056
5020412003589900000101012002011951410946825601034010210001100003010010000100001078862573545561365211120011012005512003511315031136725010030200100001000060200200001000012005512005211502011009910040100100001000001001000011000000010000010032100110111119662400001414131000040100120036120036120056120056120056
50204120052899000001010120020119514109468256010340102100011000030100100001000010790445736284613652111200310120055120035113155311366950100302001000010000602002000010000120055120035115020110099100401001000010000010010000110000000100001100332701107111196594000201401000040100120056120056120056120056120036
5020412003589900000101012004011951110947325601064010010001100003010010000100001078862573642861363681120031012005512003511314731136585010030200100001000060200200001000012005512005211502011009910040100100001000001001000011000000010000110032100110711119649400001411101000040100120056120056120039120036120086
502041200358990000010001200201194931094492560103401021000110000301001000010000107904457364286136521112003101200551200551131503113675501003020010000100006020020000100001200521200351150201100991004010010000100000100100001100000001000010003210011071111966240002011131000040100120036120056120053120036120056
5020412003589900000100112002011951410946825601004010210001100003010010000100001079044573642861363681120028012005512005511315031136695010030200100001000060200200001000012003512010011502011009910040100100001000011001000011000000010000110032100110711119646400001411131000040100120056120036120053120056120056

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0055

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200589000001111012002011951310946525600134001210001100003001010000100001079566573642861327570120028120035120052113170031136745001030020100001000060020200001000012003512009311500211091040010100001000001010000011000000100001010003140210733119670400021411131000040010120036120036120056120056120056
500251200358990000000012004011949210946825600134001210001100003001010000100001079566573642861338660120028120052120055113173031136945001030020100001000060020200001000012012112005511500211091040010100001000011010000011000020100001010003140210734119670400001111131000040010120056120036120053120056120056
50024120035899000001001200401195131094492560013400121000010000300101000010000107956657362846132757012003112005512003511317303113691500103002010000100006002020000100001200551200521150022109104001010000100000101000000100000010000101000314031073311967040000140131000040010120089120041120036120036120036
5002412006489900000010120040119510109468256001340012100001000030010100001000010795175735455613371301200281200551200351131730311369150010300201000010000600202000010000120055120052115002110910400101000010000010100000110000001000010100031403107321196504000200131000040010120056120036120056120069120102
500241200358990000011012004011951310946825600134001010000100003001010000100001079517573545561337130120028120035120052113170031136915001030020100001000060020200001000012003512005211500211091040010100001000001010000011000000100000010003140210723119667400021414101000040010120056120036120056120036120036
500241200558990000001012004011951310946825600134001210001100003001010000100001079593573642861327570120011120052120055113173031136945001030020100001000060020200001000012005212005211500211091040010100001000001010000011000000100001010003140410733119650400021414131000040010120056120056120036120053120056
500241200558990000011012002511951310946825600134001010001100003016210000100001079566573628461327570120011120035120052113173031136745001030020100001000060020200001000012005512005211500211091040010100001000001010000001000000100000000003140310723119670400021411131000040010120056120036120056120036120036
50024120055899000001101200401195101094492560010400101000110000300101000010000107959357354556133713012003112005512003511315303113674500103002010000100006002020000100001200351200521150021109104001010000100000101000001100001010000101000319731071411974940000140101000040010120037120054120036120144120136
500241202559000000011012004111951310946825600134001010001100003001010000100001079593573545561337130120031120058120055113173031136945001030020100001000060020200001000012005812005211500211091040010100001000001010000011000017253100031010003140310743119670400021414101000040010120053120054120053120053120056
500241200578990001001012004011949210946925600104001210000100003001010000100001079517573545561338660120092120035120055113153031136795025430020100001000060020200001000012005512005211500211091040010100001000011010000011000013100001010003140310742119670400021411101000040010120056120056120053120053120041

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr d0, [x6, x7, lsl #3]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0052

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020512005889900001100101012003711951110946525601034010210000100003010010000100001079017573628461387431120011012005212005211314731136695010030200100001000060200200001000012005212005211502011009910040100100001000001001000001100000001000011003210110711119659400021111101000040100120053120053120053120053120053
502041200528990000001110101200201194931094492560103401021000110000301001000010000107888957364286148780112002801200521200521131413113669501003020010000100006020020000100001200521200521150201100991004010010000100000100100000110000000100000000321011071111965940002011101000040100120053120053120053120053120053
502041200528990000000060101201381194931094652560103401001000110000301001000010000107901757362846139374012001101200351200351131473113658501003084810000100006020020000100001200351200521150201100991004010010000100000100100000110000003100000000321011071111965940002111101000040100120053120056120054120056120053
5020412005289900001140101012003711949310946525601034010210001100003010010000100001079017573628461376530120028012005212005211314731136695010030200100001000060200200001000012005212009611502011009910040100100001000001001000001100000001000010003269110711119646400021111101000040100120053120053120036120053120054
5020412005289900001000101012003711951110946525601034010210000100003010010000100001079017573974061363370120028012005212005211314131136695010030200100001000060580200001000012005212005211502011009910040100100001000001001000001100000001000011203210110111119646400021111101000040100120053120053120036120053120053
50204120035900000000001300112003711951110946525601034010210001100003067610000100001079017573628461373480120028012005212005211314731136705010030200100001000060200200001000012003512005211502011009910040100100001000001001000021100000001000011003210110111119659400021111101000040100120053120053120053120053120053
5020412005289900000000100112003711951110944925601034010210001100003010010000100001079017573628461391130120011012005212005211314731136695010030200100001000060200200001000012005212005211502011009910040100100001000001001000001100001001000000003210110111119659400001111101000040100120053120053120053120053120036
502041200528991000000010001200371195111094652560103401021000010000301001000010000107886257362846138990012002801200521200521131473113669501003020010000100006020020000100001200351200521150201100991004010010000100000100100000110000000100001000321011011111965940000110101000040100120036120053120053120053120053
502041200528990000101129200112003711949410946725601424010210000100043010010207100001079017574238561399640120028012003512005211314731136695010030200100001000060200200001000012003512005511502011009910040100100001000001001000001100000001000010003210110111119659400021111101000040100120053120053120053120038120059
502041200529010011000531628001120037119511109467256010340102100011000030100100001000010790175736332613719701200280120054120052113154311366950100302001005510000602002000010053120036120152115020110099100401001000010000010010042011004402125678100471100418511071112063540239111101000040100123251122560123897123681123345

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0055

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200528990100000000012003711949210946825600104001010001100003001010000100001079566573642861327570120034120035120055113170311369150010300201000010000600202000010063120052120035115002110910400101000010000010100000100000001000010003140610711119650400021111101000040010120083120056120056120056120053
5002412005589901011003700012002011951310946525600134001210001100003001010000100001079517573642861327570120011120055120052113173311369450010302091000010000600202000010000120055120052115002110910400101000010000010100001100000001000011003140110711119650400020001000040010120056120056120056120036120036
5002412003589901000000000120037119510109465256001340010100011000030010100001000010795665736428613371311200111200551200351131533113694500103002010000100006002020000100001200521200521150021109104001010000100000101000011000000010000100032831107111196674000000131000040010120056120056120056120056120056
50024120035899000100010001200371194921094682560013400101000010000300101000010000107951757354556133713012003112005512005511315331136945001030020100001000060020200001000012003512005211500211091040010100001000001010000110000003100000000314011071111966740002141401000040010120056120056120053120036120056
5002412003589900011001000120020119513109468256001040012100011000030010100621000010796115736284613275701200311200551200551131533113674500103002010000100006002020000100001200551200353150021109104001010000100000101000001000000310000100031401107111196704000014001000040010120036120053120053120053120056
500241200528990001000001012002011949210946825600134001210000100003001010000100001079566573545561327571120031120052120035113173311369450010300201000010000600202000010000120055120052115002110910400101000010000010100001100000001000011003140110711119670400021411131000040010120056120056120053120036120056
5002412003589900001001000120020119594109468256001340010100011000030152100001000010815715737948613615511200311225141227191139323114719563213196010000100006002020000100001200521200521150021109104001010000100000101000011000000121000010003140110711119667400001414131000040010120053120036120056120056120036
500241200558990001100100012004011951010946825600104001210000100003001010000100001079593573545561338660120028120052120055113153311369150284300201000010000600202000010000120052120052115002110910400101000010000010100000100000031000011003140110711119670400020001000040010120036120036120053120056120056
50024120055899000000000001200401195131094492560010400121000010000300101000010000107956657362846132757012003212005512005211317331136915001030020100001000060020200001000012005512005211500211091040010100001000001010000010000000100001100314011071111967040002110131000040010120053120036120053120056120056
500241200528990001100100012004011951310946525600134001210001100003001010000100561079593573545561327570120011120035120052113153311369450010300201000010000600202000010000120035120035115002110910400101000010000010100001100001001000211003140110711119667400001414131000040010120056120056120036120056120053

Test 4: throughput

Count: 8

Code:

  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  ldr d0, [x6, x7, lsl #3]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3338

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4f | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526874200110000266920025801001008000010080015500117234802668226707267071663561665980115200800242001600482670726707118020110099100100800008000010080000080000008180000011151182161126704800001002670826708267082670826708
8020426782200110000266920025801001008000010080015500118236802668226707267071663561665980115200800242001600482670726707118020110099100100800008000010080000080000006080000011151181161126704800001002670826708267082670826708
802042676520111000026692002580100100800001008001450011670550266822670726707166356166598011520080024200160048267072670711802011009910010080000800001008000008000001680000011151181161126704800001002670826708267082670826708
80204268802001100002669200258010010080000100800155001167864026682267072670716635616659801152008002420016004826707267071180201100991001008000080000100800000800000015680000011151181161126704800001002670826708267082670826708
802042685320111000026699002580100100800001008001550011674920269562671126719166356166598011520080024200160048267112730411802011009910010080000800001008052208052201315980000011151421501126712800001002672326723267192672326713
802042670720011000026692202580100100800001008001550011675400266822670726707166356166598011520080024200160048267072671111802011009910010080000800001008000008000000080000011151181161126708800001002670826708267082670826708
8020426707200110000266920025801001008000010080015500117443402668726711267071663561665980115200800242001600482671026707118020110099100100800008000010080000080000001580000011151181161126704800001002670826710267082670826708
80204267212001100012266920025801001008000010080015500116659612668626707267071663561665980115200800242001600482670726707118020110099100100800008000010080000080000002480000011151181161126704800001002670826708267082670826708
80204267582011100002669200258010010080000100800165001166432026682267072670716635616659801152008002420016004826707267071180201100991001008000080000100800000800000010580000011151181161126704800001002670826708267082670826708
8020426707200110000266920025801001008000010080016500116731602668226707267071663561665980115200800242001600482670726707118020110099100100800008000010080000080000005780000011151181161126704800001002670826708267082670826708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3351

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 18 | 19 | 1e | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025268772000000026693025800101080000108000050116668826683267082670816652316688800102080000201600002670826708118002110910108000080000108000080000066800005020616762670580000102670926709267092670926709
800242674520100000266930258001010800001080000501166647266832670826708166523166888001020800002016000026708267081180021109101080000800001080000800000108800005020616652670580000102670926709267092670926709
800242681620000000266930258001010800001080000501166266266832670826708166523166888001020800002016000026708267081180021109101080000800001080000800000111800005020516562670580000102670926709267092670926709
800242681720100000266930258001010800001080000501174993266832670826708166523166888001020800002016000026708267081180021109101080000800001080000800000102800005020516772670580000102670926709267092670926709
80024268192000000026693025800101080000108000050115745226683267082670816652316688800102080000201600002671026708118002110910108000080000108000080000060800005020516552670580000102672826709267092670926709
800242670820001000266930258001010800001080000501173217266832670826708166523166888001020800002016000026708267081180021109101080000800001080000800000126800005020816882670580000102670926711267092670926709
8002426840201000002669302580010108000010800005011728342668326708267081665231668880010208000020160000267082670811800211091010800008000010800008000009800005020716782670580000102670926709267092670926709
80024267712000000026693025800101080000108000050116603226685267082670816652316688800102080000201600002670826708118002110910108000080000108000080000054800005020516662670580000102670926709267092670926709
8002426708200000002669302580010108000010800005011667502668326708267081665231668880010208000020160000267082670811800211091010800008000010800008000003801305020816652670580000102670926709267092670926709
800242684420000000266930258001010800001080000501173570266832670826708166523166888001020800002016000026708267081180021109101080000800001080000800000111800005020816862670580000102670926709267092670926709