Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 3 regs, 2S)

Test 1: uops

Code:

  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 3.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 0e | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
63005293402197105104524288230002416320002000200010000716177285912921231020003000200029134291321161001100010000200042000002000600412982931368913081374206563109381416545628349160621358915339200010002929729357293022939929342
630042937422050010104662288620002425720002000200010000516175286112924231020003000200029163290661161001100010000200042000032000400412866954668983139059206233448381515545628509164241303615570200010002926829411294112925729385
630042935321940041045172878602024202200020002000100001016164287892930131020003000200029182291191161001100010000200042000002000400413098917769203087358206613092381612576828338162061356115877200010002938129186292972925829301
63004292462205004104526290470012419020002000200010000716185286102930631020003000200029080292311161002100010000200042000002000400613111926469153146157205843048380913555828356162741375215386200010002924229252292882925629313
6300429331219300210470828841000240912000200020001000081616828730293013102000300020002921929215116100110001000020004200000200040041312690706924308705920596302738127545428406164351372715755200010002924929239292782928329330
630042936622020054104530288500202424320002000200010001516179286742926331020003000200029151291281161001100010000200042000002000400412942899968873116157205933070381513615828376161641389415697200010002934529256292852928029293
63004293342217005104538288250012427020002000200010000016160289352921031020003000200029086291491161001100010000200042000002000600412980903568143144058205713106381113626528334164691381115673200010002938329256292952931129335
63004293832206000104612289110002420520002000200010000116148286722934031020003000200029247291761161001100010000200042000012000400413060914568843074250206453055381610595228537161441353215523200010002935529274293622924229299
63004293862206000104636288700022419320002000200010000316182288762931931020003000200029172291871161001100010000200042000002000400412894917268193149060206363088381613575328315161861362415428200010002932329114293052933129179
63004293392193004104882288130002411820002000200010000716159286432930431020003000200029144290901161001100010000200042000002000400613100918569323142064207383100381110615428477163361365015671200010002936929318293112932629429

Test 2: throughput

Count: 8

Code:

  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  ld1 { v0.2s, v1.2s, v2.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2402055340040011100651005336621818025160100100160000100160018500233616805337453399533812332572360716011720024004020016002153381533811180201100991001008000080000110016002020421600570005916003861574219111151171161153378002160000800001005339953382534005340053400
24020453399400101006510253366018181625160100100160000100160017500233426115335653398533812332582359016011520024003220016002153398533981180201100991001008000080000010016002120421600571005916000061574219111151171161153378900160000800001005339953382534005338253400
240204533814001000065003533660181816251601001001600001001600155002334114153374533815338123324823350160117200240040200160027533985339911802011009910010080000800000100160020190160057101581600386057019011151171161153396990160000800001005340053400533825340053399
2402045338139910100650025338421801625160100100160000100160016500235831105337453414533852332482337816011720024003220016002153381533991180201100991001008000080000010016001921411600581115916003861194219011151171161153396992160000800001005340053400534005338253399
24020453398400110006500253384001802516010010016000010016001350023581631533745339853398233258236251601132002400322001600215338153398118020110099100100800008000001001600201901600571005916003801574219211151171161153378992160000800001005338253399533825340053382
2402045338139911100651025338301818162516010010016013010016001650023397570533565339953398233257236271601172002400322001600275339953381118020110099100100800008000001001600192001600190015916003861574219011151171161153395002160000800001005340053382533825340053583
240204533984001110021102533852181812516010010016000010016001650023583070533745339953399233258235821601142002400402001600215338353399118020110099100100800008000001001600212042160019000211600386157019111151171161153396900160000800001005338253382534055340053399
24020453399400111006500053383200162516010010016000010016001650023341140533745338153398233248236011601172002400402001600275339953398118020110099100100800008000001001600192101600571115916000000574219111151171161153378992160000800001005338253399533825339953399
24020453401400111006400253384218181625160100100160000100160018500233616805337453399533812332472338116011520024003220016002753381533981180201100991001008000080000010016001920421600191005916003861574219011151181161153378090160000800001005340053382534005338253382
24020453399400110007400253384101802516010010016000010016001450023403700534655338153399233258236071601132002400322001600215339953399118020110099100100800008000011001600201901600570015916003860674219111151171161153396992160000800001005340053400533825339953382

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
24002553400400101000000010300025336621801251600101016000010160000502353389105335653399533812332632336116001020240000201600005339953398118002110901010800008000001016002021420160019000211600386157019005020002216222253395099016000080000105340053399534005339953400
24002453399400100100000010401035338320181525160010101600001016000050233403600533745338153381233433233801600102024000020160000533995339811800211090101080000800000101600202000160057101591600386119019105020001116231453378009216000080000105340053382533825338253382
2400245339940010110100003770103533832018152516001010160000101600005023580911053356533985338123343323394160010202400002016000053399533811180021109010108000080000010160019194201600190005916003861574219205020002616242553396099216000080000105340053399533995340053382
240024533994001001020000423010253383018181525160010101600001016000050235809110533745339953381233433234291600102024000020160000549425524413180021109010108000080000010161845194240616187921610769161690605742193052550045134202555162099216000080000105558255397555455552755558
2400245447841412020000131419131144104535380181812516001010160000101600005023449300055131555305540724456150248741625042024375722162697554005566614180021109010108000080000010161715194239716187900010717161858615741193052530030142252955289209216000080000105555155559554945539055538
240024555364161020000023213058193610356905218181818696163001141629901216320455244700400559315629256177249302162547316232620243758201625015478453399118002110901010800008000011016001920423801601872126416172861194219405020001316262853540099216000080000105600855705552425552955848
2400245658743912023200001190002560042180162516001010160130111633827324233170055516558475585324735180248191630372024519020163078544345339911800211090101080000800001101600201903461601891005916003861574019105020002416262553691009216000080000105341853386534075340753400
240024533994131000011100113000253390077025160010101600001016000050233403600533785338153381233483233831600102024000020160000534035340211800211090101080000800000101600192000160059002611600406159431900502000241613235339901313516000080000105340453382534045338253404
240024533814001010000000940002533870001925160010101600001016000050233124600533785338153381233263233831600102024000020160000534035340211800211090101080000800001101600202000160019000641600006159431900502000261626255340001313516000080000105340453404534045338253404
2400245338139910000111004830003533883772025160010101600001016000050235912400533775340353403233473233821600102024000020160000536575345811800211090101080000800001101600202043016005800222160039005901920502000201624125337801313516000080000105340353382534035338253404