Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD3 (single, post-index, D)

Test 1: uops

Code:

  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 6.003

Integer unit issues: 1.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 3.003

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6500528888232002800180000000100463228528001166676000100030002000100030002000500010018356071122920028763288743106000200030003000900028678288481161001100010000200000200000020024400001322894806935314395519823324337451657592834210001565312906140332000300010002893628818290132901628904
6500428962234001800160001115988000472628526001166356005100030032000100030002000500510009357535229000287962898231060002000300030009000287882892211610011000100002000062005002200240000013009934268853165105819919316138221658622826610001548312970140352000300010002901828907289302900228898
6500428983233002000210010090000466528538000165676003100030052000100030002000500010023357135228850287002884531060002000300030009000288362882111610011000100002000042000303200242000013272926269703151125019791327938251554532826710001589712981138502000300010002890328905289272902429003
6500428875232001700200000060000461728598000166816003100030052000100030002000500010030356195229010288042887871060002000300030009000289742878911610011000100002000062002005200242600013280941868703142105919904327838262353542844010001582712780139892000300010002890328968289822899628997
6500428990232001700231000010000046282849702116678600010003003200010003000200050001001835694522915028808289793286000200030003000900028836286831161001100010000200000200200320020240001342393816892312275519854331738252654562835110001566313062139622000300010002892428951288852894029006
6500428872234001800220000011000046652854322216716600810003006200010003000200050001000735775522991028815288973296000200030003000900028889289361161001100010000200004200210220020260001333993786908318385819817322138241957582835010001581112762138852000300010002889828956289462887129004
65004288962330019001800000220000467228615200166616000100030032000100030002000500010034356845229090288132897931060002000300030009000287572896411610011000100002000062002100200240600013239948569493134115519859325538181654602841010001543412935138932000300010002896328976289532888528999
6500428951232002700200000038800047022855700216573600310003005200010003000200050001003935600522945028741289733106000200230003000900028921288911161001100010000200000200200020024060001307294086973312785219896323138272153522835110001593612844141112000300010002894728990290692895029005
650042890023320200023000004000046862859601016736600310003000200210003000200050001000035745722942028847290203306000200030003000900028907288551161001100010000200006200200620022260001316996276845311685219882315638201951552838010001591912729136922000300010002891229011289362893229054
6500428994233001900140100010000046642860300016650600310003000200210003000200050001002735760522930028765289483106000200030003000900028842288141161001100010000200008200200404200442600013153949469233147105219870328038212156542833110001566212822138352000300010002892228904288832900728822

Test 2: throughput

Count: 8

Code:

  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  ld3 { v0.d, v1.d, v2.d }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40020516005512411010002700011600402121215986602548010680182240006160000801002400001600004804998003422279258701600371600551600557992303800444805802001600002400002002400007200001600561600552180201100991001008000080000110016000002501600300016002161482520005110117111600521800006616000024000080100160178160057160057160056160056
400204160055124200000136000016004021212159866062480106801002400061600008018024000016000048049980063922792725016003716005516017879923038003748010020016000024000020024000072000016004116005521802011009910010080000800000100160000025016002902216002261212500005110117111600521800006016000024000080100160056160056160056160056160056
40020416005512410000000000016004021212159866025480106801002401201600008010024000016000048049980050022792725016003616005516005579924038003748010020016000024000020024000072000016005616017911802011009910010080000800000100160000025016002202916002961222500005110117111600521800006616000024000080100160056160056160057160056160057
40020416005512400011000000016004021212159866025480106801002400061600008010024000016000048049980033322792725016003616005516005579923038003848010020016000024000020224000072000016005616005511802011009910010080000800000100160000025016002202216015261222500005110117111600521800006616000024000080100160057160056160180160061160056
400204160055124000110028000116004021212159866025480106801002400061600008018024000016000048049980033922792740016003616005716017879923038003748010020016000024000020024000072000016005516005511802011009910010080000800000100160000025016003012416000061222500005572117111600521800806616000024000080100160180160056160179160056160056
4002041600551242001100280000160164212121598682254801068010024000616000080100240243160000480499800342227927250160036160056160178799230268003848010020016000024000020024000072000016017716005511802011009910010080000800000100160000025016002402216002961223300005110117111600520800006716000024000080100160056160056160056160056160178
400204160055124100000039000116016221212159866025480106801002400061600008010024000016016040066780018022792725016003616005616005579923263800234801002001600002402402002402407200001600551600551180201100991001008000080000010016000002501600000291600226122250000512411711160052180000101016000024000080100160056160056160057160180160178
40020416005612400000002800011601622121215986902548010680100240006160130801002402431600004006348003492279272501600361600551600557992303800374801002001600002400002002400007200001600551600551180201100991001008000080000010016000020016002202216002261223300005141153131609821801606616000024000080100160179160056160056160303163871
400204160546124310001342426400016028520121598746104480430802622402341601308026324047716048040935681269422805723016031216042416054979852777180037480100200160000240000200240000720000160056160055118020110099100100800008000001001600000250160000029160021602225000051101171116005208000010616000024000080100160056160056160056160056160056
400204160055124100110028000016004021212159869025480106801002400061600008010024000016000048049980035322792725016003616005516005679923038003748010020016000024000020024000072000016005516005511802011009910010080000800000100160000025016002202516002260242500005110117111600521800007716000024000080100160056160056160056160056160059

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int load (95) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
4000251600561240000111036010001600412121215986902548001680010240006160000800102400001601584800498004832279297400160037160056160056799240380038480010201600002402402024000072000016005616005611800211091001080000800000101600000330160030002916002961303300503552645160146080081101016000024000080010160180160185160181160180161657
4000241620611244021114343126400021634862121215987381404803408017024034816026080249240714160482439405806531228124920016020616030016042679886527280106480964201603202407082024024372000016005616005611800211091001080000800001101600000330160030003016002961303300502061767160053180000101016000024000080010160057160057160057160057160057
400024160056124100011000001001600412121215986902548001680010240006160000800102400001600004001848004802279297400160038160056160056799240380038480010201600002400002024000072000016005616005611800211091001080000800001101600000330160030013216000061303300502041755160053080000101016000024000080010160057160059160063160057160057
400024160056124100011004700100160041212121598690254800168001024000616000080010240000160000480049800180227929740016003716005616005679928038003848001020160000240000202400007200001600411600411180021109100108000080000110160000000160030003216003061303300502051755160053180000101016000024000080010160057160057160057160057160057
4000241600561241000100035000001600452121215986902548001680010240006160000800102400001600004001848004812279297400160039160545160056799240380038480010201600002400002024000072144016042616005611800211091001080000800000101600000330160030003016003000303300502051755160038080000101016000024000080010160057160057160057160057160057
4000241600561240000100050000001600412121215986902548001680010240006160000800102400001600004800498004802279357700160037160056160056799240380038480010201600002400002024000072000016005616005611800211091001080000800000101600000330160030003316002961303300502051745160057180000101016000024000080010160057160058160057160057160057
4000241600561241000000036001011600412121215987702548001680010240006160000800102400001600004800498004972279942500160037160056160056799240380038480010201600002400002024000072000016005616005611800211091001080000800001101600000330160029003016000061303300502051765160038180000101416000024000080010160057160057160057160061160057
400024160056124000001003600100160041212121598690254800168001024000616000080010240000160000480049800499227929740016003716005616005679924038003848001020160000240000202400007200001600561600561180021109106841080000800000101600000330160030002916003061303300502051765160053080000101016000024000080010160042160057160059160057160057
40002416005612400101134564264010316039421212159831791311490707826452427421600008008924023716048044246180962222819106001602211604231605487987278478014148193620160320240480202407237228621604231603005180021109100108000080000010160390233114160552013016003061303300502051745160053180000101016000024000080010160057160057160057160057160057
400024160060124001011002991760100160026201215983802548001680010240006160000800102400001600004800498004832279336400160037160059160056799240380038480010201600002400002024000072000016005616005611800211091001080000800000101600000330160029013016003061303300502071777160053180000101116000024000080010160058160057160057160057160057