Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, post-index, H)

Test 1: uops

Code:

  ld1 { v0.h }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.001

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6200528837224313013100004000489528337010166073004100010041000100010001000500050011195101922633284272863531030001000100020002000283772856611610011000100001003201002001310042401101310096587083315605020011316838212248542807810001536712512135021000100010002851328615286362842828750
6200428561221012113000006000487328324011167373001100010011000100010001000500050001190301922597285592848831030001000100020002000284632858511610011000100001002221005001110032241101352293137021314204720000314638141758552809110001515112372137691000100010002872728584285172858528623
6200428627221011111100001000477328384010167273004100010031000100010001000500050001197901622632285092857131030001000100020002000284412858711610011000100001003231003002110000121201316894457043317305020021323638201559502818610001524312570138181000100010002861128719286422872828744
6200428761223002001000009000479228348000167423000100010001000100010001000500050001192601722671285622868431030001000100020002000286912857111610011000100001000021000000010002020001323094006987328714720069320238241157562825310001536912608139151000100010002881828717285972861828698
6200428670222001002000003000480628396000166473000100010011000100010001000500050001190601022645286252861231030001000100020002000286502852211610011000100001000001000000010000000001329297146923322214719987320438262356552823110001538112671137331000100010002872328727286362860328709
620042868422100300200100300048612835100016705300110001000100010001000100050005000119320522612285902859531030001000100020002000286392862311610011000100001000021000000010000020001308294056956320204820113321938242155462804610001506312573135951000100010002861428554285682869728628
6200428703223101001000003000482728341000166913001100010011000100010001000500050001190401122631285572871531030001000100020002000286202849411610011000100001000001000000010002020001332894647023322105020048322738281652512814210011536912661135291000100010002859828729286422873628826
6200428649223002003000211771080047452839200016675300110011002100010001001100150005005119460622699286612867472930031002100020022000286012872421610011000100001000021000010010000000001326893837069320114520117325838262048482826010001522612459137611000100010002874628702285512862828656
6200428646222000002000000000483828278000165783001100010011000100010011000500050001190401122611285702859431030001000100020002000286612852511610011000100001000001000000010000020001332797267012318704619957317938221054512836810001541412455135941000100010002863128597287582861528542
620042865722200100000000200048792845100016654300110001000100010001000100050005000119260622651284762865231030001000100020002000285502861211610011000100001000001000000010002020001345595247027320705020113325838321956572813310001543412043136251000100010002867928549286932871528775

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.h }[1], [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60205140047108600000000021401000140035139556258010050100200001000040100200001000012456075304487107103660014001114005014005013196431324327010030200100002000060200200003000014005014004711502011009910040100100001000010010002011000000001000011032103933713971550000966100001000050100140051140048140051140051140051
602041400531086000000100532000001400351395712580102501002000210000401002000010000124563753039271071021000140026140050140035131964313241670100302001000020000602002000030000140050140047115020110099100401001000010000100100000110000020310000100321039333139711500001099100001000050100140051142730142337140048140051
602041400501125000000000100000140130139571258010250100200021000040100200001000012456375304487107092760014002614005014005013196431324287010030200100002000060200200003000014005014003511502011009910040100100001000010010000011000001001000011032103935313971450000909100001000050100140051140036140051140036140051
6020414005010850000001003100000140035139556258010250100200021000040100200001000012456115304487107102100014002614003514003513196431324317010030200100002000060200200003000014005014004711502011009910040100100001000010010000011000000001000011032103933314003350000969100001000050100140051140051140146140036140051
6020414005010850000000004300000140021139635258010250100200021000040100200001000012456075304373107135750014009614005014005013196131324317010030200100002000060200200003000014005114005011502011009910040100100001000010010000011000000001000010032103933313971450000999100001000050100140053140138140051140051140051
6020414005010860000000004200010140130139618258010050100200021000040100200001000012456375304373107104440014002614004714004713196431324287036330200100002000060200200003000014005014004711502011009910040100100001000010010000011000001031000011032103933513969950000099100001000050100140051140143140036140051140051
60204140051108600000000010000014003213956825801025010020002100004010020000100001248418530392710710366101400111400501400501319491513243370100302001000020000602002000030000140053140142115020110099100401001000010000100100000110000000322510000110321031063313971450000999100001000050100140051140051140051140051140051
60204140051108600000000082000001400351395712580100501002000210001401002000010000124563753044871071021000140029140050140050131964313243170100302001000020000602002000030000140050140047115020110099100401001000010000100100000110000030010000110321039333139798500001366100001000050100140051140051140051140051140144
60204140052108600001000110300000140035139571528010250100200021000040100200001000012456375304487107102100014016014022314005013196413132506703613045010000201626068620000302341401401401332815020110099100401001000010000100100040110027200956010005110323431164314002350264069100001000050100140316140337140233140326140234
6020414022410880100000134681760000140035139556258010050100200021000040100200001000012456375304487107102100014002614005014005013196431324317010030200100002000060200200003000014005014004711502011009910040100100001000010010000011000001031000010032103933313971450000909100001000050100140054140051140051140051140051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | 77 | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
600251400471085000000000000140032139654258001050010200021000040010200001000012457885307504107171091140026140050140035131995313245807001030020100002000060020200003000014005014004711500211091040010100001000011010000011000010910000110031402882213972450000969100001000050010140036140051140051140036140036
6002414003510860101000130100140036139654258001250010200021000040010200001000012457885307504107171091140011140119140047131998313245807001030140100002000060020200003000014005014003511500211091040010100001000001010000001000000010000110031402882213972150000999100001000050010140051140051140051140036140036
600241400501086000000010000140035139654258001250010200021000040010200001003912457885307504107171090140026140047140050132000313246107001030020100002000060020200003000014003514003511500211091040010100001000001010000011000000310037100031403883313972150010009100001000050010140051140036140136140051140052
6002414004710850001100130000140035139654258001250010200021000040010200001000012458245307504107175770140026140142140054131998313245807001030144100002000060020200003000014040914004811500211091040010100001000001010000011000010610000110031403882213972150000900100001000050010140053140036140036140048140051
6002414013810850000000088000140088139614258001250010200021000040010200001000012457885306922107171090140026140047140050131998313245807001030020100002000060020200003000014005014022711500211091040010100001000011010000011000020310000110031402883313972250000909100001000050010140036140137140036140036140051
60024140051108600000001200001400351396542580012500202000210000400102000010000124578853069221071647901400131400351400501319983132458070010300201000020000600202000030000140126140047115002110910400101000010000010100000110000103254100000100314041073313972550000131013100001000050010140744140052140055140056140143
600241400541086000000012880011401451396202580012500102000210000401532000010000124582453075821072080001400111400541400511320023132459070268300201000020000600202000030000140054140053215002110910400101000010000010100000010000220271000011103187388231397095000013013100001000050010140055140052140056140052140175
600241400351085100010046000014002113965825800125004020002100014016120000100001245753530765910717499014002714005614005113200231324620702723002010000200006002020000301211400541400511150021109104001010000100000101000001100001440010000000031404882213970650010101014100001000050010140036140125140055140056140130
6002414015110860000000100001400391396584980010500102000210000400102000010000124784053069221071757701400301400541400351319993132497070010300201000020000600202000030000140054140035215002110910400101000010000010100032110000124001000011003140388221397255000001013100001000050010140147140036140055140052140053
60024140054108600000001880001400391396582580010500102000210000400102008010000124582453076561071742101400301401401402341320373132556070010323371036320246605162032030244140334140310415002110910400101000010000010100062010003129718100021100318941493914183050037131013100001000050010140324140330140124140433140306

Test 3: throughput

Count: 8

Code:

  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  ld1 { v0.h }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020516005311990000004500016004016615955842524010280100800028000080100800008000053099052919422935460001600370160055160055139690031400132401002008000080000200160000160000160055160054118020110099100100800008000001008000000080012000108000961917051101161116005208000096800008000080100160056160056160054160056160041
160204160055124000000030000160025166159588425240102801008000280130801008000080000530990529194229354560016003601600401601261396900313999824010020080000800002001600001600001600551600401180201100991001008000080000010080130017080013000168001260130051101161116005208000099800008000080100160056160054160041160112160041
1602041600401241000000088001600401061595886252401028010080002800008010080000800005309945291942293544800160075016005516004013969003140034240100200800008000020016000016000016004016062111802011009910010080000800000100800000140800100001880012611314051101161116005208000000800008000080100160112160041160056160056160056
16020416005512410011013100016011116615958852524010280100800028000080100801528000053099052919422935460001600360160056160055139677031400112401002008016480000200160000160000160055160053118020110099100100800008000001008000001708001301038001301100051101163116009218000099800008000080100160056160056160056160128160041
160204160055124100000015100016004010615958862524010280100800028000080100800008000053099452919522935448001600750160055160053139675031400192401002008000080000200160000160000160125160053118020110099100100800008000001008000000080012000128001260130051101161116005008000096800008000080100160056160056160056160056160126
160204160053124100000019000160172166159588525240102801008001480000801008000080160530994529194229354480016003601600401600551396750314001124058220080000801642001600001600001600551601231180201100991001008000080000010080000017080145000138014260100051102161116005008000099800008000080100160056160056160056160113160056
160204160055124000000018001160096166159588063240102801008000080000802608000080000530990529276229354480016003601601091600551396900313999824010020080000800002001600001600001600551601242180201100991001008000080000010080000017080010000128000900017051101251116005208000066800008000080100160041160054160041160056160056
1602041600551241000010190001601121601595886252401028010080002800008026580152800005309905291942293260400160036016005516005513967503140013240100200800008000020016000016000016005516005311802011009910010080000800000100801300170800130001080012611314051101161116003718000096800008000080100160057160125160056160056160056
1602041600551241000000000016030710615916851412410088058380048803908058080608804875806635541152291995300161128016026416034013963427741400972415242008049280492204160656160984160269160266318020110099100100800008000001008455201429580013000622080390011017151721342116005218000099800008000080100160057160041160041160054160041
1602041600551241000000190001600380661595886252401028010080002800008010080000800005309945291942293260400160036016005516005313969003140013240100200800008000020016033616000016004016005311802021009910010080000800000100800000140801420301380143001017051101161116003718000096800008000080100160041160057160056160056160041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 91 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002516005511980000000039000016004000615958842524001280010800028000080010800008000053067752933122935456016003616005516005513971203140033240010208000080000201600001600001600401600401180021109010108000080000010800000008001500080013011317000502006169916005218000069800008000080010160041160056160056160041160054
1600241600551241000010001901001600941061595884252400128001080002800008001080000800005306215293312293545601600211600551600551397100314003524001020800008000020160000160000160040160053118002110901010800008000001080000014080000001680013011315000502006169816005218000090800008000080010160041160056160041160111160041
1600241600551240000000003000001600401661595586252400128001080002800008001080000800005306775293312293546001600361600581600551397120314002024001020800008000020160000160000160040160053118002110931010800008000001080000014080013021980012611017000502009169916005018000096800008000080010160056160127160056160056160041
1600241600551241000000001288100160025160159588525240314800108000280000800108000080000530677529938229354480160021160055160055139710031400332400102080000800002016000016000016005516004011800211090101080000800000108000001408000000207880013611017000502006161010160052080480169800008000080010160041160056160056160126160041
160024160055124100000000340000160040160159588525240012801708000280000800108000080000530677529331229362930160036160124160040139712031400332400102080000800002016000016000016005316005311800211090101080000800000108013201408001320209380012611317000502009169616005208000098800008000080010160056160056160056160127160056
16002416004012400000000034000016004316615955810252403128001080002800008001080000800005306775293312293587901600361600551600401397120314003524001020800008000020160000160000160055160053118002110901010800008000001080000017080013001280012611017000502009168816005218000060800008000080010160041160041160056160056160056
160024160040124001000000300001160040160159588625240328800108000080130800108000080000530681529290229358790160036160055160125139710031400352400102080000800002016000016000016005516005311800211090101080000800000108013000080013101880013601117000502008167916009218000090800008000080010160056160056160056160056160056
160024160055124100000100308800016002516615958846424001080010800028000080010800008000053141352933322935879016003616005516005513971203140035240482208000080000201600001600001600551600532180021109010108000080000010800000140800001015800006191702050200101610916005208016096800008000080010160056160128160056160056160041
16002416005512410000000018000016004016615958862524001280010800028000080010800008000053068152933622935460016003616005516005513969702714003524001020800008000020160000160000160125160055118002110901010800008000001080392017106284434006236805326110170005084053481116017218048090800008000080010160339160183160341160268162406
160024160205124210010032264176000160025066159588625240012800108000280000800108000080000530681529334229358710160021160055160055139712031400202400102080000800002016000016000016005516005311800211090101080000800000108000001708001300138001261130000502006166916005218000099800008000080010160056160041160054160054160041