Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple, post-index, 8H)

Test 1: uops

Code:

  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 12.000

Issues: 13.014

Integer unit issues: 1.000

Load/store unit issues: 4.000

SIMD/FP unit issues: 8.014

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0f | 18 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
72005293482200210126000814611290220001529513030100080264000100080004000500020309982929024780291852923631013000400080005000200002929329277116100110001000040066840040021040056198411277993406787298795919228304738091247472849710001633512953133994000800010002921529302292532920829204
720042931121911510200001204477290730001526513008100080084000100080004000500020355981545024765292562936731013000400080005000200002926029211116100110001000040045840100024400601612421311091046797305413471922331043810854452849810001629313066132014000800010002917129329292622932029276
7200429309219022002100014144952904600015311130141000801440001000800040005000203229843270247612920129316310130004000800050002000029201292671161001100010000400008400300064005615800128539208684530917521928431543811848522856010001639412929132764000800010002924129293292162936629287
7200429304220016002000001457328997200152681300010008014400010008000400050002033798430802481529111292323101300040008000500020000291782912511610011000100004000004002010240006058001277591656811307884019251310038061647462848010001610913038130184000800010002920529208292292925429215
72004292862190170017000814545290782001518513000100080144000100080004000500020335982503024814292342926931013000400080005000200002926829186116100110001000040000840030000400661080012809902069323047114919166314038171145462850510001621812932131364000800010002928829305292992921129246
720042921521802000210000046582907600015232130001000801440001000800040005000203169835215024768292572920831013013400080005000200002911329144116100110001000040000040020002400260500012888912268263076114519253300238091745462851510001623412940133234000800010002926229268292572924729210
7200429222219017002200080450329087000153321301410008014400010008000400050002030398238202473629235292683101300040008000500020000291282920511610011000100014000084005000040056150001270894456859306814471923130603809948482848810001632212903133924000800010002927829237291952921129271
720042926221902100230008046152904100015271130221000803040001000800040005000203329813990248322917429333310130004000800050002000029189292721161001100010000400668401000244002601011411289791566875308865119257300838161244432859310001638713010133444000800010002929629205292352929229207
720042930421911510151001204501291290001525313020100080144000100080004000500020318983385024736291792930231013000400080005000200002922029195116100110001000040000840020000400001500012885910368353062114519300306738151148482848610001632613031134434000800010002920829203293032929129147
7200429242219014002300091465229070000153441301410008020400010008000400050002030598293002485729133292843101300040008000500020000293042920511610011000100004000084002000240006130001294692586831303984819253312438142043472864010001636213088134084000800010002921429326292822929129273

Test 2: throughput

Count: 8

Code:

  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
960205160056119900001100300100160041012120251040144801006400443200008010064000032000047960116802421663998000160037016004316005600326104010020032000064000020040000016000001600561600561180201100991001008000080000010032000027032015500243200246124270005109041611160053800006632000064000080100160057160057160044160057160057
96020416004311990000110030010016004131212025104014480100640044320000801286400003200004796011680274816000010160037016005616005600339104010020032000064000020040000016000001600561600561180201100991001008000080000110032000027032002400243200246124270005109011611160053800006632000064000080100160057160057160161160057160057
9602041600561199000000003001001600413121202510401448010064004432000080100640000320000479601168024216639980101600370160056160056003391040100200320000640000200400000160000016005616005611802011009910010080000800000100320000003200240027320000612400005121011611160053800006032000064000080100160057160057160057160057160057
960204160056119900001100300100160041312120251040144801006400443200008010064000032000047960116800011663998000160037016004316005600339104010020032000064000020040000016000001600561600561180201100991001008000080000010032000027032002400243200246124270005109011611160053800006632000064000080100160057160057160057160057160057
9602041600561199000011003001001600413121202510401448010064000032000080100640000320000479601168024281600000016003701600561600560032610401002003200006400002004000001600000160056160056118020110099100100800008000001003200002703200000003200246124270005109011611160053800006632000064000080100160057160057160057160057160057
960204160056120300100120300100160041312124872691042677801676400003200008010064000032000047960116803421663998010160037016004416005600326104010020032000064000020040000016000001600561600561180201100991001008000080000010032000027032002410243200246124270005109011611160053800000632000064000080100160057160057160057160057160044
960204160056119800000000300100160041312120251040144801006402883200008010064000032000047960116802421663998000160024016005616005600340104010020032000064000020040000016000001600431600561180201100991001008000080000010032000027032002400243200246124270005109011611160053800006632000064000080100160044160057160057160057160045
96020416005611980000000030010016004130120251040144801006400443200008010064000032000047960116802981663998000160037016005616005600343104010020032000064000020040000016000001600561600561180201100991001008000080000010032000027032002400243200246124270005120011611160053800000632000064000080100160044160057160057160057160044
96020416005611990000000030010016004131212025104014480100640044320000801006400003200004796011680242166399800016003701600431600560033910401002003200006400002004000001600000160057160043118020210099100100800008000001003200002703200240003200246024270005109011611160153800006632000064000080100160057160057160057160057160057
960204160056119900000000000001600413121202510401008010064004432000080100640000320000479601168024216639980001600370160056160056003391040100200320000640000200400000160000016005616005611802011009910010080000800000100320000270320000100263200246024270005109011611160053800006632000064000080100160057160057160057160057160044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d2 | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
9600251600651199110005800216005020151251040066800106400563200008001064000032000046640816811151152002801600461600651600490348104001020320000640000204000001600000160065160066118002110910108000080000103200141400320052000133200396013431300501901160111600628000009032000064000080010160066160066160050160050160066
96002416004911991110058102160050217012510400628001064005232000080010640000320000472041168097811520028016003016006916004903481043514203201486400002040000016000001600651600491180021109101080000800001032001313430320052100523200386113431310501901160111600628000009132000064000080010160050160050160051160050160066
9600241600651199110015800316003401717125104002680010640056320000800106400003200004693921681087217601520160030160065160065033210400102032000064000020400000160000016004916004911800211091010800008000010320013144303200131005610320039015301310501901160111600468000099132000064000080010160050160066160066160066160066
9600241600491199111005910016005201700251040066800106400563200008001064000032000046415616802542176015201600461600491600650332104001020320000640000204000001600000160065160049118002110910108000080000103200141300320052002523200006013431310501901160111600628000000132000064000080010160066160050160066160050160066
960024160065119811101380001600440121202510400548001064005632000080010640000320000472041168024298233840160037160059160056732610400102032000064000020400000160000016005616004311800211091010800008000010320000027032002400003200240024350005019011601116005380000100032000064000080010160044160062160060160057160057
96002416005611990001138001160044001202510400548001064000032000080010640000320000472041168032118560012016004016004316005903421040010203200006400002040000016000001600621600561180021109101080000800001032000002703200000000320024612435000501901160111600538000060032000064000080010160060160057160060160060160060
9600241600591198000113810016002800120251040054800106400443200008001064000032000046969316804841663998001600401600431600590326104001020320000640000204000001600000160059160056118002110910108000080000103200000270320000010243200326132350005019011601116005680000100032000064000080010160044160060160057160060160046
9600241600561199000003800016002831212025104006680010640060320000800106400003200004659001680478166399800160040160062160043032610400102032000064000020400000160000016004316005611800211091010800008000010320000027032003210035320024612435000501901160111600538000000032000064000080010160060160044160060160044160057
9600241600561199000003810116004430002510400548001064005632000080010640000320000472041168041781600000160040160056160045033910400102032000064000020400000160000016005616004311800211091010800008000010320000027032003200024320024602435000501901160111600568000000032000064000080010160057160044160057160156160060
960024160056119800011380001600440120025104006680010640000320000800106400003200004720411680295166399800160040160056160056033910400102032000064000020400000160000016004316005611800211091010800008000010320000027032000000024320032002435000501901160111600568000006032000064000080010160060160057160044160063160060