Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (multiple structures, 16B)

Test 1: uops

Code:

  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.008

Integer unit issues: 0.000

Load/store unit issues: 3.000

SIMD/FP unit issues: 3.008

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c9 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6600528677214240128011060000505428563001160226008300630003000300015004356776229982827228431310600030003000300090002821428338116100110001000130000630040043004513900130551001671153298976191283369381617636627994145881206413086300030002845328444283572825728324
6600428369212300128001060000503328152000163756009300930003000300015004357267230132868628306310600030003000300090002827128303116100110001000030000630040013004513600132489969711533051260193193423381214565527944146951224013177300030002843528298284322841128336
6600428872212260129000090000505528120331159426006300630003000300015004357576230812824528312310600030003000300090002836028384116100110001000030000630010013001313900136851005470403370136719229328038219646027874142321187913457300030002884328381284872845528451
66004283892142800300000901005104280400301640160063009300030003000150003571112307728341283603106000300030003000900028161282091161001100010000300006300400430045119001397710061723833131158191183347381719576027882147221189813160300030002831428769283602843828351
6600428247212240024010090000500428190000160196006300930003000300015000357456230212826228414310600030003000300090002821228631116100110001000130000630010113004513600136061004571003377954191483329380813615827966157121187913301300030002830528392284422827828310
66004284182112400290000601004995281880031642360063006300030003000150003576482301928411284643106000300030003000900028267283581161001100010000300006300100130045149001369110038710933851254192103355381717616727924146291191013267300030002843828321283692838428325
6600428788212270027000060000463828102001158556006300930003000300015004356881230102869028460310600030003000300090002861828465116100110001000030000630040013001516600129639954716832881358196143408381215505627877147851270013130300030002844728801284642822928684
6600428765213240027000090000499028213033159026008300630003000300015000357551230192819128742310600030003000300090002828228373116100110001000030000630040043001501600138621008369463366146019167340038188575927949144711200412994300030002835028279283972874128468
66004284122152300250010900004797281720301593260093006300030003000150043573132298428276283763106000300030003000900028275281921161001100010000300006300400130015136001362310080684133051165191673314381715595727985157321189913291300030002833528350283302824528411
660042844021330003001109000051112797000116123600630063000300030001500035694623058281872835131060003000300030009000281702819011610011000100003000063001001300151360013858993871393407955191053391382015526227955147551253213378300030002842628475284202870828830

Test 2: throughput

Count: 8

Code:

  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  ld3 { v0.16b, v1.16b, v2.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800505991000000001001180042001502548010110024001124000010024000024000050035338494862939080040080080800671403494801002002400002400002002400007200008006780049118020110099010010080000800000100240018214202400571315924003901373800510911711800410602400002400001008005880063800678006780045
48020480062599111000006400000800513181814254801361002400392400001002400002400005003515364553297518002508004480062100344480100200240000240000200240000720000800588005811802011009901001008000080000010024000003902400420003240039511742171510911711800479002400002400001008005080068800688005080068
480204800675991000000042100208006400180254801381022400142400001002400002400005003545551486242118004808006780067003324801002002400002400002002400007200008006780067118020110099010010080000800000100240018170024005600060240039513745005109117118005910002400002400001008006380045800638006380046
480205800626011110000064100248005731818025480118100240030240130100240000240000500354297955560300800430800448006210034448010020024000024000020024000072000080062800571180201100990100100800008000001002400000390240000000024004050560171510911711800649902400002400001008006880068800688005080068
480204800676001000000000000280047315002548014910024003924000010024000024000050035429793601919180043080062800621003444801002002400002400002002400007200008006280044118020110099110010080000800000100240000038024004200041240040515642170510911711800479002400002400001008006780068800688006880068
480204800495991000104047100008004731915102548010710024000924000010024000024000050035388075724897180025080056800621003304815532002400002400002002400007200008004480057118020110099110010080000800001100240000039024004200002400385037450051091171180059101002400002400001008006380063800458006380063
4802048004460010000000501000180050315010254801371002400422400001002400002400005003505352486372918004808006780067150349480100200240000240756200240000720000800678006711802011009901001008000080000010024001917420240017101602400395137000510911711800410602400002400001008004580063800638006380045
480204800626001000000047000028004731518025480142100240042240000100240000240000500350535248636981800480800678006700332480100200240000240000200240000720000800678005011802011009901001008000080000010024001818426524005710259240039503645005109117118004101002400002400001008004580058800588006480063
480204800525991101000065100038197931501415748014710024000524000010024000024000050034958935698290080031080067800660034948010020024000024000020024000072000080050800671180201100990100100800008000001002405371800240017101202400400136000516311711800596002400002400001008007380045800638005880063
48020480062599101111006400002800550181814254801391002400392400001002400002400005003542979553509708002508006280062003444801002002400002400002002400007200008005780057118020110099010010080000800000100240000039024004300042240039513745005109117118005901002400002400001008006380045800578006380045

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0040

retire (01) | cycle (02) | 03 | 05 | 09 | 0b | 0e | 0f | 19 | 1e | 22 | 24 | 37 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480025801316010000004710218004721515025480054102400052400001024055824000050352515757433801800438006280066103444803742024000024000020240000720000800628006211800211091101080000800000102400000024004202782400425142455019517448005910100240000240000108006580071800828006380063
480024801456000000104700118002921515025480052102400422400001024000024000050167644155329801800478004480062143444800102024000024000020240000720000800628006211800211091101080000800000102400004502400000297240041500455019417348005910100240000240000108006380067800638006780063
48002480343601100000471021800472415102548005510240044240000102400002400005026451183603705180043800628006210344480010202400002400002024000072000080062800621180021109110108000080000010240000002400420258240042604245501941734800591400240000240000108006380063800638004580067
4800248035560000000047003180029015151025480055102400472400001024000024000050285322657479851800258004480062034448001020240000240000202400007200008006280062118002110911010800008000001024000045024004215124004251420501941744800410100240000240000108004580063800498008580045
4800248032060000011047000180029041510254800551024004224000010240000240000501676441359592318004380062800620344480010202400002400002024000072000080044800621180021109110108000080000010240000450240041028824004251045501941734800410100240000240000108006380063800638006380063
480024803426010000004710218004701501025480055102400422400001024000024071250350858656916491800438004480062103264800102024000024000020240189720000800628006211800211095101080000800000102400004502400420270240042004345501941743800591400240000240000108004580063800638006380063
48002480200600000000010218004720151025480052102400062400001024000024000050351506236032460800438006280062103264800102024000024000020240000720000800628006211800211091101080000800000102400004502401311243240043514205019417448005910100240000240000108178081499800708006380063
480024803636010000000100180047215150254800561024004324000010240000240000501676441553297518002580062800621034448001020240000240000202400007200008006280062118002110911010800008000001024000045024000002582400425042455019417438005914100240000240000108006380045800458035180067
480024801206000000007100608002920151025480277102401722400001024000024017850352515757391981800438006580044123484800102024000024000020240000720000802138007711800211091101080000800001102400004502401741452400415141455019417448007410103240000240000108004580063800458006380063
48002480214600000110470011802312151502548005210240043240000102400002400005035562015519827180043800448038303444800102024000024017720240000720639800498006211800211090101080000800000102400004502400425160240000504245501941743800590100240000240000108006380045800638006780063