Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple structures, 8B)

Test 1: uops

Code:

  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.008

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.008

retire (01) | cycle (02) | 03 | 04 | 08 | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
66005290382252000004100514128234222159706000401220004000200010000475003230350283162833231060002000400020008000281862816711610011000100002000042004142002224138831005272583430151192273435381817383828020142981214612921200040002821028289282002828328353
66004283262130001004100517128262022159956012401220004000200010000475742230550282302839431060002000400020008000284162812811610011000100002000042000002002044135041004971343392040190983296381513443627975141491230413252200040002851128497284532825928281
66004283492120000000100493928247200158336008401220004000200010000475524230140281332832531060002000400020008000284322827211610011000100002000042000042004224140451022871433293139191173397382110424327852146301190813227200040002843028361283662842528387
66004283302140000006100505028005222157986012401220004000200010000475844230320280482829831060002000400020008000282892825811610011000100012000042004042004224136551000870713399042193223312382011473927831143191202413679200040002838928471281752833228413
6600428362213000000810050832812422215922601240122000400020001000047556423064028250284313106000200040002000800028339282021161001100010001200004200404200204013863100807117331804119274338538209374028077145411204213446200040002825728421282542845828263
6600428368213001100410049822803702215949600840002000400020001000047528423061028264282203106000200040002000800028291283921161001100010000200004200200200424413579997871923448043193273347381513444027924141281190213162200040002836128356284242833428231
6600428303212100000410048952809422215874600840122000400020001000047578623079028212283683106000200040002000800028359283411161001100010000200004200217200422413936985370653376044191203250382413404128052147431212113700200040002843128524284372832228372
66004283392132000000100495828248020158256012401220004000200010000475526230310281682832131060002000400020008000283682831611610011000100002000042002022002224136141015771623392038191703409381515403727951145491194312967200040002836228303282562839828339
660042829421200000041005000281590221614860124000200040002000100004752682303802823428246310600020004000200080002816528305116100110001000020000420020220002441363798367210319404519152331338138434227954144291187613259200040002829528308284142838528408
66004282722131000000100514728296022160766012401220004000200010000475407229930282502843331060002000400020008000284032830611610011000100002000042004022002244137661003670683442039191913313381214354328025149291229613624200040002846428409282142828428318

Test 2: throughput

Count: 8

Code:

  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0009

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800725991100100058002800542555025480172100320072160000100320000160000500800048544002808005008006980069335148010020016000032000020016000064000080047800691180201100991001008000080000010016001315430160053115616003961524312051091171180066131341600003200001008007080049800708007080070
4802048006959911110000571008005405050254801241003200721600001003200001600005008011871088075208002808006980047035148010020016000032000020016000064000080069800471180201100991001008000080000010016001313430160052015116000061514313251091171180066131301600003200001008004880048800488007080070
4802048006960010000000581008003205500254801721003200241600001003200001600005008011871088075218002808006980047332948010020016000032000020016000064000080069800691180201100991001008000080000010016001214430160054025216000001514313051091171180066131301600003200001008007080070800708004880048
48020480070600100000005810280032055502548016410032006416000010032000016000050080004210880752080050080047800690329480100200160000320000200160000640000800478006911802011009910010080000800000100160015140016005202521600390051431205109117118006601351600003200001008007080070800708007080049
480204800695991100000058102800542555025480172100320072160000100320000160000500801179108808880800500800698004733514801002001600003200002001600006400008004780069118020110099100100800008000001001600141343016005201521600006152431325109117118004401341600003200001008007080070800708004880070
480204800475991110000012102800322055025480172100320072160000100320000160000500800042108807520800500800698006903294801002001600003200002001600006400008006980069118020110099100100800008000001001600141243016001302551600000152431305109117118006601351600003200001008007080070800708007080048
48020480069599100100002110380054205502548016410032007216000010032000016000050080117954400320800500800698006933524805142001600003200002001600006400008006980069118020110099100100800008000001001600121443016001212521600396112012051091171180044131301600003200001008007080048800708004880048
480204800695991011000073102800540055025480164100320064160000100320000160000500801181108807520800280800718006933534801002001600003200002001600006400008006980069118020110099100100800008000001001600151400160053001316003900524313051091171180066131301600003200001008007080048800708007080048
480204800696001010000058102800542555025480172100320024160000100320000160000500800048108807520800280800698006933514801002001600003200002001600006400008006980047118020110099100100800008000001001600141443016005201511600396112431305109117118004401351600003200001008007080048800708004880070
48020480069600111111005810280032205502548012410032007216013810032000016000050080004310880752080028080047800690329480100200160000320000200160000640000800488004711802011009910010080000800000100160015134301600540152160039615201215109117118006601301600003200001008007080070800488007080070

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480025800565991010005701080041212120254800701032006016000010320000160000508002228320000080037080056800560033848001020160000320000201600006400008005680056118002110910108000080000110160000027016002302416002461242705019121723800531660160000320000108005780057800428005780042
48002480056600000000300108004100120254800701032006016000010320000160000508000003840000080022080056800410033848001020160000320000201600006400008005680041118002110910108000080000010160000000160000001600006124270501961723800530660160000320000108005780057803378050580057
480024800565990000003241640080501212120594800701032006016000010320000160000508000008320000080022080338800560056394800102016026032026020160000640000800568005621800211091010800008000001016050422701600240241600246024270501922623807081660160000320000108005780057800578005780057
480024800565990001013000087128012013001724493610103321001667541033332016745250979235648734408681508864188422125470233848001020160000320000201600006400008004180044118002110910108000080000010160000027016002402416000001000501921762800381660160000320000108005780057800428013880057
4800248005660000000030010800412121202548007010320000160000103200001600005080021883200000800370800568005600338480010201600003200002016000064000080056800561180021109101080000800000101600000001600240241600240124270501921722800531660160000320000108005780057800578005780042
48002480056599000000300108004120002548007010320060160000103200001600005080022183200000800370800568005600338480010201600003200002016000064000080041800561180021109101080000800000101600000270160024001600246124270501921726800380060160000320000108005780057800428005780057
48002480056600000000756010800410121202548006610320060160000103200001600005080000083200000800370800418005600338480010201600003200002016000064000080041800411180021109101080000800000101600000270160000024160000602400501921722800531660160000320000108005780057800578004280057
480024800566000000003001080041212002548007010320060160000103200001600005080021938400000800370801168005600338480010201600003200002016000064000080056800561180021109101080000800000101600000270160024001600246124270501921726800531060160000320000108004280057800578004280057
4800248005660000000030010800412121202548007010320076160000103200001600005080000083200000800370800568005600338480010201600003200002016000064000080056800561180021109101080000800000101600000270160024024160024612400501961762800531060160000320000108004280057800578005780042
4800248005659900000084300080026012002548001010320060160000103200001600005080021983200000800370800418005600338480010201600003200002016000064000080056800561180021109101080000800000101600000270160000001600246124270505021726800531660160000320000108005780057800578010780057