Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4R (16B)

Test 1: uops

Code:

  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 5.012

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.012

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c9 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
65005294892290180019000300046812890500016980500840121000400010005000475774229032908929285310500010004000100040002911429119116100110001000010000210000000100020000012882925569113038105120326308638179554928359164921351115083100040002939029334293832943529333
65004292452200170013010310045682899800016963500040121000400010005000475182229362917129372310500010004000100040002915829174116100110001000010000210000003100020200012859913468183252651202363114382116565428338160141402314476100040002936629254292832934729333
650042937622001200170003100460028793000169375000400810004000100050004754602288029198293153105000100040001000400029268292001161001100010000100000100000001000200000134779194690130631448202333099381713564628294162501411515165100040002931729382293402937129312
650042925222001500150003100464828698000170865008401210004000100050004750402286229153293413105000100040001000400029227292261161001100010000100002100000001000202000128749201700330251048202483072382411534928511163811373614999100040002934529464293552942329324
6500429217220017001600001004562287810001699450084008100040001000500047514522918290622927331050001000400010004000291292923711610011000100001000021001000010002020001292491716821325074820169312038158524828369163801381414488100040002928529316292462936229278
65004293252200130011000310045732877701017032501240081000400010005000474387228882907829370310500010004000100040002920829202116100110001000010000210000000100020200012912916368383069748203083040381312535628599157121396315134100040002941129324292692936029265
65004292902200130011000310046462873800016966500840081000400010005000475263228542911729384310500010004000100040002915329273116100110001000010000210000000100020200012833964368213270843202563081381814545328340161221407514861100040002926829331292462933829313
65004293122200160020000310049192899600017007500840081000400010005000474065228832912429259310500010004000100040002920829206116100110001000010000210000000100020300012837970570623045104620342311338208514928431163921403714539100040002925029351292602936129295
65004294692380180013000000045452873100017024500040121000400010005000473966228732910329360310500010004000100040002926529174116100110001000010000210000000100120200013499905070383025844202463105381817525828670162941384014580100040002932729312292882935829225
65004293252200160014000310048702881700016973500840081000400010005000476035228432907129286310500010004000100040002910429205116100110001000010000310000000100020200012784999467993078847202983177381612634528383165501397414440100040002923229264292862930029313

Test 2: throughput

Count: 8

Code:

  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  ld4r { v0.16b, v1.16b, v2.16b, v3.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
400205800696000000018000800391662540015210032005280000100320000800005004000107680008180035080054800560338400100200800003200002008000032000080054800551180201100991001008000080000010080000158001111080011601115000051092171180051066800003200001008005580042800558005580055
400204800545990000017010800391062540015210032005280000100320000800005004000007680008180035080054800540336400100200800003200002008000032000080054800541180201100991001008000080000010080000158001101180011501115000051091171180051066800003200001008005580055800558004280055
40020480041600000001701080039166254001001003200528000010032000080000500400008768000808003508004180054033640010020080000320000200800003200008004180054118020110099100100800008000001008000015800110080011601115000051091171180051166800003200001008005580055800558005580055
40020480054599000001701080039166254001521003200528000010032018080000500400021768000808003508005480041033640010020080000320000200800003200008004180054118020110099100100800008000001008000015800110080011001015000051091171180051006800003200001008004280055800558004280042
4002048005460000000001080026166254001521003200528000010032000080000500400007768000808003508004180054033640010020080000320000200800003200008005680041118020110099100100800008000001008000008001001380010601115010051091171180208060800003200001008077280045800428004280042
4002048005459900000160108002600625400152100320052800001003200008000050040000776800080800353800468005403364001002008000032000020080000320000800548005411802011009910010080000800000100800001580000011800110000000051091171180051166800003200001008006080055800428005580055
40020480054599010001701080039166254001521003200008000010032000080000500400000768000808003508005480054033640010020080000320000200800003200008005480054118020110099100100800008000001008000015800111080000601115000051091171180053106800003200001008005580055800558004280057
400204800416000000000108004116615340010010032005280000100320000800005004000007680008080035080041800540323400100200800003200002008000032000080054800541180201100991001008000080000010080000158001411080011601115000051091171180051066800003200001008005580055800558005780055
40020480054599000003701080039106254001521003200528000010032000080000500400007384000008003508005480054033640010020080000320000200800003200008004180054118020110099100100800008000001008000015800110080000601115000051091171180051060800003200001008005780055800558004280055
40020480054600000000010800391602540015210032000080000100320000800005004000117680008080022080054800540323400100200800003200002008000032000080054800541180201100991001008000080000110080000158001101080011601115000051091171180123066800003200001008005580055800558005580055

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
40002580460602101101010031176100080047006025400062103200528000010320000800005040006510240016080043800628004703444000102080000320000208000032000080062800621180021109101080000800001108000762380027001268002001250610501902117516800591000080000320000108006380289803318004880063
40002480062599101001000070100180032000025400062103200528000010320000800005040003354400240800988006280100034440001020800003200002080000320000800628006211800211091010800008000001080007723800280116800180125061050190817817800590009280000320000108006380063800638004880063
40002480062600101101000032010008004706002540006210320052800001032000080000504000331024001608004380062800620329400010208000032000020800003200008006280062118002110910108000080000010800068238000700178001901260710501901817817800590099280000320000108004880048800638006380063
4000248006259910110000006010008003210602540005810320052800001032000080000504000391024001608002880047800620329400200208000032000020800003200008006280062118002110910108000080000010800067080028022268002001723710501901717186800590099080000320000108004880063800638006380063
400024800626001011010000950100180047066025400062103200488000010320000800005040000210240016080043800478006203444000102080000320000208000032000080062800621180021109101080000800000108000872380027001388002060272370050190617178800440099280000320000108004880048800638006380048
40002480062599100000000031010008003210602540003410320048800001032000080000504000035440028080028800628004703294000102080000320000208000032000080062800621180021109101080000800001108000782480026000268001861707205019012171717800441099080000320000108004880063800638004880048
4000248006260110110100007000008004700002540005810320048800001032000080000504000395440028080043800628006203294000102080000320000208000032000080062800481180021109101080000800001108000772480028000298002061262470050190617617800440099080000320000108006380063800638006380048
4000248006260010110100003101001800470600254000341032005280000103200008000050400039102400160800438006280062034440001020800003200002080000320000800628006211800211091010800008000001080007824800260002680018012506005019017171717800590090280000320000108004880063800638006380063
400024800475991011010000320100080032166025400058103200528000010320000800005040004010240016080043800628006203444000102080000320000208000032000080062800621180021109101080000800000108000882380058100268002001262371050190617178800440099280000320000108006380048800488006380063
40002480047600101100100070100080032166025400062103200248000010320000800005040003310240016080043800628004703294000102080000320000208000032000080062800621180021109101080000800000108000972480025000680018612624710501901617617800591090280000320000108006380063800638006380063