Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD4 (multiple structures, 4H)

Test 1: uops

Code:

  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.008

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.008

retire (01) | cycle (02) | 03 | 07 | 0a | 0b | 0e | 0f | 18 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
66005293002192025000001461528874001691360124008200040002000100004751810230152912629310310600020004000200080002906529083116100110001000020006200002200042412833923368523070775200943060381311525528391163471334214717200040002927429336292522920729244
66004292722192118000001458328875121685160124012200040002000100034752632298329211292393106000200040002000800029205291251161001100010000200042000002000404128619244681130751154201493078380411575528470162891319514834200040002937429264293062923029313
66004292882192419000051461628795201686260124012200040002000100004752282297529139292293106000200040002000800029170290921161001100010000200042000002002424129479273686430421549200793092381013555528420159731340214926200040002933429269293502928529256
6600429293220261900005146712884300168456008400820004000200010000475205229742905829203310600020004000200080002899729162116100110001000020004200000200040413078930568543103948200513083380810485228457163591327514884200040002934729327292842931629268
660042934021922150000614619288762016831600840002000400020001000047532623005290762928631060002000400020008000291892906111610011000100002000420000020004041276392036871303054720040307938118514428405163601324914738200040002923429429293572943729206
6600429317220211710005045432876900168696008400020004000200010000475064230272905229304310600020004000200080002919429236116100110001000020000200000200000412912926568363063105420102308738098555228473162851352414825200040002930529274292652921929292
660042925222022210000014583288260016899600840082000400020001000047528423038290992932231060002000400020008000291542918211610011000100002000420000020004061293191846811302875320077308138108585628379161751317314753200040002937029314293692934329342
6600429429219202400007145422885300169266008400820004000200010000475364229772911329307310600020004000200080002917529185116100110001000020004200000200040412931920268713059135320092310038159596328466162471327414908200040002926129315292672928629261
66004292922192316000051460528850021687760004008200040002000100004749652300129091292453106000200040002000800029255291941161001100010000200042000002000404129429149687830551451200183074380712545928451163421337514930200040002926629255292692926429322
6600429271220202300000145602884300168676008400820004000200010000475284229992912229321310600020004000200080002913229169116100110001000020004200003200040413040918868363100944200403114381212557128480157281331614863200040002930929213292452927329237

Test 2: throughput

Count: 8

Code:

  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
48020580085600100104201280049201252548010010032007216000010032000016000050080037796000001800458006480064003474801002001600003200002001600006400008006080060118020110099100100800008000011001600000016005301521600006132350510911711800381141001600003200001008006580079800618004280042
48020480041599011005401080080049212120254801761003200241600001003200001600005008000003840000180022800648006500342480322200160000320000200160000640000800478006911802011009910010080000800001100160000035160032000160000613200510911711800381101001600003200001008006580066800658006580065
480204800606000001000008004901212025480176100320084160000100320000160000500800853960000018004580064800410034248010020016000032000020016000064000080064800411180201100991001008000080000110016000000160000003616003661000510911711800610141401600003200001008006580065800658006580065
480204800645990000038000800492120025480184100320084160000100320000160000500800853960000008002780041800600034748010020016000032000020016000064000080064800601180201100991001008000080000010016000000160036003616003600320051091171180061013041600003200001008004880048800708004880070
480204800476001110038000800491121202548018410032000016000010032000016000050080000096000001800458004180064003464801002001600003200002001600006400008006480060118020110099100100800008000001001600000016003600016003660035051091171180061001001600003200001008006580065800428006580065
480204800646000000042002800490121202548017610032008416000010032000016000050080085338400001800458006480041003464801002001600003200002001600006400008004180060118020110099100100800008000001001600130016003600481600366132400510911711800380141401600003200001008006580066803348006180042
480204800646000000042002800490121202548018410032008416000010032000016000050080085310879996180022800648004100323480100200160000320000200160000640000800418004111802011009910010080000800000100160000035160000003616000061360051091171180061001401600003200001008007080061800658004280065
480204800646000000038010800492121202548010010032008416000010032000016000050080085338400001800458006480064003464801002001600003200002001600006400008006480060118020110099100100800008000001001600151335160036001316000061134305109117118003800001600003200001008006580065800658006580065
480204800646000000042010800292121202548012410032006416000010032000016000050080118610880752180045800418004100346480100200160000320000200160000640000800648006011802011009910010080000800000100160000001600360036160036005101251091171180038001001600003200001008004880065800658004280065
48020480041600000004201280049201202548018410032000016000010032000016000050080085338400001800458004180065003464801002001600003200002001600006400008004180060118020110099100100800008000011001600000351600360036160036603240051091171180038001401600003200001008006580065800618006580065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 23 | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480025800696001000381080045212120254800861032007616000010320000160000508003749600000180041800608006034248001020160000320000201600006400008006080060118002110910108000080000010160000351600000032160032610350050192617027188005711010160000320000108006180061800618006180061
480024800606000000381080045212002548008610320076160000103200001600005080037296000001800418004180060342480010201600003200002016000064000080060800601180021109101080000800000101600000160032000160032603200050191717016278005711010160000320000108004280061800618006180061
48002480060600000012510800452121202548008610320076160000103200001600005080040296000001800418006080060342480010201600003200002016000064000080060800601180021109101080000800000101600003516003200321600326132350050191517028158005711010160000320000108006180061800618006180042
4800248006059900003810800452121202548008610320076160000103200001600005080037492800121800418006080060342480010201600003200002016000064000080060800601180021109101080000800000101600003516003200321600326132350050192717014258005711010160000320000108006180061800618006180060
48002480041600000038108004520120254800861032007616000010320000160000508003773840000180041800608006034248001020160000320000201600006400008006080060118002110910108000080000010160000016003200321600320132350050191517026188003811010160000320000108006180061800618004280061
4800248006060000103810800452121202548008610320076160000103200001600005080000096000001800418006080060342480010201600003200002016000064000080060800601180021109101080000800000101600003516003200321600326132350050192617027308005711010160000320000108006180061800618006180061
4800248006060000003810800262121202548008610320000160000103200001600005080037496000001800418006080060342480010201600003200002016000064000080060800601180021109101080000800000101600003516003200321600320132350050192517027188005711010160000320000108004280061800618006180061
480024800606000000381080045201202548008610320076160000103200001600005080037496000001800418006080060342480010201600003200002016000064000080060800411180021109101080000800001101600003516003200321600326132350050192617025278005711010160000320000108006180147800618004280061
4800248004159900003810800452121202548008610320076160000103200001600005080037738400001800418006080060342480010201600003200002016000064000080060800411180021109101080000800000101600003516003100321600326132350050192617028188005711010160000320000108006180061800428006180042
480024800606000000381080045212120254800861032007616000010320000160000508003729600000180041800608004134248001020160000320000201600006400008004180060118002110910108000080000010160000351600320001600326132350050192817027288005711010160000320000108006180061800618006180042