Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LD4R (2D)

Test 1: uops

Code:

  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 6.000

Issues: 6.012

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 4.012

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
660052940822002900210100004554287930001687660124016200040002000100004761862301929151294523106000200040002000800029180291561161001100010001200062002022002406129389190681130301387199623089380812615728405161181304615015200040002925229279292412939329241
660042920821902400250007004525287490001682860124012200040002000100004781672302729038292373106000200040002000800029193291701161001100010001200002000052000024128369106680530451053201113076380515555928386161901344914826200040002929329233292332928029225
660042925121902500290000104604289240001690360124000200040002000100004757642299029133292283106000200040002000800029140291791161001100010001200062002122002426129239038680230701160201473087381518615628452164711331314985200040002931329128292622926029309
660042935022002300180006004553288120201694260124016200040002000100004759872300329196293573106000200040002000800029177290941161001100010000200062002022000406127749105684629861065200903104381515535828391164211325614927200040002920629263292172920929367
660042923421902400240008004469287250021686060124012200040002000100004760232299329131292943106000200040002000800029056291421161001100010001200062000022002006128019076682930111162201313063380615605828424164571329814934200040002916329296292272929629330
66004292892190230024000600453728808020169136012401220004000200010000476006229892906629244310600020004000200080002920929157116100110001000120006200000200200612991921268493032955200653054380913656428417162471347614832200040002927229287293562925629371
660042925921902500180001500460428759002168746008401220004000200010003474263229552899429143310600020004000200080002913029047116100110001000120006200203200040612977902668643035864200663084380911535628397162931322714826200040002925129238292872934129214
66004292542190210024001700458128851000168676008401220004000200010000476479230382902729239310600020004000200080002872729133116100110001000120006200000200442612887912267943113858200443050381514605628388162771314014801200040002931429352292812927729197
660042931521902500190007104602288240201692960124008200040002000100004761282300729104293193106000200040002000800029124290251161001100010001200062000022004426128769023678230951058201753045381613686228510161491335414866200040002925429284293282935429232
66004292142190300018000800457328742020169836008401220004000200010000475163229932899529254310600620004000200080002915829119116100110001000120004200000200200012763925969003095965201893043381310596028446162201328714609200040002926229201293502929929237

Test 2: throughput

Count: 8

Code:

  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0008

retire (01) | cycle (02) | 03 | 08 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
480205800696000000000100800262120025480184100320084160000100320000160000500800000960000018004180060800640034248010020016000032000020016000064000080041800601180201100991001008000080000010016001313430160052001521600006100051091171180061000101600003200001008006180042800658006580065
4802048006059900000420002800500000254801841003200841600001003200001600005008008865029876080045800648004100346480100200160000320000200160000640000800608004111802011009910010080000800000100160013144301600520021216003900323505109117118004110001600003200001008006580065800658006580061
4802048006460000100000028004921200254801841003200841600001003200001600005008004029600000080041800648006400342480100200160000320000200160000640000800418021021802011009910010080000800001100160000035016003600036160036610350510911711800380014141600003200001008006580042800428006580061
4802048006459900000420102800452000254801841003200841600001003200001600005008008531087999608002280064800410032348010020016000032000020016000064000080060800601180201100991001008000080000110016000000016003601035160036603240051091171180038007141600003200001008006580065800658006580065
4802048004160000000420000800260121202548017610032000016000010032000016000050080085338400001800228006480064003264801002001600003200002001600006400008004180060118020110099100100800008000011001600000350160036000361600366136400510911711800380014141600003200001008006180065800428006580042
480204800645990000042000280026215120254801841003200841600001003200001600005008008533840000080045800648006400346480100200160000320000200160000640000800418006011802011009910010080000800000100160000035016000001032160036003640051091171180061100101600003200001008006180061800658006580061
4802048004160000000380000800452120025480184100320000160000100320000160000500800377960000408002280065800640034648010020016015232000020016000064000080064800601180201100991001008000080000110016000000016003600036160036613235051091171180142000101600003200001008006580065800658006580152
480204800646000000042000280049200025480184100320076160000100320000160000500800377960000018004580064800410034248010020016000032000020016000064000080064800601180201100991001008000080000010016000003501600360000160036610350510911711800610014141600003200001008006580065800428006580065
4802048006460000000001028004900120254801841003200001600001003200001600005008003771087999618004580064800640032348010020016000032000020016000064000080060800601180201100991001008000080000110016000000016003600036160000613201510911711800410014141600003200001008006580065800658006580042
480204800646000000087010080049012120254801841003200841605181003200001600005008008531087999608002280041800410034648010020016000032000020016000064000080064800601180201100991001008000080000010016000003501600360203916003661363505109117118006100001600003200001008006580065800428006580061

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
4800258005660011130008004121212254800861032007616000010320000160000508002214133364180037800568005603384800102016000032000020160000640000800568004111800211091010800008000001016000027160024024160000600000050192417221280038106160000320000108005780057800578005780057
48002480056599000391080026212122548007010320000160000103200001600005080000083200001800378005680056034248001020160000320000201600006400008005680056118002110910108000080000010160000016000002716002401242700050192217192480053166160000320000108004580057800438005780057
48002480055600000000800262120254800701032006016000010320000160000508002198320000180022800568005503384800102016000032000020160000640000800568005611800211091010800008000011016000001600240016002461242700050192317202480053166160000320000108005780057800578005780057
4800248004160001130108002621212254800701032006016000010320000160000508002163840000180037800568004103384800102016000032000020160000640000800568005611800211091010800008000001016000001600240016002461242700050192317222380041106160000320000108005780057800578004280042
48002480056599000301080120212122548007010320060160000103200001600005080021983200040800228005680041033848001020160000320000201600006400008004180056118002110910108000080000110160000271600240241600006024010150192517202580053160160000320000108005780057800618005780042
48002580056599000010800412120254802241032006016000010320000160000508002228320000080022800568005683384800102016000032000020160000640000800568005611800211091010800008000011016000027160024024160024612427000501925172125800531850160000320000108005780057800578005780057
4800248005660000030008003221202548007010320060160000103200001600005080021938400000800378005680056033848001020160000320000201600006400008005680056118002110910108000080000010160000271600241271600006124270005019121725208003811086160000320000108005780057800578005780057
480024800565990003010800412121225480070103200001600001032000016000050800000832000008003780056800410342480010201600003200002016000064000080056800411180021109101080000800001101600002716002402416002461027000501925172424800381700160000320000108004280057800578004280042
4800248005660000030108002621202548007010320060160000103200001600005080000083200040800378019880056033848001020160000320000201600006400008005680056118002110910108000080000010160000016000000160024012427000501925172424800531700160000320000108004280057800578005780057
480024800416000000108002621212254800701032005616000010320000160000508002178320000080037800568005603384800102016000032000020160000640000800568005611800211091010800008000001016000027160024024160024612427000501925172525800531910160000320000108005780042800578005780057