Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (2S)

Test 1: uops

Code:

  ld1r { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire (01) | cycle (02) | 03 | 04 | 09 | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6200529305219410000147252885101172912003100310001000100050001190692264229081293543102000100010001000100029193291671161001100010000100021001001100120312828934069143062072207033076381514575728341162851388915009100010002924829259292872925929298
620042923122050110214532287861117348200210021000100010005000119465225892918629313310200010001000100010002920829253116100110001000010003100000010003031299692686896304816220594302738199585028416164521403015295100010002931629365293212923029388
6200429354219500103146442883710173772003100210001000100050001192912266929217293283102000100010001000100029148292241161001100010000100021001000100020213025926268373095155206133122381719645528427160151376514943100010002933929219292472930329332
6200429271219600002146662883610173852003100010001000100050001193822263729097292753102000100010001000100029225292021161001100010000100001000001100020213152921968363050152206503161381819616328389161561379814987100010002919529296293322937229273
620042936422040000304624288050117218200010021000100010005000119060226082908529401310200010001000100010002919329266116100110001000010000100010110002001280692286868311216020609306738168606028395163511386115041100010002935129377293452932729266
62004292622204000021458528866001728020001000100010001000500011909622621291002933931020001000100010001000292432909811610011000100001000210000044210002121289192526986306106020690318438158585928434164041391215173100010002925129299293952927829377
6200429344219500003045112882300172072003100310001000100050001194462267429265293673102000100010001000100029136292141161001100010000100021000044100021012847917269043088054207193143381512605728373161691372015224100010002930329289293362925629193
620042935822050000314594288110017348200010001000100010005000119075226752910729286310200010001000100010002924029043116100110001000010003100000010002021317493836858314706020612317138166526228538164131377115102100010002925729373293052932929290
620042929221960000014750290021117310200210031000100010005000119440226462914429311310200010001000100010002911229174116100210001000010003100000010002021292593626877305315720697317238108625428531162861375714987100010002931529249291752936729300
620042934422040100214567286910017296200310001000100010005000119440226392914829164310200010001000100010002904929159116100110001000010002100000110002031309290716891306415920638305638199575728426163081380514930100010002922829321292982914929308

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.2s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0060

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6020514005710491110000201001400451394301293712570102401002000410000301002000010000126401266940221431259101400331400601400601318063132395601003020010000200006020010000200001400411400411150201100991004010010000100001100100032110001001410000110100032101114121395684000013100100001000040100140042140042140061140058140042
60204140060104911101002010014004513944412935925701024010020002100003010020000100001263814669402214312591014003314004114005713179931323926010030200100532000060200100002000014006014005721502011009910040100100001000001001000611100500009330610021111110032101139111395714000013130100001000040100140061140061140042140060140058
6020414006010491010000245000014002613943312936825701044010020002100003010020000100001263985669402214309667014001714005714006013180331324016010030390100002000060200100002000014004114005711502011009910040100100001000001001000321100010001100001110100321011391113957140000131010100001000040100140042140058140058140059140061
60204140057104911010002000014004513945612937125701024010020004100003010020000100561264156669435814312591014003614006014006013180631323956010030200100002000060200100002000014004114005711502011009910040100100001000001001000111100020001100001111100321011391113955040000131313100001000040100140042140061140058140058140061
6020414006010491010000845010014004513943012937125701024010020004100003010020000100001264012669324414309667014001714004114006013179931323956010030200100002000060200100002000014006014005711502011009910040100100001000001001000221100020001100001111100321011391113957140000131310100001000040100140061140061140061140061140042
60204140060104910000007010114004513945612936025701044010020004100003010020000100001263814669416614312903014002314005714004113179931324016010030200100002000060200100002000014006014004111502011009910040100100001000001001000321100020021100001111200321011141113955040000101010100001000040100140061140061140042140042140042
60204140041104811100002010014004513943312937325701044010020004100003010020000100001264012669416614309667014003614004114006013180331323926010030200100002000060200100002000014005714004111502011009910040100100001000001001000111100010001100001101000321011391113957140000131310100001000040100140061140042140061140061140061
60205140041104910000002010014004513943012937125701044010020004100003010020000100001263814669324414312591014003614005714005713183431324016010030200100002000060200100002000014006014005711502011009910040100100001000001001000111100030001100001111000321011141113957140000131313100001000040100140061140042140061140052140061
60204140060104910100007450100140045139430129353257010240100200021000030100200001000012640126694022143129030140017140057140060131806313239260100302001000020000602001000020000140041140057115020110099100401001000010000010010001201000100011000001110003210111411139550400000100100001000040100140061140042140042140061140042
60204140060104910000007010014004513943312937125701044010020004100003010020126100001264012669324414312591014001714005714006013179931323956010030200100002000060200100002000014006014005711502011009910040100100001000001001000211100030011100001111100321011391113957140000101010100001000040100140058140058140042140042140061

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire (01) | cycle (02) | 03 | 08 | 09 | 0f | 19 | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600251400501049000018100140035139394129362257001240010200021000030010200001000012644296693685143258290140026014005014005013186831324206001030211100002000060020100002000014005014004711500211091040010100001000001010000110000003100001010314031111113955440000069100001000040010140051140036140051140051140036
60024140035104900001100140112139397129366257001240010200001000030010200001000012644296693685143258290140011014005014003513187731324306001030020100002000060020100002000014003514003511500211091040010100001000001010000010000000100001000314011131113956940000066100001000040010140036140036140051140048140048
6002414005010490000397000140020139394129347257001240010200021000030010200001000012644776693685143258290140023014004714003513184731324336001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000010314011111113955440000999100001000040010140051140051140048140051140051
60024140050104900001100140035139394129347257001040010200001000030010200001000012644296693538143258290140026014005014005013180431324336001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000010314011111113956940000906100001000040010140051140036140051140051140051
60024140050104800000000140035139394129362257001240010200021000030010200001000012644296693685143261120140023014004714004713190231324206001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100001010314011131113956940000909100001000040010140051140036140048140048140051
60024140047104900000100140035139397129362257001240010200051000030010200001000012644296693538143261120140011014005014004713186231324336001030020100002000060020100002000014005014004711500211091040010100001000001010000010000000100001010314011131113956940000960100001000040010140051140051140051140036140051
600241400501049000018100140068139397129347257001040010200021000030010200001000012644776692947143261120140013014003514003513190931324306001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100001010314011111113956940000000100001000040010140036140036140048140036140051
60024140050104900001100140035139397129347257001240010200001000030010200001000012644776692947143261120140011014003514003513185831324336001030020100002000060020100002000014004714004711500211091040010100001000001010000010000000100000000314011111113956940000069100001000040010140036140051140051140036140036
60024140050104900001065100140032139397129359257001040010200001000030010200001000012644436693538143258290140026014005014005013189331324336001030207100002000060020100002000014003514003511500211091040010100001000001010000110000010100001000314011131113956940000960100001000040010140036140051140051140036140051
60024140035104900000001140035139397129347257001240010200021000030158200001000012644776693973143258290140065014005014005013181231324306001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000000314011131113956940000909100001000040010140051140051140036140051140051

Test 3: throughput

Count: 8

Code:

  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602052673220010111021000267220072225160165100800658000010080000800005001167722188019002671826737267376637366731601002008000080000200800008000026715267151180201100990100100800008000001008002121438005910060800006159431920511011611267121313180000800001002673826738267162673826716
1602042671520011010066003267223771925160165100800658000010080000800005001168291188660702669626715267376637366731601002008000080000200800008000026715267371180201100990100100800008000001008002121438005920160800006019431920511011611267341313080000800001002673826738267382671626716
16020426737201111100670032670007012516016510080065800001008000080000500117017918794151267182671526715665936673160100200800008000020080000800002673726737118020110099110010080000800000100800202043800600012180040611943191051101161126734013080000800001002673826716267162673826738
16020426737201110000210032670000712516016210080019800001008000080000500116923518835321266962673726737663736695160100200801898000020080189800002673726715118020110099010010080000800000100800201943800580012280040615843190051101161126712013280000800001002673826716267382673826738
160204267372001101006900226722377192516016510080065800001008000080000500116772218866071266962673726715665936673160100200800008000020080000800002671526737118020110099010010080000800000100800202008005900060800406058431900511011611267341313280000800001002673826738267162671626738
16020426737201100100670022672237719251601651008006580000100800008000050011701791879415126696267372671666383669516010020080000800002008000080000267152673711802011009901001008000080000010080020204380059100608000060580190051101161126734013280000800001002673826716268402671626738
16020426737200111111660012672207702516016510080063800001008000080000500116772218801901267182673726715663736673160100200800008000020080000800002673726715118020110099010010080000800000100800202008005900121800400158019005110116112671200180000800001002673826716267162673826738
160204267372001001002110026722270025160119100800198000010080000800005001167722188402112671826748267456659366731601002008000080000200800008000026737267151180201100990100100800008000001008002019438005900063800006019431900511011611267341313080000800001002673826716267382671626738
1602042673720011010021001267220701251601651008006580000100800008000050011701791879415126724267152673766373667316010020080000800002008000080000267372673711802011009901001008000080000010080021190800591016480042601943190051101161126712013080000800001002671626817267162671626738
1602042673720011111067003267220771925160165100800198000010080000800005001167722188384912671826715267376658366731601002008000080000200800008000026737267371180201100990100100800008000011008002020438005900061800006019431900511011611267121313180000800001002674626743267252671626738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 07 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600252671120010014501012671300121225160010108004580000108000080000501174628188403212668926728267086673036703160010208000080000208000080000267082672311800211091010800008000011080000390800390398000061354350206165426705068000080000102672426729267092672426709
160024267232000000450002267132120122516001010800458000010800008000050117462818840321266892672826728665503670816001020800008000020800008000026708267081180021109101080000800000108000039080000008000061350502061666267256108000080000102670926729267292672426729
16002426708200001145000026693012121625160010108000080000108000080000501168754188595202671826723267326653036703160010208000080000208000080000267232672311800211091010800008000001080000008003500800006035050206166526720008000080000102670926724267092672926709
16002426728200000000002267132120043160055108004580000108000080000501168880188600502672526735267286653036688160010208000080000208000080000267282672811800211091010800008000001080000008003904080035613543502081657267291008000080000102672426729267092672926724
160024267082000110001002671300002516005510800418000010800008000050117462818840320266892672826723667203668816001020800008000020800008000026723267231180021109101080000800000108000000800390080039600435020716662672510108000080000102672426724267292670926709
1600242670820100114101022669300002516001010800418000010800008000050117318318867590266892670826728667303668816001020800008000020800008000026728267231180021109101080000800000108000043080039038800026103950204165426720008000080000102673126900267122672726730
1600242672320000004501002669320121625160010108004580000108000080000501174628188403202668926728267086668036708160010208000080000208000080000267082672311800211091010800008000001080000390800390358003501005020816562672510108000080000102670926729267092682826724
16002426728200000045010126713212121125160051108004580000108000080000501167973188675912668926723267286653036688160010208000080000208000080000267282672311800211091010800008000001080000390800000398003901039502071653267201008000080000102672926729267092670926709
1600242670820000000010226693212002516005510800458000010800008000050116888018833430267092670826708667203668816001020800008000020800008000026708267231180021109101080000800000108000039080039039800356035435020516552670510108000080000102672926724267092672926729
1600242670820000000010126713018181225160051108004180000108000080000501168880188403212668926723267346672036688160010208000080000208000080000267082672311800211091010800008000001080000430800390358000001005020816882672510108000080000102670926729267242670926709