Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (2S)

Test 1: uops

Code:

  ld1r { v0.2s }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.002

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
6200529305219410000147252885101172912003100310001000100050001190692264229081293543102000100010001000100029193291671161001100010000100021001001100120312828934069143062072207033076381514575728341162851388915009100010002924829259292872925929298
620042923122050110214532287861117348200210021000100010005000119465225892918629313310200010001000100010002920829253116100110001000010003100000010003031299692686896304816220594302738199585028416164521403015295100010002931629365293212923029388
6200429354219500103146442883710173772003100210001000100050001192912266929217293283102000100010001000100029148292241161001100010000100021001000100020213025926268373095155206133122381719645528427160151376514943100010002933929219292472930329332
6200429271219600002146662883610173852003100010001000100050001193822263729097292753102000100010001000100029225292021161001100010000100001000001100020213152921968363050152206503161381819616328389161561379814987100010002919529296293322937229273
620042936422040000304624288050117218200010021000100010005000119060226082908529401310200010001000100010002919329266116100110001000010000100010110002001280692286868311216020609306738168606028395163511386115041100010002935129377293452932729266
62004292622204000021458528866001728020001000100010001000500011909622621291002933931020001000100010001000292432909811610011000100001000210000044210002121289192526986306106020690318438158585928434164041391215173100010002925129299293952927829377
6200429344219500003045112882300172072003100310001000100050001194462267429265293673102000100010001000100029136292141161001100010000100021000044100021012847917269043088054207193143381512605728373161691372015224100010002930329289293362925629193
620042935822050000314594288110017348200010001000100010005000119075226752910729286310200010001000100010002924029043116100110001000010003100000010002021317493836858314706020612317138166526228538164131377115102100010002925729373293052932929290
620042929221960000014750290021117310200210031000100010005000119440226462914429311310200010001000100010002911229174116100210001000010003100000010002021292593626877305315720697317238108625428531162861375714987100010002931529249291752936729300
620042934422040100214567286910017296200310001000100010005000119440226392914829164310200010001000100010002904929159116100110001000010002100000110002031309290716891306415920638305638199575728426163081380514930100010002922829321292982914929308

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.2s }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0060

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020514005710491110000201001400451394301293712570102401002000410000301002000010000126401266940221431259101400331400601400601318063132395601003020010000200006020010000200001400411400411150201100991004010010000100001100100032110001001410000110100032101114121395684000013100100001000040100140042140042140061140058140042
60204140060104911101002010014004513944412935925701024010020002100003010020000100001263814669402214312591014003314004114005713179931323926010030200100532000060200100002000014006014005721502011009910040100100001000001001000611100500009330610021111110032101139111395714000013130100001000040100140061140061140042140060140058
6020414006010491010000245000014002613943312936825701044010020002100003010020000100001263985669402214309667014001714005714006013180331324016010030390100002000060200100002000014004114005711502011009910040100100001000001001000321100010001100001110100321011391113957140000131010100001000040100140042140058140058140059140061
60204140057104911010002000014004513945612937125701024010020004100003010020000100561264156669435814312591014003614006014006013180631323956010030200100002000060200100002000014004114005711502011009910040100100001000001001000111100020001100001111100321011391113955040000131313100001000040100140042140061140058140058140061
6020414006010491010000845010014004513943012937125701024010020004100003010020000100001264012669324414309667014001714004114006013179931323956010030200100002000060200100002000014006014005711502011009910040100100001000001001000221100020001100001111100321011391113957140000131310100001000040100140061140061140061140061140042
60204140060104910000007010114004513945612936025701044010020004100003010020000100001263814669416614312903014002314005714004113179931324016010030200100002000060200100002000014006014004111502011009910040100100001000001001000321100020021100001111200321011141113955040000101010100001000040100140061140061140042140042140042
60204140041104811100002010014004513943312937325701044010020004100003010020000100001264012669416614309667014003614004114006013180331323926010030200100002000060200100002000014005714004111502011009910040100100001000001001000111100010001100001101000321011391113957140000131310100001000040100140061140042140061140061140061
60205140041104910000002010014004513943012937125701044010020004100003010020000100001263814669324414312591014003614005714005713183431324016010030200100002000060200100002000014006014005711502011009910040100100001000001001000111100030001100001111000321011141113957140000131313100001000040100140061140042140061140052140061
60204140060104910100007450100140045139430129353257010240100200021000030100200001000012640126694022143129030140017140057140060131806313239260100302001000020000602001000020000140041140057115020110099100401001000010000010010001201000100011000001110003210111411139550400000100100001000040100140061140042140042140061140042
60204140060104910000007010014004513943312937125701044010020004100003010020126100001264012669324414312591014001714005714006013179931323956010030200100002000060200100002000014006014005711502011009910040100100001000001001000211100030011100001111100321011391113957140000101010100001000040100140058140058140042140042140061

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 0f | 19 | 1e | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
600251400501049000018100140035139394129362257001240010200021000030010200001000012644296693685143258290140026014005014005013186831324206001030211100002000060020100002000014005014004711500211091040010100001000001010000110000003100001010314031111113955440000069100001000040010140051140036140051140051140036
60024140035104900001100140112139397129366257001240010200001000030010200001000012644296693685143258290140011014005014003513187731324306001030020100002000060020100002000014003514003511500211091040010100001000001010000010000000100001000314011131113956940000066100001000040010140036140036140051140048140048
6002414005010490000397000140020139394129347257001240010200021000030010200001000012644776693685143258290140023014004714003513184731324336001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000010314011111113955440000999100001000040010140051140051140048140051140051
60024140050104900001100140035139394129347257001040010200001000030010200001000012644296693538143258290140026014005014005013180431324336001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000010314011111113956940000906100001000040010140051140036140051140051140051
60024140050104800000000140035139394129362257001240010200021000030010200001000012644296693685143261120140023014004714004713190231324206001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100001010314011131113956940000909100001000040010140051140036140048140048140051
60024140047104900000100140035139397129362257001240010200051000030010200001000012644296693538143261120140011014005014004713186231324336001030020100002000060020100002000014005014004711500211091040010100001000001010000010000000100001010314011131113956940000960100001000040010140051140051140051140036140051
600241400501049000018100140068139397129347257001040010200021000030010200001000012644776692947143261120140013014003514003513190931324306001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100001010314011111113956940000000100001000040010140036140036140048140036140051
60024140050104900001100140035139397129347257001240010200001000030010200001000012644776692947143261120140011014003514003513185831324336001030020100002000060020100002000014004714004711500211091040010100001000001010000010000000100000000314011111113956940000069100001000040010140036140051140051140036140036
60024140050104900001065100140032139397129359257001040010200001000030010200001000012644436693538143258290140026014005014005013189331324336001030207100002000060020100002000014003514003511500211091040010100001000001010000110000010100001000314011131113956940000960100001000040010140036140051140051140036140051
60024140035104900000001140035139397129347257001240010200021000030158200001000012644776693973143258290140065014005014005013181231324306001030020100002000060020100002000014005014004711500211091040010100001000001010000110000000100000000314011131113956940000909100001000040010140051140051140036140051140051

Test 3: throughput

Count: 8

Code:

  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  ld1r { v0.2s }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602052673220010111021000267220072225160165100800658000010080000800005001167722188019002671826737267376637366731601002008000080000200800008000026715267151180201100990100100800008000001008002121438005910060800006159431920511011611267121313180000800001002673826738267162673826716
1602042671520011010066003267223771925160165100800658000010080000800005001168291188660702669626715267376637366731601002008000080000200800008000026715267371180201100990100100800008000001008002121438005920160800006019431920511011611267341313080000800001002673826738267382671626716
16020426737201111100670032670007012516016510080065800001008000080000500117017918794151267182671526715665936673160100200800008000020080000800002673726737118020110099110010080000800000100800202043800600012180040611943191051101161126734013080000800001002673826716267162673826738
16020426737201110000210032670000712516016210080019800001008000080000500116923518835321266962673726737663736695160100200801898000020080189800002673726715118020110099010010080000800000100800201943800580012280040615843190051101161126712013280000800001002673826716267382673826738
160204267372001101006900226722377192516016510080065800001008000080000500116772218866071266962673726715665936673160100200800008000020080000800002671526737118020110099010010080000800000100800202008005900060800406058431900511011611267341313280000800001002673826738267162671626738
16020426737201100100670022672237719251601651008006580000100800008000050011701791879415126696267372671666383669516010020080000800002008000080000267152673711802011009901001008000080000010080020204380059100608000060580190051101161126734013280000800001002673826716268402671626738
16020426737200111111660012672207702516016510080063800001008000080000500116772218801901267182673726715663736673160100200800008000020080000800002673726715118020110099010010080000800000100800202008005900121800400158019005110116112671200180000800001002673826716267162673826738
160204267372001001002110026722270025160119100800198000010080000800005001167722188402112671826748267456659366731601002008000080000200800008000026737267151180201100990100100800008000001008002019438005900063800006019431900511011611267341313080000800001002673826716267382671626738
1602042673720011010021001267220701251601651008006580000100800008000050011701791879415126724267152673766373667316010020080000800002008000080000267372673711802011009901001008000080000010080021190800591016480042601943190051101161126712013080000800001002671626817267162671626738
1602042673720011111067003267220771925160165100800198000010080000800005001167722188384912671826715267376658366731601002008000080000200800008000026737267371180201100990100100800008000011008002020438005900061800006019431900511011611267121313180000800001002674626743267252671626738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600252671120010014501012671300121225160010108004580000108000080000501174628188403212668926728267086673036703160010208000080000208000080000267082672311800211091010800008000011080000390800390398000061354350206165426705068000080000102672426729267092672426709
160024267232000000450002267132120122516001010800458000010800008000050117462818840321266892672826728665503670816001020800008000020800008000026708267081180021109101080000800000108000039080000008000061350502061666267256108000080000102670926729267292672426729
16002426708200001145000026693012121625160010108000080000108000080000501168754188595202671826723267326653036703160010208000080000208000080000267232672311800211091010800008000001080000008003500800006035050206166526720008000080000102670926724267092672926709
16002426728200000000002267132120043160055108004580000108000080000501168880188600502672526735267286653036688160010208000080000208000080000267282672811800211091010800008000001080000008003904080035613543502081657267291008000080000102672426729267092672926724
160024267082000110001002671300002516005510800418000010800008000050117462818840320266892672826723667203668816001020800008000020800008000026723267231180021109101080000800000108000000800390080039600435020716662672510108000080000102672426724267292670926709
1600242670820100114101022669300002516001010800418000010800008000050117318318867590266892670826728667303668816001020800008000020800008000026728267231180021109101080000800000108000043080039038800026103950204165426720008000080000102673126900267122672726730
1600242672320000004501002669320121625160010108004580000108000080000501174628188403202668926728267086668036708160010208000080000208000080000267082672311800211091010800008000001080000390800390358003501005020816562672510108000080000102670926729267092682826724
16002426728200000045010126713212121125160051108004580000108000080000501167973188675912668926723267286653036688160010208000080000208000080000267282672311800211091010800008000001080000390800000398003901039502071653267201008000080000102672926729267092670926709
1600242670820000000010226693212002516005510800458000010800008000050116888018833430267092670826708667203668816001020800008000020800008000026708267231180021109101080000800000108000039080039039800356035435020516552670510108000080000102672926724267092672926729
1600242670820000000010126713018181225160051108004180000108000080000501168880188403212668926723267346672036688160010208000080000208000080000267082672311800211091010800008000001080000430800390358000001005020816882672510108000080000102670926729267242670926709