Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 4H)

Test 1: uops

Code:

  ld1r { v0.4h }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.003

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6200530160233019001900300463828576011167113002100010031000100010001000500050011192316226920282332850631030001000100020001000285712837511610011000100011000021000000110012120013269939068963221114120095314038081439412813310001496112138135211000100010002855628499286142855828681
620042875022311611141060047712823200116706300610001006100010001000100050005000119381222671028664286943103000100010002000100228529286591161001100010000100123100400141003222111344494267018314794120016330438081641442797510001528312226132701000100010002854028616286332857228450
6200428548223116011700600485528208011167143006100010061000100010001000500050011195212222230286122898231030001000100020001000284562841411610011000100001002231005001210032231113273942069243070104620109322438191638372809110001515112503136431000100010002846228492285312858628524
6200428724222120111910600484628358011165993005100010061000100010001000500050001194512226060284352859131030001000100020001000286812864311610011000100001002231004001410032441213398939769583150104319958319238112136432803010001496012443134851000100010003025228893288322886928846
6200428890232117101711600461228378011168593006100010061000100010001000500050001194219226180287402890631030001000100020001000287512891911610011000100001002231003001710033431113223941568693164103920241321338141631362811710001579212618138841000100010002865928843289462887228883
6200428882232117111611150047382851501117118300510001007100110001000100050005000119481222584028584285733103000100010002000100028626286171161001100010000100223100800141003244111317292546913311884120104317038201236402812810001555112572136531000100010002863728695288132874028702
62004285842221171014201360046762837311116796300610001005100010001000100050005000119461722555028513286588103000100010002000100028592284631161001100010000100123100500051003223111316893386992311474320008317038051441452804010001566312635139161000100010002875328599285512854728694
620042874722211411171060047102837701116768300610001006100010001000100050005001119457225400285402862432930001000100020001000286012847811610011000100001002331006000410032441013266947469233078114220138324038061340402815710001548912199137981000100010002861328706287112874928741
6200428780222118111210688046132832701116661300610001006100010001001100050005001119442022612028535286243103000100010002000100028605286291161001100010000100042100100011001213001315595526903315064020038322438081044362815510001559912658135511000100010002867528746286812859028663
62004285912220121013006004719283560111671030061000100510001000100010005000500011952722617028614287013103000100010002000100028673286451161001100010000100323100500141003224121311595446947315273920168322138081737442814110001523512675137971000100010002873928787287672871028708

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.4h }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020514005310860000000010000014003913957425801025010020002100004010020000100001245664530437310709276114003201400501400561319703132437701003020010000200006020020000200001400501400471150201100991004010010000100001100100012110001011100001011032101931113972050000069100001000050100140052140051140052140053140037
60204140035108600000000130000014003513955625801025010020002100004010020000100001245637530448710710210114002601400501400501319643132431701003020010000200006020020000200001400501400471150201100991004010010000100000100100010110000100100001000032101931113971950000069100001000050100140051140048140051140054140051
6020414005010860000000010000014003713957125801025010020002100004010020000100001245637530463910710210114002801400501400501319493132431701003020010000200006020020000200001400501400351150201100991004010010000100000100100021110000011100001010032102931113971550000960100001000050100140051140051140051140051140036
602041400531085000000001288000014004213957725801025010020004100004010020000100001245691530392710709276114003501400561400561319703132471701003020010000200006020020000200001400501400501150201100991004010010000100000100100012110002001100001111132101931113971450000900100001000050100140051140048140051140051140051
6020414003510850100000010000014002013957125801185010020002100004010020000100001245637530452510710288114006901400521400501319643132431701003020010040200006020020000200001400501400471150201100991004010010000100000100100012010002021100001001032101931113971450000969100001000050100140057140042140057140058140042
602041400411085001100001340000014003813957225801005010020002100004024320000100001245637530392710710288114001101400351400501319493132432701003020010000200006020020000200001401491400501150201100991004010010000100001100100000110001001100000000032101931013971450000069100001000050100141872143594144089140673140051
6020414003510860100010010000014003213957352801025010020002100004010020000100001245637530456310710210114002601400511401261319643132416701003020010041200006020020000200001400501400471150201100991004010010000100000100100010110000121100001010032101931113982650000069100001000050100140051140052140051140051140051
6020414006210850000000010000014003813957125801025010020002100004024320000100001245637530448710710210114002801400501400521319643132431701003020010000200006020020000200001401421400501150201100991004010010000100000100100032110002003201100000010032101931113972050000909100001000050100140057140057140057140057140057
60204140051108600000000288000014003513957125801025010020002100004010020000100001248428530448710710210114002601400501400501319653132478701003020010000200006020020000200001400501400502150201100991004010010000100000100100001110001010100001010032101931113971650011999100001000050100140057140042140051140146140057
602041400591085100100002000011400421395712580104501002000210000401002000010000124730553047151071021011401030140236140050131970151324317036130200100412016260200201622008314014414023521502011009910040100100001000001001000421100350091601000410102328221031113994050260969100001000050100140142140327140327140229140322

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0056

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6002514005310491111001001401011400411396602580012500102000410000400102000010000124581553077321071781100140032014005614005613200403132461700103002010000200006002020000200001400561400531150021109104001010000100000101000221100010041000011111031403883413972750000969100001000050010140058140042140057140057140054
60024140041108601110000014010014004113964825800145001020004100004001020000100001245842530781010717577001400320140056140041132004014132464700103002010000200006002020000200001400561401491150021109104001010000100000101000120100010111000011110031404884413972750000909100001000050010140057140056140057140057140042
600241400561085010010000201011400411396602580014500102000210000400102000010000124584253077321071757701140029014005314005613200403132464700103002010000200006002020000200001400561400411150021109104001010000100000101000211100030041000011110031404873413971250000069100001000050010140057140057140063140057140054
600241400561085011110000200001400431396602580014500102000710000400102000010000124584253077321071757700140032014005614005613200103132462710453002010000200006002020080200001400531400561150021109104001010000100000101000211100010011000011110031405884413972750000900100001000050010140057140058140057140057140060
60024140150108601010100013400001400421396602580014500102000410000400102007710000124584253077321071757700140032014005614005613199003132464700103002010000200006002020000200811400561400411150021109104001010000100000101000211100011011000001110031403874413972750000600100001000050010140054140054140042140068140046
60024140142108601001000014881001400421396622580014500102000410000400102007910000124585153077321071795700140032014005614005613200403132461700103002010000200006002020000200001401251400561150021109104001010000100000101000121100010011000011110031404884313972850000990100001000050010140057140057140057140057140042
60024140154108601010000013301001400431396602580014500102000410000400102000010000124584253071101072216900140112014005614005613200403132465700103002010049200006002020000200001400411400561150021109104001010000100000101000211100020041000001112031404883313972750010999100001000050010140057140054140057140143140057
6002414005610860110000005300101401361397412580014500102000210000400102000010000124584253077321071757700140041014005614005613200403132464702683002010000200006002020000200001400561400531150021109104001010000100000101000110100011071000011110031404884413971250018969100001000050010140058140144140057140057140054
6002414005610850111000002001014004113966025800125001020004100004001020000100001247856530773210717811001400320140056140056132004031325067104530020100002000060020200002000014014314005311500211091040010100001000001010001111000100318110000111100314041024313972750000900100001000050010140057140042140057140057140054
6002414005310850111110002001014004113966025800145001020004100014001020000100001245842530773210717811001400320140056140041132004031325277001030020100002000060266200002000014005614005311500211091040010100001000001010001111000501317110001111130318611293313995250020969100001000050010140336140238140237140422140240

Test 3: throughput

Count: 8

Code:

  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  ld1r { v0.4h }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c3 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602058004164310010001201008002616612252401328010080018800008010080000800004358978375882449186660080022080041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080007623800260012680018617237000511011611800380800001098000080000801008004280042800428004280042
160204800416421110100490101800261662252401318010080033800008010080000800004359014375882449189100080022080041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080008623800070001380018002423710051101161180038080000998000080000801008004280042800428004280042
160204800416431000100340100800260009252401318010080033800008010080000800004358978375882449188960080022080041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080007823800060002580019612523700051101161280038180000098000080000801008004280042800428004280175
160204800416431001100370100800260062252401218010080006800008010080000800004358986376303749188930080022080041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080007723800000007800196170700051451161180038180000998000080000801008004280042800428004280042
160204800416431000000001008002606682524013180100801228000080100800008000043589823758824491889800800220800418004159924359999240100200800008000020016000080000800418004111802011009910010080000800000100800066238000600034800186126237000511011611800380800009108000080000801008004280042800428004280042
1602048004164210010003700108002616022524013180100800318000080100800008000043590143758824491889600800220800418004159924359999240100200800008000020016000080000800418004111802011009910010080000800000100800077238002600168000061623710051101161180038080000098000080000801008004280042800428004280042
1602048004164310100006001180026106925240132801008003180000801008000080000435898637588244918893008002208004180041599243599992401002008015180000200160000800008004180177118020110099100100800008000001008000872380026010780000612523700051101161180038080000908000080000801008004280042800428004280042
160204800416421100000310010800261664252401068010080031800008010080000800004358966375882449187330080022080041800415992435999924010020080000800002001602888000080041800411180201100991001008000080000410080095023800250002680012612523720051101161180038180000098000080000801008004280042800428004280042
160204800416431100000440000800261661225240131801008003280000801008000080000436245637588244918893008002208004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000872380025010980018612623610051101161180038180000998000080000801008004280042800428004280042
16020480041643100000070010800261661425240404801008003180000801008000080000435901837588254918897008002208004180041599243599992417652008000080000200160000800008004180041118020110099100100800008000001008000882380026000298000061723600051101161180038080000008000080000801008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002580041620000110018010080026100025240010800108001580000801508000080000435841337588244918531800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000008001201013800096013000050205168980038180000968000080000800108004280042800428004280042
16002480041620000000043010080026106425240010800108001980000800108000080000435841737588244918531800228004180041599463600212400102080000800002016028880000800418004111800211091010800008000001080000008000000013800006101700050205168880038080000998000080000800108004280042800428004280042
1600248004162111100007010080026166142524001780010800078000080010800008000043583973758824491842780022800418004159946246002124001020800008000020160000800008004180041118002110910108000080000010800077080026000298001861252361050208168580038080000908000080000800108004280042800428004280042
1600248004162110100003101008002616610252400408001080031800008001080000800004358401375882449187618002280041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800000148001301010800126091700050208168580038080000068000080000800108004280042800428004280042
160024800416200000000180100800261666252400298001080018800008001080000800004358421375882449183838002280041800415994636002124170820800008000020160000800008004180041118002110910108000080000010801810148000000016800136101700050208165880038180000068000080000800108004280042800428004280042
16002480041620000000015901008002606092524001080010800198000080010800008000043584253758824491854880022800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000001480012020138000061121700050208165880038180000668000080000800108004280042800428004280042
160024800416200000000190000800260069252400298001080018800008001080000801344358429375882449183838002280041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800000148001300013800126101700050208165880038180000968000080000800108004280042800428004280042
160024800416200000000210000800261664252400298001080021800008001080000800004358421375882449183838002280041800415994636002124001020800008000020160000800008004180041118002110910108000080000010800000080015000138001361101700050205169580038180000908000080000800108004280042801748004280042
16002480041620000000019000080026166225240028800108001880000800108000080000435843337588244918536800228004180041599463600212400102080000800002016000080000800418004111800211091010800008000001080000014800120101128800130101700050208168580145180000998000080000800108004280042800428004280042
160024800416200000000000108002610652524002980010800198000080010800008000043584333758824491854280022800418004159946360021240010208000080000201600008000080041800411180021109101080000800000108000002080016010380000600000050208165880038180000968000080000800108004280042800428004280042