Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, uxtw, Q)

Test 1: uops

Code:

  ldr q0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
100540221106111368174192025100010001000154110376382402225324010001000200038340211100110001000010000105505510256155447311611399101061000383403384403403
100438330016200387254020251000100010001467813774024022243260100010002000402402111001100010000100059105505510556155447311611400101061000403403403403403
10044023001610038718402025100010001000154040377402402224326010001000200040238311100110001000010000105505510250155447311611399101061000404403403403403
100440220016100367254032510001000100015471035838240222632601000100020003834021110011000100001000591025025105560550731161139901001000384403383403383
100440220006000367254020251000100010001547113774024052253260100010002000383402111001100010001100059105502510550125447311611399101001000403383403403403
1004402300125103671701920251000100010001458013774004032353240100010002000402382111001100010000100001056055102501250731161139901001000403383403403403
1004402200125103672500212510001000100015471137740242322532601000100020003834021110011000100001000010550551025015544731161138010061000385403383403383
100440220016111367174032510001000100015394137738240322532411000100020004033821110011000100001000010550251055605544731161138001001000383403384403403
1004383300061003872540202510001000100015411137740340522832401000100020004023831110011000100001000591025055105560250731161140210061000403384403384403
1004402200125003672540202510001000100015394137740238322532601000100020004023901110011000100001000010550551025605644731161137901061000383403404403384

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, w7, uxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502051200478991101000120020119519109461256010040100100011000030100100001000010792005736044613381811200110120047120047113143311366750100302001000010000602002000010000120047120047115020110099100401001000010000100100000110000001000011003210113511119646400026081000040100120051120051120048120048120036
502041200478990001010120032119493109463256010040102100011000030100100001000010789995736044613446111200110120047120047113143311365850100302001000010000602002000010000120047120047115020110099100401001000010000100100000110000001000011003210110711119646400026651000040100120048120048120036120048120048
50204120047899000117010120035119519109462256010340102100001000030100100001000010792005736188613381811200283120049120049113141311364050100302001000010000602002000010000120035120047115020110099100401001000010000100100000110000001000011003210110111119660400029651000040100120036120048120048120048120051
502041200508990000000120032119509109449256010340102100011000030100100001000010788625735455613381811200260120035120047113143311366750100302001000010000602002000010000120050120035115020110099100401001000010000100100000010000001000011003210110122119822400026681000040100120036120036120048120048120048
502041200508991101000120020119519109512256010340100100001000030100100001000010788625736044613446111200260120047120047113145311365850100302001000010000602002000010000120035120047115020110099100401001000010000100100000110000001000010003210113511119646400026051000040100120036120051120051120048120048
502041200478990006010120035119493109461256010340100100011000030100100001000010792005736044613381811200230120035120050113143311365850100302001000010000602002000010000120050120047115020110099100401001000010000100100000110000101000211003210113511119660400000001000040100120048120036120048120036120036
50204120050899000100112003211951910946125601034010210001100003010010000100001078999573618861344611120011012003512004711314331136365010030200100001000060200200001000012004712004711502021009910040100100001000010010000011000000100001100321011711119660400009051000040100120048120036120051120048120036
502041200479000001001120032119519109449256010340102100001000030100100001000010788625736044613381811200230120047120047113143311363650100302001000010000602002000010000120050120047115020110099100401001000010000100100000110000001000011003210113511119660400020001000040100120048120036120048120048120048
502041200358990009000120032119493109449256010340102100011000030100100001000010792005735455613381811200110120035120047113141311363650100302001000010000602002000010000120035120076115020110099100401001000010000100100000010000001000011003210110111119646400026001000040100120052120036120098120101120036
502041200478991100000120020119493109461256010040100100011000030100100001000010788625735455613381811200110120047120035113141311363650100302001000010065602002000010000120054120047115020110099100401001000010000100100000110000001000011003210110711119660400026601000040100120036120036120051120048120048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 1f | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200518991110101200361195091094642560013400121000010000300101000010000107951757362366133662001200271200511200351131533113690500103002010000100006002020000100661200351200351150021109104001010000100000101000001000000100001100031401107111196664000210091000040010120052120052120052120052120052
5002412005189901000012003611950910944925600104001210001100003001010000100001079517573545561327571012002712005112005111316931136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000001000000000314011071111966640002101091000040010120052120052120036120052120052
5002412005189901001012003611949910946446600104001210001100003001010000100001079517573623661336620012001112005112005111316931136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000001000011000314011071111965040002101091000040010120052120052120052120052120052
500241200518990010101200201195091094642560010400121000110000300101000010000107955757362366133662001200271200511200351131693113701500103002010000100006002020000100001200511200511150021109104001010000100000101000011000000100001100031401107111196504000001091000040010120052120052120052120052120036
50024120035899011010120036119509109449256001040012100011000030010100001000010795175736236613366200120011120054120035113153311369050010300201000010000600202000010000120035120051115002110910400101000010000010100001100001010000112003140110711119666400000091000040010120052120052120036120036120036
50024120035899011000120036119565109449256001340012100011000030010100001000010795575736236613366200120027120051120035113153311369050010300201000010000600202000010000120037120054115002110910400101000010000010100001100000010000110003140110711119666400020001000040010120052120052120052120036120036
5002412003589901000012003611949210946725600134001010001100003001010000100001079517573623661336620012001112005112005111316931136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000001000010000314011071111965040000101001000040010120052120036120052120052120036
500241200518990110001200201194921094642560010400101000110000300101000010000107958457362366132757001200111200351200761131693113674500103002010000100006002020000100001200511200511150021109104001010000100000101000001000000100001000031401107111196504000201091000040010120036120052120036120052120052
5002412005189901001012003611950910944925600134001210001100003001010000100001079557573623661336620012001112005112005111316931136745001030020100001000060020200001000012005112005111500211091040010100001000001010000110000001000010000314011071111965040000101091000040010120052120052120052120052120036
500241200358990110001200361195091094642560013400101000110000300101000010000107955757362366133662001200271200511200351131693113690500103002010000100006002020000100001200511200351150021109104001010000100000101000001000000100001100031401107111196664000201001000040010120036120054120036120054120052

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, w7, uxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120050899000101001200351195201094682560103401001000010000301001000010000107899957361886133818112001112005012005011314503113658501003020010000100006020020000100001200501200471150201100991004010010000100000100100000110000000100001100203210110711119657400009681000040100120048120111120059120049120051
50204120035899000000001200351195201094632560103401001000110000301001000010000107899957361886133818012002312005012005011314503113667501003020010000100006020020000100001200501200471150201100991004010010000100000100100000110000000100001100003210110111119657400029901000040100120048120036120048120036120051
50204120047899000100001200351195471094492560100401021000010000301001000010000107920057354556133818012002612003512003511314503113667501003020010000100006020020000100001200351200471150201100991004010010000100000100100000010000000100001100003210110711119657400029681000040100120048120106120052120049120051
50204120050899000000001200321195241094632560103401021000110000301001000010000107899957360446133818112001112003512003511314503113667501003020010000100006020020000100001200351200471150201100991004010010000100000100100000110000000100001100003210110711119646400009901000040100120048120083120071120037120096
50204120050899000100001200201195211094632560103401001000110000301001000010000107899957361886136166012002612003512003511314503113667501003020010000100006020020000100001200471200351150201100991004010010000100000100100000010000100100001140003210110111119660400006651000040100120080120080120048120048120039
502041200388990001000101200351195201094632560103401021000010000301001000010000107886257360446133818012001112004712003511314103113667501003020010000100006020020000100001200471200471150201100991004010010000100000100100000110000000100001100003210110711119660400029651000040100120048120126120082120056120048
50204120050899000100001200351195371094632560103401251000010000301001000010000107886257361886133818012001112005012005011314503113667501003020010000100006020020000100001200471200471150201100991004010010000100000100100000010000000100000100003210110111119646400029051000040100120048120102120103120063120051
50204120035899000100101200351195311094612560100401021000110000301001000010000108173657385406133870012001112005012003511314103113658501003020010000100006020020000100001200351200471150201100991004010010000100000100100000110000000100001100003210110721119646400029981000040100120048120094120075120053120051
5020412004789900010012122284120728110288710604394033510050100503352111319112321135017579561561949560121858122475122376113883031011493456423342391133311399618142256811352122346122399261502011009910040100100001000001001000330100011278235100231140003836132821215121448402289681000040100123597123305123815123190122992
5020412346095701131721584012123356121396110548106660514404481005710074345101152311827116033758046916218443012256712301312361911314301401150205763635802117991194069908200001030012087212041441150201100991004010010000100000100100000010000103100001100013210210711119832400020981000040100120048120126120050120048120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c5 | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | df | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50025120053899000100110112003211950510946125600134001210001100083001010000100001079523573604461334990012002312004712003511316524113686500103002010000100006002020000100001200471200471150021109104001010000100000101000001100010550010000110314861107118119663400026651000040010120048120048120048120048120060
5002412003589900100000011200321195051094612560010400121000110000300101000010000107952357360446132757001200231200351200471131663113686500103066710000100006002020000100001200471200351150021109104001010000100000101000001100000430310000000314862107118119662400346651000040010120048120048120036120036120154
50024120052899000000110112003211949210946125600104001210001100003001010211100001079523573545561334990012002312004712004711315331138875001030020100001000060020200001000012004812004711500211091040010100001000001010000011000003601210000110314861107118119666400026651000040010120049120048120051120048120059
5002412003589900000028001120032119604109492806001340012100111000430010103681000010809375740711613653800120023120035120047113153311368650010300201000010000600202000010000120050120419115002110910400101000010000010100000110000080010000110314861107118119650400020651000040010120048120429120036120048120050
50024120047899010040130011204151195511094642560013400121000110000300101005210049108169657375966133499001201731200471200471131653113686500103002010000100006002020000100001200471200471150021109104001010000100000101000000100000390010000110314861107118119663400026651000040010120048120048120048120048120054
5002412004789900000013001120020119505109463256001340012100011000030010100001019710795235736044613982500120023120047120047113153311368650010300201000010000600202000010000120297120035115002110910400101000010000010100000110000030010000100314861107118119662400006651000040010120048120048120048120048120070
5002412005089900000001011200321195051094612560013400121000010000300101000010000107952357361136133756001200231200471200351131533113686500103002010000100006002020000100001200471200471150021109104001010000100000101000000100000660010000100314861107118119662400026051000040010120048120048120036120048120076
5002412004789900000011011200321194921094612560013400121000110000301631000010049107956857412346133499001200231200471200351131653113686500103002010000100006002020000100001200471200505150021109104001010000100000101000001100000390010000110314861107118119650400026601000040010120048120036120036120048120095
50024120047902000000110112003211950510945225600244001010001100003001010000100001088727573618861334990012002312003512004711328731136865001030020100001000060020200001000012004712004711500211091040010100001000001010000211000011901432610000110314861107118119664400026651000040010120048120048120049120058120075
500241200478990000041002120032119552109461256001340012100011000030010100001000010795235736044613349900120023120036120047113153311368650010300201000010000600202000010000120035120047515002110910400101000010000110100004010000000010000110314861107118119664400020601000040010120048120048120036120048120096

Test 4: throughput

Count: 8

Code:

  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  ldr q0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020526723200011100010226727218121625801001008000010080015500116787512668226727267271665061667480114200800242001600482672226707118020110099100100800008000011008000039800000398000061039111511816102673401064800001002672826708267282670826728
802042672720000000000012670721212162580100100800001008001550011675471267012672726727166746166598011520080024200160048267272672211802011009910010080000800001100800004380035142800396100111511816002670400104800001002673126732267082670826728
802042672820010000198010226726218121625802301008000010080020500116720312668226727267271665561667980115200800242001600482670726707118020110099100100800008000001008000008003920800006000111511816002672501064800001002671826723267082673226723
8020427032201000014100012671021801625801001008000010080016500116707512670226707267271665961665980114200800242001600482672226711118020110099100100800008000001008000039800000398003901039111511816002672400104800001002672826728267362673126728
80204267072140010057000226716018120258010010080000100800145001166596126702267272672216635616659801152008002420016004826726267261180201100991001008000080000010080000080035008003960043111511816102676401002800001002671227006273302670827027
8020426707202001110192002266922001625801001008000010080016500116730312668626707267271665561668380116200800242001600482672726722118020110099100100800008000001008000039800351428000061350111511816102678804102800001002670826708267282670826728
80204267312000000045010226723200225801001008000010080015500116730312670226707267071665561665980115200800242001600482672726707118020110099100100800008000001008000039800000398000000043111511816002687200104800001002672326732267272671226714
80204267392000000000002267160121812258010010080000100800155001167303126702267222672216655616674801152008002420016004826727267221180201100991001008000080000010080000398003503580039013539111511816002693300102800001002670826708267282672326732
802042672220000000450001267070012025801001008000010080015500116688812670226727267271663561665980114200800242001600482672726722118020110099100100800008000001008000039800000358003501350111511816002674800104800001002672826708267282672826728
80204267272000100000000267072121216258010010080000100800155001167875126702267272672716635616674801162008002420016004826727267221180201100991001008000080000010080000080040035800396104311151181600268300664800001002672826708267232672826708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80025267272001001012671221121625800101080000108000050116675002668326708267281667631671180010208000020160000267312670811800211091010800008000001080000430800390388003861394405020916026726790010780000102670926732267322673226732
8002426731200004410126716011192580010108000010800005011675010267062673126708166763167118001020800002016000026731267081180021109101080000800000108000043080038038800006039005020716021726728140780000102672926709267292672826732
800242672720000010126716200025800101080000108000050116712402668326731267081667831670780010208000020160000267282672811800211091010800008000011080000430800390398000061000502011160911267431410780000102673226732267092673226732
80024267312000044001267163011625800101080000108000050116675002670626708267081665231668880010208000020160000267272673111800211091010800008000001080000430800390218800396139005020111601011267281410780000102673226709267282673226732
80024267312000044101267160000258001010800001080000501166750026683267312673116676316711800102080000201600002673126708118002110910108000080000010800004308003803880039603900502011160111126835014080000102673226732267092672826728
8002426731200000000266930101925800101080000108000050116712402670626728267281667231671180010208000020160000267312672811800211091010800008000001080000008000003980038613900502011160247268941414780000102673226732267092673226732
800242670820000440012671601200258001010800001080000501167124026706267082673116652316711800102080000201600002680626768118002110910108000080000010800000080038008000061394405020716025112684600780000102672926732267322670926728
80024267312000045101267160111925800101080000108000050116720102670626708267311667631671180010208000020160000267312672811800211091010800008000001080000008003800800396139005020111601711267371414780000102673226728267322670926709
8002426727200004400126716001025800101080000108000050116884302668326708267311665231671180010208000020160000267312670811800211091010800008000001080000430800001608000061000502071602111267121410780000102672826709267092673226732
8002526731200114510126712011216258001010800001080000501167117026706267082672716676316688800102080000201600002673126727118002110910108000080000010800004308003803880038003944050207160196268421010480000102672826729267092672926709