Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LDR (register, sxtw, Q)

Test 1: uops

Code:

  ldr q0, [x6, w7, sxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
10053943004501379212121625100010001000150373693943942173256100010002000394394111001100010000100043103903910396139437341622391101441000395395398399395
10043982004501379212121625100010001000150183693943942163252100010002000394394111001100010000100043103903910396139437321622391101041000395395395400395
10043943004511379212121625100010001000149893693953942173252100010002000394394111001100010001100043103903910396139447321622391101041000395395400401395
10043943004511379212121325100010001000149893693943942173252100010002000394394111001100010000100043103903910396139437321622391101041000395395395404395
10043942004401379212121625100010001000150373733943942173252100010002000394394111001100010000100043103903910396139437321622391101041000395395395426399
1004394300450137921211625100010001000149893693943942163256100010002000394394111001100010000100043103903910396139437321622391101041000395395395395395
10043943004511379212121625100010001000149893693943942173252100010002000394401111001100010000100043103903910396139437321622391101071000395395398396395
10043943004511379212121625100010001000149893693943942173252100010002000394394111001100010000100043103903910396139437321622391101041000395395395400395
10043943004501379212121925100010001000150183693943942173252100010002000398394111001100010000100043103903910396139437321622391101041000395396399395395
10043943004511379212121625100010001000149893693943942173256100010002000394394111001100010000100043103903910396139437321622391101041000395395396396395

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, w7, sxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120051899300005200001200361195101094672560103401001000010000301001000010000107900857362366136317012001112003512003511323231136585010030200100001000060200200001000012005112003511502011009910040100100001000001001000001100000000100001132100210711119661400020091000040100120036120036120055120055120036
502041200518990000022900001200201195131094672560103401021000110000301001000010000107900857362366136317012002712005412005411320431136715010030200100001000060200200001000012003512003511502021009910040100100001000001001000001100000600100001032100110711119661400020001000040100120059120036120055120055120055
50204120035899000001000012002011949310944925601004010210001100003010010000100001078862573545561363170120027120035120051113195311366850100302001000010000605822000010000120051120051115020110099100401001000010000010010000001000000001000001321001107111196584000013091000040100120055120055120055120055120055
50204120054899000002830100120039119510109464256010340100100011000030100100001000010790355735455613446101200271200541200351131793113671501003020010000100006020020000100001200511200801150201100991004010010000100000100100000110000000010000103210011071111966140002013121000040100120055120055120052120055120036
502041200548990000060100120039119510109464256010040100100011000030100100001000010788625736332613446111200271200511200351131943113658501003020010000100006057420000100001200351200541150201100991004010010000100001100100000010000000010000113210011012111966140002130121000040100120055120055120052120055120055
50204120035899000001010012004411951310944925601004010010001100003010010000100001079035573545561363171120011120054120054113195311367150100302001000010000602002000010000120035120035115020110099100401001000010000010010000001000000001000001321001107111196464000201091000040100120055120055120055120052120055
502041200548990000040000120039119513109467256010040100100011000030100100001000010790085736380613631701200301200541200541132063113671501003020010000100006020020000100001200351200511150201100991004010010000100000100100000110000000010000103210011071111965840002131001000040100120052120055120055120052120055
5020412005489900000000001200391195631094492560103401021000110000301001000010000107886257362366134461012002712003512003511318131139005010030200100001000060200200001000012003612005111502011009910040100100001000011001000001100000100100001032100110111119646400000001000040100120052120036120052120052120055
50204120054899000005160000120036119513109467256010340100100001000030100100001000010790085735455613631701200301200541200981131613113668501003020010000100006020020000100001200541200511150201100991004010010000100000100100000110000000010000103210011071111965840002013121000040100120055120055120052120036120055
50204120035899000000010012003911951010944925601034010010001100003010010000100001079035573545561344610120071120054120035113200311366850100302001000010000602002000010000120054120035115020110099100401001000010000010010000011000000001000011321001107111196614000213001000040100120055120055120036120055120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0056

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d2 | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200538991011100020100120038001195111094552560016400141000210000300101000010000107957557357506133216012002901200561200561131713113695500103002010000100006002020000100001200561200531150021109104001010000100001101000210100030014100001111103140031190211119671400046651000040010120057120057120581120042120584
5002412005689911110000110000120041001195111094692560016400141000110000300101000010000107946357364766133764012003201200531200601131743113680500103002010000100006002020000100001200531200531150021109104001010000100000101000111100030011100001111203140011070021119671400049601000040010120057120042120057120057120057
500241200568991201000020000120026001195141094692560016400121000210000300101005210000107957557364766133917112001701200411200411131713113680500103002010000100006002020000100001200561200561150021109104001010000100000101000532100020002100010201103140011070011119668400046001000040010120042120042120057120042120057
500241200419001100000020000120026001195131094552560016400141000110000300101000010000107960257364766133764112001701200831200601131743113695500103018010000100006002020000100001200561200611150021109104001010000100000101000220100020001100001111003140011070011119672400026051000040010120042120057120057120054120042
50024120056899111000002000012004100119514109469256001640012100021000030010100001005010796025736476613376411200590120056120056113159311369550010300201000010000600202010610000120041120053115002110910400101000010000010100022110002111110000111110314001970111119671400029981000040010120057120054120057120057120042
500241200609001111000070000120026001195141094692560016400121000210000300101000010000107960257357506133917112001701200561200561131743113695500103002010000100006002020000100001200561200531150021109104001010000100000101000121100010001100001101103146011070111119671400040001000040010120057120158120057120057120054
500241200538991001000020000120041001195141094692560016400141000110000300101000010000107962057364766135232112002901200561200531131593113695500103002010000100006033620000100001200611200411150021109104001010000100000101000321100010011100001101103140011070011119676400066051000040010120050120050120050120062120062
500241200618991110000030000120041001195141094555260013400121000210000300101000010000107946357357506133917112003201200561200411131743113680500103002010000100006002020000100001200561200541150021109104001010000100000101000121100010001100001111003140011070011119656400049081000040010120057120057120057120245120057
5002412004889910110001317280100120334001195141094552560016400141000110000300101000010000107946357372136133216012003201200561200531131743113695500103002010000100006002020000100001200581201401150021109104001010000100000101000321100020011100001101003140011070012119668400040081000040010120042120458120054120057120143
5002412005689910110031200001200380011951110946925600164001410002100023001210000100501079629573633261339171120032012005612013311316131137435001030020100001005360020200001000012005612005311500211091040010100001000001010003311000100210100001111103140011070022119656400046981000040010120057120057120042120057120057

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, w7, sxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5020512004789900010200000012003511950910946325601034010210001100023010010000100001078999573618861338180120023012005012005011315831136365010030200100001000060200200001000012005012004711502011009910040100100001000001001000001100000010000110003210110711119657400009681000040100120036120036120051120051120051
5020412009789900000010100012003511951910944925601034010210001100023010010000100001078862573747161338180120026312007012005511323531136365010030200100001000060200201281000012004712003511502011009910040100100001000001001000001100000010000010003210113511119660400029651000040100120051120036120051120048120048
5020412005089900000010000012003511949310946325601004010210001100003024810000100001079008573832061377940120030012003512004711314531136585010030200100001000060200200001000012005012004711502011009910040100100001000001001000001100000010000010003210110111119657400009681000040100120051120036120048120051120051
5020412003590000100010000012002011951410946125601034010010001100003010010000100001078999573618861338180120058012003512005011314531137865010030200100001000060200200001000012005012003521502011009910040100100001000001001000001100000110000010003210110121119657400029951000040100120036120051120051120051120051
5020412003589900000000000012003511949310944925601034010410000100003010010000100001078999573545561361660120034012004712005011314531136675010030200100001000060200200001000012003512004711502011009910040100100001000001001000001100000010000110003210113511119660400000651000040100120051120036120051120036120048
5020412005089900000010000012003511951910946825601034010410001100003010010000100001078999573545561338180120027012003512004711314531136585010030200100001000060200200001000012005012003511502011009910040100100001000001001000001100010010000110003210110711119646400020681000040100120048120048120036120048120051
5020412005089900000010000012002011950910944925601004010210001100003010010000100001078862573545561338180120024012005012005011314131136585010030200100001000060200200001000012004712003511502011009910040100100001000001001000001100000010000012003879131232121545402200881000040100122405122337122472122379122509
5020412244591610100618800011223421208471101817786044340308100551005633381114241132811404535786100620013701221340123021123735114087393115595591353539811981119977057723956116211235331236214115020110099100401001000010000010010010211002408882310039110004186139942122692402859081000040100120036120052120048120036123101
50204120875955010031374470000012003211949510946325601314010010001100003010010000100001078999573618861344611120023012005012003511314531136675010030200100001000060200200001000012004712004711502011009910040100100001000001001000001100000010000110003210210111119657400020601000040100120051120051120051120048120051
5020412003589900000070001012003511951310946325601034010210001100003010010000100001078862573545561338181120017012003512005011314531136365010030200100001000060200200001000012005012003511502011009910040100100001000001001000001100000010000010003210110721119663400029601000040100120048120048120048120051120036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 08 | 0f | 18 | 19 | 1e | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005189900002510120039119509109464256001340012100011000030010100001000010795575736332613496201120027120075120052113169031136945001030020100001000060020200001000012005112005111500211091040010100001000011010005010002242365810002110314011071111966640002101001000040010120054120052120052120052120052
5002412005189900001310120038119509109449256001340012100011000030010100621000010795575736236613366201120027120074120058113227031136795001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000110314011071111966640002101091000040010120052120036120036120052120052
500241200519000000110120036119509109449256002440012100011000030010100001000010795575736236613366201120027120090120052113169031136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000100314011071111966640000101001000040010120052120052120052120052120052
500241200518990000110120037119509109464256001340012100011000030010100001000010795175736236613366200120027120078120052113169031136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000110314011071111966640002101091000040010120052120052120052120036120036
50024120051899004013300120036119509109449256001340012100011000030010100001000010795575736236613366200120027120051120051113169031136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000010314011071111966640002101091000040010120052120055120052120052120036
500241200518990000110120020119509109464256001340012100011000030010100001000010795575736236613366200120027120052120051113169031136745001030020100001000060020200001000012003512003511500211091040010100001000001010000110000000010000110314011071111966640000101091000040010120052120052120052120052120052
500241200518990000010120071119509109464256001340012100011000030010100001000010795575736236613275700120027120072120054113169031136745001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000010314011071111966640000101091000040010120052120052120052120052120052
500241200518990000110120036119509109449256001340012100011000030010100001000010795575735455613275711120011120062120051113169031136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000110314011071111966640002101091000040010120052120052120052120052120055
50024120051899000001012002011950910946425600134001210000100003001010000100001082249573545561336620012002712005312005111316903113690500103002010000100006002020000100001200511200511150021109104001010000100000101000011000000001000010031401107111196664000010091000040010120052120052120052120052120052
500241200519000040010120036119492109464256001040012100011000030010100001000010795175736236613366200120027120054120051113169031136905001030020100001000060020200001000012005112005111500211091040010100001000001010000110000000010000101314011071111966640000101091000040010120052120036120052120052120052

Test 4: Throughput

Count: 8

Code:

  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  ldr q0, [x6, w7, sxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020526722202000045000266923120162580100100800001008002350011672800266822672726734166556166798011520080024200160048267072672211802011009910010080000800000100800000800390039800396135431115118016267240960800001002672826728267332672826728
80204267302000000411022669221212162580100100800001008001650011715221267022672226722166556166798011520080024200160048267272670711802011009910010080000800000100800003980039003980000610431115118016267190062800001002672826708267232672826723
802042670720000004110226707201884258010010080000100800165001167153126702267072672716635616679802932008002420016004826707267221180201100991001008000080000010080000080000000800356139431115118016267190664800001002672826728267282670826728
80204267272000000450002671201812025801001008000010080016500116909802670226727267071665561667480115200800242001600482672726722118020110099100100800008000001008000039800390008003961354311151180162672401062800001002672826723267232672826723
80204267272010000000226712212121225801001008000010080014500116926902668226727267271665561667980115200800242001600482672726707118020110099100100800008000001008000039800000008003960390111511801626724010100800001002670826728267282670826723
8020426707200000045002267120181216258010010080000100800155001170579026702267072670716824616679801142008002420016004826730267221180201100991001008000080000010080000398003900398005161350111511801626815010104800001002672826708267232671226728
80204267272000000450002671221201625801001008000010080016500117309302670226707267221665561667480114200800242001600482670726722118020110099100100800008000001008000039800000035800396135391115118016267190004800001002672326728267232672826723
802042672720000004500026712212002580100100800001008001550011665961267022672726707166351416661801152008002420016004826727267071180201100991001008000080000010080000080000003980035603501115118016267240060800001002670826723267232672826723
80204267272010000410022670700002580100100800001008001650011732251267022672726727166556166798011520080024200160048267272672211802011009910010080000800000100800003980000003980039013501115118016267240000800001002672326728267282672826728
802042670720000004500026712012120258010010080000100800155001172155026702267272672716635616659801162008002420016004826727267221180201100991001008000080000110080000080039003680039603543111511801626704010104800001002672826708267282672826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800252672220000004110126707218181425800101080000108000050116760512677226722267361666731670280010208000020160000267222672211800211091010800008000010800003980035035800356135395020616112671960280000102672326723267232670926723
800242672220000006210126707218181225800101080000108000050116760502669726929267391666931670280010208000020160000267222672211800211091010800008000010800003980035035800356135395020116312671960280000102672326709267232672326723
80024267222000000411012670701818112580010108000010800005011673661267962673126722166673166888001020800002016000026722267081180021109101080000800001080000398003513580035613505020116112671966280000102672326723267232672326723
8002426722201000041001266992001125800101080000108000050116675002680226730267221666731670280010208000020160000267222672211800211091010800008000010800003980165335800356035395020116112672066080000102672326723267232672326723
80024267222000000410002670720181225800101080000108000050116760502679826727267281665231670280010208000020160000267082672211800211091010800008000010800003980036035800356135395020116112671960280000102672326723267092672326723
80024267222000000410002670721818122580010108000010800005011667501267992672726722166673167028001020800002016000026722267221180021109101080000800001080000398003503580035613505020116112671966280000102672326723267232670926723
80024267222000010410012670720181225800101080000108000050117224012679326728267391666731670280010208000020160000267222670811800211091010800008000010800003980035035800350135395020216122671966080000102670926723267232672326709
80024267082000000411012670721818122580010108000010800005011676051268212673926722166673166888001020800002016000026722267221180021109101080000800001080000398003503580035613505020116112670566080000102672326723267092672326723
800242672220000004110126693018012258001010800001080000501167605026889267302672216667316702800102080000201600002672226722118002110910108000080000108000039800350358000061005020116112671966280000102670926726267232672326726
80024267222000000010126707218184125800101080000108000050116760512681526730267271666731670280010208000020160000267222672211800211091010800008000010800003980035038800006135395020116112671966280000102672326709267232672326709