Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, lsl, Q)

Test 1: uops

Code:

  ldr q0, [x6, x7, lsl #4]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0f | 1e | 3a | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | a0 | a1 | a6 | a7 | a9 | ac | af | b5 | bb | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
2006383310000003610252000100010001000100050001429803513763767431132000100010001000200037637611100110001000100001000000100000007311611381100010001000385384384384384
20043833100000036202520001000100010001000500014298035738438380311620001000100010002000383384111001100010001020201019222010000191907311611380100010001000385384384385384
2004384310000013610252000100010001000100050001429813513763767431092000100010001000200037637611100110001000100001000000100000007311611373100010001000377377377377377
20043763000004703680252000100010001000100050001464003513763767431092000100010001000200037637611100110001000100001000006100000007311611373100010001000377377377377377
200437630000000361025200010001000100010005000142980468383383813116200010001000100020003833831110011000100010201910192120100000007311611373100010001000377377377377377
20043912000000036122520001000100010001000500014598135738438380311620001000100010002000383383111001100010001019191019012010000191917311611380100010001000384385384384385
200438431011020136802520001000100010001000500014605035838338380311620001000100010002000383384111001100010001019201019012010000191927311611380100010001000384384384385384
20043832100002013681252000100010001000100050001464003513763767431092000100010001000200037637611100110001000100001000000100000007311611373100010001000377377377377377
200437630010020036802520001000100010001000500014548035838338280311720001000100010002000383383111001100010001021191019112010000191907311611381100010001000384385384384384
2004384310000240361025200010001000100010005000142980351376376743109200010001000100020003763761110011000100010002100000010000191927311611380100010001000385384384385385

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, x7, lsl #4]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 04 | 05 | 08 | 09 | 0b | 1e | 3a | 3f | 43 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a6 | a8 | a9 | ac | af | b5 | bb | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602061200678990112123120036111950798283257010650104100021000040100100001000011748515715676589817411200270120051120051111892311245860100402001006510000702002000010000120051120051115020110099100401001000010000100100022100020221000102110321596799119663500041000050100120052120098120113120055120052
602041200519000100123120036111950898283257010650104100021000040100100001000011748335715676589812411200270120051120051111892311245860400402001000010000702002000010000120051120051115020110099100401001000010000100100012100030121000102110321596799119663500041000050100120052120052120052120052120052
60204120051899011002312003611195079828325701095016710002100004010010000100001174842571582058981241120027012005112005111189231124586010040200100001006070200200001000012005112008211502011009910040100100001000010010002210004412100010210032154671011119663500041000050100120052120052120053120052120052
60204120051899011012312003611195099828425701065010410002100004010010000100001174833571567658981241120027012005112005111189231124586010040200100001000070200200001000012005112005811502011009910040100100001000010010003210004012100010211032151067910119663500041000050100120052120052120052120052120052
6020412005190001000231200361119507982832570106501041000210000401001000010000117483357156765898124112002701200511200521118953112458601004020010000100007020020000100001200511200511150201100991004010010000100001001000211000301210001121103215967410119663500041000050100120052120052120052120052120052
602041200998990100123120036111952798284257010650104100021000040100100001000011748335715676589812401200270120051120052111892311245860100402001000010000702002000010000120051120051115020110099100401001000010000100100012100020021810001121003215967109119663500041000050100120052120052120052120052120052
602041200518990100123120036111950799316257010650104100021000040100100001000011748335715676589812411200270120051120051111892311245860100402001000010000702002000010000120051120051115020110099100401001000010000100100032100020121000102110321596799119663500041000050100120052120052120052120052120052
602041200518990100123120036111950798283257010650104100021000040100100001005811748605720099589812411200270120051120051111892311245860405402001000010000702002000010000120051120051115020110099100401001000010000100100012100030121000102110321596749119663500041000050100120052120052120052120052120052
6020412005189901011231200361119507982832570106501041000210002401001000010000117483357156765898124112002701200521200511118923112458601004020010000100007020020000100001200511200511150201100991004010010000100001001000211000200210001021203215467109119663500041000050100120052120052120052120052120052
602041200519000101023120036111950798283257010650104100021000040100100001000011748335715676589812411200270120051120051111892311245860100402001000010000702002000010000120051120051115020110099100401001000010000100100012100040121000102120321556784119663500041000050100120052120052120052120052120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0037

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 18 | 19 | 1e | 1f | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a6 | a8 | a9 | ac | af | bb | dtlb miss (c1) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002612004389900100010120028119498999662570013500121000110000400101000010000111982057191255902882012001931200431200431119413112442600104002010000100007044020000100001200431200431150021109104001010000100001010003210002011100001110314049733119676500021000050010120044120044120044120044120044
6002412004389910100010120028119564999662570013500121000110000400101000010000111982057191255902882012001901200431200431119353112442600104002010000100007002020000100001200461200461150021109104001010000100001010001110002021100001110314039724119676500021000050010120044120044120044120044120044
6002412004389910000000120022119491983272570010500101000010000400101000010000120358757139805897195012001301200371200371119273112480600104002010000100007002020000100001200371200371150021109104001010000100001010000010000000100000000314036523119696500001000050010120038120038120038120038120038
6002412003789900000000120022119627983272570010500101000010000400101000010000120358757139805897195012001301200371200371119273112475600104002010000100007002020000100001200371200371150021109104001010000100001010000010000000100000000314036533119659500001000050010120038120038120038120038120038
6002412003789900000000120022119491983272570010500101000010000400101000010000120358757139805897195112001331200861200421119273112468600104002010000100007002020000100001200731201001150021109104001010000100001010000010000000100000000314026532119659500001000050010120038120038120038120038120038
6002412003789900000000120022119491983272570010500101000210000400101000010000118236057139805897195012005701200371200371119273112468600104002010000100007002020000100001200371200371150021109104001010000100001010000010000000100000001314026534119659500001000050010120038120041120038120038120038
6002412003889900000000120022119491983272570010500101000010000400101000010000122881757139805897195112001301200371200371119273112468600104002010000100007002020000100001200371200371150021109104001010000100001010000010000000100000000314036533119659500001000050010120038120072120049120058120038
6002412003789900000000120022119491983272570010500101000010000400101000010000120358757139805897195212001301200371200371119273112468600104002010000100007002020000100001200371200371150021109104001010000100001010000010000023100000000314036543119659500001000050010120038120038120038120038120038
60024120037899000000120120023119491983282570010500101000010000400101000010000120376757139805900485012020301202521200371119273112468600104022910000100537002020208100001203511200372150021109104001010000100001010000010000003100000000314026524119661500001000050010120038120038120038120038120038
600241200378990000001201200221194939784425700105001010000100004001010055100001203587571329658973971120016012003712005611192931124716001040433101021000071468200001000012014112014341500211091040010100001000010100000100000024100000000314026534119659500001000050010120038120038120038120038120038

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr q0, [x6, x7, lsl #4]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0035

retire (01) | cycle (02) | 03 | 0b | 0e | 19 | 1e | 22 | 23 | 3f | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60206140047104900000014002013955625701035010010000100004010010000100001245607532392753453491400111400351400831319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000003210019311139699500000001000050100140036140036140036140036140036
60204140035108500000014002013955625701005010010000100004010010000100001245607532392753453491400111400851400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000003210019311139699500000001000050100140036140036140036140036140036
60204140035104900000014002013955677701005010010000100004010010000100001245661532392753453871400111400351400351319493132416601004020010000100007020020000100001400471400351150201100991004010010000100001001000040100000010000003210019311139703500000001000050100140036140036140036140036140036
60204140035104800000014002013955625701005010010000100004010010000100001245607532392753453491400111400351400351319503132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000103210019311139702500000001000050100140036140036140036140036140036
60204140035104900000014002013955625701005010010000100004010010000100001245607532392753453491400231400351400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000003210019311139702500000001000050100140036140036140036140036140036
602041400351049000930014002013955625701005010010000100004010010000100001245607532392753453491400111400351400471319613132416601004020010000100007020020000100001400811400431150201100991004010010000100001001000000100000010000003210019311139702500000001000050100140036140036140041140036140036
60204140035104900000014002013955625701005010010000100004010010000100001245607532392753453491400111400351400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000910000003210019311139700500000001000050100140036140037140036140036140036
60204140035104900001014002013955625701005010010000100004010010045100001245611532392753469701400561400351400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000003210019311139699500000001000050100140036140036140036140036140036
602041400351049010150014002013955625701005010010000100004010010000100001245607532392753453491400111400351400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000010000003210019311139699500000001000050100140036140036140036140036140036
60204140035104900000014002013955625701005010010000100004010010000100001245607532392753453491400111400351400351319493132416601004020010000100007020020000100001400351400351150201100991004010010000100001001000000100000310001003210019311139699500110001000050100140384140036140036140036140402

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0035

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a6 | a8 | a9 | ac | af | c2 | branch mispredict (cb) | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002614003510490000000140020139649257001050010100001000040010100001000012457535326922534955701400111400351400351320043132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000000100000003140000988102213970550000001000050010140072140039140036140036140037
6002414003510490000000140020139639257001050010100001000040433100001000012457535326922534955711400111400351401461319833132443600104002010000100007002020000100491400351400351150021109104001010000100001010000010000000100020003140000288002213970550000001000050010140113140062140037140086140036
600241400351049000000014002013971125700105001010000100004001010000100001245753532692253495571140015140035140035131983313244360010400201000010000700202000010000140035140035115002110910400101000010000101000001000010010000200386900043630041214199250296001000050010143024142867142864143098142993
60024142955107110323242992904514314614114491570495503791007410050456481131011014133189553982275426545114300014347314351513316843413456768507477971109011964834362385411912144545144339491500211091040010100001000010100492100031012691100004004037000288006114173150006001000050010140038140036140036140036140036
6002414011611140100002140111139639257001050010100001000042282100401000012457535326922534955711431271400351401311319832813244360010400201000010040700202000010000140077140035315002110910400101000010000101000001000089001000000032631002148002213970550000001000050010140036140036140383140036140036
600241400351049000005980140020139639257001050010100001000040010100001000012457535327040534987201400111400351400351319833132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000000100000003140000288002213979650000001000050010140036140036140036140036140036
6002414003510480000000140020139639257001050010100001000040010100001000012457535326922534955701400111400351400351319833132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000006100000003140000288002213975350000001000050010140036140036140036140036140036
6002414003510490000000140020139639257001050010100001000040010100001000012457535326922534955701400111400351400351319833132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000000100000003140000288002213978450000001000050010140036140036140036140036140036
6002414003510490000300140020139639257001050010100001000040010100001000012457535326922534955701400111400351400351319833132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000000100000003140000288002213975550000001000050010140036140036140036140036140073
6002414003510490000000140020139639257001050010100001000040010100001000012457535326922534955701400161400351400351319833132443600104002010000100007002020000100001400351400351150021109104001010000100001010000010000000100000003140000217002213970950000001000050010140036140036140036140036140036

Test 4: throughput

Count: 8

Code:

  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  ldr q0, [x6, x7, lsl #4]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602062674820111010000021010126703077125160118801178000080120800174006061169543126713267182671866636666716014280220800248022416004826896267921180201100990100100800008000001008001921008005910164800390019019011151172161126737800161313180000801002674126719267412671926719
160204267182001101000006600012672507722516011780116800008012480016400600116876502671326739267406663666891601368022480024802241600482689726788118020110099010010080000800000100800202100800191106480000601901911115117116112671580016013080000801002674126741267412674126741
160204267182001121100002000022670307022516011280116800008012080017400600117151302669126718267406641666671601438022080024802241600402674926721118020110099010010080000800000100800202000800591012080000015844190111511711611267378001600180000801002674126741267412674126741
1602042671820011010000020010026703000025160118801168000080120800184005951168765026691267182671866416666716013680224800248022416004826740267401180201100990100100800008000001008001819430800590102080000001901911115117116112672580016013180000801002671926741267412671926741
1602042674020111110000020010026703000025160116801168000080124800164005961163174326923267402674066416668816014380224800248022416004826925267981180201100990100100800008000001008002019440800601012080040005944192111511711611267158001600180000801002671926741267192674126719
1602042671820211011000067010026703377225160116801168000080124800194005881164637126691267402674066416668816013480224800248022416004826871267831180201100990100100800008000001008002020008001911220800006019019111151171161126715800161313180000801002671926741267412674126719
160204267402001111100006700002672527719251601168011680000801208001840060411695430267132671826718664166667160142802248002480224160048268222685211802011009901001008000080000010080020204308001901163800400058019111151171161126737800161313180000801002674126719267192671926719
160204267182001111100002200012670500002516011780116800008012080017400593116317402671326718267186641666671601378022480024802241600402677926817118020110099010010080000800000100800192100800600112080040601944191111511711611267378017600180000801002674126722267222671926743
16020426718200111100000200001267030772251601178011680000801248001540059211647810267132671826718666366688160143802248002480224160048269242676811802011009901001008000080000010080019194408005911063801716019441901115117116112671580016013080000801002671926719267412671926719
1602042674020711110000011101012672707722516011680116800008012480015400604117151302671326718267186663666661601378022480024802201600482674026740118020110099010010080000800000100800192000800601016480000001944190111511711611267318001600180000801002671926741267412674126720

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 67 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002626723200111010002003267230700251600108001080000800108000040005011686810026691267162671666613669616001080020800008002016000026716267161180021109010108000080000110800192044800190006380000611944192050203165332673280000130080000800102671726739267172671726717
160024267382001011010020002670500022516001080010800008001080000400050117211302267142673826738666136696160010800208000080020160000267162671611800211090101080000800000108001920080061101618000000190190050203166432671380000013080000800102671726717267202672026718
160024267172021010000018201269390000251600108001080000800108000040005011765871026691267162671666613669616001080020800008002016000026716267161180021109010108000080000010800201908001921120800000019019105020316733269738000000080000800102671726717267172671726717
1600242671620010110000200126701000025160010800108000080010800004000501168681112669126716267386665367041600108002080000800201600002671626716118002110901010800008000001080020204480019121648000000190191050202165332688080000130180000800102671726740267172671726717
160024267162001111000021012671100744251603048017480000800108000040005011685951026713267202673866653669616001080020800008002016000026721267241180021109010108000080000010800211908006010120800000119019005020316533269858000000080000800102671726717267172671726717
16002426716200111000006600267232000251600108001080000800108000040005011630811126691267162672066613671816001080020800008002016000026716267201180021109010108000080000010800211908005900020800000019441910502031652326966800001313180000800102671726717267182671726718
16002426742200111100003201267010001025160010800108000080010800004000501168681102669126738267166661366961600108002080000800201600002671626716118002110901010800008000001080019194480019100208000000594419105020316643267228000000080000800102673926720267252672126739
160024267402001000000020112670100019251600108001080000800108000040005011685951026691267162671666613669616001080020800008002016000026716267161180021109010108000080000010800192008005900063800416119019105020216633267358000000080000800102671726717267172671726717
160024267162001110000020032670100022516001080010800008001080000400050117176210266922671626716666136696160010800208000080020160000267162673811800211090101080000800000108002019080060001648000000190191050203166442671380000130080000800102671826717267172671726717
160024267162001111000020012670120702516001080010800008001080000400050116308110267132673826716668336718160010800208000080020160000267162671611800211090101080000800001108001919080019100638000000190191050202166332679980000130180000800102672526750267172671726717