Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDR (register, uxtw, S)

Test 1: uops

Code:

  ldr s0, [x6, w7, uxtw]
  mov x7, #4
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
10053993101010166123792121812251000100010001498913643943942173252100010002000394389111001100010000100003910350035103561353900751161139110741000392392392395392
1004394301000004500379212181625100010001000149890369389394212325210001000200039939911100110001000010201942105811591038615742190731161139110641000395398395395395
10043943000000045023792121816251000100010001501813643893942123252100010002000394389111001100010000100003910350035103961353900751161139110641000395395395395395
1004394200000004502379312121625100010001000148380364394394217325210001000200039939811100110001000110212042105801591038615742190751161139110641000395395395395390
10043942000000045023842181815251000100010001543413743993982223257100010002000394389111001100010000100003910390039103961354300751161139110671000395395390392395
1004394301000004112379318181625100010001000150541364389394217325210001000200039939811100110001000010191942106201711037615742190751161138610641000395395390395395
1004394301000004503383218181625100010001000153621373399398221325710001000200039438911100110001000010000391035003510396135430075116113969921000400399400399400
10043993100100065023842181817251000100010001533403743993992223256100010002000394389111001100010000100003910380039103561354300731161138610741000390395395395390
1004394200000004502379212121625100010001000149890369394394216324710001000200039939911100110001000010191942105801591038615742190751161138810641000395390390390395
1004394300000004502379212121125100010001000149890369394394217324710001000200039138911100110001000010000391039003910396135390075116113969921000400400400400401

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6, w7, uxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502051200478991117101200831195091095272560100401021000110000301001000010000107920057361886136166012003401200501200471131453113658501003020010000100006020020000100001200551200471150201100991004010010000100000100100001100000001000010103210310712119660400009681000040100120036120051120036120051120036
502041200508990001001200321194931094492560103401021000110000301001000010000107899957360446133818012002301200471200471131453113636501003020010000100006020020000100001200501200471150201100991004010010000100000100100001100000001000000103210210112119657400000681000040100120051120051120036120051120051
5020412005089900010012003511950910946325601034010210001100003010010000100001078999573618861338180120043012004712003511315031136585010030200100001000060200200001000012005012004711502011009910040100100001000001001000001000000151000010103210213512119657400020651000040100120048120036120051120051120036
502041200508990000001200201194931094632560103401021000110000301001000010000107886257361886133818012001201200471200351131833113658501003020010000100006020020000100001200501200471150201100991004010010000100000100100001100000001000010003210210112119660400026601000040100120051120036120051120051120051
502041200508990001101200321195091094632560103401021000110000301001000010000107886257354556133818012029801200501200351131413113658501003020010000100006020020000100001200501200351150201100991004010010000100000100100001100000001000010103210113512119657400049081000040100120051120036120036120036120051
5020412005089900012001200201194931094492560103401021000010000301001006210000107920957360446134461112003601200501200351131413113667501003020010000100006020020000100001200471200351150201100991004010010000100000100100001100000001000000103210210712119657400020681000040100120051120051120036120036120051
502041200508990001101200351194931094492560100401021000110000301001000010000107886257361886133818012007101200501200471131453113658501003020010000100006020020000100001200501200471150201100991004010010000100000100100011100000001000010103210210712119646400026601000040100120036120036120036120051120036
502041200508990000101200351195191094492560100401021000010000301001000010000107920057354556136166012005201200501200351131453113636501003020010000100006020020000100001200501200471150201100991004010010000100001100100001100001001000010103210210712119657400026651000040100120051120051120048120051120036
5020412005090000060001200321195091094682560103401021000110000301001000010000107899957354556133818012004601200351200501131413113667501003020010000100006020020000100001200501200471150201100991004010010000100000100100001100000001000010103210310712119657400029681000040100120051120051120051120051120036
502041200508990001101200351195091094492560100401021000110000301001000010000107886257361886133818012033101200471200351131413113636501003020010000100006020020000100001200501200471150201100991004010010000100000100100000100000001000000003210210712119660400000081000040100120048120141120040120039120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 07 | 09 | 0e | 0f | 19 | 1e | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | dtlb miss (c1) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
5002512005089901210110120032119505109463256001340012100011000030010100001000010795175736844613396601200110120047120050113165311369350010300201000010000600202000010000120047120047115002110910400101000010000110100000110000001000011003140110711119650400009651000040010120098120088120077120051120049
5002412003589910000110120033119505109461256001340012100041000030010100001000010795235736044613349901200230120048120047113165311368650010300201000010000600202012810000120049120048115002110910400101000010000010100000010000001000011003140110711119662400026651000040010120048120048120048120051120049
5002412005489900000110120032119505109541256001340010100011000030010100001000010795235736044613349901200230120057120047113165311368650010300201000010000600202000010000120035120047115002110910400101000010000010100000110000031000011003142110711119664400026691000040010120048120036120049120049120048
5002412004789900000110120020119492109449256001340012100011000030010100001000010795235735455613349901200260120413120131113171311368950010300201000010000600202000010000120050120290215002110910400101000010000010100000010000001000011003140110711119662400006651000040010120048120048120048120048120048
5002412003589900000110120035119508109461256001040012100011000030010100001000010795485736188613349901200260120051120049113168311368750012300201000010000600202000010000120048120047115002110910400101000010000010100000110000001000011003140110711119662400000951000040010120051120036120036120051120051
5002412005089900000100120032119505109461256001340012100011000030010100001000010795235736044613349901200230120093120048113168311368950010300201000010000600202000010000120035120047115002110910400101000010000010100000110000001000011003140110711119665400026651000040010120036120048120036120048120036
5002412004790010000600120032119495109461256001240014100001000030012100001000010795235736087613426201200240120069120047113165311368650275300201000010000600202000010000120048120047115002110910400101000010000010100000110000001000001003140110711119662400026651000040010120048120048120584120156120108
5002412048889900001110120035119508109463256001340012100011000030010100001000010795485736044613349901200230120056120047113153311368650010300201000010000600202000010000120047120047115002110910400101000010000010100000110000001000000003140110711119665400026651000040010120048120051120036120051120048
5002412004789900000100120020119505109463256001040012100011000030010100001000010795235736044613349901200260120417120071113165311369050010300201000010000600202000010000120047120047215002110910400101000010000010100000110000001000011003140110711119662400026051000040010120048120048120049120048120048
50024120047899001001001200321195051094612560013400101000010000300101000010000107951757360446133499012002501200661200491131653113686500103002010000100006002020000100001200471200471150021109104001010000100000101000001100000010000010031401107141196674000210951000040010120048120048120036120048120048

Test 3: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldr s0, [x6, w7, uxtw]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x7, x7, x8
  mov x7, #4
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502051200478990000000600000120035119563109473256010340102100001000030100100001000010788625736188613381801200260120035120047113141031136675038930200100001000060200200001000012004712004711502011009910040100100001000001001000011000000100001103211210133119660400029651000040100120042120106120087120060120051
502041200478990000000101000120035119714109460256010340102100011000030100100001000010789995736188613381811200260120035120035113141031136675010030200100001000060200200001000012005012003511502011009910040100100001000001001000011000000100001103211310743119662400006681000040100120048120093120074120053120051
5020412003589900000000000001200351195311094492560103401021000110000301001000010000107899957360446134461112001101200501200501131430271136405010030200100001000060200200001000012005012004721502021009910040100100001000001001000011000000100001103211310723119646400026681000040100120051120096120075120051120051
502041200568990000000001000120035119510109449256010340102100011000030100100001000010788625736188613381811200260120050120050113145031136675010030200100001000060200200001000012003512003511502011009910040100100001000001001000011000000100000003211310723119657400020981000040100120108120088120049120036120048
502041200478990000000101000120020119534109461256011240102100011000030100100001000010788625736044613446101200260120047120050113141031136365010030200100001000060200200001000012005012003521502011009910040100100001000001001000011000000100001003211210732119657400029081000040100120048120088120064120052120051
502041200508990000000000000120020119543109463256010340100100001000030100100001000010789995736044613446101200260120035120047113145031136585010030200100001000060592200001000012005012003511502011009910040100100001000001001000011000000100000003211310732119657400009601000040100120048120093120082120051120051
502041200508990000000101000120020119493109463256010340102100011000030100100001000010789995736188613616611200260120050120035113145031136365010030200100001000060200200001000012003512004711502011009910040100100001000001001000011000003100000003211310133119660400029081000040100120051120107120089120052120051
502041201008990000000000000120032119530109463256010340100100011000030100100001000010788625736188613381811200260120035120050113141031137535010030200100001000060200200001000012003512003511502011009910040100100001000001001000011000000100001103211310733119646400020001000040100120051120118120077120049120036
502041200358990000000100000120020119528109449256010040102100001000030100100001000010789995735455613381801200260120050120050113145031136365010030200100001000060200200001000012003512004711502011009910040100100001000001001000001000000100001103211310132120242400029901000040100122694122394122467122570122474
5020412287891711111283134712288000012309312107811033474560487403381005510036302511000010000107886257360446152155112002631201281204881131440171140355010030200102661000060200200001010712004712003531502011009910040100100001000011001000001000016100000123211210724119879400029681000040100120110120052120056120048120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50025120035900010100101200351195051094632560013400181000310000300101000010000107954857361886133499112013001200501200501131683113686500103002010000100006002020000100001200351200471150021109104001010000100001101000001000000100001003140610754119662400026081000040010120051120089120058120052120036
50024120047899000001001200351194921094632560013400121000110000300101000010000107954857354556133499112002631200471200471131653113686500103002010000100006002020000100001200351200471150021109104001010000100001101000011000003100001103140410745119650400009981000040010120051120051120036120051120048
50024120047899000001001200201195081094632560010400121000110000300101000010000107954857354556133499112001101200501200501131533113674502773002010000100006002020000100001200501200351150021109104001010000100000101000001000043100001003140410745119665400029681000040010120048120036120036120051120051
50024120047899001001001200321194921094612560010400121000010000300101000010000107954857361886133499112002601200351200501131533113689500103002010000100006002020000100001200351200471150021109104001010000100000101000011000000100001003140510754119662400029081000040010120051120036120036120052120054
50024120035899000001001200361195081094612560013400121000110000300101000010000107951757361886133499112002301200501200351131533113689500103002010000100006002020000100001200501200471150021109104001010000100000101000011000000100000003140510744119662400020681000040010120051120051120036120051120050
50024120050899000001001200201195081094632560062400121000110000300101000010000107951757370246132757112002301200501200351131683113674500103002010000100006002020000100001200351200471150021109104001010000100000101000011000000100000103140510765119665400029081000040010120036120051120051120051120051
50024120035899000103101200351194931094652560013400121000110001300101000010000107954857360446132757112001101200351200501131533113674500103002010000100006002020000100001200531200351150021109104001010000100000101000001000023100000103140610744119665400009901000040010120051120036120051120051120051
50024120047900000001101200321194921094634060013400121000110000300101000010000107951757361886133499112002301200351200471131683113689500103018010000100696002020000100001200501200481150021109104001010000100000101000011000000100001003140410745119665400020081000040010120036120048120036120051120036
50024120051899100001101200351195171094492560013400121000110000300101000010000107954857361886133499112001101200501200501131653113689500103002010000100006002020000100001200501200471150021109104001010000100000101000011000050100001103140610755119650400029601000040010120048120036120051120048120051
50024120047899000011001200351195081094632560013400121000110000300101000010000107954857354556133551112002601200501200501131683113689500103002010000100006002020000100001200501200471150021109104001010000100000101000001000000100001103140410745119665400000901000040010120036120051120051120051120048

Test 4: throughput

Count: 8

Code:

  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  ldr s0, [x6, w7, uxtw]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 19 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
802052672820011004400026692211221258010010080000100800145001167198126682267312670716635616683801152008002420016004826731267271180201100991001008000080000110080000430800380388000061384401115118216122672414100800001002674126735267322678026733
8020426731200001044001267162112192580100100800001008001350011671980267062673126731166596166838011520080024200160048267312672711802011009910010080000800001100800000080039008003801394401115118216122672810107800001002671626729267332673226732
802042673120000004400126693200025801001128000010080013500116692312670626731267311665961668380114200800242001600482673126707118020110099100100800008000011008000043080000242800386104401115118216212672814144800001002673226732267082673226708
8020426731201000000012671220122725801001008000010080015500116838012670626707267311665961668480115200800242001608302673126727118020110099100100800008000011008000043080038038800386139440111511811612267281007800001002670826708267302673226732
802042670720000004400126712001219258010010080000100800155001168380126682267312673116635616683801152008002420016004826707267271180201100991001008000080000010080000008000000800006038440111511821612267040140800001002674326730267462670826732
80204267072000000440002671601211925801001008000010080015500116838012670626731267321665961665980113200800242001600482673126707118020110099100100800008000001008000000800000398003861384401115118216122672410147800001002674226731267372683326732
80204267312000010440012671621119258010010080000100800155001167327126682267312672716655616683801152008002420016004826731267271180201100991001008000080000110080000008000000800000039440111511811622267300107800001002674126728267412673226732
802042673120000004500026712212002580100100800001008001450011673030267022672726707166556166618011320080024200160048267312672711802011009910010080000800000100800004308003800800006104301115118216212672814107800001002689226815267362687926732
80204267072000100440012671621202125801001008000010080016500116838002668226727267071665561668380115202806042001600482673526727118020110099100100800008000001008000043080039338800006039001115118216212672914147800001002672826733267362672226732
8020426727200011000012671201212162580100100800001008001550011683800267062670726727166356166838011520080024200160048267312672711802011009910010080000800000100800004308000005480000610001115118216222672814147800001002674226732267212670826732

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002526737200000105410126716070222580010108000010800005011677910126690267362673616681316716800102080000201600002673626715118002110901010800008000001080000043800380000800386139440050204162426724140780000102673626750267092673226732
8002426731200000104410126716277192580010108000010800005011694510126689267362673716681316716800102080000201600002673626737118002110901010800008000001080000043800380003980038613943005020616342670500080000102673326734267342673226709
8002426731199000010100267162077258001010800001080000501173975012668926737267371668131672180010208000020160386267372673611800211090101080000800001108013004480039000388003860384400502041642267281010480000102674226739267382673826738
80024267362001110067102267223012192580010108000010800005011668860126706267312673116676316711800102080000201600002670826708118002110901010800008000001080000043800000000800380139430050204166426724010080000102673726732267322670926732
8002426708200000010000267160012162580010108000010800005011675010026683267272670816672316711800102080000201600002670826727118002110901010800008000001080020194380019101218004001594519050204164226734130580000102673826736267212673226732
8002426731200000004400026716200182580010108000010800005011672980126690267142673616660316717800102080000201600002673626736118002110901010800008000001080000043800380003880038010440050203165626724140780000102691426751267382671126732
80024267312000000045001267122100258001010800001080000501167124002668526731267361667631671180010208000020160000267482672811800211090101080000800001108000000800000004180000600440050204164226725010780000102673426729267322670926732
8002426731200000000101267162001022580010108000010800005011671240026702267282673116652316708800102080000201600002673126708118002110901010800008000001080000008003800038800386139440050202164426724010780000102673526748267322670926732
80024267312000000044001267162771925800101080000108000050117397501267112673626736166813167168001020800002016000026736267151180021109110108000080000010800000080038000388000001394300502021642267281414080000102674626737267382673726716
800242673620011010661002672101211925800101080000108000050116750101267062673126731166763166888001020800002016000026731267081180021109010108000080000010800202043800190107980040015843190502041644267351313080000102674826709267092670926728