Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDNP (signed offset, Q)

Test 1: uops

Code:

  ldnp q0, q1, [x6, #0x10]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
2005722710450002693218181625200020002000296467037277084053460200020002000708708111001100010001200003920000002000613507311611720203910642000728729728728729
200472760045001271621201625200020002000295746977277304053460200020002000727708111001100010000200000200000392035613539731161171920000602000709728723728728
2004731600450012712201216252000200020002859170270872842534602000200020007287221110011000100002000039203900392039610397311611724203910042000728713729728733
2004727700450002707012016252000200020002939169870872742534602000200020007227281110011000100002000039203900020000135397311611720203910642000728728728728729
2004727600450011693201816252000200020002957469772770840534602000200020007277231110011000100002000002039004220006135397311611724203910642000709728728709728
20047277004100027122121216252000200020002957470272272742534602000200020007277221110011000100002000039200000392039613507311611719200010002000728728728728728
200472870045000171200121625200020002000295746977087284053460200020002000728723111001100010000200003920391039200061390731161170520006642000728709728728729
2004727600450102693201216252000200020002962469770872242534602000200020007217221110011000100002000002000003920000000731161172520390002000709709728729728
2004727600000027070120162520002000200028591697727727425346020002000200072772711100110001000020000020390002000613507311611719203961042000728728709728728
20047276100000271721201725200020002000295746977087114253460200020002000730708111001100010000200003920350039200060393973116117242039101042000728729709709723

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldnp q0, q1, [x6, #0x10]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6020512006189910120102010011200380096718109733407010340104100022000030100100002000010425957573353234653360120017120053120053112141311251160100302002000010000602002000010000120053120053115020110099100401001000010000010020003322000200220000222200032102161111982620002400046652000040100120054120054120054120042120054
6020412005390010111104001011200260096706109742257010340102100012000030100100002000010426917573353234656840120017120053120053112141311251160100302002000010000602002000010000120041120053115020110099100401001000010000010020003322000210220000220200032101161111981420002400026602000040100120042120054120054120054120054
6020412005693110110004001001200380096706109742257010340104100022000030100100002000010426917573353234656840120029120053120053112129311249960100302002000010000602002000010000120041120053115020110099100401001000010000010020002322000201220000222210032101161111981420002400040652000040100120042120054120054120054120054
6020412005389910110004001001200380096718109742257010640104100012000030100100002000010426917573295634656840120017120053120053112141311255260100302002000010000602002000010000120053120053115020110099100401001000010000010020002202000301220000222210032101161111982620002400046002000040100120054120042120042120054120054
6020412005390010110004001001200462296726109750257010940106100022000030100100002000010427613573391634659160120017120041120053112141311251160100302002000010000602002000010000120041120053115020110099100401001000010000010020002402000301220000222210032101161111981420002400026602000040100120054120054120042120042120054
6020412005389910100104001011200420096715109744257011740102100012000030100100002000010425873573353234656840120017120041120041112129311251160100302002000010000602002000010000120041120053215020110099100401001000010000010020003322000301220000222200032101161111981420002400046052000040100120054120042120054120054120054
60204120041900101111028000011200380096706109730257010640102100022000030100100002000010426917573295634656840120029120041120053112129311251160100302002000010000604042000010000120041120053115020110099100401001000010000010020003222000301220000222210032101161111982620002400046002000040100120054120054120042120054120042
6020412005389910110102001001200380096718109730257010640104100022000030100100002000010425873573353234656840120017120053120053112129311251160100302002000010000602002000010000120053120053115020110099100401001000010000010020003222000301220000222220032101161111982620002400046652000040100120054120042120054120054120042
6020412005389910011104000011200260096706109742257010340104100012000030100100002000010425873573353234656840120025120049120061112167311253660100302002000010000602002000010000120061120061115020110099100401001000010000010020002202000401420002222220032101161111982620002400040002000040100120058120054120042120042120058
6020412005390010111004000001200380096722109742257010640104100022000030100100002000010425873573372434656840120017120057120057112141311251560100302002000010000602002000010000120098120053115020110099100401001000010000010020003202000200520000222230032101161111981420002400046002000040100120058120042120042120058120059

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0053

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025120053899101110040002120038966471097302570016400141000120000301031000020000104258965733724346540801200290120057120041112168311253360010300202000010000600202000010000120057120053115002110910400101000010000110200043220002002200002222031404164311982820002400026692000040010120042120042120058120058120058
60024120057899101000020001120026966471097462570028400121000120000300101000020000104245045733724346540801200330120057120057112168311252460010300202000010000600202000010000120057120053115002110910400101000010000010200022220003122200002202031403163411983220002400020002000040010120058120042120054120058120042
6002412004189910100004000212004296632109746257001340012100022000030010100002000010425896573372434654080120033012005712005711216831125336001030020200001000060020200001000012004112005311500211091040010100001000001020003222000201220000222213140416341198322000240002101092000040010120054120058120042120042120054
6002412010489910100006101021200269664710974625700164001210002200003001010000200001042450457337243465524012002301200411200571121683112533600103002020000100006002020000100001200571200531150021109104001010000100000102000352200021022000022220314041643119832200024000410692000040010120058120238120058120334120058
60024120041899110000020001120042966471097462570016400121000120000300101000020000104245045733724346540801200330120041120057112152311253760010300202000010000600202000010000120057120053115002110910400101000010000110200022220003015200002222031404163411982820002400040692000040010120042120044120054120054120054
60024120053899100000020000120026966471097302570013400141000220000300101000020000104245045733820346540801200330120061120061112168311252160010300202000010000600202000010000120041120057115002110910400101000010000110200032220004012200002222031403163411983220002400040652000040010120058120058120058120042120042
600241200538991000000800021200429664710974625700164001410002200003001010000200001042450457329563465408012001701200411200411121683112537600103002020000100006002020000100001200531200531150021109104001010000100000102000242200040022000022220314041644119816200024000410692000040010120042120042120042120058120058
600241200578991000000400001200269663110974225700134001410002200003001010000200001042589657337243465060012003301200531200411121523112537600103002020000100006002020000100001200531200531150021109104001010000100000102000222200040122000022023314041644119816200024002910692000040010120054120058120058120132120054
600241200538991000001400001200269664710974625700134001410002200003001010000200001042589657335323465060012003301200571200531121523112521600103002020000100006002020000100001200411200411150021109104001010000100001102000222200022152000002021314041644119832200024000410092000040010120058120042120058120058120054
6002412005389910000002000012004296647109746257001340014100022000030010100002000010424504573372434654080120033012005712004111216831125216001030020200001000060020200001000012005312005311500211091040010100001000001020004322000212232000002021314041634119832200024000410002000040010120054120058120058120058120042

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldnp q0, q1, [x6, #0x10]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0048

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6020512004889910112100120033967171097412570103401021000120000301001000020000104252285733484346553911200110120052120048112136311250660100302002000010000602002000010000120035120048115020110099100401001000010000110020000220000002000020032101161111980820000400026052000040100120139120241120579120555120053
6020412004889900000101120058967171097372570103401001000120000301001000020000104268305733292346565501200240120052120048112140311250660100302002000010000602002000010000120035120048115020110099100401001000010000010020000220000002000020032101161111982520000400026052000040100120053120049120053120053120036
6020412005289910002000120033967171097372570103401021000120000301001000020000104252285732666346553901200280120035120052112140311250660100302002000010000602002000010000120048120048115020110099100401001000010000010020000220000002000000032101161111982520000400026602000040100120053120036120036120036120053
60204120048899100020011200339671710974125701034010210001200003010010000200001042683057334843465539012001101200521200521121403112510601003020020000100006020020000100001200521200351150201100991004010010000100000100200002200000020000220321011611119825200004000201092000040100120053120036120053120036120036
60204120052899000020001200339740310972425701034010210001200003010010000200001042683057326663465539112002801200951200521121233112510601003020020000100006020020000100001200351200481150201100991004010010000100001100200000200000020000220321011611119825200004000210692000040100120036120053120036120053120036
6020412005289900006000120033967171097412570100401001000120000301001000020000104252285733484346553901200280120052120048112140311250660100302002000010000602002000010000120048120048115020110099100401001000010000110020000220000002000002032101161111980820000400020652000040100120053120053120053120036120036
60204120035900000001001200339671710974125701004010210001200003010010000200001042683057334843465539012002801200521200521121233112493601003020020000100006020020000100001200521200481150201100991004010010000100000100200002200000020000200321011611119808200004000261052000040100120049120053120049120053120036
6020412004889900000000120140967131097372570103401001000020000301001000020000104264825733484346553901200240120048120052112123311249360100302002000010000602002000010000120048120048115020110099100401001000010000010020000220000002000020032101161111982520000400026692000040100120053120036120036120053120053
60204120052900000000001200339671710973725701004010210001200003010010000200001042522857334843465655012002801200521200481121233112493601003020020000100006020020000100001200351200351150201100991004010010000100001100200002200000020000220321011611119825200004000261002000040100120053120053120036120036120036
6020412004890010002000120037967171097412570103401021000120000301941000020000104268305733628346677501200110120052120035112123311251060100302002000010000602002000010032120035120056115020110099100401001000010000110020000020000002000022032101161111982520000400006652000040100120049120036120036120036120036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0062

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | df | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025120058899100000040102120052966521097302570016400141000120000301031000020000104245045733772346555301200381200411200581121733112538600103002020000100006002020000100001200411200621150021109104001010000100000102000322200020002200002222000131433116113119837200024000221002000040010120063120063120042120063120063
60024120058899110000040100120026966521097302570013400141000220000300101000020000104263315733964346555301200381200621200621121733112542600103002020000100006002020000100001200411200581150021109104001010000100001102000232200040102200002222100031433116113119837200024000201402000040010120063120043120042120042120063
6002412004189910100001100011200269664810973025700164001410002200003001010000200001042598357339643465669012003412006212006211217331125426001030020200001000060020200001000012006212004111500211091040010100001000011020002202000200122000002020100314331162131198332000240002100132000040010120063120042120043120042120042
60024120062899100000040001120043966311097512570016400141000120000300101000020000104263315733964346506001200381200621200621121523112542600103002020000100006002020000100001200621200581150021109104001010000100000102000220200020012200002222100031433116113119816200024000201402000040010120042120063120063120042120063
600241200628991100000201001200269663110973025700134001210001200003001010000200001042450457329563465553012003812006212005811215231125216001030020200001000060020200661000012006212005811500211091040010100001000001020003222000201122000022020000314331161131198192000240004140132000040010120063120063120063120063120063
600241200628991100000400001200269665210973025700164001410002200003001010000200001042633157339643465060012003812004112005811216931125426001030020200001000060020200001000012006212004111500211091040010100001000011020002222000300122000022221000314331161131198162000240002140132000040010120063120042120059120042120062
600241200418991000000200001200439663110974725700164001410002200003001010000200001042598357339643465669012003812004112005811215231125216001030020200001000060020200001000012004112004111500211091040010100001000001020002302000200022000022220000314331161131198372000240004141492000040010120059120063120042120042120063
6002512006289910000004000012004396656109751257001340014100012000030010100002000010426331573295634655530120017120062120061112177311253860010300202000010000600202000010000120041120041115002110910400101000010000010200033220002000220002222210003143311611311983720002400041410132000040010120059120063120063120042120059
600241200628991110025253188228800212234096510110909572704304023710078201043256110821213541054993758070963508204012093812242912250211328815112538605283002020000100326002020000100001200621200581150021109104001010000100000102005652200610106256020064222200003846312331331223702009040309141492000040010123596123898123407123768123531
6002412368095510001344240100123398977501097304770082400301000720076308371015320508105959255813758351257301200411202511204321122451611266260010300202000010000600202000010000120059120058115002110910400101000010000010200032220003030716772007402020000407431163331198412000240002010132000040010120059120059120059120059120066

Test 4: throughput

Count: 8

Code:

  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  ldnp q0, q1, [x6, #0x10]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6674

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602055339040001000041010253379201816251601001001600001001600005002333962053349533745338933317333347160100200160000200160000533895338911802011009910010080000800000100160000390160035003516003961364305110117105365016003510641600001005337553395533965339553395
16020453394400000000780102533612121222251601001001600001001600005002331836053369534025339833324333363160280200160000200160000533855400911802011009910010080000800000100160000399416003503421600356135430511011611537061600396641600001005337553375533965339253403
16020453394400000000570001533740181802516010010016000010016000050023667840533495339453394332983334831601002001600002001600005338953405118020110099100100800008000011001600000016030100481600390139005110116115338616003910001600001005339553375533905339553395
1602045339740000000045010053359212121625160100100160000100160000500233396205336953394533893331233335216010020016000020016000053394533891180201100991001008000080000010016000039016000000391600390136390511011611533861600350601600001005339053390533755339053390
1602045339440000100045000153379201216251601001001600001001600005002333262053364533745337433296333332160100200160000200160000536815338911802011009910010080000800000100160000430160035003916003561354305110116115339016003910621600001005337553395533755339053397
1602045339439900000000002533792120162516010010016000010016000050023332621533495338953394333173333521601002001600002001600005337453389118020110099100100800008000001001600000016000000351600396139431511011611533861600006601600001005339553395533755339053375
1602045339440000000045010253379012016251601001001600001001600005002333962053349533945337433317333353160100200160000200160000533895337411802011009910010080000800000100160000390160039000160000010005110116115338616003910021600001005337553390533955339553395
160204533944000000000000253359218181625160100100160000100160000500233396215336453389533893350333334716010020016000020016000053394533891180201100991001008000080000010016000039016000000391600406103905110116115338616003910641600001005337553390533955339553395
1602045339440000000045010153359212120251601001001600001001600005002333962053369533945338933317333332160100200160000200160000533945338911802011009910010080000800000100160000390160000003916003960354305110116115369416003910641600001005339553395533755337553395
16020453389400000000450001533792181816251601001001600001001600005002332897053349533945337433312333352160100200160000200160000533895337411802011009910010080000800000100160000390160039000160000603543051101161153391160039101041600001005339553375533755339553398

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6675

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025533984001101100650102533842001625160010101600001016000050233108615335653399533993334433337916001020160000201600005338153381118002110910108000080000010160020210160019101591600380157421915020751617853395160057092160000105340053400533825338253382
1600245339940010010006500035336601801625160010101600001016000050233837105335653388533993332633338116001020160000201600005341653399118002110910108000080000110160019210160057012711600380157421915020661617853396160057992160000105339953382534005339953400
16002453398400111000021000253383018181625160010101600001016000050234275015337353398533983332633337916001020160000201600005354853407118002110910108000080000010160020204216001900021160038005742190502061616171753395160057090160000105338253399534005340053382
1600245339940011100006500035336621801625160010101600001016000050235536705337453399533813333063338216001020160000201600005350553399118002110910108000080000010160019200160019000591600380057019150206141661753396160057990160000105340053399534005338253400
160024533814001011003210003533833181816251600101016000010160000502328974053373533985339833326333361160010201600002016000053398533961180021109101080000800000101600191942160057102591600006056421905020671617753396160058992160000105340053382534005338253400
160024533984001111000650103533840180162516001010160000101600005023553670533745338153399333263333781600102016000020160000534665338911800211091010800008000001016002021421600571015916003861574219050207161681753378160057090160000105340053400534005340053399
160024533814001010000650100533662000251600101016000010160000502334497053373533985339833327333379160010201600002016000053400534081180021109101080000800001101600202042160056100601600386119019050207161617753395160057992160000105339953400533865340453399
160024533994001111000210002533660181815251600101016000010160000502349817053373533815339933326333378160010201600002016000053482534081180021109101080000800000101600202142160019100591600376119019150207161681753378160019092160000105338253382534095342253402
160024533994001110000220101533832181802516001010160000101600005023329970533745338153399333263333611600102016000020160000535125340011800211091010800008000001016002019016005710062160038011901915020771681753378160019000160000105339953400533825338253382
16002453381400111000065000053384201815251600101016000010160000502329020053356533815338133344333361160010201600002016000053530534101180021109101080000800000101600191901600572002116000060574119150206516171753396160057990160000105339953400533995338353382