Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm: Overview | Base Instructions | SIMD and FP Instructions

LDP (Q)

Test 1: uops

Code:

  ldp q0, q1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
2005727601010006714721277192520002000200030073171173773643434692000200020007367361110011000100002020194320581016020406158421807551666733131352000737737737737737
2004736511010116714721277192520002000200030134171173673643434692000200020007367361110011000100002020204320580016120406158431917561666733131352000737737737737737
2004736611110006714699277192520002000200030066171173673643334692000200020007367361110011000100002020204320591026120396159431917561666733131352000737737737737737
2004736611111016613721377182520002000200030095071173673643434702000200020007377361110011000100002021194320601026020416159431907551655733131352000737737737737737
2004736511110006913721177192520002000200030403079573673643434692000200020007367361110011000100002019194320580026020406159431917561666733131352000742737737737737
2004736611111107014721277192520002000200030170171173673643434692000200020007367361110011000100002020204320591016120406159431917561665733131352000737737737737737
2004736511110006603721377202520002000200030169071175773643434692000200020007367361110011000100012019194320590006420396159431927551655733131352000737737737737737
2004736511010006614721377182520002000200030095171173673643334692000200020007367361110011000100002020204320591016420396159431917551655733131352000737737737737738
2004736511110006703721279192520002000200030095171173673643434692000200020007367361110011000100002020204320581026120406158431917551655733131352000737737737737737
2004736511110006714721277192520002000200030170171173673743434692000200020007367361110011000100002020204320600616120396159431917551655733131352000737737737737737

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 05 | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6020512005789900002010012003696716109740257010040102100012000030100100002000010426743573343634656261120027012005512005111212331125096010030200200001000060966200621000012005112003511502011009910040100100001000001002000022000000200002232372161111982440002101092000040100120038120050120052120052120036
602041200518991000201001200209671610974025701034010210000200003010010000200001042674357337243467648012002701200511200511121393112509601003020020000100006020020000100001200511200511150201100991004010010000100000100200002200000020000223210116111198244000201092000040100120036120052120052120052120052
602041200518991100200001200369671610974025701034010210001200003010010000200001042674357337243467452012002701200511200511121393112509601003020020000100006020020000100001200511200511150201100991004010010000100001100200002200000020000023210116011198244000210092000040100120052120052120052120052120036
602041200518991000000001200369671610974025701034010210001200003010010000200001042674357337243467052012002731200541200511121393112509601003020020000100006020020000100001200351200511150201100991004010010000100000100200002200000020000223210116111198244000210092000040100120052120052120036120052120052
60204120035899100020100120036967161097482570103401021000120000301001000020000104267435735548346628001200270120051120051112139311250960100302002000010000602002000010000120051120051115020110099100401001000010000010020000220000002000022321011611119824400020092000040100120052120036120052120052120052
6020412005189910000000012003897403109740257010340102100012000030100100002000010426743573295634671950120027012005112003511213931125096010030200200001000060200200001000012003512005111502011009910040100100001000001002000022000000200002032101161111982440002101092000040100120052120052120053120052120052
6020412003589910002010012003696716109740257010340102100012000030100100002000010426743573372434658000120027012005112005111213931125096010030200200001000060200200001000012005112005111502011009910040100100001000001002000022000000200002032101161111980840002101092000040100120052120052120052120036120036
6020412005189910002010012003696716109740257010040102100012000030100100002000010426743573372434676550120027012005112005111213931125096010030200200001000060200200001000012005112005111502021009910040100100001000001002000022000000200000232101161111982440000101092000040100120036120052120036120052120052
6020412005189910002010012002096716109740257010340102100032000030100100002000010425228573372434673700120027012005112005111213931125096010030200200001000060200200001000012005112005111502011009910040100100001000001002000022000003200002232101161111982440002101092000040100120052120052120052120036120052
6020412003589910002000012003696716109744257010340102100012000030100100002000010426743573372434667300120030012003512005111213931125096027730200200001000060200200001000012009112005111502011009910040100100001000001002000022000000200002232101161111982440002101092000040100120052120036120052120052120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025120051899000000020000012003693975109740257001340012100012000030010100002000010425374573343634653500120090120051120051112162311251560010300202000010000600202000010000120051120051115002110910400101000010000010200002200000002000022031405161111982640000101092000040010120052120036120052120036120052
60024120051899000000020100012003696641109740257001340010100012000030010100002000010425374573343634653500120107120051120074112162311253760190300202000010000600202000010000120054120053115002110910400101000010000110200002200000002000022031401161111982640002101092000040010120052120052120052120052120052
60024120051899000000020100012003696641109740257001340012100002000030010100002000010425374573343634653500120027120058120051112162311253160010300202000010000600202000010000120051120035115002110910400101000010000010200002200000002000022131401161111982640002101092000040010120052120052120052120052120052
600241200558990000000201000120036966411097402570013400121000020000300101000020000104253745733436346535001200281200511200511121622411254860010300202000010000600202000010000120051120035115002110910400101000010000010200002200000002000022031401161111981040002101002000040010120052120052120052120052120052
60024120051899000000020000012002096641109724257001040012100012000030010100002000010426827573343634653500120011120051120051112162311251560010300202000010000604042006210000120051120051115002110910400101000010000010200002200000002000022031401161111982640000101092000040010120052120052120052120052120036
60024120051899000000020100012002093975109724257001040012100012000030010100002000010425374573416034653500120027120051120051112146311253160010300202000010000600202000010000120051120051115002110910400101000010000010200002200000002000022031401161111982640000101092000040010120052120052120052120036120052
6002412005189900000002000001200369664110974025700134001210000200023001010000200001042537457326663465350012002712003512005111216231125316001030020200001000060020200001000012005512005111500211091040010100001000001020000220004100200002203140116111198264000210092000040010120052120052120036120036120036
6002412005189900000002000001200389664110974025700134001210001200003001010000200001042537457334363465350012001112005112005111216231125316001030020200001000060020200001000012005112003511500211091040010100001000001020000220000000200002203140116111198264000201092000040010120052120052120052120052120052
60024120051899000000020100012003696641109740257001340010100012000030010100002000010425374573343634653500120027120051120051112162311253160010300202000010000600202000010000120051120051115002110910400101000010000010200002200000002000022031401161111982640002101092000040010120057120052120052120052120052
60024120051899000000020100012003696641109740257001040012100012000030010100002000010425374573343634653500120027120051120051112162311253160010301122000010000600202000010000120051120051215002110910400101000010000010200002200000002000022031401161111982640002101092000040010120052120052120056120052120052

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldp q0, q1, [x6]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | aa | ac | af | b5 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205120053899000100020100112003696712109740257010340102100012000030100100002000010426743573343634651561120023120047120047112135311250560100302002000010000602002000010000120047120047115020110099100401001000010000110020000022000000002000020200321021621119820400020692000040100120052120052120052120048120052
602041200518990001000200001120036967161097362570100401021000120000301001000020000104267435733436346551011200111200471200471121413112505601003020020000100006020020000100001200471200471150201100991004010010000100001100200000220000000020000002003210216111198244000210692000040100120048120052120048120048120052
602041200519000000000200001120036967161097402570103401021000120000301001000020000104268275733436346562611200271200511200471121403112509601003020020000100006020020000100001200511200351150201100991004010010000100001100200000020000000020000002003210216121198254000210602000040100120053120052120036120052120052
602041200558990000000201001120036967161097402570103401021000120000301001000020000104267435733244346551011200271200351200471121393112509601003020020000100006020020000100001200551200471150201100991004010010000100000100200000220000000020000202003210216111198244000210092000040100120052120052120036120048120052
6020412005189900000001500000120020967201097402570103401021000120000301001000020000104267435733436346562611200271200471200471121393112505601003020020000100006020020000100001200471200471150201100991004010010000100001100200000220000000020000202003210216221198244000210692000040100120052120052120052120052120052
602041200518990000000200000120036967161097402570103401021000120000301001000020000104263955733244346515611200271200351200511121393112493601003020020000100006020020000100001200351200471150201100991004010010000100001100200000220000000320000002003210216211198244000210052000040100120052120052120052120052120052
602041200518990000000140010112003696716109736257010340102100012000030100100002000010425228573446934659931120011120035120035112139311251360100302002000010000602002000010000120051120047115020110099100401001000010000010020000022000003062000000000321021622119820400026052000040100120048120052120036120052120048
602041200518990000000200100120036967161097402570103401021000120000301001000020000104252285733436346562611200111200471200471121393112509601003020020000100006020020000100001200511200471150201100991004010010000100000100200000020000000020000202003210216221198244000201092000040100120048120036120052120052120048
60204120072899000000020010112003698085110840547704924033610077201003249210794213041054503058055203506495112048312233612242411317816611397664607327572163610852653322163610823122379122397281502011009910040100100001000001002006602200601105981520000202003677222042121540402296692000040100122624122337122473122636122333
6020412289391911111413460892464010012218597726111564785706814035910115201443278511256218541058984958428203517767112088412004712004711315040113915647743211121444107596437221698105711224561219922515020110099100401001000010000110020000022000600306914320044202113237316121198204000210692000040100120052120052120048120052120048

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 07 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002512005489900101410012002096646109741110700134001210003200003001010000200001042562657334843465350112002701200511200511121623112531600103002020000100006002020000100001200511200511150021109104001010000100000102000002200000020000220003140516231198164000201092000040010120052120052120052120052120052
60024120051899000020001200369664110974025700134001210001200003001010000200001042537457346623465350012001101200511200511121623112531600103002020000100006002020000100001200351200511150021109104001010000100000102000002200045020000220003140216231198264000201002000040010120036120052120052120036120052
6002412005189900000000120036939751097412570013400121000120004300101000020000104253745733436346535011200270120051120051112146311253160010300202000010000600202000010000120035120051115002110910400101000010000010200000220000470200002200031402163211982640002101092000040010120052120093120058120052120052
6002412005189900001100012002096641109740357001340012100012000030010100002000010425374573343634653500120027012005112005111216231125316001030020200001000060020200001000012005112009611500211091040010100001000001020000022000000200002200031402163311982740000101092000040010120052120052120052120052120052
600241200358990000210012003696641109724257001340012100012000030010100002000010425374573343634653501120011012003512005111216231125316001030020200001000060020200001000012005112005111500211091040010100001000001020000022000240200002200031403163311982640000101092000040010120052120057120052120052120057
60024120035899000021001200369664110974025700134001210001200003001010000200001042537457334363465350012001101200351200511121623112515600103002020000100006002020000100001200531200511150021109104001010000100000102000000200000020000000003140316321198264000201092000040010120052120052120052120052120052
60024120035899000021001200369664110974025700134001210001200003001010000200501042537457334363465350012002701200351200511121623112515600103002020000100006002020000100001200351200511150021109104001010000100000102000002200001020000020003140416331198104000010092000040010120052120052120052120053120052
600241200359000001210012003696641109724257001340012100012000030010100002000010425374573343634653500120027012005112005111214631125316001030020200001000060020200001000012005112005111500211091040010100001000001020000002000010200002200031403164411982640002101092000040010120037120052120036120052120052
60024120051929000020001200369664110974025700134001210001200003001010000200001042537457326663465490112001101200351200511121623112576600103002020000100006002020000100001200511200571150021109104001010000100000102000002200000020004220003140316331198264000201002000040010120052120052120128120037120052
600241200358990000210012003696686109740257001340012100012000030010100002000010425374573266634648800120011012005112005111216231125316001030020200001000060020200001000012005112005111500211091040010100001000001020000022000000200002200031403162311982640002101092000040010120052120052120097120102120052

Test 4: throughput

Count: 8

Code:

  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  ldp q0, q1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6674

retire (01) | cycle (02) | 03 | 0e | 0f | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020553393400110441001533742181812251601001001600001001600005002325261153364533895339433312333347160100200160000200160000533945339411802011009910010080000800001100160000391600350104116003561353905110116115338606621600001005339053390533995367653414
160204533894000004110015337421818169251601001001600001001600005002325261153364533895338933312333347160100200160000200160000533895338911802011009910010080000800000100160000391600350204116003661353905110116115338606621600001005339053390533905339053497
16020453389400010411001533742181812251601001001600001001600005002325261153364533895338933312333361160100200160000200160000533945338911802011009910010080000800000100160000391600350103516003561353905110116115338906621600001005339053390533905367853390
16020453389400000451001533742181880251601001001600001001600005002325261153364533895338933312333347160100200160000200160000533895338911802011009910010080000800000100160000391600350303516003561353905110116115338606621600001005339053390533905339053411
1602045338940000041100153374218181162516010010016000010016000050023254051533645338953389333153333471601002001600002001600005339353389118020110099100100800008000001001600003916003502903816003561353905110116115339106621600001005339453394533945339453404
16020453393400000530001533742181814251601001001600001001600005002325261153371533975339733315333360160100200160000200160000533935338911802011009910010080000800000100160000391600350203816003561353905110116115338606621600001005339553393533905339353390
160204533893990005310015338521818152251601001001600001001600005002325415153415533895338933312333351160100200160216200160000533895348111802011009910010080000800000100160000391605580303516003561353905126116115339006721600001005339053390533905339053394
16020453393414000411001533742181818725160100100160000100160000500234009715336453389533893331233334716010020016000020016000053389533891180201100991001008000080000010016000039160035000323916003561353905110116115338606621600001005339553390533905339053390
160204533893990004100015337421818168251601001001600001001600005002325261153364533895338933312333347160100200160000200160000533895338911802011009910010080000800000100160000391600350003516003561353905110116115338606621600001005339053390533905339053497
16020453389400000411001533742181846251601001001600001001600005002343394153364533895338933312333350160100200160000200160000533895338911802011009910010080000800000100160000391600350003516003561353905110116115338606621600001005339053390533905339053404

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6675

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 0e | 0f | 19 | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a3 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002553395400010000441153383211219251600101016000010160000502339107153373533745339833343333354160010201600002016000053398533741180021109101080000800001016000004316003811381600386139430502210160655339514107160000105339953375533755339553654
16002453387400000000011533832112192516001010160000101600005023325490533735339853398333433333781600102016000020160000533745339411800211091010800008000010160000043160000078916003861394405020316023533951404160000105339953399534235340053623
16002453407399001000000537742110251600101016000010160000502330707053369533745339433339333378160010201600002016000053398533741180021109101080000800001016000004316003807544160038603944050203160335339514147160000105339953395533955337553693
16002453412400000000440053383011192516001010160000101600005023325490533495337453374333431533422160010201600002016000053398533941180021109101080000800001016000004316003815847160038603943050203160335337114107160000105339953375533755337553626
160024534034000000004401533830111925160010101600001016000050233977505334953398533983334333335416001020160000201600005339853374118002110910108000080000101600000431600380754716000060390050203160335339514147160000105339953399533955337553650
1600245341240000000044005338301116251600101016000010160000502339775053349533985339833319333378160010201600002016000053374533941180021109101080000800001016000000160000070411600396000050203160335337114147160000105339953375533995339953393
160024533943990000004400533832012025160010101600001016000050233254905337353398533983331933335416001020160000201600005339853374118002110910108000080000101600000441600380234216003801384405020316033533950147160000105337553399533995337553399
16002453473400000000440153383201162516001010160000101600005023352281533765339853400333393333571600102016000020160000536285341011800211091010800008000010160000001600380734816003801390050203160335339114107160000105339953375533955339953642
160024534064000000004400533830112162516001010160000101600005023307071533745339853398333433333541600102016000020160000533985339411800211091010800008000010160000001600380754116003800394305020316033533950147160000105339953375533955337553670
1600245340540000000044015338321119251600101016000010160000502339775153349533985339433319333354160010201600002016000053398533941180021109101080000800001016000004316003908248160039610005020316033533951400160000105337553375533995339953660