Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDNP (Q)

Test 1: uops

Code:

  ldnp q0, q1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire (01)cycle (02)031e223a3f4346494f51inst issue (52)~issue ld/st (55)~dispatch ld/st (58)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op ld/st (7d)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst fp/simd load (98)inst ldst (9b)9da0a3a6a8acafb5b6bbdcache load miss (bf)cfd5d6ddinst fetch restart (de)e0e6eaebecld/st retires (ed)f5f6f7f8fd
20057276450171321212162520002000200029597170572772742534602000200020007287271110011000100012000432039039203961394373516667242039101042000728734728728728
20047275450171221212162520002000200029624170273273242934642000200020007277271110011000100002000432040038203861384473616667292038101072000733732732733732
200473254401717211162520002000200029873170273172742934612000200020007317271110011000100012000432039038203861394473616667292038141472000733734733733732
200472754401716211192520002000200029850170773173242934642000200020007327271110011000100012000432039039200061394473616667252038101072000729732733733732
200473254501717211192520002000200029624170773173242934612000200020007327271110011000100002000432039139203861394473616557242038141472000733732732733732
200472754601717211162520002000200033122170673173142534652000200020007277271110011000100002000432039038203961394473616667282038141442000732728733732732
2004732544017162112192520002000200029873170673173144934642000200020007317271110011000100002000432039038203961384473616667282038141072000732732733732732
2004790562017162121192520002000200029873170573173143034652000200020007317271110011000100002000432039038203861384473616667282038141472000732732728729732
2004732544017212121162520002000200029763170673273142934652000200020007287281110011000100002000432039039203961394373616657292038141042000733739732733732
200473164401716211192520002000200029850170773273242934642000200020007327271110011000100002000432039039203801394473616667282038141472000729728728729733

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldnp q0, q1, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0055

retire (01)cycle (02)030508090b0e0f18191e1f22233a3f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a6a8a9acafb5bbdcache load miss (bf)dtlb miss (c1)c2cfd5d6ddinst fetch restart (de)e0e6? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
6020612007396800001100401001200429671610973625701004010210001200003010010000200001042726557335323465684112003301200571200411121453112515601003020020000100006020020000100001200531200411150201100991004010010000100000100200023220004012200002202032101161011982420000400026692000040100120048120052120036120052120052
60204120047899000010002000012002096716109724257010340102100012000030100100002000010425228573343634655101120023012005112005111213931125096010030200200001000060200200001000012005112004711502011009910040100100001000001002000000200000002000020200321011611119820200004000210692000040100120052120048120052120052120048
602041200359000000000020001120036974031097432570100401021000120000301001000020000104267435733436346515601200273120051120052112139311252460100302002000010000602002000010000120057120053115020110099100401001000010000010020000022000010020000202003210116111198832000040002101092000040100120036120096120080120053120052
60204120035899000000002000112003696712109740257010340102100002000030100100002000010426743573343634655101120023012005212003511214131124936010030200200001000060200200001000012005312003511502011009910040100100001000001002000820200001232000802000322011613120199200064000210092000040100120052120052120052120036120048
60204120052955000000000000012004096720109744257010340102100012000030100100002000010427091573266634656260120031012003512005111214331125136010030200200001000060200200001000012005212005111502011009910040100100001000001002000002200000032000020200321011611119904200024002214002000040100120058120036120062120238120056
602041201929040110000018001112004096720109744257010340102100012000030100100002000010425228573362834656260120011012005112005111213931125146010030200200001000060200200001000012005612005311502011009910040100100001000001002000002200002032000020200321011611119828200044000201092000040100120053120057120056120057120036
60204120055899000000002000112008396720109744257010040102100002000030100100002000010427091573276234651561120031012005112003511214331125136010030200200001000060200200001000012005512003511502011009910040100100001000001002000002200000002000020200321011611119828200004000201492000040100120052120056120052120036120056
602041200558990000000020000120040967231097442570100401021000020000301001003120000104270915732666346562611200110120035120055112143311251360100302002000010000602002000010000120035120035115020110099100401001000010000110020000022000000020000000003210116111198282000040010014132000040100120036120056120036120056120036
602041200559350000000000000120040967201097242570103401021000120000301001000020000104252285732666346562611200310120055120035112143311251360100302002000010000602002000010000120055120035115020110099100401001000010000010020003202000201220000222203210116111198342000040002101092000040100120056120041120056120056120056
60204120055899000000002001112002096720109724257010340102100012000030100100002000010427091573343634656261120027012005112005111213931125096010030200200001000060200200001000012005512003511502011009910040100100001000001002000000200000002000020000321011611119882200004000214092000040100120036120056120056120056120060

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01)cycle (02)03090e0f181e22233a3f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a3a6a8a9acafb5dcache load miss (bf)c2c5cdcfd5d6ddinst fetch restart (de)e0e6? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
600251200478991012111001200329397510972425700134001010000200003001010000200001042502657332443464880120023120047120047112146311252760010300202000010000600202000010000120047120047115002110910400101000010000110200000200000002000020000314041646119822200004000276692000040010120048120048120048120048120048
6002412004789901006901001200209663710973625700134001210001200003001010000200001042502657332443465234120023120047120047112158311252760010300202000010000600202000010000120035120047115002110910400101000010000010200002200000002000022000314041646119822200004000275002000040010120036120036120036120048120048
60024120047899011020001200329663710973625700134001010001200003001010000200001042502657332443465234120129120035120055112158311251560010300202000010000600202000010000120035120088115002110910400101000010000010200002200000002000022000314051653119822200004000256652000040010120048120036120048120048120036
60024120047899000021001200329397510972425700134001010001200003001010000200001042502657332443465234120023120047120047112158311252760010300202000010000600202000010000120047120035115002110910400101000010000110200002200000002000022000314051655119822200004000056652000040010120048120048120048120036120048
60024120047899000021001200329663710973625700134001010001200003001010000200001042502657326663466666120011120035120047112158311252760010300202000010000600202000010000120035120047115002110910400101000010000110200002200000002000022000314051653119822200004000056652000040010120048120048120048120048120048
60024120047900000021001200209663710973625700134001210000200003001010000200001042502657332443464880120023120047120047112158311252760010300202000010000600202000010000120047120035115002210910400101000010000010200002200000002000022000314061645119822200004000256652000040010120048120048120036120048120086
60024120047899000021001200329663710972425700134001010000200003001010000200001042502657332443465234120023120035120035112146311253160010300202000010000600202000010000120047120047115002110910400101000010000010200002200000002000022000314031666119822200004000256652000040010120036120036120052120048120036
60024120047900001020001200329664010973642700104001010000200003001010000200001042397457332443465234120023120035120047112158311252760010300202000010000600202000010000120047120075115002110910400101000010000110200002200001002000022000314071646119822200004000271652000040010120048120048120036120036120048
60024120035899000001001200329397510973625700104001210001200003001010000200001042397457332443464880120023120047120035112158311252760010300202000010033600202000010000120047120047115002110910400101000010000110200002200000002000022000314051645119822200004000256052000040010120048120052120048120048120036
60024120047899000021001200329397510973625700134001010001200003001010000200001042397457332443464880120011120047120051112158311251560010300202000010000600202000010000120047120047115002110910400101000010000010200002200000002000022000314051656119810200004000256652000040010120049120048120048120036120048

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldnp q0, q1, [x6]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01)cycle (02)030e0f1e223a3f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a3a6a8a9acafb5dcache load miss (bf)cfd5d6ddinst fetch restart (de)e0e6? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
602051200478991121012004196712109724257010340102100012000030100100002000010426743573343634651561200231200471200351121393112505601003020020000100006020020000100001200511200471150201100991004010010000100001100200000200000032000022321011611119808200024000010652000040100120052120048120036120052120052
60204120047899002101200209740310973625701034010010001200003010010000200001042674357326663465156120023120047120051112135311250960100302002000010000602002000010000120035120047115020110099100401001000010000110020000220000000200002032101161111980820000400020692000040100120052120052120052120036120048
602041200928990021112003696712109736257010040102100012000030100100002000010425228573281034651561200271200511200511121353112509601003020020000100006020020000100001200471200471150201100991004010010000100000100200082200000002000022321011611119824200004000210002000040100120036120036120052120036120036
602041200478990020012002396720109724257010340100100012000030100100002000010425228573324434655101200111200351200581121393112493601003020020000100006020020000100001200511200471150201100991004010010000100000100200002200000002000022321011611119809200004000210092000040100120036120036120052120048120148
602041200358990101112003296716109740257010340102100002000030100100002000010426743573343634655101200231200351200351121353112493601003020020000100006020020000100001200511200471150201100991004010010000100000100200000200000002000002321011611119824200004000210002000040100120036120052120048120036120048
60204120051899012111200389740310972425701034010210001200003010010000200001042639557326663465156120023120035120047112123311250960100302002000010000602002000010000120035120035115020110099100401001000010000110020000020000000200002232101161111982420000400020652000040100120052120036120048120048120048
602041200478991121112002096716109740257010040102100012000030100100002000010425228573266634655101200271200351200511121393112509601003020020000100006020020000100001200511200351150201100991004010010000100000100200002200000002000002323911611119824200004000210692000040100120052120052120052120036120052
602041200478990021012003496712109740257010040102100012000030100100002000010426395573324434655101200271200471200351121353112505601003020020000100006020020000100001200471200471150201100991004010010000100000100200002200001002000020321011611119820200004000210092000040100120036120048120052120036120049
602041200518990000012003296712109724257010340102100002000030100100002000010425228573343634655101200271200471200471121393112509601003020020000100006020020000100001200351200351150201100991004010010000100000100200002200001002000020321011611119824200004000210692000040100120036120052120052120052120052
60204120035899002011200329671610973625701034010210001200003010010000200001042674357326663465626120027120051120051112139311249360100302002000010000602002000010000120035120047115020110099100401001000010000110020000020000000200002232101161111982020000400026002000040100120049120048120036120048120036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01)cycle (02)0308090a0b0e0f18191e1f23243a3f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a3a6a8acafb5dcache load miss (bf)c2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0e6? int output thing (e9)eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
600251200518990100000020000120036939751097442570013400101000120000300101000020000104239745733628346535001200270120055120051112146311253560010300202000010000600202000010000120122120069115002110910400101000010000010200002200000020000220000314061646119826200004000010002000040010120056120056120036120052120052
6002412003589900000000000011200409397510974425700134001010001200003001010000200001042537457336283465466012001101200351200591121623112531600103002020000100006002020000100001200511200841150021109104001010000100000102000002000000200002200003140416341198302000040002101002000040010120056120056120036120056120052
60024120051899000000009900001200409664510972425700134001210001200003001010000200001042397457326663465466012001101200551200551121663112535600103002020000100006002020000100001200511200961150021109104001010000100000102000022000000200002200003140417441198102000040002010132000040010120036120052120052120056120056
6002412003589900000000200011200209664110974425700134001210001200003001010031200001042563557337723464880112001131200511200551121463112531600103002020000100006002020000100001200351200995150021109104001010000100000102000022000003200002200003140416341198102000040002101002000040010120150120036120198120057120036
600241200559000000000011000012031996641109724257001340012100002000030010100002000010423974573362834653500120027012005112003511216631125356001030020200001000060020200001000012010912005911500211091040010100001000001020000220000102000000000131404164411983020000400000002000040010120056120052120052120056120052
60024120055899000000001400011200209397510974525700104001210001200003001010000200001042537457326663465466012003101202151201521122023112701601843021020000100006002020000100001201041202794150021109104001010000100000102000022000013200000200003140416431198322000040002140132000040010120056120056120037120056120037
60024120035899000000002000012004196647109744257001340012100002000030010100002000010423974573362834653500120331012003512003511216631126666001030020200001000060020200001000012011712003611500211091040010100001000001020000020000002000022000031403163411983020000400021414132000040010120056120036120056120052120056
60024120055900000010000000312334997029110915582704124023610074201123239910795213081055230258022683507384012206001223871223811132491601140466467232578216961088764942209441057012233412245728150021109104001010000100000102000022006416265320066222000366851699141216652006240251140132000040010122444122521122150122706122647
600241220989171021003642555030800001200209703211042455070359402051007620096326751095221312105670635813944351016401200270120051120055112162871127526347032564214541082064942212561095112281412328334150021109104001010000100000102005822000000200000200003140416371198302002240216141092000040010123574123238123263123239123068
6002412005593000011100150010120032966371097412570010400121000120000300101000020000104253745732714346523401200230120047120047112146311253160010300202000010000600202000010000120051120047115002110910400101000010000010200002200000020000220000314041634119810200004000010602000040010120048120052120036120036120052

Test 4: throughput

Count: 8

Code:

  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  ldnp q0, q1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6674

retire (01)cycle (02)0304090e0f191e1f22233a3f4346494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a5a6a8a9acafb5b6bbdcache load miss (bf)c2branch mispredict (cb)cfd5d6ddinst fetch restart (de)e0e6eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
1602055339440011110450101533832111325160100100160000100160000500233396205337353398533983331733333216010020016000020016000053394533741180201100991001008000080000110016000004301600000038160000613943015110116115339116003801001600001005337553395533955337553395
16020453398400000004400005335921116251601001001600001001600005002299840053349533985339833321333356160100200160000200160000533945339411802011009910010080000800000100160000000160000003916003800044005110116115339516000014071600001005339553395533755339953395
160204533983990000045010153359212121625160100100160000100160000500233326205336953398533943331733333216010020016000020016000053374533941180201100991001008000080000010016000000016003800381600386039430051101161153391160038101001600001005339553375533955337553399
1602045339440000000440100533792111925160100100160000100160000500233626405337353374533943332033335616010020016000020016000053394533941180201100991001008000080000010016000000016003800431600386139440051101161153391160038141471600001005339953375533995337553399
1602045337440000000001015337921120251601001001600001001600005002336264053373533985337433321333356160100200160000200160766533745340911802011009910010080000800000100160000043122160000003816003801390005110116115337116003814041600001005339953396533965342053405
1602045339840000000440101533830101925160100100160000100160000500235227505337353398533983332033335616010020016000020016000053394533741180201100991001008000080000010016000004301600390001600380139440051101161153371160039141071600001005339953399533755339953399
160204533984000010000101533832112025160100100160000100160000500233289705334953398533743332133333216010020016000020016000053398533941180201100991001008000080000110016000004301600000038160038013944005110116115337116005414071600001005339753399534675346853375
1602045337440000010000005338321019251601001001600001001600005002336264053373533745339833321333332160100200160000200160000533985339411802011009910010080000800000100160000000160000103816000001384400511011611533951600380071600001005337553399533755337553399
160204533944000000044880005338320120251601001001600001001600005002332897053349533985339833321333356160100200160000200160000533985337411802011009910010080000800000100160000043016003800391600006139430051101161153395160038101471600001005339953375533995339953395
160204533943990000045000153359200025160100100160000100160000500233289705337353374533983329633335616010020016000020016076853398534891180201100991001008000080000010016000004301600390038160000610440051101161153395160039141001600001005339953399533995337553395

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6675

retire (01)cycle (02)030508090b0e0f1e22233a3f4346494f51inst issue (52)~issue int (53)~issue ld/st (55)~dispatch int (56)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map lookup int (7f)~map lookup ld/st (80)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)92inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a5a6a7a8a9acafb5b6bbdcache load miss (bf)dtlb miss (c1)c2cfd5d6daddinst fetch restart (de)e0e6eaebecld/st retires (ed)gpr retires (ef)f5f6f7f8fd
160025534024001100002110153366307125160010101600001016000050233513215337753867534143335433338216001020160000201600005340253402118002110901010800008000001016002020430160019103251600396159431905020616065534001600591305160000105340353404534045340353403
1600245340240010110021102533980701925160010101600001016000050233299715337753411534083334833338216001020160000201600005340353402118002110901010800008000001016001919430160019100611600400121431915020216023533781600190135160000105340353382533825340353403
16002453402400101000210025338720719251600101016000010160000502334678153356534115341733348333382160010201600002016000053402534021180021109010108000080000010160019194301600190002116000061594319250203160335339916005913135160000105340353403534035340353382
160024534023991101002110153387007125160010101600001016000050233299715337753400534083332633338216001020160000201600005340253403118002110901010800008000001016002020433016001911230160000015901905020616033534481600591305160000105340353403534045340353406
160024534024001100006700353366270025160010101600001016000050233578115337753392534133334733338216001020160000201600005340653402118002110901010800008000001016001919430160059101211600390019019150203160555339916005813135160000105340353403534035340353403
1600245338140011010021103534780771251600101016000010160000612346902153623534065338533361333364160010201600002016000053853534031180021109010108000080000010160020194301600580022116003961594319150206160535339916005913130160000105341153406534035340653382
1600245338140011110066102533872071925160010101600001016000050233578115338953402534023332633338216001020160000201600005340353402118002110901010800008000001016001919430160059101211600396159451915020516055533991600190130160000105338253403534035340353403
160024534024001110002110053366377025160010101600001016000050232902015335653411534183334833338316001020160000201600005340653402118002110901010800008000001016001919430160019011211600006019019050205160555339916001913135160000105340353403534035340853405
160024534174001001006610153387000225160010101600001016000050232902015337753419534103332633338216001020160000201600005340253402118002110901010800008000001016001920430160059111611600406119018150205160365339916005913130160000105340353404533825340353403
1600245340240010011066103533870770251600101016000010160000502328655153403534025340233326333361160010201600002016000053402533811180021109010108000080000010160021194301600600006116004061594319150203160655337816001913130160000105340353403533825338253403