Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDNP (D)

Test 1: uops

Code:

  ldnp d0, d1, [x6]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200539931111006510338421818152510001000100015399374399399973132100020001000398398111001100010000102021411057005910386157421917311611395105799210001000400403399399399
200439931110016500238421818162510001000100015358373398398963131100020001000398399111001100010000101919421057215910386157421917311611395105799210001000399400400400399
200439931110006510338421818162510001000100015358373398399963132100020001000399398111001100010000102020421058015910376157421917311611395105799210001000399400400400399
200439931011006500338421818152510001000100015352374398398963132100020001000399400111001100010000101920421058125910386157421917311611396105799210001000400400400400399
200439931011006500238421818152510001000100015362373398400963132100020001000399398111001100010000101921421054115910386157421917311611396105699210001000403399400400400
200439831010006500338421818152510001000100015362373398399963132100020001000398398111001100010000101921421058115910386157421917311611396105799210001000400399399399400
200439921100006500338321818162510001000100015345374399398963131100020001000412399111001100010000101919421057016010386157421917311611395105699210001000399399400400400
200439831010006500238421818162510001000100015358373398398963132100020001000398399111001100010000102020421057115910386157421917311611440105799210001000400400400399399
200439921100006500338421818162510001000100015358373398398963131100020001000399398111001100010000102020411057105910366157421917311611396105799210001000399400399399400
20043993101000650033843186162510001000100015344374399399963131100020001000399398111001100010000102021421059115910386157411917311611396105799210001000399399399400399

Test 2: Latency 1->3 roundtrip (operand 1, d0, fed back into operand 3, the x6 base address)

Chain cycles: 3

Code:

  ldnp d0, d1, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0050

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205120103931000000100001200201195381094552560100401021000110000301001000010000107912657360356117882012002312004712004711188131124215010030200200001000060200100001000012004912003511502011009910040100100001000001001000011000060121000010100032101135111196571000040002060100001000040100120051120048120051120051120036
6020412005089900100060000120035119494109455256010340102100011000030100100001000010791265736035611759901200261200351200501118813112416501003020020126100006020010000100001200481200351150201100991004010010000100000100100001100000001000010100032101100111196571000040000998100001000040100120036120051120048120051120051
6020412005089900000000000120035119494109458256010340102100011000030100100001000010791265735293611788201200261200501200471118813112374501003020020000100006020010000100001200511200471150201100991004010010000100000100100001100000001000010100032101108111196551000040002068100001000040100120051120051120036120048120036
6020412005090000000090000120035119538109443256010340100100001000030100100001000010795395735293611825001200231200351200471118813112374501003020020000100006020010000100001200571200471150201100991004010010000100000100100001100000001000010100032101132111197391000040002960100001000040100120051120036120048120048120167
6020412005089900000010000120035119503109443256010340102100001000030100100001000010793305736035611825001200261200501200531119033112416501003020020000100006020010000100001200481200351150201100991004010010000100000100100001100000001000010000032101100111196461000040002968100001000040100120051120051120051120048120036
6020412004789900000010100120020119494109458256010040100100011000030100100001000010795395736035611759901200261200501200501118953112374501003020020000100006020010000100001200501200351150201100991004010010000100000100100001100000001000010100032101129111196571000040000990100001000040100120051120051120051120036120036
60204120050899000000100001200201195031094432560103401001000110000301001000010000107953957360356117599012002612005012005011189571124945012330227200221000960254100101000912005412005111502011009910040100100001000001001000001000000010000001000321011081111965610000400000100100001000040100120052120052120052120036120055
6020412005489900000000010120039119509109461256010340102100011000030100100001000010791265736230611828501200271200991200541119003112407501003020020000100006020010000100001200511200511150201100991004010010000100000100100000100000001000000100032101108111196461000040002131012100001000040100120055120055120055120055120052
602041200559080000000000012003911950910946125601034010210000100003010010000100001079396573623061175990120011120051120054111896311240750100302002000010000602001000010000120035120051115020110099100401001000010000010010000110000000100000010003210412111119666100004000213012100001000040100120036120055120055120036120052
602041200358990000006000012003911950910944325601004010210001100003010010000100001079396573623061175990120030120054120051111900311241750100302002000010000602001000010000120051120051115020110099100401001000010000010010000010000003100000101113220016001197271000040002965100001000040100120036120051120051120051120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0060

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0f | 1e | 22 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | icache miss (d3) | d5 | d6 | da | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002512005789901102101200451194931094672560016400141000110000301581000010000107985857363746126051012003612006012005711192831124815001030020200001000060020100001000012004112005711500211091040010100001000001010001211000200211000011012003140012990881196591000140004131012100001000040010120058120061120061120042120042
6002412006090011101111202301194931094672560013400141000210000300101000010000107987057365186126051112003312006012005711190931124865001030020200001000060020100001000012006012005711500211091040010100001000001010001311000200311000001111003140089907811967810001400041300100001000040010120063120042120064120064120042
600241200608991110210120045119493109449256001640014100021000030010100001000010799425735593612605111201891200601201411119283112457500103034120428100006034210000101601205451202506150021109104001010000100000101000121100030101100001111200314009990771196771000140004131312100001000040010120061120042120061120042120042
6002412006089911001111200421194901094642560016400141000210000300101000010000107991557363746126051112003612004112005311201131124545001030665200001000060344100641000012005912005811500211091040010100001000011010002201000200111000011110003140089909811965910001400040100100001000040010120061120061120061120042120058
60024120060900100021012004511949310946725600164001410002100003001010000100001079942573637461235001120017120060120060111909311246150010300202000010000600201000010000120057120041115002110910400101000010000010100011110002001110000111110031400799088119678100014001813109100001000040010120042120061120061120042120061
6002412005789910102101200261194841094642560016400121000110000300101000010000107994257364226123500112003612006012006011192831124875001030020200001000060020100001000012006012006011500211091040010100001000001010002211000202116100000111100314008990881196781000140004131312100001000040010120061120042120058120061120042
6002412006089911002001200261194841094492560016400141000210000300101000010000107987057363746126051112003612006012006011190931124745001030020200001000060020100001000012006012005711500211091040010100001000001010002201000100111000011110003140089908811967510000400021300100001000040010120042120061120042120061120061
60024120052899100011012004511949310946425600164001210002100003001010000100001079942573651861260511120036120041120041111925311245550010300202000010000600201000010000120060120057115002110910400101000010000010100021110001001110000111110031400799088119678100014000413109100001000040010120058120042120042120061120061
6002412005790010101101200451194931094672560016400121000310000300101000010000107987057363746126051112003312006012005711192831124575001030020200001000060020100001000012006012004111500211091040010100001000001010002201000200111000011111003140079928811967510001400040100100001000040010120058120058120042120042120061
600241200608991111210120045119484109467256001340012100021000030010100001000010799425736518612350011200361200571200601119283112472500103002020000100006002010000100001200601200411150021109104001010000100000101000210100020021100001111000314008990771196591000140010131312100001000040010120058120061120061120042120066

Test 3: Latency 2->3 roundtrip (operand 2, d1, fed back into operand 3, the x6 base address)

Chain cycles: 3

Code:

  ldnp d0, d1, [x6]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0057

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 0a | 0b | 0f | 1e | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051200578991111012102120042119512109464256010640104100021000030100100001000010794235736374611917211200331200571200571119033112410501003020020000100006020010000100001200571200571150201100991004010010000100001100100032110002111100001111003212712187119669100014000210109100001000040100120058120058120058120058120058
60204120041900111110200212004211951210944925601064010410002100003010010000100001079423573559361191721120033120057120105111903311241050100302002000010000602001000010000120057120041115020110099100401001000010000010010002211000202110000111110321271007511966910001400021009100001000040100120058120058120058120042120042
60204120057900111110200212004211951210946425601064010210002100003010010000100001079423573637461191721120033120057120059111897311241550100302002000010000602001000010000120057120057115020110099100401001000010000010010001101000201110000111100321251217511966910001400041009100001000040100120042120058120058120058120058
602041201498991111107002120026119530109464256010340104100021000030100100001000010794235736374611751811200331200411200571119033112410501003020020000100006020010000100001200571200571150201100991004010010000100000100100032110000000100001111103212712177119669100014000410100100001000040100120058120058120058120058120058
602041200578991111102103120042119512109464256010640104100021000030100100001000010794235736374611917211200331200411200571119033112415501003020020000100006020010000100001200571200571150201100991004010010000100000100100022110002001100001111103212712177119669100014000410109100001000040100120058120058120058120058120058
602041200578991111002102120042119512109575256010640104100021000030100100001000010793685736374611751811200171200411200571119033112410501003020020000100006020010000100001200571200571150201100991004010010000100000100100032010002021100001111103212712177119669100014000210109100001000040100120058120058120058120058120058
60204120057899110110210112004211951210945225601064010410002100003010010009100001079368573559361175181120033120057120057111903311241550100302002000010000602001000010000120057120057115020110099100401001000010000010010002311000201110000111100321271007511966910001400040109100001000040100120058120058120058120058120058
602041200418991101102101120026119512109464256010640112100011000030100100001000010793685736374611751811200171200571200571119033112415501003020020000100006020010000100001200411200571150201100991004010010000100000100100023110003011100001111103212710066119669100014000410110100001000040100120042120042120058120058120058
60204120057899111100210112004511951210947125601064010210002100003010010000100001079423573637461175181120033120057120057111903311241050100302002000010000602001000010000120057120041115020110099100401001000010000010010001111000200110000011110321271217711967010001400021009100001000040100120042120058120058120058120058
602041200578991111102102120042119512109464256010640104100021000030100100001000010794235736374611917211200331200571200411119033112410501003020020000100006020010000100001200571200571150201100991004010010000100000100100022110003011100001111103212712176119651100014000410100100001000040100120058120058120058120042120042

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 07 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cf | d0 | d2 | icache miss (d3) | d5 | d6 | da | db | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002512005389900010002101001200351194831094432560010400121000010000300101000010000107976957358886125581001200230120047120050111918031124445001030020200001000060020100001000012003512003511500211091040010100001000001010000011000000151000011000314000019900111196651000040002660100001000040010120036120036120051120051120048
60024120047899000110010100120035119483109455256001340010100011000030010100001000010797695735888612558100120026012003512003511191803112447500103002020000100006002010000100001200501200351150021109104001010000100000101000001100000001000001000314000019900111197661000040000905100001000040010120051120036120048120051120051
60024120050899000100010000120035119483109458256001340012100011000030010100001000010797695736035612558100120026012005012004711191803112447500103002020000100006002010000100001200471200471150021109104001010000100000101000001100000001000011000314000019900111196651000040002968100001000040010120048120048120048120036120036
60024120050899000000010100120020119483109458256001340010100011000030010100001000010798455736035612558101120011012005012005011190303112447500103002020000100006002010000100001200501200471150021109104001010000100000101000000100000031000000000314000019400111196651000040000998100001000040010120036120051120036120051120051
60024120035899000000060000120032119483109458256001340012100011000030010100001000010798455736035612434400120023012005012005011190303112437500103002020000100006002010000100001200501200471150021109104001010000100000101000001100000001000000000314000019900111196681000040002968100001000040010120051120051120051120036120036
600241200508990000100100000120020119489109444256001340012100011000030010100001000010798455735888612571400120026012005012005011191803112447500103002020000100006002010000100001200471200351150021109104001010000100000101000001100000001000010000314000019400111196651000040002960100001000040010120051120048120036120051120051
6002412003589900000001010012005211948010945525600134001010001100003001010000100001079845573603561255810012002601200351200501119030311244450010300202000010000600201000010000120047120035115002110910400101000010000010100324010030108593110032100003860000231900331203371003740189668100001000040010120051120036120051120051120036
600241200509002101026262655237610012290012089611024595060325402401005910054341751162711231115340458057106178215011224460124062123416112630051311431359892349992417011927707101202611771123717123680341500211091040010100001000001010001001000012145251003310200314000019400541212171004040180990100001000040010120051120053120036120051122054
6002412043193101101002020100120035120495109562961603374025610063100643506211897113841151571580471561902510112002601200471200471119030354112447500103002020000100006002010000100001201151200371150021109104001010000100000101000001100000041000001000314000019400111196721000040002101012100001000040010120055120055120055120055120055
60024120051899000110010001120039119484109461256001040012100011000030010100001000010798585736230612574601120030012003512005111198703112465500103002020000100006002010000100001200351200351150021109104001010000100000101000001100002028729100001100031400001990011119650100004000210109100001000040010120055120052120055120055120052

Test 4: throughput

Count: 8

Code:

  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  ldnp d0, d1, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020526727200004510026714212181825801001008000010080000500116875402670226727267226645366858010020016000020080000267222672211802011009910010080000800001100800000390800350142800396135005110216222672480035106280000800001002672726728269672674026723
160204267312001041100267122121811825801001008000010080000500117462802670626731267076650366858010020016000020080000267272672211802011009910010080000800001100800000390800350038800396135390511021622267198000066480000800001002673226728269642673926728
16020426727200000101267123120125258010010080000100800005001174628026702267272672766503668080100200160000200800002672726722118020110099100100800008000011008000003930800390139800396103945110216222672480039108480000800001002675326729267082672826728
160204267172000041102267122121216258010010080000100800005001173183026702267272670766303668080100200160000200800002672726722118020110099100100800008000001008000003908003900398003961350051102162226724800391010480000800001002673226738267232672826708
16020426727200004100226692212121625801001008000010080000500117010702669726722267276650366858010020016000020080000267272670711802011009910010080000800000100800000390800390039800356135430511021622267248003900480000800001002673426730267232670826728
160204267272000045002266922121816258010010080000100800005001174628026682267072672766303668580100200160000200800002672726707118020110099100100800008000001008000003908003900398003561043051102162226724800391010480000800001002673626731267322672326728
1602042672720000450022669221218162580100100800001008000050011687540266972670726722665236685801002001600002008000026707267221180201100991001008000080000010080000039080035003980039610430511021622267248003966280000800001002673126715267372673526728
160204267272000045002267122121225258010010080000100800005001168880026702267072672266453668580100200160000200800002672726722118020110099100100800008000001008000003908003900398003961043051102162226719800391010280000800001002670826723268772678226724
1602042672720000451022671221212202580100100800001008000050011746280267822672726707664536680801002001600002008000026727267221180201100991001008000080000110080000039080035003980039613943051102162226724800391010480000800001002684026727267402681126728
160204267222001145002267122181870258010010080000100800005001170107026702267272672766453668080100200160000200800002670726722118020110099100100800008000001008000003908003500398003561354305110216222672480039106080000800001002673726748267342672826728

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002526722200000100410002669200127625800101080000108000050117462826702267072672266683670280010201600002080000267072672211800211091010800008000001080000039080039039800396100502028162626267198003510608000080000102674326743267282672826723
1600242672720000001045010266920180162580010108000010800005011688802668226724267076653367118001020160000208000026727267221180021109101080000800000108000003908003903980035013543502024161424267048003510608000080000102681126734267312672326728
1600242672720000000000122671220123258001010800001080000501173183267022671026722667236687800102016000020800002670726722118002110910108000080000110800000390800350398003961393950202616132526704800000628000080000102682526730267162672326728
160024267222000000004601226707212014258001010800001080000501170107266822672226727665336707800102016000020800002672226731118002110910108000080000010800000430800352398000061354350201216241326724800390648000080000102672326735267132670826718
1600242671520000000000002671221218025800101080000108000050116888026702267272670766723668780010201603842080000267272672211800211091010800008000011080000000800350398000061039502025161524267198003510048000080000102680726734267422672326708
16002426727200000000410002671201801225800101080000108000050117010726682267072672266683668780010201600002080000267072672211800211091010800008000001080000039080035038800006035050202016262126719800396048000080000102673426739267232672826728
1600242672720000000045010266922120025800101080000108000050116875426697267072672766733670780010201600002080000267272672211800211091010800008000001080000039080000540800006035395020131626132670480039101048000080000102673926737267082672826708
16002426722201000000001026712212017258001010800001080000501170107267022672726707667336707800102016000020800002670726722118002110910108000080000010800000390800390398000060354350201916251226724800000008000080000102688326931270062693626723
16002426727200000000001026712212182258001010800001080000501168880269072672226707667236702800102016000020800002672726722118002110910108000080000010800000390800160968003961394350202825202626704800000048000080000102672826728268552677826727
1600242704320211000019710400267122180342608001010800001080000501173183267062672726727667236687800102016000020800002672726722118002110910108000080000010800000390800393398000061394350202616132526704800350628000080000102674326725267402672826708