Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

XAFLAG

Test 1: uops

Code:

  xaflag

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | f5 | f6 | f7 | f8 | fd
100410358082917251000100010006225001035103580538821000100010001035104111001007342733990100010361036103610361036
100410358061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410358061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410357061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410357061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410357061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
1004103580201917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410358061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410357061917251000100010006225001035103580538821000100010001035104111001007332733990100010361036103610361036
100410358061917251000100010006225001035103580538821000100010001035104111001307332733990100010361036103610361036

Test 2: Latency 1->1

Code:

  xaflag

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102041003575000829920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000000710127119990101001001003610036100361003610036
1020410035752120619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000000710127119990101001001003610036100361003610036
102041003576000619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000300710127119990101001001003610036100361003610036
102041003575000619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000000710127119990101001001003610036100361003610036
102041003575000619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000000710127119990101001001003610036100361003610036
1020410035750006199202510200102001020064765204969551003510035865638732102001020010200100351101110201100991000002200710127119990101001001003610036100361003610036
102041003575013588619920251020010200102006476520496955100351003586563875710200102001020010035110111020110099100000100710127119990101001001003610036100361003610036
102041003576000619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100020000710127119990101001001003610036100361003610036
102041003576000619920251020010200102006476520496955100351003586563873210200102001020010035110111020110099100000000710127119990101001001003610036100361003610036
1020410035750004299920251020010200102006476520496955100351003586563873210200102001020010127110111020110099100000000710127119990101001001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003576000126991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064032722999310010101003610036100361003610036
100241003583001284991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064032733999310010101003610036100361003610036
10024100357500061991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064022723999310010101003610036100361003610036
10024100357500061991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064032732999310010101003610036100361003610036
1002410035750002089918251002010020100206472964969551003510035867838754100201002010020100351041110021109100007864032722999310010101003610036100361003610036
10024100357500061991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064022722999310010101003610036100361003610036
10024100357500061991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064032733999310010101003610036100361003610036
100241003575001284991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064022733999310010101003610036100361003610036
100241003575000130991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064032722999310010101003610036100361003610036
10024100357500061991825100201002010020647296496955100351003586783875410020100201002010035104111002110910000064022732999310010101003610036100361003610036

Test 3: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag
  ands xzr, xzr, xzr
  xaflag

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6675

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020453408400000000002827160120160120160128106373804950328053408534083334763335716012816024080220534086611160201100991008010010000000001111011921622534051600201005340953409534095340953409
160204534084000000000059827160120160120160128106373804950328053408534083334763335716012816024080220534086611160201100991008010010000000001111011921622534051600201005340953409534095340953409
16020453408400000000002827160120160120160128106373804950328053408534083334763335716012816024080220534086611160201100991008010010000000001111011921622534051600201005340953409534095340953409
16020453408400000162020041232038744531614881615701615921021139049511430544135398233618953400516167016185580730544596622116020110099100801001000000002000103121617264542061613021005427254176543215436854274
16020454223406200000003725160100160100160100106358814950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021922534001600001005340553405534055340553405
16020453404399000000003725160100160100160100106358804950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021922534001600001005340553405534055340553405
16020453404400000000003725160100160100160100106358804950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021922534001600001005340553405534055340553405
160204534044000000000020825160100160100160100106358804950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021922534001600001005340553405534055340553405
16020453404400000000003725160100160100160100106358804950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021922534001600001005340553405534055340553405
160204534044000000060059725160100160100160100106358814950324053404534043333933335916010016020080200534046611160201100991008010010000000000001011021962534001600001005340553405534055340553405

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600245340339900000432516001016001016001010293881049502945337453374333310333351160010160020800205337466111600211091080010100000100223120171921149533701600002011105337553375533755337553375
1600245337439900000328251600101600101600101029388104950294533745337433331033335116001016002080020533746611160021109108001010000010022311061921165533701600002011105337553375533755337553375
160024533744000000043251600101600101600101029388114950294533745337433331033335116001016002080020533746611160021109108001010000010024612041941193533701600004011105337553375533755337553375
1600245337439900000613251600101600101600101029388114950294533745337433331033335116001016002080020533746611160021109108001010000010022311091921245533701600002011105337553375533755337553375
160024533743990000043251600101600101600101029388114950294533745337433331033335116001016002080020533746611160021109108001010000010022311061921165533701600002011105337553375533755337553375
160024533744000000049251600101600101600101029388114950294533745337433331033335116001016002080020533746611160021109108001010000010024621061941263533701600002011105337553375533755337553375
160024533743990000043251600101600101600101029388104950294533745337433331033335116001016002080020533746611160021109108001010000010022311091921164533701600002011105337553375533755337553375
16002453374400010004325160010160010160010102938811495029453374533743333103333511600101600208002053374661116002110910800101000001002231101019211105533701600002011105337553375533755341453375
160024533743990000043251600101600101600101029388104950294533745337433331033335116001016002080020533746611160021109108001010000010022311091921155533701600002011105337553375533755337553375
160024533743990000043251600101600101600101029388104950294533745337433331033335116001016002080020533746611160021109108001010000010022311041921169533701600002011105337553375533755337553375

Test 4: throughput

Count: 4

Code:

  fcmp s0, s0
  xaflag
  xaflag
  xaflag
  xaflag

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3353

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020413416101045255010040100100004010010000574757800001133851341413414613024673711950100402001000040200200001341413414115020110099100100100001000131232104193313411400001001341513415134151341513415
50204134141010452550100401001000040100100005801848000011338513414134146128245637119501004020010000402002000013414134141150201100991001001000010001332103193313411400001001341513415134151341513415
502041341410004525501004010010000401001000057475780000113385134141341461302456371195010040200100004020020000134141341411502011009910010010000100006332103193313411400001001341513415134151341513415
50204134141010452550100401001000040100100005747578000011338513414134146128245637119501004020010000402002000013414134141150201100991001001000010000032104193413411400001001341513415134151341513415
502041341410104525501004010010000401001000057475780000113385134141341461282456167119501004020010000402002000013414134141150201100991001001000010002032103193313411400001001341513415134151341513415
50204134141010452550100401001000040100100005747578000011338513414134146130246737119501004020010000402002000013414134141150201100991001001000010000032103193413411400001001341513415134151341513415
502041341410104525501004010010000401001000057475780000113385134141341461282467371195010040200100004020020000134141341411502011009910010010000100001532103193313411400001001341513415134151341513415
502041341410104525501004010010000401001000057475780000113385134141341461302456371195010040200100004020020000134141341411502011009910010010000100003032103193313411400001001341513415134151341513415
50204134141000452550100401001003240100100005747578000011338513414134146128246737119501004020010000402002000013414134141150201100991001001000010000032103193313411400001001341513415134151341513415
50204134621000452550100401001000040100100005747578000011338513414134146128246737119501004020010000402002000013414134141150201100991001001000010000032103193313411400001001341513415134151341513415

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024134081000452550010400101000040010100005734568000001335313382133825575378437109500104002010000400202000013382133821150021109101010000101483140419471337940000101338313383133831338313383
500241338210004525500104001010000400101000057345680000113353133821338255773784371095001040020100004002020000133821338211500211091010100001011533140619541337940000101338313383133831338313383
5002413382100045255001040010100004001010000573456800000133531338213382557537843710950010400201000040020200001338213382115002110910101000010003140519441337940000101338313383133831338313383
5002413382100066255001040010100004001010000573456800000133531338213382557737843710950010400201000040020200001338213382115002110910101000010003140519551337940000101338313383133831338313383
5002413382100045255001040010100004001010000573456800001133531338213382557737843710950010400201000040020200001338213382115002110910101000010003140419551337940000101338313383133831338313383
5002413382100045255001040010100004001010000573456800000133531338213382557737953710950010400201000040020200001338213382115002110910101000010763140519351337940000101338313383133831338313383
50024133821000452550010400101000040010100005734568000001335313382133825577379537109500104002010000400202000013382133821150021109101010000100663140519351337940000101338313383133831338313383
5002413382101045255001040010100004001010000573456800000133531338213382557537843710950010400201000040020200001338213382115002110910101000010003140519561337940000101338313383133831338313383
5002413382100045255001040010100004001010000573456800000133531338213382557737953710950010400201000040020200001338213382115002110910101000010003140419551337940000101338313383133831338313383
5002413382100045255001040010100004001010000573456800000133531338213382557737953710950010400201000040020200001338213382115002110910101000010003140419641337940000101338313383133831338313383