Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (slow, 64-bit)

Test 1: uops

Code:

  sdiv x0, x1, x2
  mov x1, #0x8000000000000000
  mov x2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100420401509495025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401606195025100010001000537250204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401506195025100010001000538940204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401506195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401506195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401506195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401606195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015126195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401536195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401506195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0
  mov x1, #0x8000000000000000
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c3 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204110035824061100940253010030100301001043262814910695511025811016810585031063063072130200616061102162101130201100991003010010003001910216111096663000030100110036110036110036110036110036
3020411003582487726100940253010030100301001043262804910695511007611003510581531062433010030200602001100352101130202100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582401988100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
302041100358246061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824961100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010010001910116111096663000030100110036110036110036110036110036
30204110035824061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890316331096773000730010110036110036110036110036110036
3002411003582400000061101155253001030010300101043463104910695501100351100351058482210624730010300206002011003521011300211091030010100001890416331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890316331096773000030010110036110036110036110036110036
30024110035824000096044110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890416331096773000030010110036110036110036110036110036
300241100358240000306110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890316331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010301526002011003521011300211091030010100001890316331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890316331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010102001890316331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010101001890316331096773000030010110036110036110036110036110036
300241100358240000006110115525300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100001890316331096773000030010110036110036110036110036110036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0
  mov x1, #0x8000000000000000
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss instruction (0a) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020411003582400007261009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000320001910316111096663000030100110036110036110036110036110036
302041100358240000611009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000310011910116111096663000030100110036110036110036110036110036
302041100358240000611009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000600001910116111096663000030100110036110036110036110036110036
30204110035824007260611009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000390001910116111096663000030100110036110036110036110036110216
302041100358240020724061100940253010030100301001043262804910695501100351100351058150310624330100302006020011003521011302011009910030100100020001910116111097403000030100110077110036110036110036110036
302041100358250000611009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000562463201910116111096663000030100110168110172110079110081110036
302041100358240012061100940253010030100301001043262804910695501100351100351058150310624330100302006020011003521011302011009910030100100273001910116111096663000030100110036110036110036110036110036
302041100358240012061100940253010030100301001043262804910695501100351100351058440310624330100302006020011003521011302011009910030100100020001910116111096663000030100110036110036110036110036110036
302041100358240000611009402530100301003010010432628049106955011003511003510581533106243301003020060200110035210113020110099100301001000350001910116111096663000030100110036110036110036110036110036
302041100358240001086311009402530100301003010010432628049106955011003511003510581503106243301003020060200110035210113020110099100301001000390001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300241100358240000103101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010201201890316311096773000730010110036110036110036110079110036
30024110035832100010510115525300103001030010104346311491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003582400006110115525300103001030010104346310491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003582400006110115525300103001030010104346311491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003582400006110115525300103001030010104346310491069551100351100351058483106285300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003582500006110115525300103001030010104346310491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
30024110035824000077010115525300103001030010104346310491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003582400006110115525300103001030010104346310491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
30024110035825000072610115525300103001930010104346311491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036
3002411003585200006110115525300103001030010104346310491069551100351100351058483106265300103002060020110035210113002110910300101000001890316331096773000030010110036110036110036110036110036

Test 4: throughput

Count: 8

Code:

  sdiv x0, x8, x9
  sdiv x1, x8, x9
  sdiv x2, x8, x9
  sdiv x3, x8, x9
  sdiv x4, x8, x9
  sdiv x5, x8, x9
  sdiv x6, x8, x9
  sdiv x7, x8, x9
  mov x8, #0x8000000000000000
  mov x9, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020416004011990110879950258010080100801004399225149156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110316111599808000080100160041160041160041160041160041
802041600401199061799502580100801008010043992250491569600160040160040149901314999880100802001602001600402611180201100991008010010000630005110116111599808000080100160041160041160041160041160041
80204160040119906179950258010080100801004399225049156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110116111599808000080100160041160041160041160041160041
80204160040119906179950258010080100801004399225049156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110116111600418000080100160041160041160041160041160041
80204160040119806179950258010080100801004399394049156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110116111599808000080100160041160041160041160041160041
80204160040119806179950258010080100801004399225049156960016004016004014990131499988010080200160200160040261118020110099100801001000006005134116111599808000080100160041160041160041160041160041
80204160040119906179950258010080100801004399225049156960316004016004014990131499988010080200160200160040261118020110099100801001000000005110116111599808000080100160041160041160041160041160041
8020416004011990617995025801318010080143439922514915696001600401600401499012014999880100802001602001600822611180201100991008010010000540005110116111599808000080100160041160041160083160041160041
80204160040119806179950258010080100801004399225149156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110116111599808000080100160041160041160041160041160041
8020416004011992472679950258010080100801004399225149156960016004016004014990131499988010080200160200160040261118020110099100801001000000005110116111599808000080100160041160041160041160041160081

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800241600401199000006179950258001080010800104398775014915696001600401600401499233150020800508002016002016004026111800211091080010100000005020011601115998080000080010160041160041160041160041160041
8002416004011990000072679950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100000005020011601115998080000080010160041160041160041160041160041
800241600401198000006179950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100000005020011601115998080000080010160041160041160041160041160041
800241600401198000006179950258001080010800104398775019815696001600401600401499233150020800508002016002016004026111800211091080010100010005020011601115998080000080010160041160041160041160041160041
800241600401199000006179950258001080010800104398949014915706201600911600901499237150020800108002016002016004026111800211091080010100040005020011601115998080000080010160041160041160081160041160041
8002416004011990000066479950258001080010800104398942014915393101600401600821499233150020800108002016002016004026111800211091080010100000605022011601115998080000080010160041160041160041160041160041
8002416004011990000072679950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100000005020011631115998080000080010160041160041160041160041160041
800241600401198000008279950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100030005020011631115998080000080010160041160041160041160041160041
8002416004011990000072679950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100000905020011631115998080000080010160041160041160041160041160041
800241600401199000006179950258001080010800104398775014915696001600401600821499543150020800108002016002016004026111800211091080010100000005020011631115998080000080010160041160041160041160041160041