Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (slow, 64-bit)

Test 1: uops

Code:

  sdiv x0, x1, x2

Register setup (not part of the measured, unrolled code):

  mov x1, #0x7fffffffffffffff
  mov x2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1004204016006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040160186195025100010001000537250204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204016006195025100010001000537250204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040160061950251000100010005372512040204018013189810001000200020402611110011000112731161119801000100020412041204120412041
1004204015006195025100010001000537251204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
1004204015006195025100010001000537250204020401801318981000100020002040261111001100000731161119801000100020412041204120412041

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0

Register setup (not part of the measured, unrolled code):

  mov x1, #0x7fffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204110035824000000980100940253010030100301001043262814910695511003511003510581531062433010030200602001100352101130201100991003010010000001910216111096663000030100110036110121110036110036110036
3020411003582400000061100940253010030100301001043262814910695511003511003510581531062433010030200602001100352102130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582400000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582400000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010030001910116111096663000030100110036110036110036110036110036
3020411003582400000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582500000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582400000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001930116211097583000030100110036110036110036110127110036
302041100358251232400061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
3020411003582400000061100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036
30204110035824000000964100940253010030100301001043262804910695511003511003510581531062433010030200602001100352101130201100991003010010000001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024110035824061101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890416231096773000030010110036110036110036110036110036
30024110035824361101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890316331096773000030010110036110036110036110036110036
3002411003582502166101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000031890316431096773000030010110036110036110036110036110036
30024110035824061101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890416331096773000030010110036110036110036110036110036
300241100358240103101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890316231096773000030010110036110036110036110036110036
300241100358240611011552530010300103001010434631149106955110035110035105848310626530010300206002011003521011300211091030010100500001890316231096773000030010110036110036110036110036110036
300241100358250279101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890316331096773000030010110036110036110036110036110036
30024110035824061101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000041890316231096773000030010110036110036110036110036110036
30024110035824061101155253001030010300101043463114910695511003511003510584831062653001030020600201100352101130021109103001010000001890316241096773000030010110036110036110036110036110036
300241100358240726101155253001030010300101043463114910695511003511003510584831062653001030020600201100642101130021109103001010000001890316231096773000030010110036110036110036110036110036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0

Register setup (not part of the measured, unrolled code):

  mov x1, #0x7fffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041102168240006035210310094025301003010030100104326284910695511003511003510581503106243301003020060200110035210113020110099100301001000020001910416111096663000030100110036110036110036110036110036
302051100358240000072610094025301003014230100104326284910695511003511003510581507106243301003020060200110035210113020110099100301001000020001910116111096663000030100110036110036110215110036110036
30204110212824000006110094025301003010030303104326284910695511003511003510581503106243301003020060200110035210113020110099100301001004020001910116111096663000030100110036110036110036110036110036
302041100358240000072610094025301003010030100104326284910695511003511003510581503106243301003020060200110035210213020110099100301001000029701001910116111096663000030100110036110036110036110036110036
30204110035824000393526110094025301003010030100104326284910713311003511003510581503106243301003020060200110035210113020110099100301001000020001910116111096663004630100110036110036110036110036110036
302041100358240000072610094025301003010030100104406514910695511003511003510581503106243301003020060200110035210113020110099100301001000009001910116111096663000030100110036110036110036110036110036
302041100358240000010310094025301003014430100104326284910695511003511003510581503106243301003020060200110035210113020110099100301001000010001910116111096663000030100110036110036110036110036110036
3020411003582400012053610094025301003010030100104326284910695511003511003510581501910624330100302006020011003521011302011009910030100100023112001910116111096663000030100110036110036110217110036110036
30204110035824010006110094025301003010030516104326284910695511003511003510581503106243301003020060200110035210113020110099100301001000010001910116111096663000030100110036110036110216110036110036
302041100358240000072610094025301003010030100104326284910695511003511003510581503106243301003020060200110035210113020110099100301001000020001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300241100358250061101155253001030010300101043463104910695511003511003510584803106265300103002060020110035210113002110910300101000001890216241096773000230010110036110036110036110067110036
300241100358240061101155253001030010300101043463104910695511003511003510584803106265300123002060020110035210113002110910300101000001890216231096773000030010110036110036110036110036110036
300241100358240061101155253001030010300101043463114910695511003511003510584803106265300103002060020110035232113002110910300101000001890216251096773000030010110036110036110036110036110036
300241100358240061101155253001030010302021043463114910695511003511003510584803106265300103002060020110035210113002110910300101000001890216241096773000030010110036110036110036110036110036
300241100358250061101155253001030010300101043463114910695511003511003510584803106265300103002060020110035210113002110910300101000001890216241096773000030010110036110036110036110036110036
3002411003582500611011552530010300103001010434631049106955110035110035105848031062653001030020600201100352321130021109103001010610001890216221096793000030010110036110036110036110036110036
300241100358240061101343253001030010300101043463104910695511003511003510584803106265300103002060020110035232113002110910300101020001890216251096773000030010110036110036110036110036110036
300241100358240061101155253001030010300101043488304910695511003511003510584803106265300103002060020110035232113002110910300101000001890216241096773000030010110036110036110036110036110036
3002411003582400251101155253001030010301151043463114910695511003511006410584803106265300103002060020110035210113002110910300101000001890216231096773000030010110036110036110036110036110036
300241100358240061101155253001030010300101043463104910695511003511003510584803106265300103002060284110035210113002110910300101000001890216221096773000030010110036110036110036110036110036

Test 4: throughput

Count: 8

Code:

  sdiv x0, x8, x9
  sdiv x1, x8, x9
  sdiv x2, x8, x9
  sdiv x3, x8, x9
  sdiv x4, x8, x9
  sdiv x5, x8, x9
  sdiv x6, x8, x9
  sdiv x7, x8, x9

Register setup (not part of the measured, unrolled code):

  mov x8, #0x7fffffffffffffff
  mov x9, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020416004011990617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000005110316111599808000080100160041160041160041160041160041
80204160040119807267995025801008010080100439922504915696001600821600401499013149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041
8020416004011980617995025801008010080100439922514915696001600401600401499013149998801008020016020016004026111802011009910080100100000005131116111599808000080100160041160041160041160041160041
8020416004011990617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041
8020416004011990617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041
8020416004011980617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000015110116111599808000080100160041160041160041160041160041
8020416004011990617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041
8020416004011990617995025801008010080100439922504915696001600401600401499413149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041
8020416004011980617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000105110116111599808000080100160041160041160041160041160041
8020416004011990617995025801008010080100439922504915696001600401600401499013149998801008020016020016004026111802011009910080100100000005110116111599808000080100160041160041160041160041160041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024160040119800061799502580010800108001043987751491569600160040160040149923315002080010800201601401600402611180021109108001010000005020001201614171599808000080010160041160083160041160041160041
80024160040119900061799502580010800108001043987751491569600160040160040149923315002080010800201600201600402611180021109108001010000005021001181616141599808000080010160041160041160041160041160079
8002416004011981006179950258001080010800104398775149156960016004016004014992331500208001080020160020160040261118002110910800101000000502100123169171599808000080010160041160041160141160092160041
80024160040119910774747799282580010800258001043987751491569600160040160040149923315002080010800201600201600402611180021109108001010000005021801171610171599808000080010160041160041160041160041160041
800241600401199100449799502580010800108001043987751491569600160040160040149923315002080010800201600201600402611180021109108001010100005021601111617131599808000080010160041160041160041160041160041
8002416004011991001677799502580010800108001043987751491569600160040160040149923315002080010800201600201600402611180021109108001010001005021611171617171599808000080010160041160041160041160041160041
8002416004011981006179950258001080010800104398775149156960016004016004014992331500208001080020160020160040261118002110910800101000000502160191617141599808000080010160041160041160041160041160041
80024160040119910072679950258001080010800104398775149156960016004016004014992331500208001080020160020160040261118002110910800101000000502160114161791599808000080010160041160041160041160041160041
8002416004011991008279950258001080010800104398775149156960016004016004014992331500208001080020160020160040261118002110910800101000000502160117169171599808000080010160041160041160041160041160041
800241600401199100726799502580010800108001043987751491569600160040160040149923315002080049800201600201600402611180021109108001010360005021601171617171599808000080010160041160041160041160041160041