Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDIV (slow, 32-bit)

Test 1: uops

Code:

  udiv w0, w1, w2
  mov w1, #0xffffffff
  mov w2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10042039150619502510001000100053725203920391801318971000100020002039261111001100000733161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039160619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040
10042039150619502510001000100053725203920391801318971000100020002039261111001100000731161119801000100020402040204020402040

Test 2: Latency 1->2

Chain cycles: 2

Code:

  udiv w0, w1, w2
  eor x1, x1, x0
  eor x1, x1, x0
  mov w1, #0xffffffff
  mov w2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020410003574900000006191261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191021711997223000030100100036100036100036100036100036
30205100035749000000072691261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
30204100035749000000010391261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
3020410003575000000006191261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
30204100035749000000072691261253010030100301009493443049969551000351000359587303962423010030200604541000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
30204100035749000000061912612530100301003010094934430499695510003510003595873039624230100302006020010003519311302011009910030100100000001200191011711997223000030100100036100036100036100036100036
3020410003574900000006191261253010030100301009493443098969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
302041000357500000000166591261253010030100301009493443049969551000351000359587303962423010030200604481000351931130201100991003010010000000600191011712997223000030100100036100036100036100036100063
3020410003575000000006191261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036
3020410003574900000006191261253010030100301009493443049969551000351000359587303962423010030200602001000351931130201100991003010010000000000191011711997223000030100100036100036100036100036100036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024100035749061911822530010300103001094787670499695501000351000359588539626530010300206002010003519311300211091030010100060189021622997143000030010100036100036100036100036100036
300241000357490726911822530010300103001094787670499695501000351000359588539626530010300206002010003519311300211091030010100090189021623997143000030010100036100036100036100036100075
30024100035749061911822530010300103001094787670499695501000351000359588539626530010300206002010003519311300211091030010100090189021622997143000030010100036100036100036100036100036
30024100035749061911822530010300103001094787671499695501000351000359588539626530010300206002010003519311300211091030010100061189021622997143000030010100036100036100036100036100036
300241000357490726911822530010300103001094787671499695501000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036
30024100035749061911822530010300103001094787671499695501000351000669588539626530010300206002010003519311300211091030010100060189021632997143000030010100036100036100036100036100036
3002410003574901079118225300103001030010947408004996955010003510003595885396265300103002060020100035193113002110910300101000150189021622997143000030010100036100036100036100036100036
30024100035749061911822530010300103001094787670499695501000351000359588539626530010300206002010003519311300211091030010100090189021722997143000030010100036100036100036100036100036
30024100035749061911822530010300103001094787671499695501000351000359588539626530010300206002010003519311300211091030010100090189021622997143000030010100036100036100036100036100036
30024100035750061911822530010300103001094787670499695501000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  udiv w0, w1, w2
  eor x2, x2, x0
  eor x2, x2, x0
  mov w1, #0xffffffff
  mov w2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204100035750000006191261253010030100301009493443049969551000351000359587322962423010030200602001000351931130201100991003010010000600191021711997223000030100100036100036100036100036100036
30204100035749000003609126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000900191011711997223000030100100036100036100036100036100036
30204100035749000006691261253010030100301009493443049969551000351000359587339624230100302006020010003519311302011009910030100100009000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000900191011711997223000030100100036100036100036100036100036
3020410003574910000619126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000010191011711997223000030100100036100036100036100036100036
30204100035749000006191261253010030100301009493443049969551000351000359587339624230100302006020010003519311302011009910030100100001200191011711997223000030100100036100036100036100036100036
3020410003575000000619126125301003010030100949344304996955100035100035958733962423010030200602001000351931130202100991003010010000000191011711997223000030100100036100036100036100036100036
30204100035749000009699126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003575000090619126125301003010030100949344304996955100035100035958733962423010030200602001000351931130201100991003010010000900191011711997223000030100100036100036100036100036100036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002410003574900044191182253001030010300109478767149969551000351000359588539626530010300206002010003519311300221091030010100000189021622997143000030010100036100036100036100036100036
300241000357490006191182253001030010300109478767049969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100127100036100036
300241000357490006191182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036
3002410003574900085291182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036
300241000357490006191182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021623997143000030010100036100036100036100036100036
300241000357490006191182253001030010300109478767198969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036
3002410003574900072691182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100212100036100036100036
300241000357490006191182253001030010300109478767049969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036
300241000357500006191182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021623997143000030010100036100036100036100036100036
300241000357500006191182253001030010300109478767149969551000351000359588539626530010300206002010003519311300211091030010100000189021622997143000030010100036100036100036100036100036

Test 4: throughput

Count: 8

Code:

  udiv w0, w8, w9
  udiv w1, w8, w9
  udiv w2, w8, w9
  udiv w3, w8, w9
  udiv w4, w8, w9
  udiv w5, w8, w9
  udiv w6, w8, w9
  udiv w7, w8, w9
  mov w8, #0xffffffff
  mov w9, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204160039119900000006179950258010080100801254399351104915695916003916003914990131499978010080200160200160039261118020110099100801001000000300511000316111599808000080100160040160040160040160040160040
802041600391198000000072679950258010080100801004399225154915695916003916003914990131499978010080200160200160039261118020110099100801001000000000511051116111599808002580100160040160040160040160040160040
802041600391198000000061799502580100801008013843992251549156959160039160039149901315002780100802001602001600392611180201100991008010010000001200511000116111599808000080100160040160040160040160040160040
802041600391199000000072679950258010080100801004399225154915695916003916003914990131499978010080200160200160039261118020110099100801001000000001511051016111599808002580100160080160040160040160040160040
80204160039119800000006179950258010080100801004399225054915695916003916003914990131499978010080200160200160039261118020110099100801001000000000511000116111599808000080100160040160040160040160040160040
802041600391198000000015679950258012580125801254399351154915695916003916003914990131499978010080200160200160039261118020110099100801001000001000511051116111599808000080100160040160040160040160040160040
80204160039119800000006179950258010080100801004399225054915695916003916003914990131499978012580200160200160080261118020110099100801001000000000511050116111599808000080100160040160040160040160040160040
8020416003911990000000127879878468010080135801254399351054915695916003916003914990131499968012580200160200160039261118020110099100801001000001000513750265111600188006280100160040160040160040160040160040
8020416003911990000012010379950258011380100801004399225154915695916003916003914990131499978010080200160622160039275118020110099100801001000000000511000116111599808000080100160040160040160040160040160040
80204160039119900100006179950258010080100801004399225054915695916003916003914990131499978010080200160200160039261118020110099100801001000010000513451116111599808000080100160040160040160040160040160040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 61 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024160039119915061799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100000050205321600111599808000090080010160040160040160040160235160040
80024160039119800726799022580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100000050205311600111599808000050080010160040160040160040160040160040
8002416003911990061799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100000050205311600111599808000050080010160040160040160040160040160040
800241600391198147061799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100900050205311600111599808000050080010160040160040160040160040160040
80024160039119900726799502580010800108001043987750154915705901600391600391499233150019800108002016002016003926111800211091080010100300050205311600111599808000051080010160040160040160040160040160040
8002416003911981201097995067800528002480202439905901549156959016003916003914992312150054800108002016002016003926111800211091080010101300150205311600111601328000050080010160040160040160040160040160040
80024160039119900726799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100000050205411600111599808000050080010160040160040160040160040160040
800241600391198011154799382580010800108001043989240154915695901600881600391499233150054800108002016002016003926111800211091080010100000050205411600111599808000050080010160040160040160040160040160040
80024160039119999061799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100001050205411600111599808000050080010160040160040160040160040160040
8002416003911990061799502580010800108001043987750154915695901600391600391499233150019800108002016002016003926111800211091080010100000050205411600111599808000050080010160040160040160040160040160040