Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDIV (slow, 64-bit)

Test 1: uops

Code:

  udiv x0, x1, x2

Initialization (register setup; not part of the unrolled code):

  mov x1, #0xffffffffffffffff
  mov x2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401512619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
100420401504639502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100003731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041
10042040150619502510001000100053725204020401801318981000100020002040261111001100000731161119801000100020412041204120412041

Test 2: Latency 1->2

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0

Initialization (register setup; not part of the unrolled code):

  mov x1, #0xffffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041100358240003061100940253010030100301001043262849106955011003511007410581533106243301003020060200110064210113020110099100301001000008800001910216111096663000030100110036110036110036110036110036
3020411003582400006110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910116101096663000030100110036110036110036110213110036
3020411003582500006110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910116111096663000030100110036110036110036110036110036
3020411003582400008210094025301003010030100104326284910695531100351100351058150310624330100302006020011003521011302011009910030100100000000001910116111096663000030100110036110036110036110036110036
3020511003582400006110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910016111096663000030100110036110036110036110036110036
3020411003582400006110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000001001910116111096663000030100110036110036110036110036110036
3020411003582500006110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000001001910116111096663000030100110036110036110036110036110036
3020411003582400106110094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910116111096663000030100110036110036110036110036110036
3020411003582400006110094025301003011030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910116111096663000030100110036110036110036110036110036
30204110035825000072610094025301003010030100104326284910695501100351100351058150310624330100302006020011003521011302011009910030100100000000001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002411003582400000006110115525300103001030010104346314910695511003511003510584831062653001030020600201100352101130021109103001010000000018903162210967730000030010110036110036110036110036110036
3002411003582400000006110115525300103001030010104346314910695511003511003510584831062653001030020600201100352101130021109103001010000030018902162210967730000030010110036110036110036110036110036
3002411003582500000006110115525300103001030010104346314910695511003511007610584831062653001030020600201100352101130021109103001010000000018902162210967730000030010110036110066110036110036110036
30024110035824000090006110115525300103001030010104346314910695511003511003510584831062653001030020600201100352101130021109103001010000000018902162210967730000030010110036110036110036110036110036
30024110035824000000072610115525300103001030010104346314910695511003511003510584831062653001030020600201100352101130021109103001010000000018903162210967730000030010110036110036110036110036110036
300241100358240000000611011552530010300103001010434631491069551100351100351058483106265300103002060020110035210113002110910300101000001560018902162811028530000030010111194111195111148111233111201
30024111185833112125316812321144781009945323015630165326221044117349107760111110111192106038112106824325193303365854110790210191300211091030010100032359860018902162210967730000030010110036110036110036110036110036
30024110035824000000053610115525300103001030010104346314910695511003511003510587231062653001030020600201100352101130021109103001010000000018902162210967730000030010110036110036110036110036110036
3002411003582400000006110115525300103001030010104346314910695511003511003510584831062653001030020600201100352101130021109103001010000000018902163210967730000030010110036110036110036110036110036
300241100358240000000747101155253001030010300101046431849106955110035110035105848310626530010300206002011003521011300211091030010100006500018902162210967730000030010110036110036110036110036110036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0

Initialization (register setup; not part of the unrolled code):

  mov x1, #0xffffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020411003582406110094025301003010030100104326280491039150110035110035105824710623930100302086021611003521011302011009910030100100001111918016001096803000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105824610623830100302086021611003521011302011009910030100100491111917016001096803000030100110036110036110036110036110036
30204110035824074710094025301003010030100104326280491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111096663000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111096663000030100110036110036110036110036110036
30204110035824025110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100630001910116111096663000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111096663000030100110036110036110036110036110036
30204110035824072610094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001937116111096663000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111097733000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111096663000030100110036110036110036110036110036
3020411003582406110094025301003010030100104326281491069550110035110035105815310624330100302006020011003521011302011009910030100100000001910116111096663000030100110036110036110036110036110036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 9.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024110035824000000300892101155110019253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
30024110035824110000003381011550253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000011890416441096773000030010110036110036110036110036110036
3002411003582400000060168101155025300103001030010104346310491069550110035110035105848310626530010300206002011003521011300211091030010100018301890416441096773000030010110036110036110036110036110036
30024110035824000000002521011550253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
30024110035824000000001661011550253001030010300101043463114910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110057
30024110035825000000003591011550253001030010300101043463114910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
3002411003582400000000611011550253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
3002411003582400000000611011550253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
30025110035824000000003401011550253001030010300101043463104910695501100351100351058483106265300103002060020110035210113002110910300101000001890416441096773000030010110036110036110036110036110036
30024110035824000000002501011550253001030010300101043463114910695501100351100351058483106265300103002060020110035210113002110910300101000001890416241096773000030010110036110036110036110036110036

Test 4: throughput

Count: 8

Code:

  udiv x0, x8, x9
  udiv x1, x8, x9
  udiv x2, x8, x9
  udiv x3, x8, x9
  udiv x4, x8, x9
  udiv x5, x8, x9
  udiv x6, x8, x9
  udiv x7, x8, x9

Initialization (register setup; not part of the unrolled code):

  mov x8, #0xffffffffffffffff
  mov x9, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802041600401198030617995025801008010080100439922519815696016004016004014990131499988010080200160200160040261118020110099100801001000005110216111599808000080100160041160041160041160041160041
802041600401199007267995025801008010080100439922514915696016004016004014990131499988010080200160200160040261118020110099100801001001005110016111599808000080100160041160041160041160041160041
80204160040119900617995025801008010080100439922514915696016004016004014990131499988010080200160200160040261118020110099100801001000905110116111599808000080100160041160041160041160041160041
802041600401198007267995025801008010080100439922504915696016004016004014990118149998801008020016020016004026111802011009910080100100218605110116221600198000080100160041160041160041160041160041
802041600401199006179950258010080100801004399225149157010160077160040149901211500348027580308160200160040261118020110099100801001001305110116111599808000080100160041160041160041160041160041
802041600401198107267995025801008010080100439922514915696016004016004014990131499988010080200160200160040261118020110099100801001000005110116111599808000080100160041160041160041160041160041
80204160040119800617995025801008010080139439922514915696016004016004014990131499988010080200160200160040261118020110099100801001000305110116111599808000080100160041160041160041160041160041
80204160040119900726799502580100801008010043992251491569601600401600401499013149998801008020016020016004026111802011009910080100100017405110116111599808000080100160041160041160041160041160041
80204160040119900617995025801008010080100439922514915696016004016004014990131499988010080200160200160040261118020110099100801001000005110117111599808000080100160041160041160041160041160041
8020416004011990217267995025801008010080100439922514915696016004016004014990131499988010080200160200160040261118020110099100801001000005110116111599808000080100160041160041160041160041160041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800241600401198000000006179950458001080040800104398954014915696001600401600401499233150020800108007616021816004026111800211091080010101050204160431599808000080010160041160041160041160041160041
80024160242119801128120010379950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010103050203160441599808000080010160041160041160041160041160041
800241600401199000000006179950258001080023800104398775014915696001600401600401499233150020800108002016002016004026121800211091080010100050205320441599808000080010160041160041160041160041160041
80024160040119900000648006179950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100050204160431599808000080010160041160041160041160041160041
800241600401199000000006179950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100050203160441599808000080010160041160041160041160041160041
8002416004011990000000072679950258001080010800104398775014915696001600401600401499233150020800108002016002016008026111800211091080010100050204160441599808001380010160041160041160041160041160041
800241600401198000000006179950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010101050204160341599808000080010160041160041160041160041160041
800251600401199000000006179950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100050204160341599808000080010160041160041160041160041160041
800241600401199000000006179950258001080010800104398775114915696001600401600401499233150020800108002016007416004026111800211091080010100050204160341599808000080010160091160041160041160041160142
8002416009111990000000032679950258001080010800104398775014915696001600401600401499233150020800108002016002016004026111800211091080010100050204160441599808000080010160041160041160041160041160041