Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDIV (fast, 64-bit)

Test 1: uops

Code:

  udiv x0, x1, x2
  mov x1, #0
  mov x2, #0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100420381500619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500829502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381600619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381600619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381600619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500849502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039
100420381500619502510001000100053725203820381801318961000100020002038261111001100000732162219811000100020392039203920392039

Test 2: Latency 1->2

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0
  mov x1, #0
  mov x2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035675000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910316228975130000301009003690036900369003690036
3020490035675000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369007790036
302049003567500000330061811202530100301003010085226560498695590035900358589838624330100302006020090035343113020110099100301001000001910216228975130000301009003690036900369003690036
3020490035674000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369003690036
3020490035674000302790128811202530100301003010085226560498695590035900358590038631130100302006044290035176113020110099100301001000001910216228975130000301009003690036900369003690036
30204900356740000000726811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369003690036
3020490035675000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369003690036
30204900356740000083702097811522530100301003043285273580498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369003690036
3020490035674000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000001910216228975130000301009003690036900369003690036
30204900356740000000618112025301003010030100852265604986955900359003585900386308301003020060200900351761130201100991003010010034001910216228975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024900356750000000726812102530010300103010885229010498695509003590035859223862653001030020600209003517611300211091030010100004000018900216228975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000000018900216248975130000300109003690036900369039690081
30024900356740001310230600812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000060018900288228994430015300109003690036900369003690036
300249003567500000903061810992530010300103001085229010498699909003590080859223862653001030020600209003517621300211091030010100000000018900216228975130000300109008090036900369003690081
3002490035674000011206181210253001030010300108522901049869550900359003585922386265300103002060020900351761130021109103001010000334030018900216228981830000300109003690036900369003690036
30024900356740000010230726812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000030018900216228981330000300109003690036900369003690036
3002490035674000000061812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000000018900216228975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000000018900216228975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000230018900216228975130000300109003690067900369003690036
3002490035674000000061812102530010300103001085229010498695509003590035859223862653001030020600209003517611300211091030010100000000018900224228975130000300109003690036900369006790036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0
  mov x1, #0
  mov x2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035675000618112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019102161189751300081301009003690036900369003690036
3020490035674058507268112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
302049003567402850618112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101162189751300000301009003690036900369003690036
30204900356750120618112025301003010030100852265614986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
3020490035674000618112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
30204900356740002048112025301003010030100852265614986955390035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
30204900356750177887268112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
3020490035674000618112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
3020590035675000618112025301003010030100852265604986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036
3020490035674000618112025301003010030100852265614986955090035900358590038624330100302006020090035176113020110099100301001000019101161189751300000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
300249003567400000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316438975130000300109003690036900369003690036
300249003567500000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000301890316338975130000300109003690036900369003690036
3002490035674000300703812102530010300103001085229014986955900359007485922386265300103002060020900351761130021109103001010000001890550338975130000300109003690036900369012790214
30024900356740100120726812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036
300249003567500000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036
3002490035674000021061812102530010300103001085229014986955900359003585922386265300103002060020900351762130021109103001010000001890316338975130008300109003690036900369003690036
300249003567400000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036
300249003567400110082812102530010300103001085229014986955900359003585922386265301233002060020900351761130021109103001010000001890416338975130000300109003690036900369003690036
300249003567400000061812102530019300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036
300249003567400000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036

Test 4: throughput

Count: 8

Code:

  udiv x0, x8, x9
  udiv x1, x8, x9
  udiv x2, x8, x9
  udiv x3, x8, x9
  udiv x4, x8, x9
  udiv x5, x8, x9
  udiv x6, x8, x9
  udiv x7, x8, x9
  mov x8, #0
  mov x9, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020416003811990000617995025801008010080100439922504915695816003816003814990131499968010080200160200160038261118020110099100801001000015110216111599808000080100160039160039160039160039160039
8020416003811980000617995025801008010080100439922514915695816003816003814990131499968010080200160200160038261118020110099100801001000005132116111599808000080100160039160039160039160039160039
8020416003811990000617995025801008010080100439922514915695816003816003814990131499968010080200160200160038261118020110099100801001000005110116111599808000080100160039160039160039160039160039
8020416003811990000617995025801008010080100439922504915695816003816003814990131499968010080200160200160038261118020110099100801001000005110116111599808000080100160039160039160039160039160039
8020416003811990000617995025801008010080100439922504915695816003816003814990131499968010080200160200160038261118020110099100801001000005110116111599808000080100160039160039160039160039160039
80204160038119900002517995025801008010080100439922504915695816003816003814990131499968013880200160200160038261218020110099100801001000005110116111600348000080100160039160039160039160039160039
8020416003811980000617995025801008010080100439922514915695816003816003814990131499968010080200160200160038261118020110099100801001000005110116111599808000080100160039160039160039160039160039
802041600381199000017367995038801008010080100439922504915695816003816003814990131499968010080200160200160038261118020210099100801001000005110116111599808000080100160039160039160039160039160039
8020416003811990000617995025801008010080100439922514915695816003816003814990131499968010080200160328160038261118020110099100801001000005110116111599808000080100160039160039160039160039160039
8020416003811990000947995025801008010080100439922504915695816003816003814990131499968010080200160200160038261118020110099100801001000005110116111599808000080100160039160039160039160039160075

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | d9 | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024160038119900000061799502580010800108001043987751491569580160038160038149923315001880010800201600201600382611180021109108001010000000502061600551599808000042080010160039160039160039160039160039
8002416003811990007740061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502041600451599808000021080010160039160039160039160039160039
8002416003811980000006179950258001080010800104398775049156958016003816003814992331500188001080020160020160038261218002110910800101000000050205160035160165800290080010160039160039160039160039160376
800241600381198031000768799382288001080010800104398775049156958016003816003814992331500188001080020160020160038261118002210910800101001000050205240055159980800000080010160039160039160039160039160039
80024160038119800021006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000001050205160055159980800000080010160039160039160039160039160039
8002416003811980000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000050205160066159980800000080010160039160039160039160039160039
8002416003811980006006179950258001080010800104398775049156958016003816003814992331500188001080020160020160038261118002110910800101000000050205160077159980800000080010160039160039160039160039160039
8002416003811990000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038528118002110910800101000000050205160054159980800000080010160039160039160039160039160039
8002416003811990000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000050205160076159980800000080010160039160039160039160039160039
8002416003811980000008479950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000050205160055159980800000080010160039160039160039160039160039