Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDIV (fast, 32-bit)

Test 1: uops

Code:

  udiv w0, w1, w2
  mov w1, #0
  mov w2, #0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100420381506195025100010001000537250203820381801318961000100020002038261111001100000000731161119811000100020392039203920392039
100420381506195025100010001000537250203820381801318961000100020002038261111001100000000731161119811000100020392039203920392039
1004203815017895025100010001000537251203820381801318961000100020002038261111001100000000731161119811000100020392039203920392039
1004203815216195025100010001000537250203820381801318961000100020002038261111001100000000731162219811000100020392039203920392039
100420381508295025100010001000537250203820381801318961000100020002038261111001100000000871161219811000100020392039203920392039
100420381606195025100010001000537250203820381801318961000100020002038261111001100000000731162219811000100020392039203920392039
100420381606195025100010001000537250203820381801318961000100020002038261111001100000000731161119811000100020392039203920392039
100420381606195025100010001000537251203820381801318961000100020002038261111001100000000731162219811000100020392039203920392039
100420381506195025100010001000537250203820381801318961000100020002038261111001100000000731161219811000100020392039203920392039
100420381506195025100010001000537250203820381801318961000100020002038261111001100000000731161119811000100020392039203920392039

Test 2: Latency 1->2

Chain cycles: 2

Code:

  udiv w0, w1, w2
  eor x1, x1, x0
  eor x1, x1, x0
  mov w1, #0
  mov w2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035674006181120253010030100301008522656149869550900359003585900386243301003020060200900351761130201100991003010010000000001910216118975130000301009003690036900369003690036
30204900356750306181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
30204900356750072681120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118983630000301009003690036900369003690036
3020490035675006181120253010030100301008522656049869550900359003585900386243301003020060200900661761130201100991003010010000000001910116118975130000301009003690036900369003690036
3020490035675006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
3020490035675006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656149869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000001910116118975130000301009003690036900369003690036
30204900356740072681145253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000000301910116118975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024900356750000618121025300103001030010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890616228975130000300109003690036900369003690036
30024900356740000618121025300103001030010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890216238975130000300109003690036900369003690036
30024900356740000618121025300103001030010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890216228975130008300109003690036900369003690036
300249003567400007268121025300103001030010852290114986955090035900358592238626530113300206002090035176113002110910300101000601890216228975130000300109003690036900369003690036
30024900356740000618121025300103001630010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890216228975130000300109003690036900369003690036
30024900356740000618121025300103001030010852290114986955090035900358592238626530010300206002090035176213002110910300101000001890216228975130000300109003690036900369003690036
30024900356750000618121025300103001030010852290114986955090035900358592238626530010300206002090035176113002210910300101000001890216228975130000300109003690036900369003690036
300249003567400001058121025300103001030010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890216328975130000300109003690036900369003690036
300249003567400901038121025300103001030010852290114986955090035900358592238626530010300206002090035176113002110910300101000001890216028975130000300109003690036900369003690036
3002490035674000072681210253001030010300108522901149869550900359003585922128626530010300206002090035176113002110910300101002001890216228975130000300109003690036900369003690036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  udiv w0, w1, w2
  eor x2, x2, x0
  eor x2, x2, x0
  mov w1, #0
  mov w2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204900356740000001038112025301003010030100852265614986955900359003585900386243301003020060200900351761130201100991003010010000001020760001910216118975130000301009003690036900369003690036
3020490035674000000618112025301003010030100852265614986955900359003585900386243301003020060200900351761130202100991003010010000000000001910116118975130000301009003690036900369003690036
3020490035675000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000000000001910016118975130000301009003690036900369003690036
30204900356740000002318112025301003010030100852265614986955900359003585900386243301003020060200900351761130201100991003010010000018000001910116118975130000301009003690036900369003690036
30204900356740040005368112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000000030001910116118975130000301009003690036900369021490036
3020490035674000057302518112025301003010030100852265604986955900359003585928386243301003020060200900351765130201100991003010010000000030001910116118975130000301009003690036900369003690036
3020490035674000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000000000001910116118975130000301009003690036900369003690036
3020490035674000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000000030001910116118975130000301009003690036900369006890036
3020490035674000000618112025301003010030100852265614986955900359003585900386243301003020060200900351761130201100991003010010000000034001910116118975130000301009003690036900369003690036
3020490035675000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000000000001910116118975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024900746740136181210253001030010300108520582149869980900359003585922786265304163002060020900351761130021109103001010000018905160358975130000300109003690036900369003690036
300249003567406181210253001030010300108522901049869550900359003585922386265300103002060020900351761130021109103001010300018905160358975130000300109003690036900369003690036
300249003567506181210253001030010300108522901149869550900359003585922386265300103002060020900351761130021109103001010000018903160358975130000300109003690036900369003690036
3002490035675061812102530010300103001085229011498695509003590035859223862653001030020600209003517611300211091030010106000018905160538975130000300109003690036900369003690036
300249003567506181210253001030010300108522901049869550900769003585922386265300103002060020900351761130021109103001010000018903160538975130000300109003690036900369003690036
300249003567406181210253001030010300108522901049869550900359003585922386265300103002060020900351761130021109103001010000018905160538975130000300109003690036900369003690036
3002490035674025181210253001030010300108522901049869553900359003585922386265300103002060020900351761130021109103001010100018903160358982130000300109003690036900369003690036
300249003567406181210253001030010300108522901049869550900359006685922386265300103002060020900351761130021109103001010000018903160538975130000300109003690036900369003690036
300249003567406181210253001030010300108522901149869550900359003585922386265300103002060020900351761130021109103001010000018903160358975130000300109003690036900369003690036
300249003567406181210253001030010300108522901049869550900359003585922386265300103002060020900351761130021109103001010090018903160578975130000300109003690036900369003690036

Test 4: throughput

Count: 8

Code:

  udiv w0, w8, w9
  udiv w1, w8, w9
  udiv w2, w8, w9
  udiv w3, w8, w9
  udiv w4, w8, w9
  udiv w5, w8, w9
  udiv w6, w8, w9
  udiv w7, w8, w9
  mov w8, #0
  mov w9, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802041600801198000726799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000051102161115998080000280100160039160039160039160039160039
80204160038119800061799502580100801008010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010006051101171115998080000080100160039160039160039160039160039
80204160038119800061799502580100801008010043992251491539290160038160038149901314999680100802001602001602292611180201100991008010010000051102161115998080000080100160039160039160039160039160039
8020416003811990048617995025801008010080100439922504915695801600381600381499011814999680100802001602001600382611180201100991008010010000051101161115998080000080100160039160039160039160039160039
802041600381199000617995025801008010080100439922504915695801600381600381499013149996801008020016020016003826111802011009910080100100000511021291216094380425080100161347161393161103160961161486
80204161731121011061799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000051101161115998080000080100160039160039160039160039160039
80204160038119800061799502580100801128010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010000051231161115998080000080100160039160039160039160039160039
8020416003811980001135799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000051101161115998080000080100160039160039160039160039160039
802041600381198000726799502580100801008010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010080051101161116003280000080100160039160039160039160039160039
8020416003811980006538799502580100801008010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010000051101161115998080000080100160039160039160039160078160039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 0f | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | eb | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002416003811990000000617995025800108001080010439877504915695801600381600781499233150018800108008016002016003826111800211091080010100000050201208160042159980800000080010160039160039160039160039160039
80024160038119900000006940799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010200005020302160024159980800000080010160039160039160039160039160039
8002416003811990000000428799502580010800108004843987750491569580160038160038149923315001880010800201600201600382611180021109108001010000005020002160042159980800000080010160039160039160039160039160039
800241600381198000000061799502580010800108001043987750491569580160038160038149923315001880010802181600201600382611180021109108001010000005020304160025159980800000080010160039160039160081160039160039
800241600381199000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000005020004160024159980800000080010160039160039160039160077160039
8002416003811990000000617995025800108001080010439877504915695801600381600381499233150018800108002016002016003826111800211091080010100000050200041600421599808000018080010160039160039160039160039160039
800241600381198000000061799502580010800108001043987751491569580160038160038149923315003680049800201600201600382611180021109108001010000005020004160042159980800000080010160039160039160039160039160039
800241600381199000000061799502580010800108001043987750491569583160038160038149923315001880010800201600201600382611180021109108001010000005020004170042159980800000080010160039160077160039160039160039
800241600381199000000061799502580010800108001043987750491569580160038160038149923315001880010801671600201600382611180021109108001010000005020002160034159980800000080010160039160039160039160039160039
800241600381199000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600385301180021109108001010000005020004160044159980800000080010160039160039160039160039160039