Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (fast, 64-bit)

Test 1: uops

Code:

  sdiv x0, x1, x2
  mov x1, #0
  mov x2, #0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100420381610000008495025100010001000537250203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
1004203815100000011295025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537250203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381610000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
1004203816100091806195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039
100420381510000006195025100010001000537251203820381801318961000100020002038261111001100000731161119811000100020392039203920392039

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0
  mov x1, #0
  mov x2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035674000606181120253010030100301008522656149869550900359003585900386243301003020060200900351761130201100991003010010000001910316228975130000301009003690036900369003690036
30204900356740000061811202530100301003010085226560498695509003590035859003862433010030200602009003517611302011009910030100100030001910216228975130000301009003690036900369003690036
3020490035674000006181120253010030109301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000101910216228975130000301009003690036900369003690036
3020490035675000006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000001910216228975130000301009003690036900369003690036
3020490035675000006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000101910416228975130000301009003690036900369003690036
30204900356740000072681120253010030100301008522656098869550900359003585900386243301003020060200900351761130201100991003010010000001910216228975130000301009003690036900369003690036
3020490035674000006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000001910216228975130000301009003690036900369003690066
3020490035674000006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000001910216228975130000301009003690036900369003690036
3020490035675000006181120253010030100301008522656049869550900359003585900386243301003020060200900351761130201100991003010010000001910216228975130000301009003690036900369003690036
3020490035675000006181120253010030100301008522656049869550900359003585900386243302013020060200900351761130201100991003010010000001910416228975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002490035674000000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316238975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890216238975130000300109003690036900369003690036
30024900356740000000726812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010010301890416228977830000300109003690081900369003690079
3002490035675001000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316328975130000300109003690036900369003690036
300249003567400002035261812102530010300103001085229014986955900359007685922386265300103002060020900351761130021109103001010000001890216238975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316338975130000300109003690036900369003690036
3002490035674000000061816432530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890216328975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229014986955900359003585922386265300103002060020900351762130021109103001010000001890216328975130000300109003690036900369003690036
3002490035674000000061812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890216218975130000300109003690036900369003690036
30024900356740000000251812102530010300103001085229014986955900359003585922386265300103002060020900351761130021109103001010000001890316228975130000300109003690036900369003690036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0
  mov x1, #0
  mov x2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035674006181120253010030100301258522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910316118975130000301009003690036900369003690036
3020490035675106181252253012530125301258522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
30204900356750126181120423010030100301008522656149869559003590035859000386240301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035675006181120253010030100301008522656049869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035675006181120253010030100301008522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656049869559003590035859000386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035674006181120253010030100301008522656149869559003590035859000386243301003020060200900351761130201100991003010010000001910116018975130000301009003690036900369003690036
3020490035675006181120253010030100301008522656149869559003590035859000386243301003020060200900351941130201100991003010010000001910116118980830000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002490035674006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890316228975130000300109003690036900369003690036
3002490035674006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890216238975130000300109003690036900369003690036
3002490035675006181210253001030010300108522901149869559003590035859223862753001030020600209003517611300211091030010100001890316228975130000300109003690036900369003690036
30024900356740072681210253001030010300108522901049869559003590035859223862653001030020600209007617611300211091030010100001890216228978530000300109003690077900369003690036
3002490035674006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890216228975130000300109003690036900369003690036
3002490035675006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890316228975130000300109003690036900369003690036
3002490035674006181210313001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890216228975130000300109003690036900369003690036
3002490035674006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100101890216228975130000300109003690036900369003690036
3002490035674006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100031890216328975130000300109003690036900369003690036
3002490035675006181210253001030010300108522901149869559003590035859223862653001030020600209003517611300211091030010100001890216228975130000300109003690036900369003690036

Test 4: throughput

Count: 8

Code:

  sdiv x0, x8, x9
  sdiv x1, x8, x9
  sdiv x2, x8, x9
  sdiv x3, x8, x9
  sdiv x4, x8, x9
  sdiv x5, x8, x9
  sdiv x6, x8, x9
  sdiv x7, x8, x9
  mov x8, #0
  mov x9, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020416003811980000061799502580100801368010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110316111599808000080100160039160039160039160039160039
8020416003811990000061799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
8020416003811990000061799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
8020416003811990000061799502580100801008010043992250491569580160038160038149940314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
802041600381198000001568799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
802041600381199000006322799502580100801008010043992251491569580160038160038149901314999680100802001602001600382611180201100991008010010020000005110116111599808000080100160039160039160039160039160039
8020416003811980000061799502580100801008010043992250491569580160038160076149901314999680138802001602001600382611180201100991008010010000003005110116111599808000080100160039160039160039160039160039
8020416007811980000061799502580100801008010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
80204160038119900021061799502580100801008010043992250491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160077160039160039160039160039
8020416003811990000061799502580100801008010043992251491569583160038160038149901314999680100802001602001600382611180201100991008010010000100005110116111599808000080100160039160039160039160039160039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002416003811980000009904828799502580010800108001043987751491569580160038160038149923315001880010800201600201600382611180021109108001010000000502015161518159980800000080010160039160039160039160039160039
8002416003811980000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502015161815159980800000080010160039160039160039160039160039
8002416003811980000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502018161518159980800000080010160039160039160039160039160039
8002416003811990000000061799502580010800108001043987750491539270160038160038149923315001880010800201600201600382611180022109108001010000000502015161715159980800000080010160039160039160039160039160039
80024160038119800000000726799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010003000502018161620159980800000080010160039160039160039160039160039
8002416003811990000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502024161719159980800000080010160039160039160039160039160039
80024160038119800000000726799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502014161519159980800000080010160039160039160039160039160039
8002416007911990000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502017161818159980800000080010160039160039160039160039160039
8002416003811990000009061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000001502015161815159980800000080010160039160039160039160039160039
8002416003811980000000061799502580010800108001043987750491569580160038160038149923315001880010800201600201600382611180021109108001010000000502017161717159980800000080010160039160039160039160039160039