Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (fast, 32-bit)

Test 1: uops

Code:

  sdiv w0, w1, w2
  mov w1, #0
  mov w2, #0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100420381506195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381506195025100010001000537250203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381506195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
1004203815396195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381506195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
1004203815035095025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381606195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381506195025100010001000537250203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381596195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039
100420381506195025100010001000537251203820381801318961000100020002038261111001100007311611198110000100020392039203920392039

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv w0, w1, w2
  eor x1, x1, x0
  eor x1, x1, x0
  mov w1, #0
  mov w2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035674000000061811202530100301003010085226560498695590035900358590038624330100302006115290035176113020110099100301001000000001910316228975130000301009003690036900369003690036
30204900356740000121760390811202530100301003010085226560498695590035900358590038624330100302006020090216176113020110099100301001000100001910216228975130000301009003690036900369003690036
30204900356740000000103811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000000001910216228975130000301009003690036900369003690036
30204900356740040000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000015001910216228975130000301009003690036900369003690081
30204900356740000900618112010930100301003010085226560498695590035900358590038624330100302006020090035176113020110099100301001000800001910416228975130000301009021290036900369003690036
3020490035676000000061811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000000001910216238975130000301009003690036900369003690036
3020490035675000000061811202530100301003010085226560498695590035900358590038624330100302006020090035176113020110099100301001000000001910216228975130000301009003690036900369003690036
30204900356740040243520103811202530100301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000000001910216228975130000301009003690036900369003690036
30204900356740000000726811202530100301003010085226561498695590035900358590038624330617302006020090035176113020110099100301001002100001910216228975130000301009003690036900369003690215
3020490035688000400061811202530155301003010085226561498695590035900358590038624330100302006020090035176113020110099100301001000000001910216238975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024900356750006181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890316238975130000300109003690036900369003690036
30024900356740006181210423001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890316328975130000300109003690036900369003690036
30024900356740006181210253001030010301168522901014986955900359003585922386265300103002060020900351761130021109103001010001890316238975130000300109003690036900369003690036
30024900356740006181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890416328975130000300109003690036900369003690036
30024900356750006181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010101890216228975130000300109003690036900369003690036
300249003567400072681210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010031890316228975130000300109003690036900369003690036
30024900356740006181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890316238975130000300109003690036900369003690036
30024900356740006181210253001030010300108522901014986955900359003585922386265301263002060020900351761130021109103001010001890216438975130000300109003690036900369003690036
30024900356750006181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890216238975130000300109003690036900369003690036
30024900356740096181210253001030010300108522901014986955900359003585922386265300103002060020900351761130021109103001010001890416128975130000300109003690036900369003690036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv w0, w1, w2
  eor x2, x2, x0
  eor x2, x2, x0
  mov w1, #0
  mov w2, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | st unit uop (a7) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020490035674001000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010007001910216118975130000301009003690036900369003690036
3020490035674000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000301910117118975130000301009003690036900369003690036
30204900356740000001568112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035675000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
3020490035674000000618112042301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
30204900356750100240618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000001910116118975130000301009003690036900369003690036
302049003567500001890618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010000001910116018975130000301009003690036900689003690036
30204900356740000007268112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010001001910116118975130000301009003690036900369003690036
3020490035674000000618112025301003010030100852265604986955900359003585900386243301003020060200900351761130201100991003010010001301910116118975130000301009003690036900369003690036
302049003567400000061811202530100301003010085226560498695590035900358590038624330100302006020090035176113020110099100301001000018301910116108975130000301009003690036900369003690036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 7.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002490035674000000043681210253001030010300108522901004986955900359003585922038626530010300206002090035176113002110910300101000001890716668975130000300109003690036900369003690036
300249003567500000006181210253001030010300108522901004986955900359003585922038626530010300206002090035176113002110910300101000001915616558975130000300109003690036900369003690036
3002490035675000000072681210253001030010300108522901014986955900359003585922038626530010300206002090035176113002110910300101000001890616668975130000300109003690036900369003690036
300249003567400000006181210253001030010300108522901014986955900359003585922038626530010300206002090035176113002110910300101000001890616658975130000300109003690036900369003690036
3002490074674000000061812102530010300103001085229010149869559003590035859220386265300103002060020900761761130021109103001010370001890716778975130000300109003690036900369003690036
300249003567500000006181210253001030010300108522901004986955900359003585922038626530010300206002090035176113002110910300101000001890716778975130000300109003690036900369012590036
300249003567400000006181210253001030010300108522901004986955900359003585922038626530010300206002090035176113002110910300101000001890716778975130000300109003690036900369003690036
300249003567400000006181210253001030010300108522901014986955900359003585922038626530010300206002090035176113002110910300101000001890616758975130000300109003690036900369003690036
300249003567400000006181210253001030010300108522901004986955900359003585922038626530010300206002090035176113002110910300101000101890716798975130000300109003690036900369003690036
3002490035675000000072681210253001030010300108522901004986955900359003585922038626530010300206002090035176213002110910300101009001890716778975130000300109003690036900369003690036

Test 4: throughput

Count: 8

Code:

  sdiv w0, w8, w9
  sdiv w1, w8, w9
  sdiv w2, w8, w9
  sdiv w3, w8, w9
  sdiv w4, w8, w9
  sdiv w5, w8, w9
  sdiv w6, w8, w9
  sdiv w7, w8, w9
  mov w8, #0
  mov w9, #0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020416003811980006179950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110416111599808000080100160039160039160039160039160039
8020416003811990006179950258010080100801004399225491569580160038160038149901314999680100802001602001600382612180201100991008010010000000005110116111599808000080100160039160039160039160039160039
80204160038119810010379950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010023060005110116111599808000080100160039160039160039160039160039
80204160038119900072679950258010080100802614399798491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116101599808000080100160039160039160039160039160039
8020416003811980006179950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
80204160038119900072679950258010080100801004399225491569580160038160038149901314999680100802001603281600382611180201100991008010010000000005110216111599808000080100160039160039160039160039160039
8020416003811990006179950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
80204160038119900072679950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
8020416003811991006179950258010080100801004399225491569580160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039
80204160038119901711766179950258010080100801004399225491539270160038160038149901314999680100802001602001600382611180201100991008010010000000005110116111599808000080100160039160039160039160039160039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002416003811990004315061799502580010800108001043987751491569580160038160038149923315001880010800201600201600382611180021109108001010020033000502003161115998080000080010160039160039160039160039160039
80024160078119900000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160039160039160039160039160039
80024160038119800000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000000502002161115998080000080010160039160039160039160039160039
80024160038119900000006179950258001080010800104398775149156958316003816003814992331500188001080020160140160038261118002110910800101000000000503401162215998080000080010160039160039160039160039160039
80024160038119900000006179950258002480010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160081160039160039160039160039
80025160038119800000006179950258001080010800104398775049156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160039160039160039160039160039
80024160038119900000006179950258001080010800104398775049156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160039160039160039160039160039
80024160080119800000006179950258001080010800104398775049156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160039160039160039160039160039
800241600381199000000042079950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000000000502001161115998080000080010160039160039160039160039160039
80024160038119800000006179950258001080010800104398775149156958016003816003814992331500188001080020160020160038261118002110910800101000200000502001161115998080000080010160039160039160039160039160281