Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (slow, 32-bit)

Test 1: uops

Code:

  sdiv w0, w1, w2
  mov w1, #0x7fffffff
  mov w2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10042039150006195025100010001000537251203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537251203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
100420391510186195025100010001000537251203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039160006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040
10042039150006195025100010001000537250203920391801318971000100020002039261111001100000732162219801000100020402040204020402040

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv w0, w1, w2
  eor x1, x1, x0
  eor x1, x1, x0
  mov w1, #0x7fffffff
  mov w2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30204100035749000619126125301003010030100949344314996955010003510003595873039624230100302006020010003519311302011009910030100100000200331711997223000030100100036100036100036100036100036
30205100035749000619126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036
302041000357500007269126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100130191011711997223000030100100036100036100036100036100036
30204100035750000619126125301003010030100949344314996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036
30204100035749000619126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191001711997223000030100100036100036100036100036100036
30204100035749000619126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191012611997223000030100100036100036100036100036100036
30204100035749000619126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036
30204100035749000619126125301003010030100949344314996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036
302041000357490007269126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036
30204100035750000619126125301003010030100949344304996955010003510003595873039624230100302006020010003519311302011009910030100100000191011711997223000030100100036100036100036100036100036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024100035749000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890061656997143000030010100036100036100036100036100036
30024100035749200619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890051656997143000030010100036100036100036100036100036
30024100035749000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890051653997143000030010100036100036100036100036100036
30024100035749000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001963051654997143000030010100036100036100036100036100036
30024100035750000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890041645997143000030010100036100036100036100036100036
300241000357490001039118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890041645997143000030010100036100036100036100036100036
30024100035749000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890051635997143000030010100036100036100036100036100036
300241000357490007269118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890031653997143000030010100036100036100036100036100036
30024100035750000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000100001890051655997143000030010100036100036100036100036100036
30024100035750000619118225300103001030010947876704996955010003510003595885396265300103002060020100035193113002110910300101000000001890051653997143000030010100036100036100036100036100036

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv w0, w1, w2
  eor x2, x2, x0
  eor x2, x2, x0
  mov w1, #0x7fffffff
  mov w2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
302041000357490000015819126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191021711997223000030100100036100036100036100036100036
3020410003575000000619126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000193511711997223000030100100036100036100036100036100036
302041000357490000027279126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
30204100035749000007269126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003574900000619126125301003010030100949344314996955100035100035958733962423020530200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
30204100035749000005139126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010000000191011711997223000030100100036100036100036100036100036
3020410003575000000619126125301003010030100949344314996955100035100035958733962423010030200602001000351931130201100991003010010010000191011711997223000030100100036100036100036100036100036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 8.0035

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
30024100035749110268911822530010300103001094787670499695510003510003595885039626530010300206002010003519311300211091030010100001530189417161417997143000030010100036100036100036100036100036
300241000357491102689118225300103001030010947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189416161820997143000030010100036100036100036100036100036
3002410003574911023279118225300103001030010947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189417161620997143000030010100036100036100036100075100036
300241000357491102739118225300103001030010947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189418161710997143000030010100036100036100036100036100036
300241000357491102689118225300103001030010947876714996955100035100035958850396265300103002060020100035193113002110910300101000000189418161616997143000030010100036100075100036100036100036
3002410003575011926389118225300103001030105947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189418161718997143000030010100036100036100036100036100036
300241000357491102689118225300103001030010947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189415161718997143000030010100036100036100036100036100036
300241000357491102689118225300103001030010947876704996955100035100035958850396265300103002060020100035193113002110910300101000000189416161615997143000030010100036100036100036100036100036
3002410003574911027339118225300103001030010947876704993923100035100035958850396265300103002060020100035193113002110910300101000000189415161621997143000030010100036100036100036100036100036
300241000357491102311911822530010300103001094787671499695510003510003595885039626530010300206002010003519311300211091030010100000018941616158997143000030010100036100036100036100036100036

Test 4: throughput

Count: 8

Code:

  sdiv w0, w8, w9
  sdiv w1, w8, w9
  sdiv w2, w8, w9
  sdiv w3, w8, w9
  sdiv w4, w8, w9
  sdiv w5, w8, w9
  sdiv w6, w8, w9
  sdiv w7, w8, w9
  mov w8, #0x7fffffff
  mov w9, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204160039119800006179950258010080100801004399225149156959016003916003914990131500238010080200160200160039261118020110099100801001000000005110316111599808000080100160040160040160040160040160040
80204160039119900006179950258010080100801004399225149156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119800006179950258010080100801004399225149156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119900006179950258010080100801004399225049156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119900006179950258010080100801004399225049156959016003916003914990131499978010080200160200160039261118020110099100801001000000105110116111599808000080100160040160040160040160040160040
80204160039119800006179950258010080100801004399225049156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119800006179950258010080100801004399225049156959016003916003914994031499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119900006179950258010080100801004399225149156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160082160040
80204160039119900006179950258010080100801004399225049156959016003916003914990131499978010080200160200160039526118020110099100801001000000005110116111599808000080100160040160040160040160040160040
80204160039119800006179950258010080100801004399225149156959016003916003914990131499978010080200160200160039261118020110099100801001000000005110116111599808000080100160040160040160040160040160040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 19 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d0 | d5 | map dispatch bubble (d6) | da | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80024160039119911002116579950258001080010800104398775049156959016003916003914992331500198001080020160020160039261118002110910800101000050230191601616159980800000080010160040160040160040160040160040
8002416003911991100268799502580010800108001043987750491569590160039160039149923315001980010800201600201600392611180021109108001010025805023016160815159980800000080010160040160040160040160040160040
800241600391198110026879950258001080010800104398775049156959016003916003914992331500198001080020160020160039261118002110910800101000050230141601515159980800000080010160040160040160040160040160040
800241600391199110026879950258001080010800104398775049156959016003916003914992331500198001080020160020160039261118002110910800101000050230161601616159980800000080010160040160040160040160040160040
800241600391199110026879950258001080010800104398775049156959016003916003914992331500198001080020160020160039526118002110910800101003050230151601519159980800000080010160040160040160040160040160040
8002416003911991100273379950258001080010800104398775049156959016003916008014992331500198001080020160020160039261118002110910800101000050230151601715159980800000080010160040160040160040160040160040
80024160039119811002687995025800108001080010439877504915695901600391600391499233150019800108002016002016003926111800221091080010100005023012160915159980800000080010160040160040160040160040160040
8002416003911991100273379950258001080010800104398775049156959016003916003914992331500198001080020160020160078261118002110910800101000050230171601316159980800000980010160040160040160040160040160040
800241600391199110026879950258001080010800104398775049156959016003916003914992331500198001080020160020160039261118002110910800101000050230161601516159980800000080010160040160040160040160040160040
800241600791199110026879950258001080010800104398775049156959016003916003914992331500198001080020160020160039261118002110910800101000050230195701516160018800000080010160040160040160040160040160040