Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UDIV (slow, 64-bit)

Test 1: uops

Code:

  udiv x0, x1, x2
  mov x1, #0xffffffffffffffff
  mov x2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 2.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187415 | 1009 | 1016 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000
1004 | 21030 | 2001 | 2001 | 1000 | 187244 | 1000 | 1000 | 2000 | 2001 | 1000

Test 2: Latency 1->2

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0
  mov x1, #0xffffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
30204 | 230030 | 40201 | 40201 | 30203 | 6177150 | 30203 | 30210 | 60220 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177542 | 30232 | 30248 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30205 | 230060 | 40203 | 40203 | 30231 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60296 | 40102 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 6177206 | 30203 | 30212 | 60224 | 40101 | 30100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
30024 | 230030 | 40011 | 40011 | 30013 | 6177434 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177829 | 30041 | 30068 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30025 | 230060 | 40012 | 40012 | 30040 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60090 | 40002 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177465 | 30010 | 30020 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 6177816 | 30037 | 30057 | 60020 | 40001 | 30010

Test 3: Latency 1->3

Chain cycles: 2

Code:

  udiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0
  mov x1, #0xffffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177150 | 0 | 30203 | 30210 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177178 | 0 | 30203 | 30210 | 0 | 60224 | 40101 | 30100
30205 | 230060 | 40203 | 40203 | 30231 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60296 | 40104 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100
30204 | 230030 | 40201 | 40201 | 30203 | 0 | 6177206 | 0 | 30203 | 30212 | 0 | 60224 | 40101 | 30100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
30024 | 230030 | 40011 | 40011 | 30013 | 0 | 6177484 | 0 | 30013 | 30032 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60116 | 40004 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60116 | 40003 | 30010
30025 | 230060 | 40012 | 40012 | 30037 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177815 | 0 | 30042 | 30070 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60020 | 40001 | 30010
30025 | 230060 | 40012 | 40012 | 30040 | 0 | 6177829 | 0 | 30041 | 30068 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60020 | 40001 | 30010
30024 | 230030 | 40011 | 40011 | 30010 | 0 | 6177465 | 0 | 30010 | 30020 | 0 | 60116 | 40004 | 30010

Test 4: throughput

Code:

  udiv x0, x1, x2
  mov x1, #0xffffffffffffffff
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10206 | 20212 | 0 | 20001 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20248 | 0 | 20002 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20216 | 0 | 20001 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20254 | 0 | 20003 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20248 | 0 | 20002 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20216 | 0 | 20001 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20216 | 0 | 20001 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20248 | 0 | 20002 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20216 | 0 | 20001 | 0 | 0 | 10100
10204 | 210030 | 20101 | 20101 | 10100 | 1879544 | 10100 | 10208 | 20216 | 0 | 20001 | 0 | 0 | 10100

1000 unrolls and 10 iterations

Result (median cycles for code): 21.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10028 | 0 | 20070 | 20012 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20020 | 20011 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20020 | 20011 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20068 | 20012 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20068 | 20012 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20020 | 20011 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20068 | 20012 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20020 | 20011 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879304 | 0 | 10020 | 10020 | 0 | 20020 | 20011 | 10010
10024 | 210030 | 20021 | 20021 | 10020 | 0 | 1879421 | 0 | 10029 | 10044 | 0 | 20020 | 20011 | 10010