Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SDIV (medium, 64-bit)

Test 1: uops

Code:

  sdiv x0, x1, x2
  mov x1, #0xffffffff80000000
  mov x2, #3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 2.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
100413030200120011000115532100010002000002001001000
10041303020012001100011553210001000255021559638513088714715413993
100413030200120011000115648100910162000002001001000
100413030200120011000115532100010002000002001001000
100413030200120011000115532100010002000002001001000
100413030200120011000115532100010002000002001001000
100413030200120011000115532100010002000002001001000
100413030200120011000115532100010002034002004001000
100413030200120011000115532100010002000002001001000
100413030200120011000115532100010002000002001001000

Test 2: Latency 1->2

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x1, x1, x0
  eor x1, x1, x0
  mov x1, #0xffffffff80000000
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
3020415003040201402013020304018014030203302100602244010130100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602964010430100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602244010130100
3020515006040204402043023104018070030203302120602244010130100
3020415003040201402013020304018070030203302120602964010430100
3020415003040201402013020304018042030203302100602204010130100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602244010130100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
3002415003040011400113001340186313004130066600444000130010
3002415003040011400113001040183293001030020601164000430010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002515006040014400143004140183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002515006040014400143003840183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010

Test 3: Latency 1->3

Chain cycles: 2

Code:

  sdiv x0, x1, x2
  eor x2, x2, x0
  eor x2, x2, x0
  mov x1, #0xffffffff80000000
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
3020415003040201402013020304018337030230302470602204010130100
3020515006040204402043023104018070030203302120603004010430100
3020415003040201402013020304018070030203302120602944010430100
3020415003040201402013020304018070030203302120602944010530100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602984010430100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602244010130100
3020415003040201402013020304018070030203302120602244010130100

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
3002415003040011400113001340183483001330032600204000130010
3002415003040011400113001040186793004230068600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040186933004130068600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040183293001030020600204000130010
3002415003040011400113001040186973004030064600204000130010

Test 4: throughput

Code:

  sdiv x0, x1, x2
  mov x1, #0xffffffff80000000
  mov x2, #3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
1020413003020101201011010011598321010010206202122000110100
1020413003020101201011010011598321010010208202162000110100
1020413003020101201011010011598321010010208202162000110100
1020513006020104201041010911598321010010208202162000110100
1020413003020101201011010011598321010010208202162000110100
1020413003020101201011010011598321010010208202162000110100
1020413003020101201011010011598321010010208202162000110100
1020413003020101201011010011598321010010208202462000410100
1020413003020101201011010011598321010010208202162000110100
1020413003020101201011010011598321010010208202162000110100

1000 unrolls and 10 iterations

Result (median cycles for code): 13.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | ? int output thing (e9) | ? int retires (ef)
1002413003020021200211002011595921002010026200202001110010
1002413003020021200211002011595921002010020200202001110010
1002413003020021200211002011595921002010020200202001110010
1002413003020021200211002011595921002010020200202001110010
1002513006020024200241002911595921002010020200202001110010
1002413003020021200211002011595921002010020200202001110010
1002513006020024200241002911595921002010020200202001110010
1002413003020021200211002011595921002010020200202001110010
1002413003020021200211002011595921002010026200362001110010
1002413003020021200211002011595921002010028200362001110010