Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

UMLAL2 (by element, 2D)

Test 1: uops

Code:

  umlal2 v0.2d, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000
1004303310011100010007590510001000300011000

Test 2: Latency 1->1

Code:

  umlal2 v0.2d, v1.4s, v2.s[1]
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1020430033101011011000010010000307769247101332021004420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000307769247101332021004620030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
10024300331002121100002010000707689051002020100042030000111000010
10024300331002121100002010000707692471005120100442030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030012111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010

Test 3: Latency 1->2

Code:

  umlal2 v0.2d, v0.4s, v1.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1020430033101011011000010010000300768905101002001000620030012110000100
1020430033101011011000010010000300768905101002001000620030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
100253006610027211000602010031707689051002020100062030000111000010
100243003310021211000002010000697692471005120100482030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010
100243003310021211000002010000707689051002020100002030000111000010

Test 4: Latency 1->3

Code:

  umlal2 v0.2d, v1.4s, v0.s[1]
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1020430033101011011000010010000300768905101002001000620030018110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100
1020430033101011011000010010000300768905101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
10024300331002121100002010000707689051002020100062030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010
10024300331002121100002010000707689051002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  umlal2 v0.2d, v8.4s, v9.s[1]
  movi v1.16b, 0
  umlal2 v1.2d, v8.4s, v9.s[1]
  movi v2.16b, 0
  umlal2 v2.2d, v8.4s, v9.s[1]
  movi v3.16b, 0
  umlal2 v3.2d, v8.4s, v9.s[1]
  movi v4.16b, 0
  umlal2 v4.2d, v8.4s, v9.s[1]
  movi v5.16b, 0
  umlal2 v5.2d, v8.4s, v9.s[1]
  movi v6.16b, 0
  umlal2 v6.2d, v8.4s, v9.s[1]
  movi v7.16b, 0
  umlal2 v7.2d, v8.4s, v9.s[1]
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5011

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1602044048580110101800091008001330032005680113200800132002400391160000100
1602044011080110101800091008001330032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044010980109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5052

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1600244391580019118000810800123032000080010208000020240000116000010
1600244132580011118000010800003032000080010208000020240000116000010
1600244040280011118000010800003032000080010208000020240105116000010
1600244036580011118000010800003032005280022208001220240000116000010
1600244047980011118000010800003032000080010208000020240000116000010
1600244041880011118000010800003032000080010208000020240000116000010
1600244041880011118000010800003032000080010208000020240000116000010
1600244039180011118000010800003032000080010208000020240000116000010
1600244044180011118000010800003032020080059208004920240000116000010
1600244042980011118000010800003032000080010208000020240000116000010

Test 6: throughput

Count: 12

Code:

  umlal2 v0.2d, v12.4s, v13.s[1]
  umlal2 v1.2d, v12.4s, v13.s[1]
  umlal2 v2.2d, v12.4s, v13.s[1]
  umlal2 v3.2d, v12.4s, v13.s[1]
  umlal2 v4.2d, v12.4s, v13.s[1]
  umlal2 v5.2d, v12.4s, v13.s[1]
  umlal2 v6.2d, v12.4s, v13.s[1]
  umlal2 v7.2d, v12.4s, v13.s[1]
  umlal2 v8.2d, v12.4s, v13.s[1]
  umlal2 v9.2d, v12.4s, v13.s[1]
  umlal2 v10.2d, v12.4s, v13.s[1]
  umlal2 v11.2d, v12.4s, v13.s[1]
  movi v12.16b, 13
  movi v13.16b, 14

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5003

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
120204600711201061011200051001200093004802081201542001200622003600421120000100
120204600451201071011200061001200103004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100
120205600701201411011200401001200523004800361201082001200122003600361120000100
120204600351201051011200041001200083004800361201082001200122003600361120000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5003

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
120024601301200161112000510120009304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360183112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010
120024600351200111112000010120000304800001200102012000020360000112000010