Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SLI (vector, 8H)

Test 1: uops

Code:

  sli v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee)
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000
1004203310011100010005024810001000200011000

Test 2: Latency 1->1

Code:

  sli v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000300509248101002001000620020008110000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420020098110000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420220096210000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420020008110000100
1020420033101011011000010010000300509248101002001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092471002020100062020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010

Test 3: Latency 1->2

Code:

  sli v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100
1020420033101011011000010010000030005092481010020001000420020008110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
10024200331002121100002010000705092481002020100042020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010
10024200331002121100002010000705092481002020100002020000111000010

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sli v0.8h, v8.8h, #3
  movi v1.16b, 0
  sli v1.8h, v8.8h, #3
  movi v2.16b, 0
  sli v2.8h, v8.8h, #3
  movi v3.16b, 0
  sli v3.8h, v8.8h, #3
  movi v4.16b, 0
  sli v4.8h, v8.8h, #3
  movi v5.16b, 0
  sli v5.8h, v8.8h, #3
  movi v6.16b, 0
  sli v6.8h, v8.8h, #3
  movi v7.16b, 0
  sli v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5011

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602044056380109101800081008001230032005280112200800122001600261160000100
1602044009980110101800091008001330032005280112200800122001600261160000100
1602044008680109101800081008001230032005280112200800122001600921160000100
1602044009880110101800091008001330032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602054012180146103800431028004730032005280112200800122001600241160000100
1602044008680109101800081008001230032005280112200800122001600241160000100
1602054013280144101800431008004930032005280112200800122001600241160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5052

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
16002443841800191180008010800123032005280022208001220160026116000010
16002441332800111180000010800003032000080010208000020160000116000010
16002540498800531180042010800463032000080010208000020160000116000010
16002440810801561180145010801453032000080010208000020160000116000010
16002440391800111180000010800003032000080010208000020160000116000010
16002440420800111180000010800003032000080010208000020160000116000010
16002440419800111180000010800003032000080010208000020160000116000010
16002440432800111180000010800003032014080045208003520160000116000010
16002440409800111180000010800003032000080010208000020160000116000010
16002440368800111180000010800003032000080010208000020160000116000010

Test 5: throughput

Count: 16

Code:

  sli v0.8h, v16.8h, #3
  sli v1.8h, v16.8h, #3
  sli v2.8h, v16.8h, #3
  sli v3.8h, v16.8h, #3
  sli v4.8h, v16.8h, #3
  sli v5.8h, v16.8h, #3
  sli v6.8h, v16.8h, #3
  sli v7.8h, v16.8h, #3
  sli v8.8h, v16.8h, #3
  sli v9.8h, v16.8h, #3
  sli v10.8h, v16.8h, #3
  sli v11.8h, v16.8h, #3
  sli v12.8h, v16.8h, #3
  sli v13.8h, v16.8h, #3
  sli v14.8h, v16.8h, #3
  sli v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160204801091601071011600061001600103006400361601082001600122003200241160000100
160204800341601071011600061001600103006400361601082001600122003200281160000100
160204800341601051011600041001600083006400361601082001600122003200281160000100
160204800341601071011600061001600103006400361601082001600122003200241160000100
160204800341601051011600041001600083006400361601082001600122003200241160000100
160204800341601051011600041001600083006401921601502001600602003200241160000100
160204800341601051011600041001600083006400361601082001600122003200241160000100
160204800341601051011600041001600083006400361601082001600122003200241160000100
160204800341601051011600041001600083006400361601082001600122003200241160000100
160204800341601051011600041001600083006400361601082001600122003200241160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024801531600151116000410160008030064004416002020016001420320024116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064019216006020016005820320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320000116000010
160024800341600111116000010160000030064000016001020016000020320116116000010