Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

BIT (vector, 8B)

Test 1: uops

Code:

  bit v0.8b, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000
1004203310011100010005024810001000300011000

Test 2: Latency 1->1

Code:

  bit v0.8b, v1.8b, v2.8b
  movi v0.16b, 1
  movi v1.16b, 2
  movi v2.16b, 3

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
10024200331002121100002010000705092461002020100042030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 3: Latency 1->2

Code:

  bit v0.8b, v0.8b, v1.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map ldst uop inputs (80)map simd uop inputs (81)? int output thing (e9)? ldst retires (ed)? simd retires (ee)? int retires (ef)
102042003310101101100001001000030750958010136202100442420915732301971118873221003313114
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030450958010136202100502000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100
102042003310101101100001001000030050924810100200100042000300121010000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
10024200331002121100002010000705092471002020100062030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 4: Latency 1->3

Code:

  bit v0.8b, v1.8b, v0.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1020420033101011011000010010000300509248101002001000420030018110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509248101002001000420030012110000100
1020420033101011011000010010000300509580101342001004820030012110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0033

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
10024200331002121100002010000705092481002020100042030012111000010
10024200331002121100002010000705092481002020100042030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010
10024200331002121100002010000705092481002020100002030000111000010

Test 5: throughput

Count: 8

Code:

  movi v0.16b, 0
  bit v0.8b, v8.8b, v9.8b
  movi v1.16b, 0
  bit v1.8b, v8.8b, v9.8b
  movi v2.16b, 0
  bit v2.8b, v8.8b, v9.8b
  movi v3.16b, 0
  bit v3.8b, v8.8b, v9.8b
  movi v4.16b, 0
  bit v4.8b, v8.8b, v9.8b
  movi v5.16b, 0
  bit v5.8b, v8.8b, v9.8b
  movi v6.16b, 0
  bit v6.8b, v8.8b, v9.8b
  movi v7.16b, 0
  bit v7.8b, v8.8b, v9.8b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5011

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1602044046380110101800091008001330032005680113200800132002400391160000100
1602044012180110101800091008001330032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044010780109101800081008001230032020480150200800502002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100
1602044008680109101800081008001230032005280112200800122002400361160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5052

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1600244386480019118000810800123032005680023208001320240039116000010
1600244132480011118000010800003032000080010208000020240000116000010
1600244039780011118000010800003032000080010208000020240000116000010
1600244042080011118000010800003032000080010208000020240000116000010
1600244043180011118000010800003032000080010208000020240000116000010
1600244042980011118000010800003032005680023208001320240039116000010
1600244042580011118000010800003032000080010208000020240000116000010
1600244039180011118000010800003032000080010208000020240039116000010
1600244057180020118000910800133032000080010208000020240000116000010
1600244043180011118000010800003032000080010208000020240000116000010

Test 6: throughput

Count: 16

Code:

  bit v0.8b, v16.8b, v17.8b
  bit v1.8b, v16.8b, v17.8b
  bit v2.8b, v16.8b, v17.8b
  bit v3.8b, v16.8b, v17.8b
  bit v4.8b, v16.8b, v17.8b
  bit v5.8b, v16.8b, v17.8b
  bit v6.8b, v16.8b, v17.8b
  bit v7.8b, v16.8b, v17.8b
  bit v8.8b, v16.8b, v17.8b
  bit v9.8b, v16.8b, v17.8b
  bit v10.8b, v16.8b, v17.8b
  bit v11.8b, v16.8b, v17.8b
  bit v12.8b, v16.8b, v17.8b
  bit v13.8b, v16.8b, v17.8b
  bit v14.8b, v16.8b, v17.8b
  bit v15.8b, v16.8b, v17.8b
  movi v16.16b, 17
  movi v17.16b, 18

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
160205801161601391011600381001600503006400441601102001600132004800391160000100
160204800441601071011600061001600103006400361601082001600122004800421160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100
160204800341601071011600061001600103006400441601102001600132004800361160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100
160205800681601411011600401001600523006400361601082001600122004800361160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100
160204800341601051011600041001600083006400361601082001600122004800361160000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5002

retire uop (01)cycle (02)schedule uop (52)schedule int uop (53)schedule simd uop (54)schedule ldst uop (55)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)? int output thing (e9)? simd retires (ee)? int retires (ef)
1600248016816001711160006010160010306400441600202016001420480000116000010
1600248003416001111160000010160000306400001600102016000020480135116000010
1600248003416001111160000010160000306400001600102016000020480000116000010
1600248003416001111160000010160000306400001600102016000020480000116000010
1600248003416001111160000010160000306400001600102016000020480000116000010
1600248003416001111160000010160000306400001600102016000020480000116000010
1600248003416001111160000010160000306400001600102016000020480000116000010
1600248003416001111160000010160000306400001600102016000020480189116000010
1600248003416001111160000010160000306400001600102016000020480183116000010
1600248003416001111160000010160000306400001600102016000020480000116000010