Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SCVTF (scalar, integer, S from W)

Test 1: uops

Code:

  scvtf s0, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee)
20045782001110001000100010003000800020001000100010001000110001000
20045412001110001000100010003000800020001000100010001000110001000
20045372001110001000100010423234838420841042104210001000110001000
20045372001110001000100010003000800020001000100010001000110001000
20045372001110001000100010003000800020001000100010001000110001000
20045372001110001000100010003180818020001000100010001000110001000
20045372001110001000100010003000800020001000100010001000110001000
20045372001110001000100010003000800020001000100010001000110001000
20045372001110001000100010003000800020001000100010001000110001000
20055872061110301030104210003000800020001000100010001000110001000

Test 2: Latency 1->2 roundtrip

Code:

  scvtf s0, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3020410003040101101012000010000100200001000130015461642578264301012001000220004200100022000210001100001000010100
3020410004540101101012000010000100200001000130015461762578264301012001000220004200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020510006340108101032000310002100200291001930015463962578617301482001002220042202100222004010003100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100
3020410003040101101012000010000100200001000030015461742578264301002001000220002200100022000210001100001000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0030

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
300251000684001810013200031000210200291000130154617625782643001120100022000420100022000210001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100212003910003100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010
300241000304001110011200001000010200001000030154617425782643001020100002000020100002000010001100001000010010

Test 3: throughput

Count: 8

Code:

  scvtf s0, w8
  scvtf s1, w8
  scvtf s2, w8
  scvtf s3, w8
  scvtf s4, w8
  scvtf s5, w8
  scvtf s6, w8
  scvtf s7, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5018

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16020440126160115101800068000810080012800123002400366400801601242008001280012200800548005418000080000100
16020540158160175101800368003810080054800123002400366400801601242008001280012200800128001218000080000100
16020440571160355101801268012810080180800543002402706404641602082008005480054200800128001218000080000100
16020440141160115101800068000810080012801823002644456722121604642008018280182200800128001218000080000100
16020440090160115101800068000810080012800123002527476553501601242008001280012200801808018018000080000100
16020440090160115101800068000810080012800123002400366400801601242008001280012200800128001218000080000100
16020540621160415101801568015810080222800123002400366400801601242008001280012200800128001218000080000100
16020440141160114101800058000810080012801803002497196529361604602008018080180200800128001218000080000100
16020440090160115101800068000810080012800123002496166546531601242008001280012200801808018018000080000100
16020440090160115101800068000810080012800123002400366400801601242008001280012200800128001218000080000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5022

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
16002540569160085118003680038108005480012302810856841181600342080012800122080096800961800008000010
16002440317160131118006080060108008480000302513676515791600102080000800002080042800421800008000010
16002540119160087118003780039108005580000302400006400001600102080000800002080000800001800008000010
16002440308160131118006080060108008480012302400366400801600342080012800122080000800001800008000010
16002440086160011118000080000108000080084302710396765141601782080084800842080000800001800008000010
16002440046160011118000080000108000080000302570156577511600102080000800002080084800841800008000010
16002440054160011118000080000108000080000302400006400001600102080000800002080000800001800008000010
16002440615160025118000680008108001280096302583976621181602022080096800962080054800541800008000010
16002540264160084118003580038108005480096302977507047421602022080096800962080054800541800008000010
16002440296160147118006780069108009780012303082847282391600342080012800122080097800971800008000010