Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FCVTMU (scalar, D to W)

Test 1: uops

Code:

  fcvtmu w0, d0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 3.000

Integer unit issues: 1.001

Load/store unit issues: 0.000

SIMD/FP unit issues: 2.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000
20041037300110012000200011000200020002000100110001000

Test 2: Latency 1->2 roundtrip

Code:

  fcvtmu w0, d0
  fmov d0, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.0127

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
3020410031340130101092001510006102200901003430015485832581266301942001003620076202100532011310008100001000010100
3020410030540128101072001410007100200901003530015486792581402301952001003720075200100562011210007100001000010100
3020410021640123101092000910005104200601003630015484972581038301962001003820078200100192004110003100001000010100
3020410030640128101072001310008100200901003430015486582581396301942001003620076200100362007610005100001000010100
3020510052240154101142002510015102201781005230615496442582893302442021005420111200100382007810005100001000010100
3020410012240110101032000510002100200301003630015486512581278301962001003820074204100742014910011100001000010100
3020410003240101101012000010000100200001008631315516842586453303402041009020183200100722015010009100001000010100
3020410040140137101092001810010100201201008730715513012585677303392021009220184204101262025610017100001000010100
3020410049040150101152002310012104201501007130715507982584890302932021007620150200100892018310011100001000010100
3020410003240101101012000010000100200001005130015498622583303302412001005220112202100362007610006100001000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 10.0052

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
300241000324001110011200001000010200001000030154633625782063001020100022000420100002000010001100001000010010
300241006254006510023200281001410201801006830155026425848203019820100682014420100002000010001100001000010010
300241003064003810017200121000910200901001830154734325798273005820100192004122100352007310006100001000010010
300241017404017310047200811004510205401010430155293525892183029420101052021620100182003510003100001000010010
300241002134002910015200081000610200601003630154820225813303010620100382007420100552011010007100001000010010
300241001374002010013200051000210200301005330154896625826653015320100542010820100722014410008100001000010010
300241004144004710019200171001110201201001730155176725866133005720100172003620100002000010001100001000010010
300241000324001110011200001000010200001000030154634625782063001020100002000020100002000010001100001000010010
300241001204002010013200041000310200301000030154634625782063001020100002000020101992039610023100001000010010
300241004594005610021200241001110201501000030154641425783103001020100002000020100532011210005100001000010010

Test 3: throughput

Count: 8

Code:

  fcvtmu w0, d8
  fcvtmu w1, d8
  fcvtmu w2, d8
  fcvtmu w3, d8
  fcvtmu w4, d8
  fcvtmu w5, d8
  fcvtmu w6, d8
  fcvtmu w7, d8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
1602048005724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008802781601562000160066200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100
1602048003724010380101160002100160008030008800411601082000160012200160012800018000080100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef)
160024800622400138001116000210160008308800411600182016001220160012800018000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160054800158000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160025800742400658002516004010160058308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160000800018000080010
160024800372400118001116000010160000308800001600102016000020160068800158000080010