Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SETF8

Test 1: uops

Code:

  setf8 w1
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | f5 | f6 | f7 | f8 | fd
1004103580598619172510001000100062250110351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035800619172510001000100062250010801035816388210001000200010351041110011000107312711990100010361036103610361036
10041035800619172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035700619172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035700799172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
100410357001039172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035700619172510001000100062250110351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035800619172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035700619172510001000100062250010351035805388210001000200010351041110011000007312711990100010361036103610361036
10041035800619172510001000100062250010351035805388210001000200010351041110011000507312711990100010361036103610361036

Test 2: Latency 2->1

Chain cycles: 1

Code:

  setf8 w1
  cset x1, cc
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | c3 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
20204200351500000045061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
202042003515000000339061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
2020420035149000000061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
20204200351500000015061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
2020420035150000000061199302520100201002011212972334916955202152003517425617487201122031930236200351041120201100991002010010100000010317330111131816002001120000101002003620036200362003620036
202042003515000000123061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
2020420035150000000061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
20204200351500000012082199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
20204200351500000024061199302520100201002011212972334916955200352003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036
202042003515000000399061199302520100201002011212972334916955200802003517425617487201122022430236200351041120201100991002010010100000000000111131816002001120000101002003620036200362003620036

1000 unrolls and 10 iterations

Result (median cycles for code, minus 1 chain cycle): 1.0035

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
2002420035150396119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
20024200351504296119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127211999520000100102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
2002420035150336119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
2002420035150126119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
200242003515036119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
2002420035150082199182520010200102001012972471491695520035200351742821175042001020020300202003510411200211091020010100101001270127111999520000100102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
200242003515066119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036
200242003515006119918252001020010200101297247149169552003520035174283175042001020020300202003510411200211091020010100100001270127111999520000100102003620036200362003620036

Test 3: Latency 2->2

Code:

  setf8 w0
  mov x0, 1
  mov x1, 2
  mov x2, 3
  mov x3, 4
  mov x4, 5

(non-fused SUB/CBNZ loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410035750015369927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003575001619927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003575001619927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003575000829927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003575001829927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003575001829927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003576001619927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
102041003576000619927251020010200102106477121496955100351003586736873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036
10204100357500116699272510200102001021064771214969561003510035867378736102101022420248100351101110201100991010000331117191610010101001001003610036100361003610036
10204100357601201039927251020010200102106477121496955100351003586737873610210102242024810035110111020110099101000001117191610010101001001003610036100361003610036

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0035

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst int alu (97) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100241003575006199182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000364032733999310010101003610036100361003610036
1002410035750661991825100201002010020647296496955100351003586783875410020100202002010035104111002110910010000640327431002510010101003610036100361003610036
10024100357501566199182510020100201002064729649700010035100358686787791002010020200201003510411100211091001002840640327331002510010101003610036100361003610036
100241003575119810399182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036
1002410035750128299182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036
100241003575006199182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036
100241003575006199182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064042733999310010101003610036100361003610036
100241003575006199182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036
1002410035750091599182510020100201002064729649695610035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036
100241003575006699182510020100201002064729649695510035100358678387541002010020200201003510411100211091001000064032733999310010101003610036100361003610036

Test 4: throughput

Count: 8

Code:

  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  ands xzr, xzr, xzr
  setf8 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.6676

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020453408400101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
1602045340840010100001760282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516006311005340953409534095340953409
160204534084001010403000282716012016012016012810619894950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
160204534084001010002700282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408400101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408400101000000492716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408399101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408400101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408400101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409
16020453408400101000000282716012016012016012810637384950328534085340833347633357160128160240160240534086611160201100991001601001000000000011110119116115340516002001005340953409534095340953409

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.6672

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 5f | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002453379400000000004325160010160010160010102938811495029453374533743333133335116001016002016002053374661116002110910160010100000000100223115192114353370160000157105337553375533755337553375
160024533744000000000043251600101600101600101029388104950294533745337433331333351160010160020160020533746611160021109101600101000002700100223114192113453370160000157105337553375533755337553375
16002453374399000000004325160010160010160010102938811495029453374533743333133335116001016002016002053374661116002110910160010100000600100223114192114453370160000157105337553375533755337553411
16002453374400000000004325160010160010160010102938811495029453374533743333133335116001016002016002053374661116002110910160010100000300100223114192114353370160000157105337553375533755337553375
160024533744100000000043251600101600101600101029388104950294533745337433331333351160010160020160020533746611160021109101600101000076600100223114192114353370160000157105337553375533755337553375
16002453374400000000004325160010160010160010102938811495029453374533743333133335116001016002016002053374661116002110910160010100000300100223114192114353370160000157105337553375533755337553375
16002453374400000000004325160010160010160010102938811494727253374533743333133335116001016002016002053374661116002110910160010100000300100223114192113453370160000157105337553375533755337553375
16002453374400000000004325160010160010160010102938810495029453374533743333133335116001016002016002053374661116002110910160010100000600100223114192114353370160000157105337553375533755337553375
160024533743990000000043251600101600101600101029388114950294533745337433331333351160010160020160020533746611160021109101600101000002101100223114192114353370160000157105337553375533755337553375
1600245337440000000000432516001016001016001010293881149502945337453374333313333511600101600201600205337466111600211091016001010000811200100223114192113453370160000157105337553375533755337553375

Test 5: throughput

Count: 4

Code:

  fcmp s0, s0
  setf8 w0
  setf8 w0
  setf8 w0
  setf8 w0
  mov x0, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3354

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
5020413456110000000028025501224011210010401431001357512780097013395134161341661372467771105015640251100138030220026134161341611502011009910040100100001000000030111322100160013413400121001341713417134171341713417
5020413416100000000028025501224011210084401431001357512780097013395134161341661372456771105015640251100138030220026134161341511502011009910040100100001000000000111322100160013413400121001341713417134171341713417
50204134161000000000510255012240112100104014310013575127800970133951341613416613924677711050156402511001380302200261341513416115020110099100401001000010000000300111322100160013413400121001341713417134171341713417
5020413416100000000028025501224011210010401431001357512780097113395134151341661392456771105015640251100138030220026134151341611502011009910040100100001000000000111322100160013413400121001341713417134171341713417
5020413416101000000028025501224011210010401431001357512780097013395134161341661392456771105015640251100138030220026134161341611502011009910040100100001000000000111322100160013413400121001341713417134171341713417
50204134161000000120028025501224011210010401431001357512780097013395134161341661392467771105015640251100138030220026134151341611502011009910040100100001000000000111322101160013413400121001341713417134171341713417
50204134161000000120028025501224011210010401431001357512780680013395134161341661392467771105015640251100138030220026134161341611502011009910040100100001000001000111322100160013413400121001341713417134171341713416
5020413416100000000028025501224011210010401431001357512780097013395134161341661372467771105015640251100138030220026134151341611502011009910040100100001000001000111322100160013413400121001341713416134171341713417
5020413416100000000028024501224011210010401431001357512780097013395134161341661372467771105015640251100138030220026134161341611502011009910040100100001000000030111322100160013413400121001341713417134161341713417
5020413416100000000028025501224011210010401431001357512780097013395134161341661372467771105015640251100138030220026134161341611502011009910040100100001000000090111322000160013413400121001341713417134171341713417

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3346

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | flags prf full (73) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cf | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50024133841011000045255001040010100004001010000570031800001335313382133825577378437109500104002010000800202000013382133821150021109104001010000100000314100211919161337940000101338313383133831338313383
50024133821000000045255001040010100004001010000573456800001335313382133825575379537109500104002010000800202000013382133821150021109104001010000100000314100191918191337940000101338313383133831338313383
50024133821000000045255001040010100004001010000573456800001335313382133825575379537109500104002010000800202000013382133821150021109104001010000100060314100181917201337940000101338313383133831338313383
50024133821010000045255001040010100004001010000573456800001335313382133825577379537109500104002010000800202000013382133821150021109104001010000100030314100191919191337940000101338313383133831338313383
5002413382100000004525500104001010000400101000057345680000133531338213382557737843710950010400201000080020200001338213382115002110910400101000010000031410015191691337940000101338313383133831338313383
50024133821000000045255001040010100004001010000573456800001335313382133825575378437109500104002010000800202000013382133821150021109104001010000100100314100201916191337940000101338313383133831338313383
500241338210000000121255001040010100004001010000573456801911335313382133825575378437134500104002010000800202000013382133821150021109104001010000100030314100191911191337940000101338313383133831338313383
500241338210100003645255001040010100004001010000573456800001335313382133825575379537109500104002010000800202000013382133821150021109104001010000100000314100191919161337940000101338313383133831338313383
500241338210000000399255001040010100004001010000573456800001335313382133825575379537109500104002010000800202000013382133821150021109104001010000100000314100191919191337940000101342913383133831338313383
50024133821010000045255001040010100004001010000573456800001335313382133825575379537109500104002010000800202000013382133821150021109104001010000100000314100191919191337940000101338313383133831338313383