Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

INS (general, B)

Test 1: uops

Code:

  ins v0.b[2], w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100010732161120371000100020412041204120412041
200420401602025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147020212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006373265147020602040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
2004204016020251690252000100010001000100063422651471202120402040145031773200010001000100020002040204011100110001000021731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025168625200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100040731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041
200420401502025169025200010001000100010006342265147120212040204014503177320001000100010002000204020401110011000100000731161120371000100020412041204120412041

Test 2: Latency 1->1

Code:

  ins v0.b[2], w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0040

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042004016100001800200251968903720100100100001000010010000100005165351828481320200210200402004017175317498201002001000010170200100002000020040200401110201100991001001000010020010000010000007101161120086010000100001002004120041200412004120041
20204200401610000000200251968902520100100100001000010210000100005005351828481320200210200402004017175317498201002001000010000200100062000020040200401110201100991001001000010000010000000000007100161120037010000100001002004120041200412004120041
20204200401610000000200251967102520100100100001000010010000100005005351828481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000300007390161120037010000100001002004120041200412004120041
2020420090161000029700200251968902520100100100001000010010000100005225351828481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000601007101161120037010000100001002004120041200412004120041
2020420040161000012000200251968902520100100100121000010010000101585005683128481320200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000080000017100161120037010000100001002004120041200412004120041
202042004016000000002002519689025201001001001210000100100001000050053518284813202002132004020090171752617498201002001000010166200100002000020090200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
202042004016100003000200251968902520100100100001000010010000100005005351828481320200210200402004017175317498201002001000010000200101712000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120228
20204200401610000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
20204200401560000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041
20204200401550000000200251968902520100100100001000010010000100005005351828481321200210200402004017175317498201002001000010000200100002000020040200401110201100991001001000010000010000000000007101161120037010000100001002004120041200412004120041

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0040

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200771000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147120021200402004017197317520200102010000100002010000200002004020040111002110910101000010410000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041
200242004015000002002519690252001010100001000010100001000050535852848147020021200402004017197317520200102010000100002010000200002004020040111002110910101000010010000064031633200371000010000102004120041200412004120041

Test 3: Latency 1->2 roundtrip

Code:

  ins v0.b[2], w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | c9 | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3020412003289900104701200171094562540100101042000010000100200001000050057356721367205301200130120032120032115532711623630100200100032000520010003300081200321200322120201100991001010010000100010000000001011131701162211958310000100001000010100120033120033120033120033120033
302041200328990079201200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532611626730100200100032000520010003300081200321200322120201100991001010010000100010000000001011131801161111958310000100001000010100120033120033120033120033120033
30204120032930009601761200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532611623630100200100032000520010003300081200321200321120201100991001010010000100010000000001011131801172211958310000100001000010100120033120033120033120033120033
302041200329310057901200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532711623530100200100032000520010003300081200321200321120201100991001010010000100010000000001011131801161211958310000100001000010100120033120033120033120033120033
302041200329300090901200171094562540100101002000010000100200001000050057356721367205301200130120032120032115532711623930100200100032000520010003300081200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900123001200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000010131013163311957410000100001000010100120033120033120033120033120033
302041200328990072601200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624530100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900117901200171094572540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900001200171094562540100101002000010000100200001000050057356721367205301200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900001200171094562540100101002000010000100200001000050057381901367216901200130120032120032115525311624030100200100002000020010000300001200321200321120201100991001010010000100010000000000000131013163311957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
3002412003289900012001710945725400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000010127001161111958610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000129501161111960610000100001000010010120033120033120033120033120033
30024120032106400012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111960210000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200381155483116262300102010000200002010000300001200321201161120021109101001010000100100000000127002161111964610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111962310000100001000010010120033120033120076120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111960610000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000001131502161111963910000100001000010010120033120033120033120033120033
3002412003289900012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111985410000100001000010010120033120033120033120033120131
3002412003289900012001710945525400101001020000100001020000100505057359601367002911200131200321200321155483116262300102010000200002010000300001200321200321120021109101001010000100100000000127001161111962410000100001000010010120033120033120033120033120033
3002412003289906012001710945625400101001020000100001020000100005057356721367002911200131200321200321155483116262300102010000200002010000300001200331200321120021109101001010000100100000000127001160111960810000100001000010010120034120033120033120033120033

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ins v0.b[2], w8
  movi v1.16b, 0
  ins v1.b[2], w8
  movi v2.16b, 0
  ins v2.b[2], w8
  movi v3.16b, 0
  ins v3.b[2], w8
  movi v4.16b, 0
  ins v4.b[2], w8
  movi v5.16b, 0
  ins v5.b[2], w8
  movi v6.16b, 0
  ins v6.b[2], w8
  movi v7.16b, 0
  ins v7.b[2], w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3759

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240204300942420012030054027160143125800058001312580019800196504164809814871300480300683006900614160163200800198001920080019160038301323006811160201100991001001600001000800020011110119316113006525800001600001003006930069300693006930070
24020430069241000030053027160143125800058001312580019800196504164809814870300480300683006900714160163200800198001920080019160038300693015611160201100991001001600001000800020011110119216113006625800001600001003019730069300693006930069
24020430069242000030053026160143125800058001312580019800196504164809814871300480300683006900714160163200800198001920080019160038300683012811160201100991001001600001000800020311110119316113006625800001600001003006930070300693006930070
24020430069241000030053027160143125800058001312580019800196504164809814871300480300683006900714160163200800198001920080019160038300683006811160201100991001001600001000800020011210119116113006525800001600001003006930069300703007030070
24020430069233000030053027160143125800058001312580019800196504164809814871300490300693006900614160163200800198001920080019160038300993006821160201100991001001600001000800020311110119116113006625800001600001003006930069300693006930069
24020430068233000030053027160143125800058001312580019800196504164809814871300480300683006800714160163200800198001920080019160038300683006911160201100991001001600001000800020011110119116113016125800001600001003007030069300703007030069
24020430068241000030053027160143125800058001412580019801586504174929834371300480300683006900714160163200800248002320080024160046300813008111160201100991001001600001000800020011110119116113006525800001600001003006930070300703007030070
240204300692415015030053027160143125800058001312580019800196504164809814871300480300693006800614160163200800198001920080019160038300683006811160201100991001001600001000800020011110119116113006625800001600001003007030070300703007030069
24020430069241000030054026160144125800058001312580019800196504158859814870300480300683006800714160163200800198001920080019160038300683006811160201100991001001600001000800020011110119116113006625800001600001003006930069300693007030070
24020430068242000030054027160143125800058001312580019800196504164809814871300480300693006800714160163200800198001920080019160038300693006911160201100991001001600001000800020312210129116113006525800001600001003007030069300693007030069

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3756

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
240024300632250000003003002616001212800008000012800008000062401157960991111030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213219192112330042220080000160000103004630046300463004630046
24002430045225000000310562563741626091380519811701181238812226244467210215171110309353115531044238179606162523208122581354208042216246031170311651011600211091010160000100258116610640301002213517222114330042220080000160000103004630046300463004630787
24002430510225015481035230030160315161573108039080650128082780410684515061022183111030485304133057435812221616001220800008081220800001600003004530666311600211091010160000100980000020010278135191462124531501120080000160000103191031541320283138431047
240024300452330100003003002616001212800008000012800008000062401157960991011030026300453004503241600122080000800002080000160000300453007211160021109101016000010008000010001002416629192113330042220080000160000103004630046300463004630046
2400243004522500000030030026160012128000080000128000080000624077609697950110300263004530136032416001220800008000020800001600003004530058111600211091010160000100080000000010024135110222122330042220080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453005411160021109101016000010008000000001002413628224223230042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213516194223330042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760969795011030026300453004503241600122080000800002080000160000300453004511160021109101016000010008000000001002213517194222330042240080000160000103004630046300463004630046
240024300452250000003003002616001212800008000012800008000062407760960991111030026300453004503241600122080000800002080000160000300473004511160021109101016000010008000000001002213516192113230042220080000160000103004630046300463004630046
240024300452330000003003002616001212800008000012800008000062401157960991111030026300453004503241600122080000800002080000160000300453009111160021109101016000010008000000001002213517222112330042220080000160000103004630046300463004630132