Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SRI (vector, 2D)

Test 1: uops

Code:

  sri v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)03mmu table walk data (08)181e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d tlb access (a0)l1d cache writeback (a8)a9accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111855100020382038203820382038
1004203717000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203715000611687251000100010002646800201820372037157231895100010002000203720841110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646801201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038
1004203716000611687251000100010002646800201820372037157231895100010002000203720371110011000000073116111787100020382038203820382038

Test 2: Latency 1->1

Code:

  sri v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9accdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
102042003715500000012419687441010010010000100100005002847680020018200372003718425718745105812001000020020000200372003711102011009910010010000100020000071021611198230100001002003820038200382013420038
10204200371560000006119687251010010010000100100005002847680120018200372003718422318854113622001049520020000200372003711102011009910010010000100000000071011621197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200372003718422818745101002001016420020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200852003718422318745101002001000020020000200372013121102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
102042003715600000014819687251010010010023100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100020002055071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002848963020018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
10204200371550000006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100000000071011611197910100001002003820038200382003820038
102042003715600000061196872510100100100001001000050028476800200182003720037184221118745101002001000020020000200372003711102011009910010010000100000000173311611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss instruction (0a)l2 tlb miss data (0b)18191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)cfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100242003715500000001031968725100101010000111000050284768002005420037200371844431876710010201000020200002003720037111002110910101000010000000006404162219785010000102003820038200382003820085
10024200371550000092641241968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550000000611968725100251010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000003006402162219785010000102003820038200382003820038
100242003716100100002011968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371610000000611968725100101010000111045665285152912001820037200371844471876710010201000020200002003720037111002110910101000010000000006402162219785210000102003820038200382003820038
1002420037161000000037171968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038
10024200371550100000611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037311002110910101000010000000006402162219785010000102003820038200382003820038
100242003715500000001491968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000000006402162219785010000102003820038200382003820038

Test 3: Latency 1->2

Code:

  sri v0.2d, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d tlb miss (a1)ld unit uop (a6)st unit uop (a7)l1d cache writeback (a8)a9acc2branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020420037150000000017419686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171822400198010100001002003820038200382003820038
1020420037150000000010319686251010010010000100100005002847521200182003720037184287187411010020010008200200002003720037111020110099100100100001000000000011171822422198000100001002003820038200382003820038
1020420037150000000113919686251010010610012100100005002847521200182003720085184287187401010020010008200200162008620085111020110099100100100001000000000011172201600198010100001002003820038200382003820038
1020420037150000012006119686251014510010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171701600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187401010020010008200200162003720037111020110099100100100001000000000011171801600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187401010020010008200200162003720037111020110099100100100001000000000011171801600198010100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184286187411010020010008200200162003720037111020110099100100100001000000000011171801600198000100001002003820038200382003820038
102042003715000000006119686251010010010000100100005002847521200182003720037184287187401010020010008200200162003720037111020110099100100100001000000000011171701600198000100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire uop (01)cycle (02)03191e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfl1i tlb miss demand (d4)d5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064008162219856010000102003820038200382003820038
10024200371490006119686251001010100001010000502847521200182013220037184433187671001020100002020000200372017911100211091010100001000364002162219852010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000012419686251001010100001010000502847521200182003720037184433188041001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
10024200371500006119686251001010100001010000502847521200182003720037184433187671001020100002420000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715000010319686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001001248864002162219786010000102003820038200382003820038
100242003715000020819686251001010100001010000502847521200182003720037184433187671001020100002020000200372003711100211091010100001000064002162219786010000102003820038200382003820038
100242003715030018719686251001010100001010000502847521200182003720037184433187671001020101632020000200372003711100211091010100001000364002162219786010000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  sri v0.2d, v8.2d, #3
  movi v1.16b, 0
  sri v1.2d, v8.2d, #3
  movi v2.16b, 0
  sri v2.2d, v8.2d, #3
  movi v3.16b, 0
  sri v3.2d, v8.2d, #3
  movi v4.16b, 0
  sri v4.2d, v8.2d, #3
  movi v5.16b, 0
  sri v5.2d, v8.2d, #3
  movi v6.16b, 0
  sri v6.2d, v8.2d, #3
  movi v7.16b, 0
  sri v7.2d, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01)cycle (02)033f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)branch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420091150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012131612200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012121622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066
16020420065151292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012031611200621600001002006620066200662006620066
16020420065151292580116100800161008013250064098820044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012151622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012131621200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012151621200621600001002006620066200662006620066
16020420065151292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141622200621600001002006620066200662006620066
16020420065150292580116100800161008002850064019620044200652006561280128200800282001600562006520065111602011009910010016000010001111012141621200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)031e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)ld unit uop (a6)l1d cache writeback (a8)a9accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002420064150244425800101080000108000050640000112003320052200523218001020800002016000020056200521116002110910101600001000000100283212229212552004915160000102005320053200532005320053
16002420056150014925800101080000108000050640000112003320052200523218001020800002016000020056200521116002110910101600001000000100313111928211442004915160000102005320053200532005320053
1600242005615004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263111332211432004915160000102005320053200532005320053
1600242005215104425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263113428211342004915160000102005320053200532005320053
1600242005215104425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100273113528211542004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100293222828411442004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100283112228211432004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001010000100263111828211342004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100263112128211332004915160000102005320053200532005320053
1600242005215004425800101080000108000050640000112003320052200523218001020800002016000020052200521116002110910101600001000000100273112028211342004915160000102005320053200532005320053

Test 5: throughput

Count: 16

Code:

  sri v0.2d, v16.2d, #3
  sri v1.2d, v16.2d, #3
  sri v2.2d, v16.2d, #3
  sri v3.2d, v16.2d, #3
  sri v4.2d, v16.2d, #3
  sri v5.2d, v16.2d, #3
  sri v6.2d, v16.2d, #3
  sri v7.2d, v16.2d, #3
  sri v8.2d, v16.2d, #3
  sri v9.2d, v16.2d, #3
  sri v10.2d, v16.2d, #3
  sri v11.2d, v16.2d, #3
  sri v12.2d, v16.2d, #3
  sri v13.2d, v16.2d, #3
  sri v14.2d, v16.2d, #3
  sri v15.2d, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)0318191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)a9acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204400603100000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118416114003501600001004003940039400394003940039
160204400382990000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116104003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016004003501600001004003940039400394008940091
160204400382990000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016004003501600001004003940039400394003940039
16020440038300005070292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118216104003501600001004003940039400394003940039
1602044003830000002192516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116114003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116014003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118116114003501600001004003940039400394003940039
160204400383000000292516010810016000810016002050012801320400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016014003501600001004003940039400394003940039
160204400383000000295016010810016000810016002050012801321400194003840038199776199891601202001600322003200644003840038111602011009910010016000010000011110118016114003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03mmu table walk data (08)l2 tlb miss data (0b)18191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696b6d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)a9accfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
160024400513000000003362516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100003100223115016211202040035155160000104003940039400394003940039
1600244009630100010104452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000107221410100223114216211212140035155160000104003940039400394003940039
16002440038299000000682516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223114616211211940035155160000104003940039400394003940039
16002440038299000000452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223113916211221640035155160000104003940039400394003940039
160024400383000000120512516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100583114316211232040035155160000104003940039400394014040039
16002440038300001190452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223113316211242240035155160000104003940039400394003940039
16002440038300000000452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100223113916211232240035155160000104003940039400394003940039
160024400383000000007522516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100100100223115016411142140035155160000104003940039400394003940039
160024400383000000004525160010101600001016000050128000011400190400384003819996320018160010201600002032000040038400381116002110910101600001001001002231192216211222240035155160000104003940039400394003940039
1600244003830000005340452516001010160000101600005012800001140019040038400381999632001816001020160000203200004003840038111600211091010160000100000100223111859211192140035155160000104003940039400394003940039