Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SRSRA (vector, 16B)

Test 1: uops

Code:

  srsra v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004303723000090612548251000100010003983133018303730372415328951000100020003037303711100110000002000073316332630100030383038303830383038
10043037220000204061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
10043037220000207061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
100430372200000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
1004303723000012061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
10043037220000006125482510001000100039831330183037303724153289510001000200030373037111001100000000156073316332630100030383038303830383038
100430372300000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
100430372300000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
100430372200000061254825100010001000398313301830373037241532895100010002000303730371110011000000000073316332630100030383038303830383038
100430372300000061254825100010001000398313301830373037241532895100010002000303730371110011000000100073316332630100030383038303830383038

Test 2: Latency 1->1

Code:

  srsra v0.16b, v1.16b, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250000061295482510100100100001001000050042773130530018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100121622296340100001003003830038300383003830038
10204300372250000061295482510100100100001001000050042773130530018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100021622296340100001003003830038300383003830038
10204300372240000061295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100021622296340100001003003830038300383003830038
10204300372250000061295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100021622296340100001003003830038300383003830038
10204300372250000061295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100121622296340100001003003830038300383003830038
102043003722400000726295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100021622296340100001003008530038300383003830038
10204300372250000061295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000007105121622296340100001003003830038300383003830038
102043003722500000726295482510100100100001001000050042773131030018300373003728265328745101002001000020020000300373003711102011009910010010000100000327100021622296340100001003003830038300383003830038
10204300372240000061295482510100100100001001000050042773131530018300373003728265328745101002001000020020000300373003711102011009910010010000100000007100021622296340100001003003830038300383003830038
102043003722400000631295482510100100100001001000050042773131530018300373003728265328745101002001000020020000300373003711102011009910010010000100000007105021622296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006403162229630010000103003830038300383003830038
100243003722500000000271029548251001010100001010000504277313030018300373008428287328767100102010000202000030037300371110021109101010000100200000006402162229630010000103022930038300383003830038
10024300372250000030010329548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037224200000006129548251002910100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
1002430037225000000006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
10024300372250000012006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000406402162229630010000103003830038300383003830038
100243003722500000120061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000001011101006402162229630010000103003830038300383003830038
1002430037225000003006129548251004710100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830038
10024300372250000000054829548251001010100001010149504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000000006402162229630010000103003830038300383003830085

Test 3: Latency 1->2

Code:

  srsra v0.16b, v0.16b, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100011411172222422296290100001003003830038300383003830038
102043003722500972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100012011172222422296290100001003003830038300383003830038
102043003722501972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100011111172222422296290100001003003830038300383003830038
102043003722501972954725101001001000010010000500427716030018300373003728252628733101002001000020020000300373003711102011009910010010000100011711172222422296290100001003003830038300383003830038
1020430037224019729547251010010010000100100005004277160300543003730037282717287561010020010008200200163003730037111020110099100100100001000011171801600296460100001003003830038300383003830038
102043003722500612954725101001001000010010000500427716030018300373003728271728741101002001000820020016300373003711102011009910010010000100012311171701600296460100001003003830038300383003830038
102043003722400612954725101001001000010010000500427716030018300373003728271628741101002001000820020016300373003711102011009910010010000100012311171801600296462100001003003830038300383003830038
102043003722500612954725101001001000010010000500427716030018300373003728271628741101002001000820020016300373003711102011009910010010000100011711171801600296450100001003003830038300383003830038
102043022822400612954725101001001000010010000500427716030018300373003728271628740101002001000820020016300373003711102011009910010010000100013811171801600296460100001003003830038300383003830038
1020430037225006129547251010010010000100100005004281205300183003730037282526287331010020210000200200003003730037111020110099100100100001000011172222422296290100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722400612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640416222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830083300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612953825100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383008530038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038
100243003722500612954725100101010000101000050427716013001830037300372828632876710010201000020200003003730037111002110910101000010000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  srsra v0.16b, v8.16b, #3
  movi v1.16b, 0
  srsra v1.16b, v8.16b, #3
  movi v2.16b, 0
  srsra v2.16b, v8.16b, #3
  movi v3.16b, 0
  srsra v3.16b, v8.16b, #3
  movi v4.16b, 0
  srsra v4.16b, v8.16b, #3
  movi v5.16b, 0
  srsra v5.16b, v8.16b, #3
  movi v6.16b, 0
  srsra v6.16b, v8.16b, #3
  movi v7.16b, 0
  srsra v7.16b, v8.16b, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042009115002925801161008001710080028500640196002004520065200656128012820080028200160056200652006511160201100991001001600001000031111012321633200621600001002006620066200662006620066
160204200651500292580116100800161008002850064019601200452006520065612801282008002820016005620065200651116020110099100100160000100001711111012231637200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196012004520065200656128012820080028200160056200652006511160201100991001001600001000001111014441634200621600001002006620066200662013420066
1602042006515002925801161008001610080028500640196002004520065200656128012820080028200160056200652006511160201100991001001600001000001111012341634200621600001002006620066200662006620066
16020420065150029258011610080016100800285006401960020048200652006561280128200800282001600562006520065111602011009910010016000010000721111012341634200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196002004520065200656128012820080028200160056200652006511160201100991001001600001000001111012231634200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196002004520172200656128012820080028200160056200652006511160201100991001001600001000001111012231633200621600001002006620066200662006620066
1602042006515102925801161008001610080028500640196002004520065200653912801282008002820016005620065200651116020110099100100160000100001591111012241634200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196002004520065200656128012820080028200160056200652006511160202100991001001600001000091111012241634200621600001002006620066200662006620066
1602042006515002925801161008001610080028500640196002004520065200656128012820080028200160056200652006511160201100991001001600001000031111012231633200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2507

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420085150000241298001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000006100393113825221162120048201160000102005220052200522005220052
1600242006015100045278001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000003100443112125211212120048201160000102005220052200522005220052
16002420051151000452780010108000010800005064000011200322005120051322800102080000201600002005120051111600211091010160000100000115100443112125211262120048201160000102005220052200522005220052
1600242005115000045278001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000003100393111625211201620048201160000102005220052200522005220052
16002420051150000452780010108000010800005064000011200322005120051322800102080000201600002005120051111600211091010160000100000015100433112025211272220048201160000102005220052200522005220052
16002420051150000452780010108000010800005064000011200322005120051322800102080000201600002005120051111600211091010160000100000087100393111625211201620048201160000102005220052200522005220052
1600242005115000045278001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000006100393111625211201720048203160000102005220052200522005220052
1600242006015000045278001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000003100483112525211162020048201160000102005220052200522005220052
1600242005115000045278001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000000100473112025211252020048201160000102005220052200522005220052
1600242005115000051298001010800001080000506400001120032200512005132280010208000020160000200512005111160021109101016000010000000100493112525211162520048201160000102005220052200522005220052

Test 5: throughput

Count: 16

Code:

  srsra v0.16b, v16.16b, #3
  srsra v1.16b, v16.16b, #3
  srsra v2.16b, v16.16b, #3
  srsra v3.16b, v16.16b, #3
  srsra v4.16b, v16.16b, #3
  srsra v5.16b, v16.16b, #3
  srsra v6.16b, v16.16b, #3
  srsra v7.16b, v16.16b, #3
  srsra v8.16b, v16.16b, #3
  srsra v9.16b, v16.16b, #3
  srsra v10.16b, v16.16b, #3
  srsra v11.16b, v16.16b, #3
  srsra v12.16b, v16.16b, #3
  srsra v13.16b, v16.16b, #3
  srsra v14.16b, v16.16b, #3
  srsra v15.16b, v16.16b, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | map dispatch bubble (d6) | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020440059300020002403025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100010100011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000200011110118160400361600001004004040040400404004040040
1602044003930000000007225160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000011110118160400361600001004004040040400404004040040
1602044003930000000003025160184100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400891116020110099100100160000100000100011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000011110118160400361600001004004040040400404004040040
16020440039300000000030101160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000200011110118160400361600001004004040040400404004040040
1602044003930000000003025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000003011110118160400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002440051299112462516001010160000101601125012800001054002040039400391999632001916001020160000203200004003940039111600211091010160000100901002283113162118540036206160000104004040040400404004040040
1600244003930000462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000102800100228218162118440036206160000104004040040400404004040040
160024400393000046251600101016000010160000501283312115400204003940039199963200191601142016000020320000400394003911160021109101016000010200100228219162115840036206160000104004040040400404009040040
160024400393000046251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010100100228315162114840036206160000104004040040400404004040040
1600244003930000462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000102001002283191621171040036206160000104004040040400404004040040
1600244003930000462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000101180100228318162116840085206160000104004040040400404004040040
160024400393000046251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010100100228318162118940036206160000104004040040400404004040040
1600244003930000462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000103300100228314162115940036206160000104004040040400404004040040
160024400393000046251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010200100228214162119940036406160000104004040040400404004040040
16002440039300008825160010101600001016000050128000011540020400394003919996320019160010201600002032000040039400391116002110910101600001039001002283141621151040036206160000104004040040400404004040040