Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SRSRA (vector, 2D)

Test 1: uops

Code:

  srsra v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10043037230822548251000100010003983133018303730372415328951000100020003037303711100110000075116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100020003037303711100110000973116112630100030383038303830383038
10043037231612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220612548251000100010003983133018303730372415328951000100020003037303711100110000073116222630100030383038303830383038
100430372201032548251000100010003983133018303730372415328951000100020003037303711100110000075216222627100030383038303830383038
10043037220612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037230612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  srsra v0.2d, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372251006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001006007101161129634100001003003830038300383003830038
1020430037225001866129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000557807101161129634100001003003830038300383003830038
10204300372250006129548251015110010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372240006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001002007101161129634100001003003830038300383003830038
10204300372240006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225000000612954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103008630086300383003830038
100243003722500000010362954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
1002430037225000000612954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
1002430037226000000612954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
10024300372250000003602954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
100243003722500000020512954825100101010000101000050427731303001830085300372828732876710160201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
10024300372250000007262954825100101010032101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
10024300372250000002512954825100101010000101000050427731303001830037300372829932876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
1002430037225000000612954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038
10024300372250000006129548251001010100001010000504277313030018300373003728287172876710010201000020200003003730037111002110910101000010000640316332963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  srsra v0.2d, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500000006129547251010010010000100100005004277160030018300373003728271628741101002001000020020000300373003711102011009910010010000100000000011172242434296290100001003003830038300383003830038
102043003722500000009729547251010010010000100100005004277160030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172242434296290100001003003830038300383003830038
102043003722500000009729547251010010010000100100005004277160030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172242443296290100001003003830038300383003830038
102043003722500000019729547251010010010000100100005004277160030018300373003728252628733101002001000020020000300373003711102011009910010010000100000000011172232444296290100001003003830038300383003830038
1020430084225000000197295472510100100100001001000050042771600300183003730037282526287331010020010000200200003003730084111020110099100100100001000010000111722580122982810100001003037130229303243037030324
10204303232250176936616010329547251010010010000100100005004277160030018300373003728271628741101002001000820421012301793013261102011009910010010000100220108383211179056446297386100001003018230277302273018130226
1020430178226102614188053729484229102091241005613411050648428695703027030653304692828959289421125624411821236243003056530549111102011009910010010000100201023617521119722120762994014100001003046530515306543037330421
10204306542371099118896814684294661761016311910064118109005934285272030306304173051028314442887910869220113352262331430466304191011020110099100100100001000200232111829449112997221100001003051430454306093069430656
1020430752237006131455132033942294841591024313010056133110505934292623130018300373003728271628740101002001000820020016300373003711102011009910010010000100000000011171711634296291100001003003830038300383003830038
102043003722500000006129547251010010010000100100005004277160130018300373003728271628740101002001000820020016300373003711102011009910010010000100000000011171711611296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 19 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038
100243003722503061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038
1002430037225000375295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229667010000103003830038300863008530086
1002430085225048161295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001016402162229629010000103003830038300383003830038
1002430037225012061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001036402162229629010000103003830038300383003830038
100243003722500061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402163229629010000103003830038300383003830038
1002430037225121061295472510010101000010100005042771601300183003730037282863287671001020100002020000300373003711100211091010100001006402162229629010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  srsra v0.2d, v8.2d, #3
  movi v1.16b, 0
  srsra v1.2d, v8.2d, #3
  movi v2.16b, 0
  srsra v2.2d, v8.2d, #3
  movi v3.16b, 0
  srsra v3.2d, v8.2d, #3
  movi v4.16b, 0
  srsra v4.2d, v8.2d, #3
  movi v5.16b, 0
  srsra v5.2d, v8.2d, #3
  movi v6.16b, 0
  srsra v6.2d, v8.2d, #3
  movi v7.16b, 0
  srsra v7.2d, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042007715002925801161008001610080028500640196200450200652006561280128200800282001600562006520311111602011009910010016000010000111101270816882006211600001002006620066200662006620066
1602042006515502925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270816882006201600001002006620066200662006620066
1602042006515002925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101260916882006201600001002006620066200662006620066
1602042006515102925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270716882006201600001002006620066200662006620066
1602042006515002925801161008001611480028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101280816882006201600001002006620066200662006620066
1602042006515002925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270316892006201600001002013420066200662006620066
16020420065150452925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270816882006201600001002006620066200662006620066
16020420065150067525801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270316872006201600001002006620066200662006620066
1602042006515002925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270816882006201600001002006620066200662006620066
1602042006515102925801161008001610080028500640196200450200652006561280128200800282001600562006520065111602011009910010016000010000111101270816732006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ac | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600242007215000452580010108000010800005064000011200272004620046322800102080000201600002004620046111600211091010160000100100273210820211662004315160000102004720047200472004720047
1600242004615000452580010108000010800005064000001200272005020050322800102080000201600002005020050111600211091010160000100100326220624422642004730160000102005120051200512005120051
1600242005015000512580010108000010800005064089201200312005020050322800102080000201600002005020050111600211091010160000100100316220624422662004730160000102005120051200512005120051
16002420050150001622580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220624422742004730160000102005120051200512005120051
1600242005015000512580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100306220624422662004730160000102005120051200512005120051
16002420050150001162580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220424422462004730160000102005120051200512005120051
1600242005015000512580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220624422462004730160000102005120051200512005120051
1600242005015000512580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220624422462004730160000102005120051200512005120051
1600242005015000512580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220624422662004730160000102005120051200512005120051
1600242005015000512580010108000010800005064000001200312005020050322800102080000201600002005020050111600211091010160000100100326220624422662004730160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  srsra v0.2d, v16.2d, #3
  srsra v1.2d, v16.2d, #3
  srsra v2.2d, v16.2d, #3
  srsra v3.2d, v16.2d, #3
  srsra v4.2d, v16.2d, #3
  srsra v5.2d, v16.2d, #3
  srsra v6.2d, v16.2d, #3
  srsra v7.2d, v16.2d, #3
  srsra v8.2d, v16.2d, #3
  srsra v9.2d, v16.2d, #3
  srsra v10.2d, v16.2d, #3
  srsra v11.2d, v16.2d, #3
  srsra v12.2d, v16.2d, #3
  srsra v13.2d, v16.2d, #3
  srsra v14.2d, v16.2d, #3
  srsra v15.2d, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16020440059300000003252516010810016000810016002050012801320540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101185101611400361600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801320040020400394003919977761999016012020016003220032006440039400391116020110099100100160000100000111101185111610400361600001004004040040400404004040040
1602044003930001000302516010810016000810016013050012801321540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101180001600400361600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801320540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101185011610400361600001004004040040400404004040040
1602044003929900000302516010810016000810016002050012801321040020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101180011611400361600001004004040040400404004040040
1602044003929900000302516010810016000810016002050012801320040020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101180001600400361600001004004040040400404004040040
16020440039300000006952516010810016000810016002050012801321540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101180011610400361600001004004040040400404004040040
16020440039300000054302516010810016000810016002050012801321040020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101180101610400361600001004004040040400404004040040
1602044003930000000302516010810016000810016002050012801321540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101185001611400361600001004004040040400404004040040
1602044003929900000302516010810016000810016002050012801320540020400394003919977061999016012020016003220032006440039400391116020110099100100160000100000111101185101601400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600244005129900000046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100000010022622211622296400363010160000104004040040400404004040040
16002440039300000000522516001010160000101600005012800000140020400394003919996320019160010201600002032000040039400391116002110910101600001000000100246221016422106400363010160000104004040040400404004040040
160024400393000000004625160010101600001016000050128000011400204003940039199963200191600102016000020320000400394003911160021109101016000010000001002231191621191040036155160000104004040040400404004040040
160024400393000000005225160010101600001016000050128000001400204003940039199963200191600102016000020320000400394003911160021109101016000010000001002431110164221411400363010160000104004040040400404004040040
160024400392990000004625160010101600001016000050128075611400204003940039199963200191600102016000020320000400394003911160021109101016000010000001002432110162115740036155160000104004040040400404004040040
1600244003930000000046251600101016000010160000501280000104002040039400391999632001916001020160000203200004003940039111600211091010160000100000010024622916422145400363010160000104004040040400404004040040
160024400393000000005225160010101600001016000050128000001400614003940039199963200191600102016000020320000400394003911160021109101016000010000001002462111162128440036155160000104004040040400404004040040
1600244003930000000046251600101016000010160000501280000114002040039400391999632001916001020160000203200004003940039111600211091010160000100000010024622916422106400363010160000104004040040400404009040040
160024400393000000005225160010101600001016000050128000001400204003940039199963200191600102016000020320000400394003911160021109101016000010000001002231171621151240036155160000104004040040400404004040040
16002440039300000000236251600101016000010160000501280000014002040039400391999632001916001020160000203200004003940039111600211091010160000100000010024622716422512400363010160000104004040040400404004040040