Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

URSRA (vector, 8H)

Test 1: uops

Code:

  ursra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule simd uop (54)dispatch simd uop (57)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map simd uop (7e)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst simd alu (9a)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)f5f6f7f8fd
100430372306125482510001000100039831303018303730372415328951000100020003037303711100110000073316222630100030383038303830383038
1004303723010725482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
1004303723010325482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
1004303722061254825100010001000398313030183037303724153289510001000200030373037111001100004273216222630100030383038303830383038
100430372206125482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
100430372306125484410001000100039831313018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
100430372306125482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038
100430372396125482510001000100039831303018303730372415328951000100020003037303711100110000073216222630100030383038303830383038

Test 2: Latency 1->1

Code:

  ursra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03l2 tlb miss data (0b)181e1f3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
10204300372250039014529548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250010506129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10205300372250011706129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
1020430037225009606129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250011706129548251010010010000114100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
102043003722500906129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
10204300372250087010729548251010010010016100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001002307101161129634100001003013430038300383003830132
102043003722510006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000007101161129634100001003003830038300383003830038
1020430037233001206129548251010010010000100100005004277313300183003730037282653287451010020010000200200003003730037111020110099100100100001000107101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313300183003730037282653287451010020010000200200003008530037111020110099100100100001000137101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d cache writeback (a8)accfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722514161295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722534261295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383008430038
10024300372254261295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722427061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722533061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722430661295482510020101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722532461295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
100243003722529461295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038
10024300372252761295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003721100211091010100001000640216222963010000103003830038300383003830038
100253003722541461295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000640216222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  ursra v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)031e3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
1020430037224147061295472510100100100001001000050042771601300183003730037282717287401010020010008200200163003730037111020110099100100100001000001117170160029646100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282716287401010020010008200200163003730037111020110099100100100001000001117170160029645100001003003830038300713003830038
10204300372250061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163003730037111020110099100100100001000001117180160029645100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282717287401010020010008200200163003730037111020110099100100100001000001117180160029646100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163003730037111020110099100100100001000001117170160029646100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282716287411010020010008200200163003730037111020110099100100100001000001117170160029646100001003003830038300383003830038
1020430037225216061295472510100100100001001015050042771601300183003730037282716287411010020010008200200163003730037111020110099100100100001000001117180160029645100001003003830038300383003830038
102043003722515061295472510100100100001001000050042771601300183003730037282717287411010020010008204200163003730037111020110099100100100001000301117180160029645100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163003730037111020110099100100100001000001117170160029645100001003003830038300383003830038
10204300372250061295472510100100100001001000050042771601300183003730037282717287411010020010008200200163003730037111020110099100100100001000001117180160029645100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01)cycle (02)03191e1f3a3f4e51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb miss (a1)l1d cache writeback (a8)a9accfd0d5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
100243003722503000726295472510010101000010100005042771600300183003730037282863287671001020100002020000300373013221100211091010100001000006400216222962910000103003830038300383003830038
10024300372250240061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372250150061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372250300061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
100243003726902940061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372250150061295472510010101000010100005042771600300183003730037282863287671001020100002020336300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372250240061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000036400216222962910000103003830038300383003830038
10024300372250360061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372240180061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038
10024300372250270061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000006400216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ursra v0.8h, v8.8h, #3
  movi v1.16b, 0
  ursra v1.8h, v8.8h, #3
  movi v2.16b, 0
  ursra v2.8h, v8.8h, #3
  movi v3.16b, 0
  ursra v3.8h, v8.8h, #3
  movi v4.16b, 0
  ursra v4.8h, v8.8h, #3
  movi v5.16b, 0
  ursra v5.8h, v8.8h, #3
  movi v6.16b, 0
  ursra v6.8h, v8.8h, #3
  movi v7.16b, 0
  ursra v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2510

retire uop (01)cycle (02)03191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)60696b6d6erob full (74)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)l1d cache writeback (a8)acbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? simd retires (ee)? int retires (ef)f5f6f7f8fd
16020420088150027029258011610080016100800285006401961200450200652006556128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065151000694258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
160204200651510300292580116100800991008002850064019612004502006520065061280128200800282001600562006520065111602011009910010016000010000181111011901600200621600001002006620066200662006620066
16020420065150018029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
16020420065150000504258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
1602042006515100029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
1602042006515000029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066
1602042006515100029258011610080016100800285006401961200450200652006506128012820080028200160056200652006511160201100991001001600001000001111011901600200621600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01)cycle (02)03mmu table walk instruction (07)l2 tlb miss instruction (0a)1e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fl1d tlb access (a0)a9cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0ea? simd retires (ee)? int retires (ef)f5f6f7f8fd
1600242006315022264512580010108000010800005064000011200292005620050322800102080000201600002004620052111600211091010160000100010044622182232120232004515160000102004720047200472004920047
16002420050150120572580010108000010800005064000001200312005620048322800102080000201600002004620046111600211091010160000100010043622212642217212004730160000102005120053200532005120051
160024200521501221452580010108000010800005064000011200292005420153322800102080000201600002005020050111600211091010160000100010046622192642219202004930160000102005320053200512005120051
160024200501501207162580010108000010800005064000011200272006720046322800102080000201600002004620046111600211091010160000100010046311192221119202004316160000102004920049200492004720049
160024200481502227452580010108000010800005064000011200272006520048322800102080000201600002004620046111600211091010160000100010040311192621122182004330160000102004720047200472004720051
16002420050150100512580010108000010800005064000001200272005420050322800102080000201600002004820046111600211091010160000100010044311222011119232004316160000102004720049200492004920049
16002420052150009512580010108000010800005064000001200312006520046322800102080000201600002005020050111600211091010160000100010046611172021118202004315160000102004720047200472004720047
16002420050150200452580010108000010800005064000011200292005420046322800102080000201600002004820046111600211091010160000100010044311232221118212004315160000102004720047200472004920049
160024200481502105202580010108000010800005064000011200272005420050322800102080000201600002005220050111600211091010160000100010042611192221118242004515160000102004720047200472004720047
1600242004615010141512580010108000010800005064000011200272005420046322800102080000201600002004620046111600211091010160000100010046622172442218192004930160000102005120051200512005120051

Test 5: throughput

Count: 16

Code:

  ursra v0.8h, v16.8h, #3
  ursra v1.8h, v16.8h, #3
  ursra v2.8h, v16.8h, #3
  ursra v3.8h, v16.8h, #3
  ursra v4.8h, v16.8h, #3
  ursra v5.8h, v16.8h, #3
  ursra v6.8h, v16.8h, #3
  ursra v7.8h, v16.8h, #3
  ursra v8.8h, v16.8h, #3
  ursra v9.8h, v16.8h, #3
  ursra v10.8h, v16.8h, #3
  ursra v11.8h, v16.8h, #3
  ursra v12.8h, v16.8h, #3
  ursra v13.8h, v16.8h, #3
  ursra v14.8h, v16.8h, #3
  ursra v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2503

retire uop (01)cycle (02)03mmu table walk data (08)0918191e1f3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)696d6emap stall dispatch (70)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9facbranch cond mispred nonspec (c5)branch mispred nonspec (cb)cdcfd5map dispatch bubble (d6)ddfetch restart (de)e0? int output thing (e9)? simd retires (ee)? int retires (ef)f5f6f7f8fd
160204400393000000510302516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039300000000302516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039300000000302516010810016000810016002050012801324002040039400391997761999016012020016003220032027240039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039300000000302516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039300000000612516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039299000000302516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100011110118016004003601600001004004040040400404004040040
16020440039300000000302516010810016000810016002050012801324002040039400391997761999016012020016003220032006440039400391116020110099100100160000100311110118016004003601600001004005040049400494004940049
16020440048300000000642616011610016001610016002850012801964002940048400481997691998616012820016003820032007640048400481116020110099100100160000100022210129123114004501600001004004940049400504005040049
160204400483000000006426160116100160016100160028500128019640029400484004819976101998616012820016003820032007640049400481116020110099100100160000100021210129123114004501600001004004940050400494005040050
160204400493000000006426160116100160016100160028500128019640029400484004819976101998616012820016014220032007640048400481116020110099100100160000100022210128123114004601600001004004940049400494004940050

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01)cycle (02)03191e3f51schedule uop (52)schedule int uop (53)schedule simd uop (54)dispatch int uop (56)dispatch simd uop (57)int uops in schedulers (59)ldst uops in schedulers (5b)5f60696d6emap stall dispatch (70)simd prf full (72)map rewind (75)map stall (76)dispatch uop (78)map int uop (7c)map simd uop (7e)map int uop inputs (7f)map simd uop inputs (81)8283flush restart other nonspec (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst branch cond (94)inst int alu (97)inst simd alu (9a)9fld unit uop (a6)branch mispred nonspec (cb)cfd0d2l1i cache miss demand (d3)d5map dispatch bubble (d6)d9dadbddfetch restart (de)e0eaeb? simd retires (ee)? int retires (ef)f5f6f7f8fd
16002440039300004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112616211232440036155160000104004040040400404004040040
16002440039299004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112416211222540036155160000104004040040400404004040040
16002440039300004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112416211252540036155160000104004040040400404004040040
160024400393000124625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001001100223112016211252740036155160000104004040040400904004040040
160024400393000154625160010101600001016000050128000011400814003940039199960320019160010201600002032000040039400391116002110910101600001000100223112616211232440036155160000104004040040400404004040040
16002440039299004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112416211242440036155160000104004040040400404004040040
16002440039300004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112416211252440036155160000104004040040400404004040040
16002440039300007425160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112316211232340036155160000104004040040400404004040040
16002440039300004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112316211242640036155160000104004040040400404004040040
16002440039300004625160010101600001016000050128000011400204003940039199960320019160010201600002032000040039400391116002110910101600001000100223112316211242340036155160000104004040040400404004040040