Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

URSRA (vector, 4H)

Test 1: uops

Code:

  ursra v0.4h, v1.4h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)031e3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a8acc2cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000143073216112783100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303722061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303722061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100010073116112630100030383038303830383038
10043037230103254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
10043037230612548251000100010003983131301830373037241532895100010002000303730371110011000015073116112630100030383038303830383038
1004303722061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  ursra v0.4h, v1.4h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b1e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9accfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102043003722500047002954810410100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000010071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000075411610296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
1020430037225000232295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
102043003722500061295302510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000010071011611296340100001003003830038300383003830038
102043003722400061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000001071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000010071012511296340100001003003830038300383003830038
102043003722400061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
102043003722500061295482510100100100001001000050042773133001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0318191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a7a8a9accfd5d6ddinst fetch restart (de)e0eb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024300372240000251295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773131300183003730037282873287671001020100002020000300373003711100211091010100001000003006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037224000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
10024300372250000612954810210010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
10024300372240000726295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
1002430037225009061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000004006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
10024300372250000726295482510010101000010100005042773130300183003730037282873287671001020100002020000300373003711100211091010100001000000006402162229630010000103003830038300383003830038
100243003722500004412954865100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000029036402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  ursra v0.4h, v0.4h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)03080b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2c5branch mispredict (cb)cdcfd5d6e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020430037225000000612954725101001001000010010000500427716003001803003730037282716287401010020010008200200163003730037111020110099100100100001000000030111718016296450100001003003830038300383003830038
102043003722500001201032954725101001001000010010000500427716003001803003730037282717287411010020010008200200163003730037111020110099100100100001000001000111717016296460100001003003830038300383003830038
1020430037225000000612954725101001001000010010000500427716013001803003730037282717287401010020010008200200163003730037111020110099100100100001000000000111717016296450100001003003830038300383003830038
1020430037225000000612954725101001001000010010000500427716013001803003730037282716287411010020010008200200163003730037111020110099100100100001000000000111718016296450100001003003830038300383003830038
1020430037225000000612954725101001001000010010000500427716003001803003730037282717287411010020010008200200163003730037111020110099100100100001000000000111718016297160100001003003830038300383003830038
10204300372250000002032954725101001001000010010000500427716013001803003730037282717287411010020010008200200163003730037111020110099100100100001000000000111718016296460100001003003830038300383003830038
10204300372250000006129547251010010010000100101505004277160030018030037300372827112287941010020010008200200163003730037111020110099100100100001000000000111741016296460100001003003830038300383003830038
10204300372250000120612953825101001001000010010150500427716013001803003730037282717287411010020010008200200163003730084111020110099100100100001000000000111717016296450100001003003830038300383003830038
1020430037225000000822954725101001001000010010000500427716003001803003730037282716287411010020010008200200163003730037111020110099100100100001000000000111718016296460100001003003830038300383003830038
10204300372250000601032954725101001001000010010000500427716003001803003730037282717287411010020010008200200163003730037111020110099100100100001000000000111717016296460100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640416342962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640316342962910000103003830038300383003830038
100243003722501114295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640416432962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001040640416342962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001020640416342962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640316342962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640416342962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640416442962910000103003830038300383003830038
1002430037225061295472510010101000010100005042771600300183003730037282863287671001020100002020000300373003711100211091010100001000640416442962910000103003830038300383003830038
10024300372250612954725100101010000101000050427716003001830037300372828632876710010201000020200003003730037111002110910101000010018640316442962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ursra v0.4h, v8.4h, #3
  movi v1.16b, 0
  ursra v1.4h, v8.4h, #3
  movi v2.16b, 0
  ursra v2.4h, v8.4h, #3
  movi v3.16b, 0
  ursra v3.4h, v8.4h, #3
  movi v4.16b, 0
  ursra v4.4h, v8.4h, #3
  movi v5.16b, 0
  ursra v5.4h, v8.4h, #3
  movi v6.16b, 0
  ursra v6.4h, v8.4h, #3
  movi v7.16b, 0
  ursra v7.4h, v8.4h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01)cycle (02)031e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8a9acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0ec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204200891500029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100009011110120216222006201600001002006620066200662006620066
160204200651500069425801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010000011110120216212006201600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000011110121116222006201600001002006620066200662006620066
16020420065151002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010010011110121216122006201600001002006620066200662006620066
16020420065151002925801161008001610080028500640196020045200652006561280128200800282001600562006520065111602011009910010016000010000011110120216222006201600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000011110121216222006201600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000011110121216222006201600001002006620066200662006620066
160204200651500029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100006611110120116212006201600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000011110121216212006201600001002006620066200662006620066
16020420065150002925801161008001610080028500640196120045200652006561280128200800282001600562006520065111602011009910010016000010000011110121216222006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)030b3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0? int output thing (e9)eaec? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600242008915004527800101080000108000050640000115200322005120051322800102080000201600002005120051111600211091010160000100010027821325211342004802053160000102005220052200522005220052
16002420051150045278001010800001080000506400001052003220051200513228001020800002016000020051200511116002110910101600001059310028821525211342004802037160000102005220052200522005220052
16002420051150045278001010800001080000506400001152003220051200513228001020800002016000020051200511116002110910101600001006310027821525211542004802040160000102005220052200522005220052
1600242005115104568800101080000108000050640000105200322005120051322800102080000201600002005120051111600211091010160000101010027821425211442004802043160000102005220052200522005220052
1600242005115004527800101080000108000050640000100200322005120051322800102080000201600002005120051111600211091010160000100010027821425211442004802052160000102005220052200522005220052
1600242005115004527800101080000108000050640000115200322005120051322800102080000201600002005120051111600211091010160000100010025821425211442004802047160000102005220052200522005220052
1600242005115004527800101080000108000050640000115200322005120051322800102080000201600002005120051111600211091010160000100010027821325211472004802055160000102005220052200522005220052
160024200511500452780010108000010800005064000011520032200512005132280010208000020160000200512005111160021109101016000010096100278214251911442004802050160000102005220052200522005220052
1600242005115004527800101080000108000050640000115200322005120051322800102080000201600002005120051111600211091010160000100010028821425211342004802063160000102005220052200522005220052
1600242005115004527800101080000108000050640000115200322005120051322800102080000201600002005120051111600211091010160000100010027821425211342004802042160000102006120061200612006120061

Test 5: throughput

Count: 16

Code:

  ursra v0.4h, v16.4h, #3
  ursra v1.4h, v16.4h, #3
  ursra v2.4h, v16.4h, #3
  ursra v3.4h, v16.4h, #3
  ursra v4.4h, v16.4h, #3
  ursra v5.4h, v16.4h, #3
  ursra v6.4h, v16.4h, #3
  ursra v7.4h, v16.4h, #3
  ursra v8.4h, v16.4h, #3
  ursra v9.4h, v16.4h, #3
  ursra v10.4h, v16.4h, #3
  ursra v11.4h, v16.4h, #3
  ursra v12.4h, v16.4h, #3
  ursra v13.4h, v16.4h, #3
  ursra v14.4h, v16.4h, #3
  ursra v15.4h, v16.4h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)03181e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a8acc5branch mispredict (cb)cdcfd5d6inst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16020440058299003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101182160400361600001004004040040400404004040040
16020440039300003025160108100160008100160020500128013204002040039400391997761999016012020016014120032006440039400391116020110099100100160000100000111101181160400361600001004004040040400404004040040
16020440039303003025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101181160400361600001004004040040400404004040040
160204400393000183025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100010111101182160400361600001004004040040400404004040040
16020440039300005125160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101181160400361600001004004040040400404004040040
160204400392990069525160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101181160400361600001004004040040400404004040040
16020440039300003025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100003111101181160400361600001004004040040400904004040040
16020440039300003025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101182160400361600001004004040040400404004040040
16020440039301003025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101182160400361600001004004040040400404004040040
16020440039300033025160108100160008100160020500128013204002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000111101181160400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)0318191e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa6a8a9acc5cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1600244003930100007172516001010160000101600005012800000154002040039400391999632001916001020160000203200004035040166311600211091010160000100000010022114151621120940036406160000104004040040400404004040040
1600244003929900926455242516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100000010022831416211820400362012160000104004040040400404004040102
1600244024230000132881092516001010160098101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100003010022831161621119940036206160000104004040040400404004040040
1600244003930000120258251600101016000010160000501283236115400204003940039199963200191600102016000020320000400394003911160021109101016000010040001002283154921118940036206160000104004040040400404004040040
16002440091300000088251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010010301002283141621118740036206160000104004040040402464004040040
160024400393000000469716001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100000010022831151621118940036206160000104004040040400404004040040
1600244003930000120692516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100000010022831161621117840036206160000104004040040400404019540040
160024400392990021046251600101016000010160000501280000115400204003940039199963200191600102016000020320000400394003911160021109101016000010000001002283131621191740036206160000104004040040400404004040040
160024400393000000462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100000010023831416211202140036206160000104004040040400404004040040
16002440039299000352462516001010160000101600005012800001154002040039400391999632001916001020160000203200004003940039111600211091010160000100000010022831416211222140036206160000104004040040400404004040040