Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

UQSHRN2 (2D)

Test 1: uops

Code:

  uqshrn2 v0.4s, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)031f3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)a6a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
10043037230105254825100010001000398313030183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303722011725482510001000100039831313018303730372415328951000100020003037303711100110000012373116112630100030383038303830383038
10043037230126254825100010001000398313130183037303724153289510001000200030373037111001100020073116112630100030383038303830383038
1004303723061254825100010001000398313030183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723084254825100010001000398313030183037303724153289510001000200030373037111001100001073116112630100030383038303830383038
1004303723061254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723084254825100010001000398313030183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
10043037230168254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
100430372310861254825100010001000398313130183037303724153289510001000200030373037111001100000073116112630100030383038303830383038
1004303723074254825100010001000398313030183037303724153289510001000200030373037111001100000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  uqshrn2 v0.4s, v1.2d, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9facc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020430037225061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000071221611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773131300183003730037282653287451010020010000212200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510124100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225071295482510100100100001001000050042773130300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037224061295482510100100100001001000050042773130300183003730037282653287451010020010166204200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038
1020430037225061295482510100100100001001000050042773131300183003730037282653287451010020010000200200003003730037111020110099100100100001000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)0318191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa6a8acc5cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024300372250006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
100243003722500486129548251001010100001010000504277313130018300373003728287328767100102010000202000030083300981110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
100243003722500011529548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372240006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313130018300373003728287328767100102010000202064630037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313030018300373003728287328767100102010000202000030037300371110021109101010000100000640216222963010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  uqshrn2 v0.4s, v0.2d, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)030708090a0b18191e1f3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8acc2c5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102043032422730110758106164578294931381016812010024118110506214286624302343032130368282774028789111742221099022222992301813037171102011009910010010000100231194450111883165122991812100001003037230375303713037030133
10204303722282001177924616455729484136101751211005612211050593428515630234303733036928276402886311224226111592242231030372303738110201100991001001000010043119400411187516422298807100001003041930371304203037130373
102043037122821110779486166129547251010010010000100100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000011171711611296490100001003003830038300383003830038
10204300372251001000006129547251010010010000100100005004277160300183003730037282717287401010020010008200200163003730037111020110099100100100001000000011171711611296843100001003008630086300383003830038
1020430037225100100008812429547251010010010008106100005004277160300183003730037282716287411010020010008200200163003730037111020110099100100100001000000011171811611296490100001003003830038300383003830038
10204300372251001000006129547251010010010000100100005004277160300183003730037282716287411010020810008200200163003730037111020110099100100100001000000011171811611296500100001003003830038300383003830038
102043003722410010000053629547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000013011171811611296500100001003003830038300383003830038
10204300372251001000006129547251010010010000100100005004277160300183003730037282717287411010020010008200200163003730037111020110099100100100001000000011171711611296500100001003003830038300383003830038
10204300372251001000006129547251010010010000100100005004277160300183003730037282717287411010020010008200200163003730037111020110099100100100001000010011171811611296490100001003003830038300383003830038
10204300372241001000006129547251010010010000100100005004277160300183003730037282716287401010020010008200200163003730037111020110099100100100001000000011171811611296490100001003008530038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)72scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9facbranch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
100243003722506129547251001010100001010000504277160030018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160030018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722406129547251001010100001010000504277160130018030037300372828603287671001020100002220000300373003721100211091010100001001640216222962910000103003830038300383003830038
1002430037225306129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
100243003722506129547251001010100001010000504277160130018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038
1002430037225053629547251001010100001010000504277160030018030037300372828603287671001020100002020000300373003711100211091010100001000640216222962910000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  uqshrn2 v0.4s, v8.2d, #3
  movi v1.16b, 0
  uqshrn2 v1.4s, v8.2d, #3
  movi v2.16b, 0
  uqshrn2 v2.4s, v8.2d, #3
  movi v3.16b, 0
  uqshrn2 v3.4s, v8.2d, #3
  movi v4.16b, 0
  uqshrn2 v4.4s, v8.2d, #3
  movi v5.16b, 0
  uqshrn2 v5.4s, v8.2d, #3
  movi v6.16b, 0
  uqshrn2 v6.4s, v8.2d, #3
  movi v7.16b, 0
  uqshrn2 v7.4s, v8.2d, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2509

retire (01)cycle (02)03080b18191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8a9acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204200881500000000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651510000000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
16020420065150000000029258011610080016100800285006401961200452006520065612801282008002820016005620065200651116020110099100100160000100000048811110119016002006201600001002006620066200662006620066
160204200651500000000712580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651500000000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651500000900292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
1602042006515000000002925801161008001610080028500640196020045200652006539128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651500000000292580116100800161008002850064019612004520065200656128012820080028200160056202302006521160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651500000000292580116100800161008002850064019612004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066
160204200651510000000292580116100800161008002850064019602004520065200656128012820080028200160056200652006511160201100991001001600001000000011110119016002006201600001002006620066200662006620066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01)cycle (02)03071e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6e74scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8accfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0ea? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160024200631500045258001010800001080000506400000052002720046200460322800102080000201600002004620046111600211091010160000100010031821820211662004315160000102004720047200472004720047
160024200461500051258001010800001080000506400001152002720046200460322800102080000201600002004620046111600211091010160000100010030821620211872010915160000102004720047200472004720047
160024200461500045258001010800001080000506400001052002720046200460322800102080000201600002004620046111600211091010160000100010031821820211772004315160000102004720047200472004720047
160024200461500045258001010800001080000506400001152002720046200460322800102080000201600002004620046111600211091010160000100010030821920211772004315160000102004720047200472004720047
160024200461500045258001010800001080000506400001052002720046200460322800102080000201600002004620046111600211091010160000100010030821720211752004315160000102004720047200472004720047
160024200461500045258001010800001080000506400001102003320046200460322800102080000201600002004620046111600211091010160000100010032821720211882004315160000102004720047200472004720047
1600242004615000452580010108000010800005064000010520027200462004603228001020800002016000020046200461116002110910101600001000100328211320221672004315160000102004720047200472004720047
160024200461500045258001010800001080000506400001102002720046200460322800102080000201600002004620046111600211091010160000100010031821720211752004315160000102004720047200472004720047
160024200461500051258001010800001080000506400001152002720046200460322800102080000201600002004620046111600211091010160000100010031821720211682004315160000102004720047200472004720047
160024200461500045258001010800001080000506400001152002720046200460322800102080000201600002004620046111600211091010160000100010029821920211762004330160000102004720047200472004720047

Test 5: throughput

Count: 16

Code:

  uqshrn2 v0.4s, v16.2d, #3
  uqshrn2 v1.4s, v16.2d, #3
  uqshrn2 v2.4s, v16.2d, #3
  uqshrn2 v3.4s, v16.2d, #3
  uqshrn2 v4.4s, v16.2d, #3
  uqshrn2 v5.4s, v16.2d, #3
  uqshrn2 v6.4s, v16.2d, #3
  uqshrn2 v7.4s, v16.2d, #3
  uqshrn2 v8.4s, v16.2d, #3
  uqshrn2 v9.4s, v16.2d, #3
  uqshrn2 v10.4s, v16.2d, #3
  uqshrn2 v11.4s, v16.2d, #3
  uqshrn2 v12.4s, v16.2d, #3
  uqshrn2 v13.4s, v16.2d, #3
  uqshrn2 v14.4s, v16.2d, #3
  uqshrn2 v15.4s, v16.2d, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)030818191e1f3a3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8acc2c5branch mispredict (cb)cdcfd6dde0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160204400593000001500622825160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
16020440039300000279003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000200111101181604003601600001004004040040400404004040040
16020440039300000336003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
16020440039300000288003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
16020440039300000231003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
1602044003930000015003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
16020440039299000342003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
16020440039300000360003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
1602044003929900018003025160108100160008100160020500128013214002040039400391997761999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404004040040
1602044003930000045300220251601081001600081001600205001280132140020400394003919977151999016012020016003220032006440039400391116020110099100100160000100000000111101181604003601600001004004040040400404009040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01)cycle (02)031e3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)5f6061696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa8cfd0d2icache miss (d3)d5d6d9dadbddinst fetch restart (de)e0eaeb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
16002440051300217112516001010160000101600005012800001054002040039400391999432001916001020160000203200004003940039111600211091010160000100100223115162116840036206160000104004040040400404004040040
16002440039300423462516001010160000101600005012800001054002040039400391999432001916001020160000203200004003940039111600211091010160000100100223114162118840036206160000104004040040400404004040040
1600244003930008825160010101600001016000050128000010540020400394003919994320019160010201600002032000040039400391116002110910101600001001002231141621148400362012160000104004040040400404004040040
1600244003930012462516001010160000101600005012800001104002040039400391999432001916043420160000203200004003940039111600211091010160000100100223118164225840036206160000104004040040400404004040040
1600244003930015462516001010160000101600005012800001054002040039400391999432001916001020160000203200004003940039111600211091010160000100100223116162119840036206160000104004040040400404004040040
1600244003930012462516001010160000101600005012800001004002040039400391999432001916001020160000203200004003940039111600211091010160000100100223316162118540036406160000104004040040400404004040040
16002440039300315462516001010160000101600005012800000004002040039400391999432001916001020160000203200004003940039111600211091010160000100100223115162118840036206160000104004040040400404004040040
1600244003930042911825160010101600001016000050128000000540020400394003919994320019160010201600002032000040039400391116002110910101600001001002211126162116640036206160000104004040040400404004040040
160024400392990462516001010160000101600005012800001004002040039400391999432001916001020160000203200004003940039111600211091010160000100100223218162116440036206160000104004040040400404004040040
1600244003929915462516001010160000101600005012800001004002040039400391999432001916001020160000203200004003940039111600211091010160000100100543418164116640036206160000104004040040400404004040040