Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

UQRSHL (vector, 4S)

Test 1: uops

Code:

  uqrshl v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100430372300612548251000100010003983133018303730372415328951000100020003037303711100110000073216112630100030383038303830383038
100430372300612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722002512548251000100010003983133018303730372415328951000100020003037303711100110000673116112630100030383038303830383038
100430372300612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372200612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372200612548251000100010003983133018303730372415328951000100020003037303711100110006073116112630100030383038303830383038
100430372300612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
100430372300612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
10043037220019925482510001000100039831330183037303724153289510001000200030373037111001100002773116112630100030383038303830383038
1004303723006125482510001000100039831330183037303724153289510001000200030373037111001100002173116112630100030383038303830383038

Test 2: Latency 1->2

Code:

  uqrshl v0.4s, v0.4s, v1.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722406129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100137101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100107101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722406129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100067101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003741102011009910010010000100007101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722506129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024300372250000000612954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000010006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000001006402162229630010000103003830038300383003830038
10024300372250000000612954825100101010000101000050427731313001830037300372828732876710010201000020200003003730037111002110910101000010000010006402162229630010000103003830038300383003830038
100243003722500000006129548251001010100001010000504277313130018300373003728287328767100102010000202000030037300371110021109101010000100000330007674653429881310000103036030372303253032030324
10024303592271167924264145142948516210073141005612108946042854550302703036930365282953028900110552411186222228230366303689110021109101010000106022022132307886853329942210000103036730369303693037130368
100243050822700000001032954884100691310048161104377428674203030630370303692831233288991105820104912222294303703036981100211091010100001020003122091327882855529884310000103050130273304153046730454
100243041722800891068704159122946718010081111007216111926642882990303423045130177283153528936116542211307242227830463304141011002110910101000010220021232490291761132529812410000103056130466305623059530412
100243054423500003006129548251001010100081210298834285455130306300833046328303132893710908261032526232763003730369411002110910101000010000000006402162229630010000103003830038300383003830038
10024300372330000000822954825100101010000101000050427731303001830037300842828732876710010201000020200003003730037111002110910101000010000000006402162229630010000103003830038300383003830038
10024300372320000000822954825100101010000101000050427731303001830037300372828732876710010201000020200003003730037111002110910101000010000010006402162229630010000103003830038300383003830038

Test 3: Latency 1->3

Code:

  uqrshl v0.4s, v1.4s, v0.4s
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722400006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020220000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722500006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
102043003722400006129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038
1020430037225001206129548251010010010000100100005004277313130018300373003728265328745101002001000020020000300373003711102011009910010010000100007101161129634100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500669729548251001212100081010000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006423163329630010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006425164529630010000103003830038300383003830038
10024300372250006129548251001010100071210000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006403165429632010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006403163329630010000103003830038300383003830085
100243003722500276129548251001010100001010000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010206423163329630010000103003830038300383003830038
10024300372250006129548251001212100001210000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006403163329630010000103003830038300383003830038
10024300372250006129548251001212100001210000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010036403163329630010000103003830038300383003830038
10024300372250006129548251001010100001010000504277313130018300373003728287032876710012201000020200003008430037111002110910101000010006425163329630010000103003830038300383003830038
100243003722500025129548251001212100001210000504277313130018300373003728287032876710012201000020200003003730037111002110910101000010006403163329630010000103003830038300383003830038
100243003722510072629548251001010100001010000504277313130018300373003728287032876710010201000020200003003730037111002110910101000010006403163329630010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  uqrshl v0.4s, v8.4s, v9.4s
  uqrshl v1.4s, v8.4s, v9.4s
  uqrshl v2.4s, v8.4s, v9.4s
  uqrshl v3.4s, v8.4s, v9.4s
  uqrshl v4.4s, v8.4s, v9.4s
  uqrshl v5.4s, v8.4s, v9.4s
  uqrshl v6.4s, v8.4s, v9.4s
  uqrshl v7.4s, v8.4s, v9.4s
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire (01) | cycle (02) | 03 | 0b | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | a9 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80204200591500004125801001008000010080000500640000020020200392003999733999780100200800002001600002003920039118020110099100100800001000000511031611200360800001002004020040200402004020040
80204200391500004125801001008000010080000500640000120020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002004020040200402004020040
802042003915000072925801001008000010080000500640000120020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002004020040200402004020040
80204200391500008325801001008000010080000500640000120020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002004020040200402004020040
802042003915000010625802171008000010080000536640000120061200392003999733999780100200800002001600002003920039118020110099100100800001000100511011611200360800001002004020040200402004020040
802042003915000054644801001008000010880000500640000020020200392003999733999780100200800002001600002003920039118020110099100100800001000100511011611200360800001002004020040200402004020040
80204200391500004125801001008000010080000500640000120020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002020020040200402004020040
802042003915001204125801001008000010080000500640000120020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002004020040200402004020040
80204200391500004125801001008000010080000500640000020020200392003999733999780100200800002001600002003920039118020110099100100800001000000511011611200360800001002004020040200402004020040
8020420145150012041258010010080000100800005006400001200202003920039997339997801002008000020016000020039200391180201100991001008000010001005110116112003624800001002009420040200402004020040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
800242004015005822580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516352003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080105201600002010020039118002110910108000010005020716562003680000102004020040200402004020040
8002420039150001262580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516452003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516452003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516752003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020416452003680000102004020040200402004020040
8002420039150001032580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516562003680000102004020040200402004020040
8002420039150005422580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516542003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020516542003680000102004020040200402004020040
800242003915000402580010108000010800005064000012002020039200399996310019800102080000201600002003920039118002110910108000010005020616552003680000102004020040200402004020040