Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (vector, 2D)

Test 1: uops

Code:

  fsqrt v0.2d, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10041303798061115402510001000100045795313018130371303712248312895100010001000130371303711100110005373116111243110001303813038130381303813038
10041303798061115402510001000100045795313018130371303712248312895100010001000130371303711100110006073116111243110001303813038130381303813038
10041303798061115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116211243110001303813038130381303813038
100413037970611154025100010001000457953130181303713037122483128951000100010001303713071111001100004873116111243110001303813038130381303813038
1004130379815611154025100010001000457953130181303713037122483128951000100010001303713037111001100004273116111243110001303813038130381303813038
10041303797061115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116111243110001303813038130381303813038
10041303798061115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116111243110001303813038130381303813038
10041303797061115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116111243110001303813038130381303813038
10041303798061115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116111243110001303813038130381303813038
10041303797961115402510001000100045795313018130371303712248312895100010001000130371303711100110000073116111243110001303813038130381303813038

Test 2: Latency 1->2

Code:

  fsqrt v0.2d, v0.2d
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 13.0037

retire (01) | cycle (02) | 03 | 09 | 0b | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102041300379740007011195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037211020110099100100100001002000071011611129431010000100130038130038130038130038130038
102041300379740001871195402510100100100001001000050046609530130018013007013003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038
102041300379740007261195402510100100100001001003750046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000030071011611129431010000100130038130038130038130038130038
102041300379740005691195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038
102041300379740002561195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038
10204130037974000611195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038
10204130037974000611195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038
10204130037974000611195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000171011611129431010000100130038130038130038130038130038
10204130037974000611195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000030071011611129431010000100130038130038130038130038130038
102041300379740002161195402510100100100001001000050046609530130018013003713003712809803128745101002001000020010000130037130037111020110099100100100001000000071011611129431010000100130038130038130038130038130038

1000 unrolls and 10 iterations

Result (median cycles for code): 13.0037

retire (01) | cycle (02) | 03 | 08 | 18 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | ac | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024130037974000008211954025100101010000101000050466095311300183130037130037128120312879910010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101210000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000102031000006402162212943101000010130038130038130038130038130038
1002413003797400138006111954025100101010000101000050466095311300180130226130225128191312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312887610010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162212943101000010130038130038130038130038130038
10024130037974000006111954025100101010000101000050466095311300180130037130037128120312876710010201000020100001300371300371110021109101010000100000000006402162112943101000010130038130038130038130038130038

Test 3: throughput

Count: 8

Code:

  fsqrt v0.2d, v8.2d
  fsqrt v1.2d, v8.2d
  fsqrt v2.2d, v8.2d
  fsqrt v3.2d, v8.2d
  fsqrt v4.2d, v8.2d
  fsqrt v5.2d, v8.2d
  fsqrt v6.2d, v8.2d
  fsqrt v7.2d, v8.2d
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 07 | 0a | 19 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80204160048119801030617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800081600481600481180201100991001008000010000003111511711611159984080000100160049160049160049160049160049
8020416004811991100617993725801001008000010080000500567798401602640160048160048149895061500028010020080008200800081600481600482180201100991001008000010002003360111511711611159984080000100160049160049160049160049160049
80204160048119911015617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800081600481600481180201100991001008000010000000111511711611159984080000100160049160049160049160049160049
8020416004811991100617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800081600481600482180201100991001008000010000000111511711611159984080000100160049160049160049160049160049
8020416004811991100617993725801001008000010080000500567798411600290160048160341149895061500028010020080008200800081600481600483180201100991001008000010000000111511711611159984080000100160049160049160049160049160049
8020416004811991100617993725801001008000010080000500567798411600290160048160048149895061500028010020080072200800081600481600481180201100991001008000010000003111511711611159984080000100160049160049160049160049160049
802041600481199110822617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800801600481600481180201100991001008000010000000111511711611159984080000100160049160049160049160049160049
8020416004811981100617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800801600481600481180201100991001008000010000000111511711611159984080000100160049160049160049160049160105
8020416004811991100617993725801001008000010080000500567798411600290160048160048149895061500028010020080008200800081600481600481180201100991001008000010000000111511711611159984080000100160049160049160049160049160049
80204160048119911018617992125801001008000010080000500567798411600290160048160048149895061500028010020080008200800721600481600481180201100991001008000010000000111511711611159984080000100160049160049160049160049160049

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002416004811990617993725800101080000108000050567798411600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201416121215997108000010160049160049160049160049160049
8002416004811980617993725800101080000108000050567798411600290160048160048149910315002880010208000020800001600481600481180021109101080000104000050201216171315997108000010160049160049160049160107160049
80024160048119905367993725800271080000108000050567798411600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201716161315997128000010160049160049160049160049160049
8002416004811990617993725800101080000108000050567798411600293160048160048149910315002880010208000020800001600481600481180021109101080000100000050201716171415997108000010160049160049160049160049160049
8002416004811980617993725800101080000108000050567798411600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201316121215997108000010160049160049160049160049160049
800241600481199082799372580010108000010800005056779841160029016004816004814991031500288001020800002080000160048160107118002110910108000010018300050431516151615997108000010160049160049160049160049160049
8002416004811990617993725800101080017108000050567798411600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201216111415997108000010160049160049160049160049160049
8002416004811980617993725800101080000108000050567798401600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201116131716004708000010160049160049160049160049160049
8002416004811990617993725800101080000108000050567798401600290160048160048149910315002880010208000020800001600481600481180021109101080000100000050201016121715997108000010160049160049160049160049160049
8002416004811990617993725800101080000108000050567798411600290160106160048149910315002880010208000020800001600481600481180021109101080000100000050201616141615997108000010160049160049160049160049160049