Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

FSQRT (vector, 4H)

Test 1: uops

Code:

  fsqrt v0.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037606061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
1004803760108061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
1004803760351061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
10048037600061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038
1004803761282061672025100010001000281261801880378037749303789510001000100080378037111001100073116117676100080388038803880388038

Test 2: Latency 1->2

Code:

  fsqrt v0.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102048003759900010369720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
10204800375990006169720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880072800388003880038
10204800376000006169720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100907101161179676100001008003880038800388003880038
10204800375990006169720831010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
10204800375990006169720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
10204800376000008269720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
1020480037600000102469720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100597101161179676100001008003880038800388003880038
10204800375990006169720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
102048003760000025169706251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038
102048003760000025169720251010010010000100100005002864261800188003780037783430378745101002001000020010000800378003711102011009910010010000100007101161179676100001008003880038800388003880038

1000 unrolls and 10 iterations

Result (median cycles for code): 8.0037

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100248003759900490616972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003760000000616972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003759900000616972025100101010000101000050286426118001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
100248003759900000826972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000001640216227974810000108003880038800388003880038
100248003760000000616972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
1002480037599000120616972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010003300640216227967610000108003880038800388003880038
100248003759900000616972025100141310000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
1002480037600000007266972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
1002480037600000007266972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038
1002480037600000007686972025100101010000101000050286426108001880037800377836537876710010201000020100008003780037111002110910101000010000000640216227967610000108003880038800388003880038

Test 3: throughput

Count: 8

Code:

  fsqrt v0.4h, v8.4h
  fsqrt v1.4h, v8.4h
  fsqrt v2.4h, v8.4h
  fsqrt v3.4h, v8.4h
  fsqrt v4.4h, v8.4h
  fsqrt v5.4h, v8.4h
  fsqrt v6.4h, v8.4h
  fsqrt v7.4h, v8.4h
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | st unit uop (a7) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020516004311990000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000311151170160015998180000100160044160044160044160044160044
8020416004311990118602847993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020416004311990000617993725801001008000010080000500567798411600240160043160043149895061499978010020080072200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
80204160043119800007267993725801001008000010080000500567798411600240160043160043149895361499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020516004311990000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
80204160043119800198010247993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020416004311990000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020416004311980000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020416004311980000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044
8020416004311980000617993725801001008000010080000500567798411600240160043160043149895061499978010020080008200800081600431600431180201100991001008000010000011151170160015998180000100160044160044160044160044160044

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0005

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | df | e0 | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800241600431199023547993725800101080000108000050567798400160024016004316004314991031500238001020800002080000160043160043118002110910108000010000000502200017160075215996998000010160044160044160044160044160044
80024160096119902957799372580010108000010800005056779840016002401600431600961499103150023800102080000208000016004316004311800211091010800001000000050220006160055215996908000010160044160044160044160044160044
8002416004311990848799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220004160075215996908000010160044160044160044160044160044
80024160043119903877993725800101080000108000050567798400160024016004316004314991031500238001020800002080000160043160043118002110910108000010000000502200051600611215996908000010160044160044160044160044160044
8002416020311990811799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220005160064215996908000010160044160044160044160044160044
80024160043119902959799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220003160056215996908000010160044160044160044160044160044
8002416004311980438799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220203160046215996908000010160044160044160044160044160044
80024160043119802836799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220003160095215996908000010160044160044160044160044160044
8002416004311980550799372580010108000010800005056779840016002401600431600431499103150023800102080000208000016004316004311800211091010800001000000050220004160069215996908000010160044160044160044160044160044
8002416004311982853617993743800101080000108000050567798400160024016004316004314991031500238001020800002080000160043160043118002110910108000010069000050220005160059215996908000010160044160044160044160044160044