Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

FMAXP (vector, 4H)

Test 1: uops

Code:

  fmaxp v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 09 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037160061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150382168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680120182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037150061168725100010001000264680020182037203715723189510001000200020372037111001100000073116111787100020382038203820382038
10042037151061168725100010001000264680120182037203715723189510001000200020372037111001100010073116111787100020382038203820382038

Test 2: Latency 1->2

Code:

  fmaxp v0.4h, v0.4h, v1.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
10204200371500000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
102042003715000003000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
10204200371500000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
10204200371500000300611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
10204200371500000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000071011611197910100001002003820038200382003820038
102042003715111576606160341519621138102221361008414010912714285506512016220277202801845131188331088822010665208200002013220277611020110099100100100001000121009020802148241993527100001002022820277202312026520275
102042022815101455404402255119632117101991321007213410912692285409502023420281202751844129188541089121610994215216562032220322711020110099100100100001000001188500844381112002128100001002032520325203322027820277
1020420325152016692444002967196101391022413710084145109126942856801020306204082036618450361889211047223109952242233620374203738110201100991001001000010021013950208891128212009522100001002021620511204192027620421
1020420504153016913328800611968725101001001000010010000500284768002001820037200861842231876310100200100002152000020037200371110201100991001001000010000030071011611197910100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10024200371500000000000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371490000000000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000000611968725100101010000101000050284768002001820037200371844403188061001020100002020000200372003711100211091010100001000012006402162219785010000102003820038200382003820038
10024200371500000000000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000000611968725100101010000101000050284768012001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
100242003715000000000002511968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000300611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371490000000000611968725100101010000121000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038
10024200371500000000000611968725100101010000101000050284768002001820037200371844403187671001020100002020000200372003711100211091010100001000000006402162219785010000102003820038200382003820038

Test 3: Latency 1->3

Code:

  fmaxp v0.4h, v1.4h, v0.4h
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 1f | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a8 | a9 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
102042003715000103196872510100100100001001000050028476802001820037200371842971874110100200100082002001620037200371110201100991001001000010000001117181161119802100001002003820038200382003820038
10204200371500084196762510100100100001001000050028476802001820037200371842961874010100200100082002001620037200371110201100991001001000010000001117171161219801100001002003820038200382003820038
102042003715000379196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
10204200371500082196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
10204200371500082196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
102042003715000103196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
102042003715000103196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
102042003715000103196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
10204200371500082196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038
102042003715008882196872510100100100001001000050028476802001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007102162219791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500124196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
1002420037150061196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
1002420037150061196876210010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500641196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500546196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640416221978510000102003820038200382003820038
10024200371500124196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038
10024200371500145196872510010101000010100005028476801200182003720037184443187671001020100002020000200372003711100211091010100001000640216221978510000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  fmaxp v0.4h, v8.4h, v9.4h
  fmaxp v1.4h, v8.4h, v9.4h
  fmaxp v2.4h, v8.4h, v9.4h
  fmaxp v3.4h, v8.4h, v9.4h
  fmaxp v4.4h, v8.4h, v9.4h
  fmaxp v5.4h, v8.4h, v9.4h
  fmaxp v6.4h, v8.4h, v9.4h
  fmaxp v7.4h, v8.4h, v9.4h
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire (01) | cycle (02) | 03 | 07 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80204200571500040258010010080000100800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511021611200350800001002003920039200392003920039
80204200381500062258010010080000100800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039
802042003815000230258010010080000100800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039
802042003815000402580100100800001008000050064000002001902003820038997339996801002008000020016000020038200381180201100991001008000010000005110116112003525800001002003920039200392003920039
80204200381500040258010010080000125800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039
80204200381500040258010010080000100800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039
802042003815000392580125125800001258000062664000002001902003820038997339996801002008000020016000020038200381180201100991001008000010000005110516112003525800001002003920039200392003920039
80204200381500040258010010080000100800005006400000200190200382003899883999680100200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039
80204200381500040258010010080000100800005006400000200190200382003899733999680100200800002001600002003820038118020110099100100800001000000511211711200350800001002003920039200392003920039
80204200381500040258010010080000100800005006400000200190200382003899733999580125200800002001600002003820038118020110099100100800001000000511011611200350800001002003920039200392003920039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01) | cycle (02) | 03 | 0a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80024200481500392580010108000010800005064000002001920038200389996310018800102080000201600002003820038118002110910108000010050207168320035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050203168520035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050205163520035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050205165320035080000102003920039200392003920039
8002420038150047162580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050203165520035080000102003920088200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018801182080000201600002003820038118002110910108000010050205166520035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050203167320035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050203165520035080000102003920039200392003920039
80024200381500832580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050205165520035080000102003920039200392003920039
80024200381500392580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010050205167520035080000102003920039200392003920039