Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

MADD (64-bit)

Test 1: uops

Code:

  madd x0, x0, x1, x2
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100430332206119222510001000100081440040303330332760328911000100030003033380111001100030731161129391000100030343034303430343034
100430332306119222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
100430332206119222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
100430332308219222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
1004303322045119222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
1004303323010319222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
100430332208219222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
100430332206119222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
1004303322038319222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034
1004303323010519222510001000100081440040303330332760328911000100030003033380111001100000731161129391000100030343034303430343034

Test 2: Latency 1->2

Code:

  madd x0, x0, x1, x2
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ac | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010001710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322508119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003020730034300343003430034
102043003322506119922251010010100101458289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034
102043003322506119922251010010100101008289404926953300333003328610328741101001020030200300333741110201100991001010010000710216222993910000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430033225025119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000001118662216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034
100243003322506119922251001010010100108284904926953300333003328632328763100101002030020300333801110021109101001010000000640216222993910000100103003430034300343003430034

Test 3: Latency 1->3

Code:

  madd x0, x1, x0, x2
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | l2 tlb miss data (0b) | 18 | 19 | 1e | 3a | 3f | 4d | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003322400000468199223001725101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000061199223001725101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910046101003003430034300343003430034
10204300332250000027919922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000044219922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000061199220251010010100101008289401492695330033300332861032874110100102643020030033374111020110099100101001000180710116112993910000101003003430034300343003430034
10204300332250000018919922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332240000043519922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000019319922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000042419922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034
10204300332250000014519922025101001010010100828940149269533003330033286103287411010010200302003003337411102011009910010100100000710116112993910000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430033225061199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000662216222993910000100103003430034300343003430034
10024300332250890199222510010100101001082849014926953300333003328632328763100101002030020300333801110021109101001010050640216222993910000100103003430034300343003430034
10024300332250834199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034
10024300332250853199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034
10024300332250889199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034
10024300332250884199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343007130034
10024300332250950199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010200640216222993910000100103003430034300343003430034
10024300332250871199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034
10024300332250829199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034
10024300332250798199222510010100101001082849004926953300333003328632328763100101002030020300333801110021109101001010000640216222993910000100103003430034300343003430034

Test 4: Latency 1->4

Code:

  madd x0, x1, x2, x0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.0037

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ld unit uop (a6) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1020410040750482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
10204100377503392510100101001010070498149695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
1020410037750482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
1020410037750482510100101001010070498049695710037100378714387451014810200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
10204100377593482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
1020410037750692510100101001010070498049695710037100378714387451010010200302001003716211102021009910010100100000710116111003310000101001003810038100381003810038
1020410037750482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
10204100377615482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111006810000101001003810038100381003810038
1020410037750482510100101001010070498049695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038
10204100377590482510100101001010070498149695710037100378714387451010010200302001003716211102011009910010100100000710116111003310000101001003810038100381003810038

1000 unrolls and 10 iterations

Result (median cycles for code): 1.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 19 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10024100377500000904251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037750000048251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037760000048251001010033100107004814969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037750000069251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037750000048251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
10024100377500000540251001010032100107004804969571003710037873638767100101002030020100841641110021109101001010200640216221003310000100101003810038100381003810038
1002410037751111048251001010010100107020404969571003710085873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037750000048251001010033100107004804969571003710037873638767100101002030020100371641110021109101001010020640216221003310000100101003810038100381003810038
10024100377500000214251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010000640216221003310000100101003810038100381003810038
1002410037750000048251001010010100107004804969571003710037873638767100101002030020100371641110021109101001010030640216221003310000100101003810038100381003810038

Test 5: throughput

Count: 8

Code:

  madd x0, x8, x9, x9
  madd x1, x8, x9, x9
  madd x2, x8, x9, x9
  madd x3, x8, x9, x9
  madd x4, x8, x9, x9
  madd x5, x8, x9, x9
  madd x6, x8, x9, x9
  madd x7, x8, x9, x9
  mov x8, 9
  mov x9, 10
  mov x10, 11

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | int prf full (71) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
802048003559900000015003525801008010080100400500497695580035800356997106699898010080208240224800351641180201100991008010010001001115117316118003280000801008030980036801268003680036
80204800355990001100003525801008010080100400500497695580035800356997106699898010080208240224800351641180201100991008010010001001115117116118003280000801008003680036800368003680036
8020480035600000000600046258010080100801004005004976955800358003569964020699938010080200240200800351641180201100991008010010000000005110216328003180000801008003680036800368003680036
8020480035599012034360046478010080100801004005004976955801738017170015011701248018880331240406800351641180201100991008010010000000005110416328003180000801008003680036800368003680036
80204800356000000000004625801008010080100400500497695580035800356996403699938010080200240200800351641180201100991008010010000000005110216228003180000801008003680036800368003680036
80204800356000000000004625801008010080100400500497695580035800356996403699938010080200240200800351641180201100991008010010000000005110216238003180000801008003680036800368003680036
80204800355990000000004625801008010080100400500497695580035800356996403699938010080200240200800351641180201100991008010010000000005110216228003180000801008003680036800368003680036
80204800356000000000004625801008010080100400500497695580035800356996403699938010080200240200800351641180201100991008010010000000005110216328003180000801008003680036800368003680036
80204800356000000000004625801008010080100400500497695580035800356996403699938010080200240200800351641180201100991008010010000000005110316238003180000801008003680036800368003680036
8020480035600000000000711258010080100801004005004975560800358003569964036999380100802002402008003516411802011009910080100100001200005110316328003180000801008003680036800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 50 | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 61 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ec | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248007459900000000046604180010800108001040015700497695580035800356998637001580010800202400208003516411800211091080010100000000502012166480032800000800108003680036800368003680036
8002480035599000000000460258001080010800104000500049769558003580035699863700158001080020240020800351641180021109108001010000000050206166580032800000800108003680036800368003680036
8002480035599000000000460258001080010800104000500049769558003580035699863700158001080020240020800351641180021109108001010000000050205169480032800000800108003680036800368003680036
80024800356000000000001550258001080010800104000500049769558003580035699863700158001080020240020800351641180021109108001010000120005020121611580032800000800108003680036800368003680036
80024800355990000000007110258001080010800104000500049769558007480035699863700158001080020240020800351641180021109108001010000000050205164680032800000800108003680036800368003680036
800248003559900000000061602580010800108001040005000497695580035800356998637001580010800202400208003516411800211091080010100000000502041651280032800000800108003680036800368003680036
80024800355990000000007110258001080010800104000500049769558003580035699863700158001080020240020800351641180021109108001010000000050203169580032800000800108003680036800368003680036
800248003560000000000071102580010800108001040005000497695580035800356998637001580010800202400208003516411800211091080010100000000502010165480032800000800108003680036800368003680036
80024800355990000000003570258001080010800104000500049769558003580035699863700158001080020240020800351641180021109108001010000000050203164580032800000800108003680036800368003680036
80024800355990000000004602580010800108001040005000497695580035800356998637001580010800202400208003516411800211091080010100000000502051651180032800000800108003680036800368003680036