Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDL (64-bit)

Test 1: uops

Code:

  staddl x0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005342133018101420041002200077701052130001000200020004000100320001000
73004340693003100320001000200077701052130001000200020004000100320001000
73004340763003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000
73004340763003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000
73004340733003100320001000200077701052130001000200020004000100320001000

Test 2: throughput

Code:

  staddl x0, [x6]
  add x6, x6, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40209606164045820368200902020320005115957964914011020205200053020840009200082000020100
40204600624010620106200002010220002116074962604010420202200023025140068200432000020100
40204600624010620106200002010220002116084962774010420202200023020340004200062000020100
40204600624010620106200002010220002116078962654010420202200023020340004200062000020100
40204600624010620106200002010220002116082962764010420202200023020340004200062000020100
40204600624010620106200002010220002116078962674010420202200023020340004200062000020100
40204600624010620106200002010220002116074962614010420202200023020340004200062000020100
40204600624010620106200002010220002116146963944010420202200023020340004200062000020100
40204600624010620106200002010220005116144966034011020205200053020840009200192000020100
40204600584011720117200002010220002116181963414010420202200023020840009200182000020100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40029606114035620273200832010620002115919961884001420022200023011840129200732000020010
40024600894001620016200002001220005114853991424002020025200053002040000200062000020010
40024600624001620016200002001020000115825961124001020020200003002040000200062000020010
40024600624001620016200002001020000115835961284001020020200003002040000200062000020010
40024600624001620016200002001020000115841961394001020020200003002040000200062000020010
40024600624001620016200002001020000115813960904001020020200003002040000200062000020010
40024600624001620016200002001020000115790960494001020020200003011340122200682000020010
40024600624001620016200002001020060912831066204013120081200603002040000200062000020010
40024600624001620016200002001020000115835961294001020020200003007440070200422000020010
40024600624001620016200002001020000115841961384001020020200003002040000200062000020010

Test 3: throughput

Code:

  staddl x0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7726

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
302051142184644222379240631415120881196168019096483177110990215302237644080202682000010100
302051079564108020021210591106420354194391018992783076310513205902115041741196322000010100
302041079834120120162210391088421337196048019207653260111368222422317045363203402000010100
302041074494088619872210141098720543192683218943273113010701209342079541064194042000010100
302041075573947819255202231015320231194747819015123053710406203862162842632198152000010100
302051068244040319664207391070820549194947918981753109410645208242082641213194902000010100
302041068364016419628205361060820533196065319079943107810646208702038440340193192000010100
302041087574152220332211901111720560194729818984243113310673209202240244021198342000010100
302051073844025019649206011056122489192352718554283693515951239662246443968201132000010100
302041099974235820721216371173021602198880419319813312311650226942144442231197762000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4086

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
300251152084789622550253461448924502208996120130783841413923276442943756569224002000010010
300241145474684222617242251416124835208476720083343920914388282312751253665222192000010010
300241136404577322167236061420625217208252120073004000314801289332903855843224742000010010
300241137774621322141240721424124587208859220120493884114269279692857154989221202000010010
300241137514619222311238811396224550207877120030413865314119277192773054233222892000010010
300241142534643922281241581441424832208010720042403903214213280312762053880225262000010010
300241146314704822839242091403525238208979320129973987714651286322773454077225882000010010
300251146654664822406242421436324740208149420056033895614232279112693552968226602000010010
300241139264593322176237571401524623208282720073943877514165277102821055002224712000010010
300241139594635622245241111444924703208847520115513909214399280922808854424224132000010010