Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STADDLB

Test 1: uops

Code:

  staddlb w0, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 3.000

Issues: 3.002

Integer unit issues: 1.003

Load/store unit issues: 2.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
73005 | 34949 | 3032 | 1018 | 2014 | 1007 | 2020 | 7959 | 10823 | 3030 | 1010 | 2020 | 2026 | 4052 | 1021 | 2000 | 1000
73004 | 35304 | 3047 | 1021 | 2026 | 1013 | 2020 | 8026 | 11008 | 3030 | 1010 | 2020 | 2026 | 4052 | 1024 | 2000 | 1000
73004 | 35197 | 3047 | 1023 | 2024 | 1012 | 2024 | 7999 | 10909 | 3036 | 1012 | 2024 | 2024 | 4048 | 1020 | 2000 | 1000
73004 | 35369 | 3048 | 1020 | 2028 | 1014 | 2026 | 8226 | 11374 | 3039 | 1013 | 2026 | 2022 | 4044 | 1021 | 2000 | 1000
73004 | 35326 | 3053 | 1025 | 2028 | 1014 | 2022 | 7995 | 10854 | 3033 | 1011 | 2022 | 2026 | 4052 | 1020 | 2000 | 1000
73004 | 35422 | 3064 | 1032 | 2032 | 1016 | 2028 | 8080 | 11033 | 3042 | 1014 | 2028 | 2028 | 4056 | 1021 | 2000 | 1000
73004 | 35159 | 3043 | 1021 | 2022 | 1011 | 2019 | 8016 | 10984 | 3029 | 1010 | 2019 | 2016 | 4032 | 1014 | 2000 | 1000
73004 | 34777 | 3031 | 1015 | 2016 | 1008 | 2016 | 7943 | 10832 | 3024 | 1008 | 2016 | 2018 | 4036 | 1017 | 2000 | 1000
73004 | 34857 | 3035 | 1017 | 2018 | 1009 | 2018 | 7995 | 10931 | 3027 | 1009 | 2018 | 2018 | 4036 | 1018 | 2000 | 1000
73004 | 34169 | 3003 | 1003 | 2000 | 1000 | 2033 | 8233 | 11435 | 3050 | 1017 | 2033 | 2030 | 4060 | 1027 | 2000 | 1000

Test 2: throughput

Code:

  staddlb w0, [x6]
  add x6, x6, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 6.0065

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
40206 | 60285 | 40242 | 20201 | 20041 | 20152 | 20005 | 115561 | 95823 | 40110 | 20205 | 20005 | 30208 | 40009 | 20007 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115647 | 95521 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115650 | 95526 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115661 | 95545 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40205 | 60110 | 40173 | 20141 | 20032 | 20134 | 20002 | 115652 | 95530 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115645 | 95516 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115646 | 95518 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115654 | 95537 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115636 | 95498 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100
40204 | 60062 | 40105 | 20105 | 20000 | 20102 | 20002 | 115655 | 95532 | 40104 | 20202 | 20002 | 30203 | 40004 | 20005 | 20000 | 20100

1000 unrolls and 10 iterations

Result (median cycles for code): 6.0062

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
40026 | 60288 | 40145 | 20107 | 20038 | 20058 | 20002 | 115424 | 95436 | 0 | 40014 | 20022 | 20002 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115418 | 95428 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115430 | 95446 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115408 | 95414 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115414 | 95424 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115409 | 95414 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115416 | 95426 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20006 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115417 | 95423 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20006 | 20000 | 0 | 20010
40024 | 60062 | 40015 | 20015 | 20000 | 20010 | 20000 | 115404 | 95405 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20005 | 20000 | 0 | 20010
40024 | 60062 | 40016 | 20016 | 20000 | 20010 | 20000 | 115404 | 95405 | 0 | 40010 | 20020 | 20000 | 0 | 30020 | 40000 | 0 | 20006 | 20000 | 0 | 20010

Test 3: throughput

Code:

  staddlb w0, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 10.7718

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
302051122234470721631023076129850233392045185197854336320131272539524685478102089720000010100
306821098324044119801020640107540202171923230190072830501103852037529904461202325922711415275
302041100834287020922021948120390208631976419191986931695109322144121550425191981020000010100
302041077303967119519020152101330204541944556189715630973106192083221430423491978920000010100
302041073984019219567020625105970201971883288186583230456103602031223946467162084020000010100
302041100234279520956021839119340212571975716192332532423112732201421593425951960620000010100
302041071004133620133021203112490208551900805188254331596108422127920584406951903320000010100
302041072674072319694021029110810201261919683190270130317102922017320648408701935920000010100
302041059643960319324020279102720210961980466192414832090110982175420912413761966120000010100
302041079854071020035020675106900205351955481190970031101106692091220734410031947920000010100

1000 unrolls and 10 iterations

Result (median cycles for code): 11.4072

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
30025114896465612256124000141092495920721311997412392181427227925282845459302227620000010010
30024114862470452247724568145842475220973282019330388221408127788286245538302244020000010010
30024114489466702243824232142742482120927042015379390731426628013281845453802245720000010010
30024114545470042224324761148072480920891812012585390391424528009281965448402204620000010010
30024114140462352229323942138922430920637621990439381691387227241281845419102231920000010010
30024114274462682272623542135742394520845292008284374361350326693279625441702203020000010010
30024113606460122210923903139242458020764702002123385771400927531289205588502205920000010010
30024113800459682242723541134672479720813082004964388711408627930284425524602243120000010010
30024114083463332218324150144142485720849532008551392261438228182284265481602215420000010010
30024114200463632213224231143442452920823942006494384511393627554282865447202216820000010010