Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CASPL (64-bit)

Test 1: uops

Code:

  caspl x0, x1, x2, x3, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 70 nops): 6.002

Issues: 3.003

Integer unit issues: 0.001

Load/store unit issues: 3.003

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
760113465130191301830061102230062004400820027007130003002
760063460530041300330031101430032002400420027007130003002
760063507330041300330031101130032002400420027007130003002
760063459630041300330031101130032002400420027007130003002
760063511430041300330031101130032002400420027007130003002
760063438430041300330031101130032002400420027007130003002
760063439830041300330031101130032002400420027007130003002
760063439630041300330061103430062004400820027007130003002
760063437930041300330031101130032002400420027007130003002
760063439930041300330031101130032002400420027007130003002

Test 2: throughput

Code:

  caspl x0, x1, x2, x3, [x6]
  add x6, x6, 16

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0053

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
702351911125419724057301402388630002760742245829055056302024000303020270004249613000040100
702041900525506125061300002505430002756362245199055056302024000303020270004249653000040100
702041900465506525065300002505430002756362245236055056302024000303020270004249673000040100
702051901005393223902300302389430002756482245274055056302024000303020270004249653000040100
702041900765506525065300002505430002756362245199055056302024000303020270004249653000040100
702041900465506525065300002505430002756362245199055056302024000303020270004249653000040100
702051901095459824568300302455630002756362245199055056302024000303020270004249653000040100
702041900465506525065300002505430002756362245199055056302024000303020270004249653000040100
702041900465506525065300002505430002756362245199055056302024000303020270004249653000040100
702041900465506525065300002505430038723072244585053938302384005103020270004249653000040100

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0055

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
70054191061540882398230106238043000275887224953605500630022400030300207000002500430000040010
70024190081550092500930000250023000075511224912905500230020400000300207000002500030000040010
70024190049550142501430000250023000075484224901205500230020400000300567008402383030000040010
70024190049550142501430000250023000075490224901205500230020400000300207000002500430000040010
70024190049550142501430000250023000075490224901205500230020400000300207000002500430000040010
70024190051550142501430000250023000075484224901205500230020400000300587008802452130000040010
70024190049550142501430000250023000075490224901205500230020400000300207000002500430000040010
70024190049550142501430000250023000075484224901205500230020400000300207000002500430000040010
70024190049550142501430000250023000075484224901205500230020400000300207000002500430000040010
70025190116539022387230030238603000075881224953005500230020400000300207000002500030000040010

Test 3: throughput

Code:

  caspl x0, x1, x2, x3, [x6]
  mov x7, 8

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 19.0098

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7019719036211399558929550665890610504533067613136851648937063414006270572244896609153000040090
7019619009811627461259550156127410496633428013125581661547058013995670584244938609593000040092
7019419009211558060872547086107610497333458613127361662477058413996670398244275603773000040092
7019419009311587561068548076127410502533711213174821659417062014003670584244938607613000040092
7019419009211587761070548076127410497333458613127361662477058413996670584244938609593000040092
7019419009211558060872547086107610497333458613127361662477058413996670584244938609593000040092
7019419009211627361268550056127410497333458613127361662477058413996670634245101591193000040086
7019419009211447360068544056047410437334158013298391648477018413916670584244938609593000040092
7019419009211627361268550056127410497333460013127531662477058413996670584244938609593000040092
7018919016811612761070550576108410497333467813129711662477058413996670620245058608653000040082

1000 unrolls and 10 iterations

Result (median cycles for code): 19.0094

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
7001719022411496759907550605989410497633605213140661668827004613997070026244875618353000040002
7001619010111689461881550136189210492133620713147161667937001013989670060244982615803000039992
7001619013411682961843549866187410492133625513148431667937001013989670028244876618623000040000
7001419009411689261892550006189010494833575713140141668387002813993270028244876618623000040000
7001419009411689261892550006189010494833575713140141668387002813993270028244876618623000040000
7000719013611525660236550206025610491833619713148541667827000813989270010244813618263000040000
7000719016211492459910550145995210495433637213147551668287003613994270036244886618123000040000
7001419009411681161842549696187410495433637213147551668287003613994270028244876618313000040000
7001419009211686961872549976187410494833575013139751668387002813993270028244876618583000040000
7001419009211688861888550006189010499333408513154611660917006013999470008244806618183000040000