Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

CMTST (register, 8B)

Test 1: uops

Code:

  cmtst v0.8b, v0.8b, v1.8b
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01)cycle (02)031e1f3f4e51inst issue (52)~issue fp/simd (54)~dispatch fp/simd (57)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op fp/simd (7e)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst neon or fp (9a)accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)f5f6f7f8fd
1004203715750611687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420371500611687251000100010002646801201820372073157231895100010002324203720371110011000373116111787100020382038203820382038
100420371500611687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
1004203715088611687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420371500611687251000100010002646800201820372037157231895100010002000203720371110011000373116111787100020382038203820382038
100420371500611687251000100010002646800201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420371500611687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420371600611687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420371600611687251000100010002646800201820372037157231895100010002000203720371110011000073116111787100020382038203820382038
100420851500841687251000100010002646801201820372037157231895100010002000203720371110011000073116111787100020382038203820382038

Test 2: Latency 1->2

Code:

  cmtst v0.8b, v0.8b, v1.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03070a18191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8accfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
1020420037150000033611968725101001001000010010000500284896312001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820086200382003820038
102042003715800009611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000137102162219803100001002003820038200382003820038
102042003715000000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000107102492219791100001002003820038200382003820038
102042003715000009611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000007102162219791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)031e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a8acc5cfd5d6ddinst fetch restart (de)e0eb? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010000006403162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010020006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785210000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768012001820037200371844431876710010201000020200002003720037111002110910101000010001006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000006402162219785010000102003820038200382003820038
10024200371500611968725100101010000101000050284768002001820037200371844431876710010201000020200002003720037111002110910101000010000606402162219785010000102003820038200382003820038

Test 3: Latency 1->3

Code:

  cmtst v0.8b, v1.8b, v0.8b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03081e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8acc5branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
102042003715000611968725101001001000010010000500284768012001820037200371842971874010100200100082002001620037200371110201100991001001000010000001117170160019802100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000119490007101161119791100001002003820038200382008520038
102042003715000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010000100007101161119791100001002003820038200382003820038
1020420037150006119687251010010010000100100005002847680120018200372003718422318745101002001000020020000200372003711102011009910010010000100003800007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000130007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768012001820037200371842231874510100200100002002000020037200371110201100991001001000010050000007101161119791100001002003820038200382003820038
102042003715000611968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000200007101161119791100001002003820038200382003820038
10204200371500045001968725101001001000010010000500284768002001820037200371842231874510100200100002002000020037200371110201100991001001000010000000007101161119791100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01)cycle (02)03080b18191e3f4e51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a7a8a9acbranch mispredict (cb)cfd5d6ddinst fetch restart (de)e0? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
10024200371500000061196872510010101000010100005028476800200182003720037184443187671001020100002020000200372003711100211091010100001000020000640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100001000640416331978510000102003820038200382003820038
10024200371500000019319687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000001640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680120018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820038200382003820038
1002420037150000006119687251001010100001010000502847680020018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820085200382003820038
10024200371501000061196872510010101000010100005028476800200182003720037184447187671001020100002020000200372003711100211091010100001024012630640316331978510000102003820038200382003820038
10024200371500000010319687441001010100001110152502848963020018200372003718444318767100102010000202000020037200371110021109101010000100000000640316331978510000102003820038200382003820038

Test 4: throughput

Count: 8

Code:

  cmtst v0.8b, v8.8b, v9.8b
  cmtst v1.8b, v8.8b, v9.8b
  cmtst v2.8b, v8.8b, v9.8b
  cmtst v3.8b, v8.8b, v9.8b
  cmtst v4.8b, v8.8b, v9.8b
  cmtst v5.8b, v8.8b, v9.8b
  cmtst v6.8b, v8.8b, v9.8b
  cmtst v7.8b, v8.8b, v9.8b
  movi v8.16b, 9
  movi v9.16b, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)030708090a0b18191e1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa0a1a6a8a9acc2cfd5d6ddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
80204202461570110165660616192611780497126803741198059260264310012001902029520447100272210104802162028058720416117220287202919180201100991001008000010020200375025209416112003521800001002034620402203462019320453
80204203981580100081924704204215680779122805621238059357964308012025902065620609100604010238807112008069020016136420336203989180201100991001008000010000010282545226112322203910800001002050320039205042048420451
80204204471580010088804440206215480777127805611228058262964537812025802034220347100343210154806102068059020416138620357203438180201100991001008000010000010138505289116112022623800001002029420189202452024320343
80204204301580000054396352331915980868127808401268078263464768612041802049920444100494210157810052008029220416077620237202449180201100991001008000010020002140805225129112007619800001002039920345202852019020093
8020420088162011003379888842458058912280186115800005366400001203000201382014199901110048803102008009920016038820552201873180201100991001008000010040210893251261138222007212800001002014120345203462013920090
802042013515101101222738876525801001008000012580000500648496120059020652201379990241004980311206801932021604022014420708318020110099100100800001004401293805143340112007416800001002003920039200392014320453
8020420144150010001127396815274481056119801841128009661564611012009502008820089999081002280314200800972001603902008920090518020110099100100800001000001030511011611202430800001002003920039200392003920039
802042003815000000003040648031210080000100800005006400001200190200382003899733999680100200800002001600002003820038118020110099100100800001000002060511011611200350800001002003920039200392003920039
80204200381500000000162040258048512580000100800005006400001200190200382003899733999680100200800002001600002003820038118020110099100100800001000200000511011611200350800001002003920039200392003920039
802042003816110000000040258010010080000100800005006400001200190200382003899733999680100200800002001600002003820038118020110099100100800001000000000511031611200350800001002003920039200392003920039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2505

retire (01)cycle (02)03080b1f3f51inst issue (52)~issue int (53)~issue fp/simd (54)~dispatch int (56)~dispatch fp/simd (57)huge thing int (59)huge thing fp/simd (5b)60696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op fp/simd (7e)~map lookup int (7f)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst neon or fp (9a)9fa1a6a8acc2cdcfd5d6d9dadbddinst fetch restart (de)e0? int output thing (e9)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
80024200481500002502580010108000010800005064000012001920038200389996310018800102080000201600002003820038118002110910108000010000000502010160005620035080000102003920039200392003920039
80024200381500003925800101080000108000050640000020019200382003899963100188001020800002016000020038200381180021109101080000100000005020121600051120035080000102003920039200392003920039
8002420038150000392580010108000010800005064000002001920038200389996310018800102080000201600002003820038118002110910108000010000000502061600011620035080000102003920039200392003920039
8002420038150000392580010108000010800005064000002001920038200389996310018800102080000201600002003820038118002110910108000010000300502061600051320035080000102003920039200392003920039
800242003815000039258001010800001080000506400001200192003820038999631001880010208000020160000200382003811800211091010800001000000050207160007720035080000102003920039200392003920039
800242003815000039258001010800001080000506400001200192003820038999631001880010208000020160000200382003811800211091010800001010200150207160007520035080000102003920039200392003920039
80024200381500003925800101080000108000050640000120019200382003899963100188001020800002016000020038200381180021109101080000100010005020121600012620035080000102003920039200392003920039
8002420038150000392580010108000010800005064000002001920038200389996310018800102080000201600002003820038118002110910108000010000000502051600061220035080000102003920039200392003920039
800242003815000039258001010800001080000606400000200192003820038999631001880010208000020160000200382003811800211091010800001000000050206160005520035080000102003920039200392003920039
80024200381500003925800101080000108000050640000020019200382003899963100188001020800002016000020038200381180021109101080000100000005020121600012720035080000102003920039200392003920039