Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

CRC32B

Test 1: uops

Code:

  crc32b w0, w0, w1
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 1.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst int alu (97) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100430332200061192225100010001000814400403033303327603289110001000200030333801110011000000731161129391000100030343034303430343034
100430332200061192225100010001000814401403033303327603289110001000200030333801110011000000731161129391000100030343034303430343034
100430332200061192225100010001000814401403033303327603289110001000200030333801110011000000731161129391000100030343034303430343034
1004303323000611922251000100010008144014030333033276032891100010002000303338011100110000170731161129391000100030343034303430343034
100430332200361192225100010001000814401403033303327603289110001000200030333801110011000009731161129391000100030343034303430343034
1004303322000611922251000100010008144004030333033276032891102510002000303338011100110000015731161129391000100030343034303430343034
1004303322000611922251000100010008144004030333033276032891100010002000303338011100110000012731161129391000100030343034303430343034
100430332200061192225100010001000814401403033303327603289110001000200030333801110011000000731161129391000100030343034303430343034
100430332300061192225100010001000814400403033303327603289110001000200030333801110011000000731161129391000100030343034303430343034
100430332200061192225100010001000814401403033303327603289110001000200030333801110011000003731161129391000100030343034303430343034

Test 2: Latency 1->2

Code:

  crc32b w0, w0, w1
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | l2 tlb miss instruction (0a) | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300332250006119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
10204300332250006119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
10204300332250006119922251010010100101008289400492695330033300332861032874110100102002039230033374111020110099100101001000710116112993910000101003003430034300343003430034
10204300332250006119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
1020430033225001116119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
1020430033225001116119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003006530034300343003430034
1020430033225002706119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
1020430033225003876119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
1020430033225003276119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034
1020430033224002916119922251010010100101008289400492695330033300332861032874110100102002020030033374111020110099100101001000710116112993910000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003322500000611992225100101001010010828490149269533003330033286323287631001010020200203003338011100211091010010100000640316222993910000100103003430034300343003430034
100243003322400000611992225100101001010010828490149269533003330033286323287631001010020200203003338011100211091010010100000640216222993910000100103003430034300343003430034
100243003322500000611992225100101001010010828490149269533003330033286323287631001010020200203003338011100211091010010100209640216222993910000100103003430034300343003430034
1002430033225000006119922251001010010100108284901492695330033300332863232876310010100202002030033380111002110910100101002200640216222993910000100103003430034300343003430034
1002430033225000006119922251001010010100108284901492695330033300332863232876310010100202002030033380111002110910100101004500640216222993910000100103003430034300343003430061
100243003323310001261199222510010100101004482849014926953300333003328692328793100101011820020300333801110021109101001010000120640216322997110000100103003430034300343003430034
10024300332250000061199222510010100101001082849014926953300333003328632328763100101002020020300333801110021109101001010061093640216222993910000100103003430034300343003430034
100243003322500000821992225100101001010010828490149269533003330033286323287631001010020200203003338011100211091010010100400640216222993910000100103003430034300343003430034
1002430033224000030581119922251001010010100108284901492695330033300332863232876310010100202002030033380111002110910100101003203640216222993910000100103003430034300343003430034
10024300332250000061199222510010100101001082849014926953300333003328632328763100101002020020300333801110021109101001010000183640216222993910000100103003430034300343003430034

Test 3: Latency 1->3

Code:

  crc32b w0, w1, w0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
102043003322500200000061199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034
102043003322500000000061199222510100101001010082894014926953300333003328610328741101001020020200300333741110201100991001010010000200000710216222993910000101003003430034300343003430034
1020430033225000000000441199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000100000710216222993910000101003003430034300343003430034
1020430033225000000000726199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000100000710216222993910000101003003430034300343003430034
102043003322500000000061199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000100000710217222993910000101003003430034300343003430034
102043003322400000000061199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034
1020430033225000000000536199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034
1020430033225000000000631199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034
102043003322500000000061199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034
102043003322500000000061199222510100101001010082894004926953300333003328610328741101001020020200300333741110201100991001010010000000000710216222993910000101003003430034300343003430034

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0033

retire uop (01) | cycle (02) | 03 | 1e | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430033224061199222510010100101001082849014926953300333003328632328763100101002020020300333801110021109101001010640616222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849014926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
1002430033224061199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
100243003322501032199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640216222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
100243003322505555199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849004926953300333003328632328763100101002020020300333801110021109101001010640216222993910000100103003430034300343003430034
1002430033225061199222510010100101001082849014926953300333003328632328763100101002020020300333801110021109101001010640316222993910000100103003430034300343003430034

Test 4: throughput

Count: 8

Code:

  crc32b w0, w8, w9
  crc32b w1, w8, w9
  crc32b w2, w8, w9
  crc32b w3, w8, w9
  crc32b w4, w8, w9
  crc32b w5, w8, w9
  crc32b w6, w8, w9
  crc32b w7, w8, w9
  mov x8, 9
  mov x9, 10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204800356010000046258010080100801004005004976955800358003569964369993801008033516020080035164118020110099100801001000000005110216118003180000801008003680036802178008280214
80204800355990000046258010080100801004005004976955800358003569964369993801008020016020080218164118020110099100801001000020005110116118003180000801008003680036800368003680036
802048003559910000462580100801008010040050049769558003580035699641869993801008020016020080035164118020110099100801001000000005110116118003180000801008003680036800368003680036
802048021760000000711258010080100801004005004976955800358003569964369993801008020016020080035164118020110099100801001000200005110116118003180000801008003680218800368003680036
80204800355990000046258010080100801004005004976955800358003569964369993801008020016020080035164118020110099100801001000000005110116118003180000801008003680036800368003680036
802048003560000103346258010080100801004005004976955800358021569964369993801008020016020080035164118020110099100801001000000005161116118003180000801008003680036802198003680036
8020480035599100008772580100801008010040050049769558003580035699642169993801008020016020080035164118020110099100801001000000005110116118016680000801008003680036800368003680036
802048003560000001246258010080100801004005004976955800358003569964369993801008020016020080035164118020110099100801001000000005110116118003180000801008003680036800368017580036
802048003559900101246258010080100801004005004976955800358003569964369993801008020016020080035164118020110099100801001000010005110116218003180000801008003680036800368003680127
802048003559900000882580100801008010040050049769558003580035699642169993801008020016020080035164118020110099100801001000000005110116118003180000801008003680216800368003680036

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0004

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6a | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map int uop inputs (7f) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | 9f | l1d cache writeback (a8) | a9 | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d2 | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800248003559900004625800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010100000502000161617188003280000800108003680036800368003680036
800248003559900004625800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010000000502000161618178003280000800108003680036800368003680036
800248003560000004625800108001080010400050149769558003580035699863700158001080020160292800351641180021109108001010000000502101171618188003280000800108003680036800368003680036
800248003559910004625800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010000000502101161618188003280000800108003680036800368003680036
800248003560010012462580010800108001040005014976955800358003569986370015800108002016002080035164118002110910800101010000050210117169178003280000800108003680036800368008180126
8002480081599112174625925800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010003000502101181611178003280000800108003680036800368003680036
8002480035599100124625800108001080034400050149769558003580035700033700158001080020160020800351641180021109108001010000000502101151618188003280000800108003680036800368003680036
800248003560010004625800108001080010400050149769558003580035699863700158001080020160284800351641180021109108001010000000502000111618118003280000800108003680036800368003680036
800248003559900004625800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010000000502000101618168003280000800108003680036800368003680036
800248003559900004625800108001080010400050149769558003580035699863700158001080020160020800351641180021109108001010000000502000181614188003280000800108003680036800368003680036