Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

MOV (vector, 16B)

Test 1: uops

Code:

  mov v0.16b, v1.16b
  nop ; nop ; nop ; nop ; nop ; nop ; nop
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 7 nops): 1.000

Issues: 0.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
800410277027102501009102710273101000200010271027118001100003512315111024100010281028102810281028
800410278027102511009102710273101000200010271027118001100000512115111024100010281028102810281028
800410277027102501009102710273101000200010271027118001100000512115111024100010281028102810281028
800410278027102501009102710273101000200010271027118001100000513215111024100010281028102810281028
800410278027102501009102710273101000200010271027118001100000513115111050100010281028102810281028
800410278027102501009102710273101000200010271027118001100000512115111024100010281028102810281028
8004102770271025010091027102731010002000102710271180011000015512115111024100010281028102810281028
800410277027102501009102710273101000200010271027118001100000512115111024100010281028102810281028
800410277027102501009102710273101000200010271027118001100000512115111024100010281028102810281028
800410277027102501009102710273101000200010271027118001100000512115111024100010281028102810281028

Test 2: Latency 1->2

Chain cycles: 2

Code:

  mov v0.16b, v1.16b
  add v1.16b, v1.16b, v0.16b
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 2 chain cycles): 0.0037

retire uop (01) | cycle (02) | 03 | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
202042003715000688196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
2020420037158001336196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
202042003715000105196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
20204200371500061196862510128113100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
20204200371500061196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
202042003716100786196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
202042003715000168196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171162119842200001002003820038200382003820038
202042003715001147196862510100100100001001000050028475060200182003720037171786174911010020020008200400162003720037112020110099100100200001000011113171161119842200001002003820038200382003820038
2020420037150007271968625101001001000010010000500284750602001820037200371717861749110100200200082004001620037200371120201100991001002000010028611113171161219842200001002003820038200382003820038
202042003715000611968625101001001000010010000500284750602001820037200371717861749110100200200082004001620037200371120201100991001002000010034311113171161119842200001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code, minus 2 chain cycles): 0.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
200242003715000000010021968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
20024200371500000001031968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820100200382003820038
2002420037150000000611968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
2002420037150000000611968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
200242003715000000881031968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
20024200371500000001871968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
2002420037150000000611968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038
2002420037150000000611968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012702161119833020000102003820038200382003820038
20024200371500000002031968625100101010000101000050284752102001820037200371719331751710010202000020400002003720037112002110910102000010000000012701161219833020000102003820038200382003820038
20024200371500000003571968625100101010000101000050284752102001820037200981719331751710010202000020400002003720037112002110910102000010000000012701161119833020000102003820038200382003820038

Test 3: throughput

Count: 8

Code:

  mov v0.16b, v8.16b
  mov v1.16b, v8.16b
  mov v2.16b, v8.16b
  mov v3.16b, v8.16b
  mov v4.16b, v8.16b
  mov v5.16b, v8.16b
  mov v6.16b, v8.16b
  mov v7.16b, v8.16b
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.1258

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | a9 | aa | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80204100898101228956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
8020410064800028956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115120016010061800001001006510065100651006510065
80204100647800140956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
80204100647802728956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
8020410064780028956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
8020410064780028956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
80204100647801228956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
8020410064780028956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
80204100647701228956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000001115119016010061800001001006510065100651006510065
80204100647801228956110010010050011004210064100646101002008005620016011210064100641180201100991001008000010000031115119016010061800001001006510065100651006510065

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.1255

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | l2 tlb miss instruction (0a) | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | dispatch int uop (56) | int uops in schedulers (59) | 60 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d cache writeback (a8) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
800241005578220899982101010501100121006910038318102080000201600001003810038118002110910108000010005037271629261003580000101003910039100391003910039
800241003877220899982101010501100121009810038318102080000201600001003810038118002110910108000010025045281627261003580000101003910039100391003910039
8002410038782233479982101010501100121008210038318102080000201600001003810038118002110910108000010005047271616281003580000101003910039100391003910039
8002410038782215479982101010501100121008510038318102080000201600001003810038118002110910108000010005040201625201003580000101003910039100391003910039
8002410038782265229982101010501100121008210038318102080000201600001003810038118002110910108000010005043221623181003580000101003910039100391003910039
8002410038782212479982101010501100121009010038318102080269201600001003810038118002110910108000010005035171627161003580000101003910039100391003910039
800241003878220479982101010501100121008610038318102080000201600001003810038118002110910108000010015043251627201003580000101003910039100391003910039
800241003878220479982101010501100901009110038318102080000201600001003810038118002110910108000010005045111628201003580000101003910039100391003910039
800241003878220419982101010501100121008310038318102080000201600001003810038118002110910108000010005044231618231003580000101003910039100391003910039
800241003878220479982101010501100121008410038318102080000201600001004410038118002110910108000010005042291627261003580000101003910039100391003910039