Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 8H)

Test 1: uops

Code:

  dup v0.8h, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
2004375303600252000100010001000100014075228201358375375723108200010001000100010003803751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075228201356375375723108200010001000100010003753771110011000100010732161237210001000391379376376376
2004376303600252000100010001000100014075228201356375375723108200010001000100010003753751110011000100003732162237210001000376376376376376
2004375203600252000100010001000100014291228201356375375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075228201356457375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075251280356375375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075228201356375375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004375303600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000732162237210001000376376376376376
2004380303600252000100010001000100014075228200356375406723108200010001000100010003753751110011000100000732162237210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.8h, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 07 | 08 | 0b | 19 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a8 | ac | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302041200328990000012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000131012163311957410000100001000010100120033120033120033120033120093
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000131013163411957410000100001000010100120033120033120033120033120033
302041200328991000012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010000100001300131014163311957410000100001000010100120033120033120033120036120033
302041200328990000606120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552531162403010020010000200002001000020000120032120032112020110099100101001000010000100002000131014165611957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116246301002001000020000200100002000012003212003211202011009910010100100001000010000000131013163311957410000100001000010100120033120033120033120033120033
302041200328990000012001710945825401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000131013163311957410000100001000010100120033120033120033120033120033
3020412003289900000120017109456254010010100200001000010020000100005005735672136720530120013012003212003211552931162403010020010000200002001000020000120032120032112020110099100101001000010000100001100131013163311957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000000131013163311957410000100001000010100120033120033120033120033120033
302041200328990000012001710945625401001010020000100001002000010000500573567213672053012001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001000010000040960131014163311957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | a8 | a9 | ac | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
300241200328990120017109456254001010010200001000010200001000050573567213670029012006401200321200321155483116262300102010124200002010000200001200321200321120021109101001010000100100040200012706161111962010006100001000010010120211120033120036120118120033
3002412003289918120017109456254001010010200001000010200001000050573567213670259012001301200321200321155483116262300102010000200002010000200001200321200321120021109101001010000100100064239630012701161111957410000100001000010010120033120035120034120034120033
300241200338990120017109456254001010010200001000010200001000050573572013670029012001301201121200321155483116262300102010000200002010000200001200321200321120021109101001010000100100000030012701161111968910000100001000010010120033120033120034120033120035
300241200328990120017109456254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200321200321120021109101001010000100100000000012701162111957410000100001000010010120033120037120033120033120033
300241200328990120017109456254001010010200001000010200001000050573576813670029012001301200321200321155483116293300102010000200002010000200001200321200321120021109101001010000100100000000012702161111957410000100001000010010120033120033120033120033120033
300241200328990120017109456254001010010200001000010200001000050573567213670029012001301200321200681155483116262300102010000200002010000200001203701200321120021109101001010000100100000000012961161111957410000100001000010010120033120033120033120033120033
300241200328990120017109456254001010010200001000410204721000050573567213670029012001301200321200321155483116262300102010000200002010000200001200471200341120021109101001010000100100001000012703161211957510000100001000010010120034120033120035120373120035
300241200378990120017109463254001010010200001000010200001000050573567213670713012001601200321200321155483116269300102010000200002010000200001200611200331120021109101001010000100100000000012701161111957410000100001000010010120033120034120034120033120033
300241200328990120017109456254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200601200331120021109101001010000100100000000012701161111957410000100001000010010120033120033120033120033120033
300241200328990120017109456254001010010200001000010200001000050573567213670029012001301200321200321155483116262300102010000200002010000200001200351200351120021109101001010000100100000000012701161111957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.8h, w8
  dup v1.8h, w8
  dup v2.8h, w8
  dup v3.8h, w8
  dup v4.8h, w8
  dup v5.8h, w8
  dup v6.8h, w8
  dup v7.8h, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3338

retire (01) | cycle (02) | 03 | 07 | 0a | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160204267102001102669302516010010080000800001008002080015500116832618841631266930267082671366326665716013620080020800202028002080020267082670811802011009910010080000100800001115117116122670580000800001002670926709267092670926709
160204267081991102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013620080020800202048002080020267152670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013620080020800242008002080020267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841630266890267082670866326665716013620080020800202008002080020267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013620080020800202008002080020267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841630266890267082670866326665716013520080020800202008002080020267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267102670866326665716013520080024800242008002080020267172670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013620080024800202008002080020267082670811802011009910010080000100800001115117116112670580000800001002671226709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013420080020800202008002080020267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709
160204267082001102669302516010010080000800001008002080015500116895118841631266890267082670866326665716013620080024800202008002080024267082670811802011009910010080000100800001115117116112670580000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024267092000026693251600101080000800001080000800005011688801884032026689267082670866533674116001020800008000020800008000026709267231180021109101080000108000000050201416129267058000080000102671826709267092670926709
16002426708200002669325160010108000080000108000080000501168880188403202668926708267086653366881600102080000800002080000800002670826708118002110910108000010800000005020916116267058000080000102670926709267092670926709
1600242670820000266932516001010800008000010800008000050116888018840320266892670826708665336688160010208056780000208000080000267122676411800211091010800001080000000502010161211267058000080000102670926709267092670926709
1600242670820000266932516001010800008000010800008000050116888018840321266892670826708665336825160010208000080000208000080000267082670811800211091010800001080000000502010161111267058000080000102670926709267092670926709
160024267082000026693251600101080000800001080000800005011688801884032126689267082670866533668816001020800008000020800008000026708267251180021109101080000108000000050201116119267098000080000102670926709267112671926709
16002426708221002669325160010108000080000108000080000501168880188403212668926708267086653366881600102080000800002080000800002670826708118002110910108000010800000005020916107267058000080000102670926709267092670926709
16002426708200002669325160010108000080000108000080000501168880188403212668926708267086653246688160010208000080000208000080000267082670811800211091010800001080000000502012161012267058000080000102670926709267092670926709
1600242670820000266932516001010800008000010800008000050116888018840320266892670826708665336758160010208075980000208000080000267122671411800211091010800001080000000502012161112267058000080000102670926709267092670926709
1600242670819900266932516001010800008000010800008000050116888018840320266892670826708665336688160010208000080000208000080000267082670811800211091010800001080000000502013161212267058000080000102670926709267092670926709
160024267082000026693251600101080000800001080000800005011688801884032026689267082670866533674016001020800008000020800008000026708267081180021109101080000108000000050201116119268118000080000102670926732267092670926709