Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

DUP (general, 4H)

Test 1: uops

Code:

  dup v0.4h, w0
  mov x0, 1
  mov x1, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
2004375203600252000100010001000100014075228201356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228201356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000731161137310001000376376376376376
2004375303600252000100010001000100014075228200356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375333600252000100010001000100014075228200356375375723110200010001000100010003753751110011000100000731161137210001000376376376376376
2004375203600252000100010001000100014075228200360375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375203600252000100010001000100014075229011356375375723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228200356375377723108200010001000100010003753751110011000100000731161137210001000376376376376376
2004375303600252000100010001000100014075228201356375375723108200010001000100010003793751110011000100000731161137210001000376376376376376
2004375393600252000100010001000100014075228201356377375723108200010001000100010003793751110011000100000731161137210001000376376376376376

Test 2: Latency 1->2 roundtrip

Code:

  dup v0.4h, w0
  fmov x0, d0
  mov x0, 1
  mov x1, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 0a | 1e | 1f | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c2 | c5 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
302041200328990336012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990798012001710945625401001010020000100001002000010000500573567213672053112001601200351200321155333116244301002021000020000200100002000012003212003211202011009910010100100001001000000000131012162211957410000100001000010100120033120033120033120033120033
3020412003289901353012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000010131012162211957410000100001000010100120033120033120033120033120033
3020412003289901119012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162211965110000100001000010100120033120033120033120033120034
302041200328990117012001710945625401001010020000100001002011710000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000009010131012162211957410000100001000010100120033120033120033120033120033
3020412003289901224012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162211957410000100001000010100120033120033120033120033120033
302041200328990993012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131013162211957410000100001000010100120035120033120033120033120033
3020412003289901335012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162211957410000100001000010100120033120033120033120033120072
3020412003289900012001710945625401001010020000100001002000010000500573567213672053112001301200321200321155253116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162311957410000100001000010100120033120033120033120033120033
302041200328990966012001710945625401001010020000100001002000010000500573567213672053112002801200321200681155273116240301002001000020000200100002000012003212003211202011009910010100100001001000000000131012162211957410000100001000010100120033120033120033120033120033

1000 unrolls and 10 iterations

Result (median cycles for code): 12.0032

retire (01) | cycle (02) | 03 | 18 | 19 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a6 | a8 | ac | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211568831162623001020100002000020100002000012003212003211200211091010010100001001000000012703163211957410000100001000010010120033120033120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312004012029611554831162623001020100002000020100002000012003212003211200211091010010100001001000000012704162211957410000100001000010010120033120033120033120033120033
300241200328990015120017109456254001010010200001000010200001000050573567213670029112001412003212003211554831162623001020100002000020100002000012003212003211200211091010010100001001000030012704162211957910000100001000010010120377120035120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211554831162623001020100002000020100002000012003212003211200211091010010100001001000040012702163111957410000100001000010010120033120033120033120033120033
30024120032899000120017109459254001010010200001000010200001000050573567213670029112001312003212003211554831162623001020100002000020100002000012003212003211200211091010010100001001000000012702162211957410000100001000010010120033120033120033120033120034
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211554831162623001020100002000020100002000012003212003211200211091010010100001001000000012704161311957410000100001000010010120033120033120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211554831162623001020100002000020100002000012003212003211200211091010010100001001000000012702162311957410000100001000010010120033120033120033120033120033
300241200328991115120017109456254001010010200001000010200001000050573567213670029112001512003212003211554831162623001020100002000020100002013112003212003211200211091010010100001001000000012702161111957410000100001000010010120033120033120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211554831162623001020100642000020100002000012003212003211200211091010010100001001000000012703162311957410000100001000010010120033120033120033120033120033
30024120032899000120017109456254001010010200001000010200001000050573567213670029112001312003212003211554831162623017720100002000020100002000012003812003211200211091010010100001001000000012702162311957410000100001000010010120033120033120033120033120033

Test 3: throughput

Count: 8

Code:

  dup v0.4h, w8
  dup v1.4h, w8
  dup v2.4h, w8
  dup v3.4h, w8
  dup v4.4h, w8
  dup v5.4h, w8
  dup v6.4h, w8
  dup v7.4h, w8
  mov x8, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3338

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 67 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020426711199026693025160100100800008000010080020800155001168951188158802669102672026708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151171162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302670002670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302668902670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708199026693025160100100800008000010080020800155001168951188416302669902670826708663206665716013520080020800202008002080020267082670811802011009910010080000100801300011151170162670580000800001002671926709267092670926709
160204267172002426693025160100100800008000010080020800155001168951188416302668902670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302669402671026708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302669802670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302669802670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302668902670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709
16020426708200026693025160100100800008000010080020800155001168951188416302669802670826708663206665716013520080020800202008002080020267082670811802011009910010080000100800000011151170162670580000800001002670926709267092670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3338

retire (01) | cycle (02) | 03 | 1e | 3f | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a5 | a6 | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600242671020002669302516001010800008000010800008000050117127218889971266932671226708665336688160010208000080000208000080000267082670811800211091010800001008000000502041653267058000080000102670926709267152671326709
160024267102000266930251600101080000800001080000800005011612391913473126692267082670866533668816001020800008000020800008000026711267111180021109101080000100800000871502031632267058000080000102670926709267092670926709
16002426708200026700272516001010800008000010800008000050117374118840321266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502021623267058000080000102670926709267092670926713
1600242670820002669302516001010800008000010800008000050117661918880301266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502021623267058000080000102670926709267092670926709
1600242670820002669722516001010800008000010800008000050117742918850561266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502051653267058000080000102670926709267092671326709
1600242670820002680002516001010800008000010800008000050117301818842871266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502061656267058000080000102670926709267092670926709
1600242670820002669302516001010800008000010800008000050117151418897961266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502051623267058000080000102670926709267092670926709
1600242670820002669302516001010800008000010800008000050116962418899721266892670826708665336688160010208021480000208000080000267082670811800211091010800001008000000502021623267058000080000102670926709267092670926709
1600242670820002669302516001010800008000010800008000050117126618875231266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502031633267058000080000102670926709267092670926709
1600242670820002669302516001010800008000010800008000050117132318887901266892670826708665336688160010208000080000208000080000267082670811800211091010800001008000000502031653267058000080000102670926709267092670926709