Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

ORR (vector, immediate, 2S)

Test 1: uops

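Code (the trailing `movi` appears to be one-time register setup rather than part of the unrolled code; the per-unroll counts below show a single instruction):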

  orr v0.2s, #1
  movi v0.16b, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371606116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371506116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371606116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038
100420371606116862510001000100026452112018203720371571318951000100010002037203711100110000073116111786100020382038203820382038

Test 2: Latency 1->1

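Code (the trailing `movi` appears to be one-time initialization of v0 rather than part of the loop body; the retire counts below correspond to one `orr` per unroll plus loop overhead):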

  orr v0.2s, #1
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | a9 | ac | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500000227196862510100100100001001000050028475211200182003720037184286187401010020010008200100082003720037111020110099100100100001000000011171701600198000100001002003820038200382003820038
10204200371500000126196862510100100100001001000050028475210200182003720037184287187411010020010008200100082003720037111020110099100100100001000000011171801600198000100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475210200182003720037184286187411010020010008200100082003720037111020110099100100100001000000011171801600198000100001002003820038200382003820038
10204200371500000798196862510100100100001001000050028475211200182003720037184286187411010020010008200100082003720037111020110099100100100001000000011171701600198000100001002003820038200382003820038
10204200371490000126196862510100100100001001000050028475211200182003720037184286187411010020010008200100082003720037111020110099100100100001000000011171701600198000100001002003820038200382003820038
102042003715000001349196862510100100100001001000050028475211200182003720037184287187401010020010008200100082003720037111020110099100100100001000000011171801600198000100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475210200182003720037184286187411010020010008200100082003720037111020110099100100100001000000011171701600198010100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475211200182003720037184286187401010020010008200100082003720037111020110099100100100001000000011171701600198010100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475211200182003720037184286187411010020010008200100082003720037111020110099100100100001000000011171701600198010100001002003820038200382003820038
1020420037150000061196862510100100100001001000050028475210200182003720037184287187411010020010008200100082003720037111020110099100100100001000000011171701600198000100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003715000166196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640716331978610000102003820038200382003820038
10024200371500061196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
100242003715000145196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
10024200371500061196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001037165640316331978610000102003820038200382003820038
1002420037150001311196862510010101000010100005028475210200182003720037184433187671001020100002010000200852003711100211091010100001033189640316331978610000102003820038200382003820038
100242003715000168196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
10024200371500061196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
10024200371500061196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
100242003715000319196862510010101000010100005028475210200182003720037184433187671001020100002010000200372003711100211091010100001000640316331978610000102003820038200382003820038
1002420037150006119686251001010100001010000502847521020018200372003718443318767100102010000201000020037200371110021109101010000103272640316331978610000102003820038200382003820038

Test 3: throughput

Count: 8

Code:

  movi v0.16b, 0
  orr v0.2s, #1
  movi v1.16b, 0
  orr v1.2s, #1
  movi v2.16b, 0
  orr v2.2s, #1
  movi v3.16b, 0
  orr v3.2s, #1
  movi v4.16b, 0
  orr v4.2s, #1
  movi v5.16b, 0
  orr v5.2s, #1
  movi v6.16b, 0
  orr v6.2s, #1
  movi v7.16b, 0
  orr v7.2s, #1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 0b | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420090150026825801161008001610080028500640196002004402006520065612801282008002820080028200652006511160201100991001001600001004101111011911611200621600001002006620066200662006620066
160204200651500732580116100800161008002850064019600200440200652006561280128200800282008002820065200651116020110099100100160000100301141111011911611200621600001002006620066200662006620066
16020420065151029258011610080016100800285006401960020044020065200656128012820080028200800282006520065111602011009910010016000010012901111011911611200621600001002006620066200662006620066
1602042006515002482580116100800161008002850064019611200440200652006561280128200800282008002820065200651116020110099100100160000100001111011911611200621600001002006620066200662006620066
160204200651500292580100100800001008000050064000001200440200632006332180100200800002008000020063200631116020110099100100160000100000001011111611200601600001002006420064200642006420064
1602042006315107672580100100800001008000050064000000200440200632006332180100200800002008000020063200631116020110099100100160000100000001011111611200601600001002006420064200642006420064
1602042006315001032580100100800001008000050064000000200440200632006332180100200800002008000020063200631116020110099100100160000100000001011111611200601600001002006420064200642006420064
1602042006315001262580100100800001008000050064000000200440200632006332180100200800002008000020063200631116020110099100100160000100000001011111611200601600001002006420064200642006420064
1602042006315001052580100100800001008000050064000000200440200632006332180100200800002008000020063200631116020110099100100160000100000001011111611200601600001002006420064200642006420064
160204200631500168258010010080000100800005006400000020044020063200633218010020080000200800002006320063111602011009910010016000010036150001011111611200601600001002006420064200642006420064

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a1 | a8 | a9 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002420055150001902580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100000100328212320211222220042215160000102004620046200462004620046
1600242004515100642580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100000100328411020211221020042215160000102004620046200462004620046
16002420045150001202580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100460159100348412120211212120042218160000102004620046200462004620046
16002420045150005825800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000001003284192021121920042215160000102004620046200462004620046
160024200451505403132580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100000100338412120211212120042215160000102004620046200462004620046
1600242004515000582580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100000100468412120211102220042215160000102004620046200462004620046
16002420045150925442580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100000100458412120211102320042215160000102004620046200462004620124
1600242004515002486258001212800001280000626400001152002620045200453218001220800002080000200452004511160021109101016000010000010045841212021121920042215160000102004620046200462004620046
1600242004515002467258001212800001280000626400001152002620123201243218001220800002080000200452004511160021109101016000010402473100458412120211102220042215160000102004620046200462004620046
160024200451500278625800121280000128000062640000115200262004520045104180012208000020800002006320045111600211091010160000100100100338412220211232320042215160000102004620046200462004620046

Test 4: throughput

Count: 16

Code:

  orr v0.2s, #1
  orr v1.2s, #1
  orr v2.2s, #1
  orr v3.2s, #1
  orr v4.2s, #1
  orr v5.2s, #1
  orr v6.2s, #1
  orr v7.2s, #1
  orr v8.2s, #1
  orr v9.2s, #1
  orr v10.2s, #1
  orr v11.2s, #1
  orr v12.2s, #1
  orr v13.2s, #1
  orr v14.2s, #1
  orr v15.2s, #1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 08 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602044006030000061725160100100160000100160000500112001614001904003840038199883199961601002001600002001600004003840038111602011009910010016000010000000101101016994003501600001004003940039400394003940039
16020440038300000128251601001001600001001600005001120016140019040038400381997331999616010020016000020016000040038400381116020110099100100160000100000001011081610840035221600001004003940039400394003940039
16020440038300000612516010010016000010016000050011200160400190400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110916994003501600001004003940039400394003940039
1602044003830000019325160100100160000100160000500112001614001904003840038199733199961601002001600002001600004003840038111602011009910010016000010000000101109169104003501600001004003940039400394003940039
16020440038300000402516010010016000010016000050011200161400190400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110916994003501600001004003940039400394003940039
1602044003830000039525160100100160000100160000500112001604001904003840038199733199961601002001600002001600004003840038111602011009910010016000010000000101101016494003501600001004003940039400394003940039
1602044003829900021425160100100160000100160000500112001604001904003840038199733199961601002001600002001600004003840038111602011009910010016000010000000101109169940035241600001004003940039400394003940039
160204400383000002562516010010016000010016000050011200160400190400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110916994003501600001004003940039400394003940039
160204400383000001492516010012116000010016000050011200160400190400384003819973319996160100200160136200160000400384003811160201100991001001600001000000010110816994003501600001004003940039400394003940039
160204400382990001722516010010016000010016000050011200160400190400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110916994003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 09 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | a9 | ac | c2 | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440047300000000033291251600101016000010160000501120016115400194003840038199963200181600102016000020160000400384003811160021109101016000010000000100241662516222234003502016160000104003940039400394003940039
16002440038300000000027506251600101016000010160000501120016011040019400384003819996320018160010201600002016000040038400381116002110910101600001000000010022138131621133400350608160000104003940039400394003940039
1600244003830000000002415482516001010160000101600005011200160110400194003840038199963200181600102016000020160000400384003811160021109101016000010000000100241692316422324003512016160000104003940039400394003940039
16002440038300000000019158251600101016000010160000501120016111040019400384003819996320018160010201600002016000040038400381116002110910101600001000000010024138231621243400350408160000104003940039400394003940039
16002440038300000000021175251600101016000010160000501120016111040019400384003819996320018160010201600002016000040038400381116002110910101600001000000010022168131621133400350408160000104003940039400394003940039
160024400382990000000191752516001010160000101600005011200161110400194003840038199963200181600102016000020160000400384003811160021109101016000010000000100221391316211234003502016160000104003940039400394003940039
16002440038300000000017154251600101016000010160000501120016111040019400384003820010320018160010201600002016000040038400381116002110910101600001000000010022139131621133400350208160000104003940039400394003940039
16002440038299000000017884251600101016000010160000501120016111040019400384003819996320018160010201600002016000040038400381116002110910101600001000000010022139131621132400350208160000104003940039400394003940039
160024400383000000000201332516001010160000101600005011200161110400194003840038199963200181600102016000020160000400384003811160021109101016000010000000100221310231621132400350208160000104003940039400394003940039
160024400383000000000181332516001010160000101600005011200160110400194003840038199963200181600102016000020160000400384003811160021109101016000010000000100241610131621133400350208160000104003940039400394003940039