Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

BIC (vector, immediate, 4H)

Test 1: uops

Code:

  bic v0.4h, #1
  movi v0.16b, 1

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire (01) | cycle (02) | 03 | 08 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue fp/simd (54) | ~dispatch fp/simd (57) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op fp/simd (7e) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst neon or fp (9a) | a8 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645211201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420841510611686251000100010002645211201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645211201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371600841686251000100010002645211201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038
100420371500611686251000100010002645210201820372037157131895100010001000203720371110011000073216221786100020382038203820382038

Test 2: Latency 1->1

Code:

  bic v0.4h, #1
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 08 | 1e | 3a | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a6 | a8 | ac | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
10204200371500906119686251010010010000100100005002847521200182003720037184287187411010020010008200100082003720037111020110099100100100001000001117170160019801100001002003820038200382003820038
10204200371500019719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500319719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500019719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500019719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000031117222243219787100001002003820038200382003820038
1020420037150018176219686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715002119719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
10204200371500019719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715001219719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038
102042003715001219719686251010010010000100100005002847521200182003720037184096187331010020010000200100002003720037111020110099100100100001000001117222242219787100001002003820038200382003820038

1000 unrolls and 10 iterations

Result (median cycles for code): 2.0037

retire (01) | cycle (02) | 03 | 1e | 3f | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | a9 | c2 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
100242003714906119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
1002420037150366119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100030640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000107000640216221978610000102003820038200382003820038
100242003715006119686251001010100001010000502847521120018200372003718443318767100102010000201000020037200371110021109101010000100000640216221978610000102003820084200382003820038

Test 3: throughput

Count: 8

Code:

  movi v0.16b, 0
  bic v0.4h, #1
  movi v1.16b, 0
  bic v1.4h, #1
  movi v2.16b, 0
  bic v2.4h, #1
  movi v3.16b, 0
  bic v3.4h, #1
  movi v4.16b, 0
  bic v4.4h, #1
  movi v5.16b, 0
  bic v5.4h, #1
  movi v6.16b, 0
  bic v6.4h, #1
  movi v7.16b, 0
  bic v7.4h, #1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire (01) | cycle (02) | 03 | 08 | 0a | 0b | 18 | 19 | 1e | 1f | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a6 | a7 | a8 | a9 | ac | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020420065151000000003825801001008000010080000500640000200442006320063321801002008000020080000200632006311160201100991001001600001000000000010113316442006001600001002006420064200642006420064
16020420063150000000018002938061411780596119805966386447522042920610202222910380519202804012008049020534206117116020110099100100160000100022010288801023032443320384211600001002061320582205972049420537
1602042057015311067675440012512578081212380596121804986146439922047020506204575014280714202804002008049820612205317116020110099100100160000100220010215301023141783620383221600001002040520455203992053720513
1602042045415310100276264054921880621122805021198050362564401620403205342040436380420202803002028029720297202985116020110099100100160000100022010250841023131734220422211600001002045620459204072045420455
1602042045415210055663440010342208072211880497121804996266447362036220455205083412280721200803992008049520461204556116020110099100100160000100402000241341025542144320384191600001002049520457204492048020373
160204204571531005667544009822248072511880601126804986546431522036320458204533414080719202805032028049020455204447116020110099100100160000100000012250801025532433520450201600001002056120533205322053320563
160204205311541017780452801159254807251238070012380689632644792204282053920534342180411204804982028050120404204596116020110099100100160000100042002239821021031793320320181600001002037820376204532037620531
1602042044915100156807616079120280516119804981228049963164395220323203772037427100806182028040220080390203732040461160201100991001001600001004021002415010113316332006001600001002006420064200642006420064
16020420063150000006003825801001008000010080000500640000200442006320063321801002008000020080000200632006311160201100991001001600001000000000010113416332006001600001002006420064200642006420064
160204200631500000024003825801001008000010080000500640000200442006320063321801002008000020080000200632006311160201100991001001600001000000000010113316332006001600001002006420064200642006420064

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire (01) | cycle (02) | 03 | 07 | 0a | 0b | 18 | 1e | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a6 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160024200751500000011325800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000121002583192021124200422150160000102004620046200502005020046
16002420045150000004425800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000781002783122021142200422150160000102004620046200462004620046
1600242004515000000442580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100031002583162442163200422150160000102004620050200462004620046
1600242004515000000442580012128000012800006264000011520026200452004532180012208000020800002004520049111600211091010160000100001002583142021142200422300160000102005020046200462004620046
1600242004515100000442580012128000012800006264000011520026200452004532180012208000020800002004520045111600211091010160000100001002583142021142200421150160000102004620046200462004620046
1600242004515000000442580012128000012800006264000011520026200492004532180012208000020800002004520045111600211091010160000100001002783122021124200422150160000102004620046200462004620046
16002420045150100005025800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000901002782142021142200422150160000102004620046200462004620046
16002420045150000004425800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000991002783142021124200422150160000102004620046200462004620046
160024200451500000244425800121280000128000062640000115200282004520045321800122080000208000020045200451116002110910101600001000010027831420211242004224730160000102004620046200462004620046
16002420045150000004425800121280000128000062640000115200262004520045321800122080000208000020045200451116002110910101600001000751002683142421142200422150160000102004620046200462004620046

Test 4: throughput

Count: 16

Code:

  bic v0.4h, #1
  bic v1.4h, #1
  bic v2.4h, #1
  bic v3.4h, #1
  bic v4.4h, #1
  bic v5.4h, #1
  bic v6.4h, #1
  bic v7.4h, #1
  bic v8.4h, #1
  bic v9.4h, #1
  bic v10.4h, #1
  bic v11.4h, #1
  bic v12.4h, #1
  bic v13.4h, #1
  bic v14.4h, #1
  bic v15.4h, #1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 0b | 18 | 19 | 1e | 1f | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a0 | a1 | a8 | ac | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020440058300000004025160100100160000100160000500112001601400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000012010110116114003501600001004003940039400394003940039
16020440038300000004025160100100160000100160000500112001601400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000021010110116114003501600001004003940039400394003940039
16020440038299000004025160100100160000100160000500112001600400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000021010110116114003501600001004003940039400394003940039
16020440038300000004025160100100160000100160000500112001601400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000024010110116114003501600001004003940039400394003940039
16020440038300000001352516010010016000010016000050011200160040019400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110116114003501600001004003940039400394008940039
1602044003830000000402516010010016000010016000050011200160140019400384003819973319996160100200160000200160000400384003811160201100991001001600001000000010110116114003501600001004003940039400394003940039
16020440038300000004025160100100160000100160000500112001601400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000129010110116114003501600001004003940039400394003940039
160204400383000006040251601001001600001001600005001120016004001940038400381997331999616010020016000020016000040038400381116020110099100100160000100001015010110116114003501600001004003940039400394003940039
16020440038300000004025160100100160000100160000500112001601400194003840038199733199961601002001600002001600004003840038111602011009910010016000010000912010110116114003501600001004003940039400394003940039
1602044003829900000402516010010016000010016000050011200160140019400384003819973319996160100200160000200160000400384003811160201100991001001600001000003010110116114003501600001004003940039400394003940039

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire (01) | cycle (02) | 03 | 04 | 1e | 3a | 3f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~dispatch int (56) | ~dispatch fp/simd (57) | huge thing int (59) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst neon or fp (9a) | 9f | a8 | ac | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002440040300000100725160010101600001016000050112001611040019400384003819996320018160010201600002016000040038400381116002110910101600001000100228114116211131540035208160000104003940039400394003940039
160024400383000001071251600101016000010160000501120016115400684003840038199963200181600102016000020160000400384003811160021109101016000010001002284113162111513400352016160000104003940039400394003940039
1600244003830000032025160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411516211121440035208160000104003940039400394003940039
1600244003830000036425160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411416211151240035208160000104003940039400394003940039
160024400383000004142516001010160000101600005011200161154001940038400381999632001816001020160000201600004003840038111600211091010160000100010022841141621116940035208160000104003940039400394003940039
1600244003830000036825160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411316211151440035208160000104003940039400394003940039
1600244003830000036425160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411516211131340035208160000104003940039400394003940039
1600244003830003034125160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411516211151540035208160000104003940039400394003940039
1600244003829900038825160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411516211151440035208160000104003940039400394003940039
1600244003830000041025160010101600001016000050112001611540019400384003819996320018160010201600002016000040038400381116002110910101600001000100228411416211161540035208160000104003940039400394003940039