Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

SSRA (vector, 8H)

Test 1: uops

Code:

  ssra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 0.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | 3f | 4e | 51 | schedule uop (52) | schedule simd uop (54) | dispatch simd uop (57) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map simd uop (7e) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd alu (9a) | l1d cache writeback (a8) | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
1004303722612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723612548251000100010003983133018303730372415328951000100020003037303711100110001073116112630100030383038303830383038
1004303723612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303722612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038
1004303723612548251000100010003983133018303730372415328951000100020003037303711100110000073116112630100030383038303830383038

Test 2: Latency 1->1

Code:

  ssra v0.8h, v1.8h, #3
  movi v0.16b, 1
  movi v1.16b, 2

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372240000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
102043003722500000006129548251010010010000100100005004277313130018300373003728265328745102512001000020020000300373003711102011009910010010000100000520071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000050071011611296340100001003003830038300383003830038
10204300372250000000612954825101001001000010010000500427731313001830037300372826532874510100200100002042000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038
10204300372250000000662954825101001001000010010000500427731313001830037300372826532874510100200100002002000030037300371110201100991001001000010000000071011611296340100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 19 | 1e | 1f | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
100243003722500001425295482510010101000010100005042773130300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
10024300372250000536295482510010101000010100005042773130300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773130300180300373003728287328767100102010168202032630084300852110021109101010000102221564626823333329702110000103013330132301323013130132
1002430037225123481921337295306510029121001612102987142786700300180300373003728287328767101612010000202000030037300371110021109101010000100001006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773131300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773131300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773130300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
10024300372250012061295482510010101000010100005042773131300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773131300180300373003728287328767100102010000202000030037300371110021109101010000100000006402162229630010000103003830038300383003830038
1002430037225000061295482510010101000010100005042773131300180300373003728287328767100102010000202000030037300371110021109101010000100000306402162229630010000103003830038300383003830038

Test 3: Latency 1->2

Code:

  ssra v0.8h, v0.8h, #3
  movi v0.16b, 1

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 1e | 3a | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
10204300372240120612954725101001001000010010000500427716003001830037300802825262873310100200100002002001630037300371110201100991001001000010001117171161129645100001003003830038300383003830038
10204300372250240822954725101001001000010010000500427716003001830037300372827172874110100200100082002001630037300371110201100991001001000010001117181161129646100001003003830038300383003830038
1020430037225000612954725101001001000010010000500427716003001830037300372827172874110100200100082002001630037300371110201100991001001000010001117181161129646100001003003830038300383003830038
1020430037225060612954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010001117171161129645100001003003830038300383003830038
1020430037225000612954725101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010001117171161129646100001003003830038300383003830038
1020430037225060612954725101001001000010010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010001117171161129645100001003003830038300383003830038
102043003722503450612954744101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010001117171161129646100001003003830038300383003830038
10204300372400390612954725101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010001117181161129645100001003003830038300383003830038
102043003722405760612954725101001001000010010000500427716003001830037300372827162874010100200100082002001630037300371110201100991001001000010001117181161129646100001003003830038300383003830038
1020430037224060612954725101001001000510010000500427716003001830037300372827162874110100200100082002001630037300371110201100991001001000010001117171161129645100001003003830038300383003830038

1000 unrolls and 10 iterations

Result (median cycles for code): 3.0037

retire uop (01) | cycle (02) | 03 | 18 | 1e | 3f | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1002430037225006129547251001010100001010000504277160030018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286328767100102010000202000030037300371110021109101010000100006403162429701010000103003830038300383003830134
10024300372250069629547251001010100001010000504279864130018300373003728286328767100102010000202033430037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000101006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037224006129547251001010100001010000504277160130018300373003728286328767100102010000202000030037300371110021109101010000100006403163329629010000103003830038300383003830038
1002430037225006129547251001010100001010000504277160030018300373003728286328767100102010000202056430037300371110021109101010000100006403163329629010000103003830038300383003830038

Test 4: throughput

Count: 8

Code:

  movi v0.16b, 0
  ssra v0.8h, v8.8h, #3
  movi v1.16b, 0
  ssra v1.8h, v8.8h, #3
  movi v2.16b, 0
  ssra v2.8h, v8.8h, #3
  movi v3.16b, 0
  ssra v3.8h, v8.8h, #3
  movi v4.16b, 0
  ssra v4.8h, v8.8h, #3
  movi v5.16b, 0
  ssra v5.8h, v8.8h, #3
  movi v6.16b, 0
  ssra v6.8h, v8.8h, #3
  movi v7.16b, 0
  ssra v7.8h, v8.8h, #3
  movi v8.16b, 9

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2508

retire uop (01) | cycle (02) | 03 | 18 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602042008915009029258011610080016100800285006401961520057200652006561280128200800282001600562006520065111602011009910010016000010000000111101270007160008820062001600001002006620079200782007920066
16020420065150000292580116100800161008002850064019600200452006520065612801282008002820016005620065200651116020110099100100160000100000001111012700081600063200742101600001002007820078200782007820066
160204200651500300292580116100800161008002850064019605200452006520146291280129200800282001600562006520065111602011009910010016000010000002111101270107160007720062001600001002016920066200662006620066
160204200651511121083192580117100801221008002950064019600200572006520065612801282008002820016005620065200651116020110099100100160000100000041111012750011490008820203001600001002006620066200662006620066
16020420065150000292580116100800161008002950064019605200452006520077101280131200800312001600622007820077111602011009910010016000010000030222101340109230009920075001600001002007820078200792007820078
16020420077150000652980119100800191008003150064022000200562007720078101280131200800312001600622007820078111602011009910010016000010000000222101400318230009820074001600001002009020078200792007820091
16020420077150000653180119100800191008003150064022000200562007720077101280131200800312001600622007820078111602011009910010016000010000000222101380009231009920074001600001002007920078200782007920078
160204200771510606581802251008001910080137500640220002005620078203981612801312008003120016006220077200771116020110099100100160000100200002221013801092300099200742111600001002009020090200912009120090
16020420077150060712980119100800191008003150064022000200562007820078912801312008003120016006220077200771116020110099100100160000100000002221013800010230009920074001600001002007820078200792007820132
16020420077151000653280119100800191008003150064022000200562007820078101280131200800312001600622007820077111602011009910010016000010000000222101418008340009920074001600001002007820078200782007920078

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2506

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | ld unit uop (a6) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | ec | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002420095151000009045278001010800001080000506400001152004502005120051322800102080000201600002005120051111600211091010160000100000100278416252114220048201160000102005220052200522005220052
16002420051150000006045278001010800001080000506400001152003202005120051322800102080000201600002013120187111600211091010160000100000100258512252112420048201160000102005220052200522005220052
16002420051151000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100278512252114420048201160000102005220052200522005220052
16002420051150000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100278514252112420114201160000102014220141201402013820133
16002420150151101111381768816780115108021010801045064168011520098020165201311022800102080000201600002005120051111600211091010160000102200100258514252114420048201160000102005220052200522005220052
16002420051150000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100258512252112420048201160000102005220052200522005220052
160024200511500000000235278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100258512252112420048201160000102005220052200522005220052
16002420051150000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100258512252114220048201160000102005220052200522005220052
16002420051150000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100278524252112420048201160000102005220052200522005220052
16002420051150000000045278001010800001080000506400001152003202005120051322800102080000201600002005120051111600211091010160000100000100278514252114220057201160000102005220052200522005220052

Test 5: throughput

Count: 16

Code:

  ssra v0.8h, v16.8h, #3
  ssra v1.8h, v16.8h, #3
  ssra v2.8h, v16.8h, #3
  ssra v3.8h, v16.8h, #3
  ssra v4.8h, v16.8h, #3
  ssra v5.8h, v16.8h, #3
  ssra v6.8h, v16.8h, #3
  ssra v7.8h, v16.8h, #3
  ssra v8.8h, v16.8h, #3
  ssra v9.8h, v16.8h, #3
  ssra v10.8h, v16.8h, #3
  ssra v11.8h, v16.8h, #3
  ssra v12.8h, v16.8h, #3
  ssra v13.8h, v16.8h, #3
  ssra v14.8h, v16.8h, #3
  ssra v15.8h, v16.8h, #3
  movi v16.16b, 17

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | ld unit uop (a6) | l1d cache writeback (a8) | ac | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | e0 | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160204400582990302516010810016000810016002050012801320400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000015011110118116400361600001004004040040400404004040040
16020440039300051251601081001600081001600205001280132040020400394003919977061999016012020016003220032006440039400391116020110099100100160000100002411110118016400361600001004004040040400404004040040
1602044003930003025160108100160008100160020500128013204002040039400391997706199901601202001600322003200644003940039111602011009910010016000010000011110118016400361600001004004040040400404004040040
160204400393000632516010810016000810016002050012801320400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000013211110118016400361600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000019211110118016400361600001004004040040400404004040040
160204400393000302516010810016000810016002050012801320400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000020411110118016400361600001004004040040400404004040040
160204400393000302516010810016000810016002050012801321400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000021311110118016400361600001004004040040400404004040040
160204400392990302516010810016000810016002050012801321400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000021611110118016400361600001004004040040400404004040040
160204400393000302516010810016000810016002050012801321400204003940039199770619990160120200160032200320064400394003911160201100991001001600001000014411110118016400361600001004004040040400404004040040
160204400393000302516010810016000810016002050012809720400204003940039199770619990160120200160032200320064400394003911160201100991001001600001002016511110118016400361600001004004040040400404004040040

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.2502

retire uop (01) | cycle (02) | 03 | 09 | 18 | 1e | 3f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | dispatch int uop (56) | dispatch simd uop (57) | int uops in schedulers (59) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map simd uop (7e) | map int uop inputs (7f) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd alu (9a) | 9f | l1d tlb miss (a1) | l1d cache writeback (a8) | ac | cf | d0 | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160024400393000004625160010101600001016000050128000011540020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213414162114340036206160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213313162113440036206160000104004040040400404004040040
1600244003929900046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213313162114340036206160000104004040040400404004040040
16002440039299000616251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000101001002213313162113240036206160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213313162113340036206160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213614162113240036206160000104004040040400404004040040
1600244003929900046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213614162113340036206160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213612162113240036206160000104004040040400404004040040
16002440039299000711251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213613162112440036206160000104004040040400404004040040
1600244003930000046251600101016000010160000501280000111040020400394003919996032001916001020160000203200004003940039111600211091010160000100001002213614162112340036206160000104004040040400404004040040