Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (pre-index, D)

Test 1: uops

Code:

  str d0, [x6, #0x10]!

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1005176620591041101810401000463718487200010000200001001100000
1004114120011001100010001000478918505200010000200001001100000
1004118120011001100010001000478918415200010000200001001100000
1004116020011001100010001000478918721200010000200001001100000
1004117020011001100010001000478919333200010000200001001100000
1004117820011001100010001000478918900200010000200001001100000
1004116820011001100010001000472519135200010000200001001100000
1004117220011001100010001000478919801200010000200001001100000
1004117920011001100010001000478919277200010000200001001100000
1004112520011001100010001000479319495200010000200001001100000

Test 2: Latency 3->3

Code:

  str d0, [x6, #0x10]!

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1376

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10214149992070310523101801052410000434661959772010020010004200200201000510000100
10204114622010510105100001010610001434791976082010520010008200200161000310000100
10204115782010410104100001010410001434801984012010520010008200200161000310000100
10204115422010410104100001010410004434781984302011220010012200200161000310000100
10204113742010110101100001010410002435221947862010620010008200200081000110000100
10204113782010310103100001010410002435221943452010620010008200200161000310000100
10204114512010410104100001010410002434861953442010620010008200200161000410000100
10204115042010410104100001010410000434931956802010420010008200200161000410000100
10204113462010310103100001010410000434871943132010020010004200200161000410000100
10204114382010410104100001010410001435111944322010520010008200200081000110000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1269

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10034154692061310433101801043510002441131911622001620100082020000100011000010
10024112952001110011100001001010000430901916832001020100002020000100011000010
10024112292001110011100001001010000430971916732001020100002020000100011000010
10024112802001110011100001001010000430961907732001020100002020000100011000010
10024113402001110011100001001010000430931922052001020100002020000100011000010
10024112742001110011100001001010000430971922032001020100002020000100011000010
10024113292001110011100001001010000430911940392001020100002020000100011000010
10024114672001110011100001001010000430731983442001020100002020000100011000010
10024114492001110011100001001010000430841945272001020100002020000100011000010
10024112942001110011100001001010000430971917912001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  str d0, [x6, #0x10]!
  str d0, [x7, #0x10]!
  str d0, [x8, #0x10]!
  str d0, [x9, #0x10]!
  str d0, [x10, #0x10]!
  str d0, [x11, #0x10]!
  str d0, [x12, #0x10]!
  str d0, [x13, #0x10]!
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0010

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8021482441160692805128018080511800022403121360630160106200800082001600168000580000100
8020480085160105801058000080104800022403121360524160106200800082001600168000580000100
8020480077160105801058000080104800022403121360573160106200800082001600168000580000100
8020480072160105801058000080104800022403121360573160106200800082001600168000580000100
8020480077160105801058000080104800022403121360573160106200800082001600168000580000100
8020480077160105801058000080104800352404191361259160175200800482001600168000580000100
8020480077160105801058000080104800022403121360573160106200800082001600168000580000100
8020480077160105801058000080104800022403121360573160106200800082001600168000580000100
8020480077160105801058000080104800022403121360573160106200800082001600168000580000100
8020480084160105801058000080104800022403121360573160106200800082001600168000580000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0007

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
800348211016061280432080180804320800032400481360292160019208001020160016800058000010
800248006616001180011080000800100800002400301360225160010208000020160000800018000010
800258008616006580048080017800520800002400301361758160010208000020160000800018000010
800248013716001180011080000800100800002400301361665160010208000020160000800018000010
800248013716001180011080000800100800002400301361629160010208000020160000800018000010
800248013716001180011080000800100800002400301361647160010208000020160000800018000010
800248013716001180011080000800100800002400301361647160010208000020160000800018000010
800248013716001180011080000800100800002400301361647160010208000020160000800018000010
800248014116001180011080000800100800002400301361683160010208000020160096800378000010
800248006616001180011080000800100800002400301360225160010208000020160000800018000010