Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

STR (post-index, D)

Test 1: uops

Code:

  str d0, [x6], #0x10

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 1.000

Issues: 2.000

Integer unit issues: 1.001

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed)
100516332059104110181040100046451831320001000200010011000
100411192001100110001000100048011908120001000200010011000
100411542001100110001000100048051878120001000200010011000
100412122001100110001000100048011941120001000200010011000
100411182001100110001000100048011918920001000200010011000
100411432001100110001000100048011940520001000200010011000
100411802001100110001000100048011996320001000200010011000
100411902001100110001000100048011881120001000200010011000
100411482001100110001000100048011994520001000200010011000
100411332001100110001000100048011855920001000200010011000

Test 2: Latency 3->3

Code:

  str d0, [x6], #0x10

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code): 1.1344

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
1021414665206951051510180105161000344314192507201092001001020020020010007100000100
1020411225201051010510000101061000443638191672201122001001220020016010004100000100
1020411276201041010410000101041000243620191717201062001000820020016010004100000100
1020411245201051010510000101081000243629192067201062001000820020016010004100000100
1020411220201041010410000101041000243621192319201062001000820020024010005100000100
1020411313201041010410000101041000243622192465201062001000820020016010004100000100
1020411351201011010110000101041000243627193171201062001000820020024010005100000100
1020411263201051010510000101081003645124195903201792001005120020024010005100000100
1020411192201031010310000101041000243617203125201062001000820020012010003100000100
1020411610201011010110000101041000344338194208201092001001020020016010001100000100

1000 unrolls and 10 iterations

Result (median cycles for code): 1.1324

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
10034142592061110431101801043210003438051944342001920100102020000100011000010
10024113772001110011100001001010000431181939512001020100002020000100011000010
10024113652001110011100001001010000431141936812001020100002020000100011000010
10024113602001110011100001001010000431191938612001020100002020088100331000010
10024113912001110011100001001010000431131939152001020100002020000100011000010
10024113842001110011100001001010000431181937712001020100002020000100011000010
10024113972001110011100001001010000431161937532001020100002020000100011000010
10024113692001110011100001001010000431171938972001020100002020000100011000010
10024113852001110011100001001010000431211937352001020100002020000100011000010
10024113592001110011100001001010000431191941852001020100002020000100011000010

Test 3: throughput

Count: 8

Code:

  str d0, [x6], #0x10
  str d0, [x7], #0x10
  str d0, [x8], #0x10
  str d0, [x9], #0x10
  str d0, [x10], #0x10
  str d0, [x11], #0x10
  str d0, [x12], #0x10
  str d0, [x13], #0x10
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0006

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef)
80214821111606928051280180805118003824043213612600160182200800480200160016080005800000100
80204800561601058010580000801048000224031213600510160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100
8020480056160105801058000080104800352404231360737016017820280046027658174947375910548644214914706
80204800781601058010580000801048000224031213606070160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100
80204800481601058010580000801048000224031213600510160106200800080200160016080005800000100

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0012

retire uop (01) | cycle (02) | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | ? int output thing (e9) | ? ldst retires (ed) | ? int retires (ef)
8003482241160612804328018080432800362401551360994160088208005020160016800058000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160000800018000010
8002480056160011800118000080010800002400301360205160010208000020160096800378000010