Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1R (post-index, 16B)

Test 1: uops

Code:

  ld1r { v0.16b }, [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.002

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.002

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 3a | 3f | 43 | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
620052885322310304000040482228331000166743003100010031000100010001000500050011194215082257928595287453103000100010002000100028507286291161001100010001100002100000014210010100001316897326995320205420051323638022154542819510001558612477137541000100010002873628608288622880228698
620042876422300301000015047362845101016837300210001003100010001000100050005001119285002258628645287703103000100010002000100028579286811161001100010000100002100100014710012130001358595916989321305320071323938111746462821910001558212508138531000100010002874028727288322864728706
62004287152230040300003048272826700116557300310001002100010001000100050005017119386102266528583286863103000100010002000100028613286201161001100010000100002100100012710010120001319695257035322715820009323638111947442809010001530912666140061000100010002884828726286382858328766
620042880922400102000015047962828400116805300210001003100010001000100050005020119275082266328582286723103000100010002000100028545286151161001100010000100002100100013610002120001348296377055320805420103315038081651512813710001543612478139671000100010002861728715287512874228692
62004286942230030200002047132835300016715300310001002100010001000100050005000119245002264928613286643103000100010002000100128705286942161001100010000100003100000014410012120001336597046985323004720078321438122146452817410001535412553137241000100010002853528766286442863128650
62004287342230010500003048522842400116712300310001002100010001000100050005002119265002261328591287733103000100010002000100028614285941161001100010000100002100100016310012030001354096986961321005220204320538082148512819310001496212544138641000100010002871428716286602879428752
62004287472230020200002047482829600016637300310001002100010001000100050005004119275002266628648288153103000100010002000100028562286301161001100010000100002100100013810032130001338394596939316405720137319638072051492816110001524612497138331000100010002867828731286492870628619
62004287722230020200003049192835700116605300210001002100010001000100050005011119255002261128576286663103000100010002000100028645286631161001100010000100002100100015110002120001333994396906318815119986317238102250452817110001554312496137771000100010002864028592287712855028686
62004286572230020300002047042834900016642300310001003100010001000100050005020119235002261828541287033103000100010002000100028723287271161001100010000100002100100014110002130001347397347000325025220002319038122447512821310001556712461136871000100010002867128687287142858328765
62004287802220031400003048072833601016684300210001002100010001000100050005000119261008226402862228735310300010001000200010002858928652116100110001000010000210010007210012130001307396356962318515120045317238022251502817710001529412351135101000100010002859128685287282871728723

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.16b }, [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5e | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020514005310860000000000100001400391395812580104501002000410000401002000010000124572753048671071122400140036140058140060131974313243870100302001000020000602002000020000140057140057115020110099100401001000010000110010001211000200071000011000032126936613971850000141010100001000050100140056140055140052140058140055
6020414005410860000000000100001400361395782580102501002000210000401002000010000124560753045251071052200140030140054140055131968313243570100302001000020000602002000020000140035140054115020110099100401001000010000010010000011000000001000010100032126935613971850000131010100001000050100140055140036140055140055140055
60204140054108500000000001300001400361395752580102501002000010000401002000010000124567353046391071052200140027140054140055131968313243570100302001000020000602002000020000140054140051115020110099100401001000010000110010000011000002001000010100032123937713971850000131012100001000050100140055140055140036140055140055
60204140035108500000000001301001400391395752580102501002000210000401002000010000124567353046391071052200140030140054140054131968313243270100302001000020000602002000020000140054140051115020110099100401001000010000010010000011000001001000010100032126936313972150000131310100001000050100140055140036140052140055140052
6020414005510860000000100100001400391395752580102501002000010000401002000010000124567353046391070927600140030140054140054131968313243570100302001000020000602002000020000140054140051115020110099100401001000010000010010000011000001001000010100032126933613971850000131310100001000050100140052140057140055140052140055
602041400351086000000110010000140036139556258010250100200021000040100200001000012456075304639107159620014003014014214005413194931324167010030200100002008160200200822000014003514013711502011009910040100100001000001001000001100060020100001010003212793661397185000013010100001000050100140150140055140055140142140055
60204140147108500010100011000014003913957525801185010020002100014010020079100001248506530463910713853001400881400541401251320151413243570100302001000020081604902008020000140132140051115020110099100401001000010000110010002011000000031000010100032126107661397185001001310100001000050100140036140142140055140153140141
60204140147108500000000011330000140039139575258011950100200021000040100200001000012456735304639107106000014003014003514005413196831324327010030200100002000060200200002000014005414005111502011009910040100100001000001001000001100000000100001010003212393361397185000001010100001000050100140055140052140055140055140036
60204140054108500000000011880001400391399305180118501102000310000401002015810040125437953046391073196800140354140035140367131970313243970100302001000020000602002000020000140055140052115020110099100401001000010000010010000011000002091000010100032126936613971850000131510100001000050100140037140055140053140150140055
6020414005211340000000019022001114003913957525801025010020002100004038220000100001245682530467810710522001400301401181400541319682413243370100304431000020000602002000020000140152140140115020110099100401001000010000110010000001000302031000110100032123108731397805001001310100001000050100140058140036140057140260140055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0056

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 1e | 1f | 23 | 3a | 3f | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60025140053108510101000100014004101396572580014500102000410000400102000010000124584253076181071757700140032014005614004113200431324617001030020100002000060020200002000014005614004111500211091040010100001000001010001111000101110000111103141003884313972450000760100001000050010140042140043140042140057140057
60024140056108610101000100014004201396612580014500102000410000400102000010000124582153071101071811200140035014005614005313200431324567001030020100002000060020200002000014005814005311500211091040010100001000001010002211000201110000110103141003884413972750000969100001000050010140057140057140057140057140042
600241400561086101001001700114004101396452580012500102000410000400102000010000124584253077321071745400140032014005314005313200531324647001030020100002000060020200002000014005614004111500211091040010100001000001010001211000100110000011123141004874413971250000969100001000050010140042140057140057140057140057
600251400561086101010001400014002901396602580014500102000210000400102000010000124584253077321071745410140017014005714004113199031324647001030020100002000060020200002000014005614004211500211091040010100001000001010002101000200110000010113141003884413972750000999100001000050010140042140042140058140057140042
600241400411086101011002000140026013966025800145001020004100004001020000100001245842530773210717577101400170140041140041132004313245070010300201000020000600202000020000140056140056115002110910400101000010000010100011110003004100001111031410051383313972850000060100001000050010140057140042140042140042140057
600241400561085100001101400114004101396572580014500102000410000400102000010000124582153071101071745400140032014005314004113200431324617029830020100002000060020200002000014005614005511500211091040010100001000001010001111000201110000111113141003873313972750000009100001000050010140058140059140057140057140057
60024140056108610101110200014004301396452580014500102000410000400102000010000124584853071101071757700140032014005614005613200431324647001030020100002000060020200002000014004614005311500211091040010100001000001010002201000220410000111103141005884413972750000999100001000050010140057140057140057140055140054
6002414005610861000000026635200140222013969978800565004020011100034227722301111461306864535779110829940001401820140311140330132076351326057052730267101222024460752202442016114033214023151500211091040010100001000001010004711000503643610005110103141003883313971250000999100001000050010140057140057140054140068140057
600241400411086100000002000140026013967625800145001020002100004001020000100001245815530785210717811001400330140041140041132004313246470010300201000020000600202000020000140041140041115002110910400101000010000010100013110001004100001101031650041223313972450000999100001000050010140057140042140057140057140061
600241400561086101000001300014004101396602580014500102000410000400102000010000124584253095041072130200140032014004114005713200431324507001030020100002000060020200002000014005614005311500211091040010100001000001010001111000100110000111103141006873313972750018999100001000050010140057140042140042140042140057

Test 3: throughput

Count: 8

Code:

  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  ld1r { v0.16b }, [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602058004162001000000310100800261660252401218010080021800008010080000800004359002375882449187308002280041800415992435999924010020080000800002001600008014480041800411180201100991001008000080000010080000014800131013800180110230051101161180038180000968000080000801008004280042800428004280042
16020480041620100000001801008002616010252401188010080018800008010080144800004358986375882449187328002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080007823800250025800190010170051101161180038080000998000080000801008017980042800428004280042
16020480041621000000001801008016216662524011980100800198000080100800008013743590023758824491867180022800418004159924245999924010020080000800002001600008000080041800411180201100991001008000080000110080000614800271034800136112146051101251180038180000998000080000801008004280042800428004280042
1602048004162100001000190100800261664252401318010080018800008010080000800004359002375882449186668002280176800415992435999924010020080000800002001600008000080041800411180202100991001008000080000010080000017800150013800136110170051101161180038080000968000080000801008004280042800428004280042
160204800416210100000020100800261664252401008019280018800008010080000800004358982375882449186668002280041800415992435999924010020080000800002001602888000080172800411180201100991001008000080000010080007723800250126800190013170051101161180038180000968000080000801008004280042800428004280042
1602048004162000000000190100800261065252401198010080031800008010080000800004358978376303249186778002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080008723800070029800190113170051101161180038080000968000080000801008017580042800428004280042
1602048004162000000000190000801601665252401008019280019800008010080144800004359010375882449186718002280041800415992435999924010020080000800002001600008000080041800411180201100991001008000080000010080000017800120012800136113170051101161180038180000968000080000801008004280042800428004280042
1602048004162100000010190010800261666775246724823598198280093801008000080000435899837588244918892800228004180041599243599992405252008000080000200160000800008004180041118020110099100100800008000011008000001480025000800126125170051101251180159180000998000080000801008004280042800428004280042
160204800416200000010030001080026166625240119801008001980000801008000080000435899837588244918669800228004180041599243599992401002008000080000200160000800008004180041118020110099100100800008000001008000862380026006800196012170051101251180038080000998000080000801008004280042800428004280042
16020480041620000000003300108002606662524011880100800198000080100800008000043590063758824491866680022800418004159924245999924010020080000800002001600008000080041800411180201100991001008000080000110080000014801901013800136113170051101251180038080000968000080000801008004280042800428004280042

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 1.0005

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 67 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch call (8e) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160025800415990000000003100080026166102524002980010800198000080010800008000043584293758767491846700800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800000108000001480013001380010001017050203916193980038080000668000080000800108004280042800428004280042
16002480041620000000000430008002616682524002980010800008000080010800008000043578253758824491853900800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800000108000001680010001680010011317050201516372180038080000998000080000800108004280042800428004280042
1600248004162000000000015000800261664252400288001080021800008001080000800004358249375882449183860080022800418004159946360021240010208000080000201603068000080041801871180021100910108000080000010800000148001010108001060100050203916382280038180000998000080000800108004280042800428004280042
16002480041621000000000300008002616602524002980010800198000080010800008000043582613758824491856300800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800000108000001480013001080009611017050201934223680038080000668000080000800108004280042800428004280042
1600248004162000000000017000800261666252400298001080000800008001080000800004357425375878949185350080022800418004159946360021240010208000080000201600008000080041800411180021100910108000080000010800000148001300980000001317050203616414180038080000668000080000800108004280042800428004280042
16002480041620000000100180008002616642524002880010800168000080010800008000043579533758822491854400800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800001108000001480013101680009611017050204116203780038080000908000080000800108004280042800428004280042
16002480041621000000000160008002616682524002980010800198000080010800008000043579253758785491855300800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800000108000001480012001380009601214050203916374180038180000968000080000800108017780042801778004280042
160024800416210000000002801080026106525240026800108001980000800108000080000435794137587794918387008002280041800415994636002124001020800008000020160000800008004180041118002110091010800008000001080000008001300138001000917050203816383980038080000998000080000800108004280042800428004280042
16002480041620000001000000080026166525240026800108001580000800108000080000435736137588144918532008002280041800415994636002124001020800008000020160000800008004180041118002110091010800008000001080000014800100098035761014050201916424080038180000668000080000800108004280042800428004280042
16002480041622000000000180108002616042524002980010800168000080010800008000043574053758813491837600800228004180041599463600212400102080000800002016000080000800418004111800211009101080000800000108000001480010201280010001317050204116194080038180000968000080000800108004280042800428004280042