Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD4 (single, post-index, H)

Test 1: uops

Code:

  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 5.000

Issues: 6.011

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 4.011

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
650052876221531300110031005103279261016247601110004004100010004000100050005000476066102297228451281763106000100040002000800028263283581161001100010000100133100400121001222101402595437187348605419045345938181249522776610001522712569126331000400010002820628127284872808528135
6500427979211011101100110052142791711162276011100040111000100040001000500050004759260022943281162808031060001000400020008000281362825611610011000100001003131003000110010121114029104327337320404319185339838121645462811810001419811776124461000400010002819728526280682806228284
650042803221001010010081005239278381116251601110004011100010004000100050005000475965102296228187279953106000100040002000800028138283131161001100010000100333100400321001223111396110455723034890421929432073817947462785510001493511760123131000400010002809628055281262806728091
6500428326211011100100400051042827611157066011100040111000100040001000500050004759671922944283132811531060001000400020008000282892823311610011000100001002301003001110012201114245101107324334904419197329838121554482780310001402611802121631000400010002808328522281712817928100
650042813621021110010140005200279151115894600410004011100010004000100050005000476006002295227991281543106000100040002000800028291281561161001100010000100223100200121001223121390496557308322004419190324138121147492773010001403011668126431000400010002808428102279942810628112
6500428026211110101000310052332792311162776011100040111000100040001000500050004757580822985283962801131060001000400020008000284272818311610011000100001002131001001110002221213979102926994350004519424338038141645412771210001395111784124011000400010002817128200280772859928070
650042811321101010110041005047279470115691601110004004100010004000100050005000474527082300028121281913106000100040002000800828361281001161001100010000100323100300011000223111401010301727434750461919434323810952492817910001415011841136091000400010002824728066281162812628079
6500428179213010000100300051592827710158006011100040111000100040001000500050004746470022972280582807931060001000400020008000283832814611610011000100001001221003000510012231114107103177179343003919333321038131345372777110001382611861124201000400010002817928782281322813528101
6500428081209011100100410051502838511158686004100040111000100040001000500050004745241822979281302820131060001000400020008000283162840711610011000100001002231004012110012201113978105707303338104719110337338161338412775410001487612042125341000400010002805028424284662834028075
6500428155210211100110410051912829401157066004100040111000100040001000500050004745040822969284992818031060001000400020008000282562812311610011000100001002201002001210002231113416105217317353303519016340238131245422779710001409811871126491000400010002804228120284712808828219

Test 2: throughput

Count: 8

Code:

  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int store (96) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40020516006511991000000034010011600501667996525480116801003200168000080100320000800004804994000812271748011600210160056160040799500380047480100200800003200002001600006400001600651600651180201100991000100800008000001008000001880013000800006029276005110117111600621800000138000032000080100160047160066160047160066160066
400204160046119910110000601000160041166799562548010080100320008800008010032000080000480499400057227160441160037016005616005679950038003848010020080000320000200160000640000160056160056118020110099100010080000800000100800000188001400148005561141800051101171116005308000010108000032000080100160041160041160041160057160041
400204160056119900001100001000160025066799562548010880100320008800008010032000080000480499400064227126921160021016004016005679934038003848010020080000320000200160000640000160056160040218020110099100010080000800000100800000188000010183800570101800051101351116013318000010108000032000080100160057160041160057160057160057
400204160056119800000100200100016003110079946254801088010032000880000801003200008000048049940008622714072116002701601011600657995003800284801002008000032000020016000064000016006516006511802011009910001008000080000010080007708002800680023016265105110117111600431800001308000032000080100160047160047160066160047160057
400204160056119800000000200100016002500679940254801088010032000080000801003200008000048049940006122712692016002101600561600407993403800474801002008000032000020016000064000016004616006511802011009910001008000080000010080005526800060268002360605105110117111600431800001308000032000080100160041160057160057160041160057
4002041600561198000000002001001160050166799462548010880100320016800008010032000080000480499400081227140721160046016011916005679950038004748010020080000320000200160000640000160065160100118020110099100010080000800000100800000188001400148001360141800051101171116004318000013138000032000080100160066160066160066160066160066
400204160065119811100000350100016004116679956254801088010032000880000801003200008000048049940005722716044116002131600401600567993403800474801002008000032000020016000064000016004616004611802011009910001008000080000010080007608002901298000001141800051101171116006208000013138000032000080100160041160041160057160041160041
40020416005611990000010000100016004116679956254801088010032000880000801003200008000048049940006422712692116003701600561600567995003800384801002008000032000020016000064000016005616004011802011009910001008000080000010080000008000000148000060000005110117111600370800001008000032000080100160041160057160057160041160041
40020416011011980000000020010001600410667995625480108801003200088000080100320000800004804994000612271269211600370160056160040799340380038480100200800003200002001600006400001600561600561180201100991000100800008000001008000001880000001380014610000051101171116005308000010108000032000080100160057160057160041160057160041
40020416004011980000000020010001600411007994025480100801003200088000080100320000800004804994000642271269211600910160040160040799500380038480100200800003200002001600006400001600401600561180201100991000100800008000011008000000800131013800146014180005110126111600530800001008000032000080100160041160057160041160041160041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | simd prf full (72) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d0 | d5 | map dispatch bubble (d6) | da | db | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
40002516006511981000110026200000160158066799562548001080010320000800008001032000080000480049400091227174801600461600651600657995903800384802762080045320000201600006400001600561600561180021109101080000800001080000008001620148001361280600005020012170081016003718000013108000032000080010160060160060160060160060160060
400024160171120001000132190000016004416679940254800108001032000880000800583200008000048031840007222716044160037160059160059799500380047480010208000032000020160000640000160065160065118002110910108000080000108000782780006003280022012906100050200917009916006218000013138000032000080010160066160066160066160047160066
4000241600651199101100009001000160031106799652548002680010320016800008001032000080000480049400081227174801600461600651600657995903800474800102080000320000201600006400001600651600651180021109101080000800001080007727800304129800226052760000502007170091116006218000013138000032000080010160066160066160066160066160066
400024160065119910000000340100016005016679965254800188001032000880000800103200008000048004940007922714072160027160065160065799590380047480010208000032000020160000640000160046160065118002110910108000080000108000660800270129800226028276100050200111700961600621800001308000032000080010160066160047160066160066160066
400024160065119810110000340100016005006679965254800188001032001680000800583200008000048004940008722717480160046160065160065799590380047480010208000032000020160000640000160065160065118002110910108000080000108000772780007002880023016266000050200101700791600620800001308000032000080010160066160066160066160066160066
400024160046119910100002940000016005006079965254800268001032001680000800103200008000048004940008122714072160046160065160065799590380028480010208004532000020160000640000160065160046118002110910108000080000108000760800290128800006028265000050200121700971600431800000138000032000080010160066160066160066160066160047
400024160065119910010000360000011600501607994625480026800103200168000080055320000800004800494000792271748016002716004616006579959038004748001020800003200002016000064000016006516006511800211091010800008000010800076278000700680022612827620005020071700712160062080000008000032000080010160066160047160066160066160066
4000241600651199101000001060000016005016079965254800268001032001680000800583200008000048004940005822717480160046160065160065799590380028480010208000032000020160000640000160065160065118002110910108000080000108000652680028006800236129265200050200817009716004318000013138000032000080010160066160047160047160066160066
40002416006511981011000051301000160031166799652548002680010320064800008001032000080000480049400059227174801600461600651600657993803800474800102080000320000201600006400001600651600461180021109101080000800001080006627800290032800236028061000502001017009716006218000013138000032000080010160047160066160066160047160047
40002416006511991000000035000001600501667996525480026800103200168000080010320000800004800494000852271407216004616004616006579959038002848001020800003200002016000064000016004616006511800211091010800008000010800056278002902318002300282761000502008170091016006208000013138000032000080010160047160066160066160047160066