Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD3 (single structure, B)

Test 1: uops

Code:

  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 4.000

Issues: 4.009

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 3.009

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 1e | 22 | 24 | 3a | 3f | 43 | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6b | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6400528469213360250000310508728128102161174009300910003000100050003569492278402833128330310400010003000100060002820228163116100110001000110004100102100122301395910281687130771479197323103381015556027936144071240713255100030002878128306284172826428282
6400428317211320250110410486828095111165484009300910003000100050013576052290902811828569310400010003000100060002813528324116100110001000010004100201100231301379810156720634221260194863274380112546127860147051309113171100030002825028295282672834628315
6400428330213230280000400480528331102165304009300910003000100050003576511228780281222864631040001000300010006000281302819311610011000100001000010020210023232137331015971943495953192403453380410565327851140541234712724100030002818128209282452822028288
640042833421231025000040050032809610016453400930091000300010005001357749228880285302837331040001000300010006000281722826111610011000100001000410020110022230136251025070893343757191163430380310555627854154551240113221100030002834428383282092818328215
640042835021533020000041050552806511216134400330031000300010005001356591222827028195286913104000100030001000600028211282821161001100010000100041002011002223013361999371613088961192893382380410535827961148781250313446100030002822728188283832812328328
6400428352213180280000410503828386100161674003300910003000100050003563772283602826328255310400010003000100060002809528169116100110001000010004100201100232301412510064692734251255191993348379611585727831145201219212983100030002820728144282012860928619
640042826621132033000041050062809011216101400930091000300010005001357707229063283172835431040001000300010006000281572829911610011000100001000410020210022230137229937709634021254192803134380518625727927146901230313256100030002823128227284322836128192
640042877221131019000070052332799211216177400330091000300010005001357691022870028345283163104000100030001000600028228283351161001100010000100001001021001023013686100807139310186319331325438009566328250154101312413015100030002824528327283622826528451
640042837321129025000041051112810511116045400930031000300010005000356313228740282902831831040001000300010006000282132830511610011000100001000010021210022130137669964720732011260196643156380312576627968144591270013292100030002832128314285992870928656
64004283632123102701101105157280621101599540093009100030001000500035751822849028193283303104000100030001000600028172282151161001100010000100041002221001312013924102247109334785819221335538066566127947147171240713159100030002819128339282622828028661

Test 2: throughput

Count: 8

Code:

  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  ld3 { v0.b, v1.b, v2.b }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
32020516006511990111100200000116005000015988725320106100240012800001002400008000050040005022794693016004601600651600657991403800383201002008000024000020080000480000160059160041118020110099100100800008000011008000880800320230800236113000005110117111600621013280000240000100160047160066160066160047160047
320204160046119811000003501001160050166159887253201121002400128000010024000080000500400054227907630160046016006516006579914038002832010020080000240000200800004800001600461600651180201100991001008000080000110080007728800600230800236129277100511011711160062100080000240000100160047160066160066160066160066
320204160065119911100007010011600500661598872532011210024001280000100240000800005004000542279076301600460160065160065799140380047320100200800002400002008000048000016004616004611802011009910010080000800000100800088288000712308002300310610051101171116006211313280000240000100160066160066160047160066160066
320204160065119811100007000001600501601598872532011210024001280000100240000800005004000002279469301600460160065160046799330380047320326200800002400002008000048000016006516006511802011009910010080000800000100800088288003001318002361310610051101171116006211313080000240000100160066160047160066160047160047
32020416006511981100000400000116003116615988725320112100240012800001002400008000050040004822790763016002701600651600467991403800473201002008000024000020080000480000160065160046118020110099100100800008000001008000882780030013080000603127600051101171116004311313280000240000100160066160066160047160066160066
320204160065119911110006010011600501061598872532011210024001280000100240000800005004000482279076301600460160046160065799140380028320100200800002400002008000048000016006516004611802011009910010080000800000100800099080031002238002361310610051101171116006211313280000240000100160066160066160047160047160047
320204160065119911100007010001601010661598872532011210024000680000100240000800005004000482279076301600460160065160065799330380028320100200800002400002008000048000016006516006511802011009910010080000800000100800606288000701338002360727700051101171116006211313280000240000100160066160066160047160066160066
3202041600461198111000036010001600501661598872532011210024001280000100240000800005004000232279469301600460160046160065799140380047320100200800002400002008000048000016006516006511802011009910010080000800000100800077288000602308002460302871005110117111600621013280000240000100160047160068160158160687160066
32020416006511991110000350100016003116615988725320106100240012800001002400008000050040015522790763016004601600651600467990003800473201002008000024000020080000480000160065160097118020110099100100800008000001008000972780009019718002401312860005110117111600620130280000240000100160066160047160047160047160047
3202041600651199101110070000116005016015985325320106100240006800001002400008000050040016222794693016002701600651600657991403800473201002008000024000020080000480000160065160065118020110099100100800008000001008000770800310068002361290710051101171116006211313080000240000100160047160047160066160066160047

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cd | cf | d0 | icache miss (d3) | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
3200251600651199142010000048010021626581661598381643244341024173481914102400008000050400009227893420116003701600561600567992403800383258862080000240000208000048000016005616005661800211091010800008000011082064628800300017800006030063000528200241881818160053001328000024000010160066160565161531161645161627
320024161935121311003000636010001600260661598387232001610240006800001024000080000504000482279469301160046016006616004679914038004732001020800002400002080000480000160065160046118002110910108000080000110800087278000901231800230129071000502000817918160062013008000024000010160042160060160060160057160042
3200241600561199000000000070000016002610615987325320110102400008005810240000800005040001222793568011600400160113160041799240380038320010208000024000020800584800001601161600561180021109101080000800000108006101882337010973800000000020005022001234119271600380104608000024000010162497162248162315161409160066
320024160046119910002100012001000162300160159887823323784102415248197210246216823785041404222790763011609260162157162324786460380012320010208255224800420826104960081626871626582180021109101080000800001108000001880000020378682565611418000005062063161718716003801131028000024000010160060160042160060160042160042
32002416005712410010410041483500100160041166159838253200101024000680000102400008000050400598227929650116009101600561600567992773800223200102080000240000208000048000016005916005611800211091010800008000001080000018800140001880000011422000005020001817718160056171028000024000010160060160060160060160042160057
3200241600561198000000000020001001603771061598382532001610240006800001024000080000504000222279296501160022016005616005979927038002332001020800002400002080000480000160059160056118002110910108000080000010800000080018000178001861142200000502000717918160053187028000024000010160060160060160042160057160042
320024160041119900000000000000001600441661598762532001610240000800001024000080000504000242279141901160027016006516006579933038004732001020800002401892080000480000160054160155218002110910108000080000010800000158000000008001400101500000502000181718716003816628000024000010160042160056160055160056160056
320024160041119900000000002000000160039166159861253200161024000080000102400008000050400000227923660016003601600411600557992303800373200102080000240000208000048000016005516005411800211091010800008000001080000008001402014800146013000000502000181718816005110628000024000010160055160042160055160042160042
320024160054119900000000000000001600401661598722532001610240006800001024000080000504000082279236601160036016005516005579923038003732001020800002400002080000480000160041160054118002110910108000080000010800000158000000017800006114150000050200061791816003806028000024000010160057160056160042160105160056
320024160055140000000011001700000160040160159872253200101024000680000102400008000050400007227893420016003601600541600557990903800373200102080000240000208000048000016004116004111800211091010800008000011080000015800000001480000016237100050200071771716005919928000024000010160063160063160063160063160047