Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (4H)

Test 1: uops

Code:

  ld1r { v0.4h }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.003

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire (01) | cycle (02) | 03 | 04 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6200528668214711111104105093282630116303200410031000100010005000119539226622824828273310200010001000100010002828728221116100110001000010042210030022100022311137471006371383211580199003324381911565728031145441312613887100010002845028472285192860828465
620042850021231112100800491628052011635220031004100010001000500011913822702281352831831020001000100010001000282302819911610011000100001002321002011210012121213735995771313430158196283376381821545827973149501298513882100010002839528235285092845328351
620042831921241110100410507328158001646420041003100010001000500011931322688282742852531020001000100010001000282242828211610011000100001001201003002110012131213764100397170335115919780334938084575828041150621274413203100010002834728525285052835628163
620042830421331201111410495128028011632320011001100010001000500011942922678282152830831020001000100010001000280072862711610011000100001003321004002210002231213398989370763212265197313302381114625927974147101256113507100010002849828544283412856628250
6200428371212312121003004966282180116449200110011000100010005000119134226162820928255310200010001000100010002820728454116100110001000110032210020021100121211139191005370993390464198043352380912585628001146871235713679100010002836328449285322834328380
620042833021231211100110505328252101646720031004100010001000500011940622648283162830231020001000100010001000282432825011610011000100001002221002000110012231113547986270173321065197393248380817636628102147641264113464100010002831228582283222830728332
6200428545212412121004105050281011116346200410031000100010005000119402226822818228545310200010001000100010002839128311116100110001000010022210020001100021211138741004970543254161196453150381817626327907152711296313655100010002839928647284502825728310
6200428333212412121004005090282641016362200410011000100010005000119428226262812128561310200010001000100010002828028305116100110001000010032210030011100021312137871013672133425260197263217380912645727897145901255413490100010002837628405282952834628503
62004282942123121310041049932815501165392004100110001000100050001193622268628204282963102000100010001000100028384284261161001100010001100223100200011000222111384299207175341735519739336738219645827974151921304013533100010002843628296282262841828379
6200428320211213121004105144281610116253200310031000100010005000119380227032825228294310200010001000100010002822828326116100110001000010033010011114100021311136511016571373402252196503276381118556627894148411268714022100010002830328262282182834528303

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.4h }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400571049000000001010014004413942712936525701024010020002100003010020000100001263803669402214310939114002701400351400541318003132389601003038910000200006020010000200001400511400511150201100991004010010000100001100100021010002001100001111132101139111395684000001010100001000040100140052140052140055140052140036
6020414003510490000000010000140036139406129363257010240100200021000030100200001000012640206693734143109391140030014005414003513179731323996042530200100002000060200100002000014005414005121502011009910040100100001000001001000001100000031000000000321011391113956540000131010100001000040100140042140061140061140042140058
602041403001050110000002000014005013940612936325701024010020000100003010020000100001263803669373414310939114002701400351400541318003132389601003020010000200006020010000200001400541400511150201100991004010010000100000100100023010003001100000111132101139111395504000010010100001000040100140052140055140036140055140055
602041400541049000000001000014004213943012937152701044010020002100003010020000100001263814669371714312591114005101400601400601318063132395601003020010000200006020010000200001400941400831150201100991004010010000100000100100000010000000100000010032101139111395654000010100100001000040100140061140058140061140061140061
602041400411049111100002010014003613940612936525701024010020002100003010020000100001263958669387814312488114003001400541400351318003132399601003020010000200006020010000200001400541400511150201100991004010010000100001100100000110000000100001010032101126111395464000801313100001000040100140055140055140052140036140052
6020414005410490000000040000140039139427129365257010040100200021000030100200001000012639586693878143087011140027014005414005413179731323896010030200100002000060200100002000014005414005111502021009910040100100001000001001000001100000001000010100321011271113955940000131013100001000040100140055140055140036140055140036
6020414005410490000000010100140052139456129371257010440100200041000030100200001000012639856694022143125911140033014006014006013180631323926010030200100002000060200100002000014005714005711502011009910040100100001000001001000001100000001000010100321011271113957040000131013100001000040100140061140061140042140042140061
602041401611049100000001000014002013940612934725701004010020000100003010020000100001264020669373414308701114003001400511400541318003132389601003020010000200006020010000200001400541400511150201100991004010010000100000100100000110001000100000010032101126101395594000010100100001000040100140036140052140036140052140055
6020414005910490000000010010014003613942712936345701024010020002100003010020000100001263958669387814308806114003001400511400541317933132389601003020010000200006020010000200001400511400351150201100991004010010000100000100100000110000000100001010032101139111395594000013130100001000040100140058140042140058140058140061
60204140057104911011000130101140036139427129365257010240100200021000030100200001000012639586692947143109391140027014005414003513180031323996010030200100002000060200100002000014003514005111502011009910040100100001000001001000201100000031000010100321011271113954640000131313100001000040100140052140055140052140052140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0053

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
600251401261048101002100114003813940012936425700144001020004100003001020000100001264507669383014326437114006314005314005313182103132436600103002010000200006002010000200001400491400561150021109104001010000100000101000321100020111000011112314061132213957240000666100001000040010140054140054140054140054140054
600241400531049111002000114002613940012936425700144001020004100003001020000100001264507669383014326437114002914005314005313182103132436600103002010000200006002010000200001400531400531150021109104001010000100000101000210100010141000011110314041132213957240000666100001000040010140054140054140054140042140054
6002414004110491010011100114002613946012936425700144001020004100003001020000100001264525669383014326437114002914005314005313182103132436600103002010000200006002010000200001400541400531150021109104001010000100000101000221100020111000001012314041132213956040000666100001000040010140054140054140054140054140054
6002414005310491110021001140038139400129364257001440010200071000030010200001000012643706694070143264371140029140053140041131809031324366001030020100002000060020100002000014005814005311500211091040010100001000001010001211000101101000011110314041132213957240000660100001000040010140042140042140054140054140054
600241400531049101002100114003813940012936425700144001020004100003001020000100001264507669383014326437114001714005314005313182103132436600103002010000200006002010000200001400571400531150021109104001010000100000101000411100010011000001110314041132213957240000600100001000040010140054140054140054140054140054
600241400531049111001100114003813940012936425700144001020004100003001020000100001264507669383014326437114002914005314005313182103132436600103002010000200006002010000200001400561400531150021109104001010000100000101000221100030111000011111314041132213957240000666100001000040010140054140054140054140054140054
600241400531049111002100114003813938812936425700124001020004100003001020000100001264507669383014326437114002914005314005313180903132436600103002010000200006002010000200001400561400531150021109104001010000100000101000331100010111000001111314041132213957240000660100001000040010140054140054140054140042140054
600241400531049111001100114003813940012936425700144001020004100003001020000100001264507669383014326437114001714005314005313182103132436600103002010000200006002010000200001400571400531150021109104001010000100000101000311100020111000011110314041702213957240000606100001000040010140042140042140042140042140054
60024140053104910000134100114003813940012936425700144001020004100003001020000100001264507669383014326437114001714005314005313182103132436600103002010000200006002010000200001400541400531150021109104001010000100000101000220100020111000011110314041132213957240000066100001000040010140054140054140054140054140145
600241400531049101002100114002613940012936425700144001020004100003001020000100001264507669383014326437114001714005314005313182103132436600103002010000200006002010000200001400531400531150021109104001010000100000101000311100021111000011112314041132213957240000066100001000040010140054140054140054140054140054

Test 3: throughput

Count: 8

Code:

  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  ld1r { v0.4h }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16020526739200100100780100267132121216251601451008004580000100800008000050011731831887343126709267082673767610367071601002008000080000200800008000026737267371180201100991001008000080000010080019194380058012618004061194319051101161126734130180000800001002671626738267162673826729
160204267282000000006000026713200025160145100800458000010080000800005001174628188733402670926708267376732036672160100200800008000020080000800002672826728118020110099100100800008000001008000004380039000398003961394300511011611267051010080000800001002672926729267092672926729
1602042672820000000163010126713012121625160100100800008000010080000800005001168754188357312670926728267376758036695160100200800008000020080000800002672826728118020110099100100800008000001008000004380000000398003961394300511011611267251010080000800001002672926729267292672926729
160204267282000000006301002671301212025160145100800458000010080000800005001173183188527102670926728267376766036694160100200800008000020080000800002672826728118020110099100100800008000001008026004380000000398003961394300511011611267051010080000800001002672926729267292670926729
16020426728200000100297010126713212121625160145100800458000010080000800005001168754188733402670926728267376750036692160100200800008000020080000800002672826708118020110099100100800008000001008000004380000000398003960394300511011611267251010080000800001002672926729267092670926729
16020426728200000000630100267130000251601001008004580000100800008000050011746281887387026709267282673767500366951601002008000080000200800008000026728267281180201100991001008000080000010080000008000000039800396139000511011611267251010080000800001002672926729267092672926709
1602042672820000000187000026713201202516014510080045800001008000080000500117462818840111266892672826737675003669616010020080000800002008000080000267282672811802011009910010080000800000100801300080039000398003961394300511011611267341313080000800001002673826716267382673826716
16020426737200101000258010326722277192516016510080065800001008000080000500116929518794150267182671526737665903670016010020080000800002008000080000267372673711802011009910010080000800000100800192043800592016080039615943191511011611267341313280000800001002673826738267382673826745
160204267372011111006901012671321212162516014510080045800001008000080000500117462818873341267092672826715675203679316010020080000800002008000080000267082672811802011009910010080000800000100800000438003900039800396039430051101161226708010080000800001002672926729267292672926729
1602042672820000000048001012671301212162516014510080045800001008000080000500117318318873401267092672826737675803669516010020080000800002008000080000267082672811802011009910010080000800000100800000438003900039800390104300511011611267251010080000800001002672926729267292672926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
16002526743200110200144310410426718018183251600731080063800001080000800005011730931885113126714267332673366770367221600102080000800002080000800002673326737118002110910108000080000010800212042800561016280000615642190502014163326734998000080000102673426738267202673426734
16002426733200111101064010226725218181625160072108006380000108000080000501169681188267702671926734269846680036713160010208000080000208000080000267332673311800211091010800008000001080020194180057111598003861574219150203163726730998000080000102673826734267382673826734
16002426733200110100077000226718218181525160073108006380000108000080000501167999188380112671826733267336677036761160010208000080000208000080000267332673311800211091010800008000001080022194280057200628003761574219150203163326730998000080000102673426734267162673426716
16002426733200100100065010326718218181525160073108006180000108000080000501175054188364502671426733267336677036773160010208000080000208000080189268962673311800211091010800008000001080020204280057110628003861574219250203164326730998000080000102673426716267342673426734
1600242673320010110001280103267182181815251600731080061800001080000800005011715701887298026726267332673367219736722160376208000080000208000080000267332673311800211091010800008000001080020190800190029558003800574219250203163326730998000080000102673426716267342673426734
16002426715200101100083010326700218181625160071108006280000108000080000501167530187790902671426733267336722036713160010208000080000208000080000267332673311800211091010800008000001080020194280056100598003801584219250203167326730998000080000102673426734267342673426734
16002426733200100000080010326718218181525160029108006180000108000080000501167530187790912671426733267336735036713160010208000080000208000080000267332673311800211091010800008000001080021214280058101248000061584219150203163326712908000080000102671626734267342673426734
16002426733200100100071010126700218181625160073108006380000108000080000501171570188267702671426733267336768036708160010208000080000208000080000267332673311800211091010800008000001080020194280057001218003861574219250203163326730998000080000102673426734267372673426734
160024267332001111000287010326718218181625160071108006280000108000080000501164410187754402671426733267336771036717160010208000080000208000080000267332673311800211091010800008000001080021194280058100588003961574219150203163326730098000080000102673426734267342673426734
1600242673320010100108301042671821818162516007110800628000010800008000050116753018758480267172673726938673703671316001020800008000020800008000026733267331180021109101080000800000108001919428005700259800386157019050203163326730998000080000102673426734269222678126777