Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (16B)

Test 1: uops

Code:

  ld1r { v0.16b }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.004

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.004

retire (01) | cycle (02) | 03 | 04 | 05 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
620052866021321012000003010517027818111603420041004100010001000500011933522630280342856031020001000100010001000284562853911610011000100011003431002002100020121113421995972903345043194493566381311414127795139781211912917100010002805828415279642793228181
620042864721101311101004000481128171101642520031003100010001000500011943222635284262840931020001000100010001000281552842411610011000100011001231002001100120231213403976572173446044200443379381412424227785149031249212900100010002811428660283942810528073
620042817021101010100004010520328330011630620031003100010001000500011946922667282852800031020001000100010001000281022855811610011000100001003231003022100120231113584983470853406035200173478380912383927816140891220813111100010002803628162284912862228182
620042801721201010100004010497228085111599920041004100010001000500011946922693279652810331020001000100010001000281702799611610011000100001003231004022100120231213300980272183405045199823457381213423827793152051304612949100010002820528056286212828228230
620042811321001011100004010473928226011663820041004100010001000500011927722676280812857431020001000100010001000280632852911610011000100001002231003011100120221213798966069913408044199493183382112363427821138701305912807100010002818228587280652808928005
62004282702110101010000301051702796811161382004100310001000100050001194213227102803328224310200010001000100010002812428060116100110001000010033310040221001202212133151003270873433241194833472381811434427877136811291712811100010002846828079280942793528383
6200428401210010111000030004756277571116162200410041000100010005000119451226722810928065310200010001000100010002855328503116100110001000010021310030121001202311135521023373493480039196793279380611403927966140791292414316100010002854428261285782863027933
620042814321101010100004000487028134101658420041003100010001000500011943322692281072860731020001000100010001000284242818611610011000100001002331004022100120231213447935969513346047195633444382018363927828150861230213198100010002855528441285512802928154
620042811120901211100004010494628267011593920051003100010001000500011928422648278562855031020001000100010001000282742847211610011000100001003221003012100120231213682966272673352043196093515381515423427783154991254112931100010002817028097280942846127982
62004281322100111110000400052232783310162292003100410001000100050001192842270028093285893102000100010001000100028187283491161001100010001100332100302110002023121319399437109325603719264345838139393728039139661240512582100010002839528701280672810228063

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.16b }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 09 | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400511048211100140036139411129363257010240100200021000030100200001000012640206693734143087011140027014005114005413179303132399601003020010000200006020010000200001400351400511150201100991004010010000100000100100000110000000110000110321021261113955940000101010100001000040100140052140052140052140052140052
60204140056104900010014003613941112936325701024010020000100003010020000100001264020669373414310939014002701400511400511317970313239960100302001000020000602001000020000140051140051115020110099100401001000010000010010000011000501243331000301032101126111395654000010100100001000040100140052140052140052140052140036
6020414005310490000101400361394111293632570102401002000010000301002000010000126402066937341431248601400270140035140051131797031323826010030200100002000060200100002000014005414005211502011009910040100100001000001001000601100030227856100001103210132111395594000001310100001000040100140052140052140036140052140055
602041400581049000131014003913941112934725701024010020002100003010020000100001264020669373414310939114002701400511400351317970313239960100303641000020000602001000020000140054140054115020110099100401001000010000010010000011000000001000101032101126111395594000010100100001000040100140036140052140052140052140036
602041400511049000110140036139411129363257010240100200021000030100200001000012639586693734143109391140011014003514005113179703132399601003020010000200006020010000200001400511400511150201100991004010010000100000100100000110000000010000110321011261113955940000101010100001000040100140036140052140052140036140055
60204140051105000016001400201394111293632570100401002000210000301002000010000126395866929471431093911400110140051140035131793031323826010030200100002000060200100002000014003514005111502011009910040100100001000001001000001100000002071000000032101139111395594000010010100001000040100140052140055140036140055140036
6020414005110490001011401551394111293632570102401002000010000301002000010000126380366937341431093911400300140035140051131839031323896010030200100002000060200100002000014005514003511502011009910040100100001000001001000001100000000100001103210112611139559400000100100001000040100140052140052140036140052140052
60204140035104900010114002013940612934725701004010020002100003010020000100001264020669373414310939114001101400511400511317970313239960100302001000020000602001000020000140054140035115020110099100401001000010000110010000001000001001000011032101171113954640000101010100001000040100140036140054140052140052140052
602041400351049000101140020139411129363257010240100200021000030100200001000012640206692947143109391140030014005114005113179703132399601003020010000200006020010000200001400511400511150201100991004010010000100000100100000110000010010000110321011261113955940000101010100001000040100140052140052140052140052140036
60204140051104900010114003613940612936325701024010020002100003010020000100001264020669294714310939114002701400351400541317970313238260100302001000020000602001000020000140035140051115020110099100401001000010000010010000011000000001000010032101126111395594000010010100001000040100140036140052140052140036140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0a | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | da | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002514005810490000000060100140036139398129363257001240010200021000030010200001000012644296693734143261850140011140051140035131915313248360010300201000020000600201000020000140051140051115002110910400101000010000010100000010000010010000101000031401111307813955440000101010100001000040010140052140052140036140052140036
600241400571049111010001010014003613938812947625700144001020004100003001020000100001264488669373414326185114009614005314013813181931324346001030182102122000060020100002000014005114003611500211091040010100001000001010000001000610111000010100003140101130121713955440000101010100001000040010140052140052140052140052140058
6002414005110480000000014010014002013940412936925700124001020004100033001020000100001271524671071714321442114008814005114005113180362132434600103002010213200006002010000200001400511404822150021109104001010000100001101000001100000100100001010000316711113010713955440000101010100001000040010140052140052140052140052140053
60024140051104900100001601001400391394001293472570010400102000210000300102000010000126448866937341432144211400581400511400511318193132424600103002010000200006002010000200001400411400571150021109104001010000100000101000001100000003100001010400314012113012813957040000101010100001000040010140036140097140052140036140052
6002414003510490000000015000014002013939812936425700124001020000100003001020000100001264429669294714325254114004314005114005113189831324346001030020100002000061316100002000014005114003511500211091040010100001000001010000011000004001000010100003140121350121213957040000101010100001000040010140036140052140036140052140036
60024140035104900000011100001400361393981293632570012400102000210000300102000010000126448866937341432618501400581404181400521318193132435600103002010000200006002010000200001400351400511150021109104001010000100000101000001100000003100011010000314010111071213957040000101010100001000040010140052140036140036140052140052
600241400511049000000007010014003613939812936325700124001020000100063001020000100001264488669294714326185014006014003514005113181931324406001030020102162000060020100002000014005714010811500211091040010100001000011010002111000000001000011100003140101110971395704000010010100001000040010140052140052140052140052140052
60024140051105200000000143521001400201393981293632570012400102000210000300102000010000126448866937341432618511400541400511400511318253132434600103002010000200006002010000200001400351400511150021109104001010000100000101000001100000000100001010000314013111014121395704000001010100001000040010140052140036140036140036140052
6002414003510490000000010100140036139398129363257001240010200021000030010200001000012644886693734143261851140069140051140051131803313243460010300201000020000600201000020000140051140035515002110910400101000010000010100032110000000310000101000031401111301061395544000001010100001000040010140036140042140058140052140052
60024140051104900000000100001400201393941293632570012400102000210000300102000010000126448866929471432144211400311400511400511318203132440600103002010000200006002010000200001400511400351150021109104001010000100000101000001100010000100000000000314011113071213986840000101010100001000040010140052140052140052140052140052

Test 3: throughput

Count: 8

Code:

  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  ld1r { v0.16b }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | interrupt pending (6c) | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602052673920010011100660003267222072025160165100800648000010080000800005001167722188640312672002674326737665903669516010020080000800002008000080000267372671511802011009901001008000080000100800212143800191006080040615843190511021622267341313280000800001002673826738267382673826738
160204267372001100110066010326722077125160165100800628000010080000800005001167371188384912672502673726739665903669516010020080000800002008000080000267372673711802011009901001008000080000100800191943800192026180040615843192511021622267341313080000800001002675626716267382673826738
16020426737200111111002100032670027002516011910080065800001008000080000500116737118866431267320267372673766590366731601002008000080000200800008000026715267151180201100990100100800008000010080020224580061102608000061594319051102162226734130180000800001002673826716267382673826738
16020426737200101010006701032672237012516016410080019800001008000080000500116812518794151267260267432673766590366951601002008000080000200800008000026737267171180201100990100100800008000010080020194380059000608000001584319151102162226713130180000800001002674326718267462674226749
16020426741200121110045488103267222772325160163100800658013010080000800005001167922188783412685502739226724666903669516010020080000800002008000080000267412674111802011009901001008000080000100800201943800190212580039615843190511021622267341313280000800001002675326756267382674926742
160204267552011111100011201032673300921251601651008006580000100800008000050011701791887284126728026721267166659233669816010020080000800002008000080191267452674511802011009901001008000080000100800191943800591002180040611943190511021622267121313180000800001002671626741267162674126738
1602042671520011111000670002267612702025160163100800198000010080000800005001173027187941512674402674126737666303670016010020080000800002008000080000267152673711802011009901001008000080000100800202143800600016380040615843190511021622267341313180000800001002675326748267462674226738
1602042673720011011000670002267242771825160118100800638000010080000800005001167722187829412671802673826737665903669516010020080000800002008000080000267372673711802011009901001008000080000100800192043800591116680000615943190511021622267381313280000800001002674026738267402673826740
16020426737200111010007501032672207702516016510080065800001008000080000500117032318853691267180267372673766590366951601002008000080000200800008000026737267151180201100990100100800008000010080020214380019002648004061584319051102162226734013180000800001002671626716267382673826738
16020426715200111110006700032672237702516016310080019800001008000080000500116829118776411268260267392674566660366951601002008019280000200800008000026737267371180201100990100100800008000010080019200800190012180040615943190511021622267341313280000800001002673826738267162671626738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e7 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025267292011011090101267172011925160010108004480000108000080000501174628188334300267262670826708667636712160010208000080000208000080000267282670811800211091010800008000011080000043080039013980039613943050201416101126705014008000080000102673326733267292672926733
1600242670820000110440012671721102516001010800448000010800008000050116908518801700026715267322673266763671216001020800008000020800008000026708267281180021109101080000800000108000004308003900398000060044050208161111267290101008000080000102673326733267292673326709
16002426732200001000001267170101625160010108000080000108000080000501169085188521400267102670826733667336713160010208000080000208000080000267322672811800211091010800008000001080000043080039010800380004405020101611926729001438000080000102673326709267092670926709
160024267082000010000012669320119251600541080000800001080000800005011748871884032002670926732267326677366881600102080000800002080000800002673226728118002110910108000080000010800000430800380138800396104405020816910267290141408000080000102673326709267092670926733
16002426728200001000001267172111925160054108000080000108000080000501174887188595200267212670826732667636688160010208000080000208000080000267082670811800211091010800008000001080000000800380038800386004405020816101026705001438000080000102673326709267362673326733
160024267282000011044001266932111925160010108004480000108000080000501174887188796100266892673226708667636712160010208000080000208000080000267322670811800211091010800008000001080000043080039003880000613944050201216111026725014008000080000102673326733267092670926733
160024267082000010044100267172010251600101080044800001080000800005011686271884032002668926732267086676366881600102080000800002080000800002673226728118002110910108000080000010800000430800380008003801390050201116101126729014008000080000102670926709267332670926733
160024267322000011044001268810111625160054108004480000108000080000501169085188403200267102670826732667536688160010208000080000208000080000267322673011800211091010800008000001080000000800380038800396139430502010161010267250141008000080000102673326709267332673326709
1600242673220000000440012669320119251600541080044800001080000800005011688801885952002671226708267286653367121600102080000800002080000800002673226708118002110910108000080000010800000430800000008003860043050201016910267290101008000080000102680726739267092673326737
1600242673220000100441012671701002516005410800008000010800008000050116888018879610026698267082673266533668816001020800008000020800008000026728267281180021109101080000800000108000004308003800388003861364305020816810267290141408000080000102673326709267092670926733