Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single structure, S)

Test 1: uops

Code:

  ld1 { v0.s }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
6200528619214120100001104923282530016166200110011000100010005000119173227202793328549310200010001000100020002843928015116100110001000110002100000110002000135061031272353529850196183418381920444327889138171316013110100010002844228110281172825528157
62004280362141401301021052722828111161302001100110001000100050001192592269127934284673102000100010001000200028154285551161001100010000100021001000100021201333210367729735051046193283433381213463927849152111229812938100010002822828308280472808028545
62004284402111401000020049112820100161252001100110001000100050001190422266528009280993102000100010001000200028265281151161001100010000100021000000100020201379910459733435241047199913434382220414327855137681237813103100010002802428056280332808628478
620042809521217015001210501827940001605920011001100010001000500011938122705279752803431020001000100010002000283652812911610011000100001000210000031001202013922985772223252648199333561381114494927992139051222714453100010002810428530281732821628219
6200428082214230140002005362278560015982200010011000100010005000119265227102835728267310200010001000100020002840028689116100110001000010002100100310000020141911057772703254743200313423381112414227907138161217313355100010002799528536284002859128038
6200428015209140170002005221278611116209200110011000100010005000119241122712280342840231020001000100010002000281542808911610011000100001000210010041001202014028983672773270438198703473381915433927996139351245312823100010002851228155283182818528011
6200428483212140140006105273277980016469200010011000100010005000119255226782798928117310200010001000100020002812428043116100110001000010000100000210001110141039828730134981039193893495382214363827803147151198313793100010002803328135281542803328022
6200428105211110150002105327278460016029200210021000100010005000119316227322787327960310200010011000100020002799428114216100110001000010002100000110011120140841064772993529736194273482381514444527753141881223212742100010002816828000279952802528030
6200428094210100110002005344278930016062200010011000100010005000119298227142823228104310200010001000100020002844227996116100110001000010002100110010002020140591047771563523544195413433381214444527838141791277814412100010002807528325284602810328079
6200428048211170130002005318279660016223200110001000100010005000119288227052836128043310200010001000100020002790728105116100110001000010002100000010002010135261040972313541636195313435381415434227812141461215312807100010002806928010281412808228079

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.s }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205140051104900000101001400361394111293632570102401002000210000301002000010000126380366937341431093911400300140051140051131797313240560100302001000020000602001000030000140051140051115020110099100401001000010000010010000010000001000011032101126111395594000010100100001000040100140052140036140052140052140052
602041400511049000001010014003913941112936325701024010020002100003010020000100001264020669373414310939114001101400511400511317973132403601003020010000200006020010000300001400351400511150201100991004010010000100000100100001100000010000110321011261113955940000101010100001000040100140052140052140052140036140052
60204140035104901100001001400201394111293632570102401002000210000301002000010000126402066937341430870111400273140051140051131797313239960100302001000020000602001000030000140051140051115020110099100401001000010000010010000010000001000011032101171113955440000101010100001000040100140052140052140052140052140036
602041400511049000009010014002013941112936325701024010020002100003010020000100001264020669373414310939114002701400511400511317973132399601003020010000200006020010000300001400511400511150201100991004010010000100000100100001100000010000010321011271113955940000101010100001000040100140052140052140102140052140052
6020414003510490000040010014003613941112936325701004010020002100003010020000100001263803669373414310939114002701400511400511317973132580601003020010000200006051810159300001400511400511150201100991004010010000100000100100000100000010000110321011261113955940000101010100001000040100140052140052140052140052140036
602041400351049000001010014003613941112934725701024010020002100003010020000100001264020669373414310939114002701400511400511317973132399601003020010000200006020010000300001400511400351150201100991004010010000100000100100001100000010000110321011262113955940000101010100001000040100140517140139140152140052140052
6020414024810521000008810014003613941112936325701024010020002100003010020000100001264020669373414310939114002701400511400511317973132400601003020010000200006020010000300001400511400351150201100991004010010000100000100100001100000010000110321011261113955940000101010100001000040100140052140052140052140052140036
60204140051104901000101001400361394111293632570100401002000210000301002000010000126380366937341431093911400270140051140051131797313239960100302001000020000602001000030000140051140102115020110099100401001000010000010010000110000001000011032101126111395594000001010100001000040100140052140052140052140052140052
60204140051104900000101001400361394111293632570102401002000210000301002000010000126402066937341431093911400270140051140051131793313242360100302001000020000602001000030000140051140051115020110099100401001000010000010010000110000001000001032101126111395464000001010100001000040100140052140052140052140052140052
60204140051104900000101001400361394111293632570102401002000010000301002000010000126402066937341431093901400270140051140051131797313256060100302001000020000602001000030000140051140051115020110099100401001000010000010010002110000001000010032101126111395594000010100100001000040100140092140056140038140052140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002514004710480001000101014003213939412935925700124001020002100003001020000100001264443669353814325829014002301400471400481318150313243060010300201000020000600201000030000140047140047115002110910400101000010000010100000110000000100001100314041132213956640000066100001000040010140048140048140048140048140048
6002414004710490000000100014003213939412935925700124003520002100003001020000100001264443669353814325829014031601400471400471318150313243060010306631000020000600201000030000140047140047115002110910400101000010000110100000110000120100001100314021132213956640000666100001000040010140048140048140048140048140048
6002414004710490001120101014003213939412935925700124001020002100053001020000100001264443669353814325829014002301400471400471318150313252160010300201000020000600201021530000140047140047115002110910400101000010000010100000110000000100001100314021112213956740000669100001000040010140051140337140051140048140051
6002414005010490001100101014003213939412935925700104001020002100003001020000100001264452669353814339627014002301400471400471318150313259860010303421010720428600201000030000140048140047115002110910400101000010000010100000110000220100001100314021132213956640000666100001000040010140048140048140048140036140051
6002414004710490000001101014003213939412940525700124001020002100003001020000100001264443669353814325829014002301400471400471318150313243060010300201000020000600201000030161140047140047115002110910400101000010000010100000110000000100001100316821132213956640000669100001000040010140048140048140048140051140048
6002414005410480001000101014003213939412934725700104001020002100003001020000100001264477669353814325829014001101400351400471318180313243060010300201000020000600201005430000140047140047115002110910400101000010000110100000110000100100001100314021132213962340000660100001000040010140048140048140048140048140048
6002414004710490001000101014003213939412935925700124001020002100003001020105100001264477669353814325829014002301400471400481318150313243060010300201000020000600201000030000140047140047115002110910400101000010000110100000010000000100001100314021132213961940000666100001000040010140048140048140048140048140048
60024140050104900011001301014003213939412935925700124001020002100003001020108100001264443669353814325829014002301400471400471318150313243060010300201000020000600201000030000140047140047115002110910400101000010000010100000110000100100001100314021132213956640000666100001000040010140048140036140048140051140048
6002414004710490000100001014003213939512936025700124001020002100003001020000100001264443669294714325829014002401400351400471318150313242060010300201000020000603441000030000140047140047115002110910400101000010000010100000110000000100001100314021132213956640000666100001000040010140036140051140048140048140048
6002414004710490100100100014003213939412935925700124001020002100003001020000100001264443669294714325829014002301400471400351318150313243060010300201000020000600201000030000140048140047115002110910400101000010000110100000110000000100000100314021132213956940000066100001000040010140048140048140051140036140048

Test 3: throughput

Count: 8

Code:

  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  ld1 { v0.s }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 05 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1602051600561199010200057010116004121212159684162516010210080002800001008000080000500421998229395591160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100800000430800390039800396139430005110216221600530101048000080000100160057160057160057160057160057
1602041600561198000000045010116004121212159684162516010210080002800001008000080000500421998229395471160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100801300430800390039800396139430005110216221600530101048000080000100160057160057160057160057160057
1602041600561200001000150401011600412129159684162516010210080002800001008000080000500421017229395471160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800001100800000430800391039800396139430005110216221601332101048000080000100160426160181160289160120160057
16020416005611980000000450101160041212121596901625160100100800028000010080000800005004219982294108811600371600561600561396913140014160100200800008000020080000160000160056160056118020110099100100800008000001008000004329880039004280169613943000511021622160053001048000080000100160057160057160057160057160057
1602041600561199000000045010116004121212159684162516010210080002800001008000080000500422041229395751160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100800000430800392046800396139430005110216221600530101048000080000100160057160057160057160057160057
1602041600561199000000045010116004121212159684162516010210080002800001008000080000500422028229395471160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100800000430800390039800396139430005110216221600370101048000080000100160057160057160057160057160041
1602041600561198000000045000116004121212159684162516010210080002800001008000080000511422041229395751160037160056160056139691181400141601002008000080000200800001600001600561600561180201100991001008000080000010080000043080039003980000613943000511021622160053001008000080000100160057160041160057160041160057
1602041600561199000000045010116002521212159684162516010210080002800001008000080000500422028229395591160037160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100800000430800390039800396139430005110216221600530101048000080000100160057160057160041160057160057
16020416005611990000000450000160025212121596841625160102100800028000010080000800005004220282293954711600371600561600561396911714001416010020080000800002008000016000016004016005611802011009910010080000800001100800000430800390039800396139430015110216221600530101048000080000100160057160057160057160057160057
1602041600561198000010045010116004121212159684162516010210080002800001008015080000500422041229395470160021160056160056139691314001416010020080000800002008000016000016005616005611802011009910010080000800000100800000430800391039800396139430005110216221600530101048000080000100160057160057160057160057160057

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025160056119901000004401011600412112159690162516001210800028000010800008000050422394229395470160037160056160060139713314003616001020800008000020800001600001600561600561180021109101080000800001108000043080039040800396139430050200181681116005310104800008000010160057160057160057160041160057
160024160060119901001004500011600412121215968419251600121080002800001080000800005042284122939575116003716005616005613971331400361600102080000800002080000160000160056160056118002110910108000080000010800004308003903880039613943005020061611816005310104800008000010160061160057160057160057160057
16002416005611980000000450001160045211215968419251600121080002800001080000800005042241922946559116003716004016005613971331400361600102080000800002080000160000160056160056118002110910108000080000010800000080039242800396139430050200111611816005310104800008000010160041160057160057160057160061
16002416005611990100000450001160534212015968416251600121080002800001080000800005042259922939575016003716005616006013971331400361600102080000800002080000160000160040160056118002110910108000080000010800004308003803880039610430050200141610616005710100800008000010160057160057160057160057160057
160024160040119901000005700001600452121215968416251600121080002800001080000800005042211322939575016003716006016005613971331400361600102080000800002080000160000160040160060118002110910108000080000010800004308003903980039613943005020091691016005310104800008000010160061160057160057160057160041
1600241600561199010000044010116004121211596841625160012108000280000108000080000504225092293954701600371600561600561397133140036160010208000080000208000016000016005616005611800211091010800008000001080000008003903980039613943205020091681116013510104800008000010160057160057160246160057160119
160024160056119801000004568600116002521121594781625160012108000280000108000080000504224192293954701600371600561600561397173140036160010208000080000208000016000016005616005611800211091010800008000001080000430800390398003961394300502009169916005314144800008000010160057160057160061160057160057
1600241600561199010000045000116004121211596841925160012108000280000108000080000504221182294829701600371600561600561397133140036160010208000080000208000016000016005616005611800211091010800008000001080000430800380398004060394300502008168111600531004800008000010160057160057160057160057160061
160024160056119800000004500011600412121215968419251600121080002800001080000800005042261922939559116003716004016005613971331400401600102080000800002080000160000160056160056118002110910108000080000010800004308003803980039613843015020312167916005314144800008000010160057160057160057160057160061
16002416005611990000000600101160041212121596841625160012108000280000108000080000504223842293954711600371600561600561397173140036160010208000080000208000016000016005616005611800211091010800008000001080000430800380398003961394300504108169916003714104800008000010160107160057160061160057160057