Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 2D)

Test 1: uops

Code:

  ld1 { v0.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 07 | 09 | 0a | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
61005287452162202000020105100281021023447100010001000500080815968281962861931010001000100028314284251161001100010000100021000011000202131761037971843405136521350356538115150502805314758132971365010002892928598284092904628565
61004284162142112600020105162280921023441100010001000500030815982280992852431010001000100028439286031161001100010000100001000001000002138641037770793451105220921346838205050542840014311133451369610002867828599286422850928618
61004285472152702100020005003280400023915100010001000500030815987285162861031010001000100028309283731161001100010001100021001001001202137601020069153204134720950353538125647502824314499126221445610002900528683288622871628591
6100428595214210170002000518528161002376110001000100050002181598928105289493101000100010002842528467116100110001000010002100000100020213878968569493353105120893340338135155472830214916127551405410002879928685289952867028597
61004286822152302300011000506128404002366110001000100050001181596428129284053101000100010002872428364116100110001000010002100000100021213499972170463442125520905325438125249532810914767127771399110002871228682287142878028592
61004286262152202500020105045280770023600100010001000500021815983281662876631010001000100028469286441161001100010001100021000001000202134631034371413433165220889343538215748522813214239125551387810002859628553285632869528514
6100428685217200200110000487328227002357710001000100050008101597428163287353101000100010002851428383116100110001000010002100001100020213746982969573371134921172340738155153532806014882127071412610002869028698286222858528599
610042871021423019000200051042812000236091000100010005000000159692812528561310100010001000287372855311610011000100011000210010010002021360495867121334895420848345438185350502808914633124641379110002856628640289962891328537
61004287822181701800020004873280920023557100010001000500030815985280542855331010001000100028549284811161001100010000100021000001001200139821000869243168115420952339538155549512816514654126411431410002881128678287452859129103
61004285772162501800100104958283740023504100010001000500021815990282242896831010001000100028593287071161001100010000100021000001000002134261018070913383114920972342538155749502813314535123011397210002866728542284562855328604

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.2d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 04 | 08 | 09 | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50205120047899100000110120020119509109536256010340102100001000030108100081000610780055736732613435911200261200471200501132337113732501283022810010100106025610010100101200471200471150201100991004010010000100000100100001100001010000100111322101600119736400029601000040100120051120051120051120051120051
50204120047899000000000120020119509109514256010040102100001000030108100081000610789905736422613435911200261200501200501132187113748501283022810010100106025610010100101200501200471150201100991004010010000100000100100001100000010000100111322001600119736400009001000040100120051120051120048120048120048
502041200478990000001001200351195091094632560103401001000110000301001000010000107899957361886133818112001112005012004711314531136585010030200100001000060200100001000012003512004711502011009910040100100001000001001000001000000100001101003210110711119660400009981000040100120051120051120051120051120051
502041200358990000000101200351195091094632560103401021000010000301001000010000107886257354556134461112001112003512005011314531136675010030200100001000060200100001000012010012003511502011009910040100100001000001001000001000000100001100003210110711119657400029981000040100120051120051120036120051120036
502041200358990000001001200351194931094632560103401021000110000301001000010000107886257354556134461112002312004712003511314331136365010030200100001000060200100001000012005012003511502011009910040100100001000001001000011000000100001100003210110111119705400000081000040100120051120048120051120049120036
502041200358990010001101200351194931094632560100401001000110000301001000010000107920057354556134461112002612003512003511314131136365010030200100001000060200100001000012005012004711502011009910040100100001000001001000011000000100001100003210110711119646400026601000040100120051120051120048120051120051
502041200508990000001001200201195091094632560103401021000010000301001000010000107920057361886133818112002612005012005011315031136675010030200100001000060200100001000012003512004711502011009910040100100001000001001000011000000100001000003210113511119657400029601000040100120036120036120051120051120051
502041200508990000041001200351195091094492560103401021000010000301001000010000107899957354556134461112001112005012008511314331136585010030200100001000060200100001000012003512004711502011009910040100100001000001001000011000000100010000003210110711119646400029901000040100120051120051120051120036120051
502041200358990000000101200321195091094632560100401021000110000301001000010000107899957361886134461112002612010012009511314531136675010030200100001000060200100001000012005012004711502011009910040100100001000011001000011000003100000100003210110111119657400009081000040100120036120051120048120051120051
502041200388990000000101200351195091094492560103401001000110000301001000010000107886257361886136166112002312005012003511314531136675010030200100001000060200100001000012005012004711502011009910040100100001000001001000011000100100001000003210110711119657400009681000040100120036120036120052120051120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
500251200578990001120101120042119515109470256001640014100021000030010100001000010796115736524613396801200330120057120060113175311369650010300201000010000600201000010000120057120057115002110910400101000010000010100021110002011100001000000314041072311967240004101001000040010120058120058120058120058120058
500241201168991011110000120036119509109464256001340012100011000030010100001000010795575736236613366201200330120057120057113178311369650010300201000010000600201000010000120057120057115002110910400101000010000010100012110002011100001111000314021072311967240004101091000040010120108120058120058120058120058
500241200598991100020001120042119518109470256001640014100021000030010100001000010796115736524613396811200330120057120060113178311369950010300201000010000600201000010000120062120057115002110910400101000010000010100032110002001100001111100314031073211967240004101091000040010120058120058120058120058120058
500241200578991000020000120042119499109470256001640014100021000030010100001005510796115736524613396811200330120057120057113175311369650010302141000010000600201000010000120057120057115002110910400101000010000010100022110002011100001111100314031073311967640004101091000040010120058120058120058120058120058
50024120057899100002000012004211951510947025600164001410002100003001010000100001079463573652461339681120033012005712005711317531137025001030020100001000060020100001000012005712005711500211091040010100001000001010002311000202110000101000031403107231196664000201001000040010120052120052120052120052120052
50024120035899000001010012003611950910946725600134001210001100003001010000100001079557573623661336621120027012005412005111316931136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000000010000101000031403107321196664000201091000040010120052120052120060120036120052
500241200518990000110000120036119512109464256001340012100011000030010100001000010795575736236613366211200270120051120035113169311369050010300201000010000600201000010000120054120051115002110910400101000010000010100000010000003100001010000314021072311966640002101091000040010120052120052120052120055120052
500241200549000000100100120036119509109464256001340012100011000030010100001000010795575736236613366211200300120051120054113169311369050010300201006510000600201000010000120051120051115002110910400101000010000010100000110000000100001010000314031073311966640002101091000040010120054120145120052120063120052
5002412005189900000101001200361195091094642560013400121000110000300101000010000107955757362366133662012002701200511200511131693113693500103002010000100006002010000100001200511201441150021109104001010000100000101000001100000001000010100003140210723119650400021010121000040010120052120052120055120052120052
5002412005189900000100001200391194921094641046005540046100071001030150100001000010795575750963613794211200270120051120054113482311369050010300201000010000609721016110000120054120051115002110910400101000010000010100000110000004835100001010000316831072311966640002101391000040010120052120052120052120052120052

Test 3: throughput

Count: 8

Code:

  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 04 | 09 | 0a | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020526727200010000044001267162111925801001008000010080000500116800212670526707267311665431668980100200800002008000026731267271180201100991001008000080000010080000438003901448003861044511011611267041407800001002673226708267082673226728
8020426890201000000001012671621119258010010080000100800005001166525026702267312670716655316689801002008000020080000267312672711802011009910010080000800000100800000800392538800386139445110116112672810147800001002670826732267322673226834
802042671020000100114510126712012097258010010080000100800005001170666026682267312673116630316689801002008000020080000267072672711802011009910010080000800000100800004380039263880038610051101161126728000800001002673226734267082670826843
8020427183200000000001002671621201625801001008000010080000500117221602670626731267311663031668980100200800002008000026731267271180201100991001008000080000010080000438003931388003800044511011611267280147800001002672826708267322673226812
8020426736200000010044000266923010258010010080000100800005001166525126702267312673116654316665801002008000020080000267072670711802011009910010080000800000100800000800383438003860380511011611267241407800001002670826728267322673226824
80204267342000000000501002671620119258010010080000100800005001167127026706267312672716654316665801002008000020080000267272670711802011009910010080000800000100800004380039263880038610445110116112670410147800001002673226708267322672826821
80204267392000001000010126717011192580100100800001008000050011695450266822673126740166543166858010020080000200800002673126727118020110099100100800008000001008000043800392939800000038445110116112672810104800001002673226728267082673226828
802042674020100000004500126692212121625801001008000010080000500116555602670626731267071665031668980100200800002008000026707267271180201100991001008000080000010080000438003921380000603944511011611267281400800001002673226732267322670826861
8020427330200000000044001267122110258010010080000100800005001173631126851267072673116630316689801002008000020080000267072672711802011009910010080000800000100800004380039303880039603944511011611267041404800001002673226732267322673226862
802042674020000000004410126712211192580100100800001008000050011671270266822673126707166543166898010020080216200800002673126727118020110099100100800008000001008000008003930380038610051101161126728004800001002673226708267322673226828

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3343

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8002526737200000000450226721207192580010108000010800005011676250026711267362673616681316708800102080000208000026727267281180021109010108000080000010800000430800390104580038613943000502014161210267331313580000102673726738267372673726743
800242673920010010021132672137719258001010800001080000501167791002671126736267361666031671680010208000020800002673626736118002110901010800008000001080019190080059111678003961594319105020716612267331313580000102673826737267372671526863
800242674520010100067132672537722258001010800001080000501175533012671426736267361668131673880010208000020800002673626737118002110901010800008000001080020194308005912306480039615943190050201216101026733013580000102673726737267382671626862
80024267232001010006603267230742225800101080000108000050116768201267112673626736166813167178001020800002080000267372673711800211090101080000800000108002020430800591660338000061194319005020101612726733013580000102673826738267382673726737
80024269912001111007213267212770258001010800001080000501168016012671526715267371668131671880010208000020800002673626737118002110901010800008000001080020214308005913161800396158431910502012161111267331313580000102671526737267372673826798
80024267362001101106603267212772025800101080000108000050116779101267112673626736166823166958001020800002080000267362673611800211090101080000800000108001920008005912806380040611943191050201216111426821130580000102673726737267382673726843
800242674520111110096022672137720258001010800001080000501167219012671226736267361668231671680010208000020800002673626736118002110901010800008000001080019194308005912506380040615943190050209161011267331313580000102673726737267382673726843
80024267452001010006712267212772025800101080000108000050116847201267112673626736166823167168001020800002080000267362673611800211090101080000800001108002020430800581261648004061580191050201216101026734130580000102673726715267382671626830
800242674520010010066012669937720258001010800001080000501167219012671226736267361668231671680010208000020800002673626736118002110901010800008000001080020204308005911816180040615843190050201216117267341313580000102673726737267382673726816
8002426728201110011670326721077202580010108000010800005011732720126711267362673616681316717800102080000208000026736267151180021109110108000080000010800191943080059125061800396119431900502012161212267331313580000102673726737267382673726750