Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 2D)

Test 1: uops

Code:

  ld1 { v0.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 60 | 61 | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005287452162202000020105100281021023447100010001000500080815968281962861931010001000100028314284251161001100010000100021000011000202131761037971843405136521350356538115150502805314758132971365010002892928598284092904628565
61004284162142112600020105162280921023441100010001000500030815982280992852431010001000100028439286031161001100010000100001000001000002138641037770793451105220921346838205050542840014311133451369610002867828599286422850928618
61004285472152702100020005003280400023915100010001000500030815987285162861031010001000100028309283731161001100010001100021001001001202137601020069153204134720950353538125647502824314499126221445610002900528683288622871628591
6100428595214210170002000518528161002376110001000100050002181598928105289493101000100010002842528467116100110001000010002100000100020213878968569493353105120893340338135155472830214916127551405410002879928685289952867028597
61004286822152302300011000506128404002366110001000100050001181596428129284053101000100010002872428364116100110001000010002100000100021213499972170463442125520905325438125249532810914767127771399110002871228682287142878028592
61004286262152202500020105045280770023600100010001000500021815983281662876631010001000100028469286441161001100010001100021000001000202134631034371413433165220889343538215748522813214239125551387810002859628553285632869528514
6100428685217200200110000487328227002357710001000100050008101597428163287353101000100010002851428383116100110001000010002100001100020213746982969573371134921172340738155153532806014882127071412610002869028698286222858528599
610042871021423019000200051042812000236091000100010005000000159692812528561310100010001000287372855311610011000100011000210010010002021360495867121334895420848345438185350502808914633124641379110002856628640289962891328537
61004287822181701800020004873280920023557100010001000500030815985280542855331010001000100028549284811161001100010000100021000001001200139821000869243168115420952339538155549512816514654126411431410002881128678287452859129103
61004285772162501800100104958283740023504100010001000500021815990282242896831010001000100028593287071161001100010000100021000001000002134261018070913383114920972342538155749502813314535123011397210002866728542284562855328604

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.2d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50205120047899100000110120020119509109536256010340102100001000030108100081000610780055736732613435911200261200471200501132337113732501283022810010100106025610010100101200471200471150201100991004010010000100000100100001100001010000100111322101600119736400029601000040100120051120051120051120051120051
50204120047899000000000120020119509109514256010040102100001000030108100081000610789905736422613435911200261200501200501132187113748501283022810010100106025610010100101200501200471150201100991004010010000100000100100001100000010000100111322001600119736400009001000040100120051120051120048120048120048
502041200478990000001001200351195091094632560103401001000110000301001000010000107899957361886133818112001112005012004711314531136585010030200100001000060200100001000012003512004711502011009910040100100001000001001000001000000100001101003210110711119660400009981000040100120051120051120051120051120051
502041200358990000000101200351195091094632560103401021000010000301001000010000107886257354556134461112001112003512005011314531136675010030200100001000060200100001000012010012003511502011009910040100100001000001001000001000000100001100003210110711119657400029981000040100120051120051120036120051120036
502041200358990000001001200351194931094632560103401021000110000301001000010000107886257354556134461112002312004712003511314331136365010030200100001000060200100001000012005012003511502011009910040100100001000001001000011000000100001100003210110111119705400000081000040100120051120048120051120049120036
502041200358990010001101200351194931094632560100401001000110000301001000010000107920057354556134461112002612003512003511314131136365010030200100001000060200100001000012005012004711502011009910040100100001000001001000011000000100001100003210110711119646400026601000040100120051120051120048120051120051
502041200508990000001001200201195091094632560103401021000010000301001000010000107920057361886133818112002612005012005011315031136675010030200100001000060200100001000012003512004711502011009910040100100001000001001000011000000100001000003210113511119657400029601000040100120036120036120051120051120051
502041200508990000041001200351195091094492560103401021000010000301001000010000107899957354556134461112001112005012008511314331136585010030200100001000060200100001000012003512004711502011009910040100100001000001001000011000000100010000003210110711119646400029901000040100120051120051120051120036120051
502041200358990000000101200321195091094632560100401021000110000301001000010000107899957361886134461112002612010012009511314531136675010030200100001000060200100001000012005012004711502011009910040100100001000011001000011000003100000100003210110111119657400009081000040100120036120051120048120051120051
502041200388990000000101200351195091094492560103401001000110000301001000010000107886257361886136166112002312005012003511314531136675010030200100001000060200100001000012005012004711502011009910040100100001000001001000011000100100001000003210110711119657400009681000040100120036120036120052120051120051

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 1e | 1f | 22 | 24 | 3a | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
500251200578990001120101120042119515109470256001640014100021000030010100001000010796115736524613396801200330120057120060113175311369650010300201000010000600201000010000120057120057115002110910400101000010000010100021110002011100001000000314041072311967240004101001000040010120058120058120058120058120058
500241201168991011110000120036119509109464256001340012100011000030010100001000010795575736236613366201200330120057120057113178311369650010300201000010000600201000010000120057120057115002110910400101000010000010100012110002011100001111000314021072311967240004101091000040010120108120058120058120058120058
500241200598991100020001120042119518109470256001640014100021000030010100001000010796115736524613396811200330120057120060113178311369950010300201000010000600201000010000120062120057115002110910400101000010000010100032110002001100001111100314031073211967240004101091000040010120058120058120058120058120058
500241200578991000020000120042119499109470256001640014100021000030010100001005510796115736524613396811200330120057120057113175311369650010302141000010000600201000010000120057120057115002110910400101000010000010100022110002011100001111100314031073311967640004101091000040010120058120058120058120058120058
50024120057899100002000012004211951510947025600164001410002100003001010000100001079463573652461339681120033012005712005711317531137025001030020100001000060020100001000012005712005711500211091040010100001000001010002311000202110000101000031403107231196664000201001000040010120052120052120052120052120052
50024120035899000001010012003611950910946725600134001210001100003001010000100001079557573623661336621120027012005412005111316931136905001030020100001000060020100001000012005112005111500211091040010100001000001010000011000000010000101000031403107321196664000201091000040010120052120052120060120036120052
500241200518990000110000120036119512109464256001340012100011000030010100001000010795575736236613366211200270120051120035113169311369050010300201000010000600201000010000120054120051115002110910400101000010000010100000010000003100001010000314021072311966640002101091000040010120052120052120052120055120052
500241200549000000100100120036119509109464256001340012100011000030010100001000010795575736236613366211200300120051120054113169311369050010300201006510000600201000010000120051120051115002110910400101000010000010100000110000000100001010000314031073311966640002101091000040010120054120145120052120063120052
5002412005189900000101001200361195091094642560013400121000110000300101000010000107955757362366133662012002701200511200511131693113693500103002010000100006002010000100001200511201441150021109104001010000100000101000001100000001000010100003140210723119650400021010121000040010120052120052120055120052120052
5002412005189900000100001200391194921094641046005540046100071001030150100001000010795575750963613794211200270120051120054113482311369050010300201000010000609721016110000120054120051115002110910400101000010000010100000110000004835100001010000316831072311966640002101391000040010120052120052120052120052120052

Test 3: throughput

Count: 8

Code:

  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  ld1 { v0.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | 09 | l2 tlb miss instruction (0a) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526727200010000044001267162111925801001008000010080000500116800212670526707267311665431668980100200800002008000026731267271180201100991001008000080000010080000438003901448003861044511011611267041407800001002673226708267082673226728
8020426890201000000001012671621119258010010080000100800005001166525026702267312670716655316689801002008000020080000267312672711802011009910010080000800000100800000800392538800386139445110116112672810147800001002670826732267322673226834
802042671020000100114510126712012097258010010080000100800005001170666026682267312673116630316689801002008000020080000267072672711802011009910010080000800000100800004380039263880038610051101161126728000800001002673226734267082670826843
8020427183200000000001002671621201625801001008000010080000500117221602670626731267311663031668980100200800002008000026731267271180201100991001008000080000010080000438003931388003800044511011611267280147800001002672826708267322673226812
8020426736200000010044000266923010258010010080000100800005001166525126702267312673116654316665801002008000020080000267072670711802011009910010080000800000100800000800383438003860380511011611267241407800001002670826728267322673226824
80204267342000000000501002671620119258010010080000100800005001167127026706267312672716654316665801002008000020080000267272670711802011009910010080000800000100800004380039263880038610445110116112670410147800001002673226708267322672826821
80204267392000001000010126717011192580100100800001008000050011695450266822673126740166543166858010020080000200800002673126727118020110099100100800008000001008000043800392939800000038445110116112672810104800001002673226728267082673226828
802042674020100000004500126692212121625801001008000010080000500116555602670626731267071665031668980100200800002008000026707267271180201100991001008000080000010080000438003921380000603944511011611267281400800001002673226732267322670826861
8020427330200000000044001267122110258010010080000100800005001173631126851267072673116630316689801002008000020080000267072672711802011009910010080000800000100800004380039303880039603944511011611267041404800001002673226732267322673226862
802042674020000000004410126712211192580100100800001008000050011671270266822673126707166543166898010020080216200800002673126727118020110099100100800008000001008000008003930380038610051101161126728004800001002673226708267322673226828

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3343

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8002526737200000000450226721207192580010108000010800005011676250026711267362673616681316708800102080000208000026727267281180021109010108000080000010800000430800390104580038613943000502014161210267331313580000102673726738267372673726743
800242673920010010021132672137719258001010800001080000501167791002671126736267361666031671680010208000020800002673626736118002110901010800008000001080019190080059111678003961594319105020716612267331313580000102673826737267372671526863
800242674520010100067132672537722258001010800001080000501175533012671426736267361668131673880010208000020800002673626737118002110901010800008000001080020194308005912306480039615943190050201216101026733013580000102673726737267382671626862
80024267232001010006603267230742225800101080000108000050116768201267112673626736166813167178001020800002080000267372673711800211090101080000800000108002020430800591660338000061194319005020101612726733013580000102673826738267382673726737
80024269912001111007213267212770258001010800001080000501168016012671526715267371668131671880010208000020800002673626737118002110901010800008000001080020214308005913161800396158431910502012161111267331313580000102671526737267372673826798
80024267362001101106603267212772025800101080000108000050116779101267112673626736166823166958001020800002080000267362673611800211090101080000800000108001920008005912806380040611943191050201216111426821130580000102673726737267382673726843
800242674520111110096022672137720258001010800001080000501167219012671226736267361668231671680010208000020800002673626736118002110901010800008000001080019194308005912506380040615943190050209161011267331313580000102673726737267382673726843
80024267452001010006712267212772025800101080000108000050116847201267112673626736166823167168001020800002080000267362673611800211090101080000800001108002020430800581261648004061580191050201216101026734130580000102673726715267382671626830
800242674520010010066012669937720258001010800001080000501167219012671226736267361668231671680010208000020800002673626736118002110901010800008000001080020204308005911816180040615843190050201216117267341313580000102673726737267382673726816
8002426728201110011670326721077202580010108000010800005011732720126711267362673616681316717800102080000208000026736267151180021109110108000080000010800191943080059125061800396119431900502012161212267331313580000102673726737267382673726750