Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, post-index, B)

Test 1: uops

Code:

  ld1 { v0.b }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.001

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 3a | 3f | 43 | 46 | 49 | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6b | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | c9 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
62005294032352000210007004710283810100166873002100010011000100010001000500050001193415225803285892844932830001000100020002000284542851911610011000100001000021001001100121301711328195797008318694620040318338181449472821610001540812621133791000100010002865928712290022863128671
6200428686221190022000400488528289010016701300210001002100010001000100050005002119321422810028704286253103000100010002000200028621286571161001100010000100002100110101001203001312694737023321984319921319238141139472807710001502712556137581000100010002876128638286032861428721
62004286132221500170004004708284250110165143001100010021000100010001000500050021193292263232862328602310300010001000200020002856128584116100110001000010000210010011001213001331796276967317094419907320438041242452811410001543112435135751000100010002867028747287032864128639
6200428607222260015000400488528261010016642300110001001100010001000100050005002119321522586028492285893103000100010002000200028460285251161001100010000100002100100110012120013392949970603268104319974323638051749472810210001551012574137301000100010002862828554285352855828636
6200428640222200022000488048462830901101653530021000100210001000100010005000500211934102263002858428741310300010001000200020002852828549116100110001000010000310010011001213001345495246973313794920240323638111346412807410001557112715137351000100010002862428587286602864328684
6200428695230240018000160047722898401101743630021000100210001000100010005000500011934522640029059288773103000100010002000200028591288951161001100010000100002100100710012130013460971270563227145619985315438091750462806910001549612953137661000100010002876828620288182873028558
62004284492211800200004004953282490010165143001100010011000100010001000500050011192913226540285072863731030001000100020002000286422859511610011000100001000021001001100121300133869602702332217472011832613811646482810010001544712668135681000100010002863528685286342873728680
620042860022218001610050148372837300101665530011000100110001000100010005000501611944102257002855128681310300010001000200020002858128685116100110001000010000310010011001213001324598817014321394620053317038121145462803710001536412283136151000100010002871628642286342863228701
620042868322218002100016004756283130110166043002100010011000100010001000500050001193652268702849928557310300010001000200020002862428558116100110001000010000210010011001213001331395176955315964120050319738171348492815810001499012377135361000100010002877328674286992854728544
620042864622221001700040046872830001101678030021000100210001000100010005000500011931102257002851428647310300010001000200020002852128595116100110001000010000310010011001213001336996896943316364419931325238051145492817110001541712675138271000100010002864728732287012861628661

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.b }[1], [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0055

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 24 | 3f | 43 | 49 | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | branch mispred nonspec (cb) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
602051400591085000101014000014004811139580258010450100200041000040100200001000012457185304829107111461400351400591400591319733132440701003020010000200006020020000300001400591400591150201100991004010010000100000100100000210001201100011110032101931213972650000101010100001000050100140060140143140066140060140060
6020514006610490000000230001400441113958025801045010020002100004010020000100001245718530482910711146140035140059140059131973313244070100306911187020000604422008030367142695142079115020110099100401001000010000010010001021000100710001110103210193111397415000001010100001000050100140060140060140060140060140060
60204140063104900000002000140047111395812580104501002000410000401002000010000124571853048291071114614003514005914005913197331324407010030200100002000060200200003000014006814005911502011009910040100100001000001001000002100010010100011110032101931213972750000101013100001000050100140060140060140060140060140063
602041400621085000000020001400441113958025801045010020004100004010020000100001245718530482910711224140035140059140059131973313244170100302001000020000602002000030000140059140059115020110099100401001000010000010010000021000110410001110003210193111397295000010100100001000050100140060140060140063140060140060
6020414005910850000000200014004411139580258010450100200041000040100200001000012457185304829107111461400351400611400611319733132440701003020010000200006020020000300001400591400591150201100991004010010000100001100100000210001001100011110032101931213970750000101010100001000050100140060140060140060140060140060
6020414005910860000000590001400441113958025801045010020004100004010020000100001245718530482910711146140035140059140059131973313244070100302001000020000602002000030000140059140059115020110099100401001000010000010010000021000100110001111003210193121397265000001010100001000050100140060140060140061140044140060
6020414005910860000000200014004411139580258010450100200041000040100200001000012457185304868107113021400351400621400601319743132440701003020010000200006020020000300001400591400611150201100991004010010000100000100100000210001004100011110032101931213976450000101010100001000050100140060140060140060140060140060
6020414005910860000000200014004411139580258010450100200041000040100200001000012457185305224107111461400191400431400591319733132440701003020010000200006020020000300001400591400591150201100991004010010000100000100100000210001104100011110032101931113972350000101012100001000050100140060140060140060140060140060
602041400591085000000020001400441113958025801045010020004100004010020000100001245718530422110711146140019140059140059131974313244070100302001000020000602002000030000140059140059115020110099100401001000010000010010000021000100410001111003210193111397305000010100100001000050100140103140060140060140045140044
602041400591085000010020001400461113958025801205010020004100004010020000100001245718530482910711146140036140059140059132001313242470100302001000020000602002008030000140060140059115020110099100401001000010000010010000021000110410002111003210193211397315000010100100001000050100140060140060140060140060140060

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | d9 | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
60025140051108601110000100000140039139658258001450010200041000040010200001000012458245307656107174210140030014005414005413200831324657001030020100002000060020200003000014005714005711500211091040010100001000011010002011000100110000101100314038803313973150000131013100001000050010140061140061140055140055140055
60024140060108510001100100000140039139660258001450010200041000040010200001000012458245307542107174210140030014005414005413200231324687001030020100002000060020200003000014005714005811500211091040010100001000001010000111000100310000101110314038801331397065000013013100001000050010140058140058140061140055140055
60024140062108610000000100000140020139658258001450010200041000040010200001000012457625307695107174210140027014005414005413198331324687001030020100002000060020200003000014006014005711500211091040010100001000001010000011000210310000100000314048804413980350000131314100001000050010140061140061140061140061140141
6002414005410861010000010000014003913965825800125001020005100004001020000100001245896530754210717421014007801400511400521320323132465700103002010000200836075020160300001400951400411150021109104001010000100000101000001100020101000010100031404880331397065000013100100001000050010140061140061140061140061140055
60024140054108511001000100000140039139655258001250010200041000140010200001000012457535307884107178890140030014006014006213200831324627001030020100002000060260200003000014005814005711500211091040010100001000001010003311000000410000111100314038803313973150000131013100001000050010140055140055140055140055140059
6002414006010861000000014000001400521396642580014500102000410000401512000010000124582453076561071812301400310140060140057132005313245970010300201000020000600202007830000140060140059115002110910400101000010000110100012010000003171100001111003140310104413973450000131010100001000050010140063140055140056140058140061
60024140060108510000019322000021401991397297680056500302001010004405752015610120125220753154361072083601402590140331140245132045451325597235233550111272236765604200823024214024514042131500211091040010100001000001010007411000700968610003101101314038803313973150000131313100001000050010140086140068140058140060140061
60024140060108510000000200000140042139664258001450010200041000040010200001000012479375308668107174210140027314005714014313200530132465700103002010048200006030620000300001400571400571150021109104001010000100000101000221100010001000010112031404880331397325000013013100001000050010140058140061140058140037140061
6002414006110861100000020000014004513965825800145001020004100004029220000100001245824530765610717421014002701400531400601320093132468700103002010000200006002020000300001400601400571150021109104001010000100000101000221100021001000010110031403880331397295000001013100001000050010140059140058140042140058140052
6002414006010861100000010000014004213965525800145001020004100004001020000100001245878530765610717421014003601400351400551320053132468700103002010000200006002020000300001400601400571150021109104001010000100000101000211100011001000010111031403880331397125000013013100001000050010140061140061140061140062140055

Test 3: throughput

Count: 8

Code:

  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  ld1 { v0.b }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602051600611241100111006640000116004606615960010252401048010080014800008010080000800005314385295582293711301600421600611601331396960314001924010020080000800002001600001600001600611600611180201100991001008000080000010080140723080025002115800196125060051101161216005808000099800008000080100160062160062160118160062160062
16020416006112411110000115068801001600461661551218296241318805838004080520805808045680483555521566419229257400160198160201160274139670275014010024151620080656804912001616461606541603441602023180201100991001008000080000110080266623118804150062608039001252360051101161116005818000009800008000080100160065160062160047160062160062
16020416004612411000000012730010116004616015960092524010480100800048000080100800008000053069052895522937117016004216006116006113969603140019240100200800008000020016000016000016006116006111802011009910010080000800001100800077008002500258001961262361051101161116005818000099800008000080100160062160062160062160062160062
16020416004612401001000032000011600461661596001025240104801008000480000801008000080000530690541445229371170160042160061160062139696031400192401002008000080000200160000160000160061160061118020110099100100800008000001008000772308000731258001901252360051101161116004318000099800008000080100160062160062160062160062160047
16020416006112411000000096300001160046166159600825240104801008000480000801008000080000531434528952229371130160027160061160061139696031400192401002008000080000200160000160000160046160061118020110099100100800008000001008000872408002401258001900262370051101161116005808000099800008000080100160062160062160062160062160062
16020416006112411011000086235200001600461661596001025240104801008000480000801008000080000530686528958229371170160027160061160046139696031400202401002008000080000200160000160000160061160061118020110099100100800008000001008000772308002620268001861252370051101161116005908000099800008000080100160062160062160062160062160064
1602041600611241110000003200101160046160159600825240104801008000480000801008000080000530690528958229371170160042160061160046139697031400192401002008000080000200160000160000160061160062118020110099100100800008000001008000772308002600268001961262370051101161116005908000090800008000080100160062160062160047160062160047
160204160061124111000000136300100160046166159600825240104801008000480000801008000080000530690528952229371170160042160046160061139696031400192401002008000080000200160000160000160061160061118020110099100100800008000011008000672308002701248001861262370051101161116005918000099800008000080100160062160047160062160047160047
16020416006112401000100014130010116004616615918071782410168106380040806508042080760806405679435788252292573801601591603411602741396443674140605256212214854048551620617048817081816004616013721802011009910010080000800000100804086231788028623629380539612623730513011611160058080000910800008000080100160047160062160132160134160063
1602041600611286111000004020100116005016615915096424010480100800048000080100801528000053048652956422937925016004616006516004613970003140023240100200800008000020016000016000016006516006511802011009910010080000800000100800086270800300030800226130277305110116111601020800001313800008000080100160066160066160066160066160066

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | mmu table walk instruction (07) | mmu table walk data (08) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
16002516005312400001100190000160040160159590525240012800108000280000800108000080000530677529946229354560016003601600531600531397120314003524001020800008000020160000160000160053160053118002110910108000080000110800001408001200138001361101700502031643160052180000860800008000080010160056160056160056160056160054
1600241600551241000100086257200016039310615958862524001280010800028000080010800008000053068152933622935448001600210160053160055139712031400352400102080000800002016000016000016005516005311800211091010800008000001080000140800130016800136191700502031643160037080000060800008000080010160057160041160056160056160056
1600241600551241000110052000016004016615958852524001280010800028000080010800008000053067752933922935456001600360160055160055139712031400352400102080000800002016000016000016005316005311800211091010800008000001080000140800100013800126101700502031633160052080000990800008000080010160054160054160056160056160054
1600241600551241000110031000016004016615958852524001280010800028000080010800008000053067752933122935456001600340160040160055139712031400352400102080000800002016000016000016005516004011800211091010800008000001080000140800130113800126101700502041633160052080000960800008000080010160056160056160056160056160054
160024160055124100011002800001600401661595845252400128001080000800008001080000800005306775293372293545600160036016005516005513971203140035240010208000080000201600001600001600551600531180021109101080000800000108000017080013009800136110000502031633160052180000062800008000080010160056160054160056160054160054
1600241600551241000110019000016004016615958842524001280010800028000080010800008000053067752933122935456001600360160055160055139712031400312400102080000800002016000016000016005516005311800211091010800008000011080000140800130012800106191700502031623160052080000960800008000080010160041160054160056160056160054
160024160055124100010121501760001602361661558105993243636800108001480130803308045680480555513566711229202880016015301602651603231396862774140118240954208049280328201606561603281601981601982180021109101080000800000108013014118802751019800136191400502041653160052180000960800008000080010160056160056160056160056160041
1600241600531240000110019000016004016615958842524001280010800028000080010800008000053067752933122932604001600360160055160055139712031400352400102080000800002016000016000016005516005311800211091010800008000001080262140800000016800136113000502041643160050080000900800008000080010160056160054160056160054160056
1600241600531241000112231000016004016615958462524001280010800028000080010800008000053062152933122931346001600360160055160055139712031400352400102080000800002016000016000016005516005311800211091010800008000001080132170800130017800136113000502041623160052180000900800008000080010160056160056160056160056160054
1600241600401241000110015010016003810615955802524001080010800028000080010800008000053141352993822935460001600360160055160055139712031400352400102080000800002016000016000016005516005311800211091010800008000001080000170800130013800126013001502041634160052180162960800008000080010160056160056160056160056160054