Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single, post-index, D)

Test 1: uops

Code:

  ld1 { v0.d }[1], [x6], x8
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 3.003

Integer unit issues: 1.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk instruction (07) | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 24 | 3a | 3f | 49 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
620052865822312210241110030004762283690166653003100010031000100010001000500050001189910226552875828755310300010001000200020002854628621116100110001000100132100201010002121013202982669483218124820052319038171949542821510001555812522136891000100010002875628838286842850328821
62004286032221210033100003100483628368016538300310001003100010001000100050005000119427226462863028785310300010001000200020002868128727116100110001000100102100100110002121113470984570193177125420094321138151852482811010001526512640133941000100010002867528935287792876728674
62004287242231180023010113000491328322016786300210001003100010001000100050005000119342226532864628917310300010001000200020002860328782116100110001000100200100100110012101013311960269523160165120178322838181852472818110001519812091139211000100010002877228966288812903128879
620042922523111820151110030004653285880167833003100010031000100010001000500050031193752264428705289093103000100010002000200028334284521161001100010001002121004001100021211135671011972803396144819808326338181553492795010001452212443130481000100010002844728400284332819828432
62004282582121161025111003000521328140016390300310001003100010001000100050005000119416226792836228349310300310001000200020002795728147116100110001000100022100001110002121113951102517167335895419835337838061448842808310001424912080128491000100010002832128534284192843028418
6200428329213120002100000200048942805101658330031000100310001000100010005000500011931722686279712828231030001000100020002000282502816811610011000100010021210020111000202011374098626975348874419755347238201649492791010001441911937133081000100010002840428437282872835128289
62004284612131200020110000000503228310016398300310001003100010001000100050005000119435227162842328583310300010001000200020002833928329116100110001000100210100401110002021213665100917152337384719681342338131345502805510001475412326130831000100010002839428379283202836928637
62004285552131220016100001200050432821501637630011000100110001000100010005005500011938122611284212852331030001000100020002000283272834811610011000100010021210030011000212111377797416991319084620528332638162048462855310001580012579143671000100010002867328942286302886928867
620042878822312110161110014001457828832017361300310001003100010001000100050005002119488226332912829294710300010001000200020002914529038116100110001000100332100347111000212101285391536791301795020604301438081152462835110001634713278145131000100010002926329341292402929329287
62004292592201190022001001000455428842017200300110001003100010001000100050005000119481322611290872934731030001000100020002000291772912011610011000100010023210010011000022111280190776797304310502067430673814451572841310001617213200147401000100010002934929249293222927629345

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.d }[1], [x6], x8
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 3a | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6020514005510850100110010001400371395722580102501002000210000401002000010000124567353046051070927601400291400541401361319493132432701003032510000200006020020000300001400381400351150201100991004010010000100001100100000110000120100000100032101107111397155000001310100001000050100140052140059140052140055140052
60204140137108600000000130101400361395802580102501002000210000401002000010000124696953045251071052201400271400511400511319683132432701003020010000200006020020000300001400541400511150201100991004010010000100001100100000110000000100000000032101931113971550000131013100001000050100140237140178140198140058140146
6020414005110860000000010001400391395722580102501002000210001401002000010000124564653045251071052201400301401421400511319653132435701003020010000200006020020000300001400541400531150201100991004010010000100001100100000010000003100001100032101931113971550000101013100001000050100140052140055140052140055140052
6020414005110860000000013000140039139572258010250100200021000040100200001000012456735304525107105220140027140051140051131965313243270100302001000020000602002000030000140054140051115020110099100401001000010000110010000011000010310000010003210193111396995000010100100001000050100140058140055140052140052140052
6020414005110860000000010001400361395722580100501002000210000401002000010000124560753039271071052201400671400351400511319653132432701003020010000200006020020000300001400351400511150201100991004010010000100001100100000110000000100001000032101931113971550000101010100001000050100140052140055140052140055140052
6020414005110860000110010001400361395722580102501002000210000401002000010000124561653039661071052201400301400541400541319653132432701003020010000200006020020000300001400351400511150201100991004010010000100001100100000110000100100001100032101931213971550000131010100001000050100140052140052140055140052140036
60204140051108600000010130001400361395725280102501002000010000401002000010000124564653046051071052201400271400351400891319683132432701003032010000200006020020000300001400351400511150201100991004010010000100001100100000110000000100001100032101931113971550000101010100001000050100140052140052140054140052140052
6020414005410860000000110001400201395722580116501002000210000401002000010000124564653045251071052201400271400351401451319683132432701003032310000200006020020000300001400511400511150201100991004010010000100001100100000110003203100001100032101931113980050000131013100001000050100140052140055140052140055140052
60204140053108500010000120001400201395725080102501002000210000401002000010000124564653045641071052201400271400541401481319683132435701003020010040200006020020000300001400561400521150201100991004010010000100001100100000110000000100001100032101931113980250011101010100001000050100140055140036140036140036140036
60204140145108710000000133880114030513964910180129501312000810003402412007810079125140653067101072406801419501424971425911323233713258070617305641008120162606882016230242140324140240515020110099100401001000010000110010004211000600966010002110003210193111397155000001013100001000050100140052140052140055140052140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3f | 4d | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 61 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
6002514005810850000000014000140039139660258001050010200021000040010200001000012457975307542107174210014002714005414005413200231324687001030020100002000060020200003000014005414005111500211091040010100001000001010000011000000010000101000314038822113972250000131013100001000050010140052140055140055140055140055
600241400511086000000101401014003913965850800125001020002100004001020000100001245824530765610717421001400301401471400541320023132462700103002010000200006002020000300001400541400351150021109104001010000100000101000001100021001000010100031402883413972550000131313100001000050010140055140052140052140055140055
600241400971086000000105900014003913965825800125001020002100004001020000100001245824530765610716479001400301400511400541319833132462700103002010000200006002020000300001400541400351150021109104001010000100000101000001100000001000010100031403883213972550000131315100001000050010140055140055140055140148140055
600241400541086000000001400014003913965925800125001020002100004001020000100001245824530765610717421001400301400541400541320023132462700103002010000200006002020000300001400541400541150021109104001010000100000101000001100008001000010100031402882213972550000131013100001000050010143771143212140403140058140055
60024140054108500001100200014013213965225800125001020002100004001020000100001245797530754210721942001401021400541400541320023132462700103002010000200006002020000300001400541400511150021109104001010000100000101000001100000001000010100031402883313972550000101310100001000050010140036140055140055140148140058
6002414005410850000000020001401321397652580012500102000210000400102000010000124590553076561071742100140099140055140035132002313246270010300201000020000600202000030000140051140059115002110910400101000010000010100002110000006100011010003140288221397065000013130100001000050010140142140055140055140055140055
60024140054108600000000201014003913965825800125001020002100004001020000100381245824530777010717421001400301400511400541320063132463700103002010000200006002020000300001401201400511150021109104001010000100000101000001100000031751000010100031402883313972550000131313100001000050010140056140036140056140056140055
60024140054108500000000140001400361396582580012500102000510000401512008010039124783153096031072286500140189140241142407132065351325667052730262101212008360516201623036114024014014741500211091040010100001000001010005211000200952510002101000382941063413994550030131013100001000050010140334140320140231140407140242
600241404241087000000411400014003813965825800125001020002100004001020000100001245824530765610717421001400301400541400541320023132462700103002010000200006002020000300001400551400511150021109104001010000100000101000001100000061000010000031402883213970650000101010100001000050010140055140055140055140055140055
60024140054108500000000700014003913965825800125001020002100004001020000100001245824530692210717421001400121400351400511319993132462700103002010000200006002020000300001400541400541150021109104001010000100000101000001100001031000010000031402883213972550000151013100001000050010140036140036140055140055140055

Test 3: throughput

Count: 8

Code:

  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  ld1 { v0.d }[1], [x6], x8
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire uop (01) | cycle (02) | 03 | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 23 | 3f | 43 | 46 | 49 | 4e | 4f | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cf | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ld nt uop (e6) | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1602051600531241110000102400160038166159584006424010280100800028000080100800008000053110252907222935460116007501600531600531396880314003224057220080000800002001600001600001601251600531180201100991001008000080000110080000014080010201380010001017005110116011160050008000066800008000080100160056160041160056160041160054
1602041600531286001000139800160038160159584402524010280100800028000080100800008000053099052919422935448016007301600531600401396880314001124010020080000800002001600001600001600531600531180201100991001008000080000010080000014080012001080009611017005110116011160050018000066800008000080100160054160041160041160054160105
16020416005312860000001420016003810615958450252401028010080002800008010080000800005310905292762293544801600340160040160040139675031400112401002008000080000200160000160000160053160053118020110099100100800008000001008000001708000000680010601014005110116011160050008000060800008000080100160054160054160054160056160056
1602041600531286000000558011600401661595844025240100801008000080000801008000081606101568195312122922449016003401600551600531396900314001124010020080000800002001603281600001600401600531180201100991001008000080000010080000014080140001080000601214005110116011160050018000066800008000080100160056160054160054160056160124
160204160055128600000041200160025166159558002524010280100800148000080100800008000053172652980122935460016002101601231600531396903831400112401002008000080000200160000160000160053160040118020110099100100800008000001008000001408001100080012601214005130116011160050018000066800008000080100160054160054160054160054160054
1602041601251285000000100000160038166159588502524010280100800028000080100800008000053109852919422935460016003401600531600531396880314001324010020080331800002001600001600001600401600531180201100991001008000080000010080000000800000098014060916005110116011160050018000096800008000080100160054160056160054160054160054
160204160053128500000063401160038160159208406524070680100800268026080260800008016354350653035722932604016080101601241600531396600271400322405722008016380163200160328160328160123160481341802011009910010080000800000100803920142378026202831580400601314205191143011160170008048000800008000080100160266160339160199160125160323
160204160054128500000036101160025166159588502524010280100800028000080100800008000053109852927722932604016002101600531600531396900314001124010020080000800002001600001600001600551600401180201100991001008000080000010080000014080010120108001000014005110116011160050018000069800008000080100160041160054160041160054160054
16020416005512850000001483011600401661595844025240102801008000280000801008000080000531090541716229354560160036016005316005313969003140011240100200800008000020016000016000016005316005311802011009910010080000800000100800000160800100008001060000051101160111600520180000910800008000080100160054160041160054160041160056
16020416005312850000009701160040166159584402524010280100800028000080100800008000053099452927622935448016003601600561600401396880314001124010020080000800002001600001600001600551600531180201100991001008000080000110080000014080012001080148001014005110116011160050008000067800008000080100160056160056160054160054160054

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 50 | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | e7 | ? int output thing (e9) | ea | eb | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600251600611241001000003100011600461661596008025240014800108000480000800108000080000530373541553229371131160042160061160061139718031400412400102080000800002016000016000016006116006111800211091010800008000001080006023598002600210080149612923630504121653160098180160912800008000080010160275160276160204160133160275
16002416026812421222002245212870000160171160159180101412406148033080052802608065080456804805552095664432292407711600271600461600551397200314004124001020800008000020160000160000160061160053118002110910108000080000010800088230800250026800196172371050203163316005808000090800008000080010160062160062160062160062160047
16002416006112411000000019000016004616015957080252400148001080004800008001080000800005303775290882293711711600421600611600461397120314004124001020800008000020160000160000160061160061118002110910108000080000110800068230800261028800196071770050203162316005808000099800008000080010160056160062160062160062160062
160024160061124110000000440000160046106159600902524001480010800028000080010800008000053037352908922937117116004216006116004613971803140035240010208000080000201600001600001600611600611180021109101080000800001108000762308002700258001861262361050203164316005208000099800008000080010160056160062160062160056160062
160024160061124100110000310000160046066159600802524001480010800028000080010800008000053037752908922937117116004216006116006113971803140041240010208000080000201600001600001600611600611180021109101080000800000108000702308002601258001901252370050203163316005808016699800008000080010160062160062160062160062160062
1600241600611241100110001600011600311661596008025240014800108000480000800108000080000530377529008229375151160042160061160061139718031400412400102080000800002016000016000016006116004611800211091010800008000001080006700800270008001960252300050203163316005818000099800008000080010160062160062160062160062160056
160024160061124110110000210000160046166159600100252400148001080004800008001080000800005303775290892293711311600421600611600611397180314003524001020800008000020160000160000160061160061118002110910108000080000010800077008001300258001960252471050203163216005808000099800008000080010160062160047160062160062160062
1600241600551241100001004701011600461661596001025240014800108000480000800108000080000531121529090229354561160042160061160061139718031400412400102080000800002016000016000016006116006111800211091010800008000001080008800800281025800006110073150203163216005808048096800008000080010160262160258160200160343160260
1600241602731242100200323595246400116026110615927870142240923804908004080390804908060880323567621566372229257381160105160272160275139660926140083240482208049280492201600001600001600611600611180021109101080000800000108000762308002501138001900262370050203163316005818000099800008000080010160062160204160205160189160204
160024160061124110000000430000160046166159600100252400148001080004800008001080000800005303775290892293394411600421600551600611396921851140077240482208000080000201600001600001600611600611180021109101080000800000108000772308002701298001961262460050203163316005818000099800008000080010160062160062160056160062160056