Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LDNP (signed offset, D)

Test 1: uops

Code:

  ldnp d0, d1, [x6, #0x10]
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires: 2.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 08 | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
200540220004410138321116251000100010001501437339839896312710002000100039839411100110001000010004310380421038613943731162239110391010410001000395395395397395
200439430004510138320116251000100010001528437339839896313110002000100039839411100110001000010004310390381039613943731162239810381410710001000395400399399395
200439730004410138321119251000100010001528837339839896312710002000100039839411100110001000010004310390391039614043731162239110391010410001000395395395395397
2004394200017600137921119251000100010001528837339839896313110002000100039839411100110001000010004310380381038613844731162239510381414710001000399399399399395
2004398300044001383211216251000100010001501436939839496312710002000100039839411100110001000010004310390391039613943731162139110391010410001000395400400398395
20043943000450013792121216251000100010001506537239839896313110002000100039439411100110001000010004410380401038613843731162239110381414710001000399399399399399
200439830004400138301116251000100010001528437339839496313110002000100039439411100110001000010004410380381038613844731161139510381414710001000399399399399399
200439830004510139721116251000100010001505337339839892313110002000100039839411100110001000010004410380381038613844731162239110381414410001000395399399399399
2004394300044101380211219251000100010001528437339839896313110002000100039839411100110001000110004310380381038613844731162239110381414410001000399399399399401
200439830014400138301116251000100010001528837339839496313110002000100039839411100110001000010004410380381038613844731162239510391410410001000399399399400399

Test 2: Latency 1->3 roundtrip

Chain cycles: 3

Code:

  ldnp d0, d1, [x6, #0x10]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 07 | 08 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c5 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051200478991011112724358223761001123126121373110237106460450403861006910062349151141411664115851457972036191066112218301235021238641126820647114125603383699723942123677241612233118591240961241033715020110099100401001000010000010010052011000000001000011000326261357711965710000400021868100001000040100120053120048120048120048120048
602041200389301110110040100112003511953110945925601034010210000100003025110000100001079126573529361183371120011012005112003511189603112374501003020020000100006020010000100001200561200351150201100991004010010000100000100100000110000000010000110003214713578119646100004000210109100001000040100120052120055120036120052120036
60204120035899101000001000111200391195031094612560103401001000110000301001000010057108005657357256118285112003101200541200561118960311240750100302002000010000602001000010000120035120054115020110099100401001000010000010010000011000000031000011000321471356611966610000400020012100001000040100120055120055120036120055120036
602041200559001010000010001112002011950910946125601004010210001100003010010000100001079153573608461182850120030012005412005411190003112407501003020020000100006020010000100001200541200351150201100991004010010000100001100100000110000000010000010003212812176119666100004000213130100001000040100120036120036120052120055120036
602041200358991010000010000112003911950910944325601034010210001100003010010000100001079441573623061175990120030012003512005111188103112407501003020020000100006020010000100001200511200351150201100991004010010000100000100100000010000000010000100003212813578119666100004000210130100001000040100120036120055120036120055120036
60204120035900101000001000011200201195031094612560103401021000110000301001000010000107939657362306118285012001101200541200881118810311241750100302002000010000602001000010000120035120051115020110099100401001000010000010010000011000000001000010000321281358711964610000400020100100001000040100120052120052120036120060120055
602041200358991010000010010112002011950310946125601034010210000100003010010000100001079396573623061185401120030012005412003511189603112407501003020020000100006020010000100001200361200511150201100991004010010000100001100100000110000000010000110003214712187119666100004000201012100001000040100120036120055120036120055120036
6020412005489910100000100001120020119503109443256010040102100011000030100100001000010793965735293611828501200300120051120054111882031124075010030200200001000060200100001000012005412003511502011009910040100100001000001001000001100000000100001100032127108861196461000040002131312100001000040100120055120055120036120036120055
60204120054899101000000000021200201195031094592560100401001000110000301001000010000107955257362306118285012003001200351200541119000311237450100302002000010000602001000010000120054120051115020110099100401001000010000010010000001000000001000010000321461358811966610000400000012100001000040100120036120036120055120055120036
602041200358991010000010010112003911950310944325601034010210001100003010010000100001079396573623061175990120030312005412005411190003112407501003020020000100006020010000100001200541200511150201100991004010010000100000100100000110000000010000100003212812177119666100004000013100100001000040100120052120055120055120036120055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025120047899000010012002011948310944325600134001010001100003001010000100001079809573603561243440120023120047120050111903311244750010300202000010000600201000010000120068120047115002110910400101000010000010100000110000000100001010031410399331196501000040002900100001000040010120051120048120048120048120048
600241200478990010110120035119489109443256001340012100001000030010100001000010798095735888612558101200261200351200501119183112437500103002020000100006002010000100001200971200791150021109104001010000100000101000001100000012100000010031410399331196681000040002668100001000040010120048120048120051120048120036
60024120101899000011012003211948310944325600134001210001100003001010000100001079845573603561243440120011120035120047111915311243750010300202000010000600201000010000120091120093115002110910400101000010000010100000110000000100001010031410399331196501000040002905100001000040010120036120036120051120036120036
60024120047899001011012003511948010945825600134001210000100003001010000100001079769573603561255810120026120050120050111918311244450010300202000010000600201000010000120085120050115002110910400101000010000010100000010000003100001010031410399331196501000040002068100001000040010120051120051120036120051120036
60024120047899000011012003511948910945825600104001210001100003001010000100001079769573603561255810120026120035120035111903311244750010300202000010000600201000010000120088120430115002110910400101000010000010100000110000000100001000031410399331196501000040002068100001000040010120036120048120036120051120036
60024120050899000011012003511948910945525600134001210001100003001010000100001079769573603561243440120011120035120035111903311243750010300202000010000600201000010000120072120056115002110910400101000010000010100000110000000100000010031410399331196651000040000008100001000040010120051120036120051120048120048
6002412003589900001000120032119483109458256001040012100011000030010100001000010798095735293612558101200261200351200501119183112437500103002020000100006002010000100001201071200661150021109104001010000100000101000101100001036100001010031410399331196681000040002008100001000040010120036120036120052120139120270
60024120035900100010012003211948910944325600134001210001100003001010000100001079809573603561255810120026120050120050111921311244750010300202000010000600201000010000120099120082115002110910400101000010000010100000110000100100001010031410399331196501000040000905100001000040010120053120264120051120341120334
60024120050902000110012003211948310944325600104001010001100003001010000100001079809573603561245000120013120050120050111915311244750010300202000010000600201000010000120091120062115002110910400101000010000010100000010000103100001010031410399331196681000040000665100001000040010120436120819120052120051120051
60024120431899000010012002011948010945825600104001210001100003001010000100001079769573529361243440120029120035120035111903311243750010300202000010064600201000010000120086120415115002110910400101000010000010100000010000103615100000000031410399331196681000040000900100001000040010120036120051120051120051120048

Test 3: Latency 2->3 roundtrip

Chain cycles: 3

Code:

  ldnp d0, d1, [x6, #0x10]
  fmov x1, d1
  eor x8, x8, x1
  eor x8, x8, x1
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0055

retire (01) | cycle (02) | 03 | 08 | 09 | 0e | 0f | 18 | 1e | 23 | 3a | 3f | 43 | 49 | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60205120061899020001001200401011953410946225601064010410001100003010010000100001079417573627861190810120031012005512005511190131124205010030200200001021360200100001000012004312005511502011009910040100100001000001001000002100010011000111132101110211196661000140004660100001000040100120056120056120045120056120044
6020412004389900000200120028111195341094622560103401041000210000301001000010000107941757356926117652012003101200551200431119013112419501003020020000100006020010000100001200551200551150201100991004010010000100000100100000210001001100011113210297121196691000140004065100001000040100120056120056120044120044120056
6020412004390000010112101200401111949910946225601064010210002100003010010000100001079417573627861190810120031012005512005511189931124205010030200200001000060200100001000012004312004311502011009910040100100001000001001000002100010011000111132101110211196661000140004665100001000040100120044120056120056120056120056
60204120055899000002001200401111953410946225601064010210002100003010010000100001079417573627861190810120031012005512005511190131124545010030200200001000060200100001000012004312005511502011009910040100100001000001001000000100011011000101132101110121196661000140004665100001000040100120056120044120056120056120056
60204120055899000001001200281011949910946225601034010210002100003010010000100001079417573627861190810120031012005512005511190131124195010030200200001000060200100001000012005512005511502011009910040100100001000001001000000100010011000111032102110211196661000140002665100001000040100120056120056120056120044120056
6020412005590000000200120040111194991094622560106401041000210000301001000010000107926957362786117652012003101200551200441119013112419501003020020000100006020010000102161200451200781150201100991004010010000100000100100000010001001100010113210197111196661000140002000100001000040100120056120056120044120056120044
6020412005589900110110120028111194991094512560103401041000210000301001000010000107926957362786119081012003101200551200551119013112419501003020020000100006020010000100001200551200551150201100991004010010000100000100100000210001001100011113210297121196501000140004605100001000040100120056120056120056120056120056
60204120055899000002001200281111949910946225601064010410002100003010010000100001079269573627861190810120031012005512005511189931124195010030200200001000060200100001000012004312005511502011009910040100100001000001001000002100010011000111132101110221196501000140004065100001000040100120056120056120056120044120056
60204120055899000002001200401111949910945125601064010410002100003010010000100001079269573627861190810120019012004312004311189931124195010030200200001000060200100001000012005512004311502011009910040100100001000001001000002100010011000301132102110221196661000140004065100001000040100120056120044120056120056120056
60204120055899000102001200281011949910946225601064010410002100003010010000100001079269573627861190810120031012005512005511190131124195010030200200001000060200100001000012004312004311502011009910040100100001000001001000000100010011000111032101110121196661000140004665100001000040100120056120056120044120056120056

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0054

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | ac | af | b5 | dcache load miss (bf) | c2 | branch mispredict (cb) | cf | d0 | d5 | d6 | db | dd | inst fetch restart (de) | e0 | e6 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002512003589900100100010000120032119480109443256001340012100091000030010100001000010798095735293612558111200120120035120047111915311243750010300202000010000600201000010000120437120051115002110910400101000010000010100000110000030100001100314002940111196681000040002665100001000040010120048120049120446120048120036
6002412014890000000000010100120032119480109455256001340012100001000030010100001000010798095735293612434411200230120047120047111915311244450010306662010810000600201000010000120047120035115002110910400101000010000110100000110000000100001000314001940111196501000040000068100001000040010120051120036120048120048120036
6002412005089900100000010000120032119483109458256001340010100011000030151100521000010798455736035612434401200260120050120050111915311244750010300202000010000600201000010000120047120035115002110910400101000010000010100000110000000100001100314001990111196681000040000690100001000040010120036120048120036120051120036
6002412033689900000000410100120035119483109443256001340010100011000030010100001000010798455736035612434411200260120035120050111918311244750253300202000010000600201000010000120050120047115002110910400101000010000010100000110000003100051100314001940111196651000040002665100001000040010120048120036120048120048120048
6002412004789920100110010100120032119480109455256001040012100011000030010101581000010797695735293612434411200230120047120047111935311244450010300202000010000600201000010000120047120047115002110910400101000010000010100000010000000100000000314001990111196651000040002605100001000040010120048120441120036120048120036
6002412004789900010004010000120020119480109443256001340010100091000030010100001000010798095735293612434411200250120047120047111915311243750010300202000010000600201000010000120047120047115002110910400101000010000010100000110000000100001100314001990311196651000040000665100001000040010120036120048120036120048120036
60024120047899000000000523521001200321194801094551056001040012100001000030010100001000010798095735293612434411200230120047120047111915311243750010306592010810000600201000010000120035120047115002110910400101000010000010100000110000000100001100314001990161196501000040002665100001000040010120048120048120036120048120051
60024120047899000000010270000120020119489109455256001040012100001000030010100001000010847815735293612558111200230120049120047111915311244450010300202000010000600201000010000120047120047115002110910400101000010000010100000110000000100001000314001940111196501000040002665100001000040010120036120048120048120036120420
600241200508990000000003701001200321194811094432560010400121000110000300101000010000107982757443506125737112001401200351200471119153112444500103002020000100006002010000100001200471200471150021109104001010000100000101000001100000001000001003140021270111196651000040002660100001000040010122253122314122421122448122408
600241223339174110000252533132200100122543120658110183256001040078100001000030010100001000010798455738245612434401219990122526122542112414382113294563623423922760114556809411181111281226401225642715002110910400101000010000010100430110037179617010027112042500742501091225101004840238998100001000040010122849122625120037120051120051

Test 4: throughput

Count: 8

Code:

  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  ldnp d0, d1, [x6, #0x10]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160205267292001000004410126712200162580100100800001008000050011688801266822670726727663003678780100200160000200800002672726727118020110099100100800008000001008000043800390398003961394405110516312670480038010780000800001002673226732267282673226728
16020426727200000100441012671201119258010010080000100800005001174628126706267072673166300367518010020016000020080000267312670711802011009910010080000800000100800004380000038800386104405110316532670480000014780000800001002670826708267322673226732
16020426731200000000440012669401200258010010080000100800005001168627126682267072670766300367978010020016000020080000267312672711802011009910010080000800000100800000800381418000001044051103164226724800001010480000800001002672826728267322673226732
16020426707200000000450012671600019258010010080000100800005001168880126706267312673166540366658010020016000020080000267312672711802011009910010080000800000100800004380000038800386138440511041644267048003800080000800001002670826732267282672826734
16020426707200000000010126716210025801001008000010080000500116908512670626707267316654036805801002001600002008000026707267271180201100991001008000080000010080000438003803980038613844051104164226728800001414080000800001002673226732267082672826732
16020426731200000000440012679501002580100100800001008000050011688801267062673126707665003680080100200160000200800002670726727118020110099100100800008000001008000043800380398003961380051105164226728800001410780000800001002670826732267322673226728
160204267072000000004401026692212121925801001008000010080000500117488712670926731267316654036665801002001600002008000026727267071180201100991001008000080000010080001438003813980039613844051106164226728800381414780000800001002672826728267282672826711
160204267532010100005301126692011202580100100800001008000050011688801267062673126707663003674080100200160000200800002673126707118020110099100100800008000001008000043800380388003860390051105163126728800381414480000800001002673226732267082670826732
1602042673120000000044010267162100468010010080000100800005001168627126706267312672766300367608010020016000020080000267072672711802011009910010080000800000100800004380038038800386138005110516412672880000014080000800001002670826708267322673226732
16020426731200000000300126716012119258010010080000100800005001174628126706267272670766300367578010020016000020080000267312672711802011009910010080000800000100800000800380428003861000511041643267288003800780000800001002670826708267082670826708

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3341

retire (01) | cycle (02) | 03 | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | e6 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025267282001000960000266922120162580010108000010800005011687541267112673626782666203670780010201600002080000267272672711800211091010800008000001080000438003900800006039430502091666267248003901008000080000102672826708267282672826728
1600242672820011004501002671221212025800101080000108000050117318302671126827267356688036707800102016000020800002672726727118002110910108000080000010800004380039039800396139430502061666267248000010048000080000102672826728267282672826728
1600242672720000004500002669221201625800101080000108000050117318302671126848267206672036707800102016000020800002672726707118002110910108000080000010800004380039139800396139430502051687267248003901048000080000102672826708267282672826729
1600242689620100111774401012671221212162580010108000010800005011687541267112670726727672803671280010201607562080192268942672721800211091010800008000001080000438000001114800396139430504271665267248026010048000080000102672826728267282672826728
160024267272000000450000266922121233898801401080130108000050117462802671126739268376673036707800102016000020800002672726727118002110910108000080000010800000800000428003961394305020716772670780039101048000080000102673326710267322672826730
16002426727200000054010126719212120258001010800001080000501174628126689267392674466740367078001020160000208000026727267271180021109101080000800000108000043800390398003901394305020716562672480039101048000080000102672826728267082672826728
16002426727200000000101266922120225800101080000108000050117318302671126736267436672036707800102016000020800002672726727118002110910108000080000010800004380039008003901394305020516752670480039101048000080000102672826728267082672826728
1600242672720000004501012669221212162580010108000010800005011688800267112673426738667203668780010201600002080000267272672711800211091010800008000001080000438003904280039600005020616652670480039101048000080000102672826708267282670826728
160024267272000040930101266922121280258001010800001080000501170007026711267292672866720366878001020160000208000026727267071180021109101080000800000108000043800390398003961394305020516642672680039101048000080000102670826728267282672826728
16002426707200000045010126712001216258001010800001080000501168754126873267282679766820367078001020160000208000026727267271180021109101080000800001108000043800390398003961394305020716572672480039101048000080000102672826731267282670826728