Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, one register, 1D)

Test 1: uops

Code:

  ld1 { v0.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire (01) | cycle (02) | 03 | 04 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue ld/st (55) | ~dispatch ld/st (58) | huge thing ld/st (5a) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | df | e0 | ea | eb | ec | ld/st retires (ed) | f5 | f6 | f7 | f8 | fd
61005294802197020004004656289070024389100010001000500041596628660292983101000100010002915229150116100110001000010000210000001001213012872910268693108069216383125380617575432846116346138901583010002948829308294082930129406
610042939722010000002004493287770024368100010001000500071595528830294033101000100010002920829205116100110001000010000210010021000213013178928069523042157217593053380721635242832316459138001584110002947529400293492932929423
61004294162214000002004871289180024391100010001000500071596928664294343101000100010002929129204116100110001000010000210020001000202012888904868343062147217683068380918625132837316505138481591810002938229323293452936529412
61004293652203000004004486287750024394100010001000500041596128617294383101000100010002929529262116100110001000010000310000001000203013075923969313063144217183128381419625932842616347137691563010002933529373294092936129516
61004293672203000004004528289920024355100010001000500061597328676294483101000100010002922529182116100110001000010000210000001000203012769916768423056448217913125381015664432837216418136201554610002941829310294162933029354
61004293352193000000004567288510024377100010001000500511595328679294023101000100010002923329262116100110001000010000310000001001203012906936268863092151217543044381019665632837616443138151583310002938729458293052948429421
61004294222212001004104578288260024364100010001000500061596328690294513101000100010002919629130116100110001000010000310002001000213012881922168463065245218443124381016654932840016414139361585010002939329435294212936629406
61004294522203000004004576289230024297100010001000500021596028702293583101000100010002927429269116100110001000010000310000001000213012899909868633077043218383148381014685432842616473139071589910002942329410294182939129442
61004294502202000004104606288311024352100010001000500061596528688294123101000100010002926329280116100110001000010000210000001002213012864915668373040244216773115381119656032854616337137561567010002933329484292802932129371
61004293752203000004104552288091024408100010001000500071596328602293833101000100010002926529176116100110001000010000310000001000313012930920569823095051217423203381514635632840216385138171575110002944829379294102934829360

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.1d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
502051200579000010012003611954410944925601034010210001100003010010000100001079008573545561344611120027120051120052113146311366850100302001000010000602001000010000120051120052115020110099100401001000010000010010000110000002710000110321041071311965840002101091000040100120052120052120052120052120052
50204120051899111101200201194931094642560139401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321031072311965840002111091000040100120052120052120052120052120052
50204120051899000101200361195101094642560103401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000100321031073311965840002101091000040100120052120052120052120052120052
50204120051899001101200361195101094642560103401021000110000301001000010000107900857354556136317012002712005112005111314631136685010030200100001000060200100001000012005112005121502011009910040100100001000001001000011000000010000110324931073311965840002101091000040100120052120052120052120052120052
50204120051899001101200361195101094642560103401021000110000301001000010000107900857362366136317112003312005212005111314631136705010030200100001000060200100641000012005112003511502011009910040100100001000001001000011000000010000110321021073311965840002101091000040100120052120052120036120052120052
50204120051899001001200361195101094642560103401021000110000301001000010000107900857362366136317112003312003512005111314631136685010030397100001000060200100001000012005112005111502011009910040100100001000001001000011000000310000110321031073311969540000101091000040100120052120052120052120052120052
50204120051899001101200361195101094645460103401021000110000301001000010000107900857362366136317112001112005112005111314631136685038630200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321021073311965840002101091000040100120052120052120052120052120052
50204120051900001101200361195101094642560103401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321031073311965840002101091000040100120052120052120052120052120036
5020412005189900100120036119510109464256010340102100011000030100100001000010790085736236613631711200271200791200511131463113668501003020010000100006020010000100001200511200511150201100991004010010000100000100100001100000001000011032103107131196584000201091000040100120052120052120052120052120052
502041200518990016101200201195101094492560103401021000110000301001000010000107900857362366136317012007112005112005111314131136585010030200100001000060200100001000012005812005111502011009910040100100001000011001000011000000010000110321031071311965840002101001000040100120052120052120052120052120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | dcache load miss (bf) | c2 | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
50025120047899000000001001200391195051094632560010400121000910000300101000010000107952357360446133499012002301200481200471131533113674500103002010000100006002010000100001200751200621150021109104001010000100000101000011000070010000110003140510722119651400026651000040010120051120036120036120051120051
50024120047899000000101001200351195081094632560013400121000110000300101000010000107952357360446132757012001101200501200501131653113674500103002010000100006002010000100001200681200371150021109104001010000100000101000001000100010000010003140210711119665400000081000040010120036120036120036120051120053
5002412004790000000081000012002011949210946725600104002010005100003001010000100001079548573545561327570120026012005012005211315353113689500103002010000100006002010000100001201071200891150021109104001010000100000101000011000000010006110003140213122119650400029081000040010120036120051120036120048120036
50024120037899000000001001200321195081094632560010400101000110000300101010710000107952357361886133652012001101200471200351131533113689500103002010000100006002010000100001200771200761150021109104001010000100000101000011000000310000112003140110711119665400009081000040010120056120036120051120051120051
500241204318990000001300001200321195081094492560013400121000110000300101000010000107951757361886132757012002601200501200501131683113674504963002010000100006002010000100001200971200811150021109104001010000100001101000001000000310000110003140110711119662400029901000040010120051120048120051120050120048
500241200478990000002201001200201195081094492560013400121000110000300101015710000107954857354556133652012001101200351200501131683113674500103002010216100006002010000100001200781200731150021109104001010000100000101000011000000010000010003140110711119665400029051000040010120048120048120051120051120036
500241200508990000000000012003511949210944925600134001010000100003001010000100001079523573623661327570120023012005012005311328131136785001030020100001000060020100001000012008212006311500211091040010100001000001010004110006001433410000112003140112421119962400000981000040010120594120587120324120051120042
50024120035899101004187740001205761195081097515460013400121000910000300101000010049107954857361886133499012010701209601204501131573113689500103097810000100006002010000100001200901200542150021109104001010000100000101000511000404363510000110003140210722119665400029081000040010120052120053120147120036120154
500241200358990000001800001200351194941094632560013400101000110000300101000010000107951757361886144281012002601200511204171131683113689500103002010000100006002010107100001200771200841150021109104001010000100000101000011000010010000012003170211922119739400020681000040010120051120055120048120036120148
50024120047899000111138810012003611950810944925600134001210000100003001010000100001079523573604461327570120029012005012005011315331136765001030020100001000060020100001000012010512004011500211091040010100001000001010000110000110310000100003140210731119665400089001000040010120036120051120036120051120036

Test 3: throughput

Count: 8

Code:

  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
8020526737200111100671032672137719258010010080000100800005001167224267120267362673616659316695801002008000020080000267382674511802011009911001008000080000010080019194308005900461800396119431915110116112673313135800001002697026753267152672726737
8020426736200110000671022669927720258010010080000100800005001169457267110267362673616659316694801002008000020080000267372674711802011009901001008000080000110080020204308005900061800406158431905110116112671113135800001002671526738267372673726738
8020426736200111100670022669937121925801001008000010080000500116755526711326736267141665931669580100200800002008000026736267361180201100990100100800008000001008001920008001900061800396119431905110116112671113130800001002673726737267372673726737
8020426736200100000210022672227720258010010080130100800005001167316267110267362673616658316694801002008000020080000268932679811802011009901001008000080000010080019194308005810064800406119431905110116112673313135800001002673826737267372673826715
80204267362001011006700226721207202580100100800001008000050011675552668902673626737166593166948010020080000200800002674526740118020110099010010080000800000100800191900800590016080040615901915110116112673313135800001002673726738267372673726737
80204267362001111006700226721377125801001008000010080000500116986826711026714267361663731667280100200800002008000026724267371180201100990100100800008000001008002020430800190016180041611901915110116112673313135800001002673726738267372673726738
8020426736200101100671032672139719258010010080000100800005001169868267110267362673616659316695801002008000020080000267442673611802011009911001008000080000010080020204308005800160800406159451905110116112673313135800001002673826737267372673826737
8020426736200110000670022669937720258010010080000100800005001167555267120267372673716659316694801002008000020080000269012681611802011009901001008000080000010080019194308005800061800006159431915110116112671513135800001002673726737267372673726758
80204267372001011006700326721377202580100100800001008000050011675552671202673726737166593166948010020080000200800002689926807118020110099010010080000800000100800202043080060101618003961594319151101161126733005800001002673726738267372673726738
8020426736200110000660032672237719258010010080000100800005001168529267110267362673616659316695801002008000020080000267442673611802011009901001008000080000010080019204308005910161800406119431915110116112673413135800001002673726737267152673726738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | 5f | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map lookup int (7f) | ~map lookup ld/st (80) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
80025267302000110006501266932018125080010108000010800005011676600026708026736267151668431670780010208000020800002695326747118002110910108000080000110800000008003900042800396119411910050200101614122673009280000102673326733267332673326716
800242673220010000066022670301818025800101080000108000050116530400267070267322673616660316713800102080000208000026966267511180021109101080000800000108001919420800580015980037011942191005020081610726725100080000102670926728267132672826728
8002426728200000000002267132180162580010108000010800005011678670026697026728267121667231671180010208000020800002694626735118002110910108000080000010800000390800000004280000010000005020012161213267191010480000102672926728267232670926709
8002426732200000000000267072121202580010108000010800005011664770026708026729267271665231669080010208000020800002672926725118002110910108000080000010800000390800000100800390135430000505009161072672460280000102672326728267282672326723
80024267282000000004111266932000258001010800001080000501166750002668302672726727166723167088001020800002080000267282672211800211091010800008000001080000043080039000080039613943000050200916128267191010080000102670926728267092672826723
800242672220000000041022670700120258001010800001080000501166750002669702672726708166523166888001020800002080000267082672211800211091010800008000011080000000800000004280035610430000502001516101026705610280000102670926721267292670926723
80024267082000000005702266930012162580010108000010800005011667500026683026728267081665531670780010208000020800002672226724118002110910108000080000010800000390800350003580000613939000050200121610102672406480000102672926709267262672826723
8002426728200000000450026693212002580010108000010800005011668960126703026727267271666713166888001020800002080000267272672211800211091010800008000001080000043080039000080000613500000502008167122672400480000102672926723267282670926723
80024267272000000004501267122180162580010108000010800005011688430026702026728267271667231668880010208000020800002672826729118002110910108000080000010800000390800350003980000000430000502001316121226705100480000102670926723267232672326729
800242672720100000141122669321201225800101080000108000050116734300266830267282672216672316702800102080000208000026847268091180021109101080000800001108000003908003900035800396039430000502001316712267191010480000102672826709267282672826728