Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

LD1 (multiple, 1 reg, 1D)

Test 1: uops

Code:

  ld1 { v0.1d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 1.000

Issues: 1.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 0.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule ldst uop (55) | dispatch ldst uop (58) | simd uops in schedulers (5a) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd load (98) | inst ldst (9b) | 9d | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | bb | l1d cache miss ld nonspec (bf) | c2 | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | df | e0 | ea | eb | ec | ? ldst retires (ed) | f5 | f6 | f7 | f8 | fd
61005294802197020004004656289070024389100010001000500041596628660292983101000100010002915229150116100110001000010000210000001001213012872910268693108069216383125380617575432846116346138901583010002948829308294082930129406
610042939722010000002004493287770024368100010001000500071595528830294033101000100010002920829205116100110001000010000210010021000213013178928069523042157217593053380721635242832316459138001584110002947529400293492932929423
61004294162214000002004871289180024391100010001000500071596928664294343101000100010002929129204116100110001000010000210020001000202012888904868343062147217683068380918625132837316505138481591810002938229323293452936529412
61004293652203000004004486287750024394100010001000500041596128617294383101000100010002929529262116100110001000010000310000001000203013075923969313063144217183128381419625932842616347137691563010002933529373294092936129516
61004293672203000004004528289920024355100010001000500061597328676294483101000100010002922529182116100110001000010000210000001000203012769916768423056448217913125381015664432837216418136201554610002941829310294162933029354
61004293352193000000004567288510024377100010001000500511595328679294023101000100010002923329262116100110001000010000310000001001203012906936268863092151217543044381019665632837616443138151583310002938729458293052948429421
61004294222212001004104578288260024364100010001000500061596328690294513101000100010002919629130116100110001000010000310002001000213012881922168463065245218443124381016654932840016414139361585010002939329435294212936629406
61004294522203000004004576289230024297100010001000500021596028702293583101000100010002927429269116100110001000010000310000001000213012899909868633077043218383148381014685432842616473139071589910002942329410294182939129442
61004294502202000004104606288311024352100010001000500061596528688294123101000100010002926329280116100110001000010000210000001002213012864915668373040244216773115381119656032854616337137561567010002933329484292802932129371
61004293752203000004104552288091024408100010001000500071596328602293833101000100010002926529176116100110001000010000310000001000313012930920569823095051217423203381514635632840216385138171575110002944829379294102934829360

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.1d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0051

retire uop (01) | cycle (02) | 03 | 0e | 0f | 1e | 22 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
502051200579000010012003611954410944925601034010210001100003010010000100001079008573545561344611120027120051120052113146311366850100302001000010000602001000010000120051120052115020110099100401001000010000010010000110000002710000110321041071311965840002101091000040100120052120052120052120052120052
50204120051899111101200201194931094642560139401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321031072311965840002111091000040100120052120052120052120052120052
50204120051899000101200361195101094642560103401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000100321031073311965840002101091000040100120052120052120052120052120052
50204120051899001101200361195101094642560103401021000110000301001000010000107900857354556136317012002712005112005111314631136685010030200100001000060200100001000012005112005121502011009910040100100001000001001000011000000010000110324931073311965840002101091000040100120052120052120052120052120052
50204120051899001101200361195101094642560103401021000110000301001000010000107900857362366136317112003312005212005111314631136705010030200100001000060200100641000012005112003511502011009910040100100001000001001000011000000010000110321021073311965840002101091000040100120052120052120036120052120052
50204120051899001001200361195101094642560103401021000110000301001000010000107900857362366136317112003312003512005111314631136685010030397100001000060200100001000012005112005111502011009910040100100001000001001000011000000310000110321031073311969540000101091000040100120052120052120052120052120052
50204120051899001101200361195101094645460103401021000110000301001000010000107900857362366136317112001112005112005111314631136685038630200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321021073311965840002101091000040100120052120052120052120052120052
50204120051900001101200361195101094642560103401021000110000301001000010000107900857362366136317112002712005112005111314631136685010030200100001000060200100001000012005112005111502011009910040100100001000001001000011000000010000110321031073311965840002101091000040100120052120052120052120052120036
5020412005189900100120036119510109464256010340102100011000030100100001000010790085736236613631711200271200791200511131463113668501003020010000100006020010000100001200511200511150201100991004010010000100000100100001100000001000011032103107131196584000201091000040100120052120052120052120052120052
502041200518990016101200201195101094492560103401021000110000301001000010000107900857362366136317012007112005112005111314131136585010030200100001000060200100001000012005812005111502011009910040100100001000011001000011000000010000110321031071311965840002101001000040100120052120052120052120052120052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 9.0047

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d cache miss ld (a3) | ld unit uop (a6) | l1d cache writeback (a8) | a9 | ac | af | b5 | l1d cache miss ld nonspec (bf) | c2 | branch cond mispred nonspec (c5) | cd | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
50025120047899000000001001200391195051094632560010400121000910000300101000010000107952357360446133499012002301200481200471131533113674500103002010000100006002010000100001200751200621150021109104001010000100000101000011000070010000110003140510722119651400026651000040010120051120036120036120051120051
50024120047899000000101001200351195081094632560013400121000110000300101000010000107952357360446132757012001101200501200501131653113674500103002010000100006002010000100001200681200371150021109104001010000100000101000001000100010000010003140210711119665400000081000040010120036120036120036120051120053
5002412004790000000081000012002011949210946725600104002010005100003001010000100001079548573545561327570120026012005012005211315353113689500103002010000100006002010000100001201071200891150021109104001010000100000101000011000000010006110003140213122119650400029081000040010120036120051120036120048120036
50024120037899000000001001200321195081094632560010400101000110000300101010710000107952357361886133652012001101200471200351131533113689500103002010000100006002010000100001200771200761150021109104001010000100000101000011000000310000112003140110711119665400009081000040010120056120036120051120051120051
500241204318990000001300001200321195081094492560013400121000110000300101000010000107951757361886132757012002601200501200501131683113674504963002010000100006002010000100001200971200811150021109104001010000100001101000001000000310000110003140110711119662400029901000040010120051120048120051120050120048
500241200478990000002201001200201195081094492560013400121000110000300101015710000107954857354556133652012001101200351200501131683113674500103002010216100006002010000100001200781200731150021109104001010000100000101000011000000010000010003140110711119665400029051000040010120048120048120051120051120036
500241200508990000000000012003511949210944925600134001010000100003001010000100001079523573623661327570120023012005012005311328131136785001030020100001000060020100001000012008212006311500211091040010100001000001010004110006001433410000112003140112421119962400000981000040010120594120587120324120051120042
50024120035899101004187740001205761195081097515460013400121000910000300101000010049107954857361886133499012010701209601204501131573113689500103097810000100006002010000100001200901200542150021109104001010000100000101000511000404363510000110003140210722119665400029081000040010120052120053120147120036120154
500241200358990000001800001200351194941094632560013400101000110000300101000010000107951757361886144281012002601200511204171131683113689500103002010000100006002010107100001200771200841150021109104001010000100000101000011000010010000012003170211922119739400020681000040010120051120055120048120036120148
50024120047899000111138810012003611950810944925600134001210000100003001010000100001079523573604461327570120029012005012005011315331136765001030020100001000060020100001000012010512004011500211091040010100001000001010000110000110310000100003140210731119665400089001000040010120036120051120036120051120036

Test 3: throughput

Count: 8

Code:

  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  ld1 { v0.1d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | 92 | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
8020526737200111100671032672137719258010010080000100800005001167224267120267362673616659316695801002008000020080000267382674511802011009911001008000080000010080019194308005900461800396119431915110116112673313135800001002697026753267152672726737
8020426736200110000671022669927720258010010080000100800005001169457267110267362673616659316694801002008000020080000267372674711802011009901001008000080000110080020204308005900061800406158431905110116112671113135800001002671526738267372673726738
8020426736200111100670022669937121925801001008000010080000500116755526711326736267141665931669580100200800002008000026736267361180201100990100100800008000001008001920008001900061800396119431905110116112671113130800001002673726737267372673726737
8020426736200100000210022672227720258010010080130100800005001167316267110267362673616658316694801002008000020080000268932679811802011009901001008000080000010080019194308005810064800406119431905110116112673313135800001002673826737267372673826715
80204267362001011006700226721207202580100100800001008000050011675552668902673626737166593166948010020080000200800002674526740118020110099010010080000800000100800191900800590016080040615901915110116112673313135800001002673726738267372673726737
80204267362001111006700226721377125801001008000010080000500116986826711026714267361663731667280100200800002008000026724267371180201100990100100800008000001008002020430800190016180041611901915110116112673313135800001002673726738267372673726738
8020426736200101100671032672139719258010010080000100800005001169868267110267362673616659316695801002008000020080000267442673611802011009911001008000080000010080020204308005800160800406159451905110116112673313135800001002673826737267372673826737
8020426736200110000670022669937720258010010080000100800005001167555267120267372673716659316694801002008000020080000269012681611802011009901001008000080000010080019194308005800061800006159431915110116112671513135800001002673726737267372673726758
80204267372001011006700326721377202580100100800001008000050011675552671202673726737166593166948010020080000200800002689926807118020110099010010080000800000100800202043080060101618003961594319151101161126733005800001002673726738267372673726738
8020426736200110000660032672237719258010010080000100800005001168529267110267362673616659316695801002008000020080000267442673611802011009901001008000080000010080019204308005910161800406119431915110116112673413135800001002673726737267152673726738

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire uop (01) | cycle (02) | 03 | l1d tlb fill (05) | mmu table walk data (08) | 09 | l2 tlb miss data (0b) | 0e | 0f | 1e | 22 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule ldst uop (55) | dispatch int uop (56) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | 5f | 60 | 69 | 6b | 6d | 6e | map stall dispatch (70) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map int uop inputs (7f) | map ldst uop inputs (80) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd load (98) | inst ldst (9b) | 9d | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss ld (a3) | a5 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | b5 | b6 | bb | l1d cache miss ld nonspec (bf) | l1d tlb miss nonspec (c1) | c2 | branch cond mispred nonspec (c5) | branch mispred nonspec (cb) | cf | d2 | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
80025267302000110006501266932018125080010108000010800005011676600026708026736267151668431670780010208000020800002695326747118002110910108000080000110800000008003900042800396119411910050200101614122673009280000102673326733267332673326716
800242673220010000066022670301818025800101080000108000050116530400267070267322673616660316713800102080000208000026966267511180021109101080000800000108001919420800580015980037011942191005020081610726725100080000102670926728267132672826728
8002426728200000000002267132180162580010108000010800005011678670026697026728267121667231671180010208000020800002694626735118002110910108000080000010800000390800000004280000010000005020012161213267191010480000102672926728267232670926709
8002426732200000000000267072121202580010108000010800005011664770026708026729267271665231669080010208000020800002672926725118002110910108000080000010800000390800000100800390135430000505009161072672460280000102672326728267282672326723
80024267282000000004111266932000258001010800001080000501166750002668302672726727166723167088001020800002080000267282672211800211091010800008000001080000043080039000080039613943000050200916128267191010080000102670926728267092672826723
800242672220000000041022670700120258001010800001080000501166750002669702672726708166523166888001020800002080000267082672211800211091010800008000011080000000800000004280035610430000502001516101026705610280000102670926721267292670926723
80024267082000000005702266930012162580010108000010800005011667500026683026728267081665531670780010208000020800002672226724118002110910108000080000010800000390800350003580000613939000050200121610102672406480000102672926709267262672826723
8002426728200000000450026693212002580010108000010800005011668960126703026727267271666713166888001020800002080000267272672211800211091010800008000001080000043080039000080000613500000502008167122672400480000102672926723267282670926723
80024267272000000004501267122180162580010108000010800005011688430026702026728267271667231668880010208000020800002672826729118002110910108000080000010800000390800350003980000000430000502001316121226705100480000102670926723267232672326729
800242672720100000141122669321201225800101080000108000050116734300266830267282672216672316702800102080000208000026847268091180021109101080000800001108000003908003900035800396039430000502001316712267191010480000102672826709267282672826728