Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single structure, D)

Test 1: uops

Code:

  ld1 { v0.d }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire (01)cycle (02)03070a0e0f1e223a3f464951inst issue (52)~issue fp/simd (54)~issue ld/st (55)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing ld/st (5a)huge thing fp/simd (5b)5f696d6escheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op ld/st (7d)~map op fp/simd (7e)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst fp/simd load (98)inst ldst (9b)9da0a3a6a8a9acafb5bbdcache load miss (bf)c2cfd0d2icache miss (d3)itlb miss (d4)d5d6d9dadbddinst fetch restart (de)e0eaebecld/st retires (ed)? fp/simd (ee)f5f6f7f8fd
620052942622125221040455428762001735320011002100010001000500011899522695290712929431020001000100010002000291612923711610011000100011000310000031002210013582925168573062989208413087381421666528410163431395415345100010002933029360294022942929214
6200429353219232700004629288920017241200210021000100010005000119074226002902929286310200010001000100020002917029149116100110001000110002100000010003130130789245680230031464208073061381316656228457162181418315322100010002934729309293742917629432
6200429335220252000004582288450017296200110011000100010005000119410226472908629204310200010001000100020002909129145116100110001000010003100000010002030132199168684330591565206883055381818646728467162501387715262100010002936629356293122935529482
620042928922024250000463928846001732720011000100010001000500011944022633291122919131020001000100010002000291262914511610011000100011000210000059710010020128319418686231491460206973047381513646228358164811410515236100010002935829320293052929029329
620042942822027200000464728818101736320011001100010001000500011935422662291312933431020001000100010002000291382913211610011000100001000210000001000300012966932867473050863207763060381312746228379162931377415209100010002926629301293632922529358
6200429412220242900804595288131017273200110011000100010005000119543226342914329276310200010001000100020002916129316116100110001000010000100000010002030129249277686430131063207113104381314676328450162271386715256100010002938629315293572933629293
6200429333219243501014596288251017319200110011000100010005000119030226382911029419310200010001000100020002911229056116100110001000110003100000010003000129659252685430971162206053116381514646728434161381381515221100010002928029218292882926729220
6200429277220202100204635288940017308200110011000100010005000119433225732908829268310200010001000100020002914029109116100110001000010000100200010012020135179248681031091363206663284381118606528409159341391115410100010002929529239293592931429275
6200429328220272400004541288290117196200010011000100010005000119070226412903529348310200010001000100020002920529116116100110001000010000100100010003000129829191693630781464206673229381117696328376163441398415168100010002926329367292602929529358
620042936221919220001464728794011731020011001100010001000500011944822654290692914831020001000100010002000291672915611610011000100001000010000031001203012930920268623083963206803149381111646228679163531394015324100010002925529434293552939129404

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.d }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01)cycle (02)0308090a0e0f18191e1f2223243f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696d6edispatch stall (70)71scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a6a8a9acafb5dcache load miss (bf)branch mispredict (cb)cdcfd5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60205140051104901000001010014003613941112936325701024010020002100003010020000100001264020669373414310939114002714005114005113179703132399601003020010000200006020010000300001400511400511150201100991004010010000100000100100000110000000100001100321021262213955940000101010100001000040100140036140036140052140052140036
60204140035104900000001010014003613941112936325701024010020002100003010020000100001264020669373414310939114002714005114003513179703132399604253020010000200006020010000300001400511400511150201100991004010010000100000100100000010000000100001100321021262213955940000101010100001000040100140052140052140036140052140036
60204140035104900000001010014002013941112936325701024010020002100003010020000100001264020669294714310939114002714005114009913179703132399601003020010000200006020010000300001400511400511150201100991004010010000100000100100000110000000100001100321021263213955940000101010100001000040100140052140052140036140052140052
6020414005110490000000101001400361394111293632570100401002000210000301002000010055126381466937341431093911400271400351400511317970313238260100302001000020000602001000030000140035140051115020110099100401001000010000010010000011000010310000100032102126221396634000001010100001000040100140036140052140052140052140052
6020414005110490000000101001400361394111293632570100401002000010000301002000010000126402066937341431093911400271400511400351317970313239960100302001000020000602001000030000140035140051115020110099100401001000010000010010000001000000010000110032102126221395604000001010100001000040100140052140052140036140036140052
60204140051104900000001010014003613941112936325701024010020002100003010020000100001264020669373414310939114002714005114005113179703132399601003020010000200006020010000300001400511400351150201100991004010010000100000100100000010000000100001000321021273313955940000101010100001000040100140052140052140052140052140052
60204140051104900000000000014002013941112936325701024010020000100003010020000100001264020669294714308701114002714005114005113179303132399601003020010000200006020010000300001400511400561150201100991004010010000100000100100000110000000100001000321031273313955940000101010100001000040100140052140052140052140052140052
6020414005110490000000101001400361394111293472570102401002000210000301002000010000126380366937341431093901400271400511400511317970313239960100302001000020000602001000030000140035140035115020110099100401001000010000010010000011000000010000110032102126221395594000010010100001000040100140036140036140052140052140036
60204140035104900000000010014003613941112936325701024010020002100003010020000100001263803669373414310939014002814005114005113179703132382601003020010000200006020010000300001400541400351150201100991004010010000100000100100000110000000100001100321031262213955940000101010100001000040100140036140052140036140052140052
6020414010610490000000100001400361394111293642570100401002000210000301002000010000126402066929471431093901400111400351400511317970313239960100302001000020000602001000030000140051140051115020110099100401001000010000010010000011000000010000110032103126331395594000001010100001000040100140052140052140052140052140052

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01)cycle (02)0304080b0e0f18191e1f2223243f4d4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696d6edispatch stall (70)71scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a3a6a8acafb5dcache load miss (bf)c5branch mispredict (cb)cfd2d5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
60025140047104800000001010014041313939712936225700124001020002100003001020000100001264477669387814325829114002614005014005013181503132430600103002010000200006002010000300001400501400472150021109104001010000100000101000011000000100001000314001111391313955440000066100001000040010140036140036140051140036140051
6002414005010490000000100001400351393971293472570010400102000210000300102000010000126442966929471432144211400261400351400501318140313242060010300201000020000600201000030000140050140047115002110910400101000010000010100001100000010000100031400811381113956940000969100001000040010140051140036140048140048140048
600241400351049000000060000140032139401129347257001240010200001000130010200001000012644436693538143258291140011140035140050131818031324376001030020100002000060020100003000014005014004711500211091040010100001000001010000010000001000000003140010111131013956940000909100001000040010140036140051140051140036140036
60024140035104900000005620000140032139394129347257001240010200021000230010200001000012644296692947143260331140023140047140035131815031324206001030020100002012560020100003000014005414004711500211091040010100001000001010000010000001000011003140011113111113956940000000100001000040010140051140051140051140036140036
600241400501049000000010000140020139397129362257001040010200021000030010200001000012644436692947143258291140026140050140035131803031324206001030020100002000060020100003000014004714004711500211091040010100001000001010000010000001000000003140011111121313956940000960100001000040010140051140051140051140036140036
600241400501049000010010000140043139394129348257001240010200001000030152200001000012644776693538143258291140011140050140047131815031324336001030020100002000060020100003000014005014003511500211091040010100001000001010000010000001000001003140011113121213956940000966100001000040010140051140036140051140051140036
60024140050104900000001010014002013940212934725700104001020002100003001020000100001264429669294714321442114002614005514005013180303132433600103002010000200006002010000300001400501400471150021109104001010000100001101000001000040100000100314001311311913956940000000100001000040010140036140051140051140051140051
60024140047104900000000010014003413939412935925700124001020002100003001020000100001264429669390214321442114002314005014005113181803132433600103002010000200006002010000300001400471400471150021109104001010000100000101000001000100100001000314009113131413955440000969100001000040010140051140051140036140048140051
60024140050104900000000000014003513939712936225700124001020000100003001020000100001264429669368514326112114003114005014005013180303132420600103002010000200006002010000300001400501400471150021109104001010000100000101000011000101100001000314001211110713956940000999100001000040010140051140036140036140036140036
600251400531049000000000000140020139397129359257001240010200021000030010200001000012644776692947143261121140011140050140050131818031324336001030020100002000060020100003000014005014003511500211091040010100001000001010000110000001000001003140010113101213955440000009100001000040010140051140052140051140051140051

Test 3: throughput

Count: 8

Code:

  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  ld1 { v0.d }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0006

retire (01)cycle (02)030708090a0b0e0f18191e1f22233a3f4346494e4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)696d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a5a6a8a9acafb5b6bbdcache load miss (bf)branch mispredict (cb)cfd0d2d5d6ddinst fetch restart (de)e0? int output thing (e9)eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160205160052119900100010041010116003721818159686122516010210080002800001008000080000500421810229392401600331600521600521396873140010160100200800008000020080000160000160052160052118020210099100100800008000001008000003908000000358003561353905110004161116004900628000080000100160041160041160041160053160053
160204160052119910000000041010116003721818159686122516010210080002800001008000080000500421017229381471600331600521600521396873140010160100200800008000020080000160000160052160040118020110099100100800008000001008000003908000000358000061353915110001161116004906028000080000100160053160053160053160053160053
160204160052119900000000041010116003701818159686112516010210080002800001008000080000500421810229392401600211600521600401396873140010160100200800008017120080000160000160052160052118020110099100100800008000001008000003908003500358003660353905138301161116004906628000080000100160053160041160053160053160053
160204160052119900000000000101160037218015968612251601021008000280000100800008000050042181022939240160033160052160052139687314001016010020080000800002008000016000016005216005211802011009910010080000800000100800000390800350035800006135005110001161116003706628000080000100160053160053160053160041160053
160204160052119900000000041000116003721818159690112516010210080002800001008000080000500421810229392401600331600521600521396873140010160100200800008000020080000160000160052160052118020110099100100800008000001008000003908003500358003561353905110001161116004900028000080000100160053160053160053160053160053
1602041600521199000000000410101160025218181596860251601021008001480000100800008000050042181022939228160033160052160052139687314001016010020080000800002008000016000016005216005231802011009910010080000800000100800000390800350035800356103905110001161116003706628000080000100160053160053160053160053160053
1602041600521199000000000001011600372181815969012251601021008000280000100800008000050042181022939203160033160052160052139687314001016010020080000800002008000016000016005216005211802011009910010080000800000100800000390800000035800356135005110001161116003706008000080000100160041160053160053160053160053
16020416005211990000000000000116003721818159686122516010010080002800001008000080000500421810229392401600331600521600521396873140010160100200800008000020080000160000160048160060118020110099100100800008000001008000003908003500388003561353905110001161116003706628000080000100160053160041160041160053160041
16020416005211980000000004100011600372181815969011251601021008000280000100800008000050042101722939240160021160052160040139687313999816010020080000800002008017116000016005216005211802011009910010080000800000100800000008000000358003561353905110001161116004906028000080000100160053160053160053160041160041
1602041600521199000000000410101160037000159686122516010210080002800001008000080000500421017229392401600211600401600401396873139998160100200800008000020080000160000160052160100118020110099100100800008000001008000003908003500358003561353905110001161116004906028000080000100160053160053160053160053160053

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0006

retire (01)cycle (02)0305090e18191e22233a3f4346494e4f51inst issue (52)~issue int (53)~issue fp/simd (54)~issue ld/st (55)~dispatch int (56)~dispatch fp/simd (57)~dispatch ld/st (58)huge thing int (59)huge thing ld/st (5a)huge thing fp/simd (5b)60696b6d6edispatch stall (70)scheduler rewind (75)scheduler stall (76)~dispatch op (78)~map op int (7c)~map op ld/st (7d)~map op fp/simd (7e)~map lookup int (7f)~map lookup ld/st (80)~map lookup fp/simd (81)8283pipeline redirect (84)85inst all (8c)inst branch (8d)inst branch taken (90)inst b.cc (94)inst integer (97)inst fp/simd load (98)inst ldst (9b)9d9fa0a1a3a5a6a7a8a9acafb5b6bbdcache load miss (bf)dtlb miss (c1)cfd0d2d5d6ddinst fetch restart (de)e0eaebecld/st retires (ed)? fp/simd (ee)gpr retires (ef)f5f6f7f8fd
160025160052119901000451021600252181815968416251600121080002800001080000800005042206622938147116003301600521600521397093140032160010208000080000208000016000016004016005211800211091010800008000001080000039080039000080039603500502000716471600531064800008000010160057160057160041160107160041
1600241600521199000004500216008621212159690162516001210800008000010800008000050422066229392401160021016005616005213971331400361600102080000800002080000160000160056160052118002110910108000080000010800000390800350003580035613539050200031676160049064800008000010160057160057160057160057160041
1600241600401199000004110216002521812159690162516001210800028000010800008000050421085229392401160021016005616005613971331400361600102080000800002080000160000160056160052118002110910108000080000110800000390800350003980039003539050200181697160053060800008000010160057160057160057160041160041
16002416024011990000245002160025212121596901625160012108000280000108000080000504218782293814701600370160056160052139713314003616001020800008000020800001600001600401600521180021109101080000800000108000003908004001039800396135430502000416441600531064800008000010160041160041160041160057160041
160024160040119900000650031600412121215969002516001210800028000010800008000050422109229392401160037016004016005613969731400201600102080000800002080000160000160052160040118002110910108000080000010800000008003900039800356039005020005164716005310100800008000010160053160057160057160057160057
160024160052119900000450001600412121215968402516001210800028000010800008000050421085229381470160037016005616005613971331400201600102080000800002080000160000160056160052118002110910108000080000010800000390800390000800396035390502000816441601031064800008000010160053160091160041160053160041
160024160040119900000000216004120121596841225160012108000080000108000080000504210852293814711600330160052160056139713314002016001020800008000020800001600001600611600611180021109101080000800000108001920008001910221800006157421950200081643160043992800008000010160062160062160047160047160062
1600241600611199100006510216010301812159684025160012108000280000108000080000504220662293924011600330160056160040139697314003616001020800008000020800001600001600561600521180021109101080000800000108000003908000000039800000139430502000416441600371064800008000010160057160057160106160053160053
160024160040119900000451021600372121215968416251600121080002800001080000800005042108522938147116002101600561600561396973140036160010208000080000208000016000016005616004011800211091010800008000001080000039080039000080039613500502000816341600371064800008000010160041160053160053160053160053
160024160056119900000451021600312181815968515251600121080004800001080000800005042241022940044016004201600611600611397183140041160010208000080000208000016000016004016005211800211091010800008000001080000039080000000128003960350050200081635160103662800008000010160057160057160041160053160053