Apple Microarchitecture Research by Dougall Johnson

M1/A14 P-core (Firestorm): Overview | Base Instructions | SIMD and FP Instructions
M1/A14 E-core (Icestorm):  Overview | Base Instructions | SIMD and FP Instructions

ST1 (single, S)

Test 1: uops

Code:

  st1 { v0.s }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.000

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.000

retire uop (01) | cycle (02) | 03 | l1i tlb fill (04) | mmu table walk instruction (07) | 09 | l2 tlb miss instruction (0a) | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 51 | schedule uop (52) | schedule simd uop (54) | schedule ldst uop (55) | dispatch simd uop (57) | dispatch ldst uop (58) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 5f | 69 | 6d | 6e | map rewind (75) | map stall (76) | dispatch uop (78) | map ldst uop (7d) | map simd uop (7e) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst simd store (99) | inst ldst (9b) | l1d tlb access (a0) | l1d cache miss st (a2) | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | l1d cache miss st nonspec (c0) | cf | d0 | d2 | l1i cache miss demand (d3) | l1i tlb miss demand (d4) | d5 | map dispatch bubble (d6) | d9 | da | db | dd | fetch restart (de) | e0 | ea | eb | ec | ? ldst retires (ed) | ? simd retires (ee) | f5 | f6 | f7 | f8 | fd
6200629421228430301000465329090111864620001000100010001000109088000102174628986293813102000100010002000100029199291841161001100010001000301000731000312972921469953102244206883112381310514628317162061330614699100010002929129266292362920429307
62004293442260201000004589290371118309200010001000100010001090380005217102905329360310200010001000200010002912229235116100110001000100030100086100001306892186886313814520530314138065434428588162001332914758100010002913929230292122936129116
620042928422723020100045652909411182152000100010001000100010910800052173128896293183102000100010002000100029254291581161001100010001000001000323100031310394216901312524120576310638109424728423162631310615089100010002915629294292642922729292
620042903722603010000045152905311183022000100010001000100010912800052174428868292783102000100010002000100029205292311161001100010001000301000313100001290993516893314004520585319538146455328667161321330415096100010002934729417292492942829299
620042926922603040010045672916211184842000100010001000100010906800052171228940291713102000100010002000100029214291591161001100010001000301000336100001314295266969314404920617326038095435028765162041330614886100010002970329740297832944929346
6200429451236021301000462829248111735220001000100010001000109058008252175428178286003102000100010002000100028274282431161001100010001000301000461000313149963169693200047199323311381913424328171155441261614396100010002873628676287602854428824
6200428633233020101001506128069101786820001000100010001000109068000152176528143282513102000100010002000100028255280951161001100010001000301000510210000136131013571973495148196943299380110445127785146281198213394100010002816628183282262820028141
620042832921203020000050502819311174162000100010001000100010910800015217542812928244310200010001000200010002819328144116100110001000100030100033100001381299927189342424419642338938067444727909141831232113259100010002836928114282552825228298
62004283282120100000005184284480117215200010001000100010001090780001621695281462840031020001000100020001000282272823911610011000100010000010003101000013872102687061341814519662328538038414327938138201214613298100010002835628170284222828228165
6200428204213020101000503928133011742320001000100010001000109078000112170628167281673102000100010002000100028295281981161001100010001000301000331000313860103457253345224419630340338119444227901144041191813321100010002819928177282422837028245

Test 2: throughput

Count: 8

Code:

  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  st1 { v0.s }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | 09 | l2 tlb miss instruction (0a) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d5 | map dispatch bubble (d6) | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
160206400423220000000400004002801604716010010080116800001028011680108500186019664000040021400434004319959032000116010020080000801192001600008000040043400432180201100991001008000080000100800000421010800020109780002242051281251140039080000800001004004340044400434004440044
160204400433220001000010004002801602516010010080000800001008011580000500183971264000040021400424025119959032000016010020080120800002001600008000040042400431180201100991001008000080000100800000420080002058000200051101161140040080000800001004004440043400444024340043
16020440046321000000123000040227016025160100100800008012010080000800005001839712640000400214004440042199590320000160100200800008000020016000080120402484004311802011009910010080000800001008006204200800020109880062242051101161140039080000800001004004440043400444004340044
160204400433220010000300004002816160251601001008000080000100800008000050018478996400004006240043400421995903200011603242008000080000200160240800004004340043118020110099100100800008000010080000042990800021280002242051101161140040080000800001004004340044400434004440044
16020440043321000000030000400281616025160100100800008000010080000800005001839712640928400214004340043199590320001160324200800008000020016024080000400434004211802011009910010080000800001008000004200800020110780002244051451161140232080000800001004019440043400444024440245
1602044004232301000003000040230161602516045210080116800001008000080108500184812464092840214402484024519959016200001603432008000080120200160000802404004340246118020110099100100800008000010080060042960800020280122242051271161140232180000800001004024840192402464004440452
16020440245324000002132910000400281616412581613321028104380540100809278075652719055856464804002140245402492010703200011601002008012080000200160238801204024740043318020110099100100800008000010080000242980800620110780002242051102251240422080000800001004004340250400434024440244
1602044024732110010113230000402281616025160100100800008000010080000800005001839712640000400214004340043199590320001160100200800008000020016000080000400434004311802011009910010080000800001008000004200800020580000242051101161140040080000800001004004440043400454004740044
160204400433210000000000004002716160251601001008000080000100800008000050018397126409284002140240400431995903200011601002008000080000200160238800004004340043118020110099100100800008000010080000046960800000280002242051101161140039080000800001004024840044400444004440044
16020440042322000000213000040030016025160100100800008000010080000800005001839712640000400214004540042199590320001160100200800008000020016000080000400434004311802011009910010080000800001008000004200800021280000242051101161140040080000800001004004440043400444004440043

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.5005

retire uop (01) | cycle (02) | 03 | mmu table walk data (08) | l2 tlb miss data (0b) | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 46 | 49 | 4f | 51 | schedule uop (52) | schedule int uop (53) | schedule simd uop (54) | schedule ldst uop (55) | dispatch int uop (56) | dispatch simd uop (57) | dispatch ldst uop (58) | int uops in schedulers (59) | simd uops in schedulers (5a) | ldst uops in schedulers (5b) | 69 | 6d | 6e | map stall dispatch (70) | rob full (74) | map rewind (75) | map stall (76) | dispatch uop (78) | map int uop (7c) | map ldst uop (7d) | map simd uop (7e) | map int uop inputs (7f) | map ldst uop inputs (80) | map simd uop inputs (81) | 82 | 83 | flush restart other nonspec (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst branch cond (94) | inst int alu (97) | inst simd store (99) | inst ldst (9b) | 9f | l1d tlb access (a0) | l1d tlb miss (a1) | l1d cache miss st (a2) | a4 | ld unit uop (a6) | st unit uop (a7) | l1d cache writeback (a8) | a9 | ac | af | bc | l1d cache miss st nonspec (c0) | c2 | cf | d2 | l1i cache miss demand (d3) | d5 | map dispatch bubble (d6) | db | dd | fetch restart (de) | e0 | ? int output thing (e9) | ? ldst retires (ed) | ? simd retires (ee) | ? int retires (ef) | f5 | f6 | f7 | f8 | fd
1600264004331100100400040027161602516001010800008000010800008000050183971264000040021400434004319982032004216001020800008000020160000800004004340042118002110910108000080000108000000008000210880002242050200761602194244008000080000104004340044400434004440043
160024400433100001001004002816160251600101080000800001080000800005018397126400004002140043400431998203200231600102080000800002016000080000400434004311800211091010800008000010800000420080002002800022420502005141606134004008000080000104004440044400434024840044
16002440042310000003000400281600251600101080000800001080000800005018397126400004002140043400431998203200221600102080000800002016000080000400434004211800211091010800008000010800000440080002005800002420502002141601374004008000080000104004340044400444004440443
16002440043322000012910004002816160251600101080000800001080000800005018397126400004002140042400421998203201851600102080000800002016000080000400514004211800211091010800008000010800000420080002008800022420502004101605144003908000080000104004340044400434004440044
160024400433100000030004002816160251600101080000800001080000800005018397126400004002140043400431998203200231600102080000800002016000080000400514004311800211091010800008000010800000420080000002800022420502002141601374023208000080000104004440044400444004440044
16002440043310000003000402350160251600101080000800001080000800005018397126400004002140043402421998203200221600102080000800002016000080000400524004311800211091010800008000010800000420080002002800022005020021316013124003908000080000104004440045400444004340044
16002440043310000012300040028161613925160010108000080000108000080000501839712640000400214004340042199820320023160010208000080000201600008000040043400511180021109101080000800001080000000080000002800022420502002121601364004008000080000104004440043400434004340044
16002440043311000003000400271616025160010108000080000108000080000501839712640000400214004340043199820320023160010208000080000201600008012040043400541180021109101080000800001080000000080000002800002420502002151601374004008000080000104004440043400434004440043
16002440042310000003000400271616025160010108000080000108000080000501839712640000400214024740043199820320023160010208000080000201600008000040043400431180021109101080000800001080000000080000003800020420502002141601464004008000080000104004340044400434004440044
16002440043310000012910004002816160251600101080000800001080000800005018397126400004002140043400431998203200221600102080000800002016000080000400424004311800211091010800008000010800600420080000002800022420502005151606134003908000080000104004440043402464004340043