Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1 (single structure, H)

Test 1: uops

Code:

  ld1 { v0.h }[1], [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.001

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.001

retire (01) | cycle (02) | 03 | 07 | 08 | 09 | 0a | 0e | 0f | 18 | 19 | 1e | 22 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 60 | 61 | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a3 | a6 | a8 | ac | af | b5 | bb | dcache load miss (bf) | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
62005293572202100200100314561287771017320200110011000100010005000119501000226182912729307310200010001000100020002918029151116100110001000010000100003100020012991915568663079966206253076382110565428421165261375015336100010002926929222291892930829191
6200429255220250025000030455328809001719020001001100010001000500011949410226322907629184310200010001000100020002908329158116100110001000010000100101021000210128349198688730871349205333106381312565028326163491390314985100010002919729312292402915629359
620042923421919002000003145802871510172892001100110001000100050001194600022669291262930931020001000100010002000291232905311610011000100001000310010941001203129188989677230391254206883112381910484928344163021386914989100010002927729183292782932029266
620042925921925002100003046192871210172192001100110001000100050001193500022665290332922631020001000100010002000290932907811610011000100001000310010181000210128499111679330191147206033118382014515428372162861402315280100010002939329257292632924029228
62004292722191900210000204957287471017334200110011000100010005029119455002259729076292653102000100010001000200029164290341161001100010000100031000071001313129839182683430601154206393115381914515228340166341406315024100010002922529218292792931829251
6200429341218220021000030467228758001729220011000100010001000500011944500227222908329257310200010001000100020002907729074116100110001000010000100008210002131301592946861308094320556306138179464928366162931385515273100010002936129204292632922129207
62004292832202700250000314650287411117257200110001000100010005000119357002261729065293843102000100010001000200029104291241161001100010000100001000099100121012916925068543071846206453133381212525428379165891417914982100010002936929330292472933729299
62004293092192600170000014835287570117331200110011000100010005000119526002260929051292583102000100010001000200029023290621161001100010000100001000099100121312926925468073014195320586308138179505428322163831387615176100010002925829273293372920729306
62004292532192400200000714606288740017268200110011000100010005000119546002260429033292993102000100010001000200029134291511161001100010000100031000111000203129679279692230171054207273048381416505328400164891391315130100010002923229270292942933329204
620042922722022002201000145382877010172652000100010001000100050011194930022649290592920131020001000100010002000290832926711610011000100001000310010751001212129059043684931011552205533110381714555628309161961380215033100010002921629157292122920329280

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1 { v0.h }[1], [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0051

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | branch mispredict (cb) | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400511049001000001010014042113945612936825701044010020002100003010020000100001263985669402214309667114003314005714005713180331323926010030200100002000060200100003000014005714004111502011009910040100100001000011001000220100020111100001101000321011391113956840000101010100001000040100140052140052140052140036140052
602041400511049000000001000014002013943012936825701024010020014100003010020000100001263814669402214312591014002714005114005113179331323826010030200100002000060200100003000014005114005111502011009910040100100001000001001000110100020011100001111100321011391113956840000101010100001000040100140425140057140052140052140052
6020414005110490000000010000140036139427129363257010240100200021000030100200001000012640206693734143109391140013140051140035131797313239960100302001000020000602001000030000140035140051115020110099100401001000010000010010002101000100111000011110003210113911139568400000100100001000040100140052140036140052140036140036
60204140417104900000000000001401261394301293682570104401002000410000301002000010000126398566963571431259111400331400571400571318033132392601003020010000200006020010000300001400571400571150201100991004010010000100000100100000110000000010000100000032101126111395594000010100100001000040100140038140436140055140052140036
602041400351049000000001001014003613940612936410770100401002000010000301002000010000126402066938781431093911400271403331401281317933132382601003020010000200006020010000300001400571400511150201100991004010010000100000100100000110000010010001001000040612368111416734021710100100001000040100142022143337143343143380143556
602041435501074000100373648971936010143772141548130417157670772405472017710008301002000010000126398566940221431259111400171400411400611318013132393603953020010000201076020010000300001400571400571150201100991004010010000100001100100000110000000010032101000032101126111395494000010010100001000040100140052140052140052140052140052
6020414005110490000000000010140020139430129368257010440100200021000030100200001000012639856693244143125911140033140057140057131803313238260100302001021420000602001000030000140053140051115020110099100401001000010000110010002211000200111000011111003210111411139568400000100100001000040100140052140052140052140036140194
60204140038104800000000100101400361394301293682570104401002000410000301002000010000126398566940221431259111400271400511400351317973132399601003020010000200006020010000300001400351400511150201100991004010010000100000100100000110000000010000101000032101126111395464000001010100001000040100140042140058140058140058140058
602041400571049100100001001114002713940612936325701004010020002100003010020000100001264020669373414310939114002714005114003513179731323826010030200100002042760200100003000014005114003511502011009910040100100001000011001000211100010001100001111100321011142113956840000101010100001000040100140054140402140052140036140052
6020414009710490000010010010140036139406129347257010040100200021000030100200001000012639586693734143109391140012140051140051131797313238260100302001000020000602001000030000140051140051115020110099100401001000010000110010000001000000001000000100003210117311139559400000010100001000040100140036140052140036140052140055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0047

retire (01) | cycle (02) | 03 | 05 | 08 | 09 | 0b | 0e | 0f | 1e | 22 | 23 | 24 | 3a | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d0 | d2 | icache miss (d3) | d5 | d6 | d9 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
60025140047104900101021001140038139400129353257001240010200041000030010200001000012645076693830143264371014001701400531400411318213132436600103002010000200006002010000300001400531400531150021109104001010000100000101000231100020111100001111003140000411302213957240000666100001000040010140054140054140054140042140054
6002414010810491011001100114003813940012936825700144001020004100003001020000100001264543669383014326437101400290140053140053131821313243660010300201000020000600201000030000140053140053115002110910400101000010000010100022110002011100000111203140000211302213956040000606100001000040010140042140054140054140042140042
600241400531049110100210001401431394001293642570014400102000210000300102000010000126450766984481432908910140029014004114009513183650313349660010301811005320108619661000030324140244140141115002110910400101000010000010100024110002017100031101103140000211302213956040008066100001000040010140054140054140054140054140054
6002414005310491111002100114003913940012936425700144001020002100003001020000100001264507669383014326437101400290140053140054131821313254960010305031007520216606621000030000140056140053115002110910400101000010000010100012110002304100000111103140000211302213957240000666100001000040010140057140045140058140054140054
60024140053104911111114100114008413940012936725700144001020004100003001020000100001266852669589414326437001400290140053140053131821313243660010300201000020000600201000030000140053140053115002110910400101000010000110100022110002011100001111203140000211302213957240000666100001000040010140054140054140054140054140054
6002414005310491111002100114003813940012936425700144001020004100003001020000100561264507669383014326437101400290140053140053131821313243660010300201000020000600201000030000140053140041115002110910400101000010000010100011110001011100001111103140000211302213956040000666100001000040010140054140054140054140054140054
6002414004110481100002000014003813940012936425700144001020004100003001020000100001264507669383014326437101400290140054140041131821313243660010300201006420000600201000030000140041140053115002110910400101000010000010100022110002011100000111003140000211302213957240000666100001000040010140054140054140054140054140044
6002414004110491100001000114003813938812936425700144001020004100003016020000100001264507669383014326437001400290140053140053131821313243660010300201000020000600201000030000140053140053115002110910400101000010000010100022010002011100001111003140000211302213957240000006100001000040010140054140054140054140054140054
6002414005310491111002100114003813940012936425700144001020004100003001020000100001264507669383014326437001400170140053140053131821313243660010300201000020000600201000030000140053140041115002110910400101000010000010100032110002001100001111003140000211302213957240000666100001000040010140054140042140054140054140054
6002414005310491101002100114003813940012936425700144001020004100003001020000100001264507669383014326437001400170140053140041131821313243660010300201006220000600201000030000140056140053115002110910400101000010000110100033110003014100001111003177000211302213957240000666100001000040010140054140054140054140054140042

Test 3: throughput

Count: 8

Code:

  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  ld1 { v0.h }[1], [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 2.0007

retire (01) | cycle (02) | 03 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 61 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | branch mispredict (cb) | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160205160056119900045000016004521211596900251601021008000280133100800008000050042204122939575151600411600401600601396753140018160100200800008000020080000160000160060160056118020110099100100800008000001008000008003800080168613944000511051161116003714078000080000100160061160061160057160057160061
160204160056119900000101160025212015968402516010010080000800001008000080000500421017229395471516004116004016005613969531400191601002008000080000200800001600001600601600561180201100991001008000080000010080000438000010080038013944000511051161116005314078000080000100160061160061160061160041160061
1602041600401199000440001160045211159685172516010210080004800001008000080000500421017229395471516004116006016006013969531399981601002008000080000200800001600001600401600561180201100991001008000080000010080000438003800080038000430005110511611160053141048000080000100160057160061160041160041160041
160204160040121000000101160045211159690025160244100800028000010080000800005004222042293957515160042160060160060139695314001816010020080000800002008000016000016004016004011802011009910010080000800000100800000800000008003861380000511051161116005701478000080000100160061160061160061160061160041
160204160060119900044000116004101121596841625160100100800028000010080000800005004222042293994715160041160040160040139695313999816010020080000800002008000016000016004016005611802011009910010080000800000100800000800000008003860044000511051161116005710078000080000100160061160041160061160061160061
1602041600401199000440000160045210159690162516010010080002800001008000080000500422226229395591516004116006016011013969531400181601002008000080000200800001600001600561600561180201100991001008000080000010080000080000003880000603900005110511611160057101048000080000100160057160061160061160061160061
1602041600561199000001011600252012159684162516010210080000800001008000080000500422232229395591516004116006016006013969531400181601002008000080000200800001600001600601600561180201100991001008000080000110080000438005400080038610440005110511611160037141408000080000100160061160041160061160041160061
16020416006011980000010116004500115968502516010210080000800001008000080000500422232229381471516002116006016006013967531400181601002008000080000200800001600001600401600561180201100991001008000080000010080000438003810388003860000005137511611160037141008000080000100160057160041160057160041160041
160204160056119800044010016004120121596841625160102100800028000010080000800005004210292293994715160041160060160056139691314001816010020080000800002008000016000016004016005611802011009910010080000800000100800004380039000800386139430005110511611160053141478000080000100160061160061160061160061160061
160204160040119900044010116004521121596731925160102100800028000010080000800005004222042293814715160041160060160040139695313999816010020080000800002008000016000016006016005611802011009910010080000800000100800004380000000800006039440005110511611160037141078000080000100160041160057160061160061160041

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 2.0008

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4e | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | c5 | branch mispredict (cb) | cf | d5 | d6 | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160025160061119910100210021600462181815968502516001410800028000010800008000050421532229400440160042016006116011113971831400411600102080000800002080000160000160061160110118002110910108000080000010800192000800560015980000615701910050201016046160058092800008000010160062160062160062160062160062
160024160061119910100650031600462018159687162516001410800048000010800008000050422410229400440160042016004616004613971831400411600102080000800002080000160000160096160061118002110910108000080000010800191942080057100598000001570192005020616056160058992800008000010160047160062160062160062160047
1600241601091199111006510316004621818159685152516001410800048000010800008000050422384229400150160042016006116006113971831400261600102080000800002080000160000160046160046118002110910108000080000010800201942118800582002180000015742190005020616056160058902800008000010160047160062160062160062160062
160024160061119810000651011600310181815968802516001210800048000010800008000050422388229400440160042016004616006113970331400261600102080000800002080000160340160061160046118002110910108000080000110800191900800551005980038001942192005020716064160058992800008000010160047160047160047160062160062
160025160046119911000650021600462181815968516251600121080004800001080000800005042152122938681016002701600461600611397183140041160010208000080000208000016000016006116006111800211091010800008000001080020190080019000218000061570191005020616066160058992800008000010160062160047160047160047160062
1600251600461199111006500216004621818159685162516001410800048000010800008000050421533229400310160081016006116004613971831400411600102080000800002080000160000160061160046118002110910108000080000110800201900800571002180038605742191005020716066160043992800008000010160062160062160047160062160047
1600241600611198111002100316004601801596851525160012108000480000108000080000504215142294004401600660160061160046139703314004116001020800008000020800001600001600461600611180021109101080000800000108002019420800191115980000615742191005020416046160043902800008000010160062160062160062160062160062
16002416006111991100019900216009420181596851525160014108000480000108000080000504223952293869301600420160061160046139718314004116001020800008000020800001600001600461600611180021109101080000800000108002020420800571005980000611942190005020616064160043092800008000010160108160062160062160047160047
160024160061119910111220011600462018159479162516001412800048000010800008032150421532229400440160027016004616004613969247140052160010208000080000208000016000016006116004611800211091010800008000001080020200080057100598000060570181005020416064160058992800008000010160062160047160047160062160062
16002416004611991110065101160046018181596851625160014108000480000108000080000504215322294002301600270160061160061139718314004116001020801708000020800001600001600611600611180021109101080000800000108002019420800570012180000605742191015020516078160043092800008000010160047160047160062160062160062