Apple M1 Microarchitecture Research by Dougall Johnson

Firestorm: Overview | Base Instructions | SIMD and FP Instructions
Icestorm:  Overview | Base Instructions | SIMD and FP Instructions

LD1R (2D)

Test 1: uops

Code:

  ld1r { v0.2d }, [x6]
  nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop ; nop
  mov x0, 1
  mov x1, 2
  mov x8, 0

(no loop instructions)

1000 unrolls and 1 iteration

Retires (minus 60 nops): 2.000

Issues: 2.003

Integer unit issues: 0.000

Load/store unit issues: 1.000

SIMD/FP unit issues: 1.003

retire (01) | cycle (02) | 03 | 05 | 07 | 08 | 09 | 0a | 0b | 0e | 0f | 18 | 19 | 1e | 22 | 24 | 3a | 3f | 46 | 49 | 51 | inst issue (52) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 5f | 69 | 6d | 6e | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst fp/simd load (98) | inst ldst (9b) | 9d | a0 | a1 | a3 | a6 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cf | d0 | d2 | icache miss (d3) | itlb miss (d4) | d5 | d6 | d9 | da | db | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | f5 | f6 | f7 | f8 | fd
620052940622101900170100011046212876911171962003100310001000100050001193352261429052292603102000100010001000100029073290631161001100010000100133100202410001130012988924268423060544206053147381713414228382161941386315121100010002927229240292362926429271
620042924622011411110000041046642878400171722004100410001000100050001195102259429011292703102000100010001000100029136291191161001100010000100133100211710003121012848918668623076134020630308838186374128338161991391315077100010002917229234292532926829252
6200429215218115111710000510455928738001724320041006100010001000500011943222608290662921731020001000100010001000291982912711610011000100001001221003202281000313111291092556915310464520575314338178454528337162861394214808100010002930229212292712919929269
62004292742191161117100004004616287520017308200410041000100010005000119493226252899529288310200010001000100010012910429137116100110001000010011210021711010003131112996911668333109645206153062381211384228387163091374115072100010002935229246292092927129255
62004292492191171119100004004622288010017258200310041000100010005000119371225952909929221310200010001000100010002909029097116100110001000010013310021111000313111297292106870309664020570312438194424228352163891394514980100010002933029195293112928629200
620042916721911511161000016104619287290017284200310041000100010005000119524226442899229138310200010001000100010002914629115116100110001000010014210030011000312121289291546837306054120599307438189394428320162851374515043100010002896929280292792921029248
6200429185218118101610000510465228721001712720031004100010001000500011942322609290592923931020001000100010001000290582906911610011000100001001103100128171000212111307791966820306654320627310338155424428372161001376315050100010002926929184293532921729299
6200429191219118111110000110467528839001722520041004100010001000500011948522708290192917631020001000100010001000291532907511610011000100001001441002230129710002131112947922170143082542205903086381711444028337162151386215162100010002939829176292552926729218
62004292382191120012111004410459228724001724620021003100010001000500011926022646290352922031020001000100010001000291322913111610011000100001001321000730410002130012835913668643052840206183081381612454428308163741378514836100010002919529274292442925029198
6200429290219018001401000310458128781111723120031003100010001000500012027122632290492927731020001000100010001000291132915311610011000100001001131001506310002020012906937468693066845206163072381313444528375162461378814911100010002926929326292822928029233

Test 2: Latency 1->2 roundtrip

Chain cycles: 3

Code:

  ld1r { v0.2d }, [x6]
  fmov x0, d0
  eor x8, x8, x0
  eor x8, x8, x0
  add x6, x6, x8
  mov x0, 1
  mov x1, 2
  mov x8, 0

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0054

retire (01) | cycle (02) | 03 | 08 | 09 | 0b | 0e | 0f | 18 | 19 | 1e | 1f | 22 | 23 | 24 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6b | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d0 | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
602051400571049000000010000140020139406129347257010040100200001000030100200001000012640206692947143109390140011014005414003513180003132389601003020010000200006020010213200001400511400511150201100991004010010000100000100100000010000000010000101000321002139611395684000013010100001000040100140055140036140036140055140055
6020414003510490000000100001400391394271293472570102401002000210000301002000010000126380366938781431093901400113140035140054131800031323996010030200100002000060200100002000014043114005111502011009910040100100001000001001000000100000000100001000003210011271113956540000131313100001000040100140055140055140036140055140055
602041400541049000000010100140039139406129365257010040100200021000030100200001000012640206693235143185560140030014028414005713179303132389601003020010000200006020010000200001400541400351150201100991004010010000100000100100000110001000010000101000321001139111395654000001313100001000040100140055140055140055140036140036
602041400351049100000010100140039139427129365257010240100200001000030100200001000012638036692947143122820140030014003514003513179703132399601003020010000200006020010211200001400541400351150201100991004010010000100000100100000110000000010000101000321001139111395654000013013100001000040100140036140055140055140103140055
60204140054104900000001010014002013940612936325701024010020002100003010020000100001264020669387814310939114001101400511400351318000313238260100302001000020000602001000020000140054140035115020110099100401001000010000010010000001000000001000010100032100113911139559400000010100001000040100140055140057140060140432140055
602041400541049000000010000140039139411129363257010040100200021000030100200001000012640206693734143122820140030014003714005413179303132382601003020010000200006020010000200001400541400511150201100991004010010000100000100100000110000230481000010100032100113911139565400000013100001000040100140055140036140055140055140055
602041400541049000000000100140036139427129347257010240100200001000030100200001000012639586692947143122821140030014005414005413180003132399601003020010000200006020010000200001400351400351150201100991004010010000100000100100000110000000010000001000321001139111395654000013130100001000040100140052140052140428140055140058
6020414003510490000000403520001400391394271293652570102401002000210000301002000010000126380366938781430870101400300140051140051131797031323996010030200100002000060200100002000014003514005111502011009910040100100001000001001000000100000300100001010003210011271113955940000101310100001000040100140055140055140036140052140055
60204140054104900000001010014003913942712936525701024010020002100003010020000100001263803669387814308701014001101400541400351318000313238260100302001000020000602001000020000140054140051115020110099100401001000010000010010000011000000001000000100032100113911139565400001300100001000040100140055140055140055140036140055
602041400541049000000082010014003913942712936525701024010020000100003010020000100001263803669373414308701014003001400351400601318000313260560100302001000020000602001000020000140055140423115020110099100401001000010000010010000011000000001000010100032100113911139546400000010100001000040100140055140036140055140052140055

1000 unrolls and 10 iterations

Result (median cycles for code, minus 3 chain cycles): 11.0050

retire (01) | cycle (02) | 03 | 09 | 0b | 0e | 0f | 18 | 1e | 22 | 23 | 3f | 4d | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 71 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a3 | a6 | a8 | ac | af | b5 | dcache load miss (bf) | c5 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ? int output thing (e9) | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
6002514004710481000011014002013939412935925700104001020002100003001020000100001264443669353814325933114001314005214005013181803132433600103002010000200006002010000200001400471400351150021109104001010000100000101000011000050100001100314031113313955440000960100001000040010140051140036140048140036140051
6002414004710490000011014003213939712936225700104001020002100003001020000100001264443669352214325829014002614005014003513180303132435600103002010062200006002010000200001400501400471150021109104001010000100000101000001000010100001000314031133313956640000969100001000040010140051140036140067140036140048
6002414003510480001001014003513939712934725700124001020000100003001020000100001264477669368514325829014002614005014004713181503132506600103002010000200006002010000200001400501400471150021109104001010000100000101000001000000100000100314031133313956940000669100001000040010140036140036140036140051140036
6002414003510490000000014003513939712936225700124001020002100003001020000100001264477669294714325829014001114005014005013180303132436600103002010000200006002010000200001400351400471150021109104001010000100000101000011000000100001100314031113313956640000999100001000040010140048140051140050140049140037
60024140108104901000181014003513940512936225700354001020002100003001020000100001264477669368514325829014003114005014003513181803132438600103002010000200006002010000200001400501400471150021109104001010000100000101000011000000100001100314031133313956940000669100001000040010140052140038140053140423140036
6002414005010490000061014003213939412936225700104001020002100003001020000100001264477669368514325829114002614003514003513180303132433600103002010000200006002010000200001400471400471150021109104001010000100000101000011000000100001100314021112313956640000969100001000040010140051140036140036140051140051
6002414005010490000011014002013939712936225700124001020002100003001020000100001264477669368514321442014002614008214005013180303132478600103002010000200006002010000200001400501400471150021109104001010000100000101000011000000100001000314031133213955440000006100001000040010140036140051140051140036140051
6002414005010490000011014007913939412936225700104001020002100003001020000100001264429669368514321442014001114003514005013180303132462600103002010000200006002010000200001400351400471150021109104001010000100000101000001000000100001100317731113313956940000960100001000040010140048140048140048140036140048
6002414005010490000061014004013939712936225700104001020002100003001020000100001264477669368514325829114002614005014005013181803132466600103002010000200006002010000200001400351400471150021109104001010000100000101000001000000100001100314021133313956840000099100001000040010140051140051140036140082140048
6002414003510490000011014003513939412936225700124001020000100003001020000100001264477669368514326555114001114005514003513181803132478600103002010000200006002010000200001400501400351150021109104001010000100001101000011000010100000100314031113213956940000009100001000040010140051140051140051140048140036

Test 3: throughput

Count: 8

Code:

  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  ld1r { v0.2d }, [x6]
  mov x7, x6
  mov x8, x6
  mov x9, x6
  mov x10, x6
  mov x11, x6
  mov x12, x6
  mov x13, x6

(fused SUBS/B.cc loop)

100 unrolls and 100 iterations

Result (median cycles for code divided by count): 0.3342

retire (01) | cycle (02) | 03 | 05 | 08 | 0b | 0e | 0f | 1e | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a6 | a7 | a8 | a9 | ac | af | b5 | b6 | bb | dcache load miss (bf) | dtlb miss (c1) | c2 | cd | cf | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ec | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
160205267372001011010200326722207025160165100800198000010080000800005001176870187829412669726737267156659366951601002008000080000200800008000026737267371180201100991001008000080000010080019194380019102218003961044000511031633267291410080000800001002673326709267332670926741
16020426708200000014510026713000192516014410080044800001008000080000500116888018840111267132673226708663036690160100200800008000020080000800002670826728118020110099100100800008000001008000004380039020548000061394400051103162326725140380000800001002673326729267362673326733
1602042672820000011440002671720019251601441008004480000100800008000050011696231880202126715267202673266303666616010020080000800002008000080000267322672811802011009910010080000800000100800000438000000039800006139000051103163326705140080000800001002673326733267332673326709
160204267282000001144000267170120192516010010080000800001008000080000500117462818870671267132673226732665436690160100200800008000020080000800002673226728118020110099100100800008000001008000000800000000800000104300051103162326729014380000800001002670926709267332673326733
160204267282000001023003267003701925160165100800648000010080000800005001169295188353212672626737267376659366951601002008000080000200800008000026737267371180201100991001008000080000010080019194380058000608000060580192051103163326734013080000800001002673826716267382671626716
1602042671520011010440012671721200251601441008000080000100800008000050011688801887334126712267082670866503666616010020080000800002008000080000267082672811802011009910010080000800000100800000080038000388003861043000511031633267051414380000800001002673326709267292673326709
160204267282000001044001266932011925160144100800008000010080000800005001169085188521412671826732267086630366901601002008000080000200800008000026708267281180201100991001008000080000010080000044800390003880038600440005110316332670500380000800001002670926733267292673326733
16020426708201000114400126717012102516014410080044800001008000080000500116888018835731267132673226732665036666160100200800008000020080000800002670826728118020110099100100800008000001008000004380038000438003801044000511231633267051410080000800001002670926733267332670926709
160204267322000001044101266930111925160144100800008000010080000800005001174887188722812672726737267376637366731601002008000080000200800008000026715267371180201100991001008000080000010080019200800611012180000605843190051103162326712130280000800001002671626716267382673826738
1602042673720011111670002670020702516016510080019800001008000080000500117017918871601267222672826708663036666160100200800008000020080000800002673226728118020110099100100800008000001008000004380038000388003801043000511031633267291414080000800001002672926709267332670926709

1000 unrolls and 10 iterations

Result (median cycles for code divided by count): 0.3340

retire (01) | cycle (02) | 03 | 04 | 09 | 0b | 0e | 0f | 1e | 1f | 22 | 23 | 3a | 3f | 43 | 46 | 49 | 4f | 51 | inst issue (52) | ~issue int (53) | ~issue fp/simd (54) | ~issue ld/st (55) | ~dispatch int (56) | ~dispatch fp/simd (57) | ~dispatch ld/st (58) | huge thing int (59) | huge thing ld/st (5a) | huge thing fp/simd (5b) | 60 | 69 | 6d | 6e | dispatch stall (70) | 72 | scheduler rewind (75) | scheduler stall (76) | ~dispatch op (78) | ~map op int (7c) | ~map op ld/st (7d) | ~map op fp/simd (7e) | ~map lookup int (7f) | ~map lookup ld/st (80) | ~map lookup fp/simd (81) | 82 | 83 | pipeline redirect (84) | 85 | inst all (8c) | inst branch (8d) | inst branch taken (90) | inst b.cc (94) | inst integer (97) | inst fp/simd load (98) | inst ldst (9b) | 9d | 9f | a0 | a1 | a3 | a5 | a6 | a8 | ac | af | b5 | b6 | bb | dcache load miss (bf) | c2 | c5 | cf | d2 | d5 | d6 | dd | inst fetch restart (de) | e0 | ea | eb | ld/st retires (ed) | ? fp/simd (ee) | gpr retires (ef) | f5 | f6 | f7 | f8 | fd
1600252672520011011410101267080181812251600521080041800001080000800005011701071884807026706267292672366680367031600102080000800002080000800002672326723118002110910108000080000010800000390800000358003561353900502003165526720068000080000102672426709267242672426724
1600242672320000011410101267082181812251600511080041800001080000800005011701071884807026704267082672366530367031600102080000800002080000800002670826723118002110910108000080000010800000390800350358003561353900502002163526705608000080000102672426724267242672426724
160024267082000001041000126708218181125160051108000080000108000080000501170107188480702671126723267236653036703160010208000080000208000080000267232670811800211091010800008000001080000039080035135800356135000502002162326720608000080000102672426724267242672426724
1600242672320000011410101266932181812251600511080041800001080000800005011698441884807026715267292670866680367031600102080000800002080000800002672327130118002110910108000080000010800000008000003580035010000502005165526720668000080000102672426709267242672426709
16002426708200000104101002670821818122516005110800008000010800008000050116888018848070267042672326708666803668816001020800008000020800008000026708267231180021109101080000800000108000000080035035800350103900502003165626720668000080000102670926709267242670926709
160024267232000001041010126708201811251600511080000800001080000800005011701071884807026704267232672366530367031600102080000800002080000800002672326708118002110910108000080000010800000390800000358003561353900502005163326720008000080000102672426724267242672426709
160024267232000001041010126708018012251600101080000800001080000800005011688801884807026712267232672366680367031600102080000800002080000800002670826708118002110910108000080000010800000390800350358003561353900502003163226720608000080000102672426709267092670926724
160024267232000001000101267082181802516005110800418000010800008000050117010718848070267122672326723665303670316001020800008000020800008000026708267231180021109101080000800000108000003908003503580000610000502003163226705068000080000102672426724267242672426709
1600242672320000010001012670820183251600511080041800001080000800005011701071884807026712267232672366530367031600102080000800002080000800002672326708118002110910108000080000010800000390800350358003560353900502006165526720068000080000102672426724267242672426724
16002426708200000104101012670801818025160052108004180000108000080000501170107188480712669726723267236668036703160010208000080000208000080000267262672311800211091010800008000001080000039080035135800350103900502003163226720608000080000102670926709267092671226724