├── global.json
├── src
│   └── PerformanceExplorer
│       ├── project.json
│       ├── Properties
│       │   └── AssemblyInfo.cs
│       ├── PerformanceExplorer.xproj
│       └── Program.cs
├── PerformanceExplorer.sln
├── .gitattributes
├── scripts
│   ├── ModelPolicyV1Size.R.txt
│   └── ModelPolicyV1Perf.R.txt
├── .gitignore
└── notes
    └── notes-aug-2016.md
/global.json:
--------------------------------------------------------------------------------
1 | {
2 | "projects": [ "src", "test" ]
3 | }
4 |
--------------------------------------------------------------------------------
/src/PerformanceExplorer/project.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "1.0.0-*",
3 | "buildOptions": {
4 | "emitEntryPoint": true
5 | },
6 |
7 | "dependencies": {
8 | "Microsoft.NETCore.App": {
9 | "type": "platform",
10 | "version": "1.0.0"
11 | },
12 | "System.Xml.XmlSerializer": "4.0.11"
13 | },
14 |
15 | "frameworks": {
16 | "netcoreapp1.0": {}
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/PerformanceExplorer/Properties/AssemblyInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Reflection;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.InteropServices;
4 |
5 | // General Information about an assembly is controlled through the following
6 | // set of attributes. Change these attribute values to modify the information
7 | // associated with an assembly.
8 | [assembly: AssemblyConfiguration("")]
9 | [assembly: AssemblyCompany("")]
10 | [assembly: AssemblyProduct("PerformanceExplorer")]
11 | [assembly: AssemblyTrademark("")]
12 |
13 | // Setting ComVisible to false makes the types in this assembly not visible
14 | // to COM components. If you need to access a type in this assembly from
15 | // COM, set the ComVisible attribute to true on that type.
16 | [assembly: ComVisible(false)]
17 |
18 | // The following GUID is for the ID of the typelib if this project is exposed to COM
19 | [assembly: Guid("20b8742b-3910-40d1-9dd7-a5e3db9dd066")]
20 |
--------------------------------------------------------------------------------
/src/PerformanceExplorer/PerformanceExplorer.xproj:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup>
4 |     <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0</VisualStudioVersion>
5 |     <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
6 |   </PropertyGroup>
7 |   <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" />
8 |
9 |   <PropertyGroup Label="Globals">
10 |     <ProjectGuid>20b8742b-3910-40d1-9dd7-a5e3db9dd066</ProjectGuid>
11 |     <RootNamespace>PerformanceExplorer</RootNamespace>
12 |     <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)' == ''">..\..\artifacts\obj\$(MSBuildProjectName)</BaseIntermediateOutputPath>
13 |     <OutputPath Condition="'$(OutputPath)' == ''">..\..\artifacts\</OutputPath>
14 |     <TargetFrameworkVersion>v4.5.2</TargetFrameworkVersion>
15 |   </PropertyGroup>
16 |
17 |   <PropertyGroup>
18 |     <SchemaVersion>2.0</SchemaVersion>
19 |   </PropertyGroup>
20 |
21 |   <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" />
22 | </Project>
--------------------------------------------------------------------------------
/PerformanceExplorer.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25123.0
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{95943EFC-A666-4605-8FF3-09C5BB169ABE}"
7 | EndProject
8 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{D05E4B15-0977-42AA-9A3B-29DFC9A40523}"
9 | ProjectSection(SolutionItems) = preProject
10 | global.json = global.json
11 | EndProjectSection
12 | EndProject
13 | Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "PerformanceExplorer", "src\PerformanceExplorer\PerformanceExplorer.xproj", "{20B8742B-3910-40D1-9DD7-A5E3DB9DD066}"
14 | EndProject
15 | Global
16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
17 | Debug|Any CPU = Debug|Any CPU
18 | Release|Any CPU = Release|Any CPU
19 | EndGlobalSection
20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
21 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
22 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Debug|Any CPU.Build.0 = Debug|Any CPU
23 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Release|Any CPU.ActiveCfg = Release|Any CPU
24 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Release|Any CPU.Build.0 = Release|Any CPU
25 | EndGlobalSection
26 | GlobalSection(SolutionProperties) = preSolution
27 | HideSolutionNode = FALSE
28 | EndGlobalSection
29 | GlobalSection(NestedProjects) = preSolution
30 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066} = {95943EFC-A666-4605-8FF3-09C5BB169ABE}
31 | EndGlobalSection
32 | EndGlobal
33 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/scripts/ModelPolicyV1Size.R.txt:
--------------------------------------------------------------------------------
1 | ## Read in raw data set
2 |
3 | InlineData.raw <- read.csv("c:\\repos\\inlinedata\\mscorlib.data.model-rel.log.parsed", header=TRUE, comment.char="")
4 |
5 | ## Identify factors and logicals
6 |
7 | #InlineData.raw$Arg0Type <- as.factor(InlineData.raw$Arg0Type)
8 | #InlineData.raw$Arg1Type <- as.factor(InlineData.raw$Arg1Type)
9 | #InlineData.raw$Arg2Type <- as.factor(InlineData.raw$Arg2Type)
10 | #InlineData.raw$Arg3Type <- as.factor(InlineData.raw$Arg3Type)
11 | #InlineData.raw$Arg4Type <- as.factor(InlineData.raw$Arg4Type)
12 | #InlineData.raw$Arg5Type <- as.factor(InlineData.raw$Arg5Type)
13 | #InlineData.raw$ReturnType <- as.factor(InlineData.raw$ReturnType)
14 | #InlineData.raw$CallsiteFrequency <- as.factor(InlineData.raw$CallsiteFrequency)
15 | InlineData.raw$IsForceInline <- as.logical(InlineData.raw$IsForceInline)
16 | InlineData.raw$IsInstanceCtor <- as.logical(InlineData.raw$IsInstanceCtor)
17 | InlineData.raw$IsFromPromotableValueClass <- as.logical(InlineData.raw$IsFromPromotableValueClass)
18 | InlineData.raw$HasSimd <- as.logical(InlineData.raw$HasSimd)
19 | InlineData.raw$LooksLikeWrapperMethod <- as.logical(InlineData.raw$LooksLikeWrapperMethod)
20 | InlineData.raw$ArgFeedsConstantTest <- as.logical(InlineData.raw$ArgFeedsConstantTest)
21 | InlineData.raw$IsMostlyLoadStore <- as.logical(InlineData.raw$IsMostlyLoadStore)
22 | InlineData.raw$ArgFeedsRangeCheck <- as.logical(InlineData.raw$ArgFeedsRangeCheck)
23 | InlineData.raw$ConstantFeedsConstantTest <- as.logical(InlineData.raw$ConstantFeedsConstantTest)
24 |
25 | ## Remove Version0 (roots) and strip non-predictive columns
26 |
27 | Col.N <- c("Method", "Version", "JitTime", "HotSize", "ColdSize")
28 | InlineData <- InlineData.raw[InlineData.raw$Version > 0, setdiff(names(InlineData.raw), Col.N)]
29 | InlineDataV0r <- InlineData.raw[InlineData.raw$Version == 0, ]
30 |
31 | ## Produce frame with just predictive columns and the result we want to estimate
32 |
33 | Col.Z <- c("JitTimeDelta", "ColdSizeDelta", "HotSizeDelta", "ModelCodeSizeEstimate")
34 | Col.XY <- setdiff(names(InlineData), Col.Z)
35 | Col.Y <- c("TotalSizeDelta")
36 | Col.X <- setdiff(Col.XY, Col.Y)
37 |
38 | ## Examine existing models
39 |
40 | Model.P <- InlineData$ModelCodeSizeEstimate/10
41 | Actual <- InlineData$TotalSizeDelta
42 |
43 | ## Build new models
44 | ## using glmnet for modelling
45 |
46 | ## install.packages("glmnet")
47 | library(glmnet)
48 | InlineData.XY <- InlineData[, Col.XY]
49 | set.seed(1001)
50 |
51 | FullModel.M <- model.matrix(TotalSizeDelta ~ ., InlineData.XY)
52 | FullModel <- cv.glmnet(FullModel.M, Actual)
53 | FullModel.P <- predict(FullModel, FullModel.M, s="lambda.1se")
54 | FullModel.C <- coef(FullModel, s="lambda.1se")
55 |
56 | Full.S <- sum(InlineData$TotalSizeDelta^ 2)
57 | FullModel.SE <- sum((FullModel.P - Actual)^2)
58 | FullModel.MSE <- FullModel.SE / nrow(InlineData)
59 | FullModel.AE <- sum(abs(FullModel.P - Actual))
60 | FullModel.MAE <- FullModel.AE / nrow(InlineData)
61 | FullModel.R <- 1 - FullModel.SE / Full.S
62 |
--------------------------------------------------------------------------------
/scripts/ModelPolicyV1Perf.R.txt:
--------------------------------------------------------------------------------
1 | ## Used for initial ModelPolicy performance model
2 |
3 | D <- read.csv("c:\\repos\\performanceexplorer\\data\\all-benchmark-v3-a13.csv")
4 |
5 | ## Identify factors and logicals
6 |
7 | D$Arg0Type <- as.factor(D$Arg0Type)
8 | D$Arg1Type <- as.factor(D$Arg1Type)
9 | D$Arg2Type <- as.factor(D$Arg2Type)
10 | D$Arg3Type <- as.factor(D$Arg3Type)
11 | D$Arg4Type <- as.factor(D$Arg4Type)
12 | D$Arg5Type <- as.factor(D$Arg5Type)
13 | D$ReturnType <- as.factor(D$ReturnType)
14 | D$CallsiteFrequency <- as.factor(D$CallsiteFrequency)
15 | D$IsForceInline <- as.logical(D$IsForceInline)
16 | D$IsInstanceCtor <- as.logical(D$IsInstanceCtor)
17 | D$IsFromPromotableValueClass <- as.logical(D$IsFromPromotableValueClass)
18 | D$HasSimd <- as.logical(D$HasSimd)
19 | D$LooksLikeWrapperMethod <- as.logical(D$LooksLikeWrapperMethod)
20 | D$ArgFeedsConstantTest <- as.logical(D$ArgFeedsConstantTest)
21 | D$IsMostlyLoadStore <- as.logical(D$IsMostlyLoadStore)
22 | D$ArgFeedsRangeCheck <- as.logical(D$ArgFeedsRangeCheck)
23 | D$ConstantFeedsConstantTest <- as.logical(D$ConstantFeedsConstantTest)
24 |
25 | ## Filter to observations with at least 0.8 confidence
26 | ## Filter to observations where the call happened at least 1000x
27 | ## Filter to cases where per call impact is within -100, 100
28 |
29 | Dcq <- D[(D$Confidence > 0.8) & (D$CallDelta > 1000) & (abs(D$InstRetiredPerCallDelta) < 100), ]
30 |
31 | ## (have 210 entries for V3 data set)
32 |
33 | ## Identify non-predictive columns so we can strip them
34 |
35 | Col.NP <- c("Benchmark", "SubBenchmark", "Method", "Version", "JitTime", "HotSize", "ColdSize", "Depth")
36 | Study <- Dcq[Dcq$Version > 0, setdiff(names(Dcq), Col.NP)]
37 |
38 | ## Produce frame with just predictive columns and the result we want to estimate
39 |
40 | Col.YY <- c("HotSizeDelta","ColdSizeDelta","JitTimeDelta","InstRetiredDelta",
41 | "InstRetiredPct","CallDelta","InstRetiredPerCallDelta","Confidence")
42 | Col.Y <- c("InstRetiredPerCallDelta")
43 | Col.Yx <- setdiff(Col.YY, Col.Y)
44 |
45 | Col.XY <- setdiff(names(Study), Col.Yx)
46 | Col.X <- setdiff(Col.XY, Col.Y)
47 |
48 | Study.XY <- Study[, Col.XY]
49 | Study.X <- Study[, Col.X]
50 | Actual <- Study[, Col.Y]
51 |
52 | ## install.packages("glmnet")
53 |
54 | library(glmnet)
55 |
56 | ## Model dependent var vs observations
57 |
58 | set.seed(1001)
59 | F <- paste(Col.Y[1], " ~ .")
60 | FullModel.M <- model.matrix(as.formula(F), Study.XY)
61 | FullModel <- cv.glmnet(FullModel.M, Actual)
62 |
63 | # run new predictions
64 |
65 | FullModel.P <- predict(FullModel, FullModel.M, s="lambda.min")
66 | FullModel.C <- predict(FullModel, FullModel.M, s="lambda.min", type="coefficients")
67 |
68 | P <- data.frame(Actual, FullModel.P, FullModel.P - Actual)
69 | names(P) <- c("Actual", "FullModel", "FullModel.res")
70 |
71 | ## Scoring
72 |
73 | Full.S <- sum(Actual^ 2)
74 | FullModel.SE <- sum((FullModel.P - Actual)^2)
75 | FullModel.MSE <- FullModel.SE / nrow(Study)
76 | FullModel.AE <- sum(abs(FullModel.P - Actual))
77 | FullModel.MAE <- FullModel.AE / nrow(Study)
78 | FullModel.R <- 1 - FullModel.SE / Full.S
79 |
80 | ## Plotting
81 |
82 | library(ggplot2)
83 |
84 | Plot.F <- ggplot(P, aes(x=FullModel, y=Actual)) + geom_boxplot(aes(group=cut_width(FullModel, 1)), outlier.color="red") + coord_cartesian(ylim=c(-40,40), xlim=c(-40,40)) + geom_abline(slope=1, color="blue")
85 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | [Xx]64/
19 | [Xx]86/
20 | [Bb]uild/
21 | bld/
22 | [Bb]in/
23 | [Oo]bj/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 | # Uncomment if you have tasks that create the project's static files in wwwroot
28 | #wwwroot/
29 |
30 | # MSTest test Results
31 | [Tt]est[Rr]esult*/
32 | [Bb]uild[Ll]og.*
33 |
34 | # NUNIT
35 | *.VisualState.xml
36 | TestResult.xml
37 |
38 | # Build Results of an ATL Project
39 | [Dd]ebugPS/
40 | [Rr]eleasePS/
41 | dlldata.c
42 |
43 | # DNX
44 | project.lock.json
45 | artifacts/
46 |
47 | *_i.c
48 | *_p.c
49 | *_i.h
50 | *.ilk
51 | *.meta
52 | *.obj
53 | *.pch
54 | *.pdb
55 | *.pgc
56 | *.pgd
57 | *.rsp
58 | *.sbr
59 | *.tlb
60 | *.tli
61 | *.tlh
62 | *.tmp
63 | *.tmp_proj
64 | *.log
65 | *.vspscc
66 | *.vssscc
67 | .builds
68 | *.pidb
69 | *.svclog
70 | *.scc
71 |
72 | # Chutzpah Test files
73 | _Chutzpah*
74 |
75 | # Visual C++ cache files
76 | ipch/
77 | *.aps
78 | *.ncb
79 | *.opendb
80 | *.opensdf
81 | *.sdf
82 | *.cachefile
83 | *.VC.db
84 |
85 | # Visual Studio profiler
86 | *.psess
87 | *.vsp
88 | *.vspx
89 | *.sap
90 |
91 | # TFS 2012 Local Workspace
92 | $tf/
93 |
94 | # Guidance Automation Toolkit
95 | *.gpState
96 |
97 | # ReSharper is a .NET coding add-in
98 | _ReSharper*/
99 | *.[Rr]e[Ss]harper
100 | *.DotSettings.user
101 |
102 | # JustCode is a .NET coding add-in
103 | .JustCode
104 |
105 | # TeamCity is a build add-in
106 | _TeamCity*
107 |
108 | # DotCover is a Code Coverage Tool
109 | *.dotCover
110 |
111 | # NCrunch
112 | _NCrunch_*
113 | .*crunch*.local.xml
114 | nCrunchTemp_*
115 |
116 | # MightyMoose
117 | *.mm.*
118 | AutoTest.Net/
119 |
120 | # Web workbench (sass)
121 | .sass-cache/
122 |
123 | # Installshield output folder
124 | [Ee]xpress/
125 |
126 | # DocProject is a documentation generator add-in
127 | DocProject/buildhelp/
128 | DocProject/Help/*.HxT
129 | DocProject/Help/*.HxC
130 | DocProject/Help/*.hhc
131 | DocProject/Help/*.hhk
132 | DocProject/Help/*.hhp
133 | DocProject/Help/Html2
134 | DocProject/Help/html
135 |
136 | # Click-Once directory
137 | publish/
138 |
139 | # Publish Web Output
140 | *.[Pp]ublish.xml
141 | *.azurePubxml
142 |
143 | # TODO: Un-comment the next line if you do not want to checkin
144 | # your web deploy settings because they may include unencrypted
145 | # passwords
146 | #*.pubxml
147 | *.publishproj
148 |
149 | # NuGet Packages
150 | *.nupkg
151 | # The packages folder can be ignored because of Package Restore
152 | **/packages/*
153 | # except build/, which is used as an MSBuild target.
154 | !**/packages/build/
155 | # Uncomment if necessary however generally it will be regenerated when needed
156 | #!**/packages/repositories.config
157 | # NuGet v3's project.json files produces more ignoreable files
158 | *.nuget.props
159 | *.nuget.targets
160 |
161 | # Microsoft Azure Build Output
162 | csx/
163 | *.build.csdef
164 |
165 | # Microsoft Azure Emulator
166 | ecf/
167 | rcf/
168 |
169 | # Microsoft Azure ApplicationInsights config file
170 | ApplicationInsights.config
171 |
172 | # Windows Store app package directory
173 | AppPackages/
174 | BundleArtifacts/
175 |
176 | # Visual Studio cache files
177 | # files ending in .cache can be ignored
178 | *.[Cc]ache
179 | # but keep track of directories ending in .cache
180 | !*.[Cc]ache/
181 |
182 | # Others
183 | ClientBin/
184 | [Ss]tyle[Cc]op.*
185 | ~$*
186 | *~
187 | *.dbmdl
188 | *.dbproj.schemaview
189 | *.pfx
190 | *.publishsettings
191 | node_modules/
192 | orleans.codegen.cs
193 |
194 | # RIA/Silverlight projects
195 | Generated_Code/
196 |
197 | # Backup & report files from converting an old project file
198 | # to a newer Visual Studio version. Backup files are not needed,
199 | # because we have git ;-)
200 | _UpgradeReport_Files/
201 | Backup*/
202 | UpgradeLog*.XML
203 | UpgradeLog*.htm
204 |
205 | # SQL Server files
206 | *.mdf
207 | *.ldf
208 |
209 | # Business Intelligence projects
210 | *.rdl.data
211 | *.bim.layout
212 | *.bim_*.settings
213 |
214 | # Microsoft Fakes
215 | FakesAssemblies/
216 |
217 | # GhostDoc plugin setting file
218 | *.GhostDoc.xml
219 |
220 | # Node.js Tools for Visual Studio
221 | .ntvs_analysis.dat
222 |
223 | # Visual Studio 6 build log
224 | *.plg
225 |
226 | # Visual Studio 6 workspace options file
227 | *.opt
228 |
229 | # Visual Studio LightSwitch build output
230 | **/*.HTMLClient/GeneratedArtifacts
231 | **/*.DesktopClient/GeneratedArtifacts
232 | **/*.DesktopClient/ModelManifest.xml
233 | **/*.Server/GeneratedArtifacts
234 | **/*.Server/ModelManifest.xml
235 | _Pvt_Extensions
236 |
237 | # LightSwitch generated files
238 | GeneratedArtifacts/
239 | ModelManifest.xml
240 |
241 | # Paket dependency manager
242 | .paket/paket.exe
243 |
244 | # FAKE - F# Make
245 | .fake/
--------------------------------------------------------------------------------
/notes/notes-aug-2016.md:
--------------------------------------------------------------------------------
1 | # Some Notes on Using Machine Learning to Develop Inlining Heuristics
2 |
3 | August 2016
4 |
5 | ## Overview
6 |
7 | This document describes the work done from roughly February to August
8 | 2016 to use machine learning techniques to develop improved inlining
9 | heuristics for RyuJit.
10 |
11 | Based on this work, RyuJit now includes an inlining heuristic that is
12 | based on machine learning -- the ModelPolicy. This policy can be
13 | enabled by setting COMPlus_JitInlinePolicyModel=1 in environments
14 | where the jit generates code. Measurements on various internal
15 | benchmarks have shown this new policy gives roughly 2% geomean CQ
16 | improvement, 2% geomean CS reduction, and 1% throughput reduction.
17 | Measurements on "realistic" applications has just begun and the
18 | initial results are not as encouraging, but we are still optimistic
19 | that with some more work, the ModelPolicy or something quite similar
20 | can be enabled as the default policy going forward.
21 |
22 | A number of new measurement techniques were developed to support the
23 | modelling process. Even so, the models built so far are not entirely
24 | satisfactory. There are significant challenges and open questions
25 | in many areas of the work.
26 |
27 | The remainder of this document aims to describe the work that has been done,
28 | present the challenges that remain, and suggest avenues for further
29 | investigation. Note this is still a work in progress and some aspects
30 | of it are incomplete.
31 |
32 | ## Background
33 |
34 | The desirability of a machine-learning approach to the development of
35 | inlining heuristics was based on both past experience and some
36 | promising results from the literature.
37 |
38 | Past experience in manual development of inlining heuristics has shown
39 | that it is a complex and challenging endeavor. Typically, the heuristic
40 | developer must carefully study some number of examples to try and
41 | discern what factors lead to "good" inlines. These factors are then
42 | coded as heuristics, and combined via some ad-hoc method (say, via
43 | weights) to produce an overall figure of merit. A large number of
44 | rounds of experimental tuning on benchmarks are then used to select
45 | weight values.
46 |
47 | Failure of the heuristic to perform on certain benchmarks can and
48 | perhaps should lead to refining existing heuristics or the development
49 | of new heuristics, or to the improvement of downstream optimization
50 | abilities in the compiler, but often instead is handled by adjusting
51 | the various weights to try and obtain the desired outcome. There
52 | is inevitable bias in the developer's choice of factors and the expert
53 | analysis required to gain insight only scales to relatively small
54 | numbers of examples. Rigorous analysis to cross-check the importance
55 | of factors is not always done and performance of the model over time
56 | is typically not measured. This can lead to misleading confidence in
57 | the heuristics, since benchmark programs never change, while real
58 | applications evolve over time, sometimes quite rapidly.
59 |
60 | The recent literature describes some successes in using machine
61 | learning to create good inlining heuristics. One example is [Automatic
62 | Construction of Inlining Heuristics using Machine
63 | Learning](http://dl.acm.org/citation.cfm?id=2495914) by Kulkarni,
64 | Cavazos, Wimmer, and Simon. Here Kulkarni et al. treat inline
65 | profitability as an unsupervised learning problem, and create a
66 | well-performing heuristic black box (neural network) using
67 | evolutionary programming techniques. They then turn around and use
68 | this black box as an oracle to label inline instances, and from this
69 | guide a supervised machine learning algorithm to produce a decision
70 | tree that expresses the profitability heuristics in terms sensible to
71 | the compiler writer.
72 |
73 | It was hoped that machine learning techniques would lead to decent
74 | models that could be created relatively quickly, so that new models
75 | could be developed as the jit was ported to new architectures and new
76 | operating systems. Also as the capabilities of the jit or runtime were
77 | extended (say by improving register allocation or optimization) it
78 | would be possible to quickly re-tune the inliner to take best
79 | advantage of new capabilities, and/or to validate continued good
80 | behavior as key applications evolve. These tasks remain within the
81 | scope of our ambition, though we have not yet proven that such things
82 | are possible.
83 |
84 | Our inability (described in more detail below) to easily derive good
85 | performance models based on machine learning is most likely an
86 | indictment of some aspects of our overall process, though it is also
87 | possible that our difficulties simply reflect the degree of challenge
88 | inherent in improving heuristics in a mature and complex system with
89 | various realistic constraints.
90 |
91 | This [initial design
92 | note](https://github.com/dotnet/coreclr/blob/master/Documentation/design-docs/inlining-plans.md)
93 | -- describing the early views on the project -- may be of interest.
94 |
95 | ## Motivation
96 |
97 | The primary motivation for working on inlining was the potential for
98 | improved code quality (CQ) at similar or improved levels of code size
99 | (CS) and jit throughput (TP).
100 |
101 | This potential had been observed in many manual examples and bug
102 | reports, as well as experiments to simply make the inliner more
103 | aggressive.
104 |
105 | Nominal upside in CQ, given the current optimization capabilities of
106 | the jit, is in the range of 3-4% (geomean) across a variety of
107 | programs. As is always the case with such measures, the underlying
108 | distribution is broad, with some programs speeding up by substantially
109 | more, many remaining about the same, and a few slowing down.
110 |
111 | CQ generally increases steadily in aggregate with more inlining. For
112 | reasonable amounts of inlining, cases where inlining hurts performance
113 | are fairly rare. At high enough levels of inlining there may be
114 | adverse interactions as optimizer thresholds are tripped, and
115 | eventually the impact of the larger code is felt as contention for the
116 | limited physical memory resources of the host machine.
117 |
118 | CS (and TP) are often thought of as constraint or penalty terms rather
119 | than as optimization objectives. It is clear from experiments that
120 | inlining of suitably small methods will decrease CS and TP, so
121 | obtaining the "minimal" value for these metrics requires some amount
122 | of inlining. Too much inlining will increase CS without providing
123 | improvements in CQ.
124 |
125 | So, for a given level of CQ, there is a range of CS values that can
126 | obtain that CQ. The "ideal" level is then the minimal CS needed;
127 | roughly speaking, there is a CS/CQ tradeoff region with a bounding
128 | curve at the minimum CQ level. The locus and shape of this curve is
129 | unknown and must be discovered empirically. The curve will also vary
130 | considerably depending on the benchmarks. Ensemble measures of
131 | performance are needed, and (as noted above) when comparing
132 | two well-performing heuristics, there will always be examples
133 | where one heuristic outperforms the other.
134 |
135 | Various system design goals and runtime capabilities (eg desire for
136 | fast startup or good steady-state performance or blend of both,
137 | ability to dynamically re-optimize) dictate which region of the curve
138 | is most desirable. The challenge, then, is to develop an inlining
139 | heuristic that picks out an appropriate point that lies on or near the
140 | tradeoff curve given the design goals. The shape of the tradeoff
141 | curve is also of interest.
142 |
143 | In our case the ambition is to build a new inlining heuristic that can
144 | increase CQ to capture as much of the headroom as practical, while
145 | decreasing CS and TP.
146 |
147 | ## History
148 |
149 | The work done so far proceeded in roughly 4 stages, in order:
150 | refactoring, size measurements and modelling, time measurements and
151 | modelling, and speed measurements and modelling. These are described
152 | briefly below and in more detail in subsequent sections.
153 |
154 | Refactoring was done to enable the jit to have multiple inlining
155 | policies that could exist side by side. For compatibility reasons it
156 | was desirable to preserve the existing (legacy) behavior, and allowing
157 | other policies side by side facilitates experimentation. The legacy
158 | inliner's decision making was intertwined with the observation
159 | process, so it was necessary to separate these out to decouple policy
160 | from observation.
161 |
162 | Size impact of inlining was measured using the "crossgen" feature of
163 | the CLR. Here the jit is asked to generate code for most of the
164 | methods in an assembly ahead of time. The size impact of each inline
165 | was recorded along with the various observational values that were
166 | available to feed into a heuristic. This data fed into a size model
167 | that produced a size estimating heuristic. The models developed so
168 | far seem reasonably accurate, with an R^2 value of around 0.6.
169 |
170 | The time impact of inlining was measured by capturing CPU cycles
171 | expended in the jit between the time inlining had finished and the
172 | time the native code was generated (notably, this omits the time spent
173 | inlining, which is more difficult to measure). Modelling showed this
174 | time was closely related to the overall emitted size of the method,
175 | which was shown to be fairly reliably estimated by the sum of an
176 | initial size estimate plus the size impact of each successive inline.
177 |
178 | The performance impact of inlines was measured by enabling hardware
179 | performance monitoring counters to capture the number of instructions
180 | retired as the jitted code ran. Inlines were measured in isolation,
181 | one by one, and the difference in instructions retired was attributed
182 | to the inline. This data along with observations formed the data set
183 | that fed the speed model. Unfortunately, this performance data has
184 | proven to be difficult to model accurately.
185 |
186 | ## Constraints and Assumptions
187 |
188 | The CoreCLR currently gives its jit one opportunity to generate code
189 | for a method. The time it takes the jit to generate native code is a
190 | concern (eg it potentially impacts application start-up time), and
191 | given the general size-time relationship, this limits the ability of
192 | the jit to inline aggressively or to perform deep analysis in an
193 | attempt to find an optimal set of inlines for a method. The jit also
194 | has very limited ability to convey knowledge from one invocation to
195 | the next, so analysis costs cannot effectively be amortized.
196 |
197 | Currently the jit walks its IR in linear fashion, deciding whether to
198 | inline each time it sees a candidate. If the decision is *yes* then the
199 | inlined code is spliced in place of the call and (because of the order
200 | of the walk) immediately scanned for inlining candidates. Thus the
201 | inlining is performed "depth first" and is done without much knowledge
202 | of the number or location of other candidates in the code
203 | stream. Inlining is done very early on before any significant analysis
204 | has been done to the IR -- there is a flow graph, but loop nesting,
205 | dataflow, and profile estimates are generally not available.
206 |
207 | Thus the heuristic we have in mind is one that assesses each inline
208 | independently of any assessments done before. Factors visible at the
209 | immediate call site and some general information about the accumulated
210 | IR can be used to influence decisions, so it's possible given a method
211 | A with two callsites for to B that one call to B gets inlined and the
212 | other doesn't.
213 |
214 | ## Overall Approach to Heuristic Creation
215 |
216 | The work history above reflects the initial proposal for heuristic
217 | creation -- first build size and speed models, and then combine those
218 | to create a heuristic. The general idea was to have an explicit
219 | size/speed tradeoff made per inline. The idealized heuristic is:
220 | ```
221 | if (SizeDelta <= 0) { inline; }
222 | else if (SpeedDelta > alpha * SizeDelta) { inline; }
223 | ```
224 | where SizeDelta represents the increase in code size caused by
225 | the inline, SpeedDelta is the decrease in instructions executed, and
226 | alpha is a tradeoff factor. So good inlines either decrease size, or
227 | justify their size increase with a speed decrease, and alpha
228 | describes how willing we are to trade speed for size.
229 |
230 | This is roughly the heuristic implemented by the ModelPolicy.
231 | SizeDelta and SpeedDelta are computed by models derived from machine
232 | learning; alpha is manually chosen by "tuning" to give the desired
233 | tradeoff.
234 |
235 | However, the implemented model has an additional parameter, one whose
236 | presence reflects one of the key challenges present in this work. The
237 | size model has natural units of bytes of code (or instructions, if
238 | they're fixed size). Size impacts from inlining are typically small,
239 | say in the range of a few hundred bytes one way or the other. But the
240 | speed impact of an inline can vary over a much wider range. If we
241 | measure the actual change in instructions retired on a benchmark given
242 | one inline difference, the value may vary from -1e9 to 1e9 with many
243 | values clustered closely around zero.
244 |
245 | In an attempt to pull these values into a more manageable range for
246 | modelling, the value provided by the model is instructions retired per
247 | call to the callee. This needs to be multiplied by a "call site
248 | weight" beta to reflect the importance of the call site to the caller,
249 | and further by some "root method weight" to reflect the importance of
250 | the root method to the overall benchmark. We currently use ad-hoc
251 | methods to estimate beta and ignore the root method weight, so the
252 | full heuristic is:
253 | ```
254 | if (SizeDelta <= 0) { inline; }
255 | else if (beta * PerCallSpeedDelta > alpha * SizeDelta) { inline; }
256 | ```
257 | Here beta can vary depending on call site, and alpha is the fixed
258 | size-speed tradeoff.
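
A worked example may help make the interaction of these parameters concrete. The R sketch below is illustrative only; the function and all of the numbers are hypothetical and are not the values used by the ModelPolicy.

```
should_inline <- function(SizeDelta, PerCallSpeedDelta, beta, alpha) {
  ## Inline if size shrinks, or if the weighted speed benefit outweighs
  ## the weighted size cost.
  SizeDelta <= 0 || beta * PerCallSpeedDelta > alpha * SizeDelta
}

should_inline(SizeDelta = -12, PerCallSpeedDelta = 0,  beta = 1, alpha = 2)  # TRUE: size decreases
should_inline(SizeDelta =  50, PerCallSpeedDelta = 30, beta = 4, alpha = 2)  # TRUE: 120 > 100
should_inline(SizeDelta =  50, PerCallSpeedDelta = 30, beta = 1, alpha = 2)  # FALSE: 30 <= 100
```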
259 |
260 | One might legitimately question this model, even if all of the
261 | quantities could be estimated perfectly. More on this subsequently.
262 |
263 | ## Some Terminology
264 |
265 | An *inline tree* is the set of inlines done into a method. The root
266 | method is the initial method; the top level inlines are the
267 | descendants of the root, and so on.
268 |
269 | An *inline forest* is the set of inline trees that are in effect for a
270 | benchmark run. There is one inline tree for each method executed.
271 |
272 | An inline tree X is a *subtree* of an inline tree Y if Y contains all
273 | the inlines in X and possibly more. A tree X is a *proper parent* of Y
274 | if Y contains just one extra inline.
275 |
276 | ## Size Modelling and Measurements
277 |
278 | ### Size Measurements
279 |
280 | To measure size, the legacy inliner was modified so that in each
281 | method, it would stop inlining after some number of inlines, K, where
282 | K could be specified externally. The jit would then generate native
283 | code for each method and measure the methods' native code size. Since
284 | the inlining decisions made in each method jitted are independent,
285 | data from many inlining instances can be collected in one run of
286 | crossgen, potentially one per "root" method.
287 |
288 | The overall process ran from K = 0 up to Kmax. For each run the
289 | size of the method was dumped to a file along with various
290 | observational values that were available to feed into a heuristic, and
291 | various metadata used to identify the root method. For each row of
292 | data, the value of K was recorded as the "version" of the inlining
293 | experiment.
294 |
295 | Given the raw data, the native size impact of each inline can then be
296 | determined by a post-processing pass: for each method and each inline
297 | into the method, the size change is found by subtracting the method
298 | size for the case where J-1 inlines were performed from the size when J
299 | inlines were performed. Note not all methods will be able to perform
300 | the full set of K inlines, so as K increases, the number of methods
301 | that do more inlines decreases. So if there are initially N root
302 | methods the total number of rows of inline data with a given version
303 | decreases as the version increases.
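
For concreteness, the differencing step might look like the following R sketch (using dplyr; `raw` is a hypothetical data frame holding the dumped rows, with `Method`, `Version`, `HotSize`, and `ColdSize` columns):

```
library(dplyr)

## For each root method, order the rows by version (the K limit) and
## attribute the size change between consecutive versions to the K-th inline.
size_deltas <- raw %>%
  arrange(Method, Version) %>%
  group_by(Method) %>%
  mutate(TotalSize      = HotSize + ColdSize,   # total native size (hot + cold)
         TotalSizeDelta = TotalSize - lag(TotalSize)) %>%
  filter(Version > 0) %>%
  ungroup()
```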
304 |
305 | Reliably identifying the root across runs proved nontrivial, since the
306 | main values used as identifying keys (token and hash) were not
307 | sufficiently unique. These might come from additional stub methods
308 | created by the crossgen process or perhaps from multiply instantiated
309 | generic methods. Post-processing would thus ignore any data from a
310 | method where the key was not unique (eg multiple version 0 rows with
311 | the same token and hash).
312 |
313 | The [data set used](../data/mscorlib.data.model-rel-log.parsed) to
314 | develop the current model is taken from a crossgen of the CoreCLR core
315 | library. It has 29854 rows. Given the special role played by this
316 | library it is quite possible this is not a good representative set of
317 | methods. Considerably more and more diverse data was gathered (upwards
318 | of 1M rows using the desktop "SPMI" method) but this data proved
320 | unwieldy. The data gathered is also specific to x64 and Windows and
320 | the behavior of the jit at that time.
321 |
322 | Subsequent work on performance measurement has created new data sets
323 | that could be used for size modelling, since a similar sort of
324 | K-limiting approach was used for performance, and the factor
325 | observations for size and speed are common. The most recent such data
326 | set is the [v12 data](../data/all-benchmark-v12-a15.csv).
327 |
328 | ### Size Modelling
329 |
330 | The size data is "noise free" in that (absent errors in coding) the
331 | sizes in the data set should be completely accurate. Given the
332 | relatively simple behavior of the jit it was felt that a linear model
333 | should work well.
334 |
335 | The model needs to be relatively simple to implement and quick to
336 | evaluate, and it is highly desirable that it be interpretable. Based
337 | on this the model developed is a penalized linear model using R's
338 | 'glmnet'. [This script](../scripts/ModelPolicyV1Size.R.txt) was used
339 | to derive the model. It is implemented by
340 | `DiscretionaryPolicy::EstimateCodeSize` in the code base. This model
341 | explains about 55% of the variance in the mscorlib size data, and 65%
342 | of the variance seen in the v12 data.
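
As a sketch of how such a model turns into an implementable heuristic, the nonzero coefficients at the chosen lambda can be pulled out of the glmnet fit (here `FullModel`, as in the size script) and hand-translated into the jit:

```
library(glmnet)

## Nonzero coefficients at lambda.1se; these weighted observations are the
## terms re-expressed in DiscretionaryPolicy::EstimateCodeSize.
cf <- as.matrix(coef(FullModel, s = "lambda.1se"))
cf[cf[, 1] != 0, , drop = FALSE]
```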
343 |
344 | Naive use of more sophisticated models (eg random forests, gradient
345 | boosting, mars) to see how much the linear model might be leaving
346 | behind didn't yield much improvement.
347 |
348 | So the belief is that some of the remaining unexplained variance comes
349 | from missing observations. An exploration of poorly fitting examples
350 | would likely prove fruitful. There is likely some nontrivial amount of
351 | variation that will never be easily explained -- the jit's code
352 | generation can be quite sensitive to the exact details of both root
353 | method and callee.
354 |
355 | Exactly how close one can come to modelling size is an open question.
356 |
357 | The degree to which the inaccuracy of the current size model hurts the
358 | overall inline heuristic performance is another. The belief is that
359 | the speed and high-level structure of the heuristic are likely larger
360 | contributors to poor performance. However, they may also be more
361 | difficult to improve.
362 |
363 | ### Size Model Viewed as Classification
364 |
365 | Given the form of the idealized heuristic, drawing a clear distinction
366 | between size-increasing and non-size-increasing inlines is
367 | important. We can view the regression model developed above as a
368 | classifier and see how well it performs at this task (here on the V12
369 | data):
370 |
371 | Actual | Est Decrease | Est Increase | Total
372 | ---------------|--------------|--------------|-------
373 | Size Decrease | 2052 | 776 | 2828
374 | Size Increase | 132 | 3162 | 3298
375 | Total | 2188 | 3938 | 6126
376 |
377 | So the model is quite accurate in predicting size increasing cases,
378 | getting only 132/3298 wrong (96% accuracy).
379 |
380 | It's not as good at predicting size decreasing cases: 776/2828 were
381 | misclassified as size increasing (72% accuracy).
382 |
383 | To better implement the idealized heuristic, it might make sense to
384 | bias the model to increase the accuracy of classifying size decreasing
385 | cases. For instance, setting the classification threshold to EstSize -
386 | 40 (recall the value is in bytes * 10) would give roughly balanced error
387 | rates. The downside is that a larger number of size increasing cases
388 | are now inlined without further scrutiny.
389 |
390 | For inlines classified as size increasing, the magnitude of the size
391 | increase comes into play, so one might also attempt to make more accurate
392 | predictions for size increasing inlines and trade off accuracy in
393 | predicting the magnitude of size decreases.
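
A sketch of the threshold experiment in R (`Actual` and `Est` are hypothetical vectors standing in for the measured TotalSizeDelta and the model's estimate, the latter in bytes * 10):

```
## Confusion matrix with the natural threshold of zero.
table(Actual   = ifelse(Actual <= 0, "Decrease", "Increase"),
      Estimate = ifelse(Est    <= 0, "Decrease", "Increase"))

## Biased threshold: treat anything estimated below 40 (i.e. 4 bytes) as a
## decrease, improving recall on the size-decreasing cases at the cost of
## letting more size-increasing inlines through without further scrutiny.
table(Actual   = ifelse(Actual <= 0, "Decrease", "Increase"),
      Estimate = ifelse(Est    <= 40, "Decrease", "Increase"))
```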
394 |
395 | ## Speed Model and Measurements
396 |
397 | ### Speed Measurements
398 |
399 | While noise-free size measurements are easy to come by, some degree of
400 | noise is inevitable for most approaches to speed measurements.
401 | Generally speaking any inline whose impact is of the same magnitude as
402 | the ambient noise level will be very difficult to measure.
403 |
404 | The most common approach is to measure wall-clock or process time or
405 | cycle time for a benchmark. It is difficult to get noise levels for
406 | these approaches below roughly 1% of the overall runtime of the
407 | test. This amount of noise restricts the set of measurable
408 | inlines. Aside from interference by other processes running on the
409 | host machine, time-based measurements also can fall prey to
410 | microarchitectural implementation issues, in particular things like
411 | loop alignments, global branch prediction, various security-inspired
412 | randomization techniques, power management, and so on. Thus even
413 | run-to-run repeatability on an otherwise quiet machine will be
414 | impacted. The inliner also operates early enough in the compilation
415 | pipeline that machine microarchitecture is of secondary concern.
416 |
417 | To avoid some of these pitfalls we have adopted the instructions
418 | retired by the benchmark as our primary performance metric. This is
419 | relatively insensitive to microarchitectural details (with some caveats)
420 | and noise levels of 0.01% - 0.1% are not difficult to come by.
421 |
422 | Measuring instructions retired on Windows requires elevation since
423 | only the kernel can access and program the performance monitoring
424 | counters (PMC).
425 |
426 | ### Isolating Inlines
427 |
428 | To capture the per-inline impact on performance one must be able to
429 | run the benchmark twice, varying just one inline between the two runs.
430 | To do this requires some care. The K-limiting approach used during the
431 | size data collection does not sufficiently control inlining.
432 |
433 | So instead we developed some alternate techniques. One of them is to
434 | use K-limiting along with the ability to suppress inlining in all but
435 | one root method and a FullPolicy inline heuristic that inlines where
436 | possible. This combination allows successive measurements where just
437 | one inline differs (with some care taken to handle force inlines).
438 | Note the "context" of the inline is the one that arises from the DFS
439 | enumeration. So as K increases we may be inlining deep into the tree.
440 |
441 | Because we wanted a greater quantity of samples for shallow inlines we
442 | developed a second technique where inline replay is used to carefully
443 | control inlining. An inline forest was grown by enabling an
444 | inline policy and collecting the initial inline forest. Inlining
445 | for each root in the forest was then disabled and a new run was done;
446 | this collects forests for new roots (methods that were always inlined
447 | in the initial run). This process continues until closure, yielding a
448 | full forest for that policy.
449 |
450 | This full forest is expressed in inline XML. Inline replay can then be
451 | used to isolate inlines to just one tree in the forest by measuring
452 | the performance of a tree and one of its proper parents. In actuality
453 | we measured each tree by growing from the empty tree towards the full
454 | tree in a breadth-first fashion. It is probably a good idea to try a
455 | greater variety of exploration orders here.
456 |
457 | As an aside, using an aggressive policy like the FullPolicy, one can
458 | enumerate the "jit-visible" call graph, a graph that shows the range
459 | of possible inline trees and hence inline forests.
460 |
461 | ### How to Measure an Inline
462 |
463 | The measurement process described below is orchestrated by the
464 | [PerformanceExplorer](https://github.com/AndyAyersMS/PerformanceExplorer).
465 |
466 | Benchmarks are set up to run under xunit-performance. Per-benchmark
467 | times are roughly normalized to 1 second to try and keep the variance
468 | constant in each benchmark.
469 |
470 | Xunit-performance runs the benchmark code via reflection. For each
471 | attributed method it runs some number of iterations, enabling
472 | performance counting via ETW (on windows). It also issues events for
473 | the start and end of each benchmark and each iteration of the
474 | benchmark. The event stream is post processed to find events
475 | attributed to the benchmark process that fall within the iteration
476 | span of a particular benchmark method. These events are counted and
477 | the total is multiplied by the PMC reload interval (100,000 I believe)
478 | to give the overall instruction retired estimate for that iteration.
479 | The raw iteration data is then written to an XML file. This data is
480 | read by the orchestrating process and an average count is computed
481 | from the iterations. This average value is then subtracted from the
482 | averaged value measured in the proper parent run to get the
483 | per-inline performance impact.
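
Expressed as a small R sketch (the event counts and reload interval are placeholders), the per-inline delta computed from two such runs is:

```
reload <- 100000   # assumed PMC reload interval, per the text above

## Average instructions retired over the iterations of one run.
instr_retired <- function(event_counts) mean(event_counts * reload)

child_events  <- c(9821, 9807, 9835)   # iterations of the tree with the extra inline
parent_events <- c(9950, 9941, 9963)   # iterations of its proper parent

## Negative values mean the extra inline reduced instructions retired.
per_inline_delta <- instr_retired(child_events) - instr_retired(parent_events)
```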
484 |
485 | The overall exploration process repeats the above for each root in
486 | each benchmark, growing trees from their noinline version up to the
487 | full version seen in the forest. Exploration is short-circuited for a
488 | root if the full tree performance for that root does not differ
489 | significantly from the noinline version, or if the root has no
490 | inlines. Roots are explored in the order of the number of calls (see
491 | below).
492 |
493 | ### Additional Data Measured
494 |
495 | Along with the measured change in instructions retired, it seemed
496 | important to also get some idea about how the call counts were
497 | changing in the program -- in particular how often the root being
498 | explored was called, and how frequently it called the method being
499 | inlined. To do this a special instrumentation mode was developed that
500 | hijacks the IBC mechanism present in the CoreCLR. Each jitted method's
501 | entry was instrumented with a counter to count the number of
502 | calls. These counts are dumped on jit shutdown. The root method count
503 | is directly available; the callee count can be deduced by knowing its
504 | value in the noinline run and accounting for any other inlines of that
505 | callee made along the way.
506 |
507 | One failing of this method is that if the callee comes from a
508 | prejitted image it will never run in instrumented form. To work around
509 | this the use of prejitted images can be disabled. This creates its own
510 | set of complications because every benchmark contains a sizeable
511 | amount of startup code that might be repeatedly explored. So
512 | optionally the explorer maintains a list of already explored methods
513 | and tries to avoid re-exploration.
514 |
515 | Another failing is that the call counts are captured by running the
516 | benchmark tests normally and not by running them under
517 | xunit-performance. The benchmarks have been set up so that key
518 | portions behave identically under both scenarios, but the real
519 | possibility exists that the call counts measured this way diverge from
520 | the counts running under the performance harness.
521 |
522 | It would probably be better to capture the call count data via the
523 | normal profiling API so that a special build of the jit with this
524 | capability is not needed (though note a special build is still needed
525 | to get at the inline data).
526 |
527 | ### Coping with Noise
528 |
529 | The impact of specific inlines can be elevated above the noise by
530 | iteration -- repeatedly invoking methods in loops. This elevation
531 | is generally a manual process and so restricts the set of inlines that can
532 | be studied. But some degree of this is probably necessary.
533 |
534 | Adoption of a benchmarking framework like xunit-perf allows for the
535 | iteration strategy to be determined after the benchmark is authored.
536 |
537 | Runs can be repeated with the well-known result that if the noise is
538 | uncorrelated, the noise level will fall off with the square root of the
539 | number of runs. However, on Windows at least, we have seen that the
540 | ambient noise level can vary over periods of minutes to hours. So some
541 | kind of adaptive iteration strategy might be required where the
542 | benchmarking harness periodically runs some known-effort workload to
543 | assess the ambient noise, and then either records the noise level or
544 | tries to adjust (or perhaps defer) data gathering to compensate for
545 | higher noise levels.
546 |
547 | There is also some inherent sampling noise. Consider this simple model
548 | of how PMC sampling works. The per-CPU PMC is programmed with some
549 | count-down value N. The OS then schedules processes to this
550 | CPU. Instructions are executed and the counter counts down. When it
551 | hits zero, the entire allotment of counts is "charged" to the current
552 | process. Suppose during this time the process being benchmarked ran
553 | for most of the time but for some fraction of instructions alpha,
554 | other processes ran on the CPU. Then the expected instruction charge
555 | to the benchmark process for this interval is:
556 | ```
557 | E = alpha * 0 + (1-alpha) * N
558 | ```
559 | which reflects that on some of the PMC rollovers the process is not
560 | charged even though it made progress, and on others it is charged but
561 | made somewhat less progress than the charge would indicate.
562 |
563 | The actual progress towards completion is given by the same formula.
564 | If the entire benchmark runs for K instructions then on average during
565 | the benchmark's execution the number of charges will be K/E, and hence
566 | the expected value for the total charge is K, which equals the actual
567 | total charge. So the added noise here does not apparently bias the
568 | estimated mean. It does, however, create variance.
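
A toy simulation of this charging model (with made-up parameter values) illustrates both points: the mean total charge tracks K, but the run-to-run spread does not vanish.

```
set.seed(1)
N     <- 1e5       # PMC count-down value
K     <- 1e9       # instructions the benchmark actually executes
alpha <- 0.05      # fraction of instructions belonging to other processes

## Each rollover charges N to the benchmark with probability (1 - alpha).
rollovers <- round(K / ((1 - alpha) * N))
sim <- replicate(1000, sum(rbinom(rollovers, 1, 1 - alpha)) * N)

mean(sim)   # close to K: the estimate is essentially unbiased
sd(sim)     # clearly nonzero: the sampling itself adds variance
```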
569 |
570 | The existence of this variance is readily observable. Unfortunately
571 | the exact nature of this variance is not well characterized. See for
572 | instance the discussions in
573 | [Flater](http://nvlpubs.nist.gov/nistpubs/technicalnotes/NIST.TN.1826.pdf).
574 | However it seems reasonable to assume that the variance increases,
575 | perhaps nonlinearly, with increasing alpha, and also increases if
576 | alpha itself is subject to variation, and that the variance does not
577 | go to zero as the benchmark is run for longer intervals.
578 |
579 | ### Speed Model -- Idealized
580 |
581 | The idealized speed model for the change in instructions retired
582 | for a single isolated inline into a root at some site is:
583 | ```
584 | InstRetiredDelta = RootCallCount * InstRetiredPerRootCallDelta
585 | InstRetiredPerRootCallDelta = Overhead + CallDelta * InstRetiredPerCallDelta
586 | InstRetiredPerCallDelta = F(...)
587 | ```
588 | See the table below for an explanation of these terms. InstRetiredDelta,
589 | RootCallCount, InstRetiredPerRootCallDelta, Overhead, CallDelta, and
590 | InstRetiredPerCallDelta are measured after the fact.
591 |
592 | For predictive purposes they must be derived from modelling.
593 |
594 | ### Speed Model -- Modelling
595 |
596 | Attempts to coax workable performance models out of the data gathered
597 | above have largely fallen flat.
598 |
599 | The first challenge is to figure out what to model. With the current
600 | data set, there are several viable options:
601 | - InstRetiredDelta
602 | - InstRetiredPct
603 | - InstRetiredPerRootCallDelta
604 | - InstRetiredPerCallDelta
605 |
606 | The first two measures reflect the realized potential of the inline in
607 | some benchmark setting. They seem somewhat arbitrary -- if the
608 | benchmark was run for twice as long, the overall change in
609 | instructions would double as well. And the percentage value likewise
610 | seems off -- consider a test like the CscBench that has two timed
611 | sub-benchmarks. If an inline benefits one and not the other, the
612 | percentage change in instructions retired depends on the relative
613 | number of instructions run for the two sub-benchmarks. In terms of the
614 | idealized model, `RootCallCount` is thus something we can't easily
615 | characterize.
616 |
617 | So some sort of relative measure seems more appropriate. Because the
618 | jit generally has no notion of the overall importance of the root
619 | method in the ongoing processing (with some exceptions: the .cctor is
620 | known to run rarely, and when/if there's profile feedback, the jit
621 | might know something from past observations), it must presume that the
622 | root method might be called frequently. So a plausible figure of merit
623 | for inline benefit is the change in instructions retired per call to
624 | the root method: `InstRetiredPerRootCallDelta`.
625 |
626 | One could likewise argue that the right figure of merit for speed is
627 | `InstRetiredPerCallDelta` -- the change in instructions retired per
628 | call to the inlinee. This could be multiplied by a local estimate for
629 | call site frequency to arrive at a projected per call benefit. The jit
630 | computes block frequency estimates for other purposes and it would be
631 | good if all such estimates agreed. So instead of having this be implicit
632 | in the inline profit model, it could be made explicit.
633 |
634 | With either of these relative measures there is still potential for
635 | wide dynamic range as instruction retirement counts can be amplified
636 | by loops in the root or in the callee.
637 |
638 | Measurement of either of these requires that `RootCallCount` and
639 | `CallDelta` be captured. This is currently done with the special
640 | instrumentation mentioned above.
641 |
642 | Note also that `CallDelta` may well be zero, in which case the
643 | `InstRetiredPerRootCallDelta` reflects just the `Overhead` term. This term
644 | represents changes in the root that are not "in the vicinity" of the
645 | inline site -- eg extra register saves in the prolog or epilog, or
646 | improved or pessimized code in other parts of the root method.
647 |
648 | Also, the number of times `CallDelta` is observed to be zero is
649 | overstated in the V12 data set because callee call count values are
650 | not always available (see note above about additional data in the
651 | presence of crossgen). This should be fixed in the forthcoming V13
652 | data set.
653 |
654 | Unfortunately, it is proving difficult to find good models for
655 | any of the measures above. Some potential explanations:
656 |
657 | - High noise levels. Typical noise of 0.01% - 0.1% still means variations
658 | on the order of 5M instructions. Many inlines will have absolute
659 | impact below this level.
660 | - Missing key observations
661 | - Errors in measurement or in post-processing
662 | - Poor selection of benchmarks
663 | - Varying noise levels
664 |
665 | ### Speed Model -- Modelling Attempts
666 |
667 | Various approaches have been tried, without success, for
668 | performance models:
669 |
670 | - Find some subset of the data that is predictable. For instance
671 | cases with high `CallDelta`
672 | - General linear modelling with nonlinear terms and interaction terms
673 | - Nonlinear models like mars
674 | - Quantile and robust regressions
675 | - Classifying rather than regressing: label each inline as "improvement" or
676 |   "regression", or use some multinomial sense of "goodness".
677 | - Transforming the response to reduce the dynamic range (~ predict log of delta)
678 | - Temporarily allowing some output terms (eg `RootCallCount`, `CallDelta`) in models
679 | - Ensemble models (random forest, gradient boosting). While we might not want
680 |   to implement such models, if they're unable to predict results well, then there
681 | is not much hope for simpler implementable models
682 | - Weighted models (see the sketch below), where the weight is used to
683 | - Cope with potential heteroscedastic results
684 | - Ignore impact of outliers
685 | - Emphasize instances felt to be above the noise level
686 |
687 | Very few models can explain more than a few percent of the variation.
688 |
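The weighting idea in the last set of bullets can be made concrete. Below is a
minimal, hypothetical sketch: it assumes per-observation `InstRetiredDelta` and
`InstRetiredSD` values (columns from the data files described later), and the
cap of 4.0 is an arbitrary illustration, not a tuned value:

```csharp
using System;

static class Weighting
{
    // Weight for one observation, given its measured delta and estimated noise
    // (both in millions of instructions, as in the data files).
    public static double ObservationWeight(double instRetiredDelta, double instRetiredSD)
    {
        if (instRetiredSD <= 0)
        {
            return 0; // no noise estimate available; ignore the observation
        }

        // Signal-to-noise ratio: how far the measured delta stands above its
        // estimated standard deviation.
        double snr = Math.Abs(instRetiredDelta) / instRetiredSD;

        // Downweight observations below the noise floor and cap the rest so that
        // a few very large deltas do not dominate the fit.
        return Math.Min(snr, 4.0);
    }
}
```
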
689 | ### Speed Model -- Implemented Model
690 |
691 | The model currently implemented in the `ModelPolicy` came from an
692 | early V3 [data set](../data/all-benchmark-v3-a13.csv), and relies on
693 | just 210 observations. It predicts `InstRetiredPerCallDelta`. It is a
694 | penalized linear model that can explain only about 24% of the
695 | variation. [This is the script](../scripts/ModelPolicyV1Perf.R.txt)
696 | used to derive the model
697 |
698 | For use in the heuristic, the speed estimate from the model is
699 | multiplied by a local estimate of `CallDelta` to give an estimate of
700 | `InstRetiredPerRootCallDelta`. This local estimate is ad-hoc and was
701 | chosen to give some boost to call sites believed to be in loops in the
702 | root method.
703 |
704 | This version of the model was intended to be preliminary so that a
705 | trial implementation of the ModelPolicy and idealized heuristic could
706 | be assessed. However, no better model has emerged in the time since.
707 |
708 | ## Current Heuristic
709 |
710 | The current ModelPolicy heuristic follows the form of the idealized
711 | heuristic. It uses the size model and speed model, along with a local
712 | call site weight and a size-speed tradeoff parameter. The weight and
713 | tradeoff parameters were set based on benchmark runs and size
714 | assessments.
715 |
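To illustrate the shape of this tradeoff, here is a hedged sketch of how such a
decision might combine the two models. The parameter names, signs, and the
threshold form are assumptions for exposition; this is not the ModelPolicy
source:

```csharp
// Illustrative decision rule combining a size estimate with a per-call speed
// estimate. All names and the tradeoff form are assumptions; see the jit's
// ModelPolicy for the real implementation.
static class HeuristicSketch
{
    public static bool ShouldInline(
        double modelCodeSizeEstimate,            // estimated size growth (negative means the root shrinks)
        double modelPerCallInstructionEstimate,  // estimated instructions saved per call to the callee
        double callSiteWeight,                   // local estimate of call site executions per root call
        double sizeSpeedTradeoff)                // benefit required per unit of size growth
    {
        // Estimated size decrease: inlining is a pure win, so accept.
        if (modelCodeSizeEstimate <= 0)
        {
            return true;
        }

        // Otherwise require the projected per-root-call benefit to pay for the growth.
        double projectedPerRootCallBenefit = modelPerCallInstructionEstimate * callSiteWeight;
        return projectedPerRootCallBenefit >= sizeSpeedTradeoff * modelCodeSizeEstimate;
    }
}
```
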
716 | Results show about a 2% geomean improvement in the CoreCLR benchmarks,
717 | with around a 2% size decrease in the core library crossgen size, and
718 | about a 1% throughput improvement.
719 |
720 | Evaluation of this heuristic on other benchmarks is just beginning.
721 | Some tests on parts of RavenDB show a possible 2% CQ improvement,
722 | though there were some interactions with force inline
723 | directives. Measurements on ASP.Net Techempower plaintext show about
724 | a 2% regression.
725 |
726 | Viewed as a classifier, here's how well the implemented model does at
727 | implementing the idealized heuristic (V12 data):
728 |
729 | Actual | Est Size Decrease | Est Profitable | Est Don't Inline | Total
730 | --------------|-------------------|----------------|------------------|-------
731 | Size Decrease | 2052 | 86 | 690 | 2828
732 | Profitable | 25 | 7 | 384 | 416
733 | Don't Inline | 111 | 39 | 2732 | 2882
734 | Total | 2188 | 132 | 3806 | 6126
735 |
736 | Accuracy is 78% overall ((2052 + 7 + 2732) / 6126 ≈ 0.78). The largest
737 | errors come from inlines that actually decrease size but are estimated to
738 | increase size and then judged as unprofitable (690), and from inlines that
739 | are correctly estimated to increase size but are then assessed as unprofitable (384).
740 |
741 | Note that there may be substantial labelling error for the
742 | size-increasing cases, given the high noise levels in profitability
743 | measurements and the low impact of many inline instances.
744 |
745 | ## Alternatives
746 |
747 | ### Full-on Classification Model
748 |
749 | One might legitimately ask if it would be better to try and learn the
750 | idealized heuristic directly. Such a model would incorporate aspects
751 | of the size and speed models, though they might no longer be
752 | distinguishable as such.
753 |
754 | ### Learning from Inline Forests
755 |
756 | Instead of measuring inlines in isolation, one might attempt to infer
757 | value by studying performance changes for entire inline forests. This
758 | seems to match (in spirit) the approach taken in Kulkarni et al. A
759 | randomized heuristic is used, and this creates a collection of forests
760 | and performance results. Results are projected back onto individual
761 | inlines in the forest and, for each inline, the projected results are
762 | aggregated into some kind of label for that inline.
763 |
764 | For instance, one could track three numbers (possibly weighted by
765 | magnitude of the change) for each instance: the number of times it
766 | appears in a run that increases performance, the number of times it
767 | appears in a run that decreases performance, and the number of times
768 | it does not appear at all. The objective would be to then learn how to
769 | identify inlines whose appearance is correlated with improved
770 | performance.
771 |
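A hedged sketch of the bookkeeping such labeling would need is shown below.
Everything here is hypothetical: a "run" is assumed to be summarized as the set
of inline instances it contains plus its measured change in instructions
retired (negative meaning improved performance):

```csharp
using System.Collections.Generic;
using System.Linq;

static class ForestLabeling
{
    // Hypothetical per-inline tally accumulated across randomized forest runs.
    public class InlineTally
    {
        public int InImprovingRuns;  // runs containing this inline that got faster
        public int InRegressingRuns; // runs containing this inline that got slower
        public int Absent;           // runs where this inline did not appear
    }

    // runs: each entry is (inline instances present in the run, change in instructions retired).
    public static Dictionary<string, InlineTally> Label(
        IEnumerable<(HashSet<string> inlines, double instRetiredDelta)> runs,
        IEnumerable<string> allInlines)
    {
        var tallies = allInlines.ToDictionary(i => i, _ => new InlineTally());
        foreach (var (inlines, instRetiredDelta) in runs)
        {
            foreach (var entry in tallies)
            {
                if (!inlines.Contains(entry.Key)) { entry.Value.Absent++; }
                else if (instRetiredDelta < 0) { entry.Value.InImprovingRuns++; }  // fewer instructions retired
                else if (instRetiredDelta > 0) { entry.Value.InRegressingRuns++; }
            }
        }
        return tallies;
    }
}
```
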
772 | ### Finding Ideal Forests
773 |
774 | Along these lines one might also use randomness or a genetic approach
775 | to try and identify the "optimal" inline forest for each benchmark,
776 | and then attempt to generalize from there to a good overall inline
777 | heuristic.
778 |
779 | ## Inline Data Files
780 |
781 | The files have a header row with column names (meanings below), and
782 | then data rows, one per inline instance.
783 |
784 | Column Type | Meaning | Use in Heuristic?
785 | ------------|--------------------------------------|-----
786 | input | observation available for heuristic | Yes
787 | estimate | value internally derived from inputs | Maybe
788 | meta | metadata about the instance | No
789 | output | measured result | No
790 |
791 | The table below describes the V12 (and forthcoming V13) data sets.
792 | Older files may have a subset of this data, and may contain a Version0
793 | row for each method giving method information without any inlines.
794 |
795 | Column Name | Type | Meaning
796 | --------------------- |----------|--------
797 | Benchmark | meta | Name of benchmark program
798 | SubBenchmark | meta | none (all sub-benchmark data now aggregated)
799 | Method | meta | Token value of the root method
800 | Version | meta | Ordinal number of this inline
801 | HotSize | output | Hot code size of method after this inline (bytes)
802 | ColdSize | output | Cold code size of method after this inline (bytes)
803 | JitTime | output | Time spent in code gen after inlining (microseconds)
804 | SizeEstimate | estimate | Estimated code size for this method (hot + cold)
805 | TimeEstmate | estimate | Estimated jit time for this method (microseconds)
806 | ILSize | input | Size of callee method IL buffer (bytes)
807 | CallsiteFrequency | estimate | Importance of the call site (factor)
808 | InstructionCount | input | Number of MSIL instructions in the callee IL
809 | LoadStoreCount | input | Number of "load-store" MSIL instructions in callee IL
810 | Depth | input | Depth of this call site (1 == top-level)
811 | BlockCount | input | Number of basic blocks in the callee
812 | Maxstack | input | Maxstack value from callee method header
813 | ArgCount | input | Number of arguments to callee (from signature)
814 | ArgNType | input | Type of Nth argument (factor, CorInfoType)
815 | ArgNSize | input | Size of Nth argument (bytes)
816 | LocalCount | input | Number of locals in callee (from signature)
817 | ReturnType | input | Type of return value (factor, CorInfoType)
818 | ReturnSize | input | Size of return value (bytes)
819 | ArgAccessCount | input | Number of LDARG/STARG opcodes in callee IL
820 | LocalAccessCount | input | Number of LDLOC/STLOC opcodes in callee IL
821 | IntConstantCount | input | number of LDC_I and LDNULL opcodes in callee IL
822 | FloatConstantCount | input | number of LDC_R opcodes in callee IL
823 | IntLoadCount | input | number of LDIND_I/U opcodes in callee IL
824 | FloatLoadCount | input | number of LDIND_R opcodes in callee IL
825 | IntStoreCount | input | number of STIND_I opcodes in callee IL
826 | FloatStoreCount | input | number of STIND_R opcodes in callee IL
827 | SimpleMathCount | input | number of ADD/SUB/.../CONV_I/U opcodes in callee IL
828 | ComplexMathCount | input | number of MUL/DIV/REM/CONV_R opcodes in callee IL
829 | OverflowMathCount | input | number of CONV_OVF and math_OVF opcodes in callee IL
830 | IntArrayLoadCount | input | number of LDELEM_I/U opcodes in callee IL
831 | FloatArrayLoadCount | input | number of LDELEM_R opcodes in callee IL
832 | RefArrayLoadCount | input | number of LDELEM_REF opcodes in callee IL
833 | StructArrayLoadCount | input | number of LDELEM opcodes in callee IL
834 | IntArrayStoreCount | input | number of STELEM_I/U opcodes in callee IL
835 | FloatArrayStoreCount | input | number of STELEM_R opcodes in callee IL
836 | RefArrayStoreCount | input | number of STELEM_REF opcodes in callee IL
837 | StructArrayStoreCount | input | number of STELEM opcodes in callee IL
838 | StructOperationCount | input | number of *OBJ and *BLK opcodes in callee IL
839 | ObjectModelCount | input | number of CASTCLASS/BOX/etc opcodes in callee IL
840 | FieldLoadCount | input | number of LDLEN/LDFLD/REFANY* in callee IL
841 | FieldStoreCount | input | number of STFLD in callee IL
842 | StaticFieldLoadCount | input | number of LDSFLD in callee IL
843 | StaticFieldStoreCount | input | number of STSFLD in callee IL
844 | LoadAddressCount | input | number of LDLOCA/LDARGA/LD*A in callee IL
845 | ThrowCount | input | number of THROW/RETHROW in callee IL
846 | ReturnCount | input | number of RET in callee IL (new in V13)
847 | CallCount | input | number of CALL*/NEW*/JMP in callee IL
848 | CallSiteWeight | estimate | numeric weight of call site
849 | IsForceInline | input | true if callee is force inline
850 | IsInstanceCtor | input | true if callee is an .ctor
851 | IsFromPromotableValueClass | input | true if callee is from promotable value class
852 | HasSimd | input | true if callee has simd args/locals
853 | LooksLikeWrapperMethod | input | true if callee simply wraps another call
854 | ArgFeedsConstantTest | input | number of times an arg reaches compare vs constant
855 | IsMostlyLoadStore | input | true if loadstore count is large fraction of instruction count
856 | ArgFeedsRangeCheck | input | number of times an arg reaches compare vs ldlen
857 | ConstantArgFeedsConstantTest | input | number of times a constant arg reaches a compare vs constant
858 | CalleeNativeSizeEstimate | estimate | LegacyPolicy's size estimate for callee (bytes * 10)
859 | CallsiteNativeSizeEstimate | estimate | LegacyPolicy's size estimate for "savings" from inlining (bytes * 10)
860 | ModelCodeSizeEstimate | estimate | ModelPolicy's size estimate (bytes * 10)
861 | ModelPerCallInstructionEstimate | estimate | ModelPolicy's speed estimate (inst retired per call to callee)
862 | IsClassCtor | input | True if callee is a .cctor (v13)
863 | IsSameThis | input | True if callee and root are both instances with same this pointer (v13)
864 | CallerHasNewArray | input | True if caller contains NEWARR operation (v13)
865 | CallerHasNewObj | input | True if caller contains NEWOBJ operation (v13)
866 | HotSizeDelta | output | Change in hot size because of this inline (bytes)
867 | ColdSizeDelta | output | Change in cold size because of this inline (bytes)
868 | JitTimeDelta | output | Change in jit time because of this inline (microseconds)
869 | InstRetiredDelta | output | Change in instructions retired because of this inline (millions)
870 | InstRetiredSD | estimate | Estimated standard deviation of InstRetiredDelta (millions)
871 | InstRetiredPct | output | Percent change in instructions retired
872 | CallDelta | output | Change in number of calls to the callee because of this inline
873 | InstRetiredPerCallDelta | output | InstRetiredDelta/CallDelta or 0 if CallDelta is 0
874 | RootCallCount | output | Number of calls to root method
875 | InstRetiredPerRootCallDelta | output | InstRetiredDelta/RootCallCount
876 | Confidence | meta | Bootstrap confidence that this inline caused instructions retired to change
877 |
878 | ## Options for Changing Inliner Behavior
879 |
880 | Build a release jit with -DINLINE_DATA=1. This enables the following COMPlus settings:
881 |
882 | Setting | Effect
883 | ----------------------|---------------
884 | JitInlineDumpData | dumps inline data
885 | JitInlineDumpXml | dumps inlines in xml format
886 | JitInlinePolicyReplay | enable replay from replay file
887 | JitInlineReplayFile | name of the replay file to read from
888 | JitInlinePolicyFull | enable FullPolicy heuristic
889 | JitInlinePolicyModel | enable ModelPolicy heuristic
890 | JitInlineLimit | enable K-limiting
891 | JitNoInlineRange | disable inlines in a subset of methods
892 |
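Within PerformanceExplorer these settings are passed through a `Configuration`'s
environment dictionary (see `Program.cs` below). A hedged example, assuming the
usual `COMPlus_` prefix used elsewhere in this repo:

```csharp
// Enable the ModelPolicy and dump inline data/xml for one run.
// Setting names come from the table above with the COMPlus_ prefix.
Configuration c = new Configuration("model-policy-with-data");
c.Environment["COMPlus_JitInlinePolicyModel"] = "1"; // use the ModelPolicy heuristic
c.Environment["COMPlus_JitInlineDumpData"] = "1";    // dump per-inline observation data
c.Environment["COMPlus_JitInlineDumpXml"] = "1";     // dump inline trees as xml
```
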
893 | ## List of Areas for Investigation
894 |
895 | - Improvements to Size data collection
896 | - Modify collection process to walk inline trees in various orders
897 | - Improvements to the Size models
898 | - Analyze cases where existing size model makes poor predictions
899 | - Look for correlated inputs
900 | - Look for inputs columns with zero variance and/or low variance,
901 | and either remove them or add cases to boost their relevance
902 | - See if there is a better way to account for the operations done by the inlinee
903 | - Improvements to the Speed data collection
904 | - Reduce noise levels (thread affinity, priority, more frequent sampling, etc)
905 | - Identify noisy runs and retry if warranted
906 | - Round-robin execution to "sample" benchmarks at different times
907 | - More iterations, more reruns
908 | - Eliminate noise entirely using instrumentation or a tool like PIN
909 | - Understand possible divergence between xunit-perf and regular runs
910 | - Get rid of need for instrumented build, use CLR profiler API instead
911 | - Get rid of split modelling where sometimes the program is run
912 | under the perf harness and other times it is run normally
913 | - Directly measure call site frequency rather than infer it
914 | - Modify collection process to walk inline trees in various orders
915 | - Generalize collection to work with any test program
916 | - Wider variety of measurements
917 | - Develop techniques to measure and attribute performance to
918 | inline ensembles to speed up collection
919 | - Improvements to the Speed model
920 | - Settle on proper figure of merit: Instructions or Instructions per XXX
921 | - Deal with potential heteroscedasticity
922 | - Improvements to the idealized heuristic
923 | - Randomized studies looking for good inlining patterns
924 | - Manual tuning to do likewise
925 | - Find way to code up oracular inliner and benchmark the heuristics that way
926 | - Improvements to the actual heuristic
927 | - If local call site weight estimate needed, find good way to create one
928 | - If root method importance estimate needed, find good way to create one
929 | - Build full-model classifiers that incorporate tradeoffs
930 | - Automate process of tuning parameters
931 | - Other
932 | - Impact of instrumented counts (IBC) on heuristics
933 | - Are different heuristics warranted for prejit and jit?
934 | - Investigate stability of models over time
935 | - Investigate variability of models for different OSs or ISAs
936 |
937 |
938 |
--------------------------------------------------------------------------------
/src/PerformanceExplorer/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using System.IO;
6 | using System.Xml.Serialization;
7 | using System.Text;
8 | using System.Xml.Linq;
9 |
10 | namespace PerformanceExplorer
11 | {
12 | // A Configuration describes the environment
13 | // used to perform a particular run.
14 | public class Configuration
15 | {
16 | public Configuration(string name)
17 | {
18 | Name = name;
19 | Environment = new Dictionary<string, string>();
20 | ResultsDirectory = Program.RESULTS_DIR;
21 |
22 | if (Program.DisableZap)
23 | {
24 | Environment["COMPlus_ZapDisable"] = "1";
25 | }
26 | }
27 |
28 | public string Name;
29 | public Dictionary<string, string> Environment;
30 | public string ResultsDirectory;
31 | }
32 |
33 | // PerformanceData describes performance
34 | // measurements for benchmark runs
35 | public class PerformanceData
36 | {
37 | public PerformanceData()
38 | {
39 | ExecutionTime = new SortedDictionary<string, List<double>>();
40 | InstructionCount = new SortedDictionary<string, List<double>>();
41 | id = ++idGen;
42 | }
43 |
44 | static int idGen = 0;
45 | int id; // per-instance id, assigned from idGen in the constructor
46 |
47 | // subPart -> list of data
48 | public SortedDictionary<string, List<double>> ExecutionTime;
49 | public SortedDictionary<string, List<double>> InstructionCount;
50 |
51 | public void Print(string configName)
52 | {
53 | foreach (string subBench in ExecutionTime.Keys)
54 | {
55 | Summarize(subBench, configName);
56 | }
57 | }
58 |
59 | public void Summarize(string subBench, string configName)
60 | {
61 | Console.Write("### [{0}] {1} perf for {2}", id, configName, subBench);
62 |
63 | if (ExecutionTime.ContainsKey(subBench))
64 | {
65 | Console.Write(" time {0:0.00} milliseconds (~{1:0.00}%)",
66 | Average(ExecutionTime[subBench]),
67 | PercentDeviation(ExecutionTime[subBench]));
68 | }
69 |
70 | if (InstructionCount.ContainsKey(subBench))
71 | {
72 | Console.Write(" instructions {0:0.00} million (~{1:0.00}%)",
73 | Average(InstructionCount[subBench]) / (1000 * 1000),
74 | PercentDeviation(InstructionCount[subBench]));
75 | }
76 |
77 | Console.WriteLine();
78 | }
79 |
80 | public static double Average(List<double> data)
81 | {
82 | if (data.Count() < 1)
83 | {
84 | return -1;
85 | }
86 |
87 | return data.Average();
88 | }
89 | public static double StdDeviation(List<double> data)
90 | {
91 | if (data.Count() < 2)
92 | {
93 | return 0;
94 | }
95 |
96 | double avg = Average(data);
97 | double sqError = 0;
98 | foreach (double d in data)
99 | {
100 | sqError += (avg - d) * (avg - d);
101 | }
102 | double estSD = Math.Sqrt(sqError / (data.Count() - 1));
103 | return estSD;
104 | }
105 | public static double PercentDeviation(List<double> data)
106 | {
107 | return 100.0 * StdDeviation(data) / Average(data);
108 | }
109 |
110 | // Use bootstrap to assess whether the difference in
111 | // means of the two data sets is significant.
112 | // Return value is a confidence level between 0 and 1.
113 | public static double Confidence(List<double> data1, List<double> data2)
114 | {
115 | int kb = data1.Count();
116 | int kd = data2.Count();
117 |
118 | double d1ave = Average(data1);
119 | double d2ave = Average(data2);
120 | double basestat = Math.Abs(d1ave - d2ave);
121 |
122 | // perform a bootstrap test to estimate the one-sided
123 | // confidence that this diff could be significant.
124 |
125 | List<double> mergedData = new List<double>(kb + kd);
126 | mergedData.AddRange(data1);
127 | mergedData.AddRange(data2);
128 |
129 | double confidence = Bootstrap(basestat, kb, kd, mergedData);
130 |
131 | return confidence;
132 | }
133 |
134 | // Use bootstrap to produce a confidence value for the hypothesis that the
135 | // difference shown in basestat is significant.
136 | // k1 and k2 are the sizes of the two sample populations
137 | // data is the combined set of observations.
138 | static double Bootstrap(double basestat, int k1, int k2, List<double> data)
139 | {
140 | double obs = 0;
141 | Random r = new Random(RandomSeed);
142 |
143 | for (int i = 0; i < NumberOfBootstrapTrials; i++)
144 | {
145 | List<double> z1 = Sample(data, k1, r);
146 | List<double> z2 = Sample(data, k2, r);
147 |
148 | double z1average = Average(z1);
149 | double z2average = Average(z2);
150 |
151 | double zmedian = Math.Abs(z1average - z2average);
152 |
153 | if (zmedian < basestat)
154 | {
155 | obs++;
156 | }
157 | }
158 |
159 | return obs / NumberOfBootstrapTrials;
160 | }
161 |
162 | // Return a random sample (with replacement) of size n from the array data
163 | static List<double> Sample(List<double> data, int n, Random r)
164 | {
165 | int l = data.Count;
166 | List<double> x = new List<double>(n);
167 | for (int i = 0; i < n; i++)
168 | {
169 | int j = r.Next(0, l);
170 | x.Add(data[j]);
171 | }
172 |
173 | return x;
174 | }
175 |
176 | // Use fixed random seed so that we don't see the bootstrap p-values
177 | // wander from invocation to invocation.
178 | const int RandomSeed = 77;
179 |
180 | // The bootstrap test works by taking a number of random samples
181 | // and computing how frequently the samples exhibit the same
182 | // statistic as the observed statistic. NumberOfBootstrapTrials determines the
183 | // number of bootstrap trials to run. A higher value is better
184 | // but takes longer.
185 | const int NumberOfBootstrapTrials = 1000;
186 | }
187 |
188 | // Information that identifies a method
189 | public struct MethodId : IEquatable<MethodId>
190 | {
191 | public override bool Equals(object obj)
192 | {
193 | return (obj is MethodId) && Equals((MethodId)obj);
194 | }
195 | public bool Equals(MethodId other)
196 | {
197 | return (this.Token == other.Token && this.Hash == other.Hash);
198 | }
199 | public override string ToString()
200 | {
201 | return String.Format("{0:X8}-{1:X8}", Token, Hash);
202 | }
203 |
204 | public override int GetHashCode()
205 | {
206 | int hash = 23;
207 | hash = hash * 31 + (int)Token;
208 | hash = hash * 31 + (int)Hash;
209 | return hash;
210 | }
211 |
212 | public uint Token;
213 | public uint Hash;
214 | }
215 |
216 | // A method seen either in jitting or inlining
217 | public class Method
218 | {
219 | public Method()
220 | {
221 | Callers = new List<Method>();
222 | Callees = new List<Method>();
223 | }
224 |
225 | public MethodId getId()
226 | {
227 | MethodId id = new MethodId();
228 | id.Token = Token;
229 | id.Hash = Hash;
230 | return id;
231 | }
232 |
233 | public static int HasMoreInlines(Method x, Method y)
234 | {
235 | return (int) y.InlineCount - (int) x.InlineCount;
236 | }
237 |
238 | public static int HasMoreCalls(Method x, Method y)
239 | {
240 | if (x.CallCount > y.CallCount)
241 | {
242 | return -1;
243 | }
244 | else if (x.CallCount < y.CallCount)
245 | {
246 | return 1;
247 | }
248 | else
249 | {
250 | return 0;
251 | }
252 | }
253 |
254 | public double NumSubtrees()
255 | {
256 | double result = 1;
257 | foreach (Inline i in Inlines)
258 | {
259 | result *= (1 + i.NumSubtrees());
260 | }
261 | return result;
262 | }
263 |
264 | public void Dump()
265 | {
266 | Console.WriteLine("Inlines into {0} {1:X8}", Name, Token);
267 | foreach (Inline x in Inlines)
268 | {
269 | x.Dump(2);
270 | }
271 | }
272 |
273 | public Inline[] GetBfsSubtree(int k, out Inline lastInline)
274 | {
275 | List<Inline> l = new List<Inline>(k);
276 | Queue<Inline> q = new Queue<Inline>();
277 | lastInline = null;
278 |
279 | foreach (Inline i in Inlines)
280 | {
281 | q.Enqueue(i);
282 | }
283 |
284 | // BFS until we've enumerated the first k.
285 | while (q.Count() > 0)
286 | {
287 | Inline i = q.Dequeue();
288 | l.Add(i);
289 |
290 | if (l.Count() == k)
291 | {
292 | lastInline = i;
293 | break;
294 | }
295 |
296 | foreach (Inline ii in i.Inlines)
297 | {
298 | q.Enqueue(ii);
299 | }
300 | }
301 |
302 | // DFS to copy the subtree, with the list telling us
303 | // what to include.
304 | return GetDfsSubtree(Inlines, l, lastInline);
305 | }
306 |
307 | Inline[] GetDfsSubtree(Inline[] inlines, List<Inline> filter, Inline lastInline)
308 | {
309 | List<Inline> newInlines = new List<Inline>();
310 | foreach (Inline x in inlines)
311 | {
312 | if (filter.Contains(x))
313 | {
314 | Inline xn = x.ShallowCopy();
315 | // Flag the last inline so the jit can collect
316 | // data for it during replay.
317 | if (x == lastInline)
318 | {
319 | xn.CollectData = 1;
320 | }
321 | newInlines.Add(xn);
322 | xn.Inlines = GetDfsSubtree(x.Inlines, filter, lastInline);
323 | }
324 | }
325 |
326 | return newInlines.ToArray();
327 | }
328 |
329 | public Method ShallowCopy()
330 | {
331 | Method r = new Method();
332 | r.Token = Token;
333 | r.Hash = Hash;
334 | r.Name = Name;
335 | r.Inlines = new Inline[0];
336 | return r;
337 | }
338 |
339 | public uint Token;
340 | public uint Hash;
341 | public string Name;
342 | public uint InlineCount;
343 | public uint HotSize;
344 | public uint ColdSize;
345 | public uint JitTime;
346 | public uint SizeEstimate;
347 | public uint TimeEstimate;
348 | public Inline[] Inlines;
349 | public ulong CallCount;
350 | public void MarkAsDuplicate() { IsDuplicate = true; }
351 | public bool CheckIsDuplicate() { return IsDuplicate; }
352 | private bool IsDuplicate;
353 |
354 | public List<Method> Callers;
355 | public List<Method> Callees;
356 | }
357 |
358 | // The jit-visible call graph
359 | public class CallGraph
360 | {
361 | public CallGraph(Results fullResults)
362 | {
363 | Map = fullResults.Methods;
364 | Nodes = new HashSet<Method>();
365 | Roots = new HashSet<Method>();
366 | Leaves = new HashSet<Method>();
367 | Build();
368 | }
369 | public void Build()
370 | {
371 | // Populate per-method caller and callee lists
372 | // Drive via IDs to consolidate dups
373 | foreach(MethodId callerId in Map.Keys)
374 | {
375 | Method caller = Map[callerId];
376 |
377 | foreach (Inline i in caller.Inlines)
378 | {
379 | MethodId calleeId = i.GetMethodId();
380 |
381 | // Not sure why it wouldn't....
382 | if (Map.ContainsKey(calleeId))
383 | {
384 | Method callee = Map[calleeId];
385 |
386 | caller.Callees.Add(callee);
387 | callee.Callers.Add(caller);
388 | }
389 | }
390 | }
391 |
392 | foreach (MethodId methodId in Map.Keys)
393 | {
394 | Method method = Map[methodId];
395 |
396 | Nodes.Add(method);
397 |
398 | // Methods with no callers are roots.
399 | if (method.Callers.Count == 0)
400 | {
401 | Roots.Add(method);
402 | }
403 |
404 | // Methods with no callees are leaves.
405 | if (method.Callees.Count == 0)
406 | {
407 | Leaves.Add(method);
408 | }
409 | }
410 | }
411 |
412 | public Dictionary<MethodId, Method> Map;
413 | public HashSet<Method> Nodes;
414 | public HashSet<Method> Roots;
415 | public HashSet<Method> Leaves;
416 |
417 | public void DumpDot(string file)
418 | {
419 | using (StreamWriter outFile = File.CreateText(file))
420 | {
421 | outFile.WriteLine("digraph CallGraph {");
422 | foreach (Method m in Nodes)
423 | {
424 | outFile.WriteLine("\"{0:X8}-{1:X8}\" [ label=\"{2}\"];", m.Token, m.Hash, m.Name);
425 |
426 | foreach (Method p in m.Callees)
427 | {
428 | outFile.WriteLine("\"{0:X8}-{1:X8}\" -> \"{2:X8}-{3:X8}\";",
429 | m.Token, m.Hash, p.Token, p.Hash);
430 | }
431 | }
432 | outFile.WriteLine("}");
433 | }
434 | }
435 | }
436 |
437 | // A node in an inline tree.
438 | public class Inline
439 | {
440 | public double NumSubtrees()
441 | {
442 | double result = 1;
443 | foreach (Inline i in Inlines)
444 | {
445 | result *= (1 + i.NumSubtrees());
446 | }
447 | return result;
448 | }
449 |
450 | public uint Token;
451 | public uint Hash;
452 | public uint Offset;
453 | public uint CollectData;
454 | public string Reason;
455 | public string Data;
456 | public Inline[] Inlines;
457 |
458 | public Inline ShallowCopy()
459 | {
460 | Inline x = new Inline();
461 | x.Token = Token;
462 | x.Hash = Hash;
463 | x.Offset = Offset;
464 | x.Reason = Reason;
465 | x.CollectData = 0;
466 | x.Inlines = new Inline[0];
467 | return x;
468 | }
469 |
470 | public MethodId GetMethodId()
471 | {
472 | MethodId id = new MethodId();
473 | id.Token = Token;
474 | id.Hash = Hash;
475 | return id;
476 | }
477 | public void Dump(int indent)
478 | {
479 | for (int i = 0; i < indent; i++) Console.Write(" ");
480 | Console.WriteLine("{0:X8} {1}", Token, Reason);
481 | foreach (Inline x in Inlines)
482 | {
483 | x.Dump(indent + 2);
484 | }
485 | }
486 | }
487 |
488 | // InlineForest describes the inline forest used for the run.
489 | public class InlineForest
490 | {
491 | public string Policy;
492 | public string DataSchema;
493 | public Method[] Methods;
494 | }
495 |
496 | // The benchmark of interest
497 | public class Benchmark
498 | {
499 | public string ShortName;
500 | public string FullPath;
501 | public int ExitCode;
502 | }
503 |
504 | // The results of running a benchmark
505 | public class Results
506 | {
507 | public Results()
508 | {
509 | Performance = new PerformanceData();
510 | }
511 |
512 | public int ExitCode;
513 | public string LogFile;
514 | public bool Success;
515 | public InlineForest InlineForest;
516 | public Dictionary<MethodId, Method> Methods;
517 | public PerformanceData Performance;
518 | public string Name;
519 | }
520 |
521 | public class InlineDelta : IComparable<InlineDelta>
522 | {
523 | public Method rootMethod;
524 | public MethodId inlineMethodId;
525 | public double pctDelta;
526 | public int index;
527 | public string subBench;
528 | public double confidence;
529 | public double instructionsDelta;
530 | public double callsDelta;
531 | public bool hasPerCallDelta;
532 | public double perCallDelta;
533 | public int CompareTo(InlineDelta other)
534 | {
535 | return -Math.Abs(pctDelta).CompareTo(Math.Abs(other.pctDelta));
536 | }
537 | }
538 |
539 | public class Exploration : IComparable<Exploration>
540 | {
541 | public Results baseResults;
542 | public Results endResults;
543 | public Benchmark benchmark;
544 |
545 | // Consider benchmarks with fewer roots as better
546 | // candidates for exploration.
547 | public int CompareTo(Exploration other)
548 | {
549 | return endResults.Methods.Count() - other.endResults.Methods.Count();
550 | }
551 |
552 | public void Explore(StreamWriter combinedDataFile, ref bool combinedHasHeader, Dictionary<uint, ulong> blacklist)
553 | {
554 | Console.WriteLine("$$$ Exploring significant perf diff in {0} between {1} and {2}",
555 | benchmark.ShortName, baseResults.Name, endResults.Name);
556 |
557 | // Summary of performance results
558 | List<InlineDelta> deltas = new List<InlineDelta>();
559 |
560 | // Fully detailed result trees with performance data
561 | Dictionary<MethodId, Results[]> recapturedData = new Dictionary<MethodId, Results[]>();
562 |
563 | // Similar but for call count reductions....
564 | Dictionary<MethodId, double[]> recapturedCC = new Dictionary<MethodId, double[]>();
565 |
566 | // Count methods in end results with inlines, and total subtree size.
567 | int candidateCount = 0;
568 | int exploreCount = 0;
569 | foreach (Method m in endResults.Methods.Values)
570 | {
571 | int endCount = (int)m.InlineCount;
572 | if (endCount > 0)
573 | {
574 | candidateCount++;
575 | exploreCount += endCount;
576 | }
577 | }
578 |
579 | Console.WriteLine("$$$ Examining {0} methods, {1} inline combinations", candidateCount, exploreCount);
580 | if (blacklist != null)
581 | {
582 | Console.WriteLine("$$$ blacklist in use: {0} entries", blacklist.Count);
583 | }
584 |
585 | // Todo: order methods by call count. Find top N% of these. Determine callers (and up the tree)
586 | // Explore from there.
587 |
588 | // Explore each method with inlines. Arbitrarily bail after some number of explorations.
589 | int methodsExplored = 0;
590 | int inlinesExplored = 0;
591 | int perMethodExplorationLimit = 50;
592 | int perBenchmarkExplorationLimit = 1000;
593 | List<Method> methodsToExplore = new List<Method>(endResults.Methods.Values);
594 | methodsToExplore.Sort(Method.HasMoreCalls);
595 |
596 | foreach (Method rootMethod in methodsToExplore)
597 | {
598 | Console.WriteLine("$$$ InlinesExplored {0} MethodsExplored {1}", inlinesExplored, methodsExplored);
599 | Console.WriteLine("$$$ Exploring inlines for {0}", rootMethod.Name);
600 |
601 | // Only explore methods that had inlines
602 | int endCount = (int) rootMethod.InlineCount;
603 |
604 | if (endCount == 0)
605 | {
606 | Console.WriteLine("$$$ Skipping -- no inlines");
607 | continue;
608 | }
609 |
610 | // Optionally just explore some particular root
611 | if ((Program.RootToken != null) && rootMethod.Token != Program.RootTokenValue)
612 | {
613 | Console.WriteLine("$$$ Skipping -- does not match specified root token {0}", Program.RootToken);
614 | continue;
615 | }
616 |
617 | // Don't bother exploring main since it won't be invoked via xperf
618 | // and so any apparent call count reductions from main will be misleading.
619 | if (rootMethod.Name.Equals("Main"))
620 | {
621 | Console.WriteLine("$$$ Skipping -- not driven by xunit-perf");
622 | continue;
623 | }
624 |
625 | // Only explore methods that were called in the noinline run
626 | if (rootMethod.CallCount == 0)
627 | {
628 | Console.WriteLine("$$$ Skipping -- not called");
629 | continue;
630 | }
631 |
632 | // Don't re-explore a method on the blacklist, unless we see significantly more calls to it than
633 | // we have ever seen before. This short-circuits exploration for common startup code and the like,
634 | // if we disable zap.
635 | if (blacklist != null)
636 | {
637 | if (blacklist.ContainsKey(rootMethod.Hash))
638 | {
639 | Console.WriteLine("$$$ method is on the blacklist");
640 | ulong oldCallCount = blacklist[rootMethod.Hash];
641 |
642 | if (rootMethod.CallCount <= 2 * oldCallCount)
643 | {
644 | Console.WriteLine("$$$ Skipping -- already explored this method with {0} calls, now seeing it with {1}",
645 | oldCallCount, rootMethod.CallCount);
646 | continue;
647 | }
648 | else
649 | {
650 | Console.WriteLine("$$$ will re-explore this method, previous had {0} calls, now seeing it with {1}",
651 | oldCallCount, rootMethod.CallCount);
652 | }
653 | }
654 | else
655 | {
656 | Console.WriteLine("$$$ method not on blacklist");
657 | }
658 | }
659 |
660 | // Limit volume of exploration
661 | if (inlinesExplored >= perBenchmarkExplorationLimit)
662 | {
663 | Console.WriteLine("$$$ Reached benchmark limit of {0} explored inlines, moving on to next benchmark",
664 | perBenchmarkExplorationLimit);
665 | break;
666 | }
667 |
668 | if (endCount > perMethodExplorationLimit)
669 | {
670 | int newEndCount = perMethodExplorationLimit;
671 | Console.WriteLine("$$$ Limiting exploration for this root to {0} inlines out of {1}", newEndCount, endCount);
672 | endCount = newEndCount;
673 | }
674 |
675 | // Trim exploration here if full explore would put us over the limit
676 | if (inlinesExplored + endCount >= perBenchmarkExplorationLimit)
677 | {
678 | int newEndCount = perBenchmarkExplorationLimit - inlinesExplored;
679 | Console.WriteLine("$$$ Might hit limit of {0} inlines explored, trimming end count from {1} to {2}",
680 | perBenchmarkExplorationLimit, endCount, newEndCount);
681 | endCount = newEndCount;
682 | }
683 |
684 | // Add method to the blacklist, if we're keeping one.
685 | if (blacklist != null)
686 | {
687 | Console.WriteLine("$$$ adding {0} to blacklist with {1} calls", rootMethod.Name, rootMethod.CallCount);
688 | blacklist[rootMethod.Hash] = rootMethod.CallCount;
689 | }
690 |
691 | // Noinline perf is already "known" from the baseline, so exclude that here.
692 | //
693 | // The maximal subtree perf may not equal the end perf because the latter allows inlines
694 | // in all methods, and we're just inlining into one method at a time here.
695 | Console.WriteLine("$$$ [{0}] examining method {1} {2:X8} with {3} inlines and {4} permutations via BFS.",
696 | methodsExplored++, rootMethod.Name, rootMethod.Token, endCount, rootMethod.NumSubtrees() - 1);
697 | rootMethod.Dump();
698 |
699 | // Now for the actual experiment. We're going to grow the method's inline tree from the
700 | // baseline tree (which is noinline) towards the end result tree. For sufficiently large trees
701 | // there are lots of intermediate subtrees. For now we just do a simple breadth-first linear
702 | // exploration.
703 | //
704 | // However, we'll measure the full tree first. If there's no significant diff between it
705 | // and the noinline tree, then we won't bother enumerating and measuring the remaining subtrees.
706 | Results[] explorationResults = new Results[endCount + 1];
707 | explorationResults[0] = baseResults;
708 |
709 | // After we measure perf via xunit-perf, do a normal run to recapture inline observations.
710 | // We could enable observations in the perf run, but we'd get inline xml for all the xunit
711 | // scaffolding too. This way we get something minimal.
712 | Results[] recaptureResults = new Results[endCount + 1];
713 | recaptureResults[0] = baseResults;
714 |
715 | // Call count reduction at each step of the tree expansion
716 | double[] ccDeltas = new double[endCount + 1];
717 |
718 | // We take advantage of the fact that for replay Xml, the default is to not inline.
719 | // So we only need to emit Xml for the methods we want to inline. Since we're only
720 | // inlining into one method, our forest just has one Method entry.
721 | InlineForest kForest = new InlineForest();
722 | kForest.Policy = "ReplayPolicy";
723 | kForest.Methods = new Method[1];
724 | kForest.Methods[0] = rootMethod.ShallowCopy();
725 |
726 | // Always explore methods with one or two possible inlines, since checking to see if the
727 | // exploration is worthwhile costs just as much as doing the exploration.
728 | //
729 | // If there are more than two inlines, then jump to the end to see if any of them matter.
730 | // If not then don't bother exploring the intermediate states.
731 | //
732 | // This might bias the exploration into visiting more good cases than "normal".
733 | if (rootMethod.InlineCount > 2)
734 | {
735 | // See if any inline in the tree has a perf impact. If not, don't bother exploring.
736 | ulong dontcare = 0;
737 | ExploreSubtree(kForest, endCount, rootMethod, benchmark, explorationResults, null, null, out dontcare);
738 | bool shouldExplore = CheckResults(explorationResults, endCount, 0);
739 |
740 | if (!shouldExplore)
741 | {
742 | Console.WriteLine("$$$ Full subtree perf NOT significant, skipping...");
743 | continue;
744 | }
745 | else
746 | {
747 | Console.WriteLine("$$$ Full subtree perf significant, exploring...");
748 | }
749 | }
750 | else
751 | {
752 | Console.WriteLine("$$$ Single/Double inline, exploring...");
753 | }
754 |
755 | // Keep track of the current call count for each method.
756 | // Initial value is the base model's count.
757 | Dictionary<MethodId, ulong> callCounts = new Dictionary<MethodId, ulong>(baseResults.Methods.Count());
758 | foreach (MethodId id in baseResults.Methods.Keys)
759 | {
760 | callCounts[id] = baseResults.Methods[id].CallCount;
761 | }
762 |
763 | // TODO: Every so often, rerun the noinline baseline, and see if we have baseline shift.
764 |
765 | ccDeltas[0] = 0;
766 |
767 | for (int k = 1; k <= endCount; k++)
768 | {
769 | inlinesExplored++;
770 | ulong ccDelta = 0;
771 | Inline lastInlineK =
772 | ExploreSubtree(kForest, k, rootMethod, benchmark, explorationResults, recaptureResults, callCounts, out ccDelta);
773 | ShowResults(explorationResults, k, k - 1, rootMethod, lastInlineK, deltas, ccDelta);
774 | ccDeltas[k] = ccDelta;
775 | }
776 |
777 | // Save off results for later processing.
778 | recapturedData[rootMethod.getId()] = recaptureResults;
779 | recapturedCC[rootMethod.getId()] = ccDeltas;
780 | }
781 |
782 | // Sort deltas and display
783 | deltas.Sort();
784 | Console.WriteLine("$$$ --- {0}: inlines in order of impact ---", endResults.Name);
785 | foreach (InlineDelta dd in deltas)
786 | {
787 | string currentMethodName = null;
788 | if (baseResults.Methods != null && baseResults.Methods.ContainsKey(dd.inlineMethodId))
789 | {
790 | currentMethodName = baseResults.Methods[dd.inlineMethodId].Name;
791 | }
792 | else
793 | {
794 | currentMethodName = dd.inlineMethodId.ToString();
795 | }
796 |
797 | Console.Write("$$$ --- [{0,2:D2}] {1,12} -> {2,-12} {3,6:0.00}%",
798 | dd.index, dd.rootMethod.Name, currentMethodName, dd.pctDelta);
799 | if (dd.hasPerCallDelta)
800 | {
801 | Console.Write(" {0,10:0.00} pc", dd.perCallDelta);
802 | }
803 | Console.WriteLine();
804 | }
805 |
806 | // Build integrated data model...
807 | string dataModelName = String.Format("{0}-{1}-data-model.csv", benchmark.ShortName, endResults.Name);
808 | string dataModelFileName = Path.Combine(Program.RESULTS_DIR, dataModelName);
809 | bool hasHeader = false;
810 | char[] comma = new char[] { ',' };
811 | using (StreamWriter dataModelFile = File.CreateText(dataModelFileName))
812 | {
813 | foreach (MethodId methodId in recapturedData.Keys)
814 | {
815 | Results[] resultsSet = recapturedData[methodId];
816 | double[] ccDeltas = recapturedCC[methodId];
817 |
818 | // resultsSet[0] is the noinline run. We don't have an entry
819 | // for it, but key column values are spilled into the inline Xml and
820 | // so deserialized into method entries.
821 | if (!baseResults.Methods.ContainsKey(methodId))
822 | {
823 | Console.WriteLine("!!! Odd -- no base data for root {0}", methodId);
824 | continue;
825 | }
826 |
827 | int baseMethodHotSize = (int) baseResults.Methods[methodId].HotSize;
828 | int baseMethodColdSize = (int) baseResults.Methods[methodId].ColdSize;
829 | int baseMethodJitTime = (int) baseResults.Methods[methodId].JitTime;
830 | ulong baseMethodCallCount = baseResults.Methods[methodId].CallCount;
831 |
832 | for (int i = 1; i < resultsSet.Length; i++)
833 | {
834 | Results rK = resultsSet[i];
835 | Results rKm1 = resultsSet[i - 1];
836 |
837 | if (rK == null || rKm1 == null)
838 | {
839 | continue;
840 | }
841 |
842 | // Load up the recapture xml
843 | XElement root = XElement.Load(rK.LogFile);
844 |
845 | // Look for the embedded inliner observation schema
846 | IEnumerable<XElement> schemas = from el in root.Descendants("DataSchema") select el;
847 | XElement schema = schemas.First();
848 | string schemaString = (string)schema;
849 | // Add on the performance data column headers
850 | string extendedSchemaString =
851 | "Benchmark,SubBenchmark," +
852 | schemaString +
853 | ",HotSizeDelta,ColdSizeDelta,JitTimeDelta,InstRetiredDelta,InstRetired,InstRetiredSD" +
854 | ",CallDelta,InstRetiredPerCallDelta,RootCallCount,InstRetiredPerRootCallDelta,Confidence";
855 |
856 | // If we haven't yet emitted a local header, do so now.
857 | if (!hasHeader)
858 | {
859 | dataModelFile.WriteLine(extendedSchemaString);
860 | hasHeader = true;
861 | }
862 |
863 | // Similarly for the combined data file
864 | if (!combinedHasHeader)
865 | {
866 | combinedDataFile.WriteLine(extendedSchemaString);
867 | combinedHasHeader = true;
868 | }
869 |
870 | // Figure out relative position of a few key columns
871 | string[] columnNames = schemaString.Split(comma);
872 | int hotSizeIndex = -1;
873 | int coldSizeIndex = -1;
874 | int jitTimeIndex = -1;
875 | int index = 0;
876 | foreach (string s in columnNames)
877 | {
878 | switch (s)
879 | {
880 | case "HotSize":
881 | hotSizeIndex = index;
882 | break;
883 | case "ColdSize":
884 | coldSizeIndex = index;
885 | break;
886 | case "JitTime":
887 | jitTimeIndex = index;
888 | break;
889 | }
890 |
891 | index++;
892 | }
893 |
894 | // Find the embedded inline observation data
895 | IEnumerable<XElement> data = from el in root.Descendants("Data") select el;
896 | string dataString = (string)data.First();
897 | string[] dataStringX = dataString.Split(comma);
898 |
899 | // Split out the observations that we need for extended info.
900 | int currentMethodHotSize = hotSizeIndex >= 0 ? Int32.Parse(dataStringX[hotSizeIndex]) : 0;
901 | int currentMethodColdSize = coldSizeIndex >= 0 ? Int32.Parse(dataStringX[coldSizeIndex]) : 0;
902 | int currentMethodJitTime = jitTimeIndex >= 0 ? Int32.Parse(dataStringX[jitTimeIndex]) : 0;
903 | double currentCCDelta = ccDeltas[i];
904 |
905 | // How to handle data from multi-part benchmarks?
906 | // Aggregate it here, iteration-wise
907 | int subParts = rK.Performance.InstructionCount.Keys.Count;
908 | List<double> arKData = null;
909 | List<double> arKm1Data = null;
910 | foreach (string subBench in rK.Performance.InstructionCount.Keys)
911 | {
912 | if (!rK.Performance.InstructionCount.ContainsKey(subBench))
913 | {
914 | Console.WriteLine("!!! Odd -- no data for root {0} on {1} at index {2}",
915 | methodId, subBench, i);
916 | break;
917 | }
918 |
919 | if (!rKm1.Performance.InstructionCount.ContainsKey(subBench))
920 | {
921 | Console.WriteLine("!!! Odd -- no data for root {0} on {1} at index {2}",
922 | methodId, subBench, i - 1);
923 | break;
924 | }
925 |
926 | List<double> rKData = rK.Performance.InstructionCount[subBench];
927 | List<double> rKm1Data = rKm1.Performance.InstructionCount[subBench];
928 |
929 | if (arKData == null)
930 | {
931 | // Occasionally we'll lose xunit perf data, for reasons unknown
932 | if (rKData.Count != rKm1Data.Count)
933 | {
934 | Console.WriteLine("!!! Odd -- mismatched data for root {0} on {1} at index {2}",
935 | methodId, subBench, i);
936 | break;
937 | }
938 |
939 | // Copy first sub bench's data
940 | arKData = new List<double>(rKData);
941 | arKm1Data = new List<double>(rKm1Data);
942 | }
943 | else
944 | {
945 | // Accumulate remainder
946 | for (int ii = 0; ii < arKData.Count; ii++)
947 | {
948 | arKData[ii] += rKData[ii];
949 | arKm1Data[ii] += rKm1Data[ii];
950 | }
951 | }
952 | }
953 |
954 | if (arKData == null)
955 | {
956 | Console.WriteLine("!!! bailing out on index {0}", i);
957 | continue;
958 | }
959 |
960 | double confidence = PerformanceData.Confidence(arKData, arKm1Data);
961 | double arKAvg = PerformanceData.Average(arKData);
962 | double arKm1Avg = PerformanceData.Average(arKm1Data);
963 | double arKSD = PerformanceData.StdDeviation(arKData);
964 | double change = arKAvg - arKm1Avg;
965 | // Number of instructions saved per call to the current inlinee
966 | double perCallDelta = (currentCCDelta == 0) ? 0 : change / currentCCDelta;
967 | // Number of instructions saved per call to the root method
968 | double perRootDelta = (baseMethodCallCount == 0) ? 0 : change / baseMethodCallCount;
969 |
970 | int hotSizeDelta = currentMethodHotSize - baseMethodHotSize;
971 | int coldSizeDelta = currentMethodColdSize - baseMethodColdSize;
972 | int jitTimeDelta = currentMethodJitTime - baseMethodJitTime;
973 | int oneMillion = 1000 * 1000;
974 |
975 | dataModelFile.WriteLine("{0},{1},{2},{3},{4},{5},{6:0.00},{7:0.00},{8:0.00},{9:0.00},{10:0.00},{11:0.00},{12:0.00},{13:0.00}",
976 | benchmark.ShortName, "agg",
977 | dataString,
978 | hotSizeDelta, coldSizeDelta, jitTimeDelta,
979 | change / oneMillion, arKAvg / oneMillion, arKSD/ oneMillion, currentCCDelta, perCallDelta,
980 | baseMethodCallCount, perRootDelta, confidence);
981 |
982 | combinedDataFile.WriteLine("{0},{1},{2},{3},{4},{5},{6:0.00},{7:0.00},{8:0.00},{9:0.00},{10:0.00},{11:0.00},{12:0.00},{13:0.00}",
983 | benchmark.ShortName, "agg",
984 | dataString,
985 | hotSizeDelta, coldSizeDelta, jitTimeDelta,
986 | change / oneMillion, arKAvg / oneMillion, arKSD / oneMillion, currentCCDelta, perCallDelta,
987 | baseMethodCallCount, perRootDelta, confidence);
988 |
989 | baseMethodHotSize = currentMethodHotSize;
990 | baseMethodColdSize = currentMethodColdSize;
991 | baseMethodJitTime = currentMethodJitTime;
992 | }
993 | }
994 | }
995 | }
996 |
997 | Inline ExploreSubtree(InlineForest kForest, int k, Method rootMethod,
998 | Benchmark benchmark, Results[] explorationResults, Results[] recaptureResults,
999 | Dictionary<MethodId, ulong> callCounts, out ulong ccDelta)
1000 | {
1001 | ccDelta = 0;
1002 |
1003 | // Build inline subtree for method with first K nodes and swap it into the tree.
1004 | int index = 0;
1005 | Inline currentInline = null;
1006 | Inline[] mkInlines = rootMethod.GetBfsSubtree(k, out currentInline);
1007 |
1008 | if (mkInlines == null)
1009 | {
1010 | Console.WriteLine("$$$ {0} [{1}] Can't get this inline subtree yet, sorry", rootMethod.Name, k);
1011 | return null;
1012 | }
1013 |
1014 | kForest.Methods[index].Inlines = mkInlines;
1015 | kForest.Methods[index].InlineCount = (uint) k;
1016 |
1017 | // Externalize the inline xml
1018 | XmlSerializer xo = new XmlSerializer(typeof(InlineForest));
1019 | string testName = String.Format("{0}-{1}-{2:X8}-{3}", benchmark.ShortName, endResults.Name, rootMethod.Token, k);
1020 | string xmlName = testName + ".xml";
1021 | string resultsDir = Program.RESULTS_DIR;
1022 | string replayFileName = Path.Combine(resultsDir, xmlName);
1023 | using (Stream xmlOutFile = new FileStream(replayFileName, FileMode.Create))
1024 | {
1025 | xo.Serialize(xmlOutFile, kForest);
1026 | }
1027 |
1028 | // Run the test and record the perf results.
1029 | XunitPerfRunner x = new XunitPerfRunner();
1030 | Configuration c = new Configuration(testName);
1031 | c.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1032 | c.Environment["COMPlus_JitInlineReplayFile"] = replayFileName;
1033 | Results resultsK = x.RunBenchmark(benchmark, c);
1034 | explorationResults[k] = resultsK;
1035 |
1036 | if (recaptureResults != null)
1037 | {
1038 | // Run test and recapture the inline XML along with observational data about the last inline
1039 | string retestName = String.Format("{0}-{1}-{2:X8}-{3}-data", benchmark.ShortName, endResults.Name, rootMethod.Token, k);
1040 | Configuration cr = new Configuration(retestName);
1041 | CoreClrRunner clr = new CoreClrRunner();
1042 | cr.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1043 | cr.Environment["COMPlus_JitInlineReplayFile"] = replayFileName;
1044 | // Ask for "minimal" replay XML here
1045 | cr.Environment["COMPlus_JitInlineDumpXml"] = "2";
1046 | cr.Environment["COMPlus_JitInlineDumpData"] = "1";
1047 | Results resultsClr = clr.RunBenchmark(benchmark, cr);
1048 | // Snag performance data from above
1049 | resultsClr.Performance = resultsK.Performance;
1050 | recaptureResults[k] = resultsClr;
1051 | }
1052 |
1053 | // Run and capture method call counts
1054 | //
1055 | // Note if we've really done a pure isolation experiment then there should be at most
1056 | // one method whose call count changes. Might be interesting to try and verify this!
1057 | // (would require zap disable or similar so we get call counts for all methods)
1058 | if (Program.CaptureCallCounts && callCounts != null)
1059 | {
1060 | string callCountName = String.Format("{0}-{1}-{2:X8}-{3}-cc", benchmark.ShortName, endResults.Name, rootMethod.Token, k);
1061 | Configuration cc = new Configuration(callCountName);
1062 | CoreClrRunner clr = new CoreClrRunner();
1063 | cc.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1064 | cc.Environment["COMPlus_JitInlineReplayFile"] = replayFileName;
1065 | // Ask for method entry instrumentation
1066 | cc.Environment["COMPlus_JitMeasureEntryCounts"] = "1";
1067 | Results resultsCC = clr.RunBenchmark(benchmark, cc);
1068 |
1069 | MethodId currentId = currentInline.GetMethodId();
1070 | bool foundcc = false;
1071 | // Parse results back and find call count for the current inline.
1072 | using (StreamReader callCountStream = File.OpenText(resultsCC.LogFile))
1073 | {
1074 | string callCountLine = callCountStream.ReadLine();
1075 | while (callCountLine != null)
1076 | {
1077 | string[] callCountFields = callCountLine.Split(new char[] { ',' });
1078 | if (callCountFields.Length == 3)
1079 | {
1080 | uint token = UInt32.Parse(callCountFields[0], System.Globalization.NumberStyles.HexNumber);
1081 | uint hash = UInt32.Parse(callCountFields[1], System.Globalization.NumberStyles.HexNumber);
1082 | ulong count = UInt64.Parse(callCountFields[2]);
1083 |
1084 | if (token == currentId.Token && hash == currentId.Hash)
1085 | {
1086 | foundcc = true;
1087 |
1088 | if (callCounts.ContainsKey(currentId))
1089 | {
1090 | // Note we expect it not to increase!
1091 | //
1092 | // Zero is possible if we inline at a call site that was not hit.
1093 | // We may even see perf impact with zero call count change,
1094 | // because of changes elsewhere in the method in code that is hit.
1095 | ulong oldCount = callCounts[currentId];
1096 | callCounts[currentId] = count;
1097 | Console.WriteLine("Call count for {0:X8}-{1:X8} went from {2} to {3}",
1098 | token, hash, oldCount, count);
1099 | ccDelta = (count > oldCount) ? 0UL : oldCount - count; // ccDelta is unsigned; guard against an increased count
1100 | if (count > oldCount)
1101 | {
1102 | Console.WriteLine("Call count unexpectedly increased!");
1103 | }
1104 | }
1105 | else
1106 | {
1107 | // Don't really expect to hit this.. we'll never see this method as a root.
1108 | Console.WriteLine("Call count for {0:X8}-{1:X8} went from {2} to {3}",
1109 | token, hash, "unknown", count);
1110 | }
1111 | break;
1112 | }
1113 | }
1114 |
1115 | callCountLine = callCountStream.ReadLine();
1116 | }
1117 | }
1118 |
1119 | if (!foundcc)
1120 | {
1121 | // The method was evidently not called in the latest run.
1122 | if (callCounts.ContainsKey(currentId))
1123 | {
1124 | // It was called in earlier runs, so assume we've inlined the last call.
1125 | ccDelta = callCounts[currentId];
1126 | Console.WriteLine("### No (after) call count entry for {0:X8}-{1:X8}. Assuming all calls inlined. ccdelta = {2}.",
1127 | currentId.Token, currentId.Hash, ccDelta);
1128 | }
1129 | else
1130 | {
1131 | // It was not called in earlier runs, assume it was never called.
1132 | ccDelta = 0;
1133 | Console.WriteLine("### No (before) call count entry for {0:X8}-{1:X8}. Assuming method never called. ccdelta = 0.",
1134 | currentId.Token, currentId.Hash);
1135 | }
1136 |
1137 | // Going forward, we don't expect to see this method be called
1138 | callCounts[currentId] = 0;
1139 | }
1140 | }
1141 |
1142 | return currentInline;
1143 | }
1144 |
1145 | // Determine confidence level that performance differs in the two indicated
1146 | // result sets.
1147 | //
1148 | // If we can't tell the difference between the two, it may
1149 | // mean either (a) the method or call site was never executed, or (b)
1150 | // the inlines had no perf impact.
1151 | //
1152 | // We could still add this info to our model, since the jit won't generally
1153 | // be able to tell if a callee will be executed, but for now we just look
1154 | // for impactful changes.
1155 | bool CheckResults(Results[] explorationResults, int diffIndex, int baseIndex)
1156 | {
1157 | Results baseResults = explorationResults[baseIndex];
1158 | Results diffResults = explorationResults[diffIndex];
1159 |
1160 | // Make sure runs happened. Might not if we couldn't find the base method.
1161 | if (baseResults == null)
1162 | {
1163 | Console.WriteLine("$$$ Can't get base run data, sorry");
1164 | return false;
1165 | }
1166 |
1167 | if (diffResults == null)
1168 | {
1169 | Console.WriteLine("$$$ Can't get diff run data, sorry");
1170 | return false;
1171 | }
1172 |
1173 | bool significant = false;
1174 |
1175 | foreach (string subBench in baseResults.Performance.InstructionCount.Keys)
1176 | {
1177 | List<double> baseData = baseResults.Performance.InstructionCount[subBench];
1178 | List<double> diffData = diffResults.Performance.InstructionCount[subBench];
1179 | double confidence = PerformanceData.Confidence(baseData, diffData);
1180 |
1181 | significant |= (confidence > 0.8);
1182 | }
1183 |
1184 | return significant;
1185 | }
1186 | void ShowResults(Results[] explorationResults, int diffIndex, int baseIndex,
1187 | Method rootMethod, Inline currentInline, List deltas, ulong ccDelta)
1188 | {
1189 | Results zeroResults = explorationResults[0];
1190 | Results baseResults = explorationResults[baseIndex];
1191 | Results diffResults = explorationResults[diffIndex];
1192 |
1193 | // Make sure runs happened. Might not if we couldn't find the base method.
1194 | if (zeroResults == null)
1195 | {
1196 | Console.WriteLine("$$$ Can't get noinline run data, sorry");
1197 | return;
1198 | }
1199 |
1200 | if (baseResults == null)
1201 | {
1202 | Console.WriteLine("$$$ Can't get base run data, sorry");
1203 | return;
1204 | }
1205 |
1206 | if (diffResults == null)
1207 | {
1208 | Console.WriteLine("$$$ Can't get diff run data, sorry");
1209 | return;
1210 | }
1211 |
1212 | // Try and get the name of the last inline.
1213 | // We may not know it, if the method was prejitted, since it will
1214 | // never be a jit root.
1215 | // If so, use the token value.
1216 | MethodId currentMethodId = currentInline.GetMethodId();
1217 | string currentMethodName = null;
1218 | if (baseResults.Methods != null && baseResults.Methods.ContainsKey(currentMethodId))
1219 | {
1220 | currentMethodName = baseResults.Methods[currentMethodId].Name;
1221 | }
1222 | else
1223 | {
1224 | currentMethodName = String.Format("Token {0:X8} Hash {1:X8}",
1225 | currentMethodId.Token, currentMethodId.Hash);
1226 | }
1227 |
1228 | Console.WriteLine("$$$ Root {0} index {1} inlining {2}", rootMethod.Name, diffIndex, currentMethodName);
1229 |
1230 | foreach (string subBench in baseResults.Performance.InstructionCount.Keys)
1231 | {
1232 | List<double> zeroData = zeroResults.Performance.InstructionCount[subBench];
1233 | List<double> baseData = baseResults.Performance.InstructionCount[subBench];
1234 | List<double> diffData = diffResults.Performance.InstructionCount[subBench];
1235 |
1236 | double confidence = PerformanceData.Confidence(baseData, diffData);
1237 | double baseAvg = PerformanceData.Average(baseData);
1238 | double diffAvg = PerformanceData.Average(diffData);
1239 | double change = diffAvg - baseAvg;
1240 | double pctDiff = 100.0 * change / baseAvg;
1241 |
1242 | double confidence0 = PerformanceData.Confidence(zeroData, diffData); // vs the noinline baseline
1243 | double zeroAvg = PerformanceData.Average(zeroData);
1244 | double change0 = diffAvg - zeroAvg;
1245 | double pctDiff0 = 100.0 * change0 / zeroAvg;
1246 |
1247 | Console.WriteLine("{0,30}: base {1:0.00}M new {2:0.00}M delta {3:0.00}M ({4:0.00}%) confidence {5:0.00}",
1248 | subBench,
1249 | baseAvg / (1000 * 1000), diffAvg / (1000 * 1000),
1250 | change / (1000 * 1000), pctDiff, confidence);
1251 | Console.Write("{0,-30} noinl {1:0.00}M delta {2:0.00}M ({3:0.00}%) confidence {4:0.00}",
1252 | "", zeroAvg / (1000 * 1000), change0 / (1000 * 1000), pctDiff0, confidence0);
1253 |
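     | // ccDelta is the change in call counts associated with this inline (presumably the
     | // calls removed by inlining). Dividing the instruction-count change by it gives a
     | // rough per-call impact estimate; "ipc" in the output below means instructions per
     | // call, not instructions per cycle.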
1254 | if (ccDelta != 0)
1255 | {
1256 | Console.Write(" cc-delta {0} ipc {1:0.00}", ccDelta, change / ccDelta );
1257 | }
1258 |
1259 | Console.WriteLine();
1260 |
1261 | if (deltas != null)
1262 | {
1263 | InlineDelta d = new InlineDelta();
1264 |
1265 | d.rootMethod = rootMethod;
1266 | d.inlineMethodId = currentMethodId;
1267 | d.pctDelta = pctDiff;
1268 | d.index = diffIndex;
1269 | d.subBench = subBench;
1270 | d.confidence = confidence;
1271 | d.instructionsDelta = change;
1272 | if (ccDelta != 0)
1273 | {
1274 | d.hasPerCallDelta = true;
1275 | d.perCallDelta = change / ccDelta;
1276 | d.callsDelta = ccDelta;
1277 | }
1278 |
1279 | deltas.Add(d);
1280 | }
1281 | }
1282 | }
1283 | }
1284 |
1285 | // A mechanism to run the benchmark
1286 | public abstract class Runner
1287 | {
1288 | public abstract Results RunBenchmark(Benchmark b, Configuration c);
1289 | }
1290 |
1291 | public class CoreClrRunner : Runner
1292 | {
1293 | public CoreClrRunner()
1294 | {
1295 | cmdExe = Program.SHELL;
1296 | runnerExe = Program.CORERUN;
1297 | }
1298 |
1299 | public override Results RunBenchmark(Benchmark b, Configuration c)
1300 | {
1301 | // Make sure there's an exe to run.
1302 | if (!File.Exists(runnerExe))
1303 | {
1304 | Console.WriteLine("Can't find runner exe: '{0}'", runnerExe);
1305 | return null;
1306 | }
1307 |
1308 | // Setup process information
1309 | System.Diagnostics.Process runnerProcess = new Process();
1310 | runnerProcess.StartInfo.FileName = cmdExe;
1311 | string stderrName = c.ResultsDirectory + @"\" + b.ShortName + "-" + c.Name + ".xml";
1312 |
1313 | foreach (string envVar in c.Environment.Keys)
1314 | {
1315 | runnerProcess.StartInfo.Environment[envVar] = c.Environment[envVar];
1316 | }
1317 | runnerProcess.StartInfo.Environment["CORE_ROOT"] = Path.GetDirectoryName(runnerExe);
1318 | runnerProcess.StartInfo.Arguments = "/C \"" + runnerExe + " " + b.FullPath + " 2> " + stderrName + "\"";
1319 | runnerProcess.StartInfo.WorkingDirectory = System.IO.Path.GetDirectoryName(b.FullPath);
1320 | runnerProcess.StartInfo.UseShellExecute = false;
1321 |
1322 | if (Program.VeryVerbose)
1323 | {
1324 | Console.WriteLine("CoreCLR: launching " + runnerProcess.StartInfo.Arguments);
1325 | }
1326 |
1327 | runnerProcess.Start();
1328 | runnerProcess.WaitForExit();
1329 |
1330 | if (Program.Verbose)
1331 | {
1332 | Console.WriteLine("CoreCLR: Finished running {0} -- configuration: {1}, exit code: {2} (expected {3})",
1333 | b.ShortName, c.Name, runnerProcess.ExitCode, b.ExitCode);
1334 | }
1335 |
1336 | Results results = new Results();
1337 | results.Success = (b.ExitCode == runnerProcess.ExitCode);
1338 | results.ExitCode = b.ExitCode;
1339 | results.LogFile = stderrName;
1340 | results.Name = c.Name;
1341 |
1342 | // TODO: Iterate to get perf data
1343 | List<double> timeData = new List<double>(1);
1344 | timeData.Add(runnerProcess.ExitTime.Subtract(runnerProcess.StartTime).TotalMilliseconds);
1345 | results.Performance.ExecutionTime[b.ShortName] = timeData;
1346 | return results;
1347 | }
1348 |
1349 | private string runnerExe;
1350 | private string cmdExe;
1351 | }
1352 |
1353 | public class XunitPerfRunner : Runner
1354 | {
1355 | public XunitPerfRunner()
1356 | {
1357 | SetupSandbox();
1358 | }
1359 |
1360 | void SetupSandbox()
1361 | {
1362 | // Only do this once per run
1363 | if (sandboxIsSetup)
1364 | {
1365 | return;
1366 | }
1367 |
1368 | if (Directory.Exists(sandboxDir))
1369 | {
1370 | if (Program.Verbose)
1371 | {
1372 | Console.WriteLine("...Cleaning old xunit-perf sandbox '{0}'", sandboxDir);
1373 | }
1374 | Directory.Delete(sandboxDir, true);
1375 | }
1376 |
1377 | if (Program.Verbose)
1378 | {
1379 | Console.WriteLine("...Creating new xunit-perf sandbox '{0}'", sandboxDir);
1380 | }
1381 | Directory.CreateDirectory(sandboxDir);
1382 | DirectoryInfo sandboxDirectoryInfo = new DirectoryInfo(sandboxDir);
1383 |
1384 | // Copy over xunit packages
1385 | string xUnitPerfRunner = Path.Combine(coreclrRoot, @"packages\Microsoft.DotNet.xunit.performance.runner.Windows\1.0.0-alpha-build0040\tools");
1386 | string xUnitPerfAnalysis = Path.Combine(coreclrRoot, @"packages\Microsoft.DotNet.xunit.performance.analysis\1.0.0-alpha-build0040\tools");
1387 | string xUnitPerfConsole = Path.Combine(coreclrRoot, @"packages\xunit.console.netcore\1.0.2-prerelease-00177\runtimes\any\native");
1388 |
1389 | CopyAll(new DirectoryInfo(xUnitPerfRunner), sandboxDirectoryInfo);
1390 | CopyAll(new DirectoryInfo(xUnitPerfConsole), sandboxDirectoryInfo);
1391 | CopyAll(new DirectoryInfo(xUnitPerfAnalysis), sandboxDirectoryInfo);
1392 | CopyAll(new DirectoryInfo(testOverlayRoot), sandboxDirectoryInfo);
1393 |
1394 | sandboxIsSetup = true;
1395 | }
1396 |
1397 | public static void CopyAll(DirectoryInfo source, DirectoryInfo target)
1398 | {
1399 | Directory.CreateDirectory(target.FullName);
1400 |
1401 | // Copy each file into the new directory.
1402 | foreach (FileInfo fi in source.GetFiles())
1403 | {
1404 | fi.CopyTo(Path.Combine(target.FullName, fi.Name), true);
1405 | }
1406 |
1407 | // Copy each subdirectory using recursion.
1408 | foreach (DirectoryInfo diSourceSubDir in source.GetDirectories())
1409 | {
1410 | DirectoryInfo nextTargetSubDir =
1411 | target.CreateSubdirectory(diSourceSubDir.Name);
1412 | CopyAll(diSourceSubDir, nextTargetSubDir);
1413 | }
1414 | }
1415 |
1416 | // See if there's some way to just run a particular sub benchmark?
1417 | public override Results RunBenchmark(Benchmark b, Configuration c)
1418 | {
1419 | // Copy benchmark to sandbox
1420 | string benchmarkFile = Path.GetFileName(b.FullPath);
1421 | File.Copy(b.FullPath, Path.Combine(sandboxDir, benchmarkFile), true);
1422 |
1423 | // Setup process information
1424 | System.Diagnostics.Process runnerProcess = new Process();
1425 | runnerProcess.StartInfo.FileName = Path.Combine(sandboxDir, "xunit.performance.run.exe");
1426 | string perfName = c.Name + "-" + b.ShortName;
1427 |
1428 | foreach (string envVar in c.Environment.Keys)
1429 | {
1430 | runnerProcess.StartInfo.Environment[envVar] = c.Environment[envVar];
1431 | }
1432 | runnerProcess.StartInfo.Environment["CORE_ROOT"] = sandboxDir;
1433 | runnerProcess.StartInfo.Environment["XUNIT_PERFORMANCE_MIN_ITERATION"] = Program.MinIterations.ToString();
1434 | runnerProcess.StartInfo.Environment["XUNIT_PERFORMANCE_MAX_ITERATION"] = Program.MaxIterations.ToString();
1435 |
1436 | runnerProcess.StartInfo.Arguments = benchmarkFile +
1437 | " -nologo -runner xunit.console.netcore.exe -runnerhost corerun.exe -runid " +
1438 | perfName +
1439 | (Program.ClassFilter == null ? "" : " -class " + Program.ClassFilter) +
1440 | (Program.MethodFilter == null ? "" : " -method " + Program.MethodFilter);
1441 |
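     | // For illustration only, the assembled arguments resemble:
     | //   <bench>.exe -nologo -runner xunit.console.netcore.exe -runnerhost corerun.exe -runid <config>-<bench> [-class ...] [-method ...]
     | // where <bench> and <config> stand in for the benchmark file name and configuration name.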
1442 | runnerProcess.StartInfo.WorkingDirectory = sandboxDir;
1443 | runnerProcess.StartInfo.UseShellExecute = false;
1444 |
1445 | if (Program.VeryVerbose)
1446 | {
1447 | Console.WriteLine("xUnitPerf: launching " + runnerProcess.StartInfo.Arguments);
1448 | }
1449 |
1450 | runnerProcess.Start();
1451 | runnerProcess.WaitForExit();
1452 |
1453 | if (Program.VeryVerbose)
1454 | {
1455 | // Xunit doesn't run Main so no 100 exit here.
1456 | Console.WriteLine("xUnitPerf: Finished running {0} -- configuration: {1}, exit code: {2}",
1457 | b.ShortName, c.Name, runnerProcess.ExitCode);
1458 | }
1459 |
1460 | // Parse iterations out of perf-*.xml
1461 | string xmlPerfResultsFile = Path.Combine(sandboxDir, perfName) + ".xml";
1462 | XElement root = XElement.Load(xmlPerfResultsFile);
1463 | IEnumerable<XElement> subBenchmarks = from el in root.Descendants("test") select el;
1464 |
1465 | // We keep the raw iterations results and just summarize here.
1466 | Results results = new Results();
1467 | PerformanceData perfData = results.Performance;
1468 |
1469 | foreach (XElement sub in subBenchmarks)
1470 | {
1471 | string subName = (string)sub.Attribute("name");
1472 |
1473 | IEnumerable<double> iExecutionTimes =
1474 | from el in sub.Descendants("iteration")
1475 | where el.Attribute("Duration") != null && (string)el.Attribute("index") != "0"
1476 | select Double.Parse((string)el.Attribute("Duration"));
1477 |
1478 | IEnumerable<double> iInstructionsRetired =
1479 | from el in sub.Descendants("iteration")
1480 | where el.Attribute("InstRetired") != null && (string)el.Attribute("index") != "0"
1481 | select Double.Parse((string)el.Attribute("InstRetired"));
1482 |
1483 | perfData.ExecutionTime[subName] = new List<double>(iExecutionTimes);
1484 | perfData.InstructionCount[subName] = new List<double>(iInstructionsRetired);
1485 | }
1486 |
1487 | if (Program.Verbose)
1488 | {
1489 | perfData.Print(c.Name);
1490 | }
1491 |
1492 | results.Success = (b.ExitCode == runnerProcess.ExitCode);
1493 | results.ExitCode = b.ExitCode;
1494 | results.LogFile = "";
1495 | results.Name = c.Name;
1496 |
1497 | return results;
1498 | }
1499 |
1500 | static string sandboxDir = Program.SANDBOX_DIR;
1501 | static string coreclrRoot = Program.CORECLR_ROOT;
1502 | static string testOverlayRoot = Path.Combine(coreclrRoot, @"bin\tests\Windows_NT.x64.Release\tests\Core_Root");
1503 | static bool sandboxIsSetup;
1504 | }
1505 |
1506 | public class Program
1507 | {
1508 |
1509 | // The noinline model is one where inlining is disabled.
1510 | // The inline forest here is minimal.
1511 | //
1512 | // An attributed profile of this model helps the tool
1513 | // identify areas for investigation.
1514 | Results BuildNoInlineModel(Runner r, Runner x, Benchmark b)
1515 | {
1516 | Console.WriteLine("----");
1517 | Console.WriteLine("---- No Inline Model for {0}", b.ShortName);
1518 |
1519 | // Create empty inline replay XML
1520 | InlineForest emptyForest = new InlineForest();
1521 | emptyForest.Policy = "ReplayPolicy";
1522 | XmlSerializer emptySerializer = new XmlSerializer(typeof(InlineForest));
1523 | string emptyXmlFile = String.Format("{0}-empty-replay.xml", b.ShortName);
1524 | string emptyXmlPath = Path.Combine(Program.RESULTS_DIR, emptyXmlFile);
1525 | using (Stream emptyXmlStream = new FileStream(emptyXmlPath, FileMode.Create))
1526 | {
1527 | emptySerializer.Serialize(emptyXmlStream, emptyForest);
1528 | }
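     | // With an empty forest the replay policy has no inlines to replay, so every inline
     | // decision is presumably refused -- which is what makes this the "noinl" baseline.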
1529 |
1530 | // Replay with empty xml and recapture the full noinline xml. Latter will
1531 | // show all the methods that were jitted.
1532 | Configuration noInlineConfig = new Configuration("noinl");
1533 | noInlineConfig.ResultsDirectory = Program.RESULTS_DIR;
1534 | noInlineConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1535 | noInlineConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath;
1536 | noInlineConfig.Environment["COMPlus_JitInlineDumpXml"] = "1";
1537 |
1538 | Results noInlineResults = r.RunBenchmark(b, noInlineConfig);
1539 |
1540 | if (noInlineResults == null || !noInlineResults.Success)
1541 | {
1542 | Console.WriteLine("Noinline run failed\n");
1543 | return null;
1544 | }
1545 |
1546 | if (Program.ExploreInlines)
1547 | {
1548 | // Parse noinline xml
1549 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
1550 | InlineForest f;
1551 | Stream xmlFile = new FileStream(noInlineResults.LogFile, FileMode.Open);
1552 | try
1553 | {
1554 | f = (InlineForest)xml.Deserialize(xmlFile);
1555 | }
1556 | catch (System.Exception ex)
1557 | {
1558 | Console.WriteLine("Xml deserialization failed: " + ex.Message);
1559 | return null;
1560 | }
1561 |
1562 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
1563 | Console.WriteLine("*** Noinline config has {0} methods, {1} inlines", f.Methods.Length, inlineCount);
1564 | noInlineResults.InlineForest = f;
1565 |
1566 | // Determine set of unique method Ids and build map from ID to method
1567 | Dictionary<MethodId, uint> idCounts = new Dictionary<MethodId, uint>();
1568 | Dictionary<MethodId, Method> methods = new Dictionary<MethodId, Method>(f.Methods.Length);
1569 |
1570 | foreach (Method m in f.Methods)
1571 | {
1572 | MethodId id = m.getId();
1573 | methods[id] = m;
1574 |
1575 | if (idCounts.ContainsKey(id))
1576 | {
1577 | idCounts[id]++;
1578 | }
1579 | else
1580 | {
1581 | idCounts[id] = 1;
1582 | }
1583 | }
1584 |
1585 | noInlineResults.Methods = methods;
1586 |
1587 | Console.WriteLine("*** Noinline config has {0} unique method IDs", idCounts.Count);
1588 |
1589 | foreach (MethodId m in idCounts.Keys)
1590 | {
1591 | uint count = idCounts[m];
1592 | if (count > 1)
1593 | {
1594 | Console.WriteLine("*** MethodId Token:0x{0:X8} Hash:0x{1:X8} has {2} duplicates", m.Token, m.Hash, count);
1595 | }
1596 | }
1597 |
1598 | // Mark methods in noinline results that do not have unique IDs
1599 | foreach (Method m in f.Methods)
1600 | {
1601 | MethodId id = m.getId();
1602 | if (idCounts[id] > 1)
1603 | {
1604 | m.MarkAsDuplicate();
1605 | }
1606 | }
1607 | }
1608 |
1609 | // Get noinline perf numbers using empty replay xml
1610 | Configuration noinlinePerfConfig = new Configuration("noinline-perf");
1611 | noinlinePerfConfig.ResultsDirectory = Program.RESULTS_DIR;
1612 | noinlinePerfConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1613 | noinlinePerfConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath;
1614 | Results perfResults = x.RunBenchmark(b, noinlinePerfConfig);
1615 | Console.WriteLine("-- Updating noinline results");
1616 | noInlineResults.Performance = perfResults.Performance;
1617 | noInlineResults.Performance.Print(noInlineConfig.Name);
1618 |
1619 | // Get noinline method call counts
1620 | // Todo: use xunit runner and capture stderr? Downside is that xunit-perf
1621 | // entry points won't be in the baseline method set.
1622 | if (CaptureCallCounts)
1623 | {
1624 | Configuration noInlineCallCountConfig = new Configuration("noinline-cc");
1625 | noInlineCallCountConfig.ResultsDirectory = Program.RESULTS_DIR;
1626 | noInlineCallCountConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1";
1627 | noInlineCallCountConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath;
1628 | noInlineCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1";
1629 | Results ccResults = r.RunBenchmark(b, noInlineCallCountConfig);
1630 |
1631 | AnnotateCallCounts(ccResults, noInlineResults);
1632 | }
1633 |
1634 | return noInlineResults;
1635 | }
1636 |
1637 | // The legacy model reflects the current jit behavior.
1638 | // Scoring of runs will be relative to this data.
1639 | // The inherent noise level is also estimated here.
1640 | Results BuildLegacyModel(Runner r, Runner x, Benchmark b, bool enhanced = false)
1641 | {
1642 | string modelName = enhanced ? "EnhancedLegacy" : "Legacy";
1643 | Console.WriteLine("----");
1644 | Console.WriteLine("---- {0} Model for {1}", modelName, b.ShortName);
1645 |
1646 | Configuration legacyConfig = new Configuration(modelName);
1647 | legacyConfig.ResultsDirectory = Program.RESULTS_DIR;
1648 | legacyConfig.Environment["COMPlus_JitInlineDumpXml"] = "1";
1649 | if (!enhanced)
1650 | {
1651 | legacyConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1";
1652 | }
1653 |
1654 | Results legacyResults = r.RunBenchmark(b, legacyConfig);
1655 |
1656 | if (legacyResults == null || !legacyResults.Success)
1657 | {
1658 | Console.WriteLine("Legacy run failed\n");
1659 | return null;
1660 | }
1661 |
1662 | if (Program.ExploreInlines)
1663 | {
1664 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
1665 | InlineForest f;
1666 | Stream xmlFile = new FileStream(legacyResults.LogFile, FileMode.Open);
1667 | f = (InlineForest)xml.Deserialize(xmlFile);
1668 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
1669 | Console.WriteLine("*** Legacy config has {0} methods, {1} inlines", f.Methods.Length, inlineCount);
1670 | legacyResults.InlineForest = f;
1671 |
1672 | // Populate the methodId -> method lookup table
1673 | Dictionary<MethodId, Method> methods = new Dictionary<MethodId, Method>(f.Methods.Length);
1674 | foreach (Method m in f.Methods)
1675 | {
1676 | MethodId id = m.getId();
1677 | methods[id] = m;
1678 | }
1679 | legacyResults.Methods = methods;
1680 | }
1681 |
1682 | // Now get legacy perf numbers
1683 | Configuration legacyPerfConfig = new Configuration(modelName + "-perf");
1684 | if (!enhanced)
1685 | {
1686 | legacyPerfConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1";
1687 | }
1688 | legacyPerfConfig.ResultsDirectory = Program.RESULTS_DIR;
1689 | Results perfResults = x.RunBenchmark(b, legacyPerfConfig);
1690 | legacyResults.Performance = perfResults.Performance;
1691 | legacyResults.Performance.Print(legacyConfig.Name);
1692 |
1693 | // Get legacy method call counts
1694 | if (CaptureCallCounts)
1695 | {
1696 | Configuration legacyCallCountConfig = new Configuration(modelName + "-cc");
1697 | legacyCallCountConfig.ResultsDirectory = Program.RESULTS_DIR;
1698 | legacyCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1";
1699 | if (!enhanced)
1700 | {
1701 | legacyCallCountConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1";
1702 | }
1703 | Results ccResults = r.RunBenchmark(b, legacyCallCountConfig);
1704 |
1705 | // Parse results back and annotate base method set
1706 | AnnotateCallCounts(ccResults, legacyResults);
1707 | }
1708 |
1709 | return legacyResults;
1710 | }
1711 |
1712 | // The full model creates an inline forest at some prescribed
1713 | // depth. The inline configurations that will be explored
1714 | // are sub-forests of this full forest.
1715 | Results BuildFullModel(Runner r, Runner x, Benchmark b, Results noinlineResults)
1716 | {
1717 | Console.WriteLine("----");
1718 | Console.WriteLine("---- Full Model for {0}", b.ShortName);
1719 |
1720 | string resultsDir = Program.RESULTS_DIR;
1721 | // Because we're jitting and inlining, some methods won't be jitted on
1722 | // their own at all. To unearth full trees for all methods we need
1723 | // to iterate. The rough idea is as follows.
1724 | //
1725 | // Run with FullPolicy for all methods. This will end up jitting
1726 | // some subset of methods seen in the noinline config. Compute this subset,
1727 | // collect up their trees, and then disable inlining for those methods.
1728 | // Rerun. This time around some of the methods missed in the first will
1729 | // be jitted and will grow inline trees. Collect these new trees and
1730 | // add those methods to the disabled set. Repeat until we've seen all methods.
1731 | //
1732 | // Unfortunately we don't have unique IDs for methods. To handle this we
1733 | // need to determine which methods do have unique IDs.
1734 |
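     | // Illustrative walk-through with hypothetical methods A, B and C:
     | //   Iteration 1: under FullPolicy, A and B are jitted; C is always inlined into A,
     | //                so it never gets a tree of its own. Record A and B, then add their
     | //                hashes to the no-inline exclude list.
     | //   Iteration 2: with A and B excluded from inlining, C is now jitted on its own and
     | //                its tree is recorded. Every method now has a tree, so the loop ends.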
1735 | // This is the count of noinline methods with unique IDs.
1736 | int methodCount = ExploreInlines ? noinlineResults.Methods.Count : 1;
1737 |
1738 | // We'll collect up these methods with their full trees here.
1739 | HashSet<MethodId> fullMethodIds = new HashSet<MethodId>();
1740 | List<Method> fullMethods = new List<Method>(methodCount);
1741 | uint iteration = 0;
1742 | uint maxInlineCount = 0;
1743 | uint leafMethodCount = 0;
1744 | uint newMethodCount = 0;
1745 | Method maxInlineMethod = null;
1746 | bool failed = false;
1747 |
1748 | while (fullMethodIds.Count < methodCount + newMethodCount)
1749 | {
1750 | iteration++;
1751 |
1752 | Console.WriteLine("*** Full config -- iteration {0}, still need trees for {1} out of {2} methods",
1753 | iteration, methodCount + newMethodCount - fullMethodIds.Count, methodCount + newMethodCount);
1754 |
1755 | Configuration fullConfiguration = new Configuration("full-" + iteration);
1756 | fullConfiguration.ResultsDirectory = resultsDir;
1757 | fullConfiguration.Environment["COMPlus_JitInlinePolicyFull"] = "1";
1758 | fullConfiguration.Environment["COMPlus_JitInlineDepth"] = "10";
1759 | fullConfiguration.Environment["COMPlus_JitInlineSize"] = "200";
1760 | fullConfiguration.Environment["COMPlus_JitInlineDumpXml"] = "1";
1761 |
1762 | // Build an exclude string disabling inlining in all the methods we've
1763 | // collected so far. If there are no methods yet, don't bother.
1764 | if (fullMethodIds.Count > 0)
1765 | {
1766 | StringBuilder sb = new StringBuilder();
1767 | foreach (MethodId id in fullMethodIds)
1768 | {
1769 | sb.Append(" ");
1770 | sb.Append(id.Hash);
1771 | }
1772 | string excludeString = sb.ToString();
1773 | // Console.WriteLine("*** exclude string: {0}\n", excludeString);
1774 | fullConfiguration.Environment["COMPlus_JitNoInlineRange"] = excludeString;
1775 | }
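     | // The exclude string is just the hash of each already-covered method written in
     | // decimal and separated by spaces (e.g. "1234567890 987654321" -- illustrative
     | // values); COMPlus_JitNoInlineRange uses it to suppress inlining in those methods.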
1776 |
1777 | // Run this iteration
1778 | Results currentResults = r.RunBenchmark(b, fullConfiguration);
1779 |
1780 | if (currentResults == null || !currentResults.Success)
1781 | {
1782 | failed = true;
1783 | Console.WriteLine("Full run failed\n");
1784 | break;
1785 | }
1786 |
1787 | // Parse the resulting xml
1788 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
1789 | Stream xmlFile = new FileStream(currentResults.LogFile, FileMode.Open);
1790 | InlineForest f = (InlineForest) xml.Deserialize(xmlFile);
1791 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
1792 | Console.WriteLine("*** This iteration of full config has {0} methods, {1} inlines", f.Methods.Length, inlineCount);
1793 | currentResults.InlineForest = f;
1794 |
1795 | // Find the set of new methods that we saw
1796 | HashSet<MethodId> newMethodIds = new HashSet<MethodId>();
1797 | foreach (Method m in f.Methods)
1798 | {
1799 | MethodId id = m.getId();
1800 |
1801 | if (!fullMethodIds.Contains(id) && !newMethodIds.Contains(id))
1802 | {
1803 | fullMethods.Add(m);
1804 | newMethodIds.Add(id);
1805 |
1806 | if (ExploreInlines && !noinlineResults.Methods.ContainsKey(id))
1807 | {
1808 | // Need to figure out why this happens.
1809 | //
1810 | // Suspect we're inlining force inlines in the noinline model but not here.
1811 | Console.WriteLine("*** full model uncovered new method: Token:0x{0:X8} Hash:0x{1:X8}", m.Token, m.Hash);
1812 | newMethodCount++;
1813 | }
1814 |
1815 | if (m.InlineCount > maxInlineCount)
1816 | {
1817 | maxInlineCount = m.InlineCount;
1818 | maxInlineMethod = m;
1819 | }
1820 |
1821 | if (m.InlineCount == 0)
1822 | {
1823 | leafMethodCount++;
1824 | }
1825 | }
1826 | }
1827 |
1828 | Console.WriteLine("*** found {0} new methods", newMethodIds.Count);
1829 |
1830 | if (newMethodIds.Count == 0)
1831 | {
1832 | failed = true;
1833 | Console.WriteLine("*** bailing out, unable to make forward progress");
1834 | break;
1835 | }
1836 |
1837 | fullMethodIds.UnionWith(newMethodIds);
1838 | }
1839 |
1840 | if (failed)
1841 | {
1842 | return null;
1843 | }
1844 |
1845 | Console.WriteLine("*** Full model complete, took {0} iterations", iteration);
1846 |
1847 | // Now build the aggregate inline forest....
1848 | InlineForest fullForest = new InlineForest();
1849 | fullForest.Methods = fullMethods.ToArray();
1850 |
1851 | // And consolidate into a results set
1852 | Results fullResults = new Results();
1853 | fullResults.InlineForest = fullForest;
1854 | fullResults.Name = "full";
1855 |
1856 | // Populate the methodId -> method lookup table
1857 | Dictionary<MethodId, Method> methods = new Dictionary<MethodId, Method>(fullMethods.Count);
1858 | foreach (Method m in fullMethods)
1859 | {
1860 | MethodId id = m.getId();
1861 | methods[id] = m;
1862 | }
1863 | fullResults.Methods = methods;
1864 |
1865 | long fullInlineCount = fullForest.Methods.Sum(m => m.InlineCount);
1866 | uint nonLeafMethodCount = (uint) fullMethods.Count - leafMethodCount;
1867 | Console.WriteLine("*** Full config has {0} methods, {1} inlines", fullForest.Methods.Length, fullInlineCount);
1868 | Console.WriteLine("*** {0} leaf methods, {1} methods with inlines, {2} average inline count",
1869 | leafMethodCount, nonLeafMethodCount, fullInlineCount/ nonLeafMethodCount);
1870 | Console.WriteLine("*** {0} max inline count for method 0x{1:X8} -- {2} subtrees",
1871 | maxInlineCount, maxInlineMethod.Token, maxInlineMethod.NumSubtrees());
1872 |
1873 | // Serialize out the consolidated set of trees
1874 | XmlSerializer xo = new XmlSerializer(typeof(InlineForest));
1875 | Stream xmlOutFile = new FileStream(Path.Combine(resultsDir, b.ShortName + "-full-consolidated.xml"), FileMode.Create);
1876 | xo.Serialize(xmlOutFile, fullForest);
1877 |
1878 | // Now get full perf numbers -- just for the initial set
1879 | Configuration fullPerfConfig = new Configuration("full-perf");
1880 | fullPerfConfig.Environment["COMPlus_JitInlinePolicyFull"] = "1";
1881 | fullPerfConfig.Environment["COMPlus_JitInlineDepth"] = "10";
1882 | fullPerfConfig.Environment["COMPlus_JitInlineSize"] = "200";
1883 | fullPerfConfig.ResultsDirectory = Program.RESULTS_DIR;
1884 | Results perfResults = x.RunBenchmark(b, fullPerfConfig);
1885 | fullResults.Performance = perfResults.Performance;
1886 | fullResults.Performance.Print("full");
1887 |
1888 | // Get full call counts.
1889 | // Ideally, perhaps, drive this from the noinline set...?
1890 | if (CaptureCallCounts)
1891 | {
1892 | Configuration fullPerfCallCountConfig = new Configuration("full-perf-cc");
1893 | fullPerfCallCountConfig.ResultsDirectory = Program.RESULTS_DIR;
1894 | fullPerfCallCountConfig.Environment["COMPlus_JitInlinePolicyFull"] = "1";
1895 | fullPerfCallCountConfig.Environment["COMPlus_JitInlineDepth"] = "10";
1896 | fullPerfCallCountConfig.Environment["COMPlus_JitInlineSize"] = "200";
1897 | fullPerfCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1";
1898 | Results ccResults = r.RunBenchmark(b, fullPerfCallCountConfig);
1899 |
1900 | AnnotateCallCounts(ccResults, fullResults);
1901 | }
1902 |
1903 | return fullResults;
1904 | }
1905 |
1906 | // The "model" model uses heuristics based on modelling actual
1907 | // observations
1908 | Results BuildModelModel(Runner r, Runner x, Benchmark b, bool altModel = false)
1909 | {
1910 | string modelName = "Model" + (altModel ? "2" : "");
1911 | string variant = altModel ? "2" : "1";
1912 |
1913 | Console.WriteLine("----");
1914 | Console.WriteLine("---- {0} Model for {1}", modelName, b.ShortName);
1915 |
1916 | Configuration modelConfig = new Configuration(modelName);
1917 | modelConfig.ResultsDirectory = Program.RESULTS_DIR;
1918 | modelConfig.Environment["COMPlus_JitInlinePolicyModel"] = variant;
1919 | modelConfig.Environment["COMPlus_JitInlineDumpXml"] = "1";
1920 |
1921 | Results modelResults = r.RunBenchmark(b, modelConfig);
1922 |
1923 | if (modelResults == null || !modelResults.Success)
1924 | {
1925 | Console.WriteLine("{0} run failed\n", modelConfig.Name);
1926 | return null;
1927 | }
1928 |
1929 | if (Program.ExploreInlines)
1930 | {
1931 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
1932 | Stream xmlFile = new FileStream(modelResults.LogFile, FileMode.Open);
1933 | InlineForest f = (InlineForest)xml.Deserialize(xmlFile);
1934 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
1935 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines",
1936 | modelConfig.Name, f.Methods.Length, inlineCount);
1937 | modelResults.InlineForest = f;
1938 |
1939 | // Populate the methodId -> method lookup table
1940 | Dictionary<MethodId, Method> methods = new Dictionary<MethodId, Method>(f.Methods.Length);
1941 | foreach (Method m in f.Methods)
1942 | {
1943 | MethodId id = m.getId();
1944 | methods[id] = m;
1945 | }
1946 | modelResults.Methods = methods;
1947 | }
1948 |
1949 | // Now get perf numbers
1950 | Configuration modelPerfConfig = new Configuration(modelName + "-perf");
1951 | modelPerfConfig.ResultsDirectory = Program.RESULTS_DIR;
1952 | modelPerfConfig.Environment["COMPlus_JitInlinePolicyModel"] = variant;
1953 | Results perfResults = x.RunBenchmark(b, modelPerfConfig);
1954 | modelResults.Performance = perfResults.Performance;
1955 | modelResults.Performance.Print(modelConfig.Name);
1956 |
1957 | // Get method call counts
1958 | if (CaptureCallCounts)
1959 | {
1960 | Configuration modelCallCountConfig = new Configuration(modelName + "-cc");
1961 | modelCallCountConfig.ResultsDirectory = Program.RESULTS_DIR;
1962 | modelCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1";
1963 | modelCallCountConfig.Environment["COMPlus_JitInlinePolicyModel"] = variant;
1964 | Results ccResults = r.RunBenchmark(b, modelCallCountConfig);
1965 |
1966 | // Parse results back and annotate base method set
1967 | AnnotateCallCounts(ccResults, modelResults);
1968 | }
1969 |
1970 | return modelResults;
1971 | }
1972 |
1973 | // The size model tries not to increase method size
1974 | Results BuildSizeModel(Runner r, Runner x, Benchmark b)
1975 | {
1976 | Console.WriteLine("----");
1977 | Console.WriteLine("---- Size Model for {0}", b.ShortName);
1978 |
1979 | Configuration sizeConfig = new Configuration("size");
1980 | sizeConfig.ResultsDirectory = Program.RESULTS_DIR;
1981 | sizeConfig.Environment["COMPlus_JitInlinePolicySize"] = "1";
1982 | sizeConfig.Environment["COMPlus_JitInlineDumpXml"] = "1";
1983 |
1984 | Results results = r.RunBenchmark(b, sizeConfig);
1985 |
1986 | if (results == null || !results.Success)
1987 | {
1988 | Console.WriteLine("{0} run failed\n", sizeConfig.Name);
1989 | return null;
1990 | }
1991 |
1992 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
1993 | InlineForest f;
1994 | Stream xmlFile = new FileStream(results.LogFile, FileMode.Open);
1995 | f = (InlineForest)xml.Deserialize(xmlFile);
1996 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
1997 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines",
1998 | sizeConfig.Name, f.Methods.Length, inlineCount);
1999 | results.InlineForest = f;
2000 |
2001 | // Now get perf numbers
2002 | Configuration sizePerfConfig = new Configuration("size-perf");
2003 | sizePerfConfig.ResultsDirectory = Program.RESULTS_DIR;
2004 | sizePerfConfig.Environment["COMPlus_JitInlinePolicySize"] = "1";
2005 | Results perfResults = x.RunBenchmark(b, sizePerfConfig);
2006 | results.Performance = perfResults.Performance;
2007 | results.Performance.Print(sizeConfig.Name);
2008 |
2009 | return results;
2010 | }
2011 |
2012 | // The random model is random
2013 | Results BuildRandomModel(Runner r, Runner x, Benchmark b, uint seed)
2014 | {
2015 | Console.WriteLine("----");
2016 | Console.WriteLine("---- Random Model {0:X} for {1}", seed, b.ShortName);
2017 |
2018 | string seedString = String.Format("0x{0:X}", seed);
2019 | Configuration randomConfig = new Configuration("random-" + seedString);
2020 | randomConfig.ResultsDirectory = Program.RESULTS_DIR;
2021 | randomConfig.Environment["COMPlus_JitInlinePolicyRandom"] = seedString;
2022 | randomConfig.Environment["COMPlus_JitInlineDumpXml"] = "2"; // minimal XML
2023 | randomConfig.Environment["COMPlus_JitInlineDumpData"] = "2"; // full data set
2024 |
2025 | Results results = r.RunBenchmark(b, randomConfig);
2026 |
2027 | if (results == null || !results.Success)
2028 | {
2029 | Console.WriteLine("{0} run failed\n", randomConfig.Name);
2030 | return null;
2031 | }
2032 |
2033 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest));
2034 | InlineForest f;
2035 | Stream xmlFile = new FileStream(results.LogFile, FileMode.Open);
2036 | f = (InlineForest)xml.Deserialize(xmlFile);
2037 | long inlineCount = f.Methods.Sum(m => m.InlineCount);
2038 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines",
2039 | randomConfig.Name, f.Methods.Length, inlineCount);
2040 | results.InlineForest = f;
2041 |
2042 | // Now get perf numbers
2043 | Configuration randomPerfConfig = new Configuration(randomConfig.Name + "-perf");
2044 | randomPerfConfig.ResultsDirectory = Program.RESULTS_DIR;
2045 | randomPerfConfig.Environment["COMPlus_JitInlinePolicyRandom"] = seedString;
2046 | Results perfResults = x.RunBenchmark(b, randomPerfConfig);
2047 | results.Performance = perfResults.Performance;
2048 | results.Performance.Print(randomConfig.Name);
2049 |
2050 | return results;
2051 | }
2052 |
2053 | static void SetupResults()
2054 | {
2055 | if (Directory.Exists(Program.RESULTS_DIR))
2056 | {
2057 | if (Program.Verbose)
2058 | {
2059 | Console.WriteLine("...Cleaning old results dir '{0}'", Program.RESULTS_DIR);
2060 | }
2061 | Directory.Delete(Program.RESULTS_DIR, true);
2062 | }
2063 |
2064 | if (Program.Verbose)
2065 | {
2066 | Console.WriteLine("...Creating new results '{0}'", Program.RESULTS_DIR);
2067 | }
2068 |
2069 | Directory.CreateDirectory(Program.RESULTS_DIR);
2070 | DirectoryInfo sandboxDirectoryInfo = new DirectoryInfo(Program.RESULTS_DIR);
2071 | }
2072 |
2073 | // Paths to repos and binaries.
2074 | public static string REPO_ROOT = @"c:\repos";
2075 | public static string CORECLR_ROOT = REPO_ROOT + @"\coreclr";
2076 | public static string CORECLR_BENCHMARK_ROOT = CORECLR_ROOT + @"\bin\tests\Windows_NT.x64.Release\JIT\performance\codequality";
2077 | public static string CORERUN = CORECLR_ROOT + @"\bin\tests\Windows_NT.x64.release\tests\Core_Root\corerun.exe";
2078 | public static string SHELL = @"c:\windows\system32\cmd.exe";
2079 | public static string RESULTS_DIR = REPO_ROOT + @"\PerformanceExplorer\results";
2080 | public static string SANDBOX_DIR = REPO_ROOT + @"\PerformanceExplorer\sandbox";
2081 |
2082 | // Various aspects of the exploration that can be enabled/disabled.
2083 | public static bool DisableZap = false;
2084 | public static bool UseNoInlineModel = false;
2085 | public static bool UseLegacyModel = false;
2086 | public static bool UseEnhancedLegacyModel = false;
2087 | public static bool UseFullModel = false;
2088 | public static bool UseModelModel = false;
2089 | public static bool UseAltModel = false;
2090 | public static bool UseSizeModel = false;
2091 | public static bool UseRandomModel = false;
2092 | public static uint RandomSeed = 0x55;
2093 | public static uint RandomTries = 1;
2094 | public static bool ExploreInlines = true;
2095 | public static bool ClassifyInlines = false;
2096 | public static bool CaptureCallCounts = true;
2097 | public static bool SkipProblemBenchmarks = true;
2098 | public static uint MinIterations = 10;
2099 | public static uint MaxIterations = 10;
2100 | public static string ClassFilter = null;
2101 | public static string MethodFilter = null;
2102 | public static string RootToken = null;
2103 | public static uint RootTokenValue = 0;
2104 | public static bool Verbose = true;
2105 | public static bool VeryVerbose = false;
2106 |
2107 | public static List<string> ParseArgs(string[] args)
2108 | {
2109 | List<string> benchNames = new List<string>();
2110 |
2111 | for (int i = 0; i < args.Length; i++)
2112 | {
2113 | string arg = args[i];
2114 |
2115 | if (arg[0] == '-')
2116 | {
2117 | if (arg == "-perf")
2118 | {
2119 | ExploreInlines = false;
2120 | CaptureCallCounts = false;
2121 | }
2122 | else if (arg == "-disableZap")
2123 | {
2124 | DisableZap = true;
2125 | }
2126 | else if (arg == "-allTests")
2127 | {
2128 | SkipProblemBenchmarks = false;
2129 | }
2130 | else if (arg == "-useNoInline")
2131 | {
2132 | UseNoInlineModel = true;
2133 | }
2134 | else if (arg == "-useLegacy")
2135 | {
2136 | UseLegacyModel = true;
2137 | }
2138 | else if (arg == "-useEnhancedLegacy")
2139 | {
2140 | UseEnhancedLegacyModel = true;
2141 | }
2142 | else if (arg == "-useFull")
2143 | {
2144 | UseFullModel = true;
2145 | }
2146 | else if (arg == "-useSize")
2147 | {
2148 | UseSizeModel = true;
2149 | }
2150 | else if (arg == "-useModel")
2151 | {
2152 | UseModelModel = true;
2153 | }
2154 | else if (arg == "-useAltModel")
2155 | {
2156 | UseAltModel = true;
2157 | }
2158 | else if (arg == "-noExplore")
2159 | {
2160 | ExploreInlines = false;
2161 | }
2162 | else if (arg == "-useRandom")
2163 | {
2164 | UseRandomModel = true;
2165 | }
2166 | else if (arg == "-classify")
2167 | {
2168 | ClassifyInlines = true;
2169 | }
2170 | else if (arg == "-randomTries" && (i + 1) < args.Length)
2171 | {
2172 | RandomTries = UInt32.Parse(args[++i]);
2173 | }
2174 | else if (arg == "-minIterations" && (i + 1) < args.Length)
2175 | {
2176 | MinIterations = UInt32.Parse(args[++i]);
2177 | }
2178 | else if (arg == "-maxIterations" && (i + 1) < args.Length)
2179 | {
2180 | MaxIterations = UInt32.Parse(args[++i]);
2181 | }
2182 | else if (arg == "-method" && (i + 1) < args.Length)
2183 | {
2184 | MethodFilter = args[++i];
2185 | }
2186 | else if (arg == "-class" && (i + 1) < args.Length)
2187 | {
2188 | ClassFilter = args[++i];
2189 | }
2190 | else if (arg == "-rootToken" && (i + 1) < args.Length)
2191 | {
2192 | RootToken = args[++i];
2193 | RootTokenValue = UInt32.Parse(RootToken, System.Globalization.NumberStyles.HexNumber);
2194 | }
2195 | else
2196 | {
2197 | Console.WriteLine("... ignoring '{0}'", arg);
2198 | }
2199 | }
2200 | else
2201 | {
2202 | benchNames.Add(arg);
2203 | }
2204 | }
2205 |
2206 | bool hasInlineModel =
2207 | UseLegacyModel ||
2208 | UseEnhancedLegacyModel ||
2209 | UseModelModel ||
2210 | UseAltModel ||
2211 | UseFullModel ||
2212 | UseRandomModel ||
2213 | UseSizeModel;
2214 |
2215 | if (ExploreInlines)
2216 | {
2217 | // Exploration should at least run a noinline model
2218 | if (!UseNoInlineModel)
2219 | {
2220 | Console.WriteLine("...Exploration: forcibly enabling NoInlineModel");
2221 | UseNoInlineModel = true;
2222 | }
2223 |
2224 | // If no alternate models are selected, forcibly enable the full model.
2225 | if (!hasInlineModel)
2226 | {
2227 | Console.WriteLine("...Exploration: forcibly enabling FullModel");
2228 | UseFullModel = true;
2229 | }
2230 | }
2231 | else if (!(hasInlineModel || UseNoInlineModel))
2232 | {
2233 | // perf should run at least one model. Choose current default.
2234 | Console.WriteLine("...Performance: forcibly enabling EnhancedLegacyModel");
2235 | UseEnhancedLegacyModel = true;
2236 | }
2237 |
2238 | return benchNames;
2239 | }
2240 |
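     | // Illustrative usage (hypothetical benchmark name):
     | //   PerformanceExplorer -useModel -useFull -minIterations 20 SomeBench
     | // runs the model and full configurations for every benchmark whose path contains
     | // "SomeBench" (case-insensitive substring match, as done in Main).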
2241 | public static bool Configure()
2242 | {
2243 | // Verify repo root
2244 | if (Directory.Exists(REPO_ROOT))
2245 | {
2246 | if (Directory.Exists(Path.Combine(REPO_ROOT, "coreclr")))
2247 | {
2248 | return true;
2249 | }
2250 | }
2251 |
2252 | // Else search up from current WD
2253 | string cwd = Directory.GetCurrentDirectory();
2254 | Console.WriteLine("... coreclr repo not at {0}, searching up from {1}", REPO_ROOT, cwd);
2255 | DirectoryInfo cwdi = new DirectoryInfo(cwd);
2256 | bool found = false;
2257 | while (cwdi != null)
2258 | {
2259 | string prospect = Path.Combine(cwdi.FullName, "coreclr");
2260 | Console.WriteLine("... looking for {0}", prospect);
2261 | if (Directory.Exists(prospect))
2262 | {
2263 | REPO_ROOT = cwdi.FullName;
2264 | Console.WriteLine("... found coreclr repo at {0}", prospect);
2265 | found = true;
2266 | break;
2267 | }
2268 |
2269 | cwdi = cwdi.Parent;
2270 | }
2271 |
2272 | if (!found)
2273 | {
2274 | return false;
2275 | }
2276 |
2277 | // Set up other paths
2278 | CORECLR_ROOT = Path.Combine(REPO_ROOT, "coreclr");
2279 | CORECLR_BENCHMARK_ROOT = Path.Combine(new string[]
2280 | {CORECLR_ROOT, "bin", "tests", "Windows_NT.x64.Release", "JIT", "performance", "codequality"});
2281 | CORERUN = Path.Combine(new string[]
2282 | { CORECLR_ROOT, "bin", "tests", "Windows_NT.x64.release", "tests", "Core_Root", "corerun.exe"});
2283 | RESULTS_DIR = Path.Combine(REPO_ROOT, "PerformanceExplorer", "results");
2284 | SANDBOX_DIR = Path.Combine(REPO_ROOT, "PerformanceExplorer", "sandbox");
2285 |
2286 | return true;
2287 | }
2288 |
2289 | public static int Main(string[] args)
2290 | {
2291 | List benchNames = ParseArgs(args);
2292 | bool ok = Configure();
2293 | if (!ok)
2294 | {
2295 | Console.WriteLine("Cound not find coreclr repo");
2296 | return -1;
2297 | }
2298 |
2299 | SetupResults();
2300 | Program p = new Program();
2301 |
2302 | // Enumerate benchmarks that can be run
2303 | string benchmarkRoot = CORECLR_BENCHMARK_ROOT;
2304 | Console.WriteLine("...Enumerating benchmarks under {0}", benchmarkRoot);
2305 | Dictionary<string, string> benchmarks = new Dictionary<string, string>();
2306 | DirectoryInfo benchmarkRootInfo = new DirectoryInfo(benchmarkRoot);
2307 | foreach (FileInfo f in benchmarkRootInfo.GetFiles("*.exe", SearchOption.AllDirectories))
2308 | {
2309 | benchmarks.Add(f.Name, f.FullName);
2310 | }
2311 |
2312 | Console.WriteLine("...Found {0} benchmarks", benchmarks.Count());
2313 |
2314 | // If an arg is passed, run benchmarks that contain that arg as a substring.
2315 | // Otherwise run them all.
2316 | List<string> benchmarksToRun = new List<string>();
2317 |
2318 | if (benchNames.Count == 0)
2319 | {
2320 | Console.WriteLine("...Running all benchmarks");
2321 | benchmarksToRun.AddRange(benchmarks.Values);
2322 | }
2323 | else
2324 | {
2325 | Console.WriteLine("...Scanning for benchmarks matching your pattern(s)");
2326 | foreach (string item in benchNames)
2327 | {
2328 | int beforeCount = benchmarksToRun.Count;
2329 | foreach (string benchName in benchmarks.Keys)
2330 | {
2331 | if (benchmarks[benchName].IndexOf(item, StringComparison.OrdinalIgnoreCase) >= 0)
2332 | {
2333 | benchmarksToRun.Add(benchmarks[benchName]);
2334 | }
2335 | }
2336 |
2337 | if (benchmarksToRun.Count == 0)
2338 | {
2339 | Console.WriteLine("No benchmark matches '{0}'", item);
2340 | }
2341 | else
2342 | {
2343 | Console.WriteLine("{0} benchmarks matched '{1}'",
2344 | benchmarksToRun.Count - beforeCount, item);
2345 | }
2346 | }
2347 | }
2348 |
2349 | int result = p.RunBenchmarks(benchmarksToRun);
2350 |
2351 | return result;
2352 | }
2353 |
2354 | int RunBenchmarks(List<string> benchmarksToRun)
2355 | {
2356 | Runner r = new CoreClrRunner();
2357 | Runner x = new XunitPerfRunner();
2358 |
2359 | // Build integrated data model...
2360 | string dataModelName = "All-Benchmark-data-model.csv";
2361 | string dataModelFileName = Path.Combine(Program.RESULTS_DIR, dataModelName);
2362 | bool hasHeader = false;
2363 | StreamWriter dataModelFile = null;
2364 | Dictionary<MethodId, Method> blacklist = null;
2365 | if (ExploreInlines)
2366 | {
2367 | dataModelFile = File.CreateText(dataModelFileName);
2368 |
2369 | if (DisableZap)
2370 | {
2371 | // Use blacklist if we disable zap so we won't repeatedly
2372 | // explore the same startup paths in the core library across benchmarks
2373 | blacklist = new Dictionary<MethodId, Method>();
2374 | }
2375 | }
2376 |
2377 | // Collect up result sets
2378 | List<List<Results>> aggregateResults = new List<List<Results>>(benchmarksToRun.Count());
2379 |
2380 | foreach (string s in benchmarksToRun)
2381 | {
2382 | // Ignore benchmarks that are not reliable enough for us to measure when looking for
2383 | // per-inline deltas.
2384 | if (SkipProblemBenchmarks)
2385 | {
2386 | if (s.IndexOf("bytemark", StringComparison.OrdinalIgnoreCase) >= 0)
2387 | {
2388 | Console.WriteLine(".... bytemark disabled (noisy), sorry");
2389 | continue;
2390 | }
2391 |
2392 | if (s.IndexOf("raytracer", StringComparison.OrdinalIgnoreCase) >= 0)
2393 | {
2394 | Console.WriteLine(".... raytracer disabled (nondeterministic), sorry");
2395 | continue;
2396 | }
2397 |
2398 | if (s.IndexOf("constantarg", StringComparison.OrdinalIgnoreCase) >= 0)
2399 | {
2400 | Console.WriteLine(".... constantarg disabled (too much detail), sorry");
2401 | continue;
2402 | }
2403 |
2404 | if (s.IndexOf("functions", StringComparison.OrdinalIgnoreCase) >= 0)
2405 | {
2406 | Console.WriteLine(".... functions disabled (too much detail), sorry");
2407 | continue;
2408 | }
2409 | }
2410 |
2411 | List<Results> benchmarkResults = new List<Results>();
2412 | Benchmark b = new Benchmark();
2413 | b.ShortName = Path.GetFileName(s);
2414 | b.FullPath = s;
2415 | b.ExitCode = 100;
2416 |
2417 | Results noInlineResults = null;
2418 |
2419 | if (UseNoInlineModel)
2420 | {
2421 | noInlineResults = BuildNoInlineModel(r, x, b);
2422 | if (noInlineResults == null)
2423 | {
2424 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2425 | continue;
2426 | }
2427 | benchmarkResults.Add(noInlineResults);
2428 | }
2429 |
2430 | if (UseLegacyModel)
2431 | {
2432 | Results legacyResults = BuildLegacyModel(r, x, b);
2433 | if (legacyResults == null)
2434 | {
2435 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2436 | continue;
2437 | }
2438 | benchmarkResults.Add(legacyResults);
2439 | }
2440 |
2441 | if (UseEnhancedLegacyModel)
2442 | {
2443 | Results enhancedLegacyResults = BuildLegacyModel(r, x, b, true);
2444 | if (enhancedLegacyResults == null)
2445 | {
2446 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2447 | continue;
2448 | }
2449 | benchmarkResults.Add(enhancedLegacyResults);
2450 | }
2451 |
2452 | if (UseFullModel)
2453 | {
2454 | Results fullResults = BuildFullModel(r, x, b, noInlineResults);
2455 | if (fullResults == null)
2456 | {
2457 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2458 | continue;
2459 | }
2460 |
2461 | benchmarkResults.Add(fullResults);
2462 |
2463 | CallGraph g = new CallGraph(fullResults);
2464 | string fileName = b.ShortName + "-callgraph.dot";
2465 | g.DumpDot(Path.Combine(RESULTS_DIR, fileName));
2466 | }
2467 |
2468 | if (UseModelModel)
2469 | {
2470 | Results modelResults = BuildModelModel(r, x, b);
2471 | if (modelResults == null)
2472 | {
2473 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2474 | continue;
2475 | }
2476 | benchmarkResults.Add(modelResults);
2477 | }
2478 |
2479 | if (UseAltModel)
2480 | {
2481 | Results altModelResults = BuildModelModel(r, x, b, true);
2482 | if (altModelResults == null)
2483 | {
2484 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2485 | continue;
2486 | }
2487 | benchmarkResults.Add(altModelResults);
2488 | }
2489 |
2490 | if (UseSizeModel)
2491 | {
2492 | Results sizeResults = BuildSizeModel(r, x, b);
2493 | if (sizeResults == null)
2494 | {
2495 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2496 | continue;
2497 | }
2498 | benchmarkResults.Add(sizeResults);
2499 | }
2500 |
2501 | if (UseRandomModel)
2502 | {
2503 | uint seed = RandomSeed;
2504 | for (uint i = 0; i < RandomTries; i++, seed += RandomSeed)
2505 | {
2506 | Results randomResults = BuildRandomModel(r, x, b, seed);
2507 | if (randomResults == null)
2508 | {
2509 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName);
2510 | continue;
2511 | }
2512 | benchmarkResults.Add(randomResults);
2513 | }
2514 | }
2515 |
2516 | aggregateResults.Add(benchmarkResults);
2517 |
2518 | if (ExploreInlines)
2519 | {
2520 | var thingsToExplore = ExaminePerf(b, benchmarkResults);
2521 |
2522 | foreach (Exploration e in thingsToExplore)
2523 | {
2524 | e.Explore(dataModelFile, ref hasHeader, blacklist);
2525 | }
2526 |
2527 | dataModelFile.Flush();
2528 | }
2529 |
2530 | if (ClassifyInlines)
2531 | {
2532 | Console.WriteLine("Beginning classification");
2533 |
2534 | // Build map from inline data string to results that contain inlines with that string.
2535 | // For now we just track existence and not multiplicity...
2536 | Dictionary<string, HashSet<Results>> dataIndex = new Dictionary<string, HashSet<Results>>();
2537 | HashSet<Results> allResults = new HashSet<Results>();
2538 |
2539 | uint resultCount = 0;
2540 | uint inlineCount = 0;
2541 |
2542 | foreach (List<Results> rr in aggregateResults)
2543 | {
2544 | foreach (Results rrr in rr)
2545 | {
2546 | resultCount++;
2547 | allResults.Add(rrr);
2548 |
2549 | foreach (Method mm in rrr.InlineForest.Methods)
2550 | {
2551 | Queue<Inline> inlines = new Queue<Inline>();
2552 | foreach (Inline ii in mm.Inlines)
2553 | {
2554 | inlines.Enqueue(ii);
2555 | }
2556 |
2557 | while (inlines.Count > 0)
2558 | {
2559 | inlineCount++;
2560 | Inline iii = inlines.Dequeue();
2561 | HashSet<Results> zz = null;
2562 | if (!dataIndex.TryGetValue(iii.Data, out zz))
2563 | {
2564 | zz = new HashSet<Results>();
2565 | dataIndex[iii.Data] = zz;
2566 | }
2567 | zz.Add(rrr);
2568 |
2569 | foreach (Inline jjj in iii.Inlines)
2570 | {
2571 | inlines.Enqueue(jjj);
2572 | }
2573 | }
2574 | }
2575 | }
2576 | }
2577 |
2578 | Console.WriteLine("Found {0} inlines, {1} data vectors, {2} results", inlineCount, dataIndex.Count, resultCount);
2579 |
2580 | // Walk through the data vectors looking for ones that appear in some results but not all.
2581 | // These are the ones we can label as good/bad by comparing the results distributions for
2582 | // cases where they do and do not appear.
2583 | //
2584 | // NB including the various "model" estimates in the data vector may cause false dichotomies
2585 | // and artificially inflate the number of vectors; consider suppressing them (if the estimates
2586 | // are functions of the rest of the vector's values they are probably harmless).
2587 | uint useableData = 0;
2588 | uint confidentData = 0;
2589 | foreach (string ddd in dataIndex.Keys)
2590 | {
2591 | int appearances = dataIndex[ddd].Count;
2592 |
2593 | // By virtue of how we constructed the dataIndex each data vector should have at least
2594 | // one appearance, and no more than the total number of results.
2595 | if (appearances < 1 || appearances > resultCount)
2596 | {
2597 | Console.WriteLine("Unexpected number of appearances {0} for {1}", appearances, ddd);
2598 | continue;
2599 | }
2600 |
2601 | // The limits here are ad hoc: too few or too many appearances make it tough
2602 | // to infer the impact of an inline with this data vector. Arguably what matters
2603 | // is whether the number of appearances is enough to estimate the distributions
2604 | // of results with and without such inlines, so an absolute lower bound like
2605 | // 30 samples might be more appropriate.
2606 | double fraction = (double)appearances / resultCount;
2607 | if (fraction < 0.10 || fraction > 0.90)
2608 | {
2609 | continue;
2610 | }
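     | // For example, with 200 results a data vector must appear in at least 20 and at
     | // most 180 of them to pass this filter.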
2611 |
2612 | useableData++;
2613 |
2614 | // Now the idea is to estimate the impact of inlines with this data vector.
2615 | // We have two sets of results, both with plausible numbers of samples: ones
2616 | // where inlines with this data vector happened, and the other where the inlines
2617 | // did not happen.
2618 | //
2619 | // We want to turn this into some kind of label for the data vector, either as
2620 | // a "good" inline data vector or a "bad" one.
2621 | //
2622 | // Roughly speaking we want to compute the empirical distributions for the two
2623 | // sets of results, and see if the difference is statistically significant. If it is,
2624 | // then the magnitude of the difference can contribute to the label.
2625 | //
2626 | // Some challenges: in general the results will come from many different benchmarks
2627 | // and it probably doesn't make sense to aggregate across benchmarks and then do the
2628 | // scoring. So we probably want to go benchmark by benchmark. This means that
2629 | // there will be some benchmarks where the result sets are too small to draw meaningful
2630 | // statistics and those will need to be left out. So for each data vector we may end
2631 | // up with a varying amount of data, depending on whether that vector is common to
2632 | // many tests or specific to one or a few.
2633 | //
2634 | // For now assume all results can be used...
2635 | HashSet<Results> includedResults = dataIndex[ddd];
2636 | HashSet<Results> excludedResults = new HashSet<Results>(allResults.Except(includedResults));
2637 |
2638 | List<double> includedData = new List<double>();
2639 | List<double> excludedData = new List<double>();
2640 |
2641 | foreach (Results ir in includedResults)
2642 | {
2643 | foreach (string sir in ir.Performance.InstructionCount.Keys)
2644 | {
2645 | includedData.AddRange(ir.Performance.InstructionCount[sir]);
2646 | }
2647 | }
2648 |
2649 | foreach (Results er in excludedResults)
2650 | {
2651 | foreach (string ser in er.Performance.InstructionCount.Keys)
2652 | {
2653 | excludedData.AddRange(er.Performance.InstructionCount[ser]);
2654 | }
2655 | }
2656 |
2657 | double confidence = PerformanceData.Confidence(includedData, excludedData);
2658 |
2659 | // If we can't tell the two results set medians apart with any confidence, we can't infer
2660 | // the impact of inlines with this data vector.
2661 | if (confidence < 0.8)
2662 | {
2663 | continue;
2664 | }
2665 |
2666 | confidentData++;
2667 | }
2668 |
2669 | Console.WriteLine("{0} data vectors are usable; {1} with confidence", useableData, confidentData);
2670 | }
2671 | }
2672 |
2673 | // aggregateResults is a list of list of results
2674 | // outer list is one per "benchmark"
2675 | // inner list is one per model
2676 | // .. a benchmark may have multiple parts
2677 |
2678 | Console.WriteLine("---- Perf Results----");
2679 | Console.Write("{0,-42}", "Test");
2680 | int modelCount = 0;
2681 | foreach (Results rq in aggregateResults.First())
2682 | {
2683 | Console.Write(" {0,8}.T {0,8}.I", rq.Name);
2684 | modelCount += 1;
2685 | }
2686 | Console.WriteLine();
2687 |
2688 | int totalPartCount = 0;
2689 | foreach (List<Results> rr in aggregateResults)
2690 | {
2691 | Results f = rr.First();
2692 | totalPartCount += f.Performance.InstructionCount.Count;
2693 | }
2694 |
2695 | double[] timeLogSum = new double[modelCount];
2696 | double[] instrLogSum = new double[modelCount];
2697 |
2698 | foreach (List<Results> rr in aggregateResults)
2699 | {
2700 | ComparePerf(rr, timeLogSum, instrLogSum);
2701 | }
2702 |
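     | // The geometric mean across all benchmark parts is recovered from the accumulated
     | // log sums: geomean = exp((1/N) * sum(log x_i)). For instance, values 2 and 8
     | // give exp((ln 2 + ln 8) / 2) = 4.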
2703 | Console.Write("{0,-42}", "GeoMeans");
2704 | for (int j = 0; j < modelCount; j++)
2705 | {
2706 | double gmTime = Math.Exp(timeLogSum[j] / totalPartCount);
2707 | Console.Write(" {0,10:0.00}", gmTime);
2708 |
2709 | double gmInstr = Math.Exp(instrLogSum[j] / totalPartCount);
2710 | Console.Write(" {0,10:0.00}", gmInstr);
2711 | }
2712 |
2713 | return 100;
2714 | }
2715 |
2716 | void ComparePerf(List<Results> results, double[] timeLogSum, double[] instrLogSum)
2717 | {
2718 | Results baseline = results.First();
2719 |
2720 | foreach (string subBench in baseline.Performance.ExecutionTime.Keys)
2721 | {
2722 | Console.Write("{0,-42}", subBench);
2723 |
2724 | int modelNumber = 0;
2725 |
2726 | foreach (Results diff in results)
2727 | {
2728 | double diffTime = PerformanceData.Average(diff.Performance.ExecutionTime[subBench]);
2729 | Console.Write(" {0,10:0.00}", diffTime);
2730 |
2731 | double diffInst = PerformanceData.Average(diff.Performance.InstructionCount[subBench]);
2732 | Console.Write(" {0,10:0.00}", diffInst / (1000 * 1000));
2733 |
2734 | timeLogSum[modelNumber] += Math.Log(diffTime);
2735 | instrLogSum[modelNumber] += Math.Log(diffInst / (1000 * 1000));
2736 |
2737 | modelNumber++;
2738 | }
2739 | Console.WriteLine();
2740 | }
2741 | }
2742 |
2743 | List<Exploration> ExaminePerf(Benchmark b, List<Results> results)
2744 | {
2745 | Results baseline = results.First();
2746 | Console.WriteLine("---- Perf Examination----");
2747 | List<Exploration> interestingResults = new List<Exploration>();
2748 |
2749 | foreach (Results diff in results)
2750 | {
2751 | // No need to investigate the baseline
2752 | if (diff == baseline)
2753 | {
2754 | continue;
2755 | }
2756 |
2757 | // See if any of the sub-bench results are both significantly different
2758 | // than the baseline and measured with high confidence.
2759 | bool added = false;
2760 |
2761 | foreach (string subBench in baseline.Performance.InstructionCount.Keys)
2762 | {
2763 | List<double> baseData = baseline.Performance.InstructionCount[subBench];
2764 | double baseAvg = PerformanceData.Average(baseData);
2765 | List<double> diffData = diff.Performance.InstructionCount[subBench];
2766 | double diffAvg = PerformanceData.Average(diffData);
2767 | double confidence = PerformanceData.Confidence(baseData, diffData);
2768 | double avgDiff = diffAvg - baseAvg;
2769 | double pctDiff = 100 * avgDiff / baseAvg;
2770 | double interestingDiff = 1;
2771 | double confidentDiff = 0.9;
2772 | bool interesting = Math.Abs(pctDiff) > interestingDiff;
2773 | bool confident = confidence > confidentDiff;
2774 | string interestVerb = interesting ? "is" : "is not";
2775 | string confidentVerb = confident ? "and is" : "and is not";
2776 | bool show = interesting && confident;
2777 |
2778 | if (!added & interesting && confident)
2779 | {
2780 | Exploration e = new Exploration();
2781 | e.baseResults = baseline;
2782 | e.endResults = diff;
2783 | e.benchmark = b;
2784 | interestingResults.Add(e);
2785 | added = true;
2786 |
2787 | Console.WriteLine(
2788 | "$$$ {0} diff {1} in instructions between {2} ({3}) and {4} ({5}) "
2789 | + "{6} interesting {7:0.00}% {8} significant p={9:0.00}",
2790 | subBench, avgDiff / (1000 * 1000),
2791 | baseline.Name, baseAvg / (1000 * 1000),
2792 | diff.Name, diffAvg / (1000 * 1000),
2793 | interestVerb, pctDiff,
2794 | confidentVerb, confidence);
2795 |
2796 | break;
2797 | }
2798 | }
2799 |
2800 | if (!added)
2801 | {
2802 | Console.WriteLine("$$$ {0} performance diff from {1} was not significant and confident", b.ShortName, diff.Name);
2803 | }
2804 | }
2805 |
2806 | return interestingResults;
2807 | }
2808 |
2809 | static void AnnotateCallCounts(Results ccResults, Results results)
2810 | {
2811 | // Parse results back and annotate base method set
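     | // Each line of the call-count log is expected to be "token,hash,count", with token
     | // and hash in hex and count in decimal, matching the parsing below.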
2812 | using (StreamReader callCountStream = File.OpenText(ccResults.LogFile))
2813 | {
2814 | string callCountLine = callCountStream.ReadLine();
2815 | while (callCountLine != null)
2816 | {
2817 | string[] callCountFields = callCountLine.Split(new char[] { ',' });
2818 | if (callCountFields.Length == 3)
2819 | {
2820 | uint token = UInt32.Parse(callCountFields[0], System.Globalization.NumberStyles.HexNumber);
2821 | uint hash = UInt32.Parse(callCountFields[1], System.Globalization.NumberStyles.HexNumber);
2822 | ulong count = UInt64.Parse(callCountFields[2]);
2823 |
2824 | MethodId id = new MethodId();
2825 | id.Hash = hash;
2826 | id.Token = token;
2827 |
2828 | if (results.Methods.ContainsKey(id))
2829 | {
2830 | Method m = results.Methods[id];
2831 | m.CallCount = count;
2832 | Console.WriteLine("{0} called {1} times", m.Name, count);
2833 | }
2834 | else
2835 | {
2836 | Console.WriteLine("{0:X8} {1:X8} called {2} times, but is not in base set?", token, hash, count);
2837 | }
2838 | }
2839 | callCountLine = callCountStream.ReadLine();
2840 | }
2841 | }
2842 | }
2843 | }
2844 | }
2845 |
--------------------------------------------------------------------------------