├── global.json
├── src
│   └── PerformanceExplorer
│       ├── project.json
│       ├── Properties
│       │   └── AssemblyInfo.cs
│       ├── PerformanceExplorer.xproj
│       └── Program.cs
├── PerformanceExplorer.sln
├── .gitattributes
├── scripts
│   ├── ModelPolicyV1Size.R.txt
│   └── ModelPolicyV1Perf.R.txt
├── .gitignore
└── notes
    └── notes-aug-2016.md
/global.json: -------------------------------------------------------------------------------- 1 | { 2 | "projects": [ "src", "test" ] 3 | } 4 | -------------------------------------------------------------------------------- /src/PerformanceExplorer/project.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.0.0-*", 3 | "buildOptions": { 4 | "emitEntryPoint": true 5 | }, 6 | 7 | "dependencies": { 8 | "Microsoft.NETCore.App": { 9 | "type": "platform", 10 | "version": "1.0.0" 11 | }, 12 | "System.Xml.XmlSerializer": "4.0.11" 13 | }, 14 | 15 | "frameworks": { 16 | "netcoreapp1.0": {} 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/PerformanceExplorer/Properties/AssemblyInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.InteropServices; 4 | 5 | // General Information about an assembly is controlled through the following 6 | // set of attributes. Change these attribute values to modify the information 7 | // associated with an assembly. 8 | [assembly: AssemblyConfiguration("")] 9 | [assembly: AssemblyCompany("")] 10 | [assembly: AssemblyProduct("PerformanceExplorer")] 11 | [assembly: AssemblyTrademark("")] 12 | 13 | // Setting ComVisible to false makes the types in this assembly not visible 14 | // to COM components. If you need to access a type in this assembly from 15 | // COM, set the ComVisible attribute to true on that type.
16 | [assembly: ComVisible(false)] 17 | 18 | // The following GUID is for the ID of the typelib if this project is exposed to COM 19 | [assembly: Guid("20b8742b-3910-40d1-9dd7-a5e3db9dd066")] 20 | -------------------------------------------------------------------------------- /src/PerformanceExplorer/PerformanceExplorer.xproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14.0 5 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) 6 | 7 | 8 | 9 | 10 | 20b8742b-3910-40d1-9dd7-a5e3db9dd066 11 | PerformanceExplorer 12 | ..\..\artifacts\obj\$(MSBuildProjectName) 13 | ..\..\artifacts\ 14 | v4.5.2 15 | 16 | 17 | 18 | 2.0 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /PerformanceExplorer.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25123.0 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{95943EFC-A666-4605-8FF3-09C5BB169ABE}" 7 | EndProject 8 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{D05E4B15-0977-42AA-9A3B-29DFC9A40523}" 9 | ProjectSection(SolutionItems) = preProject 10 | global.json = global.json 11 | EndProjectSection 12 | EndProject 13 | Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "PerformanceExplorer", "src\PerformanceExplorer\PerformanceExplorer.xproj", "{20B8742B-3910-40D1-9DD7-A5E3DB9DD066}" 14 | EndProject 15 | Global 16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 17 | Debug|Any CPU = Debug|Any CPU 18 | Release|Any CPU = Release|Any CPU 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 22 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Debug|Any CPU.Build.0 = Debug|Any CPU 23 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Release|Any CPU.ActiveCfg = Release|Any CPU 24 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066}.Release|Any CPU.Build.0 = Release|Any CPU 25 | EndGlobalSection 26 | GlobalSection(SolutionProperties) = preSolution 27 | HideSolutionNode = FALSE 28 | EndGlobalSection 29 | GlobalSection(NestedProjects) = preSolution 30 | {20B8742B-3910-40D1-9DD7-A5E3DB9DD066} = {95943EFC-A666-4605-8FF3-09C5BB169ABE} 31 | EndGlobalSection 32 | EndGlobal 33 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /scripts/ModelPolicyV1Size.R.txt: -------------------------------------------------------------------------------- 1 | ## Read in raw data set 2 | 3 | InlineData.raw <- read.csv("c:\\repos\\inlinedata\\mscorlib.data.model-rel.log.parsed", header=TRUE, comment.char="") 4 | 5 | ## Identify factors and logicals 6 | 7 | #InlineData.raw$Arg0Type <- as.factor(InlineData.raw$Arg0Type) 8 | #InlineData.raw$Arg1Type <- as.factor(InlineData.raw$Arg1Type) 9 | #InlineData.raw$Arg2Type <- as.factor(InlineData.raw$Arg2Type) 10 | #InlineData.raw$Arg3Type <- as.factor(InlineData.raw$Arg3Type) 11 | #InlineData.raw$Arg4Type <- as.factor(InlineData.raw$Arg4Type) 12 | #InlineData.raw$Arg5Type <- as.factor(InlineData.raw$Arg5Type) 13 | #InlineData.raw$ReturnType <- as.factor(InlineData.raw$ReturnType) 14 | #InlineData.raw$CallsiteFrequency <- as.factor(InlineData.raw$CallsiteFrequency) 15 | InlineData.raw$IsForceInline <- as.logical(InlineData.raw$IsForceInline) 16 | InlineData.raw$IsInstanceCtor <- as.logical(InlineData.raw$IsInstanceCtor) 17 | InlineData.raw$IsFromPromotableValueClass <- as.logical(InlineData.raw$IsFromPromotableValueClass) 18 | InlineData.raw$HasSimd <- as.logical(InlineData.raw$HasSimd) 19 | InlineData.raw$LooksLikeWrapperMethod <- as.logical(InlineData.raw$LooksLikeWrapperMethod) 20 | InlineData.raw$ArgFeedsConstantTest <- as.logical(InlineData.raw$ArgFeedsConstantTest) 21 | InlineData.raw$IsMostlyLoadStore <- as.logical(InlineData.raw$IsMostlyLoadStore) 22 | InlineData.raw$ArgFeedsRangeCheck <- as.logical(InlineData.raw$ArgFeedsRangeCheck) 23 | InlineData.raw$ConstantFeedsConstantTest <- as.logical(InlineData.raw$ConstantFeedsConstantTest) 24 | 25 | ## Remove Version0 (roots) and strip non-predictive columns 26 | 27 | Col.N <- c("Method", "Version", "JitTime", "HotSize", "ColdSize") 28 | InlineData <- InlineData.raw[InlineData.raw$Version > 0, setdiff(names(InlineData.raw), Col.N)] 29 | InlineDataV0r <- InlineData.raw[InlineData.raw$Version == 0, ] 30 | 31 | ## Produce frame with just predictive columns and the result we want to estimate 32 | 33 | Col.Z <- c("JitTimeDelta", "ColdSizeDelta", "HotSizeDelta", "ModelCodeSizeEstimate") 34 | Col.XY <- setdiff(names(InlineData), Col.Z) 35 | Col.Y <- c("TotalSizeDelta") 36 | Col.X <- setdiff(Col.XY, Col.Y) 37 | 38 | ## Examine existing models 39 | 40 | Model.P <- InlineData$ModelCodeSizeEstimate/10 41 | Actual <- InlineData$TotalSizeDelta 42 | 43 | ## Build new models 44 | ## using glmnet for modelling 45 | 46 | ## install.packages("glmnet") 47 | library(glmnet) 48 | InlineData.XY <- InlineData[, Col.XY] 49 | set.seed(1001) 50 | 51 | FullModel.M <- model.matrix(TotalSizeDelta ~ ., InlineData.XY) 52 | FullModel <- cv.glmnet(FullModel.M, Actual) 53 | FullModel.P <- predict(FullModel, FullModel.M, s="lambda.1se") 54 | FullModel.C <- coef(FullModel, s="lambda.1se") 55 | 56 | Full.S <- sum(InlineData$TotalSizeDelta^ 2) 57 | FullModel.SE <- sum((FullModel.P - Actual)^2) 58 | FullModel.MSE <- FullModel.SE / nrow(InlineData) 59 | FullModel.AE <- sum(abs(FullModel.P - Actual)) 60 | FullModel.MAE <- 
FullModel.AE / nrow(InlineData) 61 | FullModel.R <- 1 - FullModel.SE / Full.S 62 | -------------------------------------------------------------------------------- /scripts/ModelPolicyV1Perf.R.txt: -------------------------------------------------------------------------------- 1 | ## Used for initial ModelPolicy performance model 2 | 3 | D <- read.csv("c:\\repos\\performanceexplorer\\data\\all-benchmark-v3-a13.csv") 4 | 5 | ## Identify factors and logicals 6 | 7 | D$Arg0Type <- as.factor(D$Arg0Type) 8 | D$Arg1Type <- as.factor(D$Arg1Type) 9 | D$Arg2Type <- as.factor(D$Arg2Type) 10 | D$Arg3Type <- as.factor(D$Arg3Type) 11 | D$Arg4Type <- as.factor(D$Arg4Type) 12 | D$Arg5Type <- as.factor(D$Arg5Type) 13 | D$ReturnType <- as.factor(D$ReturnType) 14 | D$CallsiteFrequency <- as.factor(D$CallsiteFrequency) 15 | D$IsForceInline <- as.logical(D$IsForceInline) 16 | D$IsInstanceCtor <- as.logical(D$IsInstanceCtor) 17 | D$IsFromPromotableValueClass <- as.logical(D$IsFromPromotableValueClass) 18 | D$HasSimd <- as.logical(D$HasSimd) 19 | D$LooksLikeWrapperMethod <- as.logical(D$LooksLikeWrapperMethod) 20 | D$ArgFeedsConstantTest <- as.logical(D$ArgFeedsConstantTest) 21 | D$IsMostlyLoadStore <- as.logical(D$IsMostlyLoadStore) 22 | D$ArgFeedsRangeCheck <- as.logical(D$ArgFeedsRangeCheck) 23 | D$ConstantFeedsConstantTest <- as.logical(D$ConstantFeedsConstantTest) 24 | 25 | ## Filter to observations with at least 0.8 confidence 26 | ## Filter to observations where the call happened at least 1000x 27 | ## Filter to cases where per call impact is within -100, 100 28 | 29 | Dcq <- D[(D$Confidence > 0.8) & (D$CallDelta > 1000) & (abs(D$InstRetiredPerCallDelta) < 100), ] 30 | 31 | ## (have 210 entries for V3 data set) 32 | 33 | ## Identify non-predictive columns so we can strip them 34 | 35 | Col.NP <- c("Benchmark", "SubBenchmark", "Method", "Version", "JitTime", "HotSize", "ColdSize", "Depth") 36 | Study <- Dcq[Dcq$Version > 0, setdiff(names(Dcq), Col.NP)] 37 | 38 | ## Produce frame with just predictive columns and the result we want to estimate 39 | 40 | Col.YY <- c("HotSizeDelta","ColdSizeDelta","JitTimeDelta","InstRetiredDelta", 41 | "InstRetiredPct","CallDelta","InstRetiredPerCallDelta","Confidence") 42 | Col.Y <- c("InstRetiredPerCallDelta") 43 | Col.Yx <- setdiff(Col.YY, Col.Y) 44 | 45 | Col.XY <- setdiff(names(Study), Col.Yx) 46 | Col.X <- setdiff(Col.XY, Col.Y) 47 | 48 | Study.XY <- Study[, Col.XY] 49 | Study.X <- Study[, Col.X] 50 | Actual <- Study[, Col.Y] 51 | 52 | ## install.packages("glmnet") 53 | 54 | library(glmnet) 55 | 56 | ## Model dependent var vs observations 57 | 58 | set.seed(1001) 59 | F <- paste(Col.Y[1], " ~ .") 60 | FullModel.M <- model.matrix(as.formula(F), Study.XY) 61 | FullModel <- cv.glmnet(FullModel.M, Actual) 62 | 63 | # run new predictions 64 | 65 | FullModel.P <- predict(FullModel, FullModel.M, s="lambda.min") 66 | FullModel.C <- predict(FullModel, FullModel.M, s="lambda.min", type="coefficients") 67 | 68 | P <- data.frame(Actual, FullModel.P, FullModel.P - Actual) 69 | names(P) <- c("Actual", "FullModel", "FullModel.res") 70 | 71 | ## Scoring 72 | 73 | Full.S <- sum(Actual^ 2) 74 | FullModel.SE <- sum((FullModel.P - Actual)^2) 75 | FullModel.MSE <- FullModel.SE / nrow(Study) 76 | FullModel.AE <- sum(abs(FullModel.P - Actual)) 77 | FullModel.MAE <- FullModel.AE / nrow(Study) 78 | FullModel.R <- 1 - FullModel.SE / Full.S 79 | 80 | ## Plotting 81 | 82 | library(ggplot2) 83 | 84 | Plot.F <- ggplot(P, aes(x=FullModel.P, y=Actual)) + geom_boxplot(aes(group=cut_width(FullModel, 
1)), outlier.color="red") + coord_cartesian(ylim=c(-40,40), xlim=c(-40,40)) + geom_abline(slope=1, color="blue") 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Xx]64/ 19 | [Xx]86/ 20 | [Bb]uild/ 21 | bld/ 22 | [Bb]in/ 23 | [Oo]bj/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | 85 | # Visual Studio profiler 86 | *.psess 87 | *.vsp 88 | *.vspx 89 | *.sap 90 | 91 | # TFS 2012 Local Workspace 92 | $tf/ 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | *.DotSettings.user 101 | 102 | # JustCode is a .NET coding add-in 103 | .JustCode 104 | 105 | # TeamCity is a build add-in 106 | _TeamCity* 107 | 108 | # DotCover is a Code Coverage Tool 109 | *.dotCover 110 | 111 | # NCrunch 112 | _NCrunch_* 113 | .*crunch*.local.xml 114 | nCrunchTemp_* 115 | 116 | # MightyMoose 117 | *.mm.* 118 | AutoTest.Net/ 119 | 120 | # Web workbench (sass) 121 | .sass-cache/ 122 | 123 | # Installshield output folder 124 | [Ee]xpress/ 125 | 126 | # DocProject is a documentation generator add-in 127 | DocProject/buildhelp/ 128 | DocProject/Help/*.HxT 129 | DocProject/Help/*.HxC 130 | DocProject/Help/*.hhc 131 | DocProject/Help/*.hhk 132 | DocProject/Help/*.hhp 133 | DocProject/Help/Html2 134 | DocProject/Help/html 135 | 136 | # Click-Once directory 137 | publish/ 138 | 139 | # Publish Web Output 140 | *.[Pp]ublish.xml 141 | *.azurePubxml 142 | 143 | # TODO: Un-comment the next line if you do not want to checkin 144 | # your web deploy settings because they may include unencrypted 145 | # passwords 146 | #*.pubxml 147 | *.publishproj 148 | 149 | # NuGet Packages 150 | *.nupkg 151 | # The packages folder can be ignored because of Package Restore 152 | **/packages/* 153 | # except build/, which is used as an MSBuild target. 
154 | !**/packages/build/ 155 | # Uncomment if necessary however generally it will be regenerated when needed 156 | #!**/packages/repositories.config 157 | # NuGet v3's project.json files produces more ignoreable files 158 | *.nuget.props 159 | *.nuget.targets 160 | 161 | # Microsoft Azure Build Output 162 | csx/ 163 | *.build.csdef 164 | 165 | # Microsoft Azure Emulator 166 | ecf/ 167 | rcf/ 168 | 169 | # Microsoft Azure ApplicationInsights config file 170 | ApplicationInsights.config 171 | 172 | # Windows Store app package directory 173 | AppPackages/ 174 | BundleArtifacts/ 175 | 176 | # Visual Studio cache files 177 | # files ending in .cache can be ignored 178 | *.[Cc]ache 179 | # but keep track of directories ending in .cache 180 | !*.[Cc]ache/ 181 | 182 | # Others 183 | ClientBin/ 184 | [Ss]tyle[Cc]op.* 185 | ~$* 186 | *~ 187 | *.dbmdl 188 | *.dbproj.schemaview 189 | *.pfx 190 | *.publishsettings 191 | node_modules/ 192 | orleans.codegen.cs 193 | 194 | # RIA/Silverlight projects 195 | Generated_Code/ 196 | 197 | # Backup & report files from converting an old project file 198 | # to a newer Visual Studio version. Backup files are not needed, 199 | # because we have git ;-) 200 | _UpgradeReport_Files/ 201 | Backup*/ 202 | UpgradeLog*.XML 203 | UpgradeLog*.htm 204 | 205 | # SQL Server files 206 | *.mdf 207 | *.ldf 208 | 209 | # Business Intelligence projects 210 | *.rdl.data 211 | *.bim.layout 212 | *.bim_*.settings 213 | 214 | # Microsoft Fakes 215 | FakesAssemblies/ 216 | 217 | # GhostDoc plugin setting file 218 | *.GhostDoc.xml 219 | 220 | # Node.js Tools for Visual Studio 221 | .ntvs_analysis.dat 222 | 223 | # Visual Studio 6 build log 224 | *.plg 225 | 226 | # Visual Studio 6 workspace options file 227 | *.opt 228 | 229 | # Visual Studio LightSwitch build output 230 | **/*.HTMLClient/GeneratedArtifacts 231 | **/*.DesktopClient/GeneratedArtifacts 232 | **/*.DesktopClient/ModelManifest.xml 233 | **/*.Server/GeneratedArtifacts 234 | **/*.Server/ModelManifest.xml 235 | _Pvt_Extensions 236 | 237 | # LightSwitch generated files 238 | GeneratedArtifacts/ 239 | ModelManifest.xml 240 | 241 | # Paket dependency manager 242 | .paket/paket.exe 243 | 244 | # FAKE - F# Make 245 | .fake/ -------------------------------------------------------------------------------- /notes/notes-aug-2016.md: -------------------------------------------------------------------------------- 1 | # Some Notes on Using Machine Learning to Develop Inlining Heuristics 2 | 3 | August 2016 4 | 5 | ## Overview 6 | 7 | This document describes the work done from roughly February to August 8 | 2016 to use machine learning techniques to develop improved inlining 9 | heuristics for RyuJit. 10 | 11 | Based on this work, RyuJit now includes an inlining heuristic that is 12 | based on machine learning -- the ModelPolicy. This policy can be 13 | enabled by setting COMPlus_JitInlinePolicyModel=1 in environments 14 | where the jit generates code. Measurements on various internal 15 | benchmarks have shown this new policy gives roughly 2% geomean CQ 16 | improvement, 2% geomean CS reduction, and 1% throughput reduction. 17 | Measurements on "realistic" applications has just begun and the 18 | initial results are not as encouraging, but we are still optimistic 19 | that with some more work, the ModelPolicy or something quite similar 20 | can be enabled as the default policy going forward. 21 | 22 | A number of new measurement techniques were developed to support the 23 | modelling process. 
Even so, the models built so far are not entirely 24 | satisfactory. There are significant challenges and open questions 25 | in many areas of the work. 26 | 27 | The remainder of this aims to describe the work that has been done, 28 | present the challenges that remain, and suggest avenues for further 29 | investigation. Note this is still a work in progress and some aspects 30 | of it are incomplete. 31 | 32 | ## Background 33 | 34 | The desirability of a machine-learning approach to the development of 35 | inlining heuristics was based on both past experience and some 36 | promising results from the literature. 37 | 38 | Past experience in manual development of inlining heuristics has shown 39 | that it is a complex and challenging endeavor. Typically, the heuristic 40 | developer must carefully study some number of examples to try and 41 | discern what factors lead to "good" inlines. These factors are then 42 | coded as heuristics, and combined via some ad-hoc method (say, via 43 | weights) to produce an overall figure of merit. A large number of 44 | rounds of experimental tuning on benchmarks are then used to select 45 | weight values. 46 | 47 | Failure of the heuristic to perform on certain benchmarks can and 48 | perhaps should lead to refining existing heuristics or the development 49 | of new heuristics, or to the improvement of downstream optimization 50 | abilities in the compiler, but often instead is handled by adjusting 51 | the various weights to try and obtain the desired outcome. There 52 | is inevitable bias in the developer's choice of factors and the expert 53 | analysis required to gain insight only scales to relatively small 54 | numbers of examples. Rigorous analysis to cross-check the importance 55 | of factors is not always done and performance of the model over time 56 | is typically not measured. This can lead to misleading confidence in 57 | the heuristics, since benchmark program never change, while real 58 | applications evolve over time, sometimes quite rapidly. 59 | 60 | The recent literature describes some successes in using machine 61 | learning to create good inlining heuristics. One example is [Automatic 62 | Construction of Inlining Heuristics using Machine 63 | Learning](http://dl.acm.org/citation.cfm?id=2495914) by Kulkarni, 64 | Cavazos, Wimmer, and Simon. Here Kulkarni et. al. treat inline 65 | profitability as an unsupervised learning problem, and create a 66 | well-performing heuristic black box (neural network) using 67 | evolutionary programming techniques. They then turn around and use 68 | this black box as an oracle to label inline instances, and from this 69 | guide a supervised machine learning algorithm to produce a decision 70 | tree that expresses the profitability heuristics in terms sensible to 71 | the compiler writer. 72 | 73 | It was hoped that machine learning techniques would lead to decent 74 | models that could be created relatively quickly, so that new models 75 | could be developed as the jit was ported to new architectures and new 76 | operating systems. Also as the capabilities of the jit or runtime were 77 | extended (say by improving register allocation or optimization) it 78 | would be possible to quickly re-tune the inliner to take best 79 | advantage of new capabilities, and/or to validate continued good 80 | behavior as key applications evolve. These tasks remain within the 81 | scope of our ambition, though we have not yet proven that such things 82 | are possible. 
83 | 84 | Our inability (described in more detail below) to easily derive good 85 | performance models based on machine learning is most likely an 86 | indictment of some aspects of our overall process, though it is also 87 | possible that our difficulties simply reflect the degree of challenge 88 | inherent in improving heuristics in a mature and complex system with 89 | various realistic constraints. 90 | 91 | This [initial design 92 | note](https://github.com/dotnet/coreclr/blob/master/Documentation/design-docs/inlining-plans.md) 93 | -- describing the early views on the project -- may be of interest. 94 | 95 | ## Motivation 96 | 97 | The primary motivation for working on inlining was the potential for 98 | improved code quality (CQ) at similar or improved levels of code size 99 | (CS) and jit throughput (TP). 100 | 101 | This potential had been observed in many manual examples and bug 102 | reports, as well as experiments to simply make the inliner more 103 | aggressive. 104 | 105 | Nominal upside in CQ, given the current optimization capabilities of 106 | the jit, is in the range of 3-4% (geomean) across a variety of 107 | programs. As is always the case with such measures, the underlying 108 | distribution is broad, with some programs speeding up by substantially 109 | more, many remaining about the same, and a few slowing down. 110 | 111 | CQ generally increases steadily in aggregate with more inlining. For 112 | reasonable amounts of inlining, cases where inlining hurts performance 113 | are fairly rare. At high enough levels of inlining there may be 114 | adverse interactions as optimizer thresholds are tripped, and 115 | eventually the impact of the larger code is felt as contention for the 116 | limited physical memory resources of the host machine. 117 | 118 | CS (and TP) are often thought of as constraint or penalty terms rather 119 | than as optimization objectives. It is clear from experiments that 120 | inlining of suitably small methods will decrease CS and TP, so 121 | obtaining the "minimal" value for these metrics requires some amount 122 | of inlining. Too much inlining will increase CS without providing 123 | improvements in CQ. 124 | 125 | So, for a given level of CQ, there is a range of CS values that can 126 | obtain that CQ. The "ideal" level is then the minimal CS needed; 127 | roughly speaking, there is a CS/CQ tradeoff region with a bounding 128 | curve at the minimum CQ level. The locus and shape of this curve are 129 | unknown and must be discovered empirically. The curve will also vary 130 | considerably depending on the benchmarks. Ensemble measures of 131 | performance are needed, and (as noted above) when comparing 132 | two well-performing heuristics, there will always be examples 133 | where one heuristic outperforms the other. 134 | 135 | Various system design goals and runtime capabilities (eg desire for 136 | fast startup or good steady-state performance or blend of both, 137 | ability to dynamically re-optimize) dictate which region of the curve 138 | is most desirable. The challenge, then, is to develop an inlining 139 | heuristic that picks out an appropriate point that lies on or near the 140 | tradeoff curve given the design goals. The shape of the tradeoff 141 | curve is also of interest. 142 | 143 | In our case the ambition is to build a new inlining heuristic that can 144 | increase CQ to capture as much of the headroom as practical, while 145 | decreasing CS and TP.
146 | 147 | ## History 148 | 149 | The work done so far proceeded in roughly 4 stages, in order: 150 | refactoring, size measurements and modelling, time measurements and 151 | modelling, and speed measurements and modelling. These are described 152 | briefly below and in more detail in subsequent sections. 153 | 154 | Refactoring was done to enable the jit to have multiple inlining 155 | policies that could exist side by side. For compatibility reasons it 156 | was desirable to preserve the existing (legacy) behavior, and allowing 157 | other policies side by side facilitates experimentation. The legacy 158 | inliner's decision making was intertwined with the observation 159 | process, so it was necessary to separate these out to decouple policy 160 | from observation. 161 | 162 | Size impact of inlining was measured using the "crossgen" feature of 163 | the CLR. Here the jit is asked to generate code for most of the 164 | methods in an assembly ahead of time. The size impact of each inline 165 | was recorded along with the various observational values that were 166 | available to feed into a heuristic. This data fed into a size model 167 | that produced a size estimating heuristic. The models developed so 168 | far seem reasonably accurate, with an R^2 value of around 0.6. 169 | 170 | The time impact of inlining was measured by capturing CPU cycles 171 | expended in the jit between the time inlining had finished and the 172 | time the native code was generated (notably, this omits the time spent 173 | inlining, which is more difficult to measure). Modelling showed this 174 | time was closely related to the overall emitted size of the method, 175 | which was shown to be fairly reliably estimated by the sum of an 176 | initial time estimate plus the size impact of each successive inline. 177 | 178 | The performance impact of inlines was measured by enabling hardware 179 | performance monitoring counters to capture the number of instructions 180 | retired as the jitted code ran. Inlines were measured in isolation, 181 | one by one, and the difference in instructions retired was attributed 182 | to the inline. This data along with observations formed the data set 183 | that feeds the speed model. Unfortunately, this performance data has 184 | proven to be difficult to model accurately. 185 | 186 | ## Constraints and Assumptions 187 | 188 | The CoreCLR currently gives its jit one opportunity to generate code 189 | for a method. The time it takes the jit to generate native code is a 190 | concern (eg it potentially impacts application start-up time), and 191 | given the general size-time relationship, this limits the ability of 192 | the jit to inline aggressively or to perform deep analysis in an 193 | attempt to find an optimal set of inlines for a method. The jit also 194 | has very limited ability to convey knowledge from one invocation to 195 | the next, so analysis costs cannot effectively be amortized. 196 | 197 | Currently the jit walks its IR in linear fashion deciding whether to 198 | inline each time it sees a candidate. If the decision is *yes* then the 199 | inlined code is spliced in place of the call and (because of the order 200 | of the walk) immediately scanned for inlining candidates. Thus the 201 | inlining is performed "depth first" and is done without much knowledge 202 | of the number or location of other candidates in the code 203 | stream.
Inlining is done very early on, before any significant analysis 204 | has been done to the IR -- there is a flow graph, but loop nesting, 205 | dataflow, and profile estimates are generally not available. 206 | 207 | Thus the heuristic we have in mind is one that assesses each inline 208 | independent of any assessments done before. Factors visible at the 209 | immediate call site and some general information about the accumulated 210 | IR can be used to influence decisions, so it's possible given a method 211 | A with two callsites to B that one call to B gets inlined and the 212 | other doesn't. 213 | 214 | ## Overall Approach to Heuristic Creation 215 | 216 | The work history above reflects the initial proposal for heuristic 217 | creation -- first build size and speed models, and then combine those 218 | to create a heuristic. The general idea was to have an explicit 219 | size/speed tradeoff made per inline. The idealized heuristic is: 220 | ``` 221 | if (SizeDelta <= 0) { inline; } 222 | else if (SpeedDelta > alpha * SizeDelta) { inline; } 223 | ``` 224 | where here SizeDelta represents the increase in code size caused by 225 | the inline, SpeedDelta is the decrease in instructions executed, and 226 | alpha is a tradeoff factor. So good inlines either decrease size, or 227 | justify their size increase with a speed decrease, and alpha 228 | describes how willing we are to trade speed for size. 229 | 230 | This is roughly the heuristic implemented by the ModelPolicy. 231 | SizeDelta and SpeedDelta are computed by models derived from machine 232 | learning, and alpha is manually chosen by "tuning" to give the desired 233 | tradeoff. 234 | 235 | However, the implemented model has an additional parameter, one whose 236 | presence reflects one of the key challenges present in this work. The 237 | size model has natural units of bytes of code (or instructions, if 238 | they're fixed size). Size impacts from inlining are typically small, 239 | say in the range of a few hundred bytes one way or the other. But the 240 | speed impact of an inline can vary over a much wider range. If we 241 | measure the actual change in instructions retired on a benchmark given 242 | one inline difference, the value may vary from -1e9 to 1e9 with many 243 | values clustered closely around zero. 244 | 245 | In an attempt to pull these values into a more manageable range for 246 | modelling, the value provided by the model is instructions retired per 247 | call to the callee. This needs to be multiplied by a "call site 248 | weight" beta to reflect the importance of the call site to the caller, 249 | and further by some "root method weight" to reflect the importance of 250 | the root method to the overall benchmark. We currently use ad-hoc 251 | methods to estimate beta and ignore the root method weight, so the 252 | full heuristic is: 253 | ``` 254 | if (SizeDelta <= 0) { inline; } 255 | else if (beta * PerCallSpeedDelta > alpha * SizeDelta) { inline; } 256 | ``` 257 | Here beta can vary depending on call site, and alpha is the fixed 258 | size-speed tradeoff. 259 | 260 | One might legitimately question this model, even if all of the 261 | quantities could be estimated perfectly. More on this subsequently. 262 | 263 | ## Some Terminology 264 | 265 | An *inline tree* is the set of inlines done into a method. The root 266 | method is the initial method; the top level inlines are the 267 | descendants of the root, and so on.
268 | 269 | An *inline forest* is the set of inline trees that are in effect for a 270 | benchmark run. There is one inline tree for each method executed. 271 | 272 | An inline tree X is a *subtree* of an inline tree Y if Y contains all 273 | the inlines in X and possibly more. A tree X is a *proper parent* of Y 274 | if Y contains just one extra inline. 275 | 276 | ## Size Modelling and Measurements 277 | 278 | ### Size Measurements 279 | 280 | To measure size, the legacy inliner was modified so that in each 281 | method, it would stop inlining after some number of inlines, K, where 282 | K could be specified externally. The jit would then generate native 283 | code for each method and measure the methods' native code size. Since 284 | the inlining decisions made in each method jitted are independent, 285 | data from many inlining instances can be collected in one run of 286 | crossgen, potentially one per "root" method. 287 | 288 | The overall process ran from K = 0 up to Kmax. For each run the 289 | size of the method was dumped to a file along with various 290 | observational values that were available to feed into a heuristic, and 291 | various metadata used to identify the root method. For each row of 292 | data, the value of K was recorded as the "version" of the inlining 293 | experiment. 294 | 295 | Given the raw data, the native size impact of each inline can then be 296 | determined by a post-processing pass: for each method and each inline 297 | into the method, the size change is found by subtracting the method 298 | size for case where J-1 inlines were performed from the size when J 299 | inlines were performed. Note not all methods will be able to perform 300 | the full set of K inlines, so as K increases, the number of methods 301 | that do more inlines decrease. So if there are initially N root 302 | methods the total number of rows of inline data with a given version 303 | decreases as the version increases. 304 | 305 | Reliably identifying the root across runs proved nontrivial, since the 306 | main values used as identifying keys (token and hash) were not 307 | sufficiently unique. These might come from additional stub methods 308 | created by the crossgen process or perhaps from multiply instantiated 309 | generic methods. Post-processing would thus ignore any data from a 310 | method where the key was not unique (eg multiple version 0 rows with 311 | the same token and hash). 312 | 313 | The [data set used](../data/mscorlib.data.model-rel-log.parsed) to 314 | develop the current model is taken from a crossgen of the CoreCLR core 315 | library. It has 29854 rows. Given the special role played by this 316 | library it is quite possible this is not a good representative set of 317 | methods. Considerably more and more diverse data was gathered (upwards 318 | of 1M rows using the desktop "SPMI" method) but this data proved 319 | unwieldy. The data gathered is also specific to x64 and windows and 320 | the behavior of the jit at that time. 321 | 322 | Subsequent work on performance measurement has created new data sets 323 | that could be used for size modelling, since a similar sort of 324 | K-limiting approach was used for performance, and the factor 325 | observations for size and speed are common. The most recent such data 326 | set is the [v12 data](../data/all-benchmark-v12-a15.csv). 327 | 328 | ### Size Modelling 329 | 330 | The size data is "noise free" in that (absent errors in coding) the 331 | sizes in the data set should be completely accurate. 
Given the 332 | relatively simple behavior of the jit, it was felt that a linear model 333 | should work well. 334 | 335 | The model needs to be relatively simple to implement and quick to 336 | evaluate, and it is highly desirable that it be interpretable. Based 337 | on this the model developed is a penalized linear model using R's 338 | 'glmnet'. [This script](../scripts/ModelPolicyV1Size.R.txt) was used 339 | to derive the model. It is implemented by 340 | `DiscretionaryPolicy::EstimateCodeSize` in the code base. This model 341 | explains about 55% of the variance in the mscorlib size data, and 65% 342 | of the variance seen in the v12 data. 343 | 344 | Naive use of more sophisticated models (eg random forests, gradient 345 | boosting, mars) to see how much the linear model might be leaving 346 | behind didn't yield much improvement. 347 | 348 | So the belief is that some of the remaining unexplained variance comes 349 | from missing observations. An exploration of poorly fitting examples 350 | would likely prove fruitful. There is likely some nontrivial amount of 351 | variation that will never be easily explained -- the jit's code 352 | generation can be quite sensitive to the exact details of both root 353 | method and callee. 354 | 355 | Exactly how close one can come to modelling size is an open question. 356 | 357 | The degree to which the inaccuracy of the current size model hurts the 358 | overall inline heuristic performance is another. The belief is that 359 | the speed model and high-level structure of the heuristic are likely larger 360 | contributors to poor performance. However, they may also be more 361 | difficult to improve. 362 | 363 | ### Size Model Viewed as Classification 364 | 365 | Given the form of the idealized heuristic, drawing a clear distinction 366 | between size-increasing and non-size-increasing inlines is 367 | important. We can view the regression model developed above as a 368 | classifier and see how well it performs at this task (here on the V12 369 | data): 370 | 371 | Actual | Est Decrease | Est Increase | Total 372 | ---------------|--------------|--------------|------- 373 | Size Decrease | 2052 | 776 | 2828 374 | Size Increase | 132 | 3162 | 3298 375 | Total | 2188 | 3938 | 6126 376 | 377 | So the model is quite accurate in predicting size increasing cases, 378 | getting only 132/3298 wrong (96% accuracy). 379 | 380 | It's not as good at predicting size decreasing cases: 776/2828 were 381 | misclassified as size increasing (72% accuracy). 382 | 383 | To better implement the idealized heuristic, it might make sense to 384 | bias the model to increase the accuracy of classifying size decreasing 385 | cases. For instance, setting the classification threshold to EstSize - 386 | 40 (recall the value is in bytes * 10) would give roughly balanced error 387 | rates. The downside is that a larger number of size increasing cases 388 | are now inlined without further scrutiny. 389 | 390 | For inlines classified as size increasing, the magnitude of the size 391 | increase comes into play, so one might also attempt to make more accurate 392 | predictions for size increasing inlines and trade off accuracy in 393 | predicting the magnitude of size decreases. 394 | 395 | ## Speed Model and Measurements 396 | 397 | ### Speed Measurements 398 | 399 | While noise-free size measurements are easy to come by, some degree of 400 | noise is inevitable for most approaches to speed measurements.
401 | Generally speaking, any inline whose impact is of the same magnitude as 402 | the ambient noise level will be very difficult to measure. 403 | 404 | The most common approach is to measure wall-clock or process time or 405 | cycle time for a benchmark. It is difficult to get noise levels for 406 | these approaches below roughly 1% of the overall runtime of the 407 | test. This amount of noise restricts the set of measurable 408 | inlines. Aside from interference by other processes running on the 409 | host machine, time-based measurements also can fall prey to 410 | microarchitectural implementation issues, in particular things like 411 | loop alignments, global branch prediction, various security-inspired 412 | randomization techniques, power management, and so on. Thus even 413 | run-to-run repeatability on an otherwise quiet machine will be 414 | impacted. The inliner also operates early enough in the compilation 415 | pipeline that machine microarchitecture is of secondary concern. 416 | 417 | To avoid some of these pitfalls we have adopted the instructions 418 | retired by the benchmark as our primary performance metric. This is 419 | relatively insensitive to microarchitectural details (with some caveats) 420 | and noise levels of 0.01% - 0.1% are not difficult to come by. 421 | 422 | Measuring instructions retired on Windows requires elevation since 423 | only the kernel can access and program the performance monitoring 424 | counters (PMC). 425 | 426 | ### Isolating Inlines 427 | 428 | To capture the per-inline impact on performance one must be able to 429 | run the benchmark twice, varying just one inline between the two runs. 430 | To do this requires some care. The K-limiting approach used during the 431 | size data collection does not sufficiently control inlining. 432 | 433 | So instead we developed some alternate techniques. One of them is to 434 | use K-limiting along with the ability to suppress inlining in all but 435 | one root method and a FullPolicy inline heuristic that inlines where 436 | possible. This combination allows successive measurements where just 437 | one inline differs (with some care taken to handle force inlines). 438 | Note the "context" of the inline is the one that arises from the DFS 439 | enumeration. So as K increases we may be inlining deep into the tree. 440 | 441 | Because we wanted a greater quantity of samples for shallow inlines we 442 | developed a second technique where inline replay is used to carefully 443 | control inlining. An inline forest was grown by enabling some 444 | inline policy and collecting up the initial inline forest. Inlining 445 | for each root in the forest was then disabled and a new run was done; 446 | this collects forests for new roots (methods that were always inlined 447 | in the initial run). This process continues until closure, yielding a 448 | full forest for that policy. 449 | 450 | This full forest is expressed in inline XML. Inline replay can then be 451 | used to isolate inlines to just one tree in the forest by measuring 452 | the performance of a tree and one of its proper parents. In actuality 453 | we measured each tree by growing from the empty tree towards the full 454 | tree in a breadth-first fashion. It is probably a good idea to try a 455 | greater variety of exploration orders here.
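In either technique, the attribution step itself is just differencing: each measurement differs from its predecessor (the K-1 run, or the proper parent tree) by exactly one inline, so subtracting the two charges the difference to that inline. The sketch below illustrates the idea in R; the file and column names (`Method`, `Version`, `InstRetired`) are hypothetical stand-ins, and the real post-processing is done by the PerformanceExplorer tooling rather than a script like this.

```
## Illustrative sketch only: attribute per-inline deltas by differencing
## consecutive measurements within each root method. File and column names
## are hypothetical.
Runs <- read.csv("runs.csv", header=TRUE)
Runs <- Runs[order(Runs$Method, Runs$Version), ]

PerInline <- do.call(rbind, lapply(split(Runs, Runs$Method), function(r) {
  if (nrow(r) < 2) return(NULL)
  ## Version J differs from version J-1 by one inline, so the difference in
  ## instructions retired (or code size) is charged to that inline.
  data.frame(Method  = r$Method[-1],
             Version = r$Version[-1],
             Delta   = diff(r$InstRetired))
}))
```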
456 | 457 | As an aside, using an aggressive policy like the FullPolicy, one can 458 | enumerate the "jit-visible" call graph, a graph that shows the range 459 | of possible inline trees and hence inline forests. 460 | 461 | ### How to Measure an Inline 462 | 463 | The measurement process described below is orchestrated by the 464 | [PerformanceExplorer](https://github.com/AndyAyersMS/PerformanceExplorer). 465 | 466 | Benchmarks are set up to run under xunit-performance. Per-benchmark 467 | times are roughly normalized to 1 second to try and keep the variance 468 | constant in each benchmark. 469 | 470 | Xunit-performance runs the benchmark code via reflection. For each 471 | attributed method it runs some number of iterations, enabling 472 | performance counting via ETW (on Windows). It also issues events for 473 | the start and end of each benchmark and each iteration of the 474 | benchmark. The event stream is post processed to find events 475 | attributed to the benchmark process that fall within the iteration 476 | span of a particular benchmark method. These events are counted and 477 | the total is multiplied by the PMC reload interval (100,000 I believe) 478 | to give the overall instructions retired estimate for that iteration. 479 | The raw iteration data is then written to an XML file. This data is 480 | read by the orchestrating process and an average count is computed 481 | from the iterations. This average value is then subtracted from the 482 | averaged value measured in the proper parent run to get the 483 | per-inline performance impact. 484 | 485 | The overall exploration process repeats the above for each root in 486 | each benchmark, growing trees from their noinline version up to the 487 | full version seen in the forest. Exploration is short-circuited for a 488 | root if the full tree performance for that root does not differ 489 | significantly from the noinline version, or if the root has no 490 | inlines. Roots are explored in the order of the number of calls (see 491 | below). 492 | 493 | ### Additional Data Measured 494 | 495 | Along with the measured change in instructions retired, it seemed 496 | important to also get some idea about how the call counts were 497 | changing in the program -- in particular how often the root being 498 | explored was called, and how frequently it called the method being 499 | inlined. To do this a special instrumentation mode was developed that 500 | hijacks the IBC mechanism present in the CoreCLR. Each jitted method's 501 | entry was instrumented with a counter to count the number of 502 | calls. These counts are dumped on jit shutdown. The root method count 503 | is directly available; the callee count can be deduced by knowing its 504 | value in the noinline run and accounting for any other inlines of that 505 | callee made along the way. 506 | 507 | One failing of this method is that if the callee comes from a 508 | prejitted image it will never run in instrumented form. To work around 509 | this the use of prejitted images can be disabled. This creates its own 510 | set of complications because every benchmark contains a sizeable 511 | amount of startup code that might be repeatedly explored. So 512 | optionally the explorer maintains a list of already explored methods 513 | and tries to avoid re-exploration. 514 | 515 | Another failing is that the call counts are captured by running the 516 | benchmark tests normally and not by running them under 517 | xunit-performance.
The benchmarks have been set up so that key 518 | portions behave identically under both scenarios, but the real 519 | possibility exists that the call counts measured this way diverge from 520 | the counts running under the performance harness. 521 | 522 | It would probably be better to capture the call count data via the 523 | normal profiling API so that a special build of the jit with this 524 | capability is not needed (though note a special build is still needed 525 | to get at the inline data). 526 | 527 | ### Coping with Noise 528 | 529 | The impact of specific inlines can be elevated above the noise by 530 | iteration -- repeatedly invoking methods in loops. This elevation is 531 | generally a manual process and so restricts the set of inlines that can 532 | be studied. But some degree of this is probably necessary. 533 | 534 | Adoption of a benchmarking framework like xunit-perf allows for the 535 | iteration strategy to be determined after the benchmark is authored. 536 | 537 | Runs can be repeated with the well-known result that if the noise is 538 | uncorrelated, the noise level will fall off with the square root of the 539 | number of runs. However on Windows at least we have seen that the 540 | ambient noise level can vary over periods of minutes to hours. So some 541 | kind of adaptive iteration strategy might be required where the 542 | benchmarking harness periodically runs some known-effort workload to 543 | assess the ambient noise, and then either records the noise level or 544 | tries to adjust (or perhaps defer) data gathering to compensate for 545 | higher noise levels. 546 | 547 | There is also some inherent sampling noise. Consider this simple model 548 | of how PMC sampling works. The per-CPU PMC is programmed with some 549 | count-down value N. The OS then schedules processes to this 550 | CPU. Instructions are executed and the counter counts down. When it 551 | hits zero, the entire allotment of counts is "charged" to the current 552 | process. Suppose during this time the process being benchmarked ran 553 | for most of the time but for some fraction of instructions alpha, 554 | other processes ran on the CPU. Then the expected instruction charge 555 | to the benchmark process for this interval is: 556 | ``` 557 | E = alpha * 0 + (1-alpha) * N 558 | ``` 559 | which reflects that on some of the PMC rollovers the process is not 560 | charged even though it made progress, and on others it is charged but 561 | made somewhat less progress than the charge would indicate. 562 | 563 | The actual progress towards completion is given by the same formula. 564 | If the entire benchmark runs for K instructions then on average during 565 | the benchmark's execution the number of charges will be K/E, and hence 566 | the expected value for the total charge is K, which equals the actual 567 | total instruction count. So the added noise here does not apparently bias the 568 | estimated mean. It does, however, create variance. 569 | 570 | The existence of this variance is readily observable. Unfortunately 571 | the exact nature of this variance is not well characterized. See for 572 | instance the discussions in 573 | [Flater](http://nvlpubs.nist.gov/nistpubs/technicalnotes/NIST.TN.1826.pdf). 574 | However it seems reasonable to assume that the variance increases, 575 | perhaps nonlinearly, with increasing alpha, and also increases if 576 | alpha itself is subject to variation, and that the variance does not 577 | go to zero as the benchmark is run for longer intervals.
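A small simulation makes the charging argument concrete. The snippet below implements only the simple model sketched above (not the actual ETW/PMC machinery): each rollover of N instructions is charged to the benchmark with probability 1 - alpha, and the benchmark's true progress per rollover is (1 - alpha) * N. The simulated mean charge comes out close to K while the run-to-run spread is clearly nonzero.

```
## Simulation of the simple PMC charging model above (illustrative only).
simulate.charge <- function(K, N, alpha) {
  rollovers <- ceiling(K / ((1 - alpha) * N))  # rollovers needed to retire K instructions
  charged <- runif(rollovers) < (1 - alpha)    # was the benchmark running at each rollover?
  sum(charged) * N                             # total instructions charged to the benchmark
}

set.seed(1001)
Charges <- replicate(1000, simulate.charge(K=1e9, N=1e5, alpha=0.05))
mean(Charges)   # close to K -- no apparent bias in the mean
sd(Charges)     # but with clearly nonzero run-to-run variance
```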
578 | 579 | ### Speed Model -- Idealized 580 | 581 | The idealized speed model for the change in instructions retired 582 | for a single isolated inline into a root at some site is: 583 | ``` 584 | InstRetiredDelta = RootCallCount * InstRetiredPerRootCallDelta 585 | InstRetiredPerRootCallDelta = Overhead + CallDelta * InstRetiredPerCallDelta 586 | InstRetiredPerCallDelta = F(...) 587 | ``` 588 | See table below for explanation of these terms. InstRetiredDelta, 589 | RootCallCount, InstRetiredPerRootCallDelta, Overhead, CallDelta, and 590 | InstRetiredPerCallDelta are measured after the fact. 591 | 592 | For predictive purposes they must be derived from modelling. 593 | 594 | ### Speed Model -- Modelling 595 | 596 | Attempts to coax workable performance models out of the data gathered 597 | above have largely fallen flat. 598 | 599 | The first challenge is to figure out what to model. With the current 600 | data set, there are several viable options: 601 | - InstRetiredDelta 602 | - InstRetiredPct 603 | - InstRetiredPerRootCallDelta 604 | - InstRetiredPerCallDelta 605 | 606 | The first two measures reflect the realized potential of the inline in 607 | some benchmark setting. They seem somewhat arbitrary -- if the 608 | benchmark was run for twice as long, the overall change in 609 | instructions would double as well. And the percentage value likewise 610 | seems off -- consider a test like the CscBench that has two timed 611 | sub-benchmarks. If an inline benefits one and not the other, the 612 | percentage change in instructions retired depends on the relative 613 | number of instructions run for the two sub-benchmarks. In terms of the 614 | idealized model, `RootCallCount` is thus something we can't easily 615 | characterize. 616 | 617 | So some sort of relative measure seems more appropriate. Because the 618 | jit generally has no notion of the overall importance of the root 619 | method in the ongoing processing (with some exceptions: the .cctor is 620 | known to run rarely, and when/if there's profile feedback, the jit 621 | might know something from past observations), it must presume that the 622 | root method might be called frequently. So a plausible figure of merit 623 | for inline benefit is the change in instructions retired per call to 624 | the root method: `InstRetiredPerRootCallDelta`. 625 | 626 | One could likewise argue that the right figure of merit for speed is 627 | `InstRetiredPerCallDelta` -- the change in instructions retired per 628 | call to the inlinee. This could be multiplied by a local estimate for 629 | call site frequency to arrive at a projected per call benefit. The jit 630 | computes block frequency estimates for other purposes and it would be 631 | good if all such estimates agreed. So instead of having this be implicit 632 | in the inline profit model, it could be made explicit. 633 | 634 | With either of these relative measures there is still potential for 635 | wide dynamic range as instruction retirement counts can be amplified 636 | by loops in the root or in the callee. 637 | 638 | Measurement of either of these requires that `RootCallCount` and 639 | `CallDelta` be captured. This is currently done with the special 640 | instrumentation mentioned above. 641 | 642 | Note also that `CallDelta` may well be zero, in which case 643 | `InstRetiredPerRootCallDelta` reflects just the `Overhead` term.
This term 644 | represents changes in the root that are not "in the vicinity" of the 645 | inline site -- eg extra register saves in the prolog or epilog, or 646 | improved or pessimized code in other parts of the root method. 647 | 648 | Also, the number of times `CallDelta` is observed to be zero is 649 | overstated in the V12 data set because call count values are 650 | not always available (see note above about additional data in the 651 | presence of crossgen). This should be fixed in the forthcoming V13 652 | data set. 653 | 654 | Unfortunately, it is proving difficult to find good models for 655 | any of the measures above. Some potential explanations: 656 | 657 | - High noise levels. Typical noise of 0.01% - 0.1% still means variations 658 | on the order of 5M instructions. Many inlines will have absolute 659 | impact below this level. 660 | - Missing key observations 661 | - Errors in measurement or in post-processing 662 | - Poor selection of benchmarks 663 | - Varying noise levels 664 | 665 | ### Speed Model -- Modelling Attempts 666 | 667 | Various approaches have been tried, without success, for 668 | performance models: 669 | 670 | - Find some subset of the data that is predictable. For instance 671 | cases with high `CallDelta` 672 | - General linear modelling with nonlinear terms and interaction terms 673 | - Nonlinear models like mars 674 | - Quantile and robust regressions 675 | - Trying to classify rather than regress, classifying as "improvement" or 676 | "regression", or some multinomial sense of "goodness". 677 | - Transforming the response to reduce the dynamic range (~ predict log of delta) 678 | - Temporarily allowing some output terms (eg `RootCallCount`, `CallDelta`) in models 679 | - Ensemble models (random forest, gradient boosting). While we might not want 680 | to implement such a model, if they're unable to predict results well, then there 681 | is not much hope for simpler implementable models 682 | - Weighted models, where the weight is used to 683 | - Cope with potential heteroscedastic results 684 | - Ignore impact of outliers 685 | - Emphasize instances felt to be above the noise level 686 | 687 | Very few models can explain more than a few percent of the variation. 688 | 689 | ### Speed Model -- Implemented Model 690 | 691 | The model currently implemented in the `ModelPolicy` came from an 692 | early V3 [data set](../data/all-benchmark-v3-a13.csv), and relies on 693 | just 210 observations. It predicts `InstRetiredPerCallDelta`. It is a 694 | penalized linear model that can explain only about 24% of the 695 | variation. [This is the script](../scripts/ModelPolicyV1Perf.R.txt) 696 | used to derive the model. 697 | 698 | For use in the heuristic, the speed estimate from the model is 699 | multiplied by a local estimate of `CallDelta` to give an estimate of 700 | `InstRetiredPerRootCallDelta`. This local estimate is ad-hoc and was 701 | chosen to give some boost to call sites believed to be in loops in the 702 | root method. 703 | 704 | This version of the model was intended to be preliminary so that a 705 | trial implementation of the ModelPolicy and idealized heuristic could 706 | be assessed. However, no better model has emerged in the time since. 707 | 708 | ## Current Heuristic 709 | 710 | The current ModelPolicy heuristic follows the form of the idealized 711 | heuristic. It uses the size model and speed model, along with a local 712 | call site weight and a size-speed tradeoff parameter.
715 |
716 | Results show about a 2% geomean improvement in the CoreCLR benchmarks,
717 | with around a 2% size decrease in the core library crossgen size, and
718 | about a 1% throughput improvement.
719 |
720 | Evaluation of this heuristic on other benchmarks is just beginning.
721 | Some tests on parts of RavenDB show a possible 2% CQ improvement,
722 | though there were some interactions with force inline
723 | directives. Measurements on ASP.Net Techempower plaintext show about
724 | a 2% regression.
725 |
726 | Viewed as a classifier, here's how well the implemented model does at
727 | implementing the idealized heuristic (V12 data):
728 |
729 | Actual        | Est Size Decrease | Est Profitable | Est Don't Inline | Total
730 | --------------|-------------------|----------------|------------------|-------
731 | Size Decrease | 2052              | 86             | 690              | 2828
732 | Profitable    | 25                | 7              | 384              | 416
733 | Don't Inline  | 111               | 39             | 2732             | 2882
734 | Total         | 2188              | 132            | 3806             | 6126
735 |
736 | Accuracy is 78% overall ((2052 + 7 + 2732) / 6126). The largest errors come from inlines that
737 | actually decrease size but are estimated to increase size and are then
738 | judged as unprofitable (690), and from inlines that are actually profitable and correctly
739 | estimated to increase size but are then assessed as unprofitable (384).
740 |
741 | Note that there may be substantial labelling error for the
742 | size-increasing cases, given the high noise levels in profitability
743 | measurements and the low impact of many inline instances.
744 |
745 | ## Alternatives
746 |
747 | ### Full-on Classification Model
748 |
749 | One might legitimately ask if it would be better to try and learn the
750 | idealized heuristic directly. Such a model would incorporate aspects
751 | of the size and speed models, though they might no longer be
752 | distinguishable as such.
753 |
754 | ### Learning from Inline Forests
755 |
756 | Instead of measuring inlines in isolation, one might attempt to infer
757 | value by studying performance changes for entire inline forests. This
758 | seems to match (in spirit) the approach taken by Kulkarni et al. A
759 | randomized heuristic is used, and this creates a collection of forests
760 | and performance results. Results are projected back onto individual
761 | inlines in the forest and, for each inline, the projected results are
762 | aggregated into some kind of label for that inline.
763 |
764 | For instance, one could track three numbers (possibly weighted by
765 | magnitude of the change) for each instance: the number of times it
766 | appears in a run that increases performance, the number of times it
767 | appears in a run that decreases performance, and the number of times
768 | it does not appear at all. The objective would then be to learn how to
769 | identify inlines whose appearance is correlated with improved
770 | performance.
771 |
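As a rough sketch of the bookkeeping this would require (hypothetical helper code, not part of PerformanceExplorer; only the `MethodId` type is borrowed from Program.cs), the per-inline tallies might be accumulated like this after each randomized run:

```
using System.Collections.Generic;

// Hypothetical per-inline tally for the forest-based labelling idea above.
public class InlineTally
{
    public double ImprovedRuns;  // runs containing this inline where instructions retired went down
    public double RegressedRuns; // runs containing this inline where instructions retired went up
    public double AbsentRuns;    // runs where this inline was not enabled at all
}

public static class ForestTallies
{
    // pctChange: the run's percent change in instructions retired vs. a baseline (negative = improvement).
    public static void RecordRun(Dictionary<MethodId, InlineTally> tallies,
                                 HashSet<MethodId> inlinesEnabledThisRun,
                                 IEnumerable<MethodId> allCandidateInlines,
                                 double pctChange)
    {
        foreach (MethodId id in allCandidateInlines)
        {
            InlineTally t;
            if (!tallies.TryGetValue(id, out t))
            {
                t = new InlineTally();
                tallies[id] = t;
            }

            if (!inlinesEnabledThisRun.Contains(id))
                t.AbsentRuns += 1;
            else if (pctChange < 0)
                t.ImprovedRuns += 1; // or weight by the magnitude of pctChange
            else
                t.RegressedRuns += 1;
        }
    }
}
```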
772 | ### Finding Ideal Forests
773 |
774 | Along these lines one might also use randomness or a genetic approach
775 | to try and identify the "optimal" inline forest for each benchmark,
776 | and then attempt to generalize from there to a good overall inline
777 | heuristic.
778 |
779 | ## Inline Data Files
780 |
781 | The files have a header row with column names (meanings below), and
782 | then data rows, one per inline instance.
783 |
784 | Column Type | Meaning                              | Use in Heuristic?
785 | ------------|--------------------------------------|-----
786 | input       | observation available for heuristic  | Yes
787 | estimate    | value internally derived from inputs | Maybe
788 | meta        | metadata about the instance          | No
789 | output      | measured result                      | No
790 |
791 | The table below describes the V12 (and forthcoming V13) data sets.
792 | Older files may have a subset of this data, and may contain a Version0
793 | row for each method giving method information without any inlines.
794 |
795 | Column Name           | Type     | Meaning
796 | --------------------- |----------|--------
797 | Benchmark             | meta     | Name of benchmark program
798 | SubBenchmark          | meta     | Unused (all sub-benchmark data now aggregated)
799 | Method                | meta     | Token value of the root method
800 | Version               | meta     | Ordinal number of this inline
801 | HotSize               | output   | Hot code size of method after this inline (bytes)
802 | ColdSize              | output   | Cold code size of method after this inline (bytes)
803 | JitTime               | output   | Time spent in code gen after inlining (microseconds)
804 | SizeEstimate          | estimate | Estimated code size for this method (hot + cold)
805 | TimeEstmate           | estimate | Estimated jit time for this method (microseconds)
806 | ILSize                | input    | Size of callee method IL buffer (bytes)
807 | CallsiteFrequency     | estimate | Importance of the call site (factor)
808 | InstructionCount      | input    | Number of MSIL instructions in the callee IL
809 | LoadStoreCount        | input    | Number of "load-store" MSIL instructions in callee IL
810 | Depth                 | input    | Depth of this call site (1 == top-level)
811 | BlockCount            | input    | Number of basic blocks in the callee
812 | Maxstack              | input    | Maxstack value from callee method header
813 | ArgCount              | input    | Number of arguments to callee (from signature)
814 | ArgNType              | input    | Type of Nth argument (factor, CorInfoType)
815 | ArgNSize              | input    | Size of Nth argument (bytes)
816 | LocalCount            | input    | Number of locals in callee (from signature)
817 | ReturnType            | input    | Type of return value (factor, CorInfoType)
818 | ReturnSize            | input    | Size of return value (bytes)
819 | ArgAccessCount        | input    | Number of LDARG/STARG opcodes in callee IL
820 | LocalAccessCount      | input    | Number of LDLOC/STLOC opcodes in callee IL
821 | IntConstantCount      | input    | number of LDC_I and LDNULL opcodes in callee IL
822 | FloatConstantCount    | input    | number of LDC_R opcodes in callee IL
823 | IntLoadCount          | input    | number of LDIND_I/U opcodes in callee IL
824 | FloatLoadCount        | input    | number of LDIND_R opcodes in callee IL
825 | IntStoreCount         | input    | number of STIND_I opcodes in callee IL
826 | FloatStoreCount       | input    | number of STIND_R opcodes in callee IL
827 | SimpleMathCount       | input    | number of ADD/SUB/.../CONV_I/U opcodes in callee IL
828 | ComplexMathCount      | input    | number of MUL/DIV/REM/CONV_R opcodes in callee IL
829 | OverflowMathCount     | input    | number of CONV_OVF and math_OVF opcodes in callee IL
830 | IntArrayLoadCount     | input    | number of LDELEM_I/U opcodes in callee IL
831 | FloatArrayLoadCount   | input    | number of LDELEM_R opcodes in callee IL
832 | RefArrayLoadCount     | input    | number of LDELEM_REF opcodes in callee IL
833 | StructArrayLoadCount  | input    | number of LDELEM opcodes in callee IL
834 | IntArrayStoreCount    | input    | number of STELEM_I/U opcodes in callee IL
835 | FloatArrayStoreCount  | input    | number of STELEM_R opcodes in callee IL
836 | RefArrayStoreCount    | input    | number of STELEM_REF opcodes in callee IL
837 | StructArrayStoreCount | input    | number of STELEM opcodes in callee IL
838 | StructOperationCount  | input    | number of *OBJ and *BLK opcodes in callee IL
839 | ObjectModelCount      | input    | number of CASTCLASS/BOX/etc opcodes in callee IL
840 | FieldLoadCount        | input    | number of LDLEN/LDFLD/REFANY* in callee IL
841 | FieldStoreCount       | input    | number of STFLD in callee IL
842 | StaticFieldLoadCount  | input    | number of LDSFLD in callee IL
843 | StaticFieldStoreCount | input    | number of STSFLD in callee IL
844 | LoadAddressCount      | input    | number of LDLOCA/LDARGA/LD*A in callee IL
845 | ThrowCount            | input    | number of THROW/RETHROW in callee IL
846 | ReturnCount           | input    | number of RET in callee IL (new in V13)
847 | CallCount             | input    | number of CALL*/NEW*/JMP in callee IL
848 | CallSiteWeight        | estimate | numeric weight of call site
849 | IsForceInline         | input    | true if callee is force inline
850 | IsInstanceCtor        | input    | true if callee is a .ctor
851 | IsFromPromotableValueClass | input | true if callee is from promotable value class
852 | HasSimd               | input    | true if callee has simd args/locals
853 | LooksLikeWrapperMethod | input   | true if callee simply wraps another call
854 | ArgFeedsConstantTest  | input    | number of times an arg reaches a compare vs constant
855 | IsMostlyLoadStore     | input    | true if load-store count is a large fraction of instruction count
856 | ArgFeedsRangeCheck    | input    | number of times an arg reaches a compare vs ldlen
857 | ConstantArgFeedsConstantTest | input | number of times a constant arg reaches a compare vs constant
858 | CalleeNativeSizeEstimate | estimate | LegacyPolicy's size estimate for callee (bytes * 10)
859 | CallsiteNativeSizeEstimate | estimate | LegacyPolicy's size estimate for "savings" from inlining (bytes * 10)
860 | ModelCodeSizeEstimate | estimate | ModelPolicy's size estimate (bytes * 10)
861 | ModelPerCallInstructionEstimate | estimate | ModelPolicy's speed estimate (inst retired per call to callee)
862 | IsClassCtor           | input    | True if callee is a .cctor (v13)
863 | IsSameThis            | input    | True if callee and root are both instances with same this pointer (v13)
864 | CallerHasNewArray     | input    | True if caller contains NEWARR operation (v13)
865 | CallerHasNewObj       | input    | True if caller contains NEWOBJ operation (v13)
866 | HotSizeDelta          | output   | Change in hot size because of this inline (bytes)
867 | ColdSizeDelta         | output   | Change in cold size because of this inline (bytes)
868 | JitTimeDelta          | output   | Change in jit time because of this inline (microseconds)
869 | InstRetiredDelta      | output   | Change in instructions retired because of this inline (millions)
870 | InstRetiredSD         | estimate | Estimated standard deviation of InstRetiredDelta (millions)
871 | InstRetiredPct        | output   | Percent change in instructions retired
872 | CallDelta             | output   | Change in number of calls to the callee because of this inline
873 | InstRetiredPerCallDelta | output | InstRetiredDelta/CallDelta, or 0 if CallDelta is 0
874 | RootCallCount         | output   | Number of calls to root method
875 | InstRetiredPerRootCallDelta | output | InstRetiredDelta/RootCallCount
876 | Confidence            | meta     | Bootstrap confidence that this inline caused instructions retired to change
877 |
878 | ## Options for Changing Inliner Behavior
879 |
880 | Build a release jit with -DINLINE_DATA=1. This enables the following COMPlus settings (an example invocation follows the table):
881 |
882 | Setting               | Effect
883 | ----------------------|---------------
884 | JitInlineDumpData     | dumps inline data
885 | JitInlineDumpXml      | dumps inlines in xml format
886 | JitInlinePolicyReplay | enable replay from replay file
887 | JitInlineReplayFile   | name of the replay file to read from
888 | JitInlinePolicyFull   | enable FullPolicy heuristic
889 | JitInlinePolicyModel  | enable ModelPolicy heuristic
890 | JitInlineLimit        | enable K-limiting
891 | JitNoInlineRange      | disable inlines in a subset of methods
892 |
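For example (with a jit built with -DINLINE_DATA=1 as noted above), a run that enables the ModelPolicy and dumps inline data could be launched much the way PerformanceExplorer launches its benchmark processes. The executable names below are placeholders, and the "1" values are assumed to mean "enabled", as they do for the settings Program.cs already sets.

```
// Illustrative launch using the settings above; paths and values are placeholders.
var psi = new System.Diagnostics.ProcessStartInfo("corerun.exe", "MyBenchmark.exe");
psi.UseShellExecute = false;                           // required so the environment below is passed along
psi.Environment["COMPlus_JitInlinePolicyModel"] = "1"; // use the ModelPolicy heuristic
psi.Environment["COMPlus_JitInlineDumpData"] = "1";    // dump per-inline observation data
psi.Environment["COMPlus_JitInlineDumpXml"] = "1";     // dump inline trees as xml
System.Diagnostics.Process.Start(psi).WaitForExit();
```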
893 | ## List of Areas for Investigation
894 |
895 | - Improvements to Size data collection
896 | - Modify collection process to walk inline trees in various orders
897 | - Improvements to the Size models
898 | - Analyze cases where existing size model makes poor predictions
899 | - Look for correlated inputs
900 | - Look for input columns with zero variance and/or low variance,
901 | and either remove them or add cases to boost their relevance
902 | - See if there is a better way to account for the operations done by the inlinee
903 | - Improvements to the Speed data collection
904 | - Reduce noise levels (thread affinity, priority, more frequent sampling, etc)
905 | - Identify noisy runs and retry if warranted
906 | - Round-robin execution to "sample" benchmarks at different times
907 | - More iterations, more reruns
908 | - Eliminate noise entirely using instrumentation or a tool like PIN
909 | - Understand possible divergence between xunit-perf and regular runs
910 | - Get rid of need for instrumented build, use CLR profiler API instead
911 | - Get rid of split modelling where sometimes the program is run
912 | under the perf harness and other times it is run normally
913 | - Directly measure call site frequency rather than infer it
914 | - Modify collection process to walk inline trees in various orders
915 | - Generalize collection to work with any test program
916 | - Wider variety of measurements
917 | - Develop techniques to measure and attribute performance to
918 | inline ensembles to speed up collection
919 | - Improvements to the Speed model
920 | - Settle on proper figure of merit: Instructions or Instructions per XXX
921 | - Deal with potential heteroscedasticity
922 | - Improvements to the idealized heuristic
923 | - Randomized studies looking for good inlining patterns
924 | - Manual tuning to do likewise
925 | - Find a way to code up an oracular inliner and benchmark the heuristics that way
926 | - Improvements to the actual heuristic
927 | - If a local call site weight estimate is needed, find a good way to create one
928 | - If a root method importance estimate is needed, find a good way to create one
929 | - Build full-model classifiers that incorporate tradeoffs
930 | - Automate process of tuning parameters
931 | - Other
932 | - Impact of instrumented counts (IBC) on heuristics
933 | - Are different heuristics warranted for prejit and jit?
934 | - Investigate stability of models over time 935 | - Investigate variability of models for different OSs or ISAs 936 | 937 | 938 | -------------------------------------------------------------------------------- /src/PerformanceExplorer/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Diagnostics; 4 | using System.Linq; 5 | using System.IO; 6 | using System.Xml.Serialization; 7 | using System.Text; 8 | using System.Xml.Linq; 9 | 10 | namespace PerformanceExplorer 11 | { 12 | // A Configuration describes the environment 13 | // used to perform a particular run. 14 | public class Configuration 15 | { 16 | public Configuration(string name) 17 | { 18 | Name = name; 19 | Environment = new Dictionary(); 20 | ResultsDirectory = Program.RESULTS_DIR; 21 | 22 | if (Program.DisableZap) 23 | { 24 | Environment["COMPlus_ZapDisable"] = "1"; 25 | } 26 | } 27 | 28 | public string Name; 29 | public Dictionary Environment; 30 | public string ResultsDirectory; 31 | } 32 | 33 | // PerformanceData describes performance 34 | // measurements for benchmark runs 35 | public class PerformanceData 36 | { 37 | public PerformanceData() 38 | { 39 | ExecutionTime = new SortedDictionary>(); 40 | InstructionCount = new SortedDictionary>(); 41 | id = ++idGen; 42 | } 43 | 44 | static int idGen = 0; 45 | static int id; 46 | 47 | // subPart -> list of data 48 | public SortedDictionary> ExecutionTime; 49 | public SortedDictionary> InstructionCount; 50 | 51 | public void Print(string configName) 52 | { 53 | foreach (string subBench in ExecutionTime.Keys) 54 | { 55 | Summarize(subBench, configName); 56 | } 57 | } 58 | 59 | public void Summarize(string subBench, string configName) 60 | { 61 | Console.Write("### [{0}] {1} perf for {2}", id, configName, subBench); 62 | 63 | if (ExecutionTime.ContainsKey(subBench)) 64 | { 65 | Console.Write(" time {0:0.00} milliseconds (~{1:0.00}%)", 66 | Average(ExecutionTime[subBench]), 67 | PercentDeviation(ExecutionTime[subBench])); 68 | } 69 | 70 | if (InstructionCount.ContainsKey(subBench)) 71 | { 72 | Console.Write(" instructions {0:0.00} million (~{1:0.00}%)", 73 | Average(InstructionCount[subBench]) / (1000 * 1000), 74 | PercentDeviation(InstructionCount[subBench])); 75 | } 76 | 77 | Console.WriteLine(); 78 | } 79 | 80 | public static double Average(List data) 81 | { 82 | if (data.Count() < 1) 83 | { 84 | return -1; 85 | } 86 | 87 | return data.Average(); 88 | } 89 | public static double StdDeviation(List data) 90 | { 91 | if (data.Count() < 2) 92 | { 93 | return 0; 94 | } 95 | 96 | double avg = Average(data); 97 | double sqError = 0; 98 | foreach (double d in data) 99 | { 100 | sqError += (avg - d) * (avg - d); 101 | } 102 | double estSD = Math.Sqrt(sqError / (data.Count() - 1)); 103 | return estSD; 104 | } 105 | public static double PercentDeviation(List data) 106 | { 107 | return 100.0 * StdDeviation(data) / Average(data); 108 | } 109 | 110 | // Use bootstrap to test hypothesis that difference in 111 | // means of the two data sets is significant at indicated level. 112 | // Return value is p value between 0 and 1. 
113 | public static double Confidence(List data1, List data2) 114 | { 115 | int kb = data1.Count(); 116 | int kd = data2.Count(); 117 | 118 | double d1ave = Average(data1); 119 | double d2ave = Average(data2); 120 | double basestat = Math.Abs(d1ave - d2ave); 121 | 122 | // perform a boostrap test to estimate the one-sided 123 | // confidence that this diff could be significant. 124 | 125 | List mergedData = new List(kb + kd); 126 | mergedData.AddRange(data1); 127 | mergedData.AddRange(data2); 128 | 129 | double confidence = Bootstrap(basestat, kb, kd, mergedData); 130 | 131 | return confidence; 132 | } 133 | 134 | // Use bootstrap to produce a p value for the hypothesis that the 135 | // difference shown in basestat is significant. 136 | // k1 and k2 are the sizes of the two sample populations 137 | // data is the combined set of observations. 138 | static double Bootstrap(double basestat, int k1, int k2, List data) 139 | { 140 | double obs = 0; 141 | Random r = new Random(RandomSeed); 142 | 143 | for (int i = 0; i < NumberOfBootstrapTrials; i++) 144 | { 145 | List z1 = Sample(data, k1, r); 146 | List z2 = Sample(data, k2, r); 147 | 148 | double z1average = Average(z1); 149 | double z2average = Average(z2); 150 | 151 | double zmedian = Math.Abs(z1average - z2average); 152 | 153 | if (zmedian < basestat) 154 | { 155 | obs++; 156 | } 157 | } 158 | 159 | return obs / NumberOfBootstrapTrials; 160 | } 161 | 162 | // Return a random sample (with replacement) of size n from the array data 163 | static List Sample(List data, int n, Random r) 164 | { 165 | int l = data.Count; 166 | List x = new List(n); 167 | for (int i = 0; i < n; i++) 168 | { 169 | int j = r.Next(0, l); 170 | x.Add(data[j]); 171 | } 172 | 173 | return x; 174 | } 175 | 176 | // Use fixed random seed so that we don't see the bootstrap p-values 177 | // wander from invocation to invocation. 178 | const int RandomSeed = 77; 179 | 180 | // The bootstrap test works by taking a number of random samples 181 | // and computing how frequently the samples exhibit the same 182 | // statistic as observed statstic. N determines the 183 | // number of bootstrap trials to run. A higher value is better 184 | // but takes longer. 
185 | const int NumberOfBootstrapTrials = 1000; 186 | } 187 | 188 | // Information that identifies a method 189 | public struct MethodId : IEquatable 190 | { 191 | public override bool Equals(object obj) 192 | { 193 | return (obj is MethodId) && Equals((MethodId)obj); 194 | } 195 | public bool Equals(MethodId other) 196 | { 197 | return (this.Token == other.Token && this.Hash == other.Hash); 198 | } 199 | public override string ToString() 200 | { 201 | return String.Format("{0:X8}-{1:X8}", Token, Hash); 202 | } 203 | 204 | public override int GetHashCode() 205 | { 206 | int hash = 23; 207 | hash = hash * 31 + (int)Token; 208 | hash = hash * 31 + (int)Hash; 209 | return hash; 210 | } 211 | 212 | public uint Token; 213 | public uint Hash; 214 | } 215 | 216 | // A method seen either in jitting or inlining 217 | public class Method 218 | { 219 | public Method() 220 | { 221 | Callers = new List(); 222 | Callees = new List(); 223 | } 224 | 225 | public MethodId getId() 226 | { 227 | MethodId id = new MethodId(); 228 | id.Token = Token; 229 | id.Hash = Hash; 230 | return id; 231 | } 232 | 233 | public static int HasMoreInlines(Method x, Method y) 234 | { 235 | return (int) y.InlineCount - (int) x.InlineCount; 236 | } 237 | 238 | public static int HasMoreCalls(Method x, Method y) 239 | { 240 | if (x.CallCount > y.CallCount) 241 | { 242 | return -1; 243 | } 244 | else if (x.CallCount < y.CallCount) 245 | { 246 | return 1; 247 | } 248 | else 249 | { 250 | return 0; 251 | } 252 | } 253 | 254 | public double NumSubtrees() 255 | { 256 | double result = 1; 257 | foreach (Inline i in Inlines) 258 | { 259 | result *= (1 + i.NumSubtrees()); 260 | } 261 | return result; 262 | } 263 | 264 | public void Dump() 265 | { 266 | Console.WriteLine("Inlines into {0} {1:X8}", Name, Token); 267 | foreach (Inline x in Inlines) 268 | { 269 | x.Dump(2); 270 | } 271 | } 272 | 273 | public Inline[] GetBfsSubtree(int k, out Inline lastInline) 274 | { 275 | List l = new List(k); 276 | Queue q = new Queue(); 277 | lastInline = null; 278 | 279 | foreach (Inline i in Inlines) 280 | { 281 | q.Enqueue(i); 282 | } 283 | 284 | // BFS until we've enumerated the first k. 285 | while (q.Count() > 0) 286 | { 287 | Inline i = q.Dequeue(); 288 | l.Add(i); 289 | 290 | if (l.Count() == k) 291 | { 292 | lastInline = i; 293 | break; 294 | } 295 | 296 | foreach (Inline ii in i.Inlines) 297 | { 298 | q.Enqueue(ii); 299 | } 300 | } 301 | 302 | // DFS to copy with the list telling us 303 | // what to include. 304 | return GetDfsSubtree(Inlines, l, lastInline); 305 | } 306 | 307 | Inline[] GetDfsSubtree(Inline[] inlines, List filter, Inline lastInline) 308 | { 309 | List newInlines = new List(); 310 | foreach (Inline x in inlines) 311 | { 312 | if (filter.Contains(x)) 313 | { 314 | Inline xn = x.ShallowCopy(); 315 | // Flag the last inline so the jit can collect 316 | // data for it during replay. 
317 | if (x == lastInline) 318 | { 319 | xn.CollectData = 1; 320 | } 321 | newInlines.Add(xn); 322 | xn.Inlines = GetDfsSubtree(x.Inlines, filter, lastInline); 323 | } 324 | } 325 | 326 | return newInlines.ToArray(); 327 | } 328 | 329 | public Method ShallowCopy() 330 | { 331 | Method r = new Method(); 332 | r.Token = Token; 333 | r.Hash = Hash; 334 | r.Name = Name; 335 | r.Inlines = new Inline[0]; 336 | return r; 337 | } 338 | 339 | public uint Token; 340 | public uint Hash; 341 | public string Name; 342 | public uint InlineCount; 343 | public uint HotSize; 344 | public uint ColdSize; 345 | public uint JitTime; 346 | public uint SizeEstimate; 347 | public uint TimeEstimate; 348 | public Inline[] Inlines; 349 | public ulong CallCount; 350 | public void MarkAsDuplicate() { IsDuplicate = true; } 351 | public bool CheckIsDuplicate() { return IsDuplicate; } 352 | private bool IsDuplicate; 353 | 354 | public List Callers; 355 | public List Callees; 356 | } 357 | 358 | // THe jit-visible call graph 359 | public class CallGraph 360 | { 361 | public CallGraph(Results fullResults) 362 | { 363 | Map = fullResults.Methods; 364 | Nodes = new HashSet(); 365 | Roots = new HashSet(); 366 | Leaves = new HashSet(); 367 | Build(); 368 | } 369 | public void Build() 370 | { 371 | // Populate per-method caller and callee lists 372 | // Drive via IDs to consolidate dups 373 | foreach(MethodId callerId in Map.Keys) 374 | { 375 | Method caller = Map[callerId]; 376 | 377 | foreach (Inline i in caller.Inlines) 378 | { 379 | MethodId calleeId = i.GetMethodId(); 380 | 381 | // Not sure why it wouldn't.... 382 | if (Map.ContainsKey(calleeId)) 383 | { 384 | Method callee = Map[calleeId]; 385 | 386 | caller.Callees.Add(callee); 387 | callee.Callers.Add(caller); 388 | } 389 | } 390 | } 391 | 392 | foreach (MethodId methodId in Map.Keys) 393 | { 394 | Method method = Map[methodId]; 395 | 396 | Nodes.Add(method); 397 | 398 | // Methods with no callers are roots. 399 | if (method.Callers.Count == 0) 400 | { 401 | Roots.Add(method); 402 | } 403 | 404 | // Methods with no callees are leaves. 405 | if (method.Callees.Count == 0) 406 | { 407 | Leaves.Add(method); 408 | } 409 | } 410 | } 411 | 412 | public Dictionary Map; 413 | public HashSet Nodes; 414 | public HashSet Roots; 415 | public HashSet Leaves; 416 | 417 | public void DumpDot(string file) 418 | { 419 | using (StreamWriter outFile = File.CreateText(file)) 420 | { 421 | outFile.WriteLine("digraph CallGraph {"); 422 | foreach (Method m in Nodes) 423 | { 424 | outFile.WriteLine("\"{0:X8}-{1:X8}\" [ label=\"{2}\"];", m.Token, m.Hash, m.Name); 425 | 426 | foreach (Method p in m.Callees) 427 | { 428 | outFile.WriteLine("\"{0:X8}-{1:X8}\" -> \"{2:X8}-{3:X8}\";", 429 | m.Token, m.Hash, p.Token, p.Hash); 430 | } 431 | } 432 | outFile.WriteLine("}"); 433 | } 434 | } 435 | } 436 | 437 | // A node in an inline tree. 
438 | public class Inline 439 | { 440 | public double NumSubtrees() 441 | { 442 | double result = 1; 443 | foreach (Inline i in Inlines) 444 | { 445 | result *= (1 + i.NumSubtrees()); 446 | } 447 | return result; 448 | } 449 | 450 | public uint Token; 451 | public uint Hash; 452 | public uint Offset; 453 | public uint CollectData; 454 | public string Reason; 455 | public string Data; 456 | public Inline[] Inlines; 457 | 458 | public Inline ShallowCopy() 459 | { 460 | Inline x = new Inline(); 461 | x.Token = Token; 462 | x.Hash = Hash; 463 | x.Offset = Offset; 464 | x.Reason = Reason; 465 | x.CollectData = 0; 466 | x.Inlines = new Inline[0]; 467 | return x; 468 | } 469 | 470 | public MethodId GetMethodId() 471 | { 472 | MethodId id = new MethodId(); 473 | id.Token = Token; 474 | id.Hash = Hash; 475 | return id; 476 | } 477 | public void Dump(int indent) 478 | { 479 | for (int i = 0; i < indent; i++) Console.Write(" "); 480 | Console.WriteLine("{0:X8} {1}", Token, Reason); 481 | foreach (Inline x in Inlines) 482 | { 483 | x.Dump(indent + 2); 484 | } 485 | } 486 | } 487 | 488 | // InlineForest describes the inline forest used for the run. 489 | public class InlineForest 490 | { 491 | public string Policy; 492 | public string DataSchema; 493 | public Method[] Methods; 494 | } 495 | 496 | // The benchmark of interest 497 | public class Benchmark 498 | { 499 | public string ShortName; 500 | public string FullPath; 501 | public int ExitCode; 502 | } 503 | 504 | // The results of running a benchmark 505 | public class Results 506 | { 507 | public Results() 508 | { 509 | Performance = new PerformanceData(); 510 | } 511 | 512 | public int ExitCode; 513 | public string LogFile; 514 | public bool Success; 515 | public InlineForest InlineForest; 516 | public Dictionary Methods; 517 | public PerformanceData Performance; 518 | public string Name; 519 | } 520 | 521 | public class InlineDelta : IComparable 522 | { 523 | public Method rootMethod; 524 | public MethodId inlineMethodId; 525 | public double pctDelta; 526 | public int index; 527 | public string subBench; 528 | public double confidence; 529 | public double instructionsDelta; 530 | public double callsDelta; 531 | public bool hasPerCallDelta; 532 | public double perCallDelta; 533 | public int CompareTo(InlineDelta other) 534 | { 535 | return -Math.Abs(pctDelta).CompareTo(Math.Abs(other.pctDelta)); 536 | } 537 | } 538 | 539 | public class Exploration : IComparable 540 | { 541 | public Results baseResults; 542 | public Results endResults; 543 | public Benchmark benchmark; 544 | 545 | // Consider benchmarks with fewer roots as better 546 | // candidates for exploration. 547 | public int CompareTo(Exploration other) 548 | { 549 | return endResults.Methods.Count() - other.endResults.Methods.Count(); 550 | } 551 | 552 | public void Explore(StreamWriter combinedDataFile, ref bool combinedHasHeader, Dictionary blacklist) 553 | { 554 | Console.WriteLine("$$$ Exploring significant perf diff in {0} between {1} and {2}", 555 | benchmark.ShortName, baseResults.Name, endResults.Name); 556 | 557 | // Summary of performance results 558 | List deltas = new List(); 559 | 560 | // Fully detailed result trees with performance data 561 | Dictionary recapturedData = new Dictionary(); 562 | 563 | // Similar but for call count reductions.... 564 | Dictionary recapturedCC = new Dictionary(); 565 | 566 | // Count methods in end results with inlines, and total subtree size. 
567 | int candidateCount = 0; 568 | int exploreCount = 0; 569 | foreach (Method m in endResults.Methods.Values) 570 | { 571 | int endCount = (int)m.InlineCount; 572 | if (endCount > 0) 573 | { 574 | candidateCount++; 575 | exploreCount += endCount; 576 | } 577 | } 578 | 579 | Console.WriteLine("$$$ Examining {0} methods, {1} inline combinations", candidateCount, exploreCount); 580 | if (blacklist != null) 581 | { 582 | Console.WriteLine("$$$ blacklist in use: {0} entries", blacklist.Count); 583 | } 584 | 585 | // Todo: order methods by call count. Find top N% of these. Determine callers (and up the tree) 586 | // Explore from there. 587 | 588 | // Explore each method with inlines. Arbitrarily bail after some number of explorations. 589 | int methodsExplored = 0; 590 | int inlinesExplored = 0; 591 | int perMethodExplorationLimit = 50; 592 | int perBenchmarkExplorationLimit = 1000; 593 | List methodsToExplore = new List(endResults.Methods.Values); 594 | methodsToExplore.Sort(Method.HasMoreCalls); 595 | 596 | foreach (Method rootMethod in methodsToExplore) 597 | { 598 | Console.WriteLine("$$$ InlinesExplored {0} MethodsExplored {1}", inlinesExplored, methodsExplored); 599 | Console.WriteLine("$$$ Exploring inlines for {0}", rootMethod.Name); 600 | 601 | // Only explore methods that had inlines 602 | int endCount = (int) rootMethod.InlineCount; 603 | 604 | if (endCount == 0) 605 | { 606 | Console.WriteLine("$$$ Skipping -- no inlines"); 607 | continue; 608 | } 609 | 610 | // Optionally just explore some paritcular root 611 | if ((Program.RootToken != null) && rootMethod.Token != Program.RootTokenValue) 612 | { 613 | Console.WriteLine("$$$ Skipping -- does not match specified root token {0}", Program.RootToken); 614 | continue; 615 | } 616 | 617 | // Don't bother exploring main since it won't be invoked via xperf 618 | // and so any apparent call count reductions from main will be misleading. 619 | if (rootMethod.Name.Equals("Main")) 620 | { 621 | Console.WriteLine("$$$ Skipping -- not driven by xunit-perf"); 622 | continue; 623 | } 624 | 625 | // Only expore methods that were called in the noinline run 626 | if (rootMethod.CallCount == 0) 627 | { 628 | Console.WriteLine("$$$ Skipping -- not called"); 629 | continue; 630 | } 631 | 632 | // Don't re-explore a method on the blacklist, unless we see significantly more calls to it than 633 | // we have ever seen before. This short-circuts exploration for common startup code and the like, 634 | // if we disable zap. 
635 | if (blacklist != null) 636 | { 637 | if (blacklist.ContainsKey(rootMethod.Hash)) 638 | { 639 | Console.WriteLine("$$$ method is on the blacklist"); 640 | ulong oldCallCount = blacklist[rootMethod.Hash]; 641 | 642 | if (rootMethod.CallCount <= 2 * oldCallCount) 643 | { 644 | Console.WriteLine("$$$ Skipping -- already explored this method with {0} calls, now seeing it with {1}", 645 | oldCallCount, rootMethod.CallCount); 646 | continue; 647 | } 648 | else 649 | { 650 | Console.WriteLine("$$$ will re-explore this method, previous had {0} calls, now seeing it with {1}", 651 | oldCallCount, rootMethod.CallCount); 652 | } 653 | } 654 | else 655 | { 656 | Console.WriteLine("$$$ method not on blacklist"); 657 | } 658 | } 659 | 660 | // Limit volume of exploration 661 | if (inlinesExplored >= perBenchmarkExplorationLimit) 662 | { 663 | Console.WriteLine("$$$ Reached benchmark limit of {0} explored inlines, moving on to next benchmark", 664 | perBenchmarkExplorationLimit); 665 | break; 666 | } 667 | 668 | if (endCount > perMethodExplorationLimit) 669 | { 670 | int newEndCount = perMethodExplorationLimit; 671 | Console.WriteLine("$$$ Limiting exploration for this root to {0} inlines out of {1}", newEndCount, endCount); 672 | endCount = newEndCount; 673 | } 674 | 675 | // Trim exploration here if full explore would put us over the limit 676 | if (inlinesExplored + endCount >= perBenchmarkExplorationLimit) 677 | { 678 | int newEndCount = perBenchmarkExplorationLimit - inlinesExplored; 679 | Console.WriteLine("$$$ Might hit limit of {0} inlines explored, trimming end count from {1} to {2}", 680 | perBenchmarkExplorationLimit, endCount, newEndCount); 681 | endCount = newEndCount; 682 | } 683 | 684 | // Add method to the blacklist, if we're keeping one. 685 | if (blacklist != null) 686 | { 687 | Console.WriteLine("$$$ adding {0} to blacklist with {1} calls", rootMethod.Name, rootMethod.CallCount); 688 | blacklist[rootMethod.Hash] = rootMethod.CallCount; 689 | } 690 | 691 | // Noinline perf is already "known" from the baseline, so exclude that here. 692 | // 693 | // The maximal subtree perf may not equal the end perf because the latter allows inlines 694 | // in all methods, and we're just inlining into one method at a time here. 695 | Console.WriteLine("$$$ [{0}] examining method {1} {2:X8} with {3} inlines and {4} permutations via BFS.", 696 | methodsExplored++, rootMethod.Name, rootMethod.Token, endCount, rootMethod.NumSubtrees() - 1); 697 | rootMethod.Dump(); 698 | 699 | // Now for the actual experiment. We're going to grow the method's inline tree from the 700 | // baseline tree (which is noinline) towards the end result tree. For sufficiently large trees 701 | // there are lots of intermediate subtrees. For now we just do a simple breadth-first linear 702 | // exploration. 703 | // 704 | // However, we'll measure the full tree first. If there's no significant diff between it 705 | // and the noinline tree, then we won't bother enumerating and measuring the remaining subtrees. 706 | Results[] explorationResults = new Results[endCount + 1]; 707 | explorationResults[0] = baseResults; 708 | 709 | // After we measure perf via xunit-perf, do a normal run to recapture inline observations. 710 | // We could enable observations in the perf run, but we'd get inline xml for all the xunit 711 | // scaffolding too. This way we get something minimal. 
712 | Results[] recaptureResults = new Results[endCount + 1]; 713 | recaptureResults[0] = baseResults; 714 | 715 | // Call count reduction at each step of the tree expansion 716 | double[] ccDeltas = new double[endCount + 1]; 717 | 718 | // We take advantage of the fact that for replay Xml, the default is to not inline. 719 | // So we only need to emit Xml for the methods we want to inline. Since we're only 720 | // inlining into one method, our forest just has one Method entry. 721 | InlineForest kForest = new InlineForest(); 722 | kForest.Policy = "ReplayPolicy"; 723 | kForest.Methods = new Method[1]; 724 | kForest.Methods[0] = rootMethod.ShallowCopy(); 725 | 726 | // Always explore methods with one or two possible inlines, since checking to see if the 727 | // exploration is worthwhile costs just as much as doing the exploration. 728 | // 729 | // If there are multiple inlines, then jump to the end to see if any of them matter. 730 | // If not then don't bother exploring the intermediate states. 731 | // 732 | // This might bias the exploration into visiting more good cases than "normal". 733 | if (rootMethod.InlineCount > 2) 734 | { 735 | // See if any inline in the tree has a perf impact. If not, don't bother exploring. 736 | ulong dontcare = 0; 737 | ExploreSubtree(kForest, endCount, rootMethod, benchmark, explorationResults, null, null, out dontcare); 738 | bool shouldExplore = CheckResults(explorationResults, endCount, 0); 739 | 740 | if (!shouldExplore) 741 | { 742 | Console.WriteLine("$$$ Full subtree perf NOT significant, skipping..."); 743 | continue; 744 | } 745 | else 746 | { 747 | Console.WriteLine("$$$ Full subtree perf significant, exploring..."); 748 | } 749 | } 750 | else 751 | { 752 | Console.WriteLine("$$$ Single/Double inline, exploring..."); 753 | } 754 | 755 | // Keep track of the current call count for each method. 756 | // Initial value is the base model's count. 757 | Dictionary callCounts = new Dictionary(baseResults.Methods.Count()); 758 | foreach (MethodId id in baseResults.Methods.Keys) 759 | { 760 | callCounts[id] = baseResults.Methods[id].CallCount; 761 | } 762 | 763 | // TODO: Every so often, rerun the noinline baseline, and see if we have baseline shift. 764 | 765 | ccDeltas[0] = 0; 766 | 767 | for (int k = 1; k <= endCount; k++) 768 | { 769 | inlinesExplored++; 770 | ulong ccDelta = 0; 771 | Inline lastInlineK = 772 | ExploreSubtree(kForest, k, rootMethod, benchmark, explorationResults, recaptureResults, callCounts, out ccDelta); 773 | ShowResults(explorationResults, k, k - 1, rootMethod, lastInlineK, deltas, ccDelta); 774 | ccDeltas[k] = ccDelta; 775 | } 776 | 777 | // Save off results for later processing. 
778 | recapturedData[rootMethod.getId()] = recaptureResults; 779 | recapturedCC[rootMethod.getId()] = ccDeltas; 780 | } 781 | 782 | // Sort deltas and display 783 | deltas.Sort(); 784 | Console.WriteLine("$$$ --- {0}: inlines in order of impact ---", endResults.Name); 785 | foreach (InlineDelta dd in deltas) 786 | { 787 | string currentMethodName = null; 788 | if (baseResults.Methods != null && baseResults.Methods.ContainsKey(dd.inlineMethodId)) 789 | { 790 | currentMethodName = baseResults.Methods[dd.inlineMethodId].Name; 791 | } 792 | else 793 | { 794 | currentMethodName = dd.inlineMethodId.ToString(); 795 | } 796 | 797 | Console.Write("$$$ --- [{0,2:D2}] {1,12} -> {2,-12} {3,6:0.00}%", 798 | dd.index, dd.rootMethod.Name, currentMethodName, dd.pctDelta); 799 | if (dd.hasPerCallDelta) 800 | { 801 | Console.Write(" {0,10:0.00} pc", dd.perCallDelta); 802 | } 803 | Console.WriteLine(); 804 | } 805 | 806 | // Build integrated data model... 807 | string dataModelName = String.Format("{0}-{1}-data-model.csv", benchmark.ShortName, endResults.Name); 808 | string dataModelFileName = Path.Combine(Program.RESULTS_DIR, dataModelName); 809 | bool hasHeader = false; 810 | char[] comma = new char[] { ',' }; 811 | using (StreamWriter dataModelFile = File.CreateText(dataModelFileName)) 812 | { 813 | foreach (MethodId methodId in recapturedData.Keys) 814 | { 815 | Results[] resultsSet = recapturedData[methodId]; 816 | double[] ccDeltas = recapturedCC[methodId]; 817 | 818 | // resultsSet[0] is the noinline run. We don't have a entry 819 | // for it, but key column values are spilled into the inline Xml and 820 | // so deserialized into method entries. 821 | if (!baseResults.Methods.ContainsKey(methodId)) 822 | { 823 | Console.WriteLine("!!! Odd -- no base data for root {0}", methodId); 824 | continue; 825 | } 826 | 827 | int baseMethodHotSize = (int) baseResults.Methods[methodId].HotSize; 828 | int baseMethodColdSize = (int) baseResults.Methods[methodId].ColdSize; 829 | int baseMethodJitTime = (int) baseResults.Methods[methodId].JitTime; 830 | ulong baseMethodCallCount = baseResults.Methods[methodId].CallCount; 831 | 832 | for (int i = 1; i < resultsSet.Length; i++) 833 | { 834 | Results rK = resultsSet[i]; 835 | Results rKm1 = resultsSet[i - 1]; 836 | 837 | if (rK == null || rKm1 == null) 838 | { 839 | continue; 840 | } 841 | 842 | // Load up the recapture xml 843 | XElement root = XElement.Load(rK.LogFile); 844 | 845 | // Look for the embedded inliner observation schema 846 | IEnumerable schemas = from el in root.Descendants("DataSchema") select el; 847 | XElement schema = schemas.First(); 848 | string schemaString = (string)schema; 849 | // Add on the performance data column headers 850 | string extendedSchemaString = 851 | "Benchmark,SubBenchmark," + 852 | schemaString + 853 | ",HotSizeDelta,ColdSizeDelta,JitTimeDelta,InstRetiredDelta,InstRetired,InstRetiredSD" + 854 | ",CallDelta,InstRetiredPerCallDelta,RootCallCount,InstRetiredPerRootCallDelta,Confidence"; 855 | 856 | // If we haven't yet emitted a local header, do so now. 
857 | if (!hasHeader) 858 | { 859 | dataModelFile.WriteLine(extendedSchemaString); 860 | hasHeader = true; 861 | } 862 | 863 | // Similarly for the combined data file 864 | if (!combinedHasHeader) 865 | { 866 | combinedDataFile.WriteLine(extendedSchemaString); 867 | combinedHasHeader = true; 868 | } 869 | 870 | // Figure out relative position of a few key columns 871 | string[] columnNames = schemaString.Split(comma); 872 | int hotSizeIndex = -1; 873 | int coldSizeIndex = -1; 874 | int jitTimeIndex = -1; 875 | int index = 0; 876 | foreach (string s in columnNames) 877 | { 878 | switch (s) 879 | { 880 | case "HotSize": 881 | hotSizeIndex = index; 882 | break; 883 | case "ColdSize": 884 | coldSizeIndex = index; 885 | break; 886 | case "JitTime": 887 | jitTimeIndex = index; 888 | break; 889 | } 890 | 891 | index++; 892 | } 893 | 894 | // Find the embededed inline observation data 895 | IEnumerable data = from el in root.Descendants("Data") select el; 896 | string dataString = (string)data.First(); 897 | string[] dataStringX = dataString.Split(comma); 898 | 899 | // Split out the observations that we need for extended info. 900 | int currentMethodHotSize = hotSizeIndex >= 0 ? Int32.Parse(dataStringX[hotSizeIndex]) : 0; 901 | int currentMethodColdSize = coldSizeIndex >= 0 ? Int32.Parse(dataStringX[coldSizeIndex]) : 0; 902 | int currentMethodJitTime = jitTimeIndex >= 0 ? Int32.Parse(dataStringX[jitTimeIndex]) : 0; 903 | double currentCCDelta = ccDeltas[i]; 904 | 905 | // How to handle data from multi-part benchmarks? 906 | // Aggregate it here, iteration-wise 907 | int subParts = rK.Performance.InstructionCount.Keys.Count; 908 | List arKData = null; 909 | List arKm1Data = null; 910 | foreach (string subBench in rK.Performance.InstructionCount.Keys) 911 | { 912 | if (!rK.Performance.InstructionCount.ContainsKey(subBench)) 913 | { 914 | Console.WriteLine("!!! Odd -- no data for root {0} on {1} at index {2}", 915 | methodId, subBench, i); 916 | break; 917 | } 918 | 919 | if (!rKm1.Performance.InstructionCount.ContainsKey(subBench)) 920 | { 921 | Console.WriteLine("!!! Odd -- no data for root {0} on {1} at index {2}", 922 | methodId, subBench, i - 1); 923 | break; 924 | } 925 | 926 | List rKData = rK.Performance.InstructionCount[subBench]; 927 | List rKm1Data = rKm1.Performance.InstructionCount[subBench]; 928 | 929 | if (arKData == null) 930 | { 931 | // Occasionally we'll lose xunit perf data, for reasons unknown 932 | if (rKData.Count != rKm1Data.Count) 933 | { 934 | Console.WriteLine("!!! Odd -- mismatched data for root {0} on {1} at index {2}", 935 | methodId, subBench, i); 936 | break; 937 | } 938 | 939 | // Copy first sub bench's data 940 | arKData = new List(rKData); 941 | arKm1Data = new List(rKm1Data); 942 | } 943 | else 944 | { 945 | // Accumulate remainder 946 | for (int ii = 0; ii < arKData.Count; ii++) 947 | { 948 | arKData[ii] += rKData[ii]; 949 | arKm1Data[ii] += rKm1Data[ii]; 950 | } 951 | } 952 | } 953 | 954 | if (arKData == null) 955 | { 956 | Console.WriteLine("!!! bailing out on index {0}", i); 957 | continue; 958 | } 959 | 960 | double confidence = PerformanceData.Confidence(arKData, arKm1Data); 961 | double arKAvg = PerformanceData.Average(arKData); 962 | double arKm1Avg = PerformanceData.Average(arKm1Data); 963 | double arKSD = PerformanceData.StdDeviation(arKData); 964 | double change = arKAvg - arKm1Avg; 965 | // Number of instructions saved per call to the current inlinee 966 | double perCallDelta = (currentCCDelta == 0) ? 
0 : change / currentCCDelta; 967 | // Number of instructions saved per call to the root method 968 | double perRootDelta = (baseMethodCallCount == 0) ? 0 : change / baseMethodCallCount; 969 | 970 | int hotSizeDelta = currentMethodHotSize - baseMethodHotSize; 971 | int coldSizeDelta = currentMethodColdSize - baseMethodColdSize; 972 | int jitTimeDelta = currentMethodJitTime - baseMethodJitTime; 973 | int oneMillion = 1000 * 1000; 974 | 975 | dataModelFile.WriteLine("{0},{1},{2},{3},{4},{5},{6:0.00},{7:0.00},{8:0.00},{9:0.00},{10:0.00},{11:0.00},{12:0.00},{13:0.00}", 976 | benchmark.ShortName, "agg", 977 | dataString, 978 | hotSizeDelta, coldSizeDelta, jitTimeDelta, 979 | change / oneMillion, arKAvg / oneMillion, arKSD/ oneMillion, currentCCDelta, perCallDelta, 980 | baseMethodCallCount, perRootDelta, confidence); 981 | 982 | combinedDataFile.WriteLine("{0},{1},{2},{3},{4},{5},{6:0.00},{7:0.00},{8:0.00},{9:0.00},{10:0.00},{11:0.00},{12:0.00},{13:0.00}", 983 | benchmark.ShortName, "agg", 984 | dataString, 985 | hotSizeDelta, coldSizeDelta, jitTimeDelta, 986 | change / oneMillion, arKAvg / oneMillion, arKSD / oneMillion, currentCCDelta, perCallDelta, 987 | baseMethodCallCount, perRootDelta, confidence); 988 | 989 | baseMethodHotSize = currentMethodHotSize; 990 | baseMethodColdSize = currentMethodColdSize; 991 | baseMethodJitTime = currentMethodJitTime; 992 | } 993 | } 994 | } 995 | } 996 | 997 | Inline ExploreSubtree(InlineForest kForest, int k, Method rootMethod, 998 | Benchmark benchmark, Results[] explorationResults, Results[] recaptureResults, 999 | Dictionary callCounts, out ulong ccDelta) 1000 | { 1001 | ccDelta = 0; 1002 | 1003 | // Build inline subtree for method with first K nodes and swap it into the tree. 1004 | int index = 0; 1005 | Inline currentInline = null; 1006 | Inline[] mkInlines = rootMethod.GetBfsSubtree(k, out currentInline); 1007 | 1008 | if (mkInlines == null) 1009 | { 1010 | Console.WriteLine("$$$ {0} [{1}] Can't get this inline subtree yet, sorry", rootMethod.Name, k); 1011 | return null; 1012 | } 1013 | 1014 | kForest.Methods[index].Inlines = mkInlines; 1015 | kForest.Methods[index].InlineCount = (uint) k; 1016 | 1017 | // Externalize the inline xml 1018 | XmlSerializer xo = new XmlSerializer(typeof(InlineForest)); 1019 | string testName = String.Format("{0}-{1}-{2:X8}-{3}", benchmark.ShortName, endResults.Name, rootMethod.Token, k); 1020 | string xmlName = testName + ".xml"; 1021 | string resultsDir = Program.RESULTS_DIR; 1022 | string replayFileName = Path.Combine(resultsDir, xmlName); 1023 | using (Stream xmlOutFile = new FileStream(replayFileName, FileMode.Create)) 1024 | { 1025 | xo.Serialize(xmlOutFile, kForest); 1026 | } 1027 | 1028 | // Run the test and record the perf results. 
1029 | XunitPerfRunner x = new XunitPerfRunner(); 1030 | Configuration c = new Configuration(testName); 1031 | c.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1032 | c.Environment["COMPlus_JitInlineReplayFile"] = replayFileName; 1033 | Results resultsK = x.RunBenchmark(benchmark, c); 1034 | explorationResults[k] = resultsK; 1035 | 1036 | if (recaptureResults != null) 1037 | { 1038 | // Run test and recapture the inline XML along with observational data about the last inline 1039 | string retestName = String.Format("{0}-{1}-{2:X8}-{3}-data", benchmark.ShortName, endResults.Name, rootMethod.Token, k); 1040 | Configuration cr = new Configuration(retestName); 1041 | CoreClrRunner clr = new CoreClrRunner(); 1042 | cr.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1043 | cr.Environment["COMPlus_JitInlineReplayFile"] = replayFileName; 1044 | // Ask for "minimal" replay XML here 1045 | cr.Environment["COMPlus_JitInlineDumpXml"] = "2"; 1046 | cr.Environment["COMPlus_JitInlineDumpData"] = "1"; 1047 | Results resultsClr = clr.RunBenchmark(benchmark, cr); 1048 | // Snag performance data from above 1049 | resultsClr.Performance = resultsK.Performance; 1050 | recaptureResults[k] = resultsClr; 1051 | } 1052 | 1053 | // Run and capture method call counts 1054 | // 1055 | // Note if we've really done a pure isolation experiment than there should be at most 1056 | // one method whose call count changes. Might be interesting to try and verify this! 1057 | // (would require zap disable or similar so we get call counts for all methods) 1058 | if (Program.CaptureCallCounts && callCounts != null) 1059 | { 1060 | string callCountName = String.Format("{0}-{1}-{2:X8}-{3}-cc", benchmark.ShortName, endResults.Name, rootMethod.Token, k); 1061 | Configuration cc = new Configuration(callCountName); 1062 | CoreClrRunner clr = new CoreClrRunner(); 1063 | cc.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1064 | cc.Environment["COMPlus_JitInlineReplayFile"] = replayFileName; 1065 | // Ask for method entry instrumentation 1066 | cc.Environment["COMPlus_JitMeasureEntryCounts"] = "1"; 1067 | Results resultsCC = clr.RunBenchmark(benchmark, cc); 1068 | 1069 | MethodId currentId = currentInline.GetMethodId(); 1070 | bool foundcc = false; 1071 | // Parse results back and find call count for the current inline. 1072 | using (StreamReader callCountStream = File.OpenText(resultsCC.LogFile)) 1073 | { 1074 | string callCountLine = callCountStream.ReadLine(); 1075 | while (callCountLine != null) 1076 | { 1077 | string[] callCountFields = callCountLine.Split(new char[] { ',' }); 1078 | if (callCountFields.Length == 3) 1079 | { 1080 | uint token = UInt32.Parse(callCountFields[0], System.Globalization.NumberStyles.HexNumber); 1081 | uint hash = UInt32.Parse(callCountFields[1], System.Globalization.NumberStyles.HexNumber); 1082 | ulong count = UInt64.Parse(callCountFields[2]); 1083 | 1084 | if (token == currentId.Token && hash == currentId.Hash) 1085 | { 1086 | foundcc = true; 1087 | 1088 | if (callCounts.ContainsKey(currentId)) 1089 | { 1090 | // Note we expect it not to increase! 1091 | // 1092 | // Zero is possible if we inline at a call site that was not hit. 1093 | // We may even see perf impact with zero call count change, 1094 | // because of changes elsewhere in the method in code that is hit. 
1095 | ulong oldCount = callCounts[currentId]; 1096 | callCounts[currentId] = count; 1097 | Console.WriteLine("Call count for {0:X8}-{1:X8} went from {2} to {3}", 1098 | token, hash, oldCount, count); 1099 | ccDelta = oldCount - count; 1100 | if (ccDelta < 0) 1101 | { 1102 | Console.WriteLine("Call count unexpectedly increased!"); 1103 | } 1104 | } 1105 | else 1106 | { 1107 | // Don't really expect to hit this.. we'll never see this method as a root. 1108 | Console.WriteLine("Call count for {0:X8}-{1:X8} went from {2} to {3}", 1109 | token, hash, "unknown", count); 1110 | } 1111 | break; 1112 | } 1113 | } 1114 | 1115 | callCountLine = callCountStream.ReadLine(); 1116 | } 1117 | } 1118 | 1119 | if (!foundcc) 1120 | { 1121 | // The method was evidently not called in the latest run. 1122 | if (callCounts.ContainsKey(currentId)) 1123 | { 1124 | // It was called in earlier runs, so assume we've inlined the last call. 1125 | ccDelta = callCounts[currentId]; 1126 | Console.WriteLine("### No (after) call count entry for {0:X8}-{1:X8}. Assuming all calls inlined. ccdelta = {2}.", 1127 | currentId.Token, currentId.Hash, ccDelta); 1128 | } 1129 | else 1130 | { 1131 | // It was not called in earlier runs, assume it was never called. 1132 | ccDelta = 0; 1133 | Console.WriteLine("### No (before) call count entry for {0:X8}-{1:X8}. Assuming method never called. ccdelta = 0.", 1134 | currentId.Token, currentId.Hash); 1135 | } 1136 | 1137 | // Going forward, we don't expect to see this method be called 1138 | callCounts[currentId] = 0; 1139 | } 1140 | } 1141 | 1142 | return currentInline; 1143 | } 1144 | 1145 | // Determine confidence level that performance differs in the two indicated 1146 | // result sets. 1147 | // 1148 | // If we can't tell the difference between the two, it may 1149 | // mean either (a) the method or call site was never executed, or (b) 1150 | // the inlines had no perf impact. 1151 | // 1152 | // We could still add this info to our model, since the jit won't generally 1153 | // be able to tell if a callee will be executed, but for now we just look 1154 | // for impactful changes. 1155 | bool CheckResults(Results[] explorationResults, int diffIndex, int baseIndex) 1156 | { 1157 | Results baseResults = explorationResults[baseIndex]; 1158 | Results diffResults = explorationResults[diffIndex]; 1159 | 1160 | // Make sure runs happened. Might not if we couldn't find the base method. 1161 | if (baseResults == null) 1162 | { 1163 | Console.WriteLine("$$$ Can't get base run data, sorry"); 1164 | return false; 1165 | } 1166 | 1167 | if (diffResults == null) 1168 | { 1169 | Console.WriteLine("$$$ Can't get diff run data, sorry"); 1170 | return false; 1171 | } 1172 | 1173 | bool signficant = false; 1174 | 1175 | foreach (string subBench in baseResults.Performance.InstructionCount.Keys) 1176 | { 1177 | List baseData = baseResults.Performance.InstructionCount[subBench]; 1178 | List diffData = diffResults.Performance.InstructionCount[subBench]; 1179 | double confidence = PerformanceData.Confidence(baseData, diffData); 1180 | 1181 | signficant |= (confidence > 0.8); 1182 | } 1183 | 1184 | return signficant; 1185 | } 1186 | void ShowResults(Results[] explorationResults, int diffIndex, int baseIndex, 1187 | Method rootMethod, Inline currentInline, List deltas, ulong ccDelta) 1188 | { 1189 | Results zeroResults = explorationResults[0]; 1190 | Results baseResults = explorationResults[baseIndex]; 1191 | Results diffResults = explorationResults[diffIndex]; 1192 | 1193 | // Make sure runs happened. 
Might not if we couldn't find the base method. 1194 | if (zeroResults == null) 1195 | { 1196 | Console.WriteLine("$$$ Can't get noinline run data, sorry"); 1197 | return; 1198 | } 1199 | 1200 | if (baseResults == null) 1201 | { 1202 | Console.WriteLine("$$$ Can't get base run data, sorry"); 1203 | return; 1204 | } 1205 | 1206 | if (diffResults == null) 1207 | { 1208 | Console.WriteLine("$$$ Can't get diff run data, sorry"); 1209 | return; 1210 | } 1211 | 1212 | // Try and get the name of the last inline. 1213 | // We may not know it, if the method was prejitted, since it will 1214 | // never be a jit root. 1215 | // If so, use the token value. 1216 | MethodId currentMethodId = currentInline.GetMethodId(); 1217 | string currentMethodName = null; 1218 | if (baseResults.Methods != null && baseResults.Methods.ContainsKey(currentMethodId)) 1219 | { 1220 | currentMethodName = baseResults.Methods[currentMethodId].Name; 1221 | } 1222 | else 1223 | { 1224 | currentMethodName = String.Format("Token {0:X8} Hash {1:X8}", 1225 | currentMethodId.Token, currentMethodId.Hash); 1226 | } 1227 | 1228 | Console.WriteLine("$$$ Root {0} index {1} inlining {2}", rootMethod.Name, diffIndex, currentMethodName); 1229 | 1230 | foreach (string subBench in baseResults.Performance.InstructionCount.Keys) 1231 | { 1232 | List zeroData = zeroResults.Performance.InstructionCount[subBench]; 1233 | List baseData = baseResults.Performance.InstructionCount[subBench]; 1234 | List diffData = diffResults.Performance.InstructionCount[subBench]; 1235 | 1236 | double confidence = PerformanceData.Confidence(baseData, diffData); 1237 | double baseAvg = PerformanceData.Average(baseData); 1238 | double diffAvg = PerformanceData.Average(diffData); 1239 | double change = diffAvg - baseAvg; 1240 | double pctDiff = 100.0 * change / baseAvg; 1241 | 1242 | double confidence0 = PerformanceData.Confidence(baseData, diffData); 1243 | double zeroAvg = PerformanceData.Average(zeroData); 1244 | double change0 = diffAvg - zeroAvg; 1245 | double pctDiff0 = 100.0 * change0 / zeroAvg; 1246 | 1247 | Console.WriteLine("{0:30}: base {1:0.00}M new {2:0.00}M delta {3:0.00}M ({4:0.00}%) confidence {5:0.00}", 1248 | subBench, 1249 | baseAvg / (1000 * 1000), diffAvg / (1000 * 1000), 1250 | change / (1000 * 1000), pctDiff, confidence); 1251 | Console.Write("{0:30} noinl {1:0.00}M delta {2:0.00}M ({3:0.00}%) confidence {4:0.00}", 1252 | "", zeroAvg / (1000 * 1000), change0 / (1000 * 1000), pctDiff0, confidence0); 1253 | 1254 | if (ccDelta != 0) 1255 | { 1256 | Console.Write(" cc-delta {0} ipc {1:0.00}", ccDelta, change / ccDelta ); 1257 | } 1258 | 1259 | Console.WriteLine(); 1260 | 1261 | if (deltas != null) 1262 | { 1263 | InlineDelta d = new InlineDelta(); 1264 | 1265 | d.rootMethod = rootMethod; 1266 | d.inlineMethodId = currentMethodId; 1267 | d.pctDelta = pctDiff; 1268 | d.index = diffIndex; 1269 | d.subBench = subBench; 1270 | d.confidence = confidence; 1271 | d.instructionsDelta = change; 1272 | if (ccDelta != 0) 1273 | { 1274 | d.hasPerCallDelta = true; 1275 | d.perCallDelta = change / ccDelta; 1276 | d.callsDelta = ccDelta; 1277 | } 1278 | 1279 | deltas.Add(d); 1280 | } 1281 | } 1282 | } 1283 | } 1284 | 1285 | // A mechanism to run the benchmark 1286 | public abstract class Runner 1287 | { 1288 | public abstract Results RunBenchmark(Benchmark b, Configuration c); 1289 | } 1290 | 1291 | public class CoreClrRunner : Runner 1292 | { 1293 | public CoreClrRunner() 1294 | { 1295 | cmdExe = Program.SHELL; 1296 | runnerExe = Program.CORERUN; 1297 | } 1298 | 
1299 | public override Results RunBenchmark(Benchmark b, Configuration c) 1300 | { 1301 | // Make sure there's an exe to run. 1302 | if (!File.Exists(runnerExe)) 1303 | { 1304 | Console.WriteLine("Can't find runner exe: '{0}'", runnerExe); 1305 | return null; 1306 | } 1307 | 1308 | // Setup process information 1309 | System.Diagnostics.Process runnerProcess = new Process(); 1310 | runnerProcess.StartInfo.FileName = cmdExe; 1311 | string stderrName = c.ResultsDirectory + @"\" + b.ShortName + "-" + c.Name + ".xml"; 1312 | 1313 | foreach (string envVar in c.Environment.Keys) 1314 | { 1315 | runnerProcess.StartInfo.Environment[envVar] = c.Environment[envVar]; 1316 | } 1317 | runnerProcess.StartInfo.Environment["CORE_ROOT"] = Path.GetDirectoryName(runnerExe); 1318 | runnerProcess.StartInfo.Arguments = "/C \"" + runnerExe + " " + b.FullPath + " 2> " + stderrName + "\""; 1319 | runnerProcess.StartInfo.WorkingDirectory = System.IO.Path.GetDirectoryName(b.FullPath); 1320 | runnerProcess.StartInfo.UseShellExecute = false; 1321 | 1322 | if (Program.VeryVerbose) 1323 | { 1324 | Console.WriteLine("CoreCLR: launching " + runnerProcess.StartInfo.Arguments); 1325 | } 1326 | 1327 | runnerProcess.Start(); 1328 | runnerProcess.WaitForExit(); 1329 | 1330 | if (Program.Verbose) 1331 | { 1332 | Console.WriteLine("CoreCLR: Finished running {0} -- configuration: {1}, exit code: {2} (expected {3})", 1333 | b.ShortName, c.Name, runnerProcess.ExitCode, b.ExitCode); 1334 | } 1335 | 1336 | Results results = new Results(); 1337 | results.Success = (b.ExitCode == runnerProcess.ExitCode); 1338 | results.ExitCode = b.ExitCode; 1339 | results.LogFile = stderrName; 1340 | results.Name = c.Name; 1341 | 1342 | // TODO: Iterate to get perf data 1343 | List timeData = new List(1); 1344 | timeData.Add(runnerProcess.ExitTime.Subtract(runnerProcess.StartTime).TotalMilliseconds); 1345 | results.Performance.ExecutionTime[b.ShortName] = timeData; 1346 | return results; 1347 | } 1348 | 1349 | private string runnerExe; 1350 | private string cmdExe; 1351 | } 1352 | 1353 | public class XunitPerfRunner : Runner 1354 | { 1355 | public XunitPerfRunner() 1356 | { 1357 | SetupSandbox(); 1358 | } 1359 | 1360 | void SetupSandbox() 1361 | { 1362 | // Only do this once per run 1363 | if (sandboxIsSetup) 1364 | { 1365 | return; 1366 | } 1367 | 1368 | if (Directory.Exists(sandboxDir)) 1369 | { 1370 | if (Program.Verbose) 1371 | { 1372 | Console.WriteLine("...Cleaning old xunit-perf sandbox '{0}'", sandboxDir); 1373 | } 1374 | Directory.Delete(sandboxDir, true); 1375 | } 1376 | 1377 | if (Program.Verbose) 1378 | { 1379 | Console.WriteLine("...Creating new xunit-perf sandbox '{0}'", sandboxDir); 1380 | } 1381 | Directory.CreateDirectory(sandboxDir); 1382 | DirectoryInfo sandboxDirectoryInfo = new DirectoryInfo(sandboxDir); 1383 | 1384 | // Copy over xunit packages 1385 | string xUnitPerfRunner = Path.Combine(coreclrRoot, @"packages\Microsoft.DotNet.xunit.performance.runner.Windows\1.0.0-alpha-build0040\tools"); 1386 | string xUnitPerfAnalysis = Path.Combine(coreclrRoot, @"packages\Microsoft.DotNet.xunit.performance.analysis\1.0.0-alpha-build0040\tools"); 1387 | string xUnitPerfConsole = Path.Combine(coreclrRoot, @"packages\xunit.console.netcore\1.0.2-prerelease-00177\runtimes\any\native"); 1388 | 1389 | CopyAll(new DirectoryInfo(xUnitPerfRunner), sandboxDirectoryInfo); 1390 | CopyAll(new DirectoryInfo(xUnitPerfConsole), sandboxDirectoryInfo); 1391 | CopyAll(new DirectoryInfo(xUnitPerfAnalysis), sandboxDirectoryInfo); 1392 | CopyAll(new 
DirectoryInfo(testOverlayRoot), sandboxDirectoryInfo); 1393 | 1394 | sandboxIsSetup = true; 1395 | } 1396 | 1397 | public static void CopyAll(DirectoryInfo source, DirectoryInfo target) 1398 | { 1399 | Directory.CreateDirectory(target.FullName); 1400 | 1401 | // Copy each file into the new directory. 1402 | foreach (FileInfo fi in source.GetFiles()) 1403 | { 1404 | fi.CopyTo(Path.Combine(target.FullName, fi.Name), true); 1405 | } 1406 | 1407 | // Copy each subdirectory using recursion. 1408 | foreach (DirectoryInfo diSourceSubDir in source.GetDirectories()) 1409 | { 1410 | DirectoryInfo nextTargetSubDir = 1411 | target.CreateSubdirectory(diSourceSubDir.Name); 1412 | CopyAll(diSourceSubDir, nextTargetSubDir); 1413 | } 1414 | } 1415 | 1416 | // See if there's some way to just run a particular sub benchmark? 1417 | public override Results RunBenchmark(Benchmark b, Configuration c) 1418 | { 1419 | // Copy benchmark to sandbox 1420 | string benchmarkFile = Path.GetFileName(b.FullPath); 1421 | File.Copy(b.FullPath, Path.Combine(sandboxDir, benchmarkFile), true); 1422 | 1423 | // Setup process information 1424 | System.Diagnostics.Process runnerProcess = new Process(); 1425 | runnerProcess.StartInfo.FileName = Path.Combine(sandboxDir, "xunit.performance.run.exe"); 1426 | string perfName = c.Name + "-" + b.ShortName; 1427 | 1428 | foreach (string envVar in c.Environment.Keys) 1429 | { 1430 | runnerProcess.StartInfo.Environment[envVar] = c.Environment[envVar]; 1431 | } 1432 | runnerProcess.StartInfo.Environment["CORE_ROOT"] = sandboxDir; 1433 | runnerProcess.StartInfo.Environment["XUNIT_PERFORMANCE_MIN_ITERATION"] = Program.MinIterations.ToString(); 1434 | runnerProcess.StartInfo.Environment["XUNIT_PERFORMANCE_MAX_ITERATION"] = Program.MaxIterations.ToString(); 1435 | 1436 | runnerProcess.StartInfo.Arguments = benchmarkFile + 1437 | " -nologo -runner xunit.console.netcore.exe -runnerhost corerun.exe -runid " + 1438 | perfName + 1439 | (Program.ClassFilter == null ? "" : " -class " + Program.ClassFilter) + 1440 | (Program.MethodFilter == null ? "" : " -method " + Program.MethodFilter); 1441 | 1442 | runnerProcess.StartInfo.WorkingDirectory = sandboxDir; 1443 | runnerProcess.StartInfo.UseShellExecute = false; 1444 | 1445 | if (Program.VeryVerbose) 1446 | { 1447 | Console.WriteLine("xUnitPerf: launching " + runnerProcess.StartInfo.Arguments); 1448 | } 1449 | 1450 | runnerProcess.Start(); 1451 | runnerProcess.WaitForExit(); 1452 | 1453 | if (Program.VeryVerbose) 1454 | { 1455 | // Xunit doesn't run Main so no 100 exit here. 1456 | Console.WriteLine("xUnitPerf: Finished running {0} -- configuration: {1}, exit code: {2}", 1457 | b.ShortName, c.Name, runnerProcess.ExitCode); 1458 | } 1459 | 1460 | // Parse iterations out of perf-*.xml 1461 | string xmlPerfResultsFile = Path.Combine(sandboxDir, perfName) + ".xml"; 1462 | XElement root = XElement.Load(xmlPerfResultsFile); 1463 | IEnumerable subBenchmarks = from el in root.Descendants("test") select el; 1464 | 1465 | // We keep the raw iterations results and just summarize here. 
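            // A rough sketch of the per-run results file parsed below, with element and
            // attribute names inferred from the queries that follow (the real file produced
            // by xunit-performance has additional structure, and the root element name is
            // not shown here):
            //
            //   <root>
            //     <test name="SomeBenchmark.SubCase">
            //       <iteration index="0" Duration="123.4" InstRetired="45678901" />
            //       <iteration index="1" Duration="120.1" InstRetired="45612345" />
            //     </test>
            //   </root>
            //
            // Iteration index "0" is excluded from the summaries (presumably a warmup run).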
1466 | Results results = new Results(); 1467 | PerformanceData perfData = results.Performance; 1468 | 1469 | foreach (XElement sub in subBenchmarks) 1470 | { 1471 | string subName = (string)sub.Attribute("name"); 1472 | 1473 | IEnumerable iExecutionTimes = 1474 | from el in sub.Descendants("iteration") 1475 | where el.Attribute("Duration") != null && (string)el.Attribute("index") != "0" 1476 | select Double.Parse((string)el.Attribute("Duration")); 1477 | 1478 | IEnumerable iInstructionsRetired = 1479 | from el in sub.Descendants("iteration") 1480 | where el.Attribute("InstRetired") != null && (string)el.Attribute("index") != "0" 1481 | select Double.Parse((string)el.Attribute("InstRetired")); 1482 | 1483 | perfData.ExecutionTime[subName] = new List(iExecutionTimes); 1484 | perfData.InstructionCount[subName] = new List(iInstructionsRetired); 1485 | } 1486 | 1487 | if (Program.Verbose) 1488 | { 1489 | perfData.Print(c.Name); 1490 | } 1491 | 1492 | results.Success = (b.ExitCode == runnerProcess.ExitCode); 1493 | results.ExitCode = b.ExitCode; 1494 | results.LogFile = ""; 1495 | results.Name = c.Name; 1496 | 1497 | return results; 1498 | } 1499 | 1500 | static string sandboxDir = Program.SANDBOX_DIR; 1501 | static string coreclrRoot = Program.CORECLR_ROOT; 1502 | static string testOverlayRoot = Path.Combine(coreclrRoot, @"bin\tests\Windows_NT.x64.Release\tests\Core_Root"); 1503 | static bool sandboxIsSetup; 1504 | } 1505 | 1506 | public class Program 1507 | { 1508 | 1509 | // The noinline model is one where inlining is disabled. 1510 | // The inline forest here is minimal. 1511 | // 1512 | // An attributed profile of this model helps the tool 1513 | // identify areas for investigation. 1514 | Results BuildNoInlineModel(Runner r, Runner x, Benchmark b) 1515 | { 1516 | Console.WriteLine("----"); 1517 | Console.WriteLine("---- No Inline Model for {0}", b.ShortName); 1518 | 1519 | // Create empty inline replay XML 1520 | InlineForest emptyForest = new InlineForest(); 1521 | emptyForest.Policy = "ReplayPolicy"; 1522 | XmlSerializer emptySerializer = new XmlSerializer(typeof(InlineForest)); 1523 | string emptyXmlFile = String.Format("{0}-empy-replay.xml", b.ShortName); 1524 | string emptyXmlPath = Path.Combine(Program.RESULTS_DIR, emptyXmlFile); 1525 | using (Stream emptyXmlStream = new FileStream(emptyXmlPath, FileMode.Create)) 1526 | { 1527 | emptySerializer.Serialize(emptyXmlStream, emptyForest); 1528 | } 1529 | 1530 | // Replay with empty xml and recapture the full noinline xml. Latter will 1531 | // show all the methods that were jitted. 
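        // For reference, the empty replay file written above serializes to roughly:
        //
        //   <InlineForest>
        //     <Policy>ReplayPolicy</Policy>
        //   </InlineForest>
        //
        // (element names assumed from the InlineForest members under default XmlSerializer
        // rules; the exact output may differ). With no methods listed, the replay policy has
        // nothing to inline, which is what makes this the "noinline" configuration.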
1532 | Configuration noInlineConfig = new Configuration("noinl"); 1533 | noInlineConfig.ResultsDirectory = Program.RESULTS_DIR; 1534 | noInlineConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1535 | noInlineConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath; 1536 | noInlineConfig.Environment["COMPlus_JitInlineDumpXml"] = "1"; 1537 | 1538 | Results noInlineResults = r.RunBenchmark(b, noInlineConfig); 1539 | 1540 | if (noInlineResults == null || !noInlineResults.Success) 1541 | { 1542 | Console.WriteLine("Noinline run failed\n"); 1543 | return null; 1544 | } 1545 | 1546 | if (Program.ExploreInlines) 1547 | { 1548 | // Parse noinline xml 1549 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 1550 | InlineForest f; 1551 | Stream xmlFile = new FileStream(noInlineResults.LogFile, FileMode.Open); 1552 | try 1553 | { 1554 | f = (InlineForest)xml.Deserialize(xmlFile); 1555 | } 1556 | catch (System.Exception ex) 1557 | { 1558 | Console.WriteLine("Xml deserialization failed: " + ex.Message); 1559 | return null; 1560 | } 1561 | 1562 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 1563 | Console.WriteLine("*** Noinline config has {0} methods, {1} inlines", f.Methods.Length, inlineCount); 1564 | noInlineResults.InlineForest = f; 1565 | 1566 | // Determine set of unique method Ids and build map from ID to method 1567 | Dictionary idCounts = new Dictionary(); 1568 | Dictionary methods = new Dictionary(f.Methods.Length); 1569 | 1570 | foreach (Method m in f.Methods) 1571 | { 1572 | MethodId id = m.getId(); 1573 | methods[id] = m; 1574 | 1575 | if (idCounts.ContainsKey(id)) 1576 | { 1577 | idCounts[id]++; 1578 | } 1579 | else 1580 | { 1581 | idCounts[id] = 1; 1582 | } 1583 | } 1584 | 1585 | noInlineResults.Methods = methods; 1586 | 1587 | Console.WriteLine("*** Noinline config has {0} unique method IDs", idCounts.Count); 1588 | 1589 | foreach (MethodId m in idCounts.Keys) 1590 | { 1591 | uint count = idCounts[m]; 1592 | if (count > 1) 1593 | { 1594 | Console.WriteLine("*** MethodId Token:0x{0:X8} Hash:0x{1:X8} has {2} duplicates", m.Token, m.Hash, count); 1595 | } 1596 | } 1597 | 1598 | // Mark methods in noinline results that do not have unique IDs 1599 | foreach (Method m in f.Methods) 1600 | { 1601 | MethodId id = m.getId(); 1602 | if (idCounts[id] > 1) 1603 | { 1604 | m.MarkAsDuplicate(); 1605 | } 1606 | } 1607 | } 1608 | 1609 | // Get noinline perf numbers using empty replay xml 1610 | Configuration noinlinePerfConfig = new Configuration("noinline-perf"); 1611 | noinlinePerfConfig.ResultsDirectory = Program.RESULTS_DIR; 1612 | noinlinePerfConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1613 | noinlinePerfConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath; 1614 | Results perfResults = x.RunBenchmark(b, noinlinePerfConfig); 1615 | Console.WriteLine("-- Updating noinline results"); 1616 | noInlineResults.Performance = perfResults.Performance; 1617 | noInlineResults.Performance.Print(noInlineConfig.Name); 1618 | 1619 | // Get noinline method call counts 1620 | // Todo: use xunit runner and capture stderr? Downside is that xunit-perf 1621 | // entry points won't be in the baseline method set. 
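        // (The entry-count run logs CSV lines of the form "token,hash,count" -- token and
        // hash in hex, count in decimal, e.g. "06000001,1A2B3C4D,42" -- which is the format
        // AnnotateCallCounts parses later in this file. The example values here are made up.)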
1622 | if (CaptureCallCounts) 1623 | { 1624 | Configuration noInlineCallCountConfig = new Configuration("noinline-cc"); 1625 | noInlineCallCountConfig.ResultsDirectory = Program.RESULTS_DIR; 1626 | noInlineCallCountConfig.Environment["COMPlus_JitInlinePolicyReplay"] = "1"; 1627 | noInlineCallCountConfig.Environment["COMPlus_JitInlineReplayFile"] = emptyXmlPath; 1628 | noInlineCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1"; 1629 | Results ccResults = r.RunBenchmark(b, noInlineCallCountConfig); 1630 | 1631 | AnnotateCallCounts(ccResults, noInlineResults); 1632 | } 1633 | 1634 | return noInlineResults; 1635 | } 1636 | 1637 | // The legacy model reflects the current jit behavior. 1638 | // Scoring of runs will be relative to this data. 1639 | // The inherent noise level is also estimated here. 1640 | Results BuildLegacyModel(Runner r, Runner x, Benchmark b, bool enhanced = false) 1641 | { 1642 | string modelName = enhanced ? "EnhancedLegacy" : "Legacy"; 1643 | Console.WriteLine("----"); 1644 | Console.WriteLine("---- {0} Model for {1}", modelName, b.ShortName); 1645 | 1646 | Configuration legacyConfig = new Configuration(modelName); 1647 | legacyConfig.ResultsDirectory = Program.RESULTS_DIR; 1648 | legacyConfig.Environment["COMPlus_JitInlineDumpXml"] = "1"; 1649 | if (!enhanced) 1650 | { 1651 | legacyConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1"; 1652 | } 1653 | 1654 | Results legacyResults = r.RunBenchmark(b, legacyConfig); 1655 | 1656 | if (legacyResults == null || !legacyResults.Success) 1657 | { 1658 | Console.WriteLine("Legacy run failed\n"); 1659 | return null; 1660 | } 1661 | 1662 | if (Program.ExploreInlines) 1663 | { 1664 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 1665 | InlineForest f; 1666 | Stream xmlFile = new FileStream(legacyResults.LogFile, FileMode.Open); 1667 | f = (InlineForest)xml.Deserialize(xmlFile); 1668 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 1669 | Console.WriteLine("*** Legacy config has {0} methods, {1} inlines", f.Methods.Length, inlineCount); 1670 | legacyResults.InlineForest = f; 1671 | 1672 | // Populate the methodId -> method lookup table 1673 | Dictionary methods = new Dictionary(f.Methods.Length); 1674 | foreach (Method m in f.Methods) 1675 | { 1676 | MethodId id = m.getId(); 1677 | methods[id] = m; 1678 | } 1679 | legacyResults.Methods = methods; 1680 | } 1681 | 1682 | // Now get legacy perf numbers 1683 | Configuration legacyPerfConfig = new Configuration(modelName + "-perf"); 1684 | if (!enhanced) 1685 | { 1686 | legacyPerfConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1"; 1687 | } 1688 | legacyPerfConfig.ResultsDirectory = Program.RESULTS_DIR; 1689 | Results perfResults = x.RunBenchmark(b, legacyPerfConfig); 1690 | legacyResults.Performance = perfResults.Performance; 1691 | legacyResults.Performance.Print(legacyConfig.Name); 1692 | 1693 | // Get legacy method call counts 1694 | if (CaptureCallCounts) 1695 | { 1696 | Configuration legacyCallCountConfig = new Configuration(modelName + "cc"); 1697 | legacyCallCountConfig.ResultsDirectory = Program.RESULTS_DIR; 1698 | legacyCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1"; 1699 | if (!enhanced) 1700 | { 1701 | legacyCallCountConfig.Environment["COMPlus_JitInlinePolicyLegacy"] = "1"; 1702 | } 1703 | Results ccResults = r.RunBenchmark(b, legacyCallCountConfig); 1704 | 1705 | // Parse results back and annotate base method set 1706 | AnnotateCallCounts(ccResults, legacyResults); 1707 | } 1708 | 1709 | return 
legacyResults; 1710 | } 1711 | 1712 | // The full model creates an inline forest at some prescribed 1713 | // depth. The inline configurations that will be explored 1714 | // are sub-forests of this full forest. 1715 | Results BuildFullModel(Runner r, Runner x, Benchmark b, Results noinlineResults) 1716 | { 1717 | Console.WriteLine("----"); 1718 | Console.WriteLine("---- Full Model for {0}", b.ShortName); 1719 | 1720 | string resultsDir = Program.RESULTS_DIR; 1721 | // Because we're jitting and inlining some methods won't be jitted on 1722 | // their own at all. To unearth full trees for all methods we need 1723 | // to iterate. The rough idea is as follows. 1724 | // 1725 | // Run with FullPolicy for all methods. This will end up jitting 1726 | // some subset of methods seen in the noinline config. Compute this subset, 1727 | // collect up their trees, and then disable inlining for those methods. 1728 | // Rerun. This time around some of the methods missed in the first will 1729 | // be jitted and will grow inline trees. Collect these new trees and 1730 | // add those methods to the disabled set. Repeat until we've seen all methods. 1731 | // 1732 | // Unfortunately we don't have unique IDs for methods. To handle this we 1733 | // need to determine which methods do have unique IDs. 1734 | 1735 | // This is the count of noinline methods with unique IDs. 1736 | int methodCount = ExploreInlines ? noinlineResults.Methods.Count : 1; 1737 | 1738 | // We'll collect up these methods with their full trees here. 1739 | HashSet fullMethodIds = new HashSet(); 1740 | List fullMethods = new List(methodCount); 1741 | uint iteration = 0; 1742 | uint maxInlineCount = 0; 1743 | uint leafMethodCount = 0; 1744 | uint newMethodCount = 0; 1745 | Method maxInlineMethod = null; 1746 | bool failed = false; 1747 | 1748 | while (fullMethodIds.Count < methodCount + newMethodCount) 1749 | { 1750 | iteration++; 1751 | 1752 | Console.WriteLine("*** Full config -- iteration {0}, still need trees for {1} out of {2} methods", 1753 | iteration, methodCount + newMethodCount - fullMethodIds.Count, methodCount + newMethodCount); 1754 | 1755 | Configuration fullConfiguration = new Configuration("full-" + iteration); 1756 | fullConfiguration.ResultsDirectory = resultsDir; 1757 | fullConfiguration.Environment["COMPlus_JitInlinePolicyFull"] = "1"; 1758 | fullConfiguration.Environment["COMPlus_JitInlineDepth"] = "10"; 1759 | fullConfiguration.Environment["COMPlus_JitInlineSize"] = "200"; 1760 | fullConfiguration.Environment["COMPlus_JitInlineDumpXml"] = "1"; 1761 | 1762 | // Build an exclude string disabling inlining in all the methods we've 1763 | // collected so far. If there are no methods yet, don't bother. 
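            // (For illustration: with two methods already collected, the exclude string is
            // just their hash values separated by spaces, e.g. " 305419896 2596069104" --
            // decimal, since MethodId.Hash is appended directly below. The hashes shown here
            // are made up.)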
1764 | if (fullMethodIds.Count > 0) 1765 | { 1766 | StringBuilder sb = new StringBuilder(); 1767 | foreach (MethodId id in fullMethodIds) 1768 | { 1769 | sb.Append(" "); 1770 | sb.Append(id.Hash); 1771 | } 1772 | string excludeString = sb.ToString(); 1773 | // Console.WriteLine("*** exclude string: {0}\n", excludeString); 1774 | fullConfiguration.Environment["COMPlus_JitNoInlineRange"] = excludeString; 1775 | } 1776 | 1777 | // Run this iteration 1778 | Results currentResults = r.RunBenchmark(b, fullConfiguration); 1779 | 1780 | if (currentResults == null || !currentResults.Success) 1781 | { 1782 | failed = true; 1783 | Console.WriteLine("Full run failed\n"); 1784 | break; 1785 | } 1786 | 1787 | // Parse the resulting xml 1788 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 1789 | Stream xmlFile = new FileStream(currentResults.LogFile, FileMode.Open); 1790 | InlineForest f = (InlineForest) xml.Deserialize(xmlFile); 1791 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 1792 | Console.WriteLine("*** This iteration of full config has {0} methods, {1} inlines", f.Methods.Length, inlineCount); 1793 | currentResults.InlineForest = f; 1794 | 1795 | // Find the set of new methods that we saw 1796 | HashSet newMethodIds = new HashSet(); 1797 | foreach (Method m in f.Methods) 1798 | { 1799 | MethodId id = m.getId(); 1800 | 1801 | if (!fullMethodIds.Contains(id) && !newMethodIds.Contains(id)) 1802 | { 1803 | fullMethods.Add(m); 1804 | newMethodIds.Add(id); 1805 | 1806 | if (ExploreInlines && !noinlineResults.Methods.ContainsKey(id)) 1807 | { 1808 | // Need to figure out why this happens. 1809 | // 1810 | // Suspect we're inlining force inlines in the noinline model but not here. 1811 | Console.WriteLine("*** full model uncovered new method: Token:0x{0:X8} Hash:0x{1:X8}", m.Token, m.Hash); 1812 | newMethodCount++; 1813 | } 1814 | 1815 | if (m.InlineCount > maxInlineCount) 1816 | { 1817 | maxInlineCount = m.InlineCount; 1818 | maxInlineMethod = m; 1819 | } 1820 | 1821 | if (m.InlineCount == 0) 1822 | { 1823 | leafMethodCount++; 1824 | } 1825 | } 1826 | } 1827 | 1828 | Console.WriteLine("*** found {0} new methods", newMethodIds.Count); 1829 | 1830 | if (newMethodIds.Count == 0) 1831 | { 1832 | failed = true; 1833 | Console.WriteLine("*** bailing out, unable to make forward progress"); 1834 | break; 1835 | } 1836 | 1837 | fullMethodIds.UnionWith(newMethodIds); 1838 | } 1839 | 1840 | if (failed) 1841 | { 1842 | return null; 1843 | } 1844 | 1845 | Console.WriteLine("*** Full model complete, took {0} iterations", iteration); 1846 | 1847 | // Now build the aggregate inline forest.... 
1848 | InlineForest fullForest = new InlineForest(); 1849 | fullForest.Methods = fullMethods.ToArray(); 1850 | 1851 | // And consolidate into a results set 1852 | Results fullResults = new Results(); 1853 | fullResults.InlineForest = fullForest; 1854 | fullResults.Name = "full"; 1855 | 1856 | // Populate the methodId -> method lookup table 1857 | Dictionary methods = new Dictionary(fullMethods.Count); 1858 | foreach (Method m in fullMethods) 1859 | { 1860 | MethodId id = m.getId(); 1861 | methods[id] = m; 1862 | } 1863 | fullResults.Methods = methods; 1864 | 1865 | long fullInlineCount = fullForest.Methods.Sum(m => m.InlineCount); 1866 | uint nonLeafMethodCount = (uint) fullMethods.Count - leafMethodCount; 1867 | Console.WriteLine("*** Full config has {0} methods, {1} inlines", fullForest.Methods.Length, fullInlineCount); 1868 | Console.WriteLine("*** {0} leaf methods, {1} methods with inlines, {2} average inline count", 1869 | leafMethodCount, nonLeafMethodCount, fullInlineCount/ nonLeafMethodCount); 1870 | Console.WriteLine("*** {0} max inline count for method 0x{1:X8} -- {2} subtrees", 1871 | maxInlineCount, maxInlineMethod.Token, maxInlineMethod.NumSubtrees()); 1872 | 1873 | // Serialize out the consolidated set of trees 1874 | XmlSerializer xo = new XmlSerializer(typeof(InlineForest)); 1875 | Stream xmlOutFile = new FileStream(Path.Combine(resultsDir, b.ShortName + "-full-consolidated.xml"), FileMode.Create); 1876 | xo.Serialize(xmlOutFile, fullForest); 1877 | 1878 | // Now get full perf numbers -- just for the initial set 1879 | Configuration fullPerfConfig = new Configuration("full-perf"); 1880 | fullPerfConfig.Environment["COMPlus_JitInlinePolicyFull"] = "1"; 1881 | fullPerfConfig.Environment["COMPlus_JitInlineDepth"] = "10"; 1882 | fullPerfConfig.Environment["COMPlus_JitInlineSize"] = "200"; 1883 | fullPerfConfig.ResultsDirectory = Program.RESULTS_DIR; 1884 | Results perfResults = x.RunBenchmark(b, fullPerfConfig); 1885 | fullResults.Performance = perfResults.Performance; 1886 | fullResults.Performance.Print("full"); 1887 | 1888 | // Get full call counts. 1889 | // Ideally, perhaps, drive this from the noinline set...? 1890 | if (CaptureCallCounts) 1891 | { 1892 | Configuration fullPerfCallCountConfig = new Configuration("full-perf-cc"); 1893 | fullPerfCallCountConfig.ResultsDirectory = Program.RESULTS_DIR; 1894 | fullPerfCallCountConfig.Environment["COMPlus_JitInlinePolicyFull"] = "1"; 1895 | fullPerfCallCountConfig.Environment["COMPlus_JitInlineDepth"] = "10"; 1896 | fullPerfCallCountConfig.Environment["COMPlus_JitInlineSize"] = "200"; 1897 | fullPerfCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1"; 1898 | Results ccResults = r.RunBenchmark(b, fullPerfCallCountConfig); 1899 | 1900 | AnnotateCallCounts(ccResults, fullResults); 1901 | } 1902 | 1903 | return fullResults; 1904 | } 1905 | 1906 | // The "model" model uses heuristics based on modelling actual 1907 | // observations 1908 | Results BuildModelModel(Runner r, Runner x, Benchmark b, bool altModel = false) 1909 | { 1910 | string modelName = "Model" + (altModel ? "2" : ""); 1911 | string variant = altModel ? 
"2" : "1"; 1912 | 1913 | Console.WriteLine("----"); 1914 | Console.WriteLine("---- {0} Model for {1}", modelName, b.ShortName); 1915 | 1916 | Configuration modelConfig = new Configuration(modelName); 1917 | modelConfig.ResultsDirectory = Program.RESULTS_DIR; 1918 | modelConfig.Environment["COMPlus_JitInlinePolicyModel"] = variant; 1919 | modelConfig.Environment["COMPlus_JitInlineDumpXml"] = "1"; 1920 | 1921 | Results modelResults = r.RunBenchmark(b, modelConfig); 1922 | 1923 | if (modelResults == null || !modelResults.Success) 1924 | { 1925 | Console.WriteLine("{0} run failed\n", modelConfig.Name); 1926 | return null; 1927 | } 1928 | 1929 | if (Program.ExploreInlines) 1930 | { 1931 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 1932 | Stream xmlFile = new FileStream(modelResults.LogFile, FileMode.Open); 1933 | InlineForest f = (InlineForest)xml.Deserialize(xmlFile); 1934 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 1935 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines", 1936 | modelConfig.Name, f.Methods.Length, inlineCount); 1937 | modelResults.InlineForest = f; 1938 | 1939 | // Populate the methodId -> method lookup table 1940 | Dictionary methods = new Dictionary(f.Methods.Length); 1941 | foreach (Method m in f.Methods) 1942 | { 1943 | MethodId id = m.getId(); 1944 | methods[id] = m; 1945 | } 1946 | modelResults.Methods = methods; 1947 | } 1948 | 1949 | // Now get perf numbers 1950 | Configuration modelPerfConfig = new Configuration(modelName + "-perf"); 1951 | modelPerfConfig.ResultsDirectory = Program.RESULTS_DIR; 1952 | modelPerfConfig.Environment["COMPlus_JitInlinePolicyModel"] = variant; 1953 | Results perfResults = x.RunBenchmark(b, modelPerfConfig); 1954 | modelResults.Performance = perfResults.Performance; 1955 | modelResults.Performance.Print(modelConfig.Name); 1956 | 1957 | // Get method call counts 1958 | if (CaptureCallCounts) 1959 | { 1960 | Configuration modelCallCountConfig = new Configuration(modelName + "-cc"); 1961 | modelCallCountConfig.ResultsDirectory = Program.RESULTS_DIR; 1962 | modelCallCountConfig.Environment["COMPlus_JitMeasureEntryCounts"] = "1"; 1963 | modelCallCountConfig.Environment["COMPlus_JitInlinePolicyModel"] = "1"; 1964 | Results ccResults = r.RunBenchmark(b, modelCallCountConfig); 1965 | 1966 | // Parse results back and annotate base method set 1967 | AnnotateCallCounts(ccResults, modelResults); 1968 | } 1969 | 1970 | return modelResults; 1971 | } 1972 | 1973 | // The size model tries not to increase method size 1974 | Results BuildSizeModel(Runner r, Runner x, Benchmark b) 1975 | { 1976 | Console.WriteLine("----"); 1977 | Console.WriteLine("---- Size Model for {0}", b.ShortName); 1978 | 1979 | Configuration sizeConfig = new Configuration("size"); 1980 | sizeConfig.ResultsDirectory = Program.RESULTS_DIR; 1981 | sizeConfig.Environment["COMPlus_JitInlinePolicySize"] = "1"; 1982 | sizeConfig.Environment["COMPlus_JitInlineDumpXml"] = "1"; 1983 | 1984 | Results results = r.RunBenchmark(b, sizeConfig); 1985 | 1986 | if (results == null || !results.Success) 1987 | { 1988 | Console.WriteLine("{0} run failed\n", sizeConfig.Name); 1989 | return null; 1990 | } 1991 | 1992 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 1993 | InlineForest f; 1994 | Stream xmlFile = new FileStream(results.LogFile, FileMode.Open); 1995 | f = (InlineForest)xml.Deserialize(xmlFile); 1996 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 1997 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines", 1998 | 
sizeConfig.Name, f.Methods.Length, inlineCount); 1999 | results.InlineForest = f; 2000 | 2001 | // Now get perf numbers 2002 | Configuration sizePerfConfig = new Configuration("size-perf"); 2003 | sizePerfConfig.ResultsDirectory = Program.RESULTS_DIR; 2004 | sizePerfConfig.Environment["COMPlus_JitInlinePolicySize"] = "1"; 2005 | Results perfResults = x.RunBenchmark(b, sizePerfConfig); 2006 | results.Performance = perfResults.Performance; 2007 | results.Performance.Print(sizeConfig.Name); 2008 | 2009 | return results; 2010 | } 2011 | 2012 | // The random model is random 2013 | Results BuildRandomModel(Runner r, Runner x, Benchmark b, uint seed) 2014 | { 2015 | Console.WriteLine("----"); 2016 | Console.WriteLine("---- Random Model {0:X} for {1}", seed, b.ShortName); 2017 | 2018 | string seedString = String.Format("0x{0:X}", seed); 2019 | Configuration randomConfig = new Configuration("random-" + seedString); 2020 | randomConfig.ResultsDirectory = Program.RESULTS_DIR; 2021 | randomConfig.Environment["COMPlus_JitInlinePolicyRandom"] = seedString; 2022 | randomConfig.Environment["COMPlus_JitInlineDumpXml"] = "2"; // minimal XML 2023 | randomConfig.Environment["COMPlus_JitInlineDumpData"] = "2"; // full data set 2024 | 2025 | Results results = r.RunBenchmark(b, randomConfig); 2026 | 2027 | if (results == null || !results.Success) 2028 | { 2029 | Console.WriteLine("{0} run failed\n", randomConfig.Name); 2030 | return null; 2031 | } 2032 | 2033 | XmlSerializer xml = new XmlSerializer(typeof(InlineForest)); 2034 | InlineForest f; 2035 | Stream xmlFile = new FileStream(results.LogFile, FileMode.Open); 2036 | f = (InlineForest)xml.Deserialize(xmlFile); 2037 | long inlineCount = f.Methods.Sum(m => m.InlineCount); 2038 | Console.WriteLine("*** {0} config has {1} methods, {2} inlines", 2039 | randomConfig.Name, f.Methods.Length, inlineCount); 2040 | results.InlineForest = f; 2041 | 2042 | // Now get perf numbers 2043 | Configuration randomPerfConfig = new Configuration(randomConfig.Name + "-perf"); 2044 | randomPerfConfig.ResultsDirectory = Program.RESULTS_DIR; 2045 | randomPerfConfig.Environment["COMPlus_JitInlinePolicyRandom"] = seedString; 2046 | Results perfResults = x.RunBenchmark(b, randomPerfConfig); 2047 | results.Performance = perfResults.Performance; 2048 | results.Performance.Print(randomConfig.Name); 2049 | 2050 | return results; 2051 | } 2052 | 2053 | static void SetupResults() 2054 | { 2055 | if (Directory.Exists(Program.RESULTS_DIR)) 2056 | { 2057 | if (Program.Verbose) 2058 | { 2059 | Console.WriteLine("...Cleaning old results dir '{0}'", Program.RESULTS_DIR); 2060 | } 2061 | Directory.Delete(Program.RESULTS_DIR, true); 2062 | } 2063 | 2064 | if (Program.Verbose) 2065 | { 2066 | Console.WriteLine("...Creating new results '{0}'", Program.RESULTS_DIR); 2067 | } 2068 | 2069 | Directory.CreateDirectory(Program.RESULTS_DIR); 2070 | DirectoryInfo sandboxDirectoryInfo = new DirectoryInfo(Program.RESULTS_DIR); 2071 | } 2072 | 2073 | // Paths to repos and binaries. 
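    // (These are defaults; Configure() below overwrites them once the coreclr repo is
    // located, searching upward from the current working directory if necessary.)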
2074 | public static string REPO_ROOT = @"c:\repos"; 2075 | public static string CORECLR_ROOT = REPO_ROOT + @"\coreclr"; 2076 | public static string CORECLR_BENCHMARK_ROOT = CORECLR_ROOT + @"\bin\tests\Windows_NT.x64.Release\JIT\performance\codequality"; 2077 | public static string CORERUN = CORECLR_ROOT + @"\bin\tests\Windows_NT.x64.release\tests\Core_Root\corerun.exe"; 2078 | public static string SHELL = @"c:\windows\system32\cmd.exe"; 2079 | public static string RESULTS_DIR = REPO_ROOT + @"\PerformanceExplorer\results"; 2080 | public static string SANDBOX_DIR = REPO_ROOT + @"\PerformanceExplorer\sandbox"; 2081 | 2082 | // Various aspects of the exploration that can be enabled/disabled. 2083 | public static bool DisableZap = false; 2084 | public static bool UseNoInlineModel = false; 2085 | public static bool UseLegacyModel = false; 2086 | public static bool UseEnhancedLegacyModel = false; 2087 | public static bool UseFullModel = false; 2088 | public static bool UseModelModel = false; 2089 | public static bool UseAltModel = false; 2090 | public static bool UseSizeModel = false; 2091 | public static bool UseRandomModel = false; 2092 | public static uint RandomSeed = 0x55; 2093 | public static uint RandomTries = 1; 2094 | public static bool ExploreInlines = true; 2095 | public static bool ClassifyInlines = false; 2096 | public static bool CaptureCallCounts = true; 2097 | public static bool SkipProblemBenchmarks = true; 2098 | public static uint MinIterations = 10; 2099 | public static uint MaxIterations = 10; 2100 | public static string ClassFilter = null; 2101 | public static string MethodFilter = null; 2102 | public static string RootToken = null; 2103 | public static uint RootTokenValue = 0; 2104 | public static bool Verbose = true; 2105 | public static bool VeryVerbose = false; 2106 | 2107 | public static List ParseArgs(string[] args) 2108 | { 2109 | List benchNames = new List(); 2110 | 2111 | for (int i = 0; i< args.Length; i++) 2112 | { 2113 | string arg = args[i]; 2114 | 2115 | if (arg[0] == '-') 2116 | { 2117 | if (arg == "-perf") 2118 | { 2119 | ExploreInlines = false; 2120 | CaptureCallCounts = false; 2121 | } 2122 | else if (arg == "-disableZap") 2123 | { 2124 | DisableZap = true; 2125 | } 2126 | else if (arg == "-allTests") 2127 | { 2128 | SkipProblemBenchmarks = false; 2129 | } 2130 | else if (arg == "-useNoInline") 2131 | { 2132 | UseNoInlineModel = true; 2133 | } 2134 | else if (arg == "-useLegacy") 2135 | { 2136 | UseLegacyModel = true; 2137 | } 2138 | else if (arg == "-useEnhancedLegacy") 2139 | { 2140 | UseEnhancedLegacyModel = true; 2141 | } 2142 | else if (arg == "-useFull") 2143 | { 2144 | UseFullModel = true; 2145 | } 2146 | else if (arg == "-useSize") 2147 | { 2148 | UseSizeModel = true; 2149 | } 2150 | else if (arg == "-useModel") 2151 | { 2152 | UseModelModel = true; 2153 | } 2154 | else if (arg == "-useAltModel") 2155 | { 2156 | UseAltModel = true; 2157 | } 2158 | else if (arg == "-noExplore") 2159 | { 2160 | ExploreInlines = false; 2161 | } 2162 | else if (arg == "-useRandom") 2163 | { 2164 | UseRandomModel = true; 2165 | } 2166 | else if (arg == "-classify") 2167 | { 2168 | ClassifyInlines = true; 2169 | } 2170 | else if (arg == "-randomTries" && (i + 1) < args.Length) 2171 | { 2172 | RandomTries = UInt32.Parse(args[++i]); 2173 | } 2174 | else if (arg == "-minIterations" && (i + 1) < args.Length) 2175 | { 2176 | MinIterations = UInt32.Parse(args[++i]); 2177 | } 2178 | else if (arg == "-maxIterations" && (i + 1) < args.Length) 2179 | { 2180 | MaxIterations = 
UInt32.Parse(args[++i]); 2181 | } 2182 | else if (arg == "-method" && (i + 1) < args.Length) 2183 | { 2184 | MethodFilter = args[++i]; 2185 | } 2186 | else if (arg == "-class" && (i + 1) < args.Length) 2187 | { 2188 | ClassFilter = args[++i]; 2189 | } 2190 | else if (arg == "-rootToken" && (i + 1) < args.Length) 2191 | { 2192 | RootToken = args[++i]; 2193 | RootTokenValue = UInt32.Parse(RootToken, System.Globalization.NumberStyles.HexNumber); 2194 | } 2195 | else 2196 | { 2197 | Console.WriteLine("... ignoring '{0}'", arg); 2198 | } 2199 | } 2200 | else 2201 | { 2202 | benchNames.Add(arg); 2203 | } 2204 | } 2205 | 2206 | bool hasInlineModel = 2207 | UseLegacyModel || 2208 | UseEnhancedLegacyModel || 2209 | UseModelModel || 2210 | UseAltModel || 2211 | UseFullModel || 2212 | UseRandomModel || 2213 | UseSizeModel; 2214 | 2215 | if (ExploreInlines) 2216 | { 2217 | // Exploration should at least run a noinline model 2218 | if (!UseNoInlineModel) 2219 | { 2220 | Console.WriteLine("...Exploration: forcibly enabling NoInlineModel"); 2221 | UseNoInlineModel = true; 2222 | } 2223 | 2224 | // If no alternate models are selected, forcibly enable the full model. 2225 | if (!hasInlineModel) 2226 | { 2227 | Console.WriteLine("...Exploration: forcibly enabling FullModel"); 2228 | UseFullModel = true; 2229 | } 2230 | } 2231 | else if (!(hasInlineModel || UseNoInlineModel)) 2232 | { 2233 | // perf should run at least one model. Choose current default. 2234 | Console.WriteLine("...Performance: forcibly enabling EnhancedLegacyModel"); 2235 | UseEnhancedLegacyModel = true; 2236 | } 2237 | 2238 | return benchNames; 2239 | } 2240 | 2241 | public static bool Configure() 2242 | { 2243 | // Verify repo root 2244 | if (Directory.Exists(REPO_ROOT)) 2245 | { 2246 | if (Directory.Exists(Path.Combine(REPO_ROOT, "coreclr"))) 2247 | { 2248 | return true; 2249 | } 2250 | } 2251 | 2252 | // Else search up from current WD 2253 | string cwd = Directory.GetCurrentDirectory(); 2254 | Console.WriteLine("... coreclr repo not at {0}, searching up from {1}", REPO_ROOT, cwd); 2255 | DirectoryInfo cwdi = new DirectoryInfo(cwd); 2256 | bool found = false; 2257 | while (cwdi != null) 2258 | { 2259 | string prospect = Path.Combine(cwdi.FullName, "coreclr"); 2260 | Console.WriteLine("... looking for {0}", prospect); 2261 | if (Directory.Exists(prospect)) 2262 | { 2263 | REPO_ROOT = cwdi.FullName; 2264 | Console.WriteLine("... 
found coreclr repo at {0}", prospect); 2265 | found = true; 2266 | break; 2267 | } 2268 | 2269 | cwdi = cwdi.Parent; 2270 | } 2271 | 2272 | if (!found) 2273 | { 2274 | return false; 2275 | } 2276 | 2277 | // Set up other paths 2278 | CORECLR_ROOT = Path.Combine(REPO_ROOT, "coreclr"); 2279 | CORECLR_BENCHMARK_ROOT = Path.Combine(new string[] 2280 | {CORECLR_ROOT, "bin", "tests", "Windows_NT.x64.Release", "JIT", "performance", "codequality"}); 2281 | CORERUN = Path.Combine(new string[] 2282 | { CORECLR_ROOT, "bin", "tests", "Windows_NT.x64.release", "tests", "Core_Root", "corerun.exe"}); 2283 | RESULTS_DIR = Path.Combine(REPO_ROOT, "PerformanceExplorer", "results"); 2284 | SANDBOX_DIR = Path.Combine(REPO_ROOT, "PerformanceExplorer", "sandbox"); 2285 | 2286 | return true; 2287 | } 2288 | 2289 | public static int Main(string[] args) 2290 | { 2291 | List benchNames = ParseArgs(args); 2292 | bool ok = Configure(); 2293 | if (!ok) 2294 | { 2295 | Console.WriteLine("Could not find coreclr repo"); 2296 | return -1; 2297 | } 2298 | 2299 | SetupResults(); 2300 | Program p = new Program(); 2301 | 2302 | // Enumerate benchmarks that can be run 2303 | string benchmarkRoot = CORECLR_BENCHMARK_ROOT; 2304 | Console.WriteLine("...Enumerating benchmarks under {0}", benchmarkRoot); 2305 | Dictionary benchmarks = new Dictionary(); 2306 | DirectoryInfo benchmarkRootInfo = new DirectoryInfo(benchmarkRoot); 2307 | foreach (FileInfo f in benchmarkRootInfo.GetFiles("*.exe", SearchOption.AllDirectories)) 2308 | { 2309 | benchmarks.Add(f.Name, f.FullName); 2310 | } 2311 | 2312 | Console.WriteLine("...Found {0} benchmarks", benchmarks.Count()); 2313 | 2314 | // If an arg is passed, run benchmarks that contain that arg as a substring. 2315 | // Otherwise run them all. 2316 | List benchmarksToRun = new List(); 2317 | 2318 | if (benchNames.Count == 0) 2319 | { 2320 | Console.WriteLine("...Running all benchmarks"); 2321 | benchmarksToRun.AddRange(benchmarks.Values); 2322 | } 2323 | else 2324 | { 2325 | Console.WriteLine("...Scanning for benchmarks matching your pattern(s)"); 2326 | foreach (string item in benchNames) 2327 | { 2328 | int beforeCount = benchmarksToRun.Count; 2329 | foreach (string benchName in benchmarks.Keys) 2330 | { 2331 | if (benchmarks[benchName].IndexOf(item, StringComparison.OrdinalIgnoreCase) >= 0) 2332 | { 2333 | benchmarksToRun.Add(benchmarks[benchName]); 2334 | } 2335 | } 2336 | 2337 | if (benchmarksToRun.Count == beforeCount) 2338 | { 2339 | Console.WriteLine("No benchmark matches '{0}'", item); 2340 | } 2341 | else 2342 | { 2343 | Console.WriteLine("{0} benchmarks matched '{1}'", 2344 | benchmarksToRun.Count - beforeCount, item); 2345 | } 2346 | } 2347 | } 2348 | 2349 | int result = p.RunBenchmarks(benchmarksToRun); 2350 | 2351 | return result; 2352 | } 2353 | 2354 | int RunBenchmarks(List benchmarksToRun) 2355 | { 2356 | Runner r = new CoreClrRunner(); 2357 | Runner x = new XunitPerfRunner(); 2358 | 2359 | // Build integrated data model...
2360 | string dataModelName = "All-Benchmark-data-model.csv"; 2361 | string dataModelFileName = Path.Combine(Program.RESULTS_DIR, dataModelName); 2362 | bool hasHeader = false; 2363 | StreamWriter dataModelFile = null; 2364 | Dictionary blacklist = null; 2365 | if (ExploreInlines) 2366 | { 2367 | dataModelFile = File.CreateText(dataModelFileName); 2368 | 2369 | if (DisableZap) 2370 | { 2371 | // Use blacklist if we disable zap so we won't repeatedly 2372 | // explore the same startup paths in the core library across benchmarks 2373 | blacklist = new Dictionary(); 2374 | } 2375 | } 2376 | 2377 | // Collect up result sets 2378 | List> aggregateResults = new List>(benchmarksToRun.Count()); 2379 | 2380 | foreach (string s in benchmarksToRun) 2381 | { 2382 | // Ignore benchmarks that are not reliable enough for us to measure when looking for 2383 | // per-inline deltas. 2384 | if (SkipProblemBenchmarks) 2385 | { 2386 | if (s.IndexOf("bytemark", StringComparison.OrdinalIgnoreCase) >= 0) 2387 | { 2388 | Console.WriteLine(".... bytemark disabled (noisy), sorry"); 2389 | continue; 2390 | } 2391 | 2392 | if (s.IndexOf("raytracer", StringComparison.OrdinalIgnoreCase) >= 0) 2393 | { 2394 | Console.WriteLine(".... raytracer disabled (nondeterministic), sorry"); 2395 | continue; 2396 | } 2397 | 2398 | if (s.IndexOf("constantarg", StringComparison.OrdinalIgnoreCase) >= 0) 2399 | { 2400 | Console.WriteLine(".... constantarg disabled (too much detail), sorry"); 2401 | continue; 2402 | } 2403 | 2404 | if (s.IndexOf("functions", StringComparison.OrdinalIgnoreCase) >= 0) 2405 | { 2406 | Console.WriteLine(".... functions disabled (too much detail), sorry"); 2407 | continue; 2408 | } 2409 | } 2410 | 2411 | List benchmarkResults = new List(); 2412 | Benchmark b = new Benchmark(); 2413 | b.ShortName = Path.GetFileName(s); 2414 | b.FullPath = s; 2415 | b.ExitCode = 100; 2416 | 2417 | Results noInlineResults = null; 2418 | 2419 | if (UseNoInlineModel) 2420 | { 2421 | noInlineResults = BuildNoInlineModel(r, x, b); 2422 | if (noInlineResults == null) 2423 | { 2424 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2425 | continue; 2426 | } 2427 | benchmarkResults.Add(noInlineResults); 2428 | } 2429 | 2430 | if (UseLegacyModel) 2431 | { 2432 | Results legacyResults = BuildLegacyModel(r, x, b); 2433 | if (legacyResults == null) 2434 | { 2435 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2436 | continue; 2437 | } 2438 | benchmarkResults.Add(legacyResults); 2439 | } 2440 | 2441 | if (UseEnhancedLegacyModel) 2442 | { 2443 | Results enhancedLegacyResults = BuildLegacyModel(r, x, b, true); 2444 | if (enhancedLegacyResults == null) 2445 | { 2446 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2447 | continue; 2448 | } 2449 | benchmarkResults.Add(enhancedLegacyResults); 2450 | } 2451 | 2452 | if (UseFullModel) 2453 | { 2454 | Results fullResults = BuildFullModel(r, x, b, noInlineResults); 2455 | if (fullResults == null) 2456 | { 2457 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2458 | continue; 2459 | } 2460 | 2461 | benchmarkResults.Add(fullResults); 2462 | 2463 | CallGraph g = new CallGraph(fullResults); 2464 | string fileName = b.ShortName + "-callgraph.dot"; 2465 | g.DumpDot(Path.Combine(RESULTS_DIR, fileName)); 2466 | } 2467 | 2468 | if (UseModelModel) 2469 | { 2470 | Results modelResults = BuildModelModel(r, x, b); 2471 | if (modelResults == null) 2472 | { 2473 | Console.WriteLine("Skipping remainder of runs for {0}", 
b.ShortName); 2474 | continue; 2475 | } 2476 | benchmarkResults.Add(modelResults); 2477 | } 2478 | 2479 | if (UseAltModel) 2480 | { 2481 | Results altModelResults = BuildModelModel(r, x, b, true); 2482 | if (altModelResults == null) 2483 | { 2484 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2485 | continue; 2486 | } 2487 | benchmarkResults.Add(altModelResults); 2488 | } 2489 | 2490 | if (UseSizeModel) 2491 | { 2492 | Results sizeResults = BuildSizeModel(r, x, b); 2493 | if (sizeResults == null) 2494 | { 2495 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2496 | continue; 2497 | } 2498 | benchmarkResults.Add(sizeResults); 2499 | } 2500 | 2501 | if (UseRandomModel) 2502 | { 2503 | uint seed = RandomSeed; 2504 | for (uint i = 0; i < RandomTries; i++, seed += RandomSeed) 2505 | { 2506 | Results randomResults = BuildRandomModel(r, x, b, seed); 2507 | if (randomResults == null) 2508 | { 2509 | Console.WriteLine("Skipping remainder of runs for {0}", b.ShortName); 2510 | continue; 2511 | } 2512 | benchmarkResults.Add(randomResults); 2513 | } 2514 | } 2515 | 2516 | aggregateResults.Add(benchmarkResults); 2517 | 2518 | if (ExploreInlines) 2519 | { 2520 | var thingsToExplore = ExaminePerf(b, benchmarkResults); 2521 | 2522 | foreach (Exploration e in thingsToExplore) 2523 | { 2524 | e.Explore(dataModelFile, ref hasHeader, blacklist); 2525 | } 2526 | 2527 | dataModelFile.Flush(); 2528 | } 2529 | 2530 | if (ClassifyInlines) 2531 | { 2532 | Console.WriteLine("Beginning classification"); 2533 | 2534 | // Build map from inline data string to results that contain inlines with that string. 2535 | // For now we just track existence and not multiplicity... 2536 | Dictionary> dataIndex = new Dictionary>(); 2537 | HashSet allResults = new HashSet(); 2538 | 2539 | uint resultCount = 0; 2540 | uint inlineCount = 0; 2541 | 2542 | foreach (List rr in aggregateResults) 2543 | { 2544 | foreach (Results rrr in rr) 2545 | { 2546 | resultCount++; 2547 | allResults.Add(rrr); 2548 | 2549 | foreach (Method mm in rrr.InlineForest.Methods) 2550 | { 2551 | Queue inlines = new Queue(); 2552 | foreach (Inline ii in mm.Inlines) 2553 | { 2554 | inlines.Enqueue(ii); 2555 | } 2556 | 2557 | while (inlines.Count > 0) 2558 | { 2559 | inlineCount++; 2560 | Inline iii = inlines.Dequeue(); 2561 | HashSet zz = null; 2562 | if (!dataIndex.TryGetValue(iii.Data, out zz)) 2563 | { 2564 | zz = new HashSet(); 2565 | dataIndex[iii.Data] = zz; 2566 | } 2567 | zz.Add(rrr); 2568 | 2569 | foreach (Inline jjj in iii.Inlines) 2570 | { 2571 | inlines.Enqueue(jjj); 2572 | } 2573 | } 2574 | } 2575 | } 2576 | } 2577 | 2578 | Console.WriteLine("Found {0} inlines, {1} data vectors, {2} results", inlineCount, dataIndex.Count, resultCount); 2579 | 2580 | // Walk through the data vectors looking for ones that appear in some results but not all. 2581 | // These are the ones we can label as good/bad by comparing the results distributions for 2582 | // cases where they do and do not appear. 2583 | // 2584 | // NB including the various "model" estimates in the data vector may cause false dichotomies 2585 | // and artificially inflate the number of vectors; consider suppressing them (if the estimates 2586 | // are functions of the rest of the vector's values they are probably harmless). 
2587 | uint useableData = 0; 2588 | uint confidentData = 0; 2589 | foreach (string ddd in dataIndex.Keys) 2590 | { 2591 | int appearances = dataIndex[ddd].Count; 2592 | 2593 | // By virtue of how we constructed the dataIndex each data vector should have at least 2594 | // one appearance, and no more than the total number of results. 2595 | if (appearances < 1 || appearances > resultCount) 2596 | { 2597 | Console.WriteLine("Unexpected number of appearances {0} for {1}", appearances, ddd); 2598 | continue; 2599 | } 2600 | 2601 | // Limits here are ad-hoc, but too few or too many appearances will make it tough 2602 | // to infer the impact of an inline with this data vector. Perhaps the real requirement 2603 | // is that the # of appearances be large enough to estimate the distributions 2604 | // of results with and without inlines with this data vector, so an absolute limit like 2605 | // 30 seems plausible. 2606 | double fraction = (double)appearances / resultCount; 2607 | if (fraction < 0.10 || fraction > 0.90) 2608 | { 2609 | continue; 2610 | } 2611 | 2612 | useableData++; 2613 | 2614 | // Now the idea is to estimate the impact of inlines with this data vector. 2615 | // We have two sets of results, both with plausible numbers of samples: ones 2616 | // where inlines with this data vector happened, and the other where the inlines 2617 | // did not happen. 2618 | // 2619 | // We want to turn this into some kind of label for the data vector, either as 2620 | // a "good" inline data vector or a "bad" one. 2621 | // 2622 | // Roughly speaking we want to compute the empirical distributions for the two 2623 | // sets of results, and see if the difference is statistically significant. If it is, 2624 | // then the magnitude of the difference can contribute to the label. 2625 | // 2626 | // Some challenges: in general the results will come from many different benchmarks 2627 | // and it probably doesn't make sense to aggregate across benchmarks and then do the 2628 | // scoring. So we probably want to go benchmark by benchmark. This means that 2629 | // there will be some benchmarks where the result sets are too small to draw meaningful 2630 | // statistics and those will need to be left out. So for each data vector we may end 2631 | // up with a varying amount of data, depending on whether that vector is common to 2632 | // many tests or specific to one or a few. 2633 | // 2634 | // For now assume all results can be used... 2635 | HashSet includedResults = dataIndex[ddd]; 2636 | HashSet excludedResults = new HashSet(allResults.Except(includedResults)); 2637 | 2638 | List includedData = new List(); 2639 | List excludedData = new List(); 2640 | 2641 | foreach (Results ir in includedResults) 2642 | { 2643 | foreach (string sir in ir.Performance.InstructionCount.Keys) 2644 | { 2645 | includedData.AddRange(ir.Performance.InstructionCount[sir]); 2646 | } 2647 | } 2648 | 2649 | foreach (Results er in excludedResults) 2650 | { 2651 | foreach (string ser in er.Performance.InstructionCount.Keys) 2652 | { 2653 | excludedData.AddRange(er.Performance.InstructionCount[ser]); 2654 | } 2655 | } 2656 | 2657 | double confidence = PerformanceData.Confidence(includedData, excludedData); 2658 | 2659 | // If we can't tell the two result sets' medians apart with any confidence, we can't infer 2660 | // the impact of inlines with this data vector. 
2661 | if (confidence < 0.8) 2662 | { 2663 | continue; 2664 | } 2665 | 2666 | confidentData++; 2667 | } 2668 | 2669 | Console.WriteLine("{0} data vectors are usable; {1} with confidence", useableData, confidentData); 2670 | } 2671 | } 2672 | 2673 | // aggregateResults is a list of list of results 2674 | // outer list is one per "benchmark" 2675 | // inner list is one per model 2676 | // .. a benchmark may have multiple parts 2677 | 2678 | Console.WriteLine("---- Perf Results----"); 2679 | Console.Write("{0,-42}", "Test"); 2680 | int modelCount = 0; 2681 | foreach (Results rq in aggregateResults.First()) 2682 | { 2683 | Console.Write(" {0,8}.T {0,8}.I", rq.Name); 2684 | modelCount += 1; 2685 | } 2686 | Console.WriteLine(); 2687 | 2688 | int totalPartCount = 0; 2689 | foreach (List rr in aggregateResults) 2690 | { 2691 | Results f = rr.First(); 2692 | totalPartCount += f.Performance.InstructionCount.Count; 2693 | } 2694 | 2695 | double[] timeLogSum = new double[modelCount]; 2696 | double[] instrLogSum = new double[modelCount]; 2697 | 2698 | foreach (List rr in aggregateResults) 2699 | { 2700 | ComparePerf(rr, timeLogSum, instrLogSum); 2701 | } 2702 | 2703 | Console.Write("{0,-42}", "GeoMeans"); 2704 | for (int j = 0; j < modelCount; j++) 2705 | { 2706 | double gmTime = Math.Exp(timeLogSum[j] / totalPartCount); 2707 | Console.Write(" {0,10:0.00}", gmTime); 2708 | 2709 | double gmInstr = Math.Exp(instrLogSum[j] / totalPartCount); 2710 | Console.Write(" {0,10:0.00}", gmInstr); 2711 | } 2712 | 2713 | return 100; 2714 | } 2715 | 2716 | void ComparePerf(List results, double[] timeLogSum, double[] instrLogSum) 2717 | { 2718 | Results baseline = results.First(); 2719 | 2720 | foreach (string subBench in baseline.Performance.ExecutionTime.Keys) 2721 | { 2722 | Console.Write("{0,-42}", subBench); 2723 | 2724 | int modelNumber = 0; 2725 | 2726 | foreach (Results diff in results) 2727 | { 2728 | double diffTime = PerformanceData.Average(diff.Performance.ExecutionTime[subBench]); 2729 | Console.Write(" {0,10:0.00}", diffTime); 2730 | 2731 | double diffInst = PerformanceData.Average(diff.Performance.InstructionCount[subBench]); 2732 | Console.Write(" {0,10:0.00}", diffInst / (1000 * 1000)); 2733 | 2734 | timeLogSum[modelNumber] += Math.Log(diffTime); 2735 | instrLogSum[modelNumber] += Math.Log(diffInst / (1000 * 1000)); 2736 | 2737 | modelNumber++; 2738 | } 2739 | Console.WriteLine(); 2740 | } 2741 | } 2742 | 2743 | List ExaminePerf(Benchmark b, List results) 2744 | { 2745 | Results baseline = results.First(); 2746 | Console.WriteLine("---- Perf Examination----"); 2747 | List interestingResults = new List(); 2748 | 2749 | foreach (Results diff in results) 2750 | { 2751 | // No need to investigate the baseline 2752 | if (diff == baseline) 2753 | { 2754 | continue; 2755 | } 2756 | 2757 | // See if any of the sub-bench results are both significantly different 2758 | // than the baseline and measured with high confidence. 
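            // (Concretely, per the loop below: "significantly different" means the average
            // instruction-count change exceeds about 1% of the baseline, and "high confidence"
            // means the score from PerformanceData.Confidence exceeds 0.9.)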
2759 | bool added = false; 2760 | 2761 | foreach (string subBench in baseline.Performance.InstructionCount.Keys) 2762 | { 2763 | List baseData = baseline.Performance.InstructionCount[subBench]; 2764 | double baseAvg = PerformanceData.Average(baseData); 2765 | List diffData = diff.Performance.InstructionCount[subBench]; 2766 | double diffAvg = PerformanceData.Average(diffData); 2767 | double confidence = PerformanceData.Confidence(baseData, diffData); 2768 | double avgDiff = diffAvg - baseAvg; 2769 | double pctDiff = 100 * avgDiff / baseAvg; 2770 | double interestingDiff = 1; 2771 | double confidentDiff = 0.9; 2772 | bool interesting = Math.Abs(pctDiff) > interestingDiff; 2773 | bool confident = confidence > confidentDiff; 2774 | string interestVerb = interesting ? "is" : "is not"; 2775 | string confidentVerb = confident ? "and is" : "and is not"; 2776 | bool show = interesting && confident; 2777 | 2778 | if (!added & interesting && confident) 2779 | { 2780 | Exploration e = new Exploration(); 2781 | e.baseResults = baseline; 2782 | e.endResults = diff; 2783 | e.benchmark = b; 2784 | interestingResults.Add(e); 2785 | added = true; 2786 | 2787 | Console.WriteLine( 2788 | "$$$ {0} diff {1} in instructions between {2} ({3}) and {4} ({5}) " 2789 | + "{6} interesting {7:0.00}% {8} significant p={9:0.00}", 2790 | subBench, avgDiff / (1000 * 1000), 2791 | baseline.Name, baseAvg / (1000 * 1000), 2792 | diff.Name, diffAvg / (1000 * 1000), 2793 | interestVerb, pctDiff, 2794 | confidentVerb, confidence); 2795 | 2796 | break; 2797 | } 2798 | } 2799 | 2800 | if (!added) 2801 | { 2802 | Console.WriteLine("$$$ {0} performance diff from {1} was not significant and confident", b.ShortName, diff.Name); 2803 | } 2804 | } 2805 | 2806 | return interestingResults; 2807 | } 2808 | 2809 | static void AnnotateCallCounts(Results ccResults, Results results) 2810 | { 2811 | // Parse results back and annotate base method set 2812 | using (StreamReader callCountStream = File.OpenText(ccResults.LogFile)) 2813 | { 2814 | string callCountLine = callCountStream.ReadLine(); 2815 | while (callCountLine != null) 2816 | { 2817 | string[] callCountFields = callCountLine.Split(new char[] { ',' }); 2818 | if (callCountFields.Length == 3) 2819 | { 2820 | uint token = UInt32.Parse(callCountFields[0], System.Globalization.NumberStyles.HexNumber); 2821 | uint hash = UInt32.Parse(callCountFields[1], System.Globalization.NumberStyles.HexNumber); 2822 | ulong count = UInt64.Parse(callCountFields[2]); 2823 | 2824 | MethodId id = new MethodId(); 2825 | id.Hash = hash; 2826 | id.Token = token; 2827 | 2828 | if (results.Methods.ContainsKey(id)) 2829 | { 2830 | Method m = results.Methods[id]; 2831 | m.CallCount = count; 2832 | Console.WriteLine("{0} called {1} times", m.Name, count); 2833 | } 2834 | else 2835 | { 2836 | Console.WriteLine("{0:X8} {1:X8} called {2} times, but is not in base set?", token, hash, count); 2837 | } 2838 | } 2839 | callCountLine = callCountStream.ReadLine(); 2840 | } 2841 | } 2842 | } 2843 | } 2844 | } 2845 | --------------------------------------------------------------------------------