├── ASD.py
└── README.md


/ASD.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from scipy.stats import norm as normal
  3 | from scipy.stats import mannwhitneyu as Utest
  4 | import numpy as np
  5 | # import matplotlib.pyplot as plt
  6 | 
  7 | 
  8 | F = []
  9 | G = []
 10 | n = 0
 11 | m = 0
 12 | def buildOrigCDFs(f, g):
 13 |     global F
 14 |     global G
 15 |     global n
 16 |     global m
 17 |     F = np.sort(f)
 18 |     n = len(F)
 19 |     G = np.sort(g)
 20 |     m = len(G)
 21 | 
 22 | 
 23 | def buildNewCDFs(f, g):
 24 |     global Fb
 25 |     global Gb
 26 |     Fb = np.sort(f)
 27 |     Gb = np.sort(g)
 28 | 
 29 | 
 30 | def invG(p):
 31 |     index = int(np.ceil(p*m))
 32 |     if index >= m:
 33 |         return G[m-1]
 34 |     elif index == 0:
 35 |         return G[0]
 36 |     return G[index-1]
 37 | 
 38 | 
 39 | def invF(p):
 40 |     index = int(np.ceil(p*n))
 41 |     if index >= n:
 42 |         return F[n-1]
 43 |     elif index == 0:
 44 |         return F[0]
 45 |     return F[index-1]
 46 | 
 47 | 
 48 | def invGnew(p, M):
 49 |     index = int(np.ceil(p*M))
 50 |     if index >= M:
 51 |         return Gb[M-1]
 52 |     elif index == 0:
 53 |         return Gb[0]
 54 |     return Gb[index-1]
 55 | 
 56 | 
 57 | def invFnew(p, N):
 58 |     index = int(np.ceil(p*N))
 59 |     if index >= N:
 60 |         return Fb[N-1]
 61 |     elif index == 0:
 62 |         return Fb[0]
 63 |     return Fb[index-1]
 64 | 
 65 | 
 66 | def epsilon(dp):
 67 |     s = 0.0
 68 |     se = 0.0
 69 |     for p in np.arange(0, 1, dp):
 70 |         temp = invG(p)-invF(p)
 71 |         tempe = max(temp, 0)
 72 |         s = s+temp*temp*dp
 73 |         se = se+tempe*tempe*dp
 74 |     if s != 0:
 75 |         return se/s
 76 |     else:
 77 |         print("The denominator is 0")
 78 |         return 0.0
 79 | 
 80 | 
 81 | def epsilonNew(dp, N, M):
 82 |     denom = 0.0
 83 |     numer = 0.0
 84 |     for p in np.arange(0, 1, dp):
 85 |         diff = invGnew(p, M) - invFnew(p, N)  # check when F-1(t)<G-1(t)
 86 |         posdiff = max(diff, 0)
 87 |         denom += diff * diff * dp
 88 |         numer += posdiff * posdiff * dp
 89 |     if denom != 0.0:
 90 |         return numer/denom
 91 |     else:
 92 |         print("The denominator is 0")
 93 |         return 0.0
 94 | 
 95 | 
 96 | #############
 97 | 
 98 | 
 99 | def COS(data_A, data_B):
100 |     # collection of statistics table
101 |     print("AVG ", np.average(data_A), np.average(data_B))
102 |     print("STD ", np.std(data_A), np.std(data_B))
103 |     print("MEDIAN ", np.median(data_A), np.median(data_B))
104 |     print("MIN ", np.min(data_A), np.min(data_B))
105 |     print("MAX ", np.max(data_A), np.max(data_B))
106 | 
107 | 
108 | def MannWhitney(data_A, data_B):
109 |     # Mann-Whitney U test for stochastic dominance
110 |     # Use only when the number of observation in each sample is > 20
111 |     if n<20 or m<20:
112 |         print("Use only when the number of observation in each sample is > 20")
113 |         return 1.0
114 |     _, pval = Utest(data_A, data_B, alternative='less')
115 |     return pval
116 | 
117 | 
118 | ##############################################################
119 | def main():
120 |     if len(sys.argv) < 3:
121 |         print("Not enough arguments\n")
122 |         sys.exit()
123 | 
124 |     filename_A = sys.argv[1]    # scores from algorithm A
125 |     filename_B = sys.argv[2]    # scores from algorithm B
126 |     alpha = float(sys.argv[3])        # significance level of statistical test
127 | 
128 |     with open(filename_A) as f:
129 |         data_A = f.read().splitlines()
130 | 
131 |     with open(filename_B) as f:
132 |         data_B = f.read().splitlines()
133 | 
134 |     data_A = list(map(float, data_A))
135 |     data_B = list(map(float, data_B))
136 | 
137 |     buildOrigCDFs(data_A, data_B)
138 | 
139 |     # constants
140 |     dp = 0.005  # differential of the variable p - for integral calculations
141 |     N = 1000    # num of samples from F for sigma estimate
142 |     M = 1000    # num of samples from G for sigma estimate
143 |     B = 1000    # bootstrap iterations for sigma estimate
144 | 
145 |     # calculate the epsilon quotient
146 |     eps_FnGm = epsilon(dp)
147 | 
148 |     # estimate the variance
149 |     lamda = (0.0 + N) / (N + M)
150 |     const = np.sqrt((1.0 * N * M) / (N + M + 0.0))
151 |     samples = []
152 |     for b in range(B):
153 |         Fb = []
154 |         Gb = []
155 |         Fvalues = []
156 |         Gvalues = []
157 |         uniF = np.random.uniform(0, 1, N)
158 |         uniG = np.random.uniform(0, 1, M)
159 |         for i in range(0, N):
160 |             Fvalues.append(invF(uniF[i]))
161 |         for j in range(0, M):
162 |             Gvalues.append(invG(uniG[j]))
163 |         buildNewCDFs(Fvalues, Gvalues)
164 |         distance = epsilonNew(dp, N, M)
165 |         samples.append(distance)
166 | 
167 |     sigma = np.std(samples)
168 | 
169 |     min_epsilon = min(max(eps_FnGm - (1/const) * sigma * normal.ppf(alpha), 0.0), 1.0)
170 |     print("The minimal epsilon for which Algorithm A is almost "
171 |           "stochastically greater than algorithm B is ", min_epsilon)
172 |     if min_epsilon <= 0.5 and min_epsilon > 0.0:
173 |         print("since epsilon <= 0.5 we will claim that A is "
174 |               "better than B with significance level alpha=", alpha)
175 |     elif min_epsilon == 0.0:
176 |         print('since epsilon = 0, algorithm A is stochatically dominant over B')
177 | 
178 |     else:
179 |         print("since epsilon > 0.5 we will claim that A "
180 |               "is not better than B with significance level alpha=", alpha)
181 | 
182 | #     print(MannWhitney(data_A, data_B)<alpha)
183 | #     COS(data_A, data_B)
184 | 
185 | 
186 | if __name__ == "__main__":
187 |     main()
188 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Deep Dominance - How to properly compare deep neural models
 2 | 
 3 | This repository contains an implementation of a method for comparing between two deep neural models described in (Dror et al., 2019):
 4 | 
 5 | "Deep Dominance - How to Properly Compare Deep Neural Models." Rotem Dror, Segev Shlomov and Roi Reichart. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL2019).
 6 | 
 7 | Given two algorithms, each associated with a set of test-set scores, our goal is to determine which algorithm, 
 8 | if any, is superior. The score distributions are generated when running two different DNNs with various 
 9 | hyperparameter configurations and random seeds. This code implements a method for comparing between 
10 | two score distributions based on a measure of "almost stochastic dominance".
11 | 
12 | Details about the implementation and theoretical justifications is described in our paper: Deep Dominance - How to properly compare deep neural models.
13 | 
14 | ## Getting Started
15 | 
16 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes. See deployment for notes on how to deploy the project on a live system.
17 | 
18 | ### Prerequisites
19 | 
20 | The implementation is implemented in Python 3.6.
21 | 
22 | ### Running the Code
23 | 
24 | To run the code you need to run the following command line:
25 | 
26 | ```
27 | python ASD.py filename_AlgA filename_AlgA alpha 
28 | ```
29 | This script will read the first file of scores from algorithm A from `filename_AlgA` and the scores from algorithm B from 
30 | `filename_AlgB`. The last input to the script is the desired significance level of the statistical test which should be 
31 | entered instead of `alpha`. 
32 | 
33 | An example run:
34 | 
35 | ```
36 | python ASD.py ./scores/scoresA ./scores/scoresB 0.05 
37 | ```
38 | 
39 | ### Input Files
40 | 
41 | The input consists of two files with the results of applying each algorithm (A and B) on a dataset. The results for each algorithm should be in the following form (the result for each sample in X separated by lines) :
42 | 
43 | ```
44 | 46.1726
45 | 68.5210
46 | 51.1151
47 | 45.8590
48 | 55.2119
49 | 36.5653
50 | 37.4119
51 | 39.8117
52 | 51.7002
53 | ```
54 | 
55 | 
56 | ### Output
57 | 
58 | There are 2 possible outputs for the script. The output form depends on whether algorithm A is better than B or otherwise.
59 | 
60 | If algorithm A is better than algorithm B according to the test then the output will be of the form:
61 | ```
62 | The minimal epsilon for which Algorithm A is almost stochastically greater than algorithm B is _____
63 | since epsilon <= 0.5 we will claim that A is better than B with significance level alpha= ______
64 | ```
65 | 
66 | If algorithm B is better than algorithm A according to the test then the output will be of the form:
67 | ```
68 | The minimal epsilon for which Algorithm A is almost stochastically greater than algorithm B is _____
69 | since epsilon > 0.5 we will claim that A is not better than B with significance level alpha= ______
70 | ```
71 | For more details about the meaning of the output please read our paper: Deep Dominance - How to properly compare deep neural models.
72 | 
73 | ### Citation
74 | If you make use of this code for research purposes, we'll appreciate citing the following:
75 | ```
76 | @InProceedings{P,
77 |   author = 	"Dror, Rotem
78 | 		and Shlomov, Segev
79 | 		and Reichart, Roi",
80 |   title = 	"Deep Dominance - How to Properly Compare Deep Neural Models",
81 |   booktitle = 	"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
82 |   year = 	"2019",
83 |   publisher = 	"Association for Computational Linguistics",
84 |   pages = 	"",
85 |   location = 	"Florence, Italy",
86 |   url = 	""
87 | }
88 | ```
89 | 
90 | ## Contact Information
91 | This file and the code was written by Rotem Dror. The methods are described in the above paper [(Dror et al., 2019)](https://www.aclweb.org/anthology/P19-1266/). For questions please write to: rtmdrr@seas.upenn.edu
92 | 
93 | 
94 | 
95 | 
96 | 


--------------------------------------------------------------------------------