├── .gitignore ├── LICENSE ├── PyMetalog_usagetest.py ├── README.md ├── pymetalog ├── __init__.py ├── a_vector.py ├── class_method.py ├── metalog.py ├── pdf_quantile_functions.py └── support.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | /pymetalog/__pycache__ 2 | .DS_Store 3 | 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 colsmit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PyMetalog_usagetest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import pymetalog as pm 5 | 6 | 7 | fish_data = pm.example_data 8 | 9 | # metalog creation 10 | fish_metalog = pm.metalog( 11 | x=fish_data, 12 | bounds=[0, 40], 13 | boundedness="b", 14 | term_limit=15, 15 | term_lower_bound=2, 16 | step_len=0.001, 17 | penalty=None, 18 | ) 19 | 20 | # summary function 21 | pm.summary(fish_metalog) 22 | 23 | # # plot function - right now this saves plots to local 24 | pm.plot(fish_metalog) 25 | plt.show() 26 | 27 | # # metalog random sampling 28 | r_gens = pm.rmetalog(fish_metalog, n=1000, term=9, generator="hdr") 29 | plt.hist(r_gens, 14) 30 | plt.show() 31 | 32 | # quantiles from a percentile 33 | qs = pm.qmetalog(fish_metalog, y=[0.25, 0.5, 0.75], term=9) 34 | print("qmetalog demo: " + str(qs)) 35 | 36 | # probabilities from a quantile 37 | ps = pm.pmetalog(fish_metalog, q=[3, 10, 25], term=9) 38 | print("pmetalog demo: " + str(ps)) 39 | 40 | # density from a quantile 41 | ds = pm.dmetalog(fish_metalog, q=[3, 10, 25], term=9) 42 | print("dmetalog demo: " + str(ds)) 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pymetalog 2 | ================ 3 | Colin Smith, Travis Jefferies, Isaac J. Faber 4 | 5 | `pip install pymetalog` 6 | 7 | ### The Python Metalog Distribution 8 | 9 | This repo is a working project for a python package (**pymetalog**) that generates functions 10 | for the metalog distribution. The metalog distribution is a highly 11 | flexible probability distribution that can be used to model data without 12 | traditional parameters. 13 | 14 | ### Metalog Background 15 | 16 | In economics, business, engineering, science and other fields, 17 | continuous uncertainties frequently arise that are not easily- or 18 | well-characterized by previously-named continuous probability 19 | distributions. Frequently, there is data available from measurements, 20 | assessments, derivations, simulations or other sources that characterize 21 | the range of an uncertainty. But the underlying process that generated 22 | this data is either unknown or fails to lend itself to convenient 23 | derivation of equations that appropriately characterize the probability 24 | density (PDF), cumulative (CDF) or quantile distribution functions. 25 | 26 | The metalog distributions are a family of continuous univariate 27 | probability distributions that directly address this need. They can be 28 | used in most any situation in which CDF data is known and a flexible, 29 | simple, and easy-to-use continuous probability distribution is needed to 30 | represent that data. Consider their [uses and 31 | benefits](http://www.metalogdistributions.com/usesbenefits.html). Also 32 | consider their 33 | [applications](http://www.metalogdistributions.com/applicationsdata.html) 34 | over a wide range of fields and data sources. 35 | 36 | This repository is a complement and extension of the information found 37 | in the [paper 38 | published](http://pubsonline.informs.org/doi/abs/10.1287/deca.2016.0338) 39 | in Decision Analysis and the 40 | [website](http://www.metalogdistributions.com/) 41 | -------------------------------------------------------------------------------- /pymetalog/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from .metalog import metalog 4 | from .class_method import rmetalog, plot, qmetalog, pmetalog, dmetalog, summary, update 5 | 6 | name = "pymetalog" 7 | 8 | this_pth = os.path.dirname(__file__) 9 | data_path = os.path.join(this_pth, "examples", "fishout.csv") 10 | example_data = np.loadtxt(data_path, delimiter=",", skiprows=1, dtype="str")[ 11 | :, 1 12 | ].astype(np.float) 13 | -------------------------------------------------------------------------------- /pymetalog/a_vector.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import scipy as sp 4 | 5 | from scipy.optimize import linprog, minimize, NonlinearConstraint 6 | from .pdf_quantile_functions import pdf_quantile_builder 7 | from .support import diffMatMetalog, pdfMetalog, quantileMetalog, newtons_method_metalog 8 | 9 | import time 10 | import warnings 11 | 12 | 13 | def a_vector_OLS_and_LP( 14 | m_dict, 15 | bounds, 16 | boundedness, 17 | term_limit, 18 | term_lower_bound, 19 | fit_method, 20 | alpha, 21 | diff_error=0.001, 22 | diff_step=0.001, 23 | ): 24 | 25 | """Main workhorse function of pymetalog package. 26 | Called during metalog.__init__ method call. 27 | 28 | Args: 29 | m_dict (:obj:`dict` with keys ['params', 'dataValues', 'Y']): Initialized output_dict variable from metalog class. 30 | - m_dict['params']: (:obj:`dict` with keys ['bounds', 'boundedness', 'term_limit', 'term_lower_bound', 'step_len', 'fit_method']): 31 | * 'bounds': metalog.bounds 32 | * 'boundedness': metalog.boundedness 33 | * 'term_limit': metalog.term_limit 34 | * 'term_lower_bound': metalog.term_lower_bound 35 | * 'step_len': metalog.step_len 36 | * 'fit_method': metalog.fit_method 37 | 38 | - m_dict['dataValues']: (:obj:`pandas.DataFrame` with columns ['x','probs','z'] of type numeric): 39 | * 'x': metalog.x 40 | * 'probs': metalog.probs 41 | * 'z': column calculated in metalog.append_zvector method 42 | - depends on metalog.boundedness attribute 43 | - metalog.boundedness = 'u': 44 | * 'z' = metalog.x 45 | - metalog.boundedness = 'sl': 46 | * 'z' = log( (metalog.x-lower_bound) ) 47 | - metalog.boundedness = 'su': 48 | * 'z' = = log( (upper_bound-metalog.x) ) 49 | - metalog.boundedness = 'b': 50 | * 'z' = log( (metalog.x-lower_bound) / (upper_bound-metalog.x) ) 51 | 52 | - m_dict['Y']: (:obj:`pandas.DataFrame` with columns ['y1','y2','y3','y4', ... ,'yn'] of type numeric): 53 | * 'y1': numpy.array of ones with length equal to len(x) 54 | * 'y2': numpy.array of numeric values equal to the term attached to s in the logistic quantile function np.log(m_dict['dataValues']['probs'] / (1 - m_dict['dataValues']['probs'])) 55 | * 'y3': numpy.array of numeric values (m_dict['dataValues']['probs'] - 0.5) * m_dict['Y']['y2'] 56 | * 'y4': numpy.array of numeric values m_dict['Y']['y4'] = m_dict['dataValues']['probs'] - 0.5 57 | * 'yn': numpy.array of numeric values: 58 | - if n in 'yn' is odd, 59 | m_dict['Y']['yn'] = m_dict['Y']['y4']**(int(i//2)) 60 | - if n in 'yn' is even, 61 | zn = 'y' + str(n-1) 62 | m_dict['Y'][yn] = m_dict['Y']['y2'] * m_dict['Y'][zn] 63 | 64 | bounds (:obj:`list`): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs. 65 | - should be set in conjunction with the `boundedness` parameter 66 | 67 | boundedness (:obj:`str`): String that is used to specify the type of metalog to fit. 68 | - must be in set ('u','sl','su','b') 69 | - Default: 'u' 70 | * Fits an unbounded metalog 71 | - 'sl' fits a strictly lower bounded metalog 72 | * len(bounds) must == 1 73 | - 'su' fits a strictly upper bounded metalog 74 | * len(bounds) must == 1 75 | - 'b' fits a upper/lower bounded metalog 76 | * len(bounds) must == 2 77 | * bounds[1] must be > bounds[0] 78 | 79 | term_limit (:obj:`int`): The upper limit of the range of metalog terms to use to fit the data. 80 | - strictly > term_lower_bound 81 | - in range [3,30] 82 | 83 | term_lower_bound (:obj:`int`): The lower limit of the range of metalog terms to use to fit the data. 84 | - strictly < term_limit 85 | - in range [2,29] 86 | 87 | fit_method (:obj:`str`): Fit method to use to fit metalog distribution. 88 | - must be in set ('any','OLS','LP','MLE') 89 | - Default: 'any' 90 | * first tries 'OLS' method than 'LP' 91 | - 'OLS' only tries to fit by solving directly for a coefficients using ordinary least squares method 92 | - 'LP' only tries to estimate fit using simplex linear program optimization routine 93 | - 'MLE' first tries 'OLS' method than falls back to a maximum likelihood estimation routine 94 | 95 | alpha (:obj:`float`, optional): Regularization term to add to OLS fit 96 | - strictly >= 0. 97 | - should be set in conjunction with `penalty` parameter 98 | - Default: 0. (no regularization, OLS) 99 | 100 | diff_error (:obj:`float`, optional): Value used to in scipy.optimize.linprog method call 101 | to init the array of values representing the 102 | upper-bound of each inequality constraint (row) in A_ub. 103 | - #TODO: Insert maths 104 | 105 | diff_step (:obj:`float`, optional): Value passed to `step_len` parameter in support.py diffMatMetalog method call 106 | defines the bin width for the Reimann sum of the differences differentiation method 107 | - diffMatMetalog differentiates the metalog pdf 108 | * Differentiation reference: https://math.stackexchange.com/a/313135 109 | Returns: 110 | m_dict: (:obj:`dict` with keys ['params', 'dataValues', 'Y', 'A', 'M', 'Validation']) 111 | - m_dict['A']: (:obj:`pandas.DataFrame` with columns ['a2','a3', ... ,'an'] of type numeric): 112 | * a2, a3, ... , an are our a coefficients returned by the method specified in `fit_method` 113 | 114 | - m_dict['M']: (:obj:`pandas.DataFrame` with columns 0:'pdf_1',1:'cdf_1',2:'pdf_2',3:'cdf_2', 115 | ...,((2*(term_limit-term_lower_bound))+1)-1:'pdf_n', 116 | ((2*(term_limit-term_lower_bound))+1):'cdf_n' 117 | where n is the total number of metalog fits determined by (term_limit-term_lower_bound)+1 118 | ) 119 | * pdf_1, pdf_2, ... , pdf_n are the metalog pdfs returned by pdf_quantile_builder.pdfMetalog method 120 | * cdf_1, cdf_2, ... , cdf_n are the metalog quantiles returned by pdf_quantile_builder.quantileMetalog method 121 | 122 | - m_dict['y']: (:obj: `numpy.ndarray` of type float): 123 | * Array of bin widths for both the pdf_n and cdf_n 124 | 125 | - m_dict['Validation']: (:obj:`pandas.DataFrame` with columns ['term', 'valid', 'method'] of type str): 126 | * 'term': each metalog estimation given a number of terms 127 | * 'valid': boolean flag indicating if the metalog estimation was valid or not 128 | * 'method': a string indicating which method was used for the metalog estimation 129 | 130 | """ 131 | 132 | A = pd.DataFrame() 133 | c_a_names = [] 134 | c_m_names = [] 135 | Mh = pd.DataFrame() 136 | Validation = pd.DataFrame() 137 | df_MH_temp_list = list() 138 | df_A_temp_list = list() 139 | df_Validation_temp_list = list() 140 | 141 | # TODO: Large for-loop can probably be factored into smaller functions 142 | for i in range(term_lower_bound, term_limit + 1): 143 | Y = m_dict["Y"].iloc[:, 0:i] 144 | eye = np.eye(Y.shape[1]) 145 | z = m_dict["dataValues"]["z"] 146 | y = m_dict["dataValues"]["probs"] 147 | step_len = m_dict["params"]["step_len"] 148 | methodFit = "OLS" 149 | a_name = "a" + str(i) 150 | m_name = "m" + str(i) 151 | M_name = "M" + str(i) 152 | c_m_names = np.append(c_m_names, [m_name, M_name]) 153 | c_a_names = np.append(c_a_names, a_name) 154 | 155 | if fit_method == "any" or fit_method == "MLE": 156 | try: 157 | temp = np.dot( 158 | np.dot(np.linalg.inv(np.dot(Y.T, Y) + alpha * eye), Y.T), z 159 | ) 160 | except: 161 | # use LP solver if OLS breaks 162 | temp = a_vector_LP( 163 | m_dict, 164 | term_limit=i, 165 | term_lower_bound=i, 166 | diff_error=diff_error, 167 | diff_step=diff_step, 168 | ) 169 | methodFit = "Linear Program" 170 | if fit_method == "OLS": 171 | try: 172 | temp = np.dot( 173 | np.dot(np.linalg.inv(np.dot(Y.T, Y) + alpha * eye), Y.T), z 174 | ) 175 | except: 176 | raise RuntimeError( 177 | "OLS was unable to solve infeasible or poorly formulated problem" 178 | ) 179 | if fit_method == "LP": 180 | temp = a_vector_LP( 181 | m_dict, 182 | term_limit=i, 183 | term_lower_bound=i, 184 | diff_error=diff_error, 185 | diff_step=diff_step, 186 | ) 187 | methodFit = "Linear Program" 188 | 189 | if fit_method == "MLE": 190 | temp = a_vector_MLE(temp, y, i, m_dict, bounds, boundedness) 191 | 192 | temp = np.append(temp, np.zeros(term_limit - i)) 193 | 194 | # build a y vector for smaller data sets 195 | if len(z) < 100: 196 | y2 = np.linspace(step_len, 1 - step_len, int((1 - step_len) / step_len)) 197 | tailstep = step_len / 10 198 | y1 = np.linspace( 199 | tailstep, (min(y2) - tailstep), int((min(y2) - tailstep) / tailstep) 200 | ) 201 | y3 = np.linspace( 202 | (max(y2) + tailstep), 203 | (max(y2) + tailstep * 9), 204 | int((tailstep * 9) / tailstep), 205 | ) 206 | y = np.hstack((y1, y2, y3)) 207 | 208 | # Get the dict and quantile values back for validation 209 | temp_dict = pdf_quantile_builder( 210 | temp, y=y, term_limit=i, bounds=bounds, boundedness=boundedness 211 | ) 212 | 213 | # If it not a valid pdf run and the OLS version was used the LP version 214 | if (temp_dict["valid"] == "no") and (fit_method != "OLS"): 215 | temp = a_vector_LP( 216 | m_dict, 217 | term_limit=i, 218 | term_lower_bound=i, 219 | diff_error=diff_error, 220 | diff_step=diff_step, 221 | ) 222 | temp = np.append(temp, np.zeros(term_limit - i)) 223 | methodFit = "Linear Program" 224 | 225 | # Get the dict and quantile values back for validation 226 | temp_dict = pdf_quantile_builder( 227 | temp, y=y, term_limit=i, bounds=bounds, boundedness=boundedness 228 | ) 229 | 230 | df_MH_temp_list.append(pd.DataFrame(temp_dict["m"])) 231 | df_MH_temp_list.append(pd.DataFrame(temp_dict["M"])) 232 | df_A_temp_list.append(pd.DataFrame(temp)) 233 | 234 | tempValidation = pd.DataFrame( 235 | data={"term": [i], "valid": [temp_dict["valid"]], "method": [methodFit]} 236 | ) 237 | df_Validation_temp_list.append(tempValidation) 238 | 239 | Validation = pd.concat(df_Validation_temp_list, axis=0) 240 | Mh = pd.concat(df_MH_temp_list, axis=1) 241 | A = pd.concat(df_A_temp_list, axis=1) 242 | 243 | A.columns = c_a_names 244 | Mh.columns = c_m_names 245 | 246 | m_dict["A"] = A 247 | m_dict["M"] = Mh 248 | m_dict["M"]["y"] = temp_dict["y"] 249 | m_dict["Validation"] = Validation 250 | 251 | A = np.column_stack((np.repeat(1.0, len(A)), A)) 252 | Est = np.dot(m_dict["Y"], A) 253 | ncols = A.shape[1] 254 | Z = np.column_stack( 255 | ( 256 | np.array(m_dict["dataValues"]["z"]), 257 | np.repeat(m_dict["dataValues"]["z"], ncols - 1).reshape( 258 | len(m_dict["dataValues"]["z"]), ncols - 1 259 | ), 260 | ) 261 | ) 262 | 263 | m_dict["square_residual_error"] = ((Z - Est) ** 2).sum(axis=1) 264 | 265 | return m_dict 266 | 267 | 268 | def a_vector_LP( 269 | m_dict, term_limit, term_lower_bound, diff_error=0.001, diff_step=0.001 270 | ): 271 | """TODO: write docstring""" 272 | cnames = np.array([]) 273 | 274 | for i in range(term_lower_bound, term_limit + 1): 275 | Y = m_dict["Y"].iloc[:, 0:i] 276 | z = m_dict["dataValues"]["z"] 277 | 278 | # Bulding the objective function using abs value LP formulation 279 | Y_neg = -Y 280 | 281 | new_Y = pd.DataFrame({"y1": Y.iloc[:, 0], "y1_neg": Y_neg.iloc[:, 0]}) 282 | 283 | for c in range(1, len(Y.iloc[0, :])): 284 | new_Y["y" + str(c + 1)] = Y.iloc[:, c] 285 | new_Y["y" + str(c + 1) + "_neg"] = Y_neg.iloc[:, c] 286 | 287 | a = np.array(["".join(["a", str(i)])]) 288 | cnames = np.append(cnames, a, axis=0) 289 | 290 | # Building the constraint matrix 291 | error_mat = np.array([]) 292 | 293 | for j in range(1, len(Y.iloc[:, 0]) + 1): 294 | front_zeros = np.zeros(2 * (j - 1)) 295 | ones = [1, -1] 296 | trail_zeroes = np.zeros(2 * (len(Y.iloc[:, 1]) - j)) 297 | if j == 1: 298 | error_vars = np.append(ones, trail_zeroes) 299 | 300 | elif j != 1: 301 | error_vars = np.append(front_zeros, ones) 302 | error_vars = np.append(error_vars, trail_zeroes) 303 | 304 | if error_mat.size == 0: 305 | error_mat = np.append(error_mat, error_vars, axis=0) 306 | else: 307 | error_mat = np.vstack((error_mat, error_vars)) 308 | 309 | new = pd.concat((pd.DataFrame(data=error_mat), new_Y), axis=1) 310 | diff_mat = diffMatMetalog(i, diff_step) 311 | diff_zeros = [] 312 | 313 | for t in range(0, len(diff_mat.iloc[:, 0])): 314 | zeros_temp = np.zeros(2 * len(Y.iloc[:, 0])) 315 | 316 | if np.size(diff_zeros) == 0: 317 | diff_zeros = zeros_temp 318 | else: 319 | diff_zeros = np.vstack((zeros_temp, diff_zeros)) 320 | 321 | diff_mat = np.concatenate((diff_zeros, diff_mat), axis=1) 322 | 323 | # Combine the total constraint matrix 324 | lp_mat = np.concatenate((new, diff_mat), axis=0) 325 | 326 | # Objective function coeficients 327 | c = np.append(np.ones(2 * len(Y.iloc[:, 1])), np.zeros(2 * i)) 328 | 329 | # Constraint matrices 330 | A_eq = lp_mat[: len(Y.iloc[:, 1]), :] 331 | A_ub = -1 * lp_mat[len(Y.iloc[:, 1]) :, :] 332 | b_eq = z 333 | b_ub = -1 * np.repeat(diff_error, len(diff_mat[:, 0])) 334 | 335 | # Solving the linear program w/ scipy (for now) 336 | lp_sol = linprog( 337 | c, 338 | A_ub=A_ub, 339 | b_ub=b_ub, 340 | A_eq=A_eq, 341 | b_eq=b_eq, 342 | method="simplex", 343 | options={"maxiter": 5000, "tol": 1.0e-5, "disp": False}, 344 | ) 345 | 346 | # Consolidating solution back into the a vector 347 | tempLP = lp_sol.x[(2 * len(Y.iloc[:, 1])) : (len(lp_sol.x) + 1)] 348 | temp = [] 349 | 350 | for r in range(0, ((len(tempLP) // 2))): 351 | temp.append(tempLP[(r * 2)] - tempLP[(2 * r) + 1]) 352 | 353 | return temp 354 | 355 | 356 | def a_vector_MLE(a, y, term, m_dict, bounds, boundedness): 357 | """TODO: write docstring""" 358 | ym = [ 359 | newtons_method_metalog(a, xi, term, bounds, boundedness) 360 | for xi in m_dict["dataValues"]["x"] 361 | ] 362 | 363 | def MLE_quantile_constraints(x): 364 | M = [ 365 | quantileMetalog(x[:term], yi, term, bounds=bounds, boundedness=boundedness) 366 | for yi in x[term:] 367 | ] 368 | return m_dict["dataValues"]["x"] - M 369 | 370 | def MLE_objective_function(x, y, term, m_dict): 371 | return -np.sum( 372 | [ 373 | np.log10(pdfMetalog(x[:term], yi, term, bounds, boundedness)) 374 | for yi in np.absolute(x[term:]) 375 | ] 376 | ) 377 | 378 | m_dict[str("MLE" + str(term))] = {} 379 | 380 | x0 = np.hstack((a[:term], ym)) 381 | m_dict[str("MLE" + str(term))]["oldobj"] = -MLE_objective_function( 382 | x0, y, term, m_dict 383 | ) 384 | bnd = ((None, None),) * len(a) + ((0, 1),) * (len(x0) - len(a)) 385 | con = NonlinearConstraint(MLE_quantile_constraints, 0, 0) 386 | 387 | mle = minimize( 388 | MLE_objective_function, x0, args=(y, term, m_dict), bounds=bnd, constraints=con 389 | ) 390 | 391 | m_dict[str("MLE" + str(term))]["newobj"] = -MLE_objective_function( 392 | mle.x, y, term, m_dict 393 | ) 394 | m_dict[str("MLE" + str(term))]["A"] = mle.x[:term] 395 | m_dict[str("MLE" + str(term))]["Y"] = mle.x[term:] 396 | 397 | m_dict[str("MLE" + str(term))]["oldA"] = a 398 | m_dict[str("MLE" + str(term))]["oldY"] = y 399 | 400 | out_temp = np.zeros_like(a) 401 | for i in range(term): 402 | out_temp[i] = mle.x[i] 403 | 404 | return out_temp 405 | -------------------------------------------------------------------------------- /pymetalog/class_method.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from scipy.stats import t 5 | from .support import newtons_method_metalog, pdfMetalog_density 6 | from .metalog import metalog 7 | 8 | 9 | def summary(m): 10 | """Prints information about the fitted metalog m. 11 | Prints to console: 12 | - metalog.output_dict['params']['term_limit'] 13 | - metalog.output_dict['params']['term_lower_bound'] 14 | - metalog.output_dict['params']['boundedness'] 15 | - metalog.output_dict['params']['bounds'] 16 | - metalog.output_dict['params']['step_len'] 17 | - metalog.output_dict['params']['fit_method'] 18 | - metalog.output_dict['Validation'] 19 | - metalog.output_dict['params']['nobs'] 20 | 21 | Args: 22 | m (:obj:`metalog`): A fitted metalog object. 23 | 24 | """ 25 | print( 26 | " -----------------------------------------------\n", 27 | "Summary of Metalog Distribution Object\n", 28 | "-----------------------------------------------\n", 29 | "\nParameters\n", 30 | "Term Limit: ", 31 | m.output_dict["params"]["term_limit"], 32 | "\n", 33 | "Term Lower Bound: ", 34 | m.output_dict["params"]["term_lower_bound"], 35 | "\n", 36 | "Boundedness: ", 37 | m.output_dict["params"]["boundedness"], 38 | "\n", 39 | "Bounds (only used based on boundedness): ", 40 | m.output_dict["params"]["bounds"], 41 | "\n", 42 | "Step Length for Distribution Summary: ", 43 | m.output_dict["params"]["step_len"], 44 | "\n", 45 | "Method Use for Fitting: ", 46 | m.output_dict["params"]["fit_method"], 47 | "\n", 48 | "\n\n Validation and Fit Method", 49 | "Number of Data Points Used: ", 50 | m.output_dict["params"]["nobs"], 51 | "\n", 52 | ) 53 | print(m.output_dict["Validation"].to_string(index=False)) 54 | 55 | 56 | def rmetalog(m, n=1, term=2, generator="rand"): 57 | """Take n random draws from fitted metalog m using specified number of terms. 58 | Uses specified random seed. 59 | 60 | Args: 61 | m (:obj:`metalog`): A fitted metalog object. 62 | 63 | n (:obj:`int`, optional): Number of random draws to take from fitted metalog. 64 | - strictly >= 1 65 | - Default: 1 66 | 67 | term (:obj:`int`, optional): Number of metalog terms to use when making random draws. 68 | - strictly >= 2 69 | - must be in range [m.term_lower_bound, m.term_limit] 70 | - Default: 2 71 | 72 | generator (:obj:`str`, optional): String that is used to specify the random number generator. 73 | - must be in set ('rand','hdr') 74 | * 'rand' uses `np.random.rand`, results are random each time 75 | * 'hdr' uses Hubbard Decision Research (HDR) random number generator, results are repeatable 76 | - Default: 'rand' 77 | 78 | Returns: 79 | (:obj:`numpy.ndarray`): n length numpy array of random draws from fitted metalog. 80 | 81 | """ 82 | m = m.output_dict 83 | valid_terms = np.asarray(m["Validation"]["term"]) 84 | valid_terms_printout = " ".join(str(t) for t in valid_terms) 85 | 86 | if (type(n) != int) or (n < 1) or ((n % 1) != 0): 87 | raise TypeError("Error: n must be a positive numeric interger") 88 | if ( 89 | (type(term) != int) 90 | or (term < 2) 91 | or ((term % 1) != 0) 92 | or not (term in valid_terms) 93 | ): 94 | raise TypeError( 95 | "Error: term must be a single positive numeric interger contained " 96 | "in the metalog object. Available terms are: " + valid_terms_printout 97 | ) 98 | 99 | if generator == "hdr": 100 | x_arr = np.arange(1, n + 1) 101 | v_index = np.random.randint(80000) 102 | 103 | def hdrgen(pm_index): 104 | return ( 105 | np.mod( 106 | ( 107 | ( 108 | np.mod( 109 | (v_index + 1000000) 110 | ^ 2 + (v_index + 1000000) * (pm_index + 10000000), 111 | 99999989, 112 | ) 113 | ) 114 | + 1000007 115 | ) 116 | * ( 117 | ( 118 | np.mod( 119 | (pm_index + 10000000) 120 | ^ 2 121 | + (pm_index + 10000000) 122 | * ( 123 | np.mod( 124 | (v_index + 1000000) 125 | ^ 2 126 | + (v_index + 1000000) * (pm_index + 10000000), 127 | 99999989, 128 | ) 129 | ), 130 | 99999989, 131 | ) 132 | ) 133 | + 1000013 134 | ), 135 | 2147483647, 136 | ) 137 | + 0.5 138 | ) / 2147483647 139 | 140 | vhdrgen = np.vectorize(hdrgen) 141 | x = vhdrgen(x_arr) 142 | 143 | else: 144 | x = np.random.rand(n) 145 | 146 | Y = pd.DataFrame(np.array([np.repeat(1, n)]).T, columns=["y1"]) 147 | 148 | # Construct initial Y Matrix values 149 | Y["y2"] = np.log(x / (1 - x)) 150 | if term > 2: 151 | Y["y3"] = (x - 0.5) * Y["y2"] 152 | if term > 3: 153 | Y["y4"] = x - 0.5 154 | 155 | # Complete the values through the term limit 156 | if term > 4: 157 | for i in range(5, (term + 1)): 158 | y = "".join(["y", str(i)]) 159 | if i % 2 != 0: 160 | Y[y] = Y["y4"] ** (i // 2) 161 | if i % 2 == 0: 162 | z = "".join(["y", str(i - 1)]) 163 | Y[y] = Y["y2"] * Y[z] 164 | 165 | amat = "".join(["a", str(term)]) 166 | a = m["A"][amat].iloc[0:(term)].to_frame() 167 | s = np.dot(Y, a) 168 | 169 | if m["params"]["boundedness"] == "sl": 170 | s = m["params"]["bounds"][0] + np.exp(s) 171 | 172 | if m["params"]["boundedness"] == "su": 173 | s = m["params"]["bounds"][1] - np.exp(-(s)) 174 | 175 | if m["params"]["boundedness"] == "b": 176 | s = (m["params"]["bounds"][0] + (m["params"]["bounds"][1]) * np.exp(s)) / ( 177 | 1 + np.exp(s) 178 | ) 179 | 180 | return s 181 | 182 | 183 | def dmetalog(m, q, term=3): 184 | """Generate density values with user specified quantiles from a fitted metalog object. 185 | Generated using user specified number of terms. 186 | Quantiles are generated using a Newton's Method approximation. 187 | 188 | Args: 189 | m (:obj:`metalog`): A fitted metalog object. 190 | 191 | q (:obj:`list` | `numpy.ndarray`): Quantiles to return density values for. 192 | 193 | term (:obj:`int`, optional): Number of metalog terms to use when generating densities. 194 | - strictly >= 2 195 | - must be in range [m.term_lower_bound, m.term_limit] 196 | - Default: 3 197 | 198 | Returns: 199 | (:obj:`list`): len(q) list of density values from fitted metalog. 200 | 201 | """ 202 | valid_terms = np.asarray(m.output_dict["Validation"]["term"]) 203 | 204 | if (type(q) != list) and (type(q) != np.ndarray): 205 | raise TypeError("Error: input q must be a list or numpy array.") 206 | 207 | if ( 208 | (term not in valid_terms) 209 | or type(term) != int 210 | or (term < 2) 211 | or ((term % 1) != 0) 212 | ): 213 | raise TypeError( 214 | "Error: term must be a single positive numeric interger contained in the metalog object. Available " 215 | "terms are: " + " ".join(map(str, valid_terms)) 216 | ) 217 | 218 | qs = list(map(lambda qi: newtons_method_metalog(q=qi, m=m, term=term), q)) 219 | ds = list(map(lambda yi: pdfMetalog_density(y=yi, m=m, t=term), qs)) 220 | 221 | return ds 222 | 223 | 224 | def pmetalog(m, q, term=3): 225 | """Generate probabilities with user specified quantiles from a fitted metalog object. 226 | Generated using user specified number of terms. 227 | Quantiles are generated using a Newton's Method approximation. 228 | 229 | Args: 230 | m (:obj:`metalog`): A fitted metalog object. 231 | 232 | q (:obj:`list` | `numpy.ndarray`): Quantiles to return probabilities values for. 233 | 234 | term (:obj:`int`, optional): Number of metalog terms to use when generating probabilities. 235 | - strictly >= 2 236 | - must be in range [m.term_lower_bound, m.term_limit] 237 | - Default: 3 238 | 239 | Returns: 240 | (:obj:`list`): len(q) list of probabilities from fitted metalog. 241 | 242 | """ 243 | valid_terms = np.asarray(m.output_dict["Validation"]["term"]) 244 | 245 | if (type(q) != list) and (type(q) != np.ndarray): 246 | raise TypeError("Error: input q must be a list or numpy array") 247 | if not isinstance(q, (int, float, complex)) and not all( 248 | isinstance(x, (int, float, complex)) for x in q 249 | ): 250 | raise TypeError("Error: all elements in q must be numeric") 251 | if ( 252 | (term in valid_terms) != True 253 | or type(term) != int 254 | or (term < 2) 255 | or ((term % 1) != 0) 256 | ): 257 | raise TypeError( 258 | "Error: term must be a single positive numeric interger contained in the metalog object. Available " 259 | "terms are: " + " ".join(map(str, valid_terms)) 260 | ) 261 | 262 | qs = list(map(lambda qi: newtons_method_metalog(q=qi, m=m, term=term), q)) 263 | return qs 264 | 265 | 266 | def qmetalog(m, y, term=3): 267 | """Generate quantiles with a probability from a fitted metalog object. 268 | 269 | Args: 270 | m (:obj:`metalog`): A fitted metalog object. 271 | 272 | y (:obj:`list` | `numpy.ndarray`): Probabilities to return quantile values for. 273 | 274 | term (:obj:`int`, optional): Number of metalog terms to use when generating quantiles. 275 | - strictly >= 2 276 | - must be in range [m.term_lower_bound, m.term_limit] 277 | - Default: 3 278 | 279 | Returns: 280 | (:obj:`numpy.ndarray`): len(q) length numpy array of quantiles from fitted metalog. 281 | 282 | """ 283 | m = m.output_dict 284 | valid_terms = np.asarray(m["Validation"]["term"]) 285 | valid_terms_printout = " ".join(str(t) for t in valid_terms) 286 | 287 | if type(y) != list: 288 | raise TypeError("Error: y must be a list of numeric values") 289 | y = np.asarray(y) 290 | if ( 291 | (all(isinstance(x, (int, float, complex)) for x in y)) != True 292 | or (max(y) >= 1) 293 | or (min(y) <= 0) 294 | ): 295 | raise TypeError( 296 | "Error: y or all elements in y must be positive numeric values between 0 and 1" 297 | ) 298 | if ( 299 | (type(term) != int) 300 | or (term < 2) 301 | or ((term % 1) != 0) 302 | or (term in valid_terms) != True 303 | ): 304 | raise TypeError( 305 | "Error: term must be a single positive numeric integer contained " 306 | "in the metalog object. Available terms are: " + valid_terms_printout 307 | ) 308 | 309 | Y = pd.DataFrame(np.array([np.repeat(1, len(y))]).T, columns=["y1"]) 310 | 311 | # Construct the Y Matrix initial values 312 | Y["y2"] = np.log(y / (1 - y)) 313 | if term > 2: 314 | Y["y3"] = (y - 0.5) * Y["y2"] 315 | if term > 3: 316 | Y["y4"] = y - 0.5 317 | 318 | # Complete the values through the term limit 319 | if term > 4: 320 | for i in range(5, (term + 1)): 321 | y = "".join(["y", str(i)]) 322 | if i % 2 != 0: 323 | Y[y] = Y["y4"] ** (i // 2) 324 | if i % 2 == 0: 325 | z = "".join(["y", str(i - 1)]) 326 | Y[y] = Y["y2"] * Y[z] 327 | 328 | amat = "".join(["a", str(term)]) 329 | a = m["A"][amat].iloc[0:(term)].to_frame() 330 | s = np.dot(Y, a) 331 | 332 | if m["params"]["boundedness"] == "sl": 333 | s = m["params"]["bounds"][0] + np.exp(s) 334 | 335 | if m["params"]["boundedness"] == "su": 336 | s = m["params"]["bounds"][1] - np.exp(-(s)) 337 | 338 | if m["params"]["boundedness"] == "b": 339 | s = (m["params"]["bounds"][0] + (m["params"]["bounds"][1]) * np.exp(s)) / ( 340 | 1 + np.exp(s) 341 | ) 342 | 343 | s = s.flatten() 344 | return s 345 | 346 | 347 | def plot(m): 348 | """Plots PDF and Quantile panels for each term of fitted metalog m. 349 | 350 | Args: 351 | m (:obj:`metalog`): A fitted metalog object. 352 | 353 | Returns: 354 | (:obj:`dict` with keys ['pdf', 'cdf']): PDF and Quantile panel plots. 355 | """ 356 | x = m.output_dict 357 | # build plots 358 | InitalResults = pd.DataFrame( 359 | data={ 360 | "term": ( 361 | np.repeat( 362 | (str(x["params"]["term_lower_bound"]) + " Terms"), 363 | len(x["M"].iloc[:, 0]), 364 | ) 365 | ), 366 | "pdfValues": x["M"].iloc[:, 0], 367 | "quantileValues": x["M"].iloc[:, 1], 368 | "cumValue": x["M"]["y"], 369 | } 370 | ) 371 | 372 | if len(x["M"].columns) > 3: 373 | for i in range(2, ((len(x["M"].iloc[0, :]) - 1) // 2 + 1)): 374 | TempResults = pd.DataFrame( 375 | data={ 376 | "term": np.repeat( 377 | (str(x["params"]["term_lower_bound"] + (i - 1)) + " Terms"), 378 | len(x["M"].iloc[:, 0]), 379 | ), 380 | "pdfValues": x["M"].iloc[:, (i * 2 - 2)], 381 | "quantileValues": x["M"].iloc[:, (i * 2 - 1)], 382 | "cumValue": x["M"]["y"], 383 | } 384 | ) 385 | 386 | InitalResults = InitalResults.append( 387 | pd.DataFrame(data=TempResults), ignore_index=True 388 | ) 389 | 390 | # PDF plot 391 | ymin = np.min(InitalResults["pdfValues"]) 392 | ymax = np.max(InitalResults["pdfValues"]) 393 | nterms = InitalResults.term.nunique() 394 | 395 | nrow = (nterms + 3) // 4 396 | 397 | if nterms < 4: 398 | ncol = nterms 399 | else: 400 | ncol = 4 401 | 402 | pdf_fig, axes = plt.subplots(nrow, ncol, sharey="col", squeeze=False) 403 | 404 | for t in range(nterms): 405 | data = InitalResults[ 406 | (InitalResults["term"] == (InitalResults.term.unique()[t])) 407 | ] 408 | x = data["quantileValues"] 409 | y = data["pdfValues"] 410 | r = t // 4 411 | c = t % 4 412 | axes[r, c].plot(x, y) 413 | axes[r, c].set_ylim(ymin, ymax * 1.1) 414 | axes[r, c].set_title(InitalResults.term.unique()[t]) 415 | axes[r, c].tick_params(axis="both", which="major", labelsize=10) 416 | axes[r, c].tick_params(axis="both", which="minor", labelsize=10) 417 | 418 | for t in range(nterms, nrow * ncol): 419 | r = t // 4 420 | c = t % 4 421 | axes[r, c].axis("off") 422 | 423 | pdf_fig.text(0.5, 0.04, "Quantile Values", ha="center") 424 | pdf_fig.text(0.04, 0.5, "PDF Values", va="center", rotation="vertical") 425 | 426 | plt.yscale("linear") 427 | plt.tight_layout(rect=[0.05, 0.05, 1, 1]) 428 | 429 | # Quantile Plot 430 | ymin = np.min(InitalResults["cumValue"]) 431 | ymax = np.max(InitalResults["cumValue"]) 432 | nterms = InitalResults.term.nunique() 433 | 434 | nrow = (nterms + 3) // 4 435 | 436 | if nterms < 4: 437 | ncol = nterms 438 | else: 439 | ncol = 4 440 | 441 | cdf_fig, axes = plt.subplots(nrow, ncol, sharey="col", squeeze=False) 442 | 443 | for t in range(nterms): 444 | data = InitalResults[ 445 | (InitalResults["term"] == (InitalResults.term.unique()[t])) 446 | ] 447 | x = data["quantileValues"] 448 | y = data["cumValue"] 449 | r = t // 4 450 | c = t % 4 451 | axes[r, c].plot(x, y) 452 | axes[r, c].set_ylim(ymin, ymax * 1.1) 453 | axes[r, c].set_title(InitalResults.term.unique()[t]) 454 | axes[r, c].tick_params(axis="both", which="major", labelsize=10) 455 | axes[r, c].tick_params(axis="both", which="minor", labelsize=10) 456 | 457 | for t in range(nterms, nrow * ncol): 458 | r = t // 4 459 | c = t % 4 460 | axes[r, c].axis("off") 461 | 462 | cdf_fig.text(0.5, 0.04, "Quantile Values", ha="center") 463 | cdf_fig.text(0.04, 0.5, "CDF Values", va="center", rotation="vertical") 464 | 465 | plt.yscale("linear") 466 | plt.tight_layout(rect=[0.05, 0.05, 1, 1]) 467 | 468 | return {"pdf": pdf_fig, "cdf": cdf_fig} 469 | 470 | 471 | def update(m, new_data, penalty=None, alpha=0.0): 472 | """Updates a previously fitted metalog object with new data. 473 | 474 | Args: 475 | m (:obj:`metalog`): The previously fitted metalog object to be updated with `new_data`. 476 | - `save_data` parameter must have been set equal to True in original metalog fit. 477 | 478 | new_data (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to update the metalog object with. 479 | - must be an array of allowable types: int, float, numpy.int64, numpy.float64 480 | 481 | penalty (:obj:`str`, optional): Used to specify the norm used in the regularization. 482 | - must be in set ('l2', None) 483 | * 'l2' performs Ridge Regression instead of OLS 484 | - Automatically shrinks a coefficients, leading to "smoother" fits 485 | - should be set in conjunction with `alpha` parameter 486 | - Default: None 487 | 488 | alpha (:obj:`float`, optional): Regularization term to add to OLS fit. 489 | - strictly >= 0. 490 | - should be set in conjunction with `penalty` parameter 491 | - Default: 0. (no regularization, OLS) 492 | 493 | Returns: 494 | (:obj:`metalog`): Input metalog object that has been updated using `new_data` 495 | 496 | Raises: 497 | ValueError: 'Input metalog `m.save_data` parameter must be True' 498 | TypeError: 'Input x must be an array or pandas Series' 499 | TypeError: 'Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64' 500 | IndexError: 'Input x must be of length 3 or greater' 501 | """ 502 | 503 | if not m.save_data: 504 | raise ValueError("Input metalog `m.save_data` parameter must be True") 505 | if ( 506 | (type(new_data) != list) 507 | and (type(new_data) != np.ndarray) 508 | and (type(new_data) != pd.Series) 509 | ): 510 | raise TypeError("Input x must be an array or pandas Series") 511 | if isinstance(new_data, pd.Series): 512 | new_data = new_data.values.copy() 513 | if not all([isinstance(x, (int, float, np.int64, np.float64)) for x in new_data]): 514 | raise TypeError( 515 | "Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64" 516 | ) 517 | if np.size(new_data) < 3: 518 | raise IndexError("Input x must be of length 3 or greater") 519 | 520 | old_append_new_data = np.append(m.x, new_data) 521 | 522 | updated_metalog = metalog( 523 | old_append_new_data, 524 | bounds=m.output_dict["params"]["bounds"], 525 | boundedness=m.output_dict["params"]["boundedness"], 526 | term_limit=m.output_dict["params"]["term_limit"], 527 | term_lower_bound=m.output_dict["params"]["term_lower_bound"], 528 | step_len=m.output_dict["params"]["step_len"], 529 | probs=None, 530 | fit_method=m.output_dict["params"]["fit_method"], 531 | penalty=penalty, 532 | alpha=alpha, 533 | save_data=True, 534 | ) 535 | 536 | Y = updated_metalog.output_dict["Y"].values 537 | gamma = Y.T.dot(Y) 538 | updated_metalog.output_dict["params"]["bayes"]["gamma"] = gamma 539 | updated_metalog.output_dict["params"]["bayes"]["mu"] = updated_metalog.output_dict[ 540 | "A" 541 | ] 542 | v = list() 543 | for i in range( 544 | updated_metalog.output_dict["params"]["term_lower_bound"], 545 | updated_metalog.output_dict["params"]["term_limit"] + 1, 546 | ): 547 | v.append(updated_metalog.output_dict["params"]["nobs"] - i) 548 | v = np.array(v) 549 | a = v / 2 550 | updated_metalog.output_dict["params"]["bayes"]["a"] = a 551 | updated_metalog.output_dict["params"]["bayes"]["v"] = v 552 | 553 | # for now, just using 3 term standard metalog 554 | v = v[1] 555 | a = a[1] 556 | s = np.array([0.1, 0.5, 0.9]) 557 | Ys = np.repeat(1.0, 3) 558 | 559 | Ys = np.column_stack( 560 | [np.repeat(1, 3), np.log(s / (1 - s)), (s - 0.5) * np.log(s / (1 - s))] 561 | ) 562 | three_term_metalog_fit_idx = "a{}".format(updated_metalog.term_limit - 3) 563 | q_bar = np.dot( 564 | Ys, updated_metalog.output_dict["A"][three_term_metalog_fit_idx].values[-3:] 565 | ) 566 | 567 | updated_metalog.output_dict["params"]["bayes"]["q_bar"] = q_bar 568 | 569 | est = (q_bar[2] - q_bar[1]) / 2 + q_bar[1] 570 | s2 = ((q_bar[2] - q_bar[1]) / t.ppf(0.9, np.array(v))) ** 2 571 | 572 | gamma = gamma[:3, :3] 573 | 574 | # build covariance matrix for students t 575 | sig = Ys.dot(np.linalg.solve(gamma, np.eye(len(gamma)))).dot(Ys.T) 576 | 577 | # b = 0.5 * self.output_dict['params']['square_residual_error'][len(self.output_dict['params']['square_residual_error'])] 578 | b = (a * s2) / gamma[1, 1] 579 | updated_metalog.output_dict["params"]["bayes"]["sig"] = (b / a) * sig 580 | updated_metalog.output_dict["params"]["bayes"]["b"] = b 581 | 582 | return updated_metalog 583 | -------------------------------------------------------------------------------- /pymetalog/metalog.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from scipy.stats import t 4 | from .support import MLprobs 5 | from .a_vector import a_vector_OLS_and_LP 6 | 7 | 8 | class metalog: 9 | """ 10 | Main class in pymetalog package. 11 | The pymetalog package is a python implementation of Tom Keelin's metalog distributions. 12 | 13 | The metalog distributions are a family of continuous univariate 14 | probability distributions that are convenient derivations of equations that appropriately 15 | characterize the probability density (PDF), cumulative (CDF) or quantile distribution functions. 16 | 17 | They can be used in most any situation in which CDF data is known and a flexible, 18 | simple, and easy-to-use continuous probability distribution is needed to 19 | represent that data. See links below for more details. 20 | 21 | Uses and benefits: http://www.metalogdistributions.com/usesbenefits.html 22 | Applications: http://www.metalogdistributions.com/applicationsdata.html 23 | 24 | Theory: http://pubsonline.informs.org/doi/abs/10.1287/deca.2016.0338 25 | Homepage: http://www.metalogdistributions.com/ 26 | 27 | Attributes: 28 | x (:obj: `numpy.ndarray`): Input array being fit with the metalog distribution. 29 | nobs (:obj:`int`): Number of data points in x. 30 | boundedness (:obj: `str`): String type of metalog to fit ('u' | 'sl' | 'su' | 'b'). 31 | bounds (:obj: `list`): List upper and lower limits to filter array with before calculating metalog quantiles/pdfs. 32 | term_limit (:obj: `int`): Int upper limit of the range of metalog terms to use to fit the data. 33 | term_lower_bound (:obj: `int`): Int lower limit of the range of metalog terms to use to fit the data. 34 | step_len (:obj: `float`): Float bin width used to estimate the metalog fit. 35 | probs (:obj: `numpy.ndarray`): Input array of probabilities associated with the data values in `x`. 36 | fit_method (:obj: `str`): String type of metalog fit method ('any' | 'OLS' | 'LP' | 'MLE'). 37 | penalty (:obj:`str`): Used to specify the norm used in the regularization. 38 | alpha (:obj:`float`): Regularization term to add to OLS fit. 39 | 40 | output_dict (:obj:`dict` with keys ['params', 'dataValues', 'Y', 'A', 'M', 'Validation']). 41 | - output_dict['params'] (:obj:`dict`): 42 | - output_dict['params']['bounds'] = `bounds` 43 | - output_dict['params']['boundedness'] = `boundedness` 44 | - output_dict['params']['term_limit'] = `term_limit` 45 | - output_dict['params']['term_lower_bound'] = `term_lower_bound` 46 | - output_dict['params']['step_len'] = `step_len` 47 | - output_dict['params']['fit_method'] = `fit_method` 48 | - output_dict['params']['square_residual_error'] = Squared residual error (y_i - yhat_i)^2` 49 | 50 | - output_dict['dataValues'] (:obj:`dict`). 51 | - output_dict['dataValues']['x']: `x` 52 | - output_dict['dataValues']['probs']: `probs` 53 | - output_dict['dataValues']['z']: column calculated in `append_zvector` method 54 | * depends on `boundedness` attribute 55 | * `boundedness` = 'u': 56 | * output_dict['dataValues']['z'] = `x` 57 | * `boundedness` = 'sl': 58 | * output_dict['dataValues']['z'] = log( (`x`-lower_bound) ) 59 | * `boundedness` = 'su': 60 | * output_dict['dataValues']['z'] = = log( (upper_bound-`x`) ) 61 | * `boundedness` = 'b': 62 | * output_dict['dataValues']['z'] = log( (`x`-lower_bound) / (upper_bound-`x`) ) 63 | 64 | - output_dict['Y'] (:obj:`pandas.DataFrame` with columns ['y1','y2','y3','y4', ... ,'yn'] of type numeric). 65 | - output_dict['Y']['y1']: numpy.array of ones with length equal to len(`x`) 66 | - output_dict['Y']['y2']: numpy.array of numeric values equal to the term attached to s in the logistic quantile function np.log(output_dict['dataValues']['probs'] / (1 - output_dict['dataValues']['probs'])) 67 | - output_dict['Y']['y3']: numpy.array of numeric values (output_dict['dataValues']['probs'] - 0.5) * output_dict['Y']['y2'] 68 | - output_dict['Y']['y4']: numpy.array of numeric values output_dict['Y']['y4'] = output_dict['dataValues']['probs'] - 0.5 69 | - output_dict['Y']['yn']: numpy.array of numeric values: 70 | * if n in 'yn' is odd, 71 | output_dict['Y']['yn'] = output_dict['Y']['y4']**(int(i//2)) 72 | * if n in 'yn' is even, 73 | zn = 'y' + str(n-1) 74 | output_dict['Y'][yn] = output_dict['Y']['y2'] * output_dict['Y'][zn] 75 | 76 | - output_dict['A']: (:obj:`pandas.DataFrame` with columns ['a2','a3', ... ,'an'] of type numeric): 77 | * 'a2', 'a3', ... , 'an' are our a coefficients returned by the method specified in `fit_method` 78 | 79 | - output_dict['M']: (:obj:`pandas.DataFrame` with columns ['m2', 'M2', 'm3', 'M3', ... , 'mn', 'Mn'] of type numeric): 80 | * 'm2', 'M2', 'm3', 'M3', ... , 'mn', 'Mn' are the metalog pdf/quantile fit estimates returned by the method specified in `fit_method` 81 | * 'mn' is the pdf fit of metalog term n 82 | * 'Mn' is the quantile fit of metalog term n 83 | 84 | - output_dict['Validation']: (:obj:`pandas.DataFrame` with columns ['term', 'valid', 'method']): 85 | * 'term' (:obj: `int`): each metalog estimation given a number of terms 86 | * 'valid' (:obj: `str`): boolean flag indicating if the metalog estimation was valid or not 87 | * 'method' (:obj: `str`): a string indicating which method was used for the metalog estimation 88 | 89 | Methods: 90 | get_params(`bounds`, `boundedness`, `term_limit`, `term_lower_bound`, `step_len`, `fit_method`) -> output_dict['params'] (:obj:`dict`) 91 | append_zvector(`bounds`, `boundedness`) -> df_x: (:obj:`pandas.DataFrame` with columns ['x','probs','z'] of type numeric) 92 | 93 | """ 94 | 95 | def __init__( 96 | self, 97 | x, 98 | bounds=[0, 1], 99 | boundedness="u", 100 | term_limit=13, 101 | term_lower_bound=2, 102 | step_len=0.01, 103 | probs=None, 104 | fit_method="any", 105 | penalty=None, 106 | alpha=0.0, 107 | ): 108 | """Fits a metalog distribution using the input array `x`. 109 | 110 | Args: 111 | x (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to fit the metalog distribution to. 112 | - must be an array of allowable types: int, float, numpy.int64, numpy.float64 113 | 114 | bounds (:obj:`list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs. 115 | - should be set in conjunction with the `boundedness` parameter 116 | - Default: [0,1] 117 | 118 | boundedness (:obj:`str`, optional): String that is used to specify the type of metalog to fit. 119 | - must be in set ('u','sl','su','b') 120 | - Default: 'u' 121 | * Fits an unbounded metalog 122 | - 'sl' fits a strictly lower bounded metalog 123 | * len(bounds) must == 1 124 | - 'su' fits a strictly upper bounded metalog 125 | * len(bounds) must == 1 126 | - 'b' fits a upper/lower bounded metalog 127 | * len(bounds) must == 2 128 | * bounds[1] must be > bounds[0] 129 | 130 | term_limit (:obj:`int`, optional): The upper limit of the range of metalog terms to use to fit the data. 131 | - strictly > term_lower_bound 132 | - in range [3,30] 133 | 134 | term_lower_bound (:obj:`int`, optional): The lower limit of the range of metalog terms to use to fit the data. 135 | - strictly < term_limit 136 | - in range [2,29] 137 | 138 | step_len (:obj:`float`, optional): Used to specify the bin width used to estimate the metalog. 139 | - must be in range [0.001, 0.01] 140 | 141 | probs (:obj:`list` | `numpy.ndarray`, optional): Probabilities associated with the data values in x. 142 | - must be an array of integer or float data 143 | - all elements must be in range [0,1] 144 | 145 | fit_method (:obj:`str`, optional): Fit method to use to fit metalog distribution. 146 | - must be in set ('any','OLS','LP','MLE') 147 | - Default: 'any' 148 | * first tries 'OLS' method than 'LP' 149 | - 'OLS' only tries to fit by solving directly for a coefficients using ordinary least squares method 150 | - 'LP' only tries to estimate fit using simplex linear program optimization routine 151 | - 'MLE' first tries 'OLS' method than falls back to a maximum likelihood estimation routine 152 | 153 | penalty (:obj:`str`, optional): Used to specify the norm used in the regularization. 154 | - must be in set ('l2', None) 155 | * 'l2' performs Ridge Regression instead of OLS 156 | - Automatically shrinks a coefficients, leading to "smoother" fits 157 | - should be set in conjunction with `alpha` parameter 158 | - Default: None 159 | 160 | alpha (:obj:`float`, optional): Regularization term to add to OLS fit. 161 | - strictly >= 0. 162 | - should be set in conjunction with `penalty` parameter 163 | - Default: 0. (no regularization, OLS) 164 | 165 | Raises: 166 | TypeError: 'Input x must be an array or pandas Series' 167 | TypeError: 'Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64' 168 | TypeError: 'bounds parameter must be of type list' 169 | TypeError: 'bounds parameter must be list of integers' 170 | TypeError: 'term_limit parameter should be an integer between 3 and 30' 171 | TypeError: 'term_lower_bound parameter should be an integer' 172 | TypeError: 'Input probabilities must be an array' 173 | TypeError: 'Input probabilities must be an array of integer or float data' 174 | 175 | IndexError: 'Input x must be of length 3 or greater' 176 | IndexError: 'Must supply only one bound for semi-lower or semi-upper boundedness' 177 | IndexError: 'Must supply exactly two bounds for bounded boundedness (i.e. [0,30])' 178 | IndexError: 'probs vector and x vector must be the same length' 179 | 180 | ValueError: 'for semi-lower boundedness the lower bound must be less than the smallest value in x' 181 | ValueError: 'for semi-upper boundedness the upper bound must be greater than the largest value in x' 182 | ValueError: 'Upper bound must be greater than lower bound' 183 | ValueError: 'boundedness parameter must be u, su, sl or b only' 184 | ValueError: 'term_limit parameter should be an integer between 3 and 30' 185 | ValueError: 'term_limit must be less than or equal to the length of the vector x' 186 | ValueError: 'term_lower_bound parameter should be greater than or equal to 2' 187 | ValueError: 'term_lower_bound parameter must be less than or equal to term_limit parameter' 188 | ValueError: 'step_len must be >= to 0.001 and <= to 0.01' 189 | ValueError: 'Input probabilities cannot contain nans' 190 | ValueError: 'Input probabilities must have values between, not including, 0 and 1' 191 | ValueError: 'fit_method can only be values OLS, LP, any, or MLE' 192 | ValueError: 'penalty can only be values l2 or None' 193 | ValueError: 'alpha must only be a float >= 0.' 194 | 195 | Example: 196 | 197 | Fit a metalog to a numpy.ndarray of numeric data. 198 | 199 | >>> import numpy as np 200 | import pandas as pd 201 | import matplotlib.pyplot as plt 202 | import pymetalog as pm 203 | 204 | >>> fish_data = np.loadtxt('fishout.csv', delimiter=',', skiprows=1, dtype='str')[:,1].astype(np.float) 205 | >>> fish_metalog = pm.metalog(x=fish_data, bounds=[0,60], boundedness='b', term_limit=9, term_lower_bound=2, step_len=.001,) 206 | >>> pm.summary(fish_metalog) 207 | >>> # plot function - right now this saves plots to local 208 | pm.plot(fish_metalog) 209 | plt.show() 210 | 211 | """ 212 | 213 | self.x = x.copy() 214 | self.boundedness = boundedness 215 | self.bounds = bounds[:] 216 | self.term_limit = term_limit 217 | self.term_lower_bound = term_lower_bound 218 | self.step_len = step_len 219 | self.probs = probs 220 | self.fit_method = fit_method 221 | self.penalty = penalty 222 | self.nobs = len(x) 223 | 224 | if penalty == None: 225 | alpha = 0.0 226 | 227 | self.alpha = alpha 228 | 229 | if probs == None: 230 | df_x = MLprobs(self.x, step_len=step_len) 231 | 232 | else: 233 | df_x = pd.DataFrame() 234 | df_x["x"] = self.x 235 | df_x["probs"] = self.probs 236 | 237 | output_dict = {} 238 | 239 | # build z vector based on boundedness 240 | df_x = self.append_zvector(df_x) 241 | 242 | output_dict["params"] = self.get_params() 243 | output_dict["dataValues"] = df_x 244 | 245 | # Construct the Y Matrix initial values 246 | Y = pd.DataFrame() 247 | Y["y1"] = np.ones(len(df_x["x"])) 248 | Y["y2"] = np.log(df_x["probs"] / (1 - df_x["probs"])) 249 | Y["y3"] = (df_x["probs"] - 0.5) * Y["y2"] 250 | 251 | if self.term_limit > 3: 252 | Y["y4"] = df_x["probs"] - 0.5 253 | 254 | # Complete the values through the term limit 255 | if term_limit > 4: 256 | for i in range(5, self.term_limit + 1): 257 | yn = "y" + str(i) 258 | 259 | if i % 2 != 0: 260 | Y[yn] = Y["y4"] ** (int(i // 2)) 261 | 262 | if i % 2 == 0: 263 | zn = "y" + str(i - 1) 264 | Y[yn] = Y["y2"] * Y[zn] 265 | 266 | output_dict["Y"] = Y 267 | 268 | self.output_dict = a_vector_OLS_and_LP( 269 | output_dict, 270 | bounds=self.bounds, 271 | boundedness=self.boundedness, 272 | term_limit=self.term_limit, 273 | term_lower_bound=self.term_lower_bound, 274 | fit_method=self.fit_method, 275 | alpha=self.alpha, 276 | diff_error=0.001, 277 | diff_step=0.001, 278 | ) 279 | 280 | # input validation... 281 | @property 282 | def x(self): 283 | """x (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to fit a metalog to.""" 284 | 285 | return self._x 286 | 287 | @x.setter 288 | def x(self, xs): 289 | if (type(xs) != list) and (type(xs) != np.ndarray) and (type(xs) != pd.Series): 290 | raise TypeError("Input x must be an array or pandas Series") 291 | if isinstance(xs, pd.Series): 292 | xs = xs.values.copy() 293 | if not all(isinstance(x, (int, float, np.int64, np.float64)) for x in xs): 294 | raise TypeError( 295 | "Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64" 296 | ) 297 | if np.size(xs) < 3: 298 | raise IndexError("Input x must be of length 3 or greater") 299 | self._x = xs 300 | 301 | @property 302 | def bounds(self): 303 | """bounds (:obj:`list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs.""" 304 | 305 | return self._bounds 306 | 307 | @bounds.setter 308 | def bounds(self, bs): 309 | if type(bs) != list: 310 | raise TypeError("bounds parameter must be of type list") 311 | if not all(isinstance(x, (int)) for x in bs): 312 | raise TypeError("bounds parameter must be list of integers") 313 | if (self.boundedness == "sl" or self.boundedness == "su") and len(bs) != 1: 314 | raise IndexError( 315 | "Must supply only one bound for semi-lower or semi-upper boundedness" 316 | ) 317 | if self.boundedness == "b" and len(bs) != 2: 318 | raise IndexError( 319 | "Must supply exactly two bounds for bounded boundedness (i.e. [0,30])" 320 | ) 321 | if self.boundedness == "su": 322 | bs_o = [np.min(self.x), bs[0]] 323 | if self.boundedness == "sl": 324 | bs_o = [bs[0], np.max(self.x)] 325 | if self.boundedness == "b" or self.boundedness == "u": 326 | bs_o = bs 327 | if self.boundedness == "sl" and np.min(self.x) < bs_o[0]: 328 | raise ValueError( 329 | "for semi-lower boundedness the lower bound must be less than the smallest value in x" 330 | ) 331 | if self.boundedness == "su" and np.max(self.x) > bs_o[1]: 332 | raise ValueError( 333 | "for semi-upper boundedness the upper bound must be greater than the largest value in x" 334 | ) 335 | if bs_o[0] > bs_o[1] and self.boundedness == "b": 336 | raise ValueError("Upper bound must be greater than lower bound") 337 | self._bounds = bs_o 338 | 339 | @property 340 | def boundedness(self): 341 | """boundedness (:obj:`str`, optional): String that is used to specify the type of metalog to fit.""" 342 | 343 | return self._boundedness 344 | 345 | @boundedness.setter 346 | def boundedness(self, bns): 347 | if bns != "u" and bns != "b" and bns != "su" and bns != "sl": 348 | raise ValueError("boundedness parameter must be u, su, sl or b only") 349 | self._boundedness = bns 350 | 351 | @property 352 | def term_limit(self): 353 | """term_limit (:obj:`int`, optional): The upper limit of the range of a coefficients to generate.""" 354 | 355 | return self._term_limit 356 | 357 | @term_limit.setter 358 | def term_limit(self, tl): 359 | if type(tl) != int: 360 | raise TypeError( 361 | "term_limit parameter should be an integer between 3 and 30" 362 | ) 363 | if tl > 30 or tl < 3: 364 | raise ValueError( 365 | "term_limit parameter should be an integer between 3 and 30" 366 | ) 367 | if tl > len(self.x): 368 | raise ValueError( 369 | "term_limit must be less than or equal to the length of the vector x" 370 | ) 371 | self._term_limit = tl 372 | 373 | @property 374 | def term_lower_bound(self): 375 | """term_lower_bound (:obj:`int`, optional): The lower limit of the range of a coefficients to generate.""" 376 | 377 | return self._term_lower_bound 378 | 379 | @term_lower_bound.setter 380 | def term_lower_bound(self, tlb): 381 | if type(tlb) != int: 382 | raise TypeError("term_lower_bound parameter should be an integer") 383 | if tlb < 2: 384 | raise ValueError( 385 | "term_lower_bound parameter should be greater than or equal to 2" 386 | ) 387 | if tlb > self.term_limit: 388 | raise ValueError( 389 | "term_lower_bound parameter must be less than or equal to term_limit parameter" 390 | ) 391 | self._term_lower_bound = tlb 392 | 393 | @property 394 | def step_len(self): 395 | """step_len (:obj:`float`, optional): Used to specify the bin width used to estimate the metalog.""" 396 | 397 | return self._step_len 398 | 399 | @step_len.setter 400 | def step_len(self, sl): 401 | if sl < 0.001 or sl > 0.01: 402 | raise ValueError("step_len must be >= to 0.001 and <= to 0.01") 403 | self._step_len = sl 404 | 405 | @property 406 | def probs(self): 407 | """probs (:obj:`list` | `numpy.ndarray`, optional): Probabilities associated with the data values in x.""" 408 | 409 | return self._probs 410 | 411 | @probs.setter 412 | def probs(self, ps): 413 | if ps != None: 414 | if not isinstance(ps, (list, np.ndarray)): 415 | raise TypeError("Input probabilities must be an array") 416 | if not all(isinstance(x, (int, float)) for x in ps): 417 | raise TypeError( 418 | "Input probabilities must be an array of integer or float data" 419 | ) 420 | if np.size(np.where(np.isnan(ps))) != 0: 421 | raise ValueError("Input probabilities cannot contain nans") 422 | if np.max(ps) > 1 or np.min(ps) < 0: 423 | raise ValueError( 424 | "Input probabilities must have values between, not including, 0 and 1" 425 | ) 426 | if len(ps) != len(self.x): 427 | raise IndexError("probs vector and x vector must be the same length") 428 | ps = ps.copy() 429 | self._probs = ps 430 | 431 | @property 432 | def fit_method(self): 433 | """fit_method (:obj:`str`, optional): Fit method to use to fit metalog distribution.""" 434 | 435 | return self._fit_method 436 | 437 | @fit_method.setter 438 | def fit_method(self, fm): 439 | if fm != "OLS" and fm != "LP" and fm != "any" and fm != "MLE": 440 | raise ValueError("fit_method can only be values OLS, LP, any, or MLE") 441 | self._fit_method = fm 442 | 443 | @property 444 | def penalty(self): 445 | """penalty (:obj:`str`, optional): Used to specify the norm used in the regularization.""" 446 | 447 | return self._penalty 448 | 449 | @fit_method.setter 450 | def penalty(self, p): 451 | if p != "l2" and p is not None: 452 | raise ValueError("penalty can only be values l2 or None") 453 | self._penalty = p 454 | 455 | @property 456 | def alpha(self): 457 | """alpha (:obj:`float`): L2 regularization term to add to OLS fit""" 458 | 459 | return self._alpha 460 | 461 | @alpha.setter 462 | def alpha(self, a): 463 | if a < 0 or not isinstance(a, float): 464 | raise ValueError("alpha must only be a float >= 0.") 465 | self._alpha = a 466 | 467 | def get_params(self): 468 | """Sets the `params` key (dict) of `output_dict` object prior to input to `a_vector_OLS_and_LP` method. 469 | - Uses metalog attributes to set keys 470 | 471 | Returns: 472 | params: (:obj:`dict`): Dictionary that is used as input to `a_vector_OLS_and_LP` method. 473 | 474 | """ 475 | 476 | params = {} 477 | params["bounds"] = self.bounds 478 | params["boundedness"] = self.boundedness 479 | params["term_limit"] = self.term_limit 480 | params["term_lower_bound"] = self.term_lower_bound 481 | params["step_len"] = self.step_len 482 | params["fit_method"] = self.fit_method 483 | params["nobs"] = self.nobs 484 | 485 | return params 486 | 487 | def append_zvector(self, df_x): 488 | """Sets the `dataValues` key (pandas.DataFrame) of `output_dict` object prior to input to `a_vector_OLS_and_LP` method. 489 | 490 | Uses `boundedness` attribute to set z vector 491 | - 'u': output_dict['dataValues']['z'] = x 492 | * Start with all the input data 493 | - 'sl': output_dict['dataValues']['z'] = log( (x-lower_bound) ) 494 | - 'su': output_dict['dataValues']['z'] = log( (upper_bound-x) ) 495 | - 'b': output_dict['dataValues']['z'] = log( (x-lower_bound) / (upper_bound-x) ) 496 | 497 | Returns: 498 | df_x: (:obj:`pandas.DataFrame` with columns ['x','probs','z'] of type numeric): DataFrame that is used as input to `a_vector_OLS_and_LP` method. 499 | - df_x['x']: metalog.x 500 | - df_x['probs']: metalog.probs 501 | - df_x['z']: z vector above 502 | """ 503 | 504 | if self.boundedness == "u": 505 | df_x["z"] = df_x["x"] 506 | if self.boundedness == "sl": 507 | df_x["z"] = np.log(np.array((df_x["x"] - self.bounds[0]), dtype=np.float64)) 508 | if self.boundedness == "su": 509 | df_x["z"] = -np.log( 510 | np.array((self.bounds[1] - df_x["x"]), dtype=np.float64) 511 | ) 512 | if self.boundedness == "b": 513 | df_x["z"] = np.log( 514 | np.array( 515 | ((df_x["x"] - self.bounds[0]) / (self.bounds[1] - df_x["x"])), 516 | dtype=np.float64, 517 | ) 518 | ) 519 | 520 | return df_x 521 | 522 | def __getitem__(self): 523 | return self.output_dict 524 | 525 | def __getitem__(self, arr): 526 | if arr not in self.output_dict: 527 | raise KeyError() 528 | return self.output_dict[arr] 529 | -------------------------------------------------------------------------------- /pymetalog/pdf_quantile_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .support import pdfMetalog, quantileMetalog 3 | 4 | 5 | def pdf_quantile_builder(temp, y, term_limit, bounds, boundedness): 6 | """Builds the metalog pdf and quantile arrays based on the a coefficients found by fitting metalog distribution. 7 | 8 | Args: 9 | temp (:obj: `numpy.ndarray` of type float): Array of a coefficients found by fitting metalog distribution. 10 | - Fit method is specified by metalog.fit_method attribute 11 | 12 | y (:obj: `numpy.ndarray` of type float): Array of bin widths specified for `a` parameter 13 | 14 | term_limit (:obj: `int`): The upper limit of the range of metalog terms to use to fit the data. 15 | - metalog.term_limit attribute 16 | - in range [3,30] 17 | 18 | bounds (:obj:`list`): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs. 19 | - metalog.bounds attribute 20 | - Default: [0,1] 21 | 22 | boundedness (:obj: `str`): String that is used to specify the type of metalog to fit. 23 | - metalog.boundedness attribute 24 | 25 | Returns: 26 | q_dict (:obj:`dict` with keys ['m', 'M', 'y', 'valid']): Initialized output_dict variable from metalog class. 27 | - q_dict['m']: (:obj:`numpy.ndarray` of type float): Array of metalog pdf values. 28 | * Returned by `pdfMetalog` method 29 | * Influenced by `boundedness` parameter 30 | * A valid metalog fit will return an array having all elements strictly > 0 31 | 32 | - q_dict['M']: (:obj:`numpy.ndarray` of type float): Array of metalog quantile values. 33 | * Returned by `quantileMetalog` method 34 | * Influenced by `boundedness` parameter 35 | - `boundedness` = 'sl': Inserts `bounds`[0] to the front of the quantile array 36 | - `boundedness` = 'su': Appends `bounds`[1] to the end of the quantile array 37 | - `boundedness` = 'b': Inserts `bounds`[0] to the front of the quantile array 38 | and appends `bounds`[1] to the end of the quantile array 39 | 40 | - q_dict['y']: (:obj:`numpy.ndarray` of type float): Array of bin widths specified for the pdfs/quantiles. 41 | * Influenced by `boundedness` parameter 42 | - `boundedness` = 'sl': Inserts `bounds`[0] at the front of the quantile array 43 | - `boundedness` = 'su': Appends `bounds`[1] to the end of the quantile array 44 | - `boundedness` = 'b': Inserts `bounds`[0] at the front of the quantile array 45 | and appends `bounds`[1] to the end of the quantile array 46 | 47 | - q_dict['valid']: (:obj:`str`): A string indicating if the metalog pdf generated by `pdfMetalog` method is valid or not. 48 | * If all values in the metalog pdf are >= 0, q_dict['valid'] = 'yes' 49 | * If any values in the metalog pdf are < 0, q_dict['valid'] = 'no' 50 | 51 | """ 52 | q_dict = {} 53 | 54 | # build pdf 55 | m = pdfMetalog(temp, y[0], term_limit, bounds=bounds, boundedness=boundedness) 56 | 57 | for j in range(2, len(y) + 1): 58 | tempPDF = pdfMetalog( 59 | temp, y[j - 1], term_limit, bounds=bounds, boundedness=boundedness 60 | ) 61 | m = np.append(m, tempPDF) 62 | 63 | # Build quantile values 64 | M = quantileMetalog(temp, y[1], term_limit, bounds=bounds, boundedness=boundedness) 65 | 66 | for j in range(2, len(y) + 1): 67 | tempQant = quantileMetalog( 68 | temp, y[j - 1], term_limit, bounds=bounds, boundedness=boundedness 69 | ) 70 | M = np.append(M, tempQant) 71 | 72 | # Add trailing and leading zero's for pdf bounds 73 | if boundedness == "sl": 74 | m = np.append(0, m) 75 | M = np.append(bounds[0], M) 76 | 77 | if boundedness == "su": 78 | m = np.append(m, 0) 79 | M = np.append(M, bounds[1]) 80 | 81 | if boundedness == "b": 82 | m = np.append(0, m) 83 | m = np.append(m, 0) 84 | M = np.append(bounds[0], M) 85 | M = np.append(M, bounds[1]) 86 | 87 | # Add y values for bounded models 88 | if boundedness == "sl": 89 | y = np.append(0, y) 90 | 91 | if boundedness == "su": 92 | y = np.append(y, 1) 93 | 94 | if boundedness == "b": 95 | y = np.append(0, y) 96 | y = np.append(y, 1) 97 | 98 | q_dict["m"] = m 99 | q_dict["M"] = M 100 | q_dict["y"] = y 101 | 102 | # PDF validation 103 | q_dict["valid"] = pdfMetalogValidation(q_dict["m"]) 104 | 105 | return q_dict 106 | 107 | 108 | def pdfMetalogValidation(x): 109 | """Validation that all calculated metalog pdf values are greater than or equal to 0. 110 | 111 | Args: 112 | x (:obj: `numpy.ndarray` of type float): Array of metalog pdf values. 113 | - Returned by `pdfMetalog` method 114 | - Influenced by `boundedness` parameter 115 | 116 | Returns: 117 | 'yes' | 'no' (:obj:`str`): 'yes' if all elements strictly >= 0, else 'no'. 118 | """ 119 | y = np.min(x) 120 | if y >= 0: 121 | return "yes" 122 | else: 123 | return "no" 124 | -------------------------------------------------------------------------------- /pymetalog/support.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def MLprobs(x_old, step_len): 6 | """Returns the quantile values x['x'] and corresponding bins x['y']. 7 | Called during metalog.__init__ method call. 8 | 9 | Args: 10 | x_old (:obj: `numpy.ndarray` of type numeric): Input data to fit the metalog distribution to. 11 | - must be an array of allowable types: int, float, numpy.int64, numpy.float64 12 | 13 | step_len (:obj:`float`): Used to specify the bin width used to estimate the metalog. 14 | 15 | Returns: 16 | x: (:obj:`dict` with keys ['x','probs'] of type float): 17 | - x['x']: (:obj:`numpy.ndarray` of type float): 18 | * x['x'] is the quantile values found using the bin widths array x['y] - which is specified using the `step_len` parameter 19 | 20 | - x['probs']: (:obj:`numpy.ndarray` of type float): 21 | * x['probs'] is the array of bin widths specified for x['x'] 22 | 23 | """ 24 | 25 | l = len(x_old) 26 | x = pd.DataFrame() 27 | x["x"] = x_old.copy() 28 | 29 | x.sort_values(by="x") 30 | 31 | x["probs"] = 0 32 | for i in range(0, l): 33 | if i == 0: 34 | x.loc[i, "probs"] = 0.5 / l 35 | else: 36 | x.loc[i, "probs"] = x.loc[i - 1, "probs"] + 1 / l 37 | 38 | # TODO method for turning off and on this n>100 estimation 39 | if len(x.index) > 100: 40 | y2 = np.linspace(step_len, 1 - step_len, int((1 - step_len) / step_len)) 41 | 42 | tailstep = step_len / 10 43 | 44 | y1 = np.linspace( 45 | tailstep, (min(y2) - tailstep), int((min(y2) - tailstep) / tailstep) 46 | ) 47 | 48 | y3 = np.linspace( 49 | (max(y2) + tailstep), 50 | (max(y2) + tailstep * 9), 51 | int((tailstep * 9) / tailstep), 52 | ) 53 | 54 | y = np.hstack((y1, y2, y3)) 55 | 56 | x_new = np.quantile(x_old, y) 57 | 58 | df_x = {} 59 | df_x["x"] = x_new 60 | df_x["probs"] = y 61 | x = df_x 62 | 63 | return x 64 | 65 | 66 | def pdfMetalog(a, y, t, bounds=[], boundedness="u"): 67 | """Estimates the metalog pdf given the a coefficients and percentiles found using the specified metalog.fit_method attribute. 68 | Called during metalog.__init__ method call if `fit_method`='MLE'. 69 | Called during pdf_quantile_builder method call. 70 | 71 | Args: 72 | a (:obj: `numpy.ndarray` of type float): Array of a coefficients found by fitting metalog distribution using the `fit_method` parameter. 73 | 74 | y (:obj: `numpy.ndarray` of type float): Array of bin widths specified for `a` parameter 75 | 76 | t (:obj: `int`): The upper limit of the range of metalog terms to use to fit the data. 77 | - metalog.term_limit attribute 78 | - in range [3,30] 79 | 80 | bounds (:obj: `list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs. 81 | - should be set in conjunction with the `boundedness` parameter 82 | - Default: [0,1] 83 | 84 | boundedness (:obj: `str`, optional): String that is used to specify the type of metalog to fit. 85 | - must be in set ('u','sl','su','b') 86 | - Default: 'u' 87 | * Fits an unbounded metalog 88 | * If `boundedness` parameter != 'u' we must calculate the metalog quantiles using an unbounded metalog, via the `quantileMetalog` method. 89 | - 'sl' fits a strictly lower bounded metalog 90 | * len(bounds) must == 1 91 | - 'su' fits a strictly upper bounded metalog 92 | * len(bounds) must == 1 93 | - 'b' fits a upper/lower bounded metalog 94 | * len(bounds) must == 2 95 | * bounds[1] must be > bounds[0] 96 | 97 | Returns: 98 | x: (:obj: `numpy.ndarray` of type float): Array of metalog pdf values. 99 | 100 | """ 101 | if y <= 0: 102 | y = 0.00001 103 | 104 | if y >= 1: 105 | y = 0.99999 106 | 107 | d = y * (1 - y) 108 | f = y - 0.5 109 | l = np.log(y / (1 - y)) 110 | 111 | # Initiate pdf 112 | 113 | # For the first three terms 114 | x = a[1] / d 115 | if len(a) > 2 and a[2] != 0: 116 | x = x + a[2] * ((f / d) + l) 117 | 118 | # For the fourth term 119 | if t > 3: 120 | x = x + a[3] 121 | 122 | # Initalize some counting variables 123 | e = 1 124 | o = 1 125 | 126 | # For all other terms greater than 4 127 | if t > 4: 128 | for i in range(5, t + 1): 129 | if (i % 2) != 0: 130 | # iff odd 131 | x = x + ((o + 1) * a[i - 1] * f ** o) 132 | o = o + 1 133 | 134 | if (i % 2) == 0: 135 | # iff even 136 | x = x + a[i - 1] * (((f ** (e + 1)) / d) + (e + 1) * (f ** e) * l) 137 | e = e + 1 138 | 139 | # Some change of variables here for boundedness 140 | x = x ** (-1) 141 | 142 | if boundedness != "u": 143 | M = quantileMetalog(a, y, t, bounds=bounds, boundedness="u") 144 | 145 | if boundedness == "sl": 146 | x = x * np.exp(-M) 147 | 148 | if boundedness == "su": 149 | x = x * np.exp(M) 150 | 151 | if boundedness == "b": 152 | x = (x * (1 + np.exp(M)) ** 2) / ((bounds[1] - bounds[0]) * np.exp(M)) 153 | 154 | if x <= 0: 155 | x = 0.00001 156 | # print(str(x) + " zoop") 157 | 158 | return x 159 | 160 | 161 | def quantileMetalog(a, y, t, bounds=[], boundedness="u"): 162 | """Estimates the metalog quantiles given the a coefficients and percentiles found using the specified metalog.fit_method attribute. 163 | Called during metalog.__init__ method call if `fit_method`='MLE'. 164 | Called during pdf_quantile_builder method call. 165 | 166 | Args: 167 | a (:obj: `numpy.ndarray` of type float): Array of a coefficients found by fitting metalog distribution using the `fit_method` parameter. 168 | 169 | y (:obj: `numpy.ndarray` of type float): Array of bin widths specified for `a` parameter 170 | 171 | t (:obj: `int`): The upper limit of the range of metalog terms to use to fit the data. 172 | - metalog.term_limit attribute 173 | - in range [3,30] 174 | 175 | bounds (:obj: `list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs. 176 | - should be set in conjunction with the `boundedness` parameter 177 | - Default: [0,1] 178 | 179 | boundedness (:obj: `str`, optional): String that is used to specify the type of metalog to fit. 180 | - must be in set ('u','sl','su','b') 181 | - Default: 'u' 182 | * Fits an unbounded metalog 183 | - 'sl' fits a strictly lower bounded metalog 184 | * len(bounds) must == 1 185 | - 'su' fits a strictly upper bounded metalog 186 | * len(bounds) must == 1 187 | - 'b' fits a upper/lower bounded metalog 188 | * len(bounds) must == 2 189 | * bounds[1] must be > bounds[0] 190 | 191 | Returns: 192 | x: (:obj: `numpy.ndarray` of type float): Array of metalog quantile values. 193 | 194 | """ 195 | if y <= 0: 196 | y = 0.00001 197 | 198 | if y >= 1: 199 | y = 0.99999 200 | # Some values for calculation 201 | f = y - 0.5 202 | l = np.log(y / (1 - y)) 203 | 204 | # For the first three terms 205 | x = a[0] + a[1] * l 206 | if t > 2: 207 | x = x + a[2] * f * l 208 | 209 | # For the fourth term 210 | if t > 3: 211 | x = x + a[3] * f 212 | 213 | # Some tracking variables 214 | o = 2 215 | e = 2 216 | 217 | # For all other terms greater than 4 218 | if t > 4: 219 | for i in range(5, t + 1): 220 | if (i % 2) == 0: 221 | x = x + a[i - 1] * f ** e * l 222 | e = e + 1 223 | if (i % 2) != 0: 224 | x = x + a[i - 1] * f ** o 225 | o = o + 1 226 | 227 | if boundedness == "sl": 228 | x = bounds[0] + np.exp(x) 229 | 230 | if boundedness == "su": 231 | x = bounds[1] - np.exp(-x) 232 | 233 | if boundedness == "b": 234 | x = (bounds[0] + bounds[1] * np.exp(x)) / (1 + np.exp(x)) 235 | 236 | return x 237 | 238 | 239 | def diffMatMetalog(term_limit, step_len): 240 | """TODO: write docstring""" 241 | y = np.arange(step_len, 1, step_len) 242 | Diff = np.array([]) 243 | 244 | for i in range(0, (len(y))): 245 | d = y[i] * (1 - y[i]) 246 | f = y[i] - 0.5 247 | l = np.log(y[i] / (1 - y[i])) 248 | 249 | # Initiate pdf 250 | diffVector = 0 251 | 252 | # For the first three terms 253 | x = 1 / d 254 | diffVector = [diffVector, x] 255 | 256 | if term_limit > 2: 257 | diffVector.append((f / d) + l) 258 | 259 | # For the fourth term 260 | if term_limit > 3: 261 | diffVector.append(1) 262 | 263 | # Initalize some counting variables 264 | e = 1 265 | o = 1 266 | 267 | # For all other terms greater than 4 268 | if term_limit > 4: 269 | for i in range(5, (term_limit + 1)): 270 | if (i % 2) != 0: 271 | # iff odd 272 | diffVector.append((o + 1) * f ** o) 273 | o = o + 1 274 | 275 | if (i % 2) == 0: 276 | # iff even 277 | diffVector.append(((f ** (e + 1)) / d) + (e + 1) * (f ** e) * l) 278 | e = e + 1 279 | if np.size(Diff) == 0: 280 | Diff = diffVector 281 | else: 282 | Diff = np.vstack((Diff, diffVector)) 283 | 284 | Diff_neg = -1 * (Diff) 285 | new_Diff = np.hstack((Diff[:, [0]], Diff_neg[:, [0]])) 286 | 287 | for c in range(1, (len(Diff[1, :]))): 288 | new_Diff = np.hstack((new_Diff, Diff[:, [c]])) 289 | new_Diff = np.hstack((new_Diff, Diff_neg[:, [c]])) 290 | 291 | new_Diff = pd.DataFrame(data=new_Diff) 292 | 293 | return new_Diff 294 | 295 | 296 | def newtons_method_metalog(m, q, term, bounds=None, boundedness=None): 297 | """TODO: write docstring""" 298 | # a simple newtons method application 299 | if bounds == None: 300 | bounds = m["params"]["bounds"] 301 | if boundedness == None: 302 | boundedness = m["params"]["boundedness"] 303 | 304 | # if m is metalog 305 | try: 306 | m = m.output_dict 307 | avec = "a" + str(term) 308 | a = m["A"][avec] 309 | except: 310 | a = m 311 | 312 | # TODO there should be setters for at least some of these hyperparameters 313 | alpha_step = 0.5 314 | err = 1e-10 315 | temp_err = 0.1 316 | y_now = 0.5 317 | 318 | i = 1 319 | while temp_err > err: 320 | frist_function = quantileMetalog(a, y_now, term, bounds, boundedness) - q 321 | derv_function = pdfMetalog(a, y_now, term, bounds, boundedness) 322 | y_next = y_now - alpha_step * (frist_function * derv_function) 323 | temp_err = abs((y_next - y_now)) 324 | 325 | if y_next > 1: 326 | y_next = 0.99999 327 | 328 | if y_next < 0: 329 | y_next = 0.000001 330 | 331 | y_now = y_next 332 | i = i + 1 333 | 334 | if i > 10000: 335 | raise StopIteration( 336 | "Approximation taking too long, quantile value: " 337 | + str(q) 338 | + " is to far from distribution median. Try plot() to see distribution." 339 | ) 340 | 341 | return y_now 342 | 343 | 344 | def pdfMetalog_density(m, t, y): 345 | m = m.output_dict 346 | avec = "a" + str(t) 347 | a = m["A"][avec] 348 | bounds = m["params"]["bounds"] 349 | boundedness = m["params"]["boundedness"] 350 | 351 | d = y * (1 - y) 352 | f = y - 0.5 353 | l = np.log(y / (1 - y)) 354 | 355 | # Initiate pdf 356 | 357 | # For the first three terms 358 | x = a[1] / d 359 | if a[2] != 0: 360 | x = x + a[2] * ((f / d) + l) 361 | 362 | # For the fourth term 363 | if t > 3: 364 | x = x + a[3] 365 | 366 | # Initalize some counting variables 367 | e = 1 368 | o = 1 369 | 370 | # For all other terms greater than 4 371 | if t > 4: 372 | for i in range(5, t + 1): 373 | if (i % 2) != 0: 374 | # iff odd 375 | x = x + ((o + 1) * a[i - 1] * f ** o) 376 | o = o + 1 377 | 378 | if (i % 2) == 0: 379 | # iff even 380 | x = x + a[i - 1] * (((f ** (e + 1)) / d) + (e + 1) * (f ** e) * l) 381 | e = e + 1 382 | 383 | # Some change of variables here for boundedness 384 | 385 | x = x ** (-1) 386 | 387 | if boundedness != "u": 388 | M = quantileMetalog(a, y, t, bounds=bounds, boundedness="u") 389 | 390 | if boundedness == "sl": 391 | x = x * np.exp(-M) 392 | 393 | if boundedness == "su": 394 | x = x * np.exp(M) 395 | 396 | if boundedness == "b": 397 | x = (x * (1 + np.exp(M)) ** 2) / ((bounds[1] - bounds[0]) * np.exp(M)) 398 | 399 | return x 400 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="pymetalog", 8 | version="0.2.1", 9 | author="Colin Smith, Travis Jefferies, Isaac J. Faber", 10 | description="A python package that generates functions for the metalog distribution. The metalog distribution is a highly flexible probability distribution that can be used to model data without traditional parameters.", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url="https://github.com/tjefferies/pymetalog", 14 | packages=setuptools.find_packages(), 15 | classifiers=[ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ], 20 | package_data={'pymetalog': ["examples/*"]}, 21 | install_requires=[ 22 | 'numpy', 23 | 'pandas', 24 | 'scipy', 25 | 'seaborn', 26 | ], 27 | ) 28 | --------------------------------------------------------------------------------