├── .gitignore
├── LICENSE
├── PyMetalog_usagetest.py
├── README.md
├── pymetalog
    ├── __init__.py
    ├── a_vector.py
    ├── class_method.py
    ├── metalog.py
    ├── pdf_quantile_functions.py
    └── support.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /pymetalog/__pycache__
2 | .DS_Store
3 | 
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 colsmit
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PyMetalog_usagetest.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | import pymetalog as pm
 5 | 
 6 | 
 7 | fish_data = pm.example_data
 8 | 
 9 | # metalog creation
10 | fish_metalog = pm.metalog(
11 |     x=fish_data,
12 |     bounds=[0, 40],
13 |     boundedness="b",
14 |     term_limit=15,
15 |     term_lower_bound=2,
16 |     step_len=0.001,
17 |     penalty=None,
18 | )
19 | 
20 | # summary function
21 | pm.summary(fish_metalog)
22 | 
23 | # # plot function - right now this saves plots to local
24 | pm.plot(fish_metalog)
25 | plt.show()
26 | 
27 | # # metalog random sampling
28 | r_gens = pm.rmetalog(fish_metalog, n=1000, term=9, generator="hdr")
29 | plt.hist(r_gens, 14)
30 | plt.show()
31 | 
32 | # quantiles from a percentile
33 | qs = pm.qmetalog(fish_metalog, y=[0.25, 0.5, 0.75], term=9)
34 | print("qmetalog demo: " + str(qs))
35 | 
36 | # probabilities from a quantile
37 | ps = pm.pmetalog(fish_metalog, q=[3, 10, 25], term=9)
38 | print("pmetalog demo: " + str(ps))
39 | 
40 | # density from a quantile
41 | ds = pm.dmetalog(fish_metalog, q=[3, 10, 25], term=9)
42 | print("dmetalog demo: " + str(ds))
43 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | pymetalog
 2 | ================
 3 | Colin Smith, Travis Jefferies, Isaac J. Faber
 4 | 
 5 | `pip install pymetalog`
 6 | 
 7 | ### The Python Metalog Distribution
 8 | 
 9 | This repo is a work-in-progress Python package (**pymetalog**) that provides functions
10 | for fitting and using the metalog distribution. The metalog distribution is a highly
11 | flexible probability distribution that can be used to model data without
12 | traditional parameters.
13 | 
14 | ### Metalog Background
15 | 
16 | In economics, business, engineering, science and other fields,
17 | continuous uncertainties frequently arise that are not easily- or
18 | well-characterized by previously-named continuous probability
19 | distributions. Frequently, there is data available from measurements,
20 | assessments, derivations, simulations or other sources that characterize
21 | the range of an uncertainty. But the underlying process that generated
22 | this data is either unknown or fails to lend itself to convenient
23 | derivation of equations that appropriately characterize the probability
24 | density (PDF), cumulative (CDF) or quantile distribution functions.
25 | 
26 | The metalog distributions are a family of continuous univariate
27 | probability distributions that directly address this need. They can be
28 | used in most any situation in which CDF data is known and a flexible,
29 | simple, and easy-to-use continuous probability distribution is needed to
30 | represent that data. Consider their [uses and
31 | benefits](http://www.metalogdistributions.com/usesbenefits.html). Also
32 | consider their
33 | [applications](http://www.metalogdistributions.com/applicationsdata.html)
34 | over a wide range of fields and data sources.
35 | 
36 | This repository is a complement and extension of the information found
37 | in the [paper
38 | published](http://pubsonline.informs.org/doi/abs/10.1287/deca.2016.0338)
39 | in Decision Analysis and on the
40 | [website](http://www.metalogdistributions.com/).
41 | 


--------------------------------------------------------------------------------
/pymetalog/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | from .metalog import metalog
 4 | from .class_method import rmetalog, plot, qmetalog, pmetalog, dmetalog, summary, update
 5 | 
 6 | name = "pymetalog"
 7 | 
 8 | this_pth = os.path.dirname(__file__)
 9 | data_path = os.path.join(this_pth, "examples", "fishout.csv")
10 | example_data = np.loadtxt(data_path, delimiter=",", skiprows=1, dtype="str")[
11 |     :, 1
12 | ].astype(np.float)
13 | 


--------------------------------------------------------------------------------
/pymetalog/a_vector.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import scipy as sp
  4 | 
  5 | from scipy.optimize import linprog, minimize, NonlinearConstraint
  6 | from .pdf_quantile_functions import pdf_quantile_builder
  7 | from .support import diffMatMetalog, pdfMetalog, quantileMetalog, newtons_method_metalog
  8 | 
  9 | import time
 10 | import warnings
 11 | 
 12 | 
 13 | def a_vector_OLS_and_LP(
 14 |     m_dict,
 15 |     bounds,
 16 |     boundedness,
 17 |     term_limit,
 18 |     term_lower_bound,
 19 |     fit_method,
 20 |     alpha,
 21 |     diff_error=0.001,
 22 |     diff_step=0.001,
 23 | ):
 24 | 
 25 |     """Main workhorse function of pymetalog package.
 26 |         Called during metalog.__init__ method call.
 27 | 
 28 |     Args:
 29 |         m_dict (:obj:`dict` with keys ['params', 'dataValues', 'Y']): Initialized output_dict variable from metalog class.
 30 |             - m_dict['params']: (:obj:`dict` with keys ['bounds', 'boundedness', 'term_limit', 'term_lower_bound', 'step_len', 'fit_method']):
 31 |                 * 'bounds': metalog.bounds
 32 |                 * 'boundedness': metalog.boundedness
 33 |                 * 'term_limit': metalog.term_limit
 34 |                 * 'term_lower_bound': metalog.term_lower_bound
 35 |                 * 'step_len': metalog.step_len
 36 |                 * 'fit_method': metalog.fit_method
 37 | 
 38 |             - m_dict['dataValues']: (:obj:`pandas.DataFrame` with columns ['x','probs','z']  of type numeric):
 39 |                 * 'x': metalog.x
 40 |                 * 'probs': metalog.probs
 41 |                 * 'z': column calculated in metalog.append_zvector method
 42 |                     - depends on metalog.boundedness attribute
 43 |                     - metalog.boundedness = 'u':
 44 |                         * 'z' = metalog.x
 45 |                     - metalog.boundedness = 'sl':
 46 |                         * 'z' = log( (metalog.x-lower_bound) )
 47 |                     - metalog.boundedness = 'su':
 48 |                         * 'z' = = log( (upper_bound-metalog.x) )
 49 |                     - metalog.boundedness = 'b':
 50 |                         * 'z' = log( (metalog.x-lower_bound) / (upper_bound-metalog.x) )
 51 | 
 52 |             - m_dict['Y']: (:obj:`pandas.DataFrame` with columns ['y1','y2','y3','y4', ... ,'yn']  of type numeric):
 53 |                 * 'y1': numpy.array of ones with length equal to len(x)
 54 |                 * 'y2': numpy.array of numeric values equal to the term attached to s in the logistic quantile function np.log(m_dict['dataValues']['probs'] / (1 - m_dict['dataValues']['probs']))
 55 |                 * 'y3': numpy.array of numeric values (m_dict['dataValues']['probs'] - 0.5) * m_dict['Y']['y2']
 56 |                 * 'y4': numpy.array of numeric values m_dict['Y']['y4'] = m_dict['dataValues']['probs'] - 0.5
 57 |                 * 'yn': numpy.array of numeric values:
 58 |                     - if n in 'yn' is odd,
 59 |                         m_dict['Y']['yn'] = m_dict['Y']['y4']**(int(i//2))
 60 |                     - if n in 'yn' is even,
 61 |                         zn = 'y' + str(n-1)
 62 |                         m_dict['Y'][yn] = m_dict['Y']['y2'] * m_dict['Y'][zn]
 63 | 
 64 |         bounds (:obj:`list`): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs.
 65 |             - should be set in conjunction with the `boundedness` parameter
 66 | 
 67 |         boundedness (:obj:`str`): String that is used to specify the type of metalog to fit.
 68 |             - must be in set ('u','sl','su','b')
 69 |             - Default: 'u'
 70 |                 * Fits an unbounded metalog
 71 |             - 'sl' fits a strictly lower bounded metalog
 72 |                 * len(bounds) must == 1
 73 |             - 'su' fits a strictly upper bounded metalog
 74 |                 * len(bounds) must == 1
 75 |             - 'b' fits a upper/lower bounded metalog
 76 |                 * len(bounds) must == 2
 77 |                 * bounds[1] must be > bounds[0]
 78 | 
 79 |         term_limit (:obj:`int`): The upper limit of the range of metalog terms to use to fit the data.
 80 |             - strictly > term_lower_bound
 81 |             - in range [3,30]
 82 | 
 83 |         term_lower_bound (:obj:`int`): The lower limit of the range of metalog terms to use to fit the data.
 84 |             - strictly < term_limit
 85 |             - in range [2,29]
 86 | 
 87 |         fit_method (:obj:`str`): Fit method to use to fit metalog distribution.
 88 |             - must be in set ('any','OLS','LP','MLE')
 89 |             - Default: 'any'
 90 |                 * first tries 'OLS' method than 'LP'
 91 |             - 'OLS' only tries to fit by solving directly for a coefficients using ordinary least squares method
 92 |             - 'LP' only tries to estimate fit using simplex linear program optimization routine
 93 |             - 'MLE' first tries 'OLS' method than falls back to a maximum likelihood estimation routine
 94 | 
 95 |         alpha (:obj:`float`, optional): Regularization term to add to OLS fit
 96 |             - strictly >= 0.
 97 |             - should be set in conjunction with `penalty` parameter
 98 |             - Default: 0. (no regularization, OLS)
 99 | 
100 |         diff_error (:obj:`float`, optional): Value used to in scipy.optimize.linprog method call
101 |                                              to init the array of values representing the
102 |                                              upper-bound of each inequality constraint (row) in A_ub.
103 |             - #TODO: Insert maths
104 | 
105 |         diff_step (:obj:`float`, optional): Value passed to `step_len` parameter in support.py diffMatMetalog method call
106 |                                              defines the bin width for the Reimann sum of the differences differentiation method
107 |             - diffMatMetalog differentiates the metalog pdf
108 |                 * Differentiation reference: https://math.stackexchange.com/a/313135
109 |     Returns:
110 |         m_dict: (:obj:`dict` with keys ['params', 'dataValues', 'Y', 'A', 'M', 'Validation'])
111 |             - m_dict['A']: (:obj:`pandas.DataFrame` with columns ['a2','a3', ... ,'an'] of type numeric):
112 |                 * a2, a3, ... , an are our a coefficients returned by the method specified in `fit_method`
113 | 
114 |             - m_dict['M']: (:obj:`pandas.DataFrame` with columns 0:'pdf_1',1:'cdf_1',2:'pdf_2',3:'cdf_2',
115 |                             ...,((2*(term_limit-term_lower_bound))+1)-1:'pdf_n',
116 |                                 ((2*(term_limit-term_lower_bound))+1):'cdf_n'
117 |                             where n is the total number of metalog fits determined by (term_limit-term_lower_bound)+1
118 |                             )
119 |                 * pdf_1, pdf_2, ... , pdf_n are the metalog pdfs returned by pdf_quantile_builder.pdfMetalog method
120 |                 * cdf_1, cdf_2, ... , cdf_n are the metalog quantiles returned by pdf_quantile_builder.quantileMetalog method
121 | 
122 |             - m_dict['y']: (:obj: `numpy.ndarray` of type float):
123 |                 * Array of bin widths for both the pdf_n and cdf_n
124 | 
125 |             - m_dict['Validation']: (:obj:`pandas.DataFrame` with columns ['term', 'valid', 'method'] of type str):
126 |                 * 'term': each metalog estimation given a number of terms
127 |                 * 'valid': boolean flag indicating if the metalog estimation was valid or not
128 |                 * 'method': a string indicating which method was used for the metalog estimation
129 | 
130 |     """
131 | 
132 |     A = pd.DataFrame()
133 |     c_a_names = []
134 |     c_m_names = []
135 |     Mh = pd.DataFrame()
136 |     Validation = pd.DataFrame()
137 |     df_MH_temp_list = list()
138 |     df_A_temp_list = list()
139 |     df_Validation_temp_list = list()
140 | 
141 |     # TODO: Large for-loop can probably be factored into smaller functions
142 |     for i in range(term_lower_bound, term_limit + 1):
143 |         Y = m_dict["Y"].iloc[:, 0:i]
144 |         eye = np.eye(Y.shape[1])
145 |         z = m_dict["dataValues"]["z"]
146 |         y = m_dict["dataValues"]["probs"]
147 |         step_len = m_dict["params"]["step_len"]
148 |         methodFit = "OLS"
149 |         a_name = "a" + str(i)
150 |         m_name = "m" + str(i)
151 |         M_name = "M" + str(i)
152 |         c_m_names = np.append(c_m_names, [m_name, M_name])
153 |         c_a_names = np.append(c_a_names, a_name)
154 | 
155 |         if fit_method == "any" or fit_method == "MLE":
156 |             try:
157 |                 temp = np.dot(
158 |                     np.dot(np.linalg.inv(np.dot(Y.T, Y) + alpha * eye), Y.T), z
159 |                 )
160 |             except:
161 |                 # use LP solver if OLS breaks
162 |                 temp = a_vector_LP(
163 |                     m_dict,
164 |                     term_limit=i,
165 |                     term_lower_bound=i,
166 |                     diff_error=diff_error,
167 |                     diff_step=diff_step,
168 |                 )
169 |                 methodFit = "Linear Program"
170 |         if fit_method == "OLS":
171 |             try:
172 |                 temp = np.dot(
173 |                     np.dot(np.linalg.inv(np.dot(Y.T, Y) + alpha * eye), Y.T), z
174 |                 )
175 |             except:
176 |                 raise RuntimeError(
177 |                     "OLS was unable to solve infeasible or poorly formulated problem"
178 |                 )
179 |         if fit_method == "LP":
180 |             temp = a_vector_LP(
181 |                 m_dict,
182 |                 term_limit=i,
183 |                 term_lower_bound=i,
184 |                 diff_error=diff_error,
185 |                 diff_step=diff_step,
186 |             )
187 |             methodFit = "Linear Program"
188 | 
189 |         if fit_method == "MLE":
190 |             temp = a_vector_MLE(temp, y, i, m_dict, bounds, boundedness)
191 | 
192 |         temp = np.append(temp, np.zeros(term_limit - i))
193 | 
194 |         # build a y vector for smaller data sets
195 |         if len(z) < 100:
196 |             y2 = np.linspace(step_len, 1 - step_len, int((1 - step_len) / step_len))
197 |             tailstep = step_len / 10
198 |             y1 = np.linspace(
199 |                 tailstep, (min(y2) - tailstep), int((min(y2) - tailstep) / tailstep)
200 |             )
201 |             y3 = np.linspace(
202 |                 (max(y2) + tailstep),
203 |                 (max(y2) + tailstep * 9),
204 |                 int((tailstep * 9) / tailstep),
205 |             )
206 |             y = np.hstack((y1, y2, y3))
207 | 
208 |         # Get the dict and quantile values back for validation
209 |         temp_dict = pdf_quantile_builder(
210 |             temp, y=y, term_limit=i, bounds=bounds, boundedness=boundedness
211 |         )
212 | 
213 |         # If it not a valid pdf run and the OLS version was used the LP version
214 |         if (temp_dict["valid"] == "no") and (fit_method != "OLS"):
215 |             temp = a_vector_LP(
216 |                 m_dict,
217 |                 term_limit=i,
218 |                 term_lower_bound=i,
219 |                 diff_error=diff_error,
220 |                 diff_step=diff_step,
221 |             )
222 |             temp = np.append(temp, np.zeros(term_limit - i))
223 |             methodFit = "Linear Program"
224 | 
225 |             # Get the dict and quantile values back for validation
226 |             temp_dict = pdf_quantile_builder(
227 |                 temp, y=y, term_limit=i, bounds=bounds, boundedness=boundedness
228 |             )
229 | 
230 |         df_MH_temp_list.append(pd.DataFrame(temp_dict["m"]))
231 |         df_MH_temp_list.append(pd.DataFrame(temp_dict["M"]))
232 |         df_A_temp_list.append(pd.DataFrame(temp))
233 | 
234 |         tempValidation = pd.DataFrame(
235 |             data={"term": [i], "valid": [temp_dict["valid"]], "method": [methodFit]}
236 |         )
237 |         df_Validation_temp_list.append(tempValidation)
238 | 
239 |     Validation = pd.concat(df_Validation_temp_list, axis=0)
240 |     Mh = pd.concat(df_MH_temp_list, axis=1)
241 |     A = pd.concat(df_A_temp_list, axis=1)
242 | 
243 |     A.columns = c_a_names
244 |     Mh.columns = c_m_names
245 | 
246 |     m_dict["A"] = A
247 |     m_dict["M"] = Mh
248 |     m_dict["M"]["y"] = temp_dict["y"]
249 |     m_dict["Validation"] = Validation
250 | 
251 |     A = np.column_stack((np.repeat(1.0, len(A)), A))
252 |     Est = np.dot(m_dict["Y"], A)
253 |     ncols = A.shape[1]
254 |     Z = np.column_stack(
255 |         (
256 |             np.array(m_dict["dataValues"]["z"]),
257 |             np.repeat(m_dict["dataValues"]["z"], ncols - 1).reshape(
258 |                 len(m_dict["dataValues"]["z"]), ncols - 1
259 |             ),
260 |         )
261 |     )
262 | 
263 |     m_dict["square_residual_error"] = ((Z - Est) ** 2).sum(axis=1)
264 | 
265 |     return m_dict
266 | 
267 | 
def a_vector_LP(
    m_dict, term_limit, term_lower_bound, diff_error=0.001, diff_step=0.001
):
    """Estimate metalog coefficients with a least-absolute-deviation linear program.

    For each term count i in [term_lower_bound, term_limit] the LP minimises
    the sum of absolute residuals of ``Y[:, :i] @ a = z`` subject to the
    inequality rows produced by ``diffMatMetalog(i, diff_step)`` being at
    least ``diff_error``.  Because ``linprog`` constrains every variable to be
    non-negative by default, each residual and each coefficient is split into
    a positive and a negative part.

    Args:
        m_dict (:obj:`dict`): must provide m_dict['Y'] (:obj:`pandas.DataFrame`
            of basis columns) and m_dict['dataValues']['z'] (target values).
        term_limit (:obj:`int`): last term count to fit.
        term_lower_bound (:obj:`int`): first term count to fit.
        diff_error (:obj:`float`, optional): lower bound imposed on every
            diffMatMetalog constraint row. Default: 0.001
        diff_step (:obj:`float`, optional): step passed to diffMatMetalog.
            Default: 0.001

    Returns:
        (:obj:`list`): coefficients for the LAST term count fitted (i.e.
        ``term_limit``); results of earlier iterations are discarded.
        Callers in this module pass term_lower_bound == term_limit.

    Raises:
        RuntimeError: if the solver returns no solution vector.

    """
    z = m_dict["dataValues"]["z"]

    for i in range(term_lower_bound, term_limit + 1):
        # Work positionally on plain arrays so the pandas index of Y cannot
        # misalign rows when the pieces are stacked together.
        Y = np.asarray(m_dict["Y"].iloc[:, 0:i], dtype=float)
        n_obs, n_cols = Y.shape

        # Residual variables: row j carries [+1, -1] at columns 2j, 2j+1
        # (positive / negative part of the j-th residual).
        error_mat = np.kron(np.eye(n_obs), [1.0, -1.0])

        # Coefficient variables: interleave y_k and -y_k so that each
        # coefficient is the difference of two non-negative LP variables.
        split_Y = np.empty((n_obs, 2 * n_cols))
        split_Y[:, 0::2] = Y
        split_Y[:, 1::2] = -Y

        # Inequality rows only involve the coefficient variables, so they are
        # left-padded with zeros for the residual variables.
        # NOTE(review): diffMatMetalog presumably evaluates the derivative of
        # the quantile function on a grid (positivity => valid pdf) — confirm
        # against support.diffMatMetalog.
        diff_mat = np.asarray(diffMatMetalog(i, diff_step), dtype=float)
        n_diff = diff_mat.shape[0]
        diff_rows = np.hstack((np.zeros((n_diff, 2 * n_obs)), diff_mat))

        # Objective: sum of positive and negative residual parts; the
        # coefficient variables do not contribute.
        c = np.append(np.ones(2 * n_obs), np.zeros(2 * i))

        # Constraint matrices: equality reproduces z exactly (residuals
        # included); inequality is negated to express ">= diff_error" in
        # linprog's "A_ub @ x <= b_ub" form.
        A_eq = np.hstack((error_mat, split_Y))
        A_ub = -1 * diff_rows
        b_eq = z
        b_ub = -1 * np.repeat(diff_error, n_diff)

        try:
            lp_sol = linprog(
                c,
                A_ub=A_ub,
                b_ub=b_ub,
                A_eq=A_eq,
                b_eq=b_eq,
                method="simplex",
                options={"maxiter": 5000, "tol": 1.0e-5, "disp": False},
            )
        except ValueError:
            # SciPy >= 1.11 removed the legacy "simplex" solver and rejects
            # the method name; fall back to HiGHS there.  Any other input
            # problem raises the same ValueError again from this call.
            lp_sol = linprog(
                c,
                A_ub=A_ub,
                b_ub=b_ub,
                A_eq=A_eq,
                b_eq=b_eq,
                method="highs",
                options={"maxiter": 5000, "disp": False},
            )

        if lp_sol.x is None:
            raise RuntimeError(
                "Linear program did not return a solution: " + str(lp_sol.message)
            )

        # Consolidating solution back into the a vector: drop the residual
        # variables, then recombine positive and negative parts.
        tempLP = lp_sol.x[2 * n_obs :]
        temp = [
            tempLP[2 * r] - tempLP[(2 * r) + 1] for r in range(len(tempLP) // 2)
        ]

    return temp
354 | 
355 | 
def a_vector_MLE(a, y, term, m_dict, bounds, boundedness):
    """Refine metalog coefficients by constrained maximum likelihood.

    The optimisation vector is the concatenation of the first `term`
    coefficients and one probability per observation.  The summed log10
    density (from pdfMetalog) is maximised subject to the equality constraint
    that quantileMetalog at each probability reproduces the observed x.
    Diagnostics are stored in ``m_dict["MLE<term>"]`` under the keys
    'oldobj', 'newobj', 'A', 'Y', 'oldA' and 'oldY'.

    Args:
        a: starting coefficient vector (e.g. the OLS solution).
        y: probabilities associated with the data; only recorded as 'oldY'.
        term (:obj:`int`): number of metalog terms being fitted.
        m_dict (:obj:`dict`): metalog output dict; m_dict['dataValues']['x']
            is read and the 'MLE<term>' entry is written.
        bounds (:obj:`list`): forwarded to the support functions.
        boundedness (:obj:`str`): forwarded to the support functions.

    Returns:
        (:obj:`numpy.ndarray`): array shaped like `a` whose first `term`
        entries are the optimised coefficients.

    """
    observed = m_dict["dataValues"]["x"]

    # Starting probabilities: invert the current fit at every observed value.
    start_probs = [
        newtons_method_metalog(a, x_obs, term, bounds, boundedness)
        for x_obs in observed
    ]

    def quantile_residuals(x):
        # Equality constraint: fitted quantiles must equal the observed data.
        fitted = []
        for prob in x[term:]:
            fitted.append(
                quantileMetalog(
                    x[:term], prob, term, bounds=bounds, boundedness=boundedness
                )
            )
        return observed - fitted

    def negative_log_likelihood(x, y, term, m_dict):
        # Signature matches the ``args`` tuple handed to ``minimize``; only
        # ``x`` and ``term`` are actually used.
        log_density = [
            np.log10(pdfMetalog(x[:term], prob, term, bounds, boundedness))
            for prob in np.absolute(x[term:])
        ]
        return -np.sum(log_density)

    key = str("MLE" + str(term))
    m_dict[key] = {}
    record = m_dict[key]

    x0 = np.hstack((a[:term], start_probs))
    record["oldobj"] = -negative_log_likelihood(x0, y, term, m_dict)

    # Coefficients are unbounded; every probability must stay within [0, 1].
    n_coefficients = len(a)
    bnd = ((None, None),) * n_coefficients + ((0, 1),) * (len(x0) - n_coefficients)
    con = NonlinearConstraint(quantile_residuals, 0, 0)

    mle = minimize(
        negative_log_likelihood,
        x0,
        args=(y, term, m_dict),
        bounds=bnd,
        constraints=con,
    )

    record["newobj"] = -negative_log_likelihood(mle.x, y, term, m_dict)
    record["A"] = mle.x[:term]
    record["Y"] = mle.x[term:]

    record["oldA"] = a
    record["oldY"] = y

    # Copy the optimised coefficients into an array shaped like the input.
    out_temp = np.zeros_like(a)
    for position in range(term):
        out_temp[position] = mle.x[position]

    return out_temp
405 | 


--------------------------------------------------------------------------------
/pymetalog/class_method.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | from scipy.stats import t
  5 | from .support import newtons_method_metalog, pdfMetalog_density
  6 | from .metalog import metalog
  7 | 
  8 | 
  9 | def summary(m):
 10 |     """Prints information about the fitted metalog m.
 11 |         Prints to console:
 12 |           - metalog.output_dict['params']['term_limit']
 13 |           - metalog.output_dict['params']['term_lower_bound']
 14 |           - metalog.output_dict['params']['boundedness']
 15 |           - metalog.output_dict['params']['bounds']
 16 |           - metalog.output_dict['params']['step_len']
 17 |           - metalog.output_dict['params']['fit_method']
 18 |           - metalog.output_dict['Validation']
 19 |           - metalog.output_dict['params']['nobs']
 20 | 
 21 |     Args:
 22 |         m (:obj:`metalog`): A fitted metalog object.
 23 | 
 24 |     """
 25 |     print(
 26 |         " -----------------------------------------------\n",
 27 |         "Summary of Metalog Distribution Object\n",
 28 |         "-----------------------------------------------\n",
 29 |         "\nParameters\n",
 30 |         "Term Limit: ",
 31 |         m.output_dict["params"]["term_limit"],
 32 |         "\n",
 33 |         "Term Lower Bound: ",
 34 |         m.output_dict["params"]["term_lower_bound"],
 35 |         "\n",
 36 |         "Boundedness: ",
 37 |         m.output_dict["params"]["boundedness"],
 38 |         "\n",
 39 |         "Bounds (only used based on boundedness): ",
 40 |         m.output_dict["params"]["bounds"],
 41 |         "\n",
 42 |         "Step Length for Distribution Summary: ",
 43 |         m.output_dict["params"]["step_len"],
 44 |         "\n",
 45 |         "Method Use for Fitting: ",
 46 |         m.output_dict["params"]["fit_method"],
 47 |         "\n",
 48 |         "\n\n Validation and Fit Method",
 49 |         "Number of Data Points Used: ",
 50 |         m.output_dict["params"]["nobs"],
 51 |         "\n",
 52 |     )
 53 |     print(m.output_dict["Validation"].to_string(index=False))
 54 | 
 55 | 
 56 | def rmetalog(m, n=1, term=2, generator="rand"):
 57 |     """Take n random draws from fitted metalog m using specified number of terms.
 58 |         Uses specified random seed.
 59 | 
 60 |     Args:
 61 |         m (:obj:`metalog`): A fitted metalog object.
 62 | 
 63 |         n (:obj:`int`, optional): Number of random draws to take from fitted metalog.
 64 |           - strictly >= 1
 65 |           - Default: 1
 66 | 
 67 |         term (:obj:`int`, optional): Number of metalog terms to use when making random draws.
 68 |           - strictly >= 2
 69 |           - must be in range [m.term_lower_bound, m.term_limit]
 70 |           - Default: 2
 71 | 
 72 |         generator (:obj:`str`, optional): String that is used to specify the random number generator.
 73 |           - must be in set ('rand','hdr')
 74 |             * 'rand' uses `np.random.rand`, results are random each time
 75 |             * 'hdr' uses Hubbard Decision Research (HDR) random number generator, results are repeatable
 76 |           - Default: 'rand'
 77 | 
 78 |     Returns:
 79 |         (:obj:`numpy.ndarray`): n length numpy array of random draws from fitted metalog.
 80 | 
 81 |     """
 82 |     m = m.output_dict
 83 |     valid_terms = np.asarray(m["Validation"]["term"])
 84 |     valid_terms_printout = " ".join(str(t) for t in valid_terms)
 85 | 
 86 |     if (type(n) != int) or (n < 1) or ((n % 1) != 0):
 87 |         raise TypeError("Error: n must be a positive numeric interger")
 88 |     if (
 89 |         (type(term) != int)
 90 |         or (term < 2)
 91 |         or ((term % 1) != 0)
 92 |         or not (term in valid_terms)
 93 |     ):
 94 |         raise TypeError(
 95 |             "Error: term must be a single positive numeric interger contained "
 96 |             "in the metalog object. Available terms are: " + valid_terms_printout
 97 |         )
 98 | 
 99 |     if generator == "hdr":
100 |         x_arr = np.arange(1, n + 1)
101 |         v_index = np.random.randint(80000)
102 | 
103 |         def hdrgen(pm_index):
104 |             return (
105 |                 np.mod(
106 |                     (
107 |                         (
108 |                             np.mod(
109 |                                 (v_index + 1000000)
110 |                                 ^ 2 + (v_index + 1000000) * (pm_index + 10000000),
111 |                                 99999989,
112 |                             )
113 |                         )
114 |                         + 1000007
115 |                     )
116 |                     * (
117 |                         (
118 |                             np.mod(
119 |                                 (pm_index + 10000000)
120 |                                 ^ 2
121 |                                 + (pm_index + 10000000)
122 |                                 * (
123 |                                     np.mod(
124 |                                         (v_index + 1000000)
125 |                                         ^ 2
126 |                                         + (v_index + 1000000) * (pm_index + 10000000),
127 |                                         99999989,
128 |                                     )
129 |                                 ),
130 |                                 99999989,
131 |                             )
132 |                         )
133 |                         + 1000013
134 |                     ),
135 |                     2147483647,
136 |                 )
137 |                 + 0.5
138 |             ) / 2147483647
139 | 
140 |         vhdrgen = np.vectorize(hdrgen)
141 |         x = vhdrgen(x_arr)
142 | 
143 |     else:
144 |         x = np.random.rand(n)
145 | 
146 |     Y = pd.DataFrame(np.array([np.repeat(1, n)]).T, columns=["y1"])
147 | 
148 |     # Construct initial Y Matrix values
149 |     Y["y2"] = np.log(x / (1 - x))
150 |     if term > 2:
151 |         Y["y3"] = (x - 0.5) * Y["y2"]
152 |     if term > 3:
153 |         Y["y4"] = x - 0.5
154 | 
155 |     # Complete the values through the term limit
156 |     if term > 4:
157 |         for i in range(5, (term + 1)):
158 |             y = "".join(["y", str(i)])
159 |             if i % 2 != 0:
160 |                 Y[y] = Y["y4"] ** (i // 2)
161 |             if i % 2 == 0:
162 |                 z = "".join(["y", str(i - 1)])
163 |                 Y[y] = Y["y2"] * Y[z]
164 | 
165 |     amat = "".join(["a", str(term)])
166 |     a = m["A"][amat].iloc[0:(term)].to_frame()
167 |     s = np.dot(Y, a)
168 | 
169 |     if m["params"]["boundedness"] == "sl":
170 |         s = m["params"]["bounds"][0] + np.exp(s)
171 | 
172 |     if m["params"]["boundedness"] == "su":
173 |         s = m["params"]["bounds"][1] - np.exp(-(s))
174 | 
175 |     if m["params"]["boundedness"] == "b":
176 |         s = (m["params"]["bounds"][0] + (m["params"]["bounds"][1]) * np.exp(s)) / (
177 |             1 + np.exp(s)
178 |         )
179 | 
180 |     return s
181 | 
182 | 
def dmetalog(m, q, term=3):
    """Return density values of a fitted metalog at user supplied quantiles.

    Each entry of `q` is first passed to `newtons_method_metalog` (a Newton's
    Method approximation) and the result is then passed to
    `pdfMetalog_density`, both using `term` metalog terms.

    Args:
        m (:obj:`metalog`): A fitted metalog object.

        q (:obj:`list` | `numpy.ndarray`): Quantiles to return density values for.

        term (:obj:`int`, optional): Number of metalog terms to use when generating densities.
          - strictly >= 2
          - must be one of the terms listed in m.output_dict['Validation']['term']
          - Default: 3

    Returns:
        (:obj:`list`): len(q) list of density values from fitted metalog.

    Raises:
        TypeError: if q is not a list or numpy array, or if term is not an
            int >= 2 that is available in the metalog object.

    """
    valid_terms = np.asarray(m.output_dict["Validation"]["term"])

    if type(q) not in (list, np.ndarray):
        raise TypeError("Error: input q must be a list or numpy array.")

    # Checks run in the same order as before: membership, exact type, range.
    term_is_usable = (
        (term in valid_terms)
        and type(term) == int
        and term >= 2
        and (term % 1) == 0
    )
    if not term_is_usable:
        available = " ".join(map(str, valid_terms))
        raise TypeError(
            "Error: term must be a single positive numeric interger contained in the metalog object. Available "
            "terms are: " + available
        )

    # First pass: values returned by Newton's method for every entry of q.
    newton_values = [newtons_method_metalog(q=qi, m=m, term=term) for qi in q]
    # Second pass: density evaluated at each of those values.
    densities = [pdfMetalog_density(y=yi, m=m, t=term) for yi in newton_values]

    return densities
222 | 
223 | 
def pmetalog(m, q, term=3):
    """Generate probabilities for user specified quantiles from a fitted metalog object.

    Each element of `q` is passed to `newtons_method_metalog` together with
    `m` and `term`; the returned values are collected into a list.

    Args:
        m (:obj:`metalog`): A fitted metalog object.

        q (:obj:`list` | `numpy.ndarray`): Quantiles to return probabilities values for.
          - every element must be numeric (Python or numpy scalar)

        term (:obj:`int`, optional): Number of metalog terms to use when generating probabilities.
          - strictly >= 2
          - must be one of the terms listed in m.output_dict['Validation']['term']
          - Default: 3

    Returns:
        (:obj:`list`): len(q) list of probabilities from fitted metalog.

    Raises:
        TypeError: `q` is not a list or numpy array.
        TypeError: an element of `q` is not numeric.
        TypeError: `term` is not an int >= 2 contained in the fitted terms.

    """
    valid_terms = np.asarray(m.output_dict["Validation"]["term"])

    if not isinstance(q, (list, np.ndarray)):
        raise TypeError("Error: input q must be a list or numpy array")

    # np.number is required in addition to the Python numeric types: elements
    # of integer or float32 numpy arrays are not instances of int/float, so
    # without it valid ndarray input would be rejected as "not numeric".
    if not all(isinstance(qi, (int, float, complex, np.number)) for qi in q):
        raise TypeError("Error: all elements in q must be numeric")

    # The type test comes first so a non-int `term` is rejected before it is
    # ever compared against the numpy array of valid terms.
    if type(term) is not int or term < 2 or term not in valid_terms:
        raise TypeError(
            "Error: term must be a single positive numeric interger contained in the metalog object. Available "
            "terms are: " + " ".join(map(str, valid_terms))
        )

    return [newtons_method_metalog(q=qi, m=m, term=term) for qi in q]
264 | 
265 | 
def qmetalog(m, y, term=3):
    """Generate quantiles for given probabilities from a fitted metalog object.

    Builds the metalog basis columns y1..y<term> from the probabilities,
    multiplies them by the first `term` coefficients of column 'a<term>' in
    m.output_dict['A'], and then applies the transform selected by
    m.output_dict['params']['boundedness'] ('sl', 'su' or 'b'; any other value
    leaves the result untransformed).

    Args:
        m (:obj:`metalog`): A fitted metalog object.

        y (:obj:`list` | `numpy.ndarray`): Probabilities to return quantile values for.
          - every element must be numeric and strictly between 0 and 1

        term (:obj:`int`, optional): Number of metalog terms to use when generating quantiles.
          - strictly >= 2
          - must be one of the terms listed in m.output_dict['Validation']['term']
          - Default: 3

    Returns:
        (:obj:`numpy.ndarray`): len(y) length numpy array of quantiles from fitted metalog.

    Raises:
        TypeError: `y` is not a list or numpy array.
        TypeError: an element of `y` is not numeric or lies outside (0, 1).
        TypeError: `term` is not an int >= 2 contained in the fitted terms.

    """
    m = m.output_dict
    valid_terms = np.asarray(m["Validation"]["term"])
    valid_terms_printout = " ".join(str(vt) for vt in valid_terms)

    # numpy arrays are accepted as documented (previously only lists were).
    if not isinstance(y, (list, np.ndarray)):
        raise TypeError("Error: y must be a list of numeric values")
    probs = np.asarray(y)
    if (
        not all(isinstance(p, (int, float, complex, np.number)) for p in probs)
        or (max(probs) >= 1)
        or (min(probs) <= 0)
    ):
        raise TypeError(
            "Error: y or all elements in y must be positive numeric values between 0 and 1"
        )
    if (
        type(term) is not int
        or (term < 2)
        or term not in valid_terms
    ):
        raise TypeError(
            "Error: term must be a single positive numeric integer contained "
            "in the metalog object. Available terms are: " + valid_terms_printout
        )

    # Construct the Y Matrix initial values
    Y = pd.DataFrame({"y1": np.ones(len(probs))})
    Y["y2"] = np.log(probs / (1 - probs))
    if term > 2:
        Y["y3"] = (probs - 0.5) * Y["y2"]
    if term > 3:
        Y["y4"] = probs - 0.5

    # Complete the values through the term limit (range is empty for term <= 4).
    # Odd columns are powers of y4; even columns are y2 times the previous column.
    for i in range(5, term + 1):
        column = "y{}".format(i)
        if i % 2 != 0:
            Y[column] = Y["y4"] ** (i // 2)
        else:
            Y[column] = Y["y2"] * Y["y{}".format(i - 1)]

    coefficients = m["A"]["a{}".format(term)].iloc[0:term].to_frame()
    s = np.dot(Y.values, coefficients.values)

    boundedness = m["params"]["boundedness"]
    bounds = m["params"]["bounds"]

    if boundedness == "sl":
        s = bounds[0] + np.exp(s)
    elif boundedness == "su":
        s = bounds[1] - np.exp(-s)
    elif boundedness == "b":
        s = (bounds[0] + bounds[1] * np.exp(s)) / (1 + np.exp(s))

    return s.flatten()
345 | 
346 | 
def _plot_term_panels(results, value_column, value_label):
    """Draw one panel per term label, plotting `value_column` against quantile values.

    Args:
        results (:obj:`pandas.DataFrame`): Long-format frame with columns
            'term', 'quantileValues' and `value_column`.
        value_column (:obj:`str`): Column of `results` plotted on the y axis.
        value_label (:obj:`str`): Text drawn as the figure-level y-axis label.

    Returns:
        The figure created by `plt.subplots` for this set of panels.
    """
    ymin = np.min(results[value_column])
    ymax = np.max(results[value_column])
    term_labels = results.term.unique()
    nterms = len(term_labels)

    # Up to four panels per row.
    nrow = (nterms + 3) // 4
    ncol = nterms if nterms < 4 else 4

    fig, axes = plt.subplots(nrow, ncol, sharey="col", squeeze=False)

    for t, label in enumerate(term_labels):
        panel = results[results["term"] == label]
        ax = axes[t // 4, t % 4]
        ax.plot(panel["quantileValues"], panel[value_column])
        ax.set_ylim(ymin, ymax * 1.1)
        ax.set_title(label)
        ax.tick_params(axis="both", which="major", labelsize=10)
        ax.tick_params(axis="both", which="minor", labelsize=10)

    # Hide the unused cells of the last row.
    for t in range(nterms, nrow * ncol):
        axes[t // 4, t % 4].axis("off")

    fig.text(0.5, 0.04, "Quantile Values", ha="center")
    fig.text(0.04, 0.5, value_label, va="center", rotation="vertical")

    plt.yscale("linear")
    plt.tight_layout(rect=[0.05, 0.05, 1, 1])

    return fig


def plot(m):
    """Plots PDF and Quantile panels for each term of fitted metalog m.

    The columns of m.output_dict['M'] are read in pairs (pdf values, quantile
    values), one pair per term starting at `term_lower_bound`, together with
    the 'y' column as the cumulative value.

    Args:
        m (:obj:`metalog`): A fitted metalog object.

    Returns:
        (:obj:`dict` with keys ['pdf', 'cdf']): PDF and Quantile panel plots.
    """
    fit = m.output_dict
    M = fit["M"]
    term_lower_bound = fit["params"]["term_lower_bound"]
    n_rows = len(M.iloc[:, 0])

    # One (pdf, quantile) column pair per fitted term; always read the first pair.
    n_fits = max(1, (len(M.columns) - 1) // 2)

    frames = []
    for i in range(1, n_fits + 1):
        frames.append(
            pd.DataFrame(
                data={
                    "term": np.repeat(
                        (str(term_lower_bound + (i - 1)) + " Terms"), n_rows
                    ),
                    "pdfValues": M.iloc[:, (i * 2 - 2)],
                    "quantileValues": M.iloc[:, (i * 2 - 1)],
                    "cumValue": M["y"],
                }
            )
        )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    results = pd.concat(frames, ignore_index=True)

    pdf_fig = _plot_term_panels(results, "pdfValues", "PDF Values")
    cdf_fig = _plot_term_panels(results, "cumValue", "CDF Values")

    return {"pdf": pdf_fig, "cdf": cdf_fig}
469 | 
470 | 
def update(m, new_data, penalty=None, alpha=0.0):
    """Updates a previously fitted metalog object with new data.

    A new metalog is fit to `m.x` with `new_data` appended, reusing the bounds,
    boundedness, term range, step length and fit method stored in
    m.output_dict['params']. Several derived quantities are then written into
    the new object's output_dict['params']['bayes'] dictionary
    ('gamma', 'mu', 'a', 'v', 'q_bar', 'sig', 'b').

    Args:
        m (:obj:`metalog`): The previously fitted metalog object to be updated with `new_data`.
          - `save_data` parameter must have been set equal to True in original metalog fit.

        new_data (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to update the metalog object with.
          - must be an array of allowable types: int, float, numpy.int64, numpy.float64

        penalty (:obj:`str`, optional): Used to specify the norm used in the regularization.
            - must be in set ('l2', None)
                * 'l2' performs Ridge Regression instead of OLS
                    - Automatically shrinks a coefficients, leading to "smoother" fits
            - should be set in conjunction with `alpha` parameter
            - Default: None

        alpha (:obj:`float`, optional): Regularization term to add to OLS fit.
            - strictly >= 0.
            - should be set in conjunction with `penalty` parameter
            - Default: 0. (no regularization, OLS)

    Returns:
        (:obj:`metalog`): A new metalog object fit to the old data with `new_data` appended.

    Raises:
      ValueError: 'Input metalog `m.save_data` parameter must be True'
      TypeError: 'Input x must be an array or pandas Series'
      TypeError: 'Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64'
      IndexError: 'Input x must be of length 3 or greater'
      IndexError: raised by the `v[1]` lookup when term_limit equals term_lower_bound.

    NOTE(review): this function passes `save_data=True` to `metalog(...)` and reads
    `m.save_data`; confirm the `metalog` constructor in use accepts that parameter.
    """

    if not m.save_data:
        raise ValueError("Input metalog `m.save_data` parameter must be True")
    if not isinstance(new_data, (list, np.ndarray, pd.Series)):
        raise TypeError("Input x must be an array or pandas Series")
    if isinstance(new_data, pd.Series):
        new_data = new_data.values.copy()
    if not all(isinstance(x, (int, float, np.int64, np.float64)) for x in new_data):
        raise TypeError(
            "Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64"
        )
    if np.size(new_data) < 3:
        raise IndexError("Input x must be of length 3 or greater")

    old_append_new_data = np.append(m.x, new_data)
    old_params = m.output_dict["params"]

    updated_metalog = metalog(
        old_append_new_data,
        bounds=old_params["bounds"],
        boundedness=old_params["boundedness"],
        term_limit=old_params["term_limit"],
        term_lower_bound=old_params["term_lower_bound"],
        step_len=old_params["step_len"],
        probs=None,
        fit_method=old_params["fit_method"],
        penalty=penalty,
        alpha=alpha,
        save_data=True,
    )

    new_params = updated_metalog.output_dict["params"]
    bayes = new_params["bayes"]

    # Gram matrix of the fitted basis matrix Y.
    Y = updated_metalog.output_dict["Y"].values
    gamma = Y.T.dot(Y)
    bayes["gamma"] = gamma
    bayes["mu"] = updated_metalog.output_dict["A"]

    # One entry per term count in [term_lower_bound, term_limit]: nobs - terms.
    v = np.array(
        [
            new_params["nobs"] - i
            for i in range(
                new_params["term_lower_bound"], new_params["term_limit"] + 1
            )
        ]
    )
    a = v / 2
    bayes["a"] = a
    bayes["v"] = v

    # for now, just using 3 term standard metalog
    # Index 1 is the entry for term_lower_bound + 1 terms.
    v = v[1]
    a = a[1]
    s = np.array([0.1, 0.5, 0.9])

    # Three-term metalog basis evaluated at probabilities 0.1, 0.5 and 0.9.
    Ys = np.column_stack(
        [np.repeat(1, 3), np.log(s / (1 - s)), (s - 0.5) * np.log(s / (1 - s))]
    )
    # NOTE(review): the coefficient column is chosen as 'a<term_limit - 3>' and only
    # its last three values are used -- confirm this is the intended 3-term fit.
    three_term_metalog_fit_idx = "a{}".format(updated_metalog.term_limit - 3)
    q_bar = np.dot(
        Ys, updated_metalog.output_dict["A"][three_term_metalog_fit_idx].values[-3:]
    )

    bayes["q_bar"] = q_bar

    s2 = ((q_bar[2] - q_bar[1]) / t.ppf(0.9, np.array(v))) ** 2

    gamma = gamma[:3, :3]

    # build covariance matrix for students t
    # (solving against the identity yields the inverse of the 3x3 gamma block)
    sig = Ys.dot(np.linalg.solve(gamma, np.eye(len(gamma)))).dot(Ys.T)

    b = (a * s2) / gamma[1, 1]
    bayes["sig"] = (b / a) * sig
    bayes["b"] = b

    return updated_metalog
583 | 


--------------------------------------------------------------------------------
/pymetalog/metalog.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from scipy.stats import t
  4 | from .support import MLprobs
  5 | from .a_vector import a_vector_OLS_and_LP
  6 | 
  7 | 
  8 | class metalog:
  9 |     """
 10 |     Main class in pymetalog package.
 11 |     The pymetalog package is a python implementation of Tom Keelin's metalog distributions.
 12 | 
 13 |     The metalog distributions are a family of continuous univariate
 14 |     probability distributions that are convenient derivations of equations that appropriately
 15 |     characterize the probability density (PDF), cumulative (CDF) or quantile distribution functions.
 16 | 
 17 |     They can be used in most any situation in which CDF data is known and a flexible,
 18 |     simple, and easy-to-use continuous probability distribution is needed to
 19 |     represent that data. See links below for more details.
 20 | 
 21 |     Uses and benefits: http://www.metalogdistributions.com/usesbenefits.html
 22 |     Applications: http://www.metalogdistributions.com/applicationsdata.html
 23 | 
 24 |     Theory: http://pubsonline.informs.org/doi/abs/10.1287/deca.2016.0338
 25 |     Homepage: http://www.metalogdistributions.com/
 26 | 
 27 |     Attributes:
 28 |         x (:obj: `numpy.ndarray`): Input array being fit with the metalog distribution.
 29 |         nobs (:obj:`int`): Number of data points in x.
 30 |         boundedness (:obj: `str`): String type of metalog to fit ('u' | 'sl' | 'su' | 'b').
 31 |         bounds (:obj: `list`): List upper and lower limits to filter array with before calculating metalog quantiles/pdfs.
 32 |         term_limit (:obj: `int`): Int upper limit of the range of metalog terms to use to fit the data.
 33 |         term_lower_bound (:obj: `int`): Int lower limit of the range of metalog terms to use to fit the data.
 34 |         step_len (:obj: `float`): Float bin width used to estimate the metalog fit.
 35 |         probs (:obj: `numpy.ndarray`): Input array of probabilities associated with the data values in `x`.
 36 |         fit_method (:obj: `str`): String type of metalog fit method ('any' | 'OLS' | 'LP' | 'MLE').
 37 |         penalty (:obj:`str`): Used to specify the norm used in the regularization.
 38 |         alpha (:obj:`float`): Regularization term to add to OLS fit.
 39 | 
 40 |         output_dict (:obj:`dict` with keys ['params', 'dataValues', 'Y', 'A', 'M', 'Validation']).
 41 |             - output_dict['params'] (:obj:`dict`):
 42 |                 - output_dict['params']['bounds'] = `bounds`
 43 |                 - output_dict['params']['boundedness'] = `boundedness`
 44 |                 - output_dict['params']['term_limit'] = `term_limit`
 45 |                 - output_dict['params']['term_lower_bound'] = `term_lower_bound`
 46 |                 - output_dict['params']['step_len'] = `step_len`
 47 |                 - output_dict['params']['fit_method'] = `fit_method`
 48 |                 - output_dict['params']['square_residual_error'] = Squared residual error (y_i - yhat_i)^2
 49 | 
 50 |             - output_dict['dataValues'] (:obj:`dict`).
 51 |                 - output_dict['dataValues']['x']: `x`
 52 |                 - output_dict['dataValues']['probs']: `probs`
 53 |                 - output_dict['dataValues']['z']: column calculated in `append_zvector` method
 54 |                     * depends on `boundedness` attribute
 55 |                     * `boundedness` = 'u':
 56 |                         * output_dict['dataValues']['z'] = `x`
 57 |                     * `boundedness` = 'sl':
 58 |                         * output_dict['dataValues']['z'] = log( (`x`-lower_bound) )
 59 |                     * `boundedness` = 'su':
 60 |                         * output_dict['dataValues']['z'] = log( (upper_bound-`x`) )
 61 |                     * `boundedness` = 'b':
 62 |                         * output_dict['dataValues']['z'] = log( (`x`-lower_bound) / (upper_bound-`x`) )
 63 | 
 64 |             - output_dict['Y'] (:obj:`pandas.DataFrame` with columns ['y1','y2','y3','y4', ... ,'yn']  of type numeric).
 65 |                 - output_dict['Y']['y1']: numpy.array of ones with length equal to len(`x`)
 66 |                 - output_dict['Y']['y2']: numpy.array of numeric values equal to the term attached to s in the logistic quantile function np.log(output_dict['dataValues']['probs'] / (1 - output_dict['dataValues']['probs']))
 67 |                 - output_dict['Y']['y3']: numpy.array of numeric values (output_dict['dataValues']['probs'] - 0.5) * output_dict['Y']['y2']
 68 |                 - output_dict['Y']['y4']: numpy.array of numeric values output_dict['Y']['y4'] = output_dict['dataValues']['probs'] - 0.5
 69 |                 - output_dict['Y']['yn']: numpy.array of numeric values:
 70 |                     * if n in 'yn' is odd,
 71 |                         output_dict['Y']['yn'] = output_dict['Y']['y4']**(int(i//2))
 72 |                     * if n in 'yn' is even,
 73 |                         zn = 'y' + str(n-1)
 74 |                         output_dict['Y'][yn] = output_dict['Y']['y2'] * output_dict['Y'][zn]
 75 | 
 76 |             - output_dict['A']: (:obj:`pandas.DataFrame` with columns ['a2','a3', ... ,'an'] of type numeric):
 77 |                 * 'a2', 'a3', ... , 'an' are our a coefficients returned by the method specified in `fit_method`
 78 | 
 79 |             - output_dict['M']: (:obj:`pandas.DataFrame` with columns ['m2', 'M2', 'm3', 'M3', ... , 'mn', 'Mn'] of type numeric):
 80 |                 * 'm2', 'M2', 'm3', 'M3', ... , 'mn', 'Mn' are the metalog pdf/quantile fit estimates returned by the method specified in `fit_method`
 81 |                 * 'mn' is the pdf fit of metalog term n
 82 |                 * 'Mn' is the quantile fit of metalog term n
 83 | 
 84 |             - output_dict['Validation']: (:obj:`pandas.DataFrame` with columns ['term', 'valid', 'method']):
 85 |                 * 'term' (:obj: `int`): each metalog estimation given a number of terms
 86 |                 * 'valid' (:obj: `str`): boolean flag indicating if the metalog estimation was valid or not
 87 |                 * 'method' (:obj: `str`): a string indicating which method was used for the metalog estimation
 88 | 
 89 |     Methods:
 90 |         get_params(`bounds`, `boundedness`, `term_limit`, `term_lower_bound`, `step_len`, `fit_method`) -> output_dict['params'] (:obj:`dict`)
 91 |         append_zvector(`bounds`, `boundedness`) -> df_x: (:obj:`pandas.DataFrame` with columns ['x','probs','z'] of type numeric)
 92 | 
 93 |     """
 94 | 
 95 |     def __init__(
 96 |         self,
 97 |         x,
 98 |         bounds=[0, 1],
 99 |         boundedness="u",
100 |         term_limit=13,
101 |         term_lower_bound=2,
102 |         step_len=0.01,
103 |         probs=None,
104 |         fit_method="any",
105 |         penalty=None,
106 |         alpha=0.0,
107 |     ):
108 |         """Fits a metalog distribution using the input array `x`.
109 | 
110 |         Args:
111 |             x (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to fit the metalog distribution to.
112 |                 - must be an array of allowable types: int, float, numpy.int64, numpy.float64
113 | 
114 |             bounds (:obj:`list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs.
115 |                 - should be set in conjunction with the `boundedness` parameter
116 |                 - Default: [0,1]
117 | 
118 |             boundedness (:obj:`str`, optional): String that is used to specify the type of metalog to fit.
119 |                 - must be in set ('u','sl','su','b')
120 |                 - Default: 'u'
121 |                     * Fits an unbounded metalog
122 |                 - 'sl' fits a strictly lower bounded metalog
123 |                     * len(bounds) must == 1
124 |                 - 'su' fits a strictly upper bounded metalog
125 |                     * len(bounds) must == 1
126 |                 - 'b' fits a upper/lower bounded metalog
127 |                     * len(bounds) must == 2
128 |                     * bounds[1] must be > bounds[0]
129 | 
130 |             term_limit (:obj:`int`, optional): The upper limit of the range of metalog terms to use to fit the data.
131 |                 - strictly > term_lower_bound
132 |                 - in range [3,30]
133 | 
134 |             term_lower_bound (:obj:`int`, optional): The lower limit of the range of metalog terms to use to fit the data.
135 |                 - strictly < term_limit
136 |                 - in range [2,29]
137 | 
138 |             step_len (:obj:`float`, optional): Used to specify the bin width used to estimate the metalog.
139 |                 - must be in range [0.001, 0.01]
140 | 
141 |             probs (:obj:`list` | `numpy.ndarray`, optional): Probabilities associated with the data values in x.
142 |                 - must be an array of integer or float data
143 |                 - all elements must be in range [0,1]
144 | 
145 |             fit_method (:obj:`str`, optional): Fit method to use to fit metalog distribution.
146 |                 - must be in set ('any','OLS','LP','MLE')
147 |                 - Default: 'any'
148 |                     * first tries 'OLS' method than 'LP'
149 |                 - 'OLS' only tries to fit by solving directly for a coefficients using ordinary least squares method
150 |                 - 'LP' only tries to estimate fit using simplex linear program optimization routine
151 |                 - 'MLE' first tries 'OLS' method than falls back to a maximum likelihood estimation routine
152 | 
153 |             penalty (:obj:`str`, optional): Used to specify the norm used in the regularization.
154 |                 - must be in set ('l2', None)
155 |                     * 'l2' performs Ridge Regression instead of OLS
156 |                         - Automatically shrinks a coefficients, leading to "smoother" fits
157 |                 - should be set in conjunction with `alpha` parameter
158 |                 - Default: None
159 | 
160 |             alpha (:obj:`float`, optional): Regularization term to add to OLS fit.
161 |                 - strictly >= 0.
162 |                 - should be set in conjunction with `penalty` parameter
163 |                 - Default: 0. (no regularization, OLS)
164 | 
165 |         Raises:
166 |             TypeError: 'Input x must be an array or pandas Series'
167 |             TypeError: 'Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64'
168 |             TypeError: 'bounds parameter must be of type list'
169 |             TypeError: 'bounds parameter must be list of integers'
170 |             TypeError: 'term_limit parameter should be an integer between 3 and 30'
171 |             TypeError: 'term_lower_bound parameter should be an integer'
172 |             TypeError: 'Input probabilities must be an array'
173 |             TypeError: 'Input probabilities must be an array of integer or float data'
174 | 
175 |             IndexError: 'Input x must be of length 3 or greater'
176 |             IndexError: 'Must supply only one bound for semi-lower or semi-upper boundedness'
177 |             IndexError: 'Must supply exactly two bounds for bounded boundedness (i.e. [0,30])'
178 |             IndexError: 'probs vector and x vector must be the same length'
179 | 
180 |             ValueError: 'for semi-lower boundedness the lower bound must be less than the smallest value in x'
181 |             ValueError: 'for semi-upper boundedness the upper bound must be greater than the largest value in x'
182 |             ValueError: 'Upper bound must be greater than lower bound'
183 |             ValueError: 'boundedness parameter must be u, su, sl or b only'
184 |             ValueError: 'term_limit parameter should be an integer between 3 and 30'
185 |             ValueError: 'term_limit must be less than or equal to the length of the vector x'
186 |             ValueError: 'term_lower_bound parameter should be greater than or equal to 2'
187 |             ValueError: 'term_lower_bound parameter must be less than or equal to term_limit parameter'
188 |             ValueError: 'step_len must be >= to 0.001 and <= to 0.01'
189 |             ValueError: 'Input probabilities cannot contain nans'
190 |             ValueError: 'Input probabilities must have values between, not including, 0 and 1'
191 |             ValueError: 'fit_method can only be values OLS, LP, any, or MLE'
192 |             ValueError: 'penalty can only be values l2 or None'
193 |             ValueError: 'alpha must only be a float >= 0.'
194 | 
195 |         Example:
196 | 
197 |             Fit a metalog to a numpy.ndarray of numeric data.
198 | 
199 |             >>> import numpy as np
200 |                 import pandas as pd
201 |                 import matplotlib.pyplot as plt
202 |                 import pymetalog as pm
203 | 
204 |             >>> fish_data = np.loadtxt('fishout.csv', delimiter=',', skiprows=1, dtype='str')[:,1].astype(np.float)
205 |             >>> fish_metalog = pm.metalog(x=fish_data, bounds=[0,60], boundedness='b', term_limit=9, term_lower_bound=2, step_len=.001,)
206 |             >>> pm.summary(fish_metalog)
207 |             >>> # plot function - right now this saves plots to local
208 |                 pm.plot(fish_metalog)
209 |                 plt.show()
210 | 
211 |         """
212 | 
213 |         self.x = x.copy()
214 |         self.boundedness = boundedness
215 |         self.bounds = bounds[:]
216 |         self.term_limit = term_limit
217 |         self.term_lower_bound = term_lower_bound
218 |         self.step_len = step_len
219 |         self.probs = probs
220 |         self.fit_method = fit_method
221 |         self.penalty = penalty
222 |         self.nobs = len(x)
223 | 
224 |         if penalty == None:
225 |             alpha = 0.0
226 | 
227 |         self.alpha = alpha
228 | 
229 |         if probs == None:
230 |             df_x = MLprobs(self.x, step_len=step_len)
231 | 
232 |         else:
233 |             df_x = pd.DataFrame()
234 |             df_x["x"] = self.x
235 |             df_x["probs"] = self.probs
236 | 
237 |         output_dict = {}
238 | 
239 |         # build z vector based on boundedness
240 |         df_x = self.append_zvector(df_x)
241 | 
242 |         output_dict["params"] = self.get_params()
243 |         output_dict["dataValues"] = df_x
244 | 
245 |         # Construct the Y Matrix initial values
246 |         Y = pd.DataFrame()
247 |         Y["y1"] = np.ones(len(df_x["x"]))
248 |         Y["y2"] = np.log(df_x["probs"] / (1 - df_x["probs"]))
249 |         Y["y3"] = (df_x["probs"] - 0.5) * Y["y2"]
250 | 
251 |         if self.term_limit > 3:
252 |             Y["y4"] = df_x["probs"] - 0.5
253 | 
254 |         # Complete the values through the term limit
255 |         if term_limit > 4:
256 |             for i in range(5, self.term_limit + 1):
257 |                 yn = "y" + str(i)
258 | 
259 |                 if i % 2 != 0:
260 |                     Y[yn] = Y["y4"] ** (int(i // 2))
261 | 
262 |                 if i % 2 == 0:
263 |                     zn = "y" + str(i - 1)
264 |                     Y[yn] = Y["y2"] * Y[zn]
265 | 
266 |         output_dict["Y"] = Y
267 | 
268 |         self.output_dict = a_vector_OLS_and_LP(
269 |             output_dict,
270 |             bounds=self.bounds,
271 |             boundedness=self.boundedness,
272 |             term_limit=self.term_limit,
273 |             term_lower_bound=self.term_lower_bound,
274 |             fit_method=self.fit_method,
275 |             alpha=self.alpha,
276 |             diff_error=0.001,
277 |             diff_step=0.001,
278 |         )
279 | 
280 |     # input validation...
281 |     @property
282 |     def x(self):
283 |         """x (:obj:`list` | `numpy.ndarray` | `pandas.Series`): Input data to fit a metalog to."""
284 | 
285 |         return self._x
286 | 
287 |     @x.setter
288 |     def x(self, xs):
289 |         if (type(xs) != list) and (type(xs) != np.ndarray) and (type(xs) != pd.Series):
290 |             raise TypeError("Input x must be an array or pandas Series")
291 |         if isinstance(xs, pd.Series):
292 |             xs = xs.values.copy()
293 |         if not all(isinstance(x, (int, float, np.int64, np.float64)) for x in xs):
294 |             raise TypeError(
295 |                 "Input x must be an array of allowable types: int, float, numpy.int64, or numpy.float64"
296 |             )
297 |         if np.size(xs) < 3:
298 |             raise IndexError("Input x must be of length 3 or greater")
299 |         self._x = xs
300 | 
301 |     @property
302 |     def bounds(self):
303 |         """bounds (:obj:`list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs."""
304 | 
305 |         return self._bounds
306 | 
307 |     @bounds.setter
308 |     def bounds(self, bs):
309 |         if type(bs) != list:
310 |             raise TypeError("bounds parameter must be of type list")
311 |         if not all(isinstance(x, (int)) for x in bs):
312 |             raise TypeError("bounds parameter must be list of integers")
313 |         if (self.boundedness == "sl" or self.boundedness == "su") and len(bs) != 1:
314 |             raise IndexError(
315 |                 "Must supply only one bound for semi-lower or semi-upper boundedness"
316 |             )
317 |         if self.boundedness == "b" and len(bs) != 2:
318 |             raise IndexError(
319 |                 "Must supply exactly two bounds for bounded boundedness (i.e. [0,30])"
320 |             )
321 |         if self.boundedness == "su":
322 |             bs_o = [np.min(self.x), bs[0]]
323 |         if self.boundedness == "sl":
324 |             bs_o = [bs[0], np.max(self.x)]
325 |         if self.boundedness == "b" or self.boundedness == "u":
326 |             bs_o = bs
327 |         if self.boundedness == "sl" and np.min(self.x) < bs_o[0]:
328 |             raise ValueError(
329 |                 "for semi-lower boundedness the lower bound must be less than the smallest value in x"
330 |             )
331 |         if self.boundedness == "su" and np.max(self.x) > bs_o[1]:
332 |             raise ValueError(
333 |                 "for semi-upper boundedness the upper bound must be greater than the largest value in x"
334 |             )
335 |         if bs_o[0] > bs_o[1] and self.boundedness == "b":
336 |             raise ValueError("Upper bound must be greater than lower bound")
337 |         self._bounds = bs_o
338 | 
339 |     @property
340 |     def boundedness(self):
341 |         """boundedness (:obj:`str`, optional): String that is used to specify the type of metalog to fit."""
342 | 
343 |         return self._boundedness
344 | 
345 |     @boundedness.setter
346 |     def boundedness(self, bns):
347 |         if bns != "u" and bns != "b" and bns != "su" and bns != "sl":
348 |             raise ValueError("boundedness parameter must be u, su, sl or b only")
349 |         self._boundedness = bns
350 | 
351 |     @property
352 |     def term_limit(self):
353 |         """term_limit (:obj:`int`, optional): The upper limit of the range of a coefficients to generate."""
354 | 
355 |         return self._term_limit
356 | 
357 |     @term_limit.setter
358 |     def term_limit(self, tl):
359 |         if type(tl) != int:
360 |             raise TypeError(
361 |                 "term_limit parameter should be an integer between 3 and 30"
362 |             )
363 |         if tl > 30 or tl < 3:
364 |             raise ValueError(
365 |                 "term_limit parameter should be an integer between 3 and 30"
366 |             )
367 |         if tl > len(self.x):
368 |             raise ValueError(
369 |                 "term_limit must be less than or equal to the length of the vector x"
370 |             )
371 |         self._term_limit = tl
372 | 
373 |     @property
374 |     def term_lower_bound(self):
375 |         """term_lower_bound (:obj:`int`, optional): The lower limit of the range of a coefficients to generate."""
376 | 
377 |         return self._term_lower_bound
378 | 
379 |     @term_lower_bound.setter
380 |     def term_lower_bound(self, tlb):
381 |         if type(tlb) != int:
382 |             raise TypeError("term_lower_bound parameter should be an integer")
383 |         if tlb < 2:
384 |             raise ValueError(
385 |                 "term_lower_bound parameter should be greater than or equal to 2"
386 |             )
387 |         if tlb > self.term_limit:
388 |             raise ValueError(
389 |                 "term_lower_bound parameter must be less than or equal to term_limit parameter"
390 |             )
391 |         self._term_lower_bound = tlb
392 | 
393 |     @property
394 |     def step_len(self):
395 |         """step_len (:obj:`float`, optional): Used to specify the bin width used to estimate the metalog."""
396 | 
397 |         return self._step_len
398 | 
399 |     @step_len.setter
400 |     def step_len(self, sl):
401 |         if sl < 0.001 or sl > 0.01:
402 |             raise ValueError("step_len must be >= to 0.001 and <= to 0.01")
403 |         self._step_len = sl
404 | 
405 |     @property
406 |     def probs(self):
407 |         """probs (:obj:`list` | `numpy.ndarray`, optional): Probabilities associated with the data values in x."""
408 | 
409 |         return self._probs
410 | 
411 |     @probs.setter
412 |     def probs(self, ps):
413 |         if ps != None:
414 |             if not isinstance(ps, (list, np.ndarray)):
415 |                 raise TypeError("Input probabilities must be an array")
416 |             if not all(isinstance(x, (int, float)) for x in ps):
417 |                 raise TypeError(
418 |                     "Input probabilities must be an array of integer or float data"
419 |                 )
420 |             if np.size(np.where(np.isnan(ps))) != 0:
421 |                 raise ValueError("Input probabilities cannot contain nans")
422 |             if np.max(ps) > 1 or np.min(ps) < 0:
423 |                 raise ValueError(
424 |                     "Input probabilities must have values between, not including, 0 and 1"
425 |                 )
426 |             if len(ps) != len(self.x):
427 |                 raise IndexError("probs vector and x vector must be the same length")
428 |             ps = ps.copy()
429 |         self._probs = ps
430 | 
431 |     @property
432 |     def fit_method(self):
433 |         """fit_method (:obj:`str`, optional): Fit method to use to fit metalog distribution."""
434 | 
435 |         return self._fit_method
436 | 
437 |     @fit_method.setter
438 |     def fit_method(self, fm):
439 |         if fm != "OLS" and fm != "LP" and fm != "any" and fm != "MLE":
440 |             raise ValueError("fit_method can only be values OLS, LP, any, or MLE")
441 |         self._fit_method = fm
442 | 
443 |     @property
444 |     def penalty(self):
445 |         """penalty (:obj:`str`, optional): Used to specify the norm used in the regularization."""
446 | 
447 |         return self._penalty
448 | 
449 |     @fit_method.setter
450 |     def penalty(self, p):
451 |         if p != "l2" and p is not None:
452 |             raise ValueError("penalty can only be values l2 or None")
453 |         self._penalty = p
454 | 
455 |     @property
456 |     def alpha(self):
457 |         """alpha (:obj:`float`): L2 regularization term to add to OLS fit"""
458 | 
459 |         return self._alpha
460 | 
461 |     @alpha.setter
462 |     def alpha(self, a):
463 |         if a < 0 or not isinstance(a, float):
464 |             raise ValueError("alpha must only be a float >= 0.")
465 |         self._alpha = a
466 | 
467 |     def get_params(self):
468 |         """Sets the `params` key (dict) of `output_dict` object prior to input to `a_vector_OLS_and_LP` method.
469 |             - Uses metalog attributes to set keys
470 | 
471 |         Returns:
472 |             params: (:obj:`dict`): Dictionary that is used as input to `a_vector_OLS_and_LP` method.
473 | 
474 |         """
475 | 
476 |         params = {}
477 |         params["bounds"] = self.bounds
478 |         params["boundedness"] = self.boundedness
479 |         params["term_limit"] = self.term_limit
480 |         params["term_lower_bound"] = self.term_lower_bound
481 |         params["step_len"] = self.step_len
482 |         params["fit_method"] = self.fit_method
483 |         params["nobs"] = self.nobs
484 | 
485 |         return params
486 | 
487 |     def append_zvector(self, df_x):
488 |         """Sets the `dataValues` key (pandas.DataFrame) of `output_dict` object prior to input to `a_vector_OLS_and_LP` method.
489 | 
490 |         Uses `boundedness` attribute to set z vector
491 |             - 'u': output_dict['dataValues']['z'] = x
492 |                 * Start with all the input data
493 |             - 'sl': output_dict['dataValues']['z'] = log( (x-lower_bound) )
494 |             - 'su': output_dict['dataValues']['z'] = log( (upper_bound-x) )
495 |             - 'b': output_dict['dataValues']['z'] = log( (x-lower_bound) / (upper_bound-x) )
496 | 
497 |         Returns:
498 |             df_x: (:obj:`pandas.DataFrame` with columns ['x','probs','z'] of type numeric): DataFrame that is used as input to `a_vector_OLS_and_LP` method.
499 |                 - df_x['x']: metalog.x
500 |                 - df_x['probs']: metalog.probs
501 |                 - df_x['z']: z vector above
502 |         """
503 | 
504 |         if self.boundedness == "u":
505 |             df_x["z"] = df_x["x"]
506 |         if self.boundedness == "sl":
507 |             df_x["z"] = np.log(np.array((df_x["x"] - self.bounds[0]), dtype=np.float64))
508 |         if self.boundedness == "su":
509 |             df_x["z"] = -np.log(
510 |                 np.array((self.bounds[1] - df_x["x"]), dtype=np.float64)
511 |             )
512 |         if self.boundedness == "b":
513 |             df_x["z"] = np.log(
514 |                 np.array(
515 |                     ((df_x["x"] - self.bounds[0]) / (self.bounds[1] - df_x["x"])),
516 |                     dtype=np.float64,
517 |                 )
518 |             )
519 | 
520 |         return df_x
521 | 
522 |     def __getitem__(self):
523 |         return self.output_dict
524 | 
525 |     def __getitem__(self, arr):
526 |         if arr not in self.output_dict:
527 |             raise KeyError()
528 |         return self.output_dict[arr]
529 | 


--------------------------------------------------------------------------------
/pymetalog/pdf_quantile_functions.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from .support import pdfMetalog, quantileMetalog
  3 | 
  4 | 
  5 | def pdf_quantile_builder(temp, y, term_limit, bounds, boundedness):
  6 |     """Builds the metalog pdf and quantile arrays based on the a coefficients found by fitting metalog distribution.
  7 | 
  8 |     Args:
  9 |         temp (:obj: `numpy.ndarray` of type float): Array of a coefficients found by fitting metalog distribution.
 10 |             - Fit method is specified by metalog.fit_method attribute
 11 | 
 12 |         y (:obj: `numpy.ndarray` of type float): Array of bin widths specified for `a` parameter
 13 | 
 14 |         term_limit (:obj: `int`): The upper limit of the range of metalog terms to use to fit the data.
 15 |             - metalog.term_limit attribute
 16 |             - in range [3,30]
 17 | 
 18 |         bounds (:obj:`list`): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs.
 19 |             - metalog.bounds attribute
 20 |             - Default: [0,1]
 21 | 
 22 |         boundedness (:obj: `str`): String that is used to specify the type of metalog to fit.
 23 |             - metalog.boundedness attribute
 24 | 
 25 |     Returns:
 26 |         q_dict (:obj:`dict` with keys ['m', 'M', 'y', 'valid']): Initialized output_dict variable from metalog class.
 27 |             - q_dict['m']: (:obj:`numpy.ndarray` of type float): Array of metalog pdf values.
 28 |                 * Returned by `pdfMetalog` method
 29 |                 * Influenced by `boundedness` parameter
 30 |                 * A valid metalog fit will return an array having all elements strictly > 0
 31 | 
 32 |             - q_dict['M']: (:obj:`numpy.ndarray` of type float): Array of metalog quantile values.
 33 |                 * Returned by `quantileMetalog` method
 34 |                 * Influenced by `boundedness` parameter
 35 |                     - `boundedness` = 'sl': Inserts `bounds`[0] to the front of the quantile array
 36 |                     - `boundedness` = 'su': Appends `bounds`[1] to the end of the quantile array
 37 |                     - `boundedness` = 'b': Inserts `bounds`[0] to the front of the quantile array
 38 |                                             and appends `bounds`[1] to the end of the quantile array
 39 | 
 40 |             - q_dict['y']: (:obj:`numpy.ndarray` of type float): Array of bin widths specified for the pdfs/quantiles.
 41 |                 * Influenced by `boundedness` parameter
 42 |                     - `boundedness` = 'sl': Inserts `bounds`[0] at the front of the quantile array
 43 |                     - `boundedness` = 'su': Appends `bounds`[1] to the end of the quantile array
 44 |                     - `boundedness` = 'b': Inserts `bounds`[0] at the front of the quantile array
 45 |                                             and appends `bounds`[1] to the end of the quantile array
 46 | 
 47 |             - q_dict['valid']: (:obj:`str`): A string indicating if the metalog pdf generated by `pdfMetalog` method is valid or not.
 48 |                 * If all values in the metalog pdf are >= 0, q_dict['valid'] = 'yes'
 49 |                 * If any values in the metalog pdf are < 0, q_dict['valid'] = 'no'
 50 | 
 51 |     """
 52 |     q_dict = {}
 53 | 
 54 |     # build pdf
 55 |     m = pdfMetalog(temp, y[0], term_limit, bounds=bounds, boundedness=boundedness)
 56 | 
 57 |     for j in range(2, len(y) + 1):
 58 |         tempPDF = pdfMetalog(
 59 |             temp, y[j - 1], term_limit, bounds=bounds, boundedness=boundedness
 60 |         )
 61 |         m = np.append(m, tempPDF)
 62 | 
 63 |     # Build quantile values
 64 |     M = quantileMetalog(temp, y[1], term_limit, bounds=bounds, boundedness=boundedness)
 65 | 
 66 |     for j in range(2, len(y) + 1):
 67 |         tempQant = quantileMetalog(
 68 |             temp, y[j - 1], term_limit, bounds=bounds, boundedness=boundedness
 69 |         )
 70 |         M = np.append(M, tempQant)
 71 | 
 72 |     # Add trailing and leading zero's for pdf bounds
 73 |     if boundedness == "sl":
 74 |         m = np.append(0, m)
 75 |         M = np.append(bounds[0], M)
 76 | 
 77 |     if boundedness == "su":
 78 |         m = np.append(m, 0)
 79 |         M = np.append(M, bounds[1])
 80 | 
 81 |     if boundedness == "b":
 82 |         m = np.append(0, m)
 83 |         m = np.append(m, 0)
 84 |         M = np.append(bounds[0], M)
 85 |         M = np.append(M, bounds[1])
 86 | 
 87 |     # Add y values for bounded models
 88 |     if boundedness == "sl":
 89 |         y = np.append(0, y)
 90 | 
 91 |     if boundedness == "su":
 92 |         y = np.append(y, 1)
 93 | 
 94 |     if boundedness == "b":
 95 |         y = np.append(0, y)
 96 |         y = np.append(y, 1)
 97 | 
 98 |     q_dict["m"] = m
 99 |     q_dict["M"] = M
100 |     q_dict["y"] = y
101 | 
102 |     # PDF validation
103 |     q_dict["valid"] = pdfMetalogValidation(q_dict["m"])
104 | 
105 |     return q_dict
106 | 
107 | 
def pdfMetalogValidation(x):
    """Reports whether every metalog pdf value is non-negative.

    Args:
        x (:obj: `numpy.ndarray` of type float): Array of metalog pdf values.

    Returns:
        'yes' | 'no' (:obj:`str`): 'yes' when the smallest element is >= 0, else 'no'.
    """
    smallest = np.min(x)
    return "yes" if smallest >= 0 else "no"
124 | 


--------------------------------------------------------------------------------
/pymetalog/support.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | 
  4 | 
  5 | def MLprobs(x_old, step_len):
  6 |     """Returns the quantile values x['x'] and corresponding bins x['y'].
  7 |     Called during metalog.__init__ method call.
  8 | 
  9 |     Args:
 10 |       x_old (:obj: `numpy.ndarray` of type numeric): Input data to fit the metalog distribution to.
 11 |         - must be an array of allowable types: int, float, numpy.int64, numpy.float64
 12 | 
 13 |       step_len (:obj:`float`): Used to specify the bin width used to estimate the metalog.
 14 | 
 15 |     Returns:
 16 |       x: (:obj:`dict` with keys ['x','probs']  of type float):
 17 |         - x['x']: (:obj:`numpy.ndarray` of type float):
 18 |             * x['x'] is the quantile values found using the bin widths array x['y] - which is specified using the `step_len` parameter
 19 | 
 20 |         - x['probs']: (:obj:`numpy.ndarray` of type float):
 21 |             * x['probs'] is the array of bin widths specified for x['x']
 22 | 
 23 |     """
 24 | 
 25 |     l = len(x_old)
 26 |     x = pd.DataFrame()
 27 |     x["x"] = x_old.copy()
 28 | 
 29 |     x.sort_values(by="x")
 30 | 
 31 |     x["probs"] = 0
 32 |     for i in range(0, l):
 33 |         if i == 0:
 34 |             x.loc[i, "probs"] = 0.5 / l
 35 |         else:
 36 |             x.loc[i, "probs"] = x.loc[i - 1, "probs"] + 1 / l
 37 | 
 38 |     # TODO method for turning off and on this n>100 estimation
 39 |     if len(x.index) > 100:
 40 |         y2 = np.linspace(step_len, 1 - step_len, int((1 - step_len) / step_len))
 41 | 
 42 |         tailstep = step_len / 10
 43 | 
 44 |         y1 = np.linspace(
 45 |             tailstep, (min(y2) - tailstep), int((min(y2) - tailstep) / tailstep)
 46 |         )
 47 | 
 48 |         y3 = np.linspace(
 49 |             (max(y2) + tailstep),
 50 |             (max(y2) + tailstep * 9),
 51 |             int((tailstep * 9) / tailstep),
 52 |         )
 53 | 
 54 |         y = np.hstack((y1, y2, y3))
 55 | 
 56 |         x_new = np.quantile(x_old, y)
 57 | 
 58 |         df_x = {}
 59 |         df_x["x"] = x_new
 60 |         df_x["probs"] = y
 61 |         x = df_x
 62 | 
 63 |     return x
 64 | 
 65 | 
 66 | def pdfMetalog(a, y, t, bounds=[], boundedness="u"):
 67 |     """Estimates the metalog pdf given the a coefficients and percentiles found using the specified metalog.fit_method attribute.
 68 |     Called during metalog.__init__ method call if `fit_method`='MLE'.
 69 |     Called during pdf_quantile_builder method call.
 70 | 
 71 |     Args:
 72 |       a (:obj: `numpy.ndarray` of type float): Array of a coefficients found by fitting metalog distribution using the `fit_method` parameter.
 73 | 
 74 |       y (:obj: `numpy.ndarray` of type float): Array of bin widths specified for `a` parameter
 75 | 
 76 |       t (:obj: `int`): The upper limit of the range of metalog terms to use to fit the data.
 77 |         - metalog.term_limit attribute
 78 |         - in range [3,30]
 79 | 
 80 |       bounds (:obj: `list`, optional): Upper and lower limits to filter the data with before calculating metalog quantiles/pdfs.
 81 |           - should be set in conjunction with the `boundedness` parameter
 82 |           - Default: [0,1]
 83 | 
 84 |       boundedness (:obj: `str`, optional): String that is used to specify the type of metalog to fit.
 85 |           - must be in set ('u','sl','su','b')
 86 |           - Default: 'u'
 87 |               * Fits an unbounded metalog
 88 |               * If `boundedness` parameter != 'u' we must calculate the metalog quantiles using an unbounded metalog, via the `quantileMetalog` method.
 89 |           - 'sl' fits a strictly lower bounded metalog
 90 |               * len(bounds) must == 1
 91 |           - 'su' fits a strictly upper bounded metalog
 92 |               * len(bounds) must == 1
 93 |           - 'b' fits a upper/lower bounded metalog
 94 |               * len(bounds) must == 2
 95 |               * bounds[1] must be > bounds[0]
 96 | 
 97 |     Returns:
 98 |       x: (:obj: `numpy.ndarray` of type float): Array of metalog pdf values.
 99 | 
100 |     """
101 |     if y <= 0:
102 |         y = 0.00001
103 | 
104 |     if y >= 1:
105 |         y = 0.99999
106 | 
107 |     d = y * (1 - y)
108 |     f = y - 0.5
109 |     l = np.log(y / (1 - y))
110 | 
111 |     # Initiate pdf
112 | 
113 |     # For the first three terms
114 |     x = a[1] / d
115 |     if len(a) > 2 and a[2] != 0:
116 |         x = x + a[2] * ((f / d) + l)
117 | 
118 |     # For the fourth term
119 |     if t > 3:
120 |         x = x + a[3]
121 | 
122 |     # Initalize some counting variables
123 |     e = 1
124 |     o = 1
125 | 
126 |     # For all other terms greater than 4
127 |     if t > 4:
128 |         for i in range(5, t + 1):
129 |             if (i % 2) != 0:
130 |                 # iff odd
131 |                 x = x + ((o + 1) * a[i - 1] * f ** o)
132 |                 o = o + 1
133 | 
134 |             if (i % 2) == 0:
135 |                 # iff even
136 |                 x = x + a[i - 1] * (((f ** (e + 1)) / d) + (e + 1) * (f ** e) * l)
137 |                 e = e + 1
138 | 
139 |     # Some change of variables here for boundedness
140 |     x = x ** (-1)
141 | 
142 |     if boundedness != "u":
143 |         M = quantileMetalog(a, y, t, bounds=bounds, boundedness="u")
144 | 
145 |     if boundedness == "sl":
146 |         x = x * np.exp(-M)
147 | 
148 |     if boundedness == "su":
149 |         x = x * np.exp(M)
150 | 
151 |     if boundedness == "b":
152 |         x = (x * (1 + np.exp(M)) ** 2) / ((bounds[1] - bounds[0]) * np.exp(M))
153 | 
154 |     if x <= 0:
155 |         x = 0.00001
156 |     # print(str(x) + " zoop")
157 | 
158 |     return x
159 | 
160 | 
def quantileMetalog(a, y, t, bounds=[], boundedness="u"):
    """Evaluates the metalog quantile function at a single probability `y`.
    Called during pdf_quantile_builder method call.

    Args:
      a (:obj: `numpy.ndarray` of type float): Array of a coefficients of the fitted metalog.

      y (:obj: `float`): Probability at which to evaluate the quantile.
        - values <= 0 are replaced by 0.00001 and values >= 1 by 0.99999

      t (:obj: `int`): Number of metalog terms to use.

      bounds (:obj: `list`, optional): Lower and upper limits.
          - 'sl' reads `bounds`[0], 'su' reads `bounds`[1], 'b' reads both

      boundedness (:obj: `str`, optional): Type of metalog: 'u', 'sl', 'su' or 'b'.
          - Default: 'u'

    Returns:
      x: (:obj: `float`): The quantile value, mapped back to the bounded scale when `boundedness` is not 'u'.

    """
    # Keep the probability strictly inside (0, 1).
    y = 0.00001 if y <= 0 else y
    y = 0.99999 if y >= 1 else y

    centered = y - 0.5
    logit = np.log(y / (1 - y))

    # Terms one to three
    x = a[0] + a[1] * logit
    if t > 2:
        x = x + a[2] * centered * logit

    # Term four
    if t > 3:
        x = x + a[3] * centered

    # Terms five and up: terms 5 and 6 use power 2, terms 7 and 8 power 3, ...
    # Even terms carry an additional logit factor.
    for term in range(5, t + 1):
        power = (term - 1) // 2
        if term % 2 == 0:
            x = x + a[term - 1] * centered ** power * logit
        else:
            x = x + a[term - 1] * centered ** power

    # Map the unbounded value onto the requested support.
    if boundedness == "sl":
        x = bounds[0] + np.exp(x)
    elif boundedness == "su":
        x = bounds[1] - np.exp(-x)
    elif boundedness == "b":
        x = (bounds[0] + bounds[1] * np.exp(x)) / (1 + np.exp(x))

    return x
237 | 
238 | 
def diffMatMetalog(term_limit, step_len):
    """Builds the matrix of per-term pdf basis values on a probability grid.

    For every probability p in `numpy.arange(step_len, 1, step_len)` one row is
    built that holds, term by term, the factor that multiplies the matching a
    coefficient in the `pdfMetalog` sum (the entry for the first coefficient is
    0). In the returned frame every such column is immediately followed by its
    negation.

    Args:
      term_limit (:obj: `int`): Number of metalog terms; sets the number of columns.

      step_len (:obj:`float`): Spacing of the probability grid.

    Returns:
      new_Diff: (:obj:`pandas.DataFrame` of type float): One row per grid
        point; columns ordered [c0, -c0, c1, -c1, ...].

    """
    y = np.arange(step_len, 1, step_len)

    rows = []
    for p in y:
        d = p * (1 - p)
        f = p - 0.5
        l = np.log(p / (1 - p))

        # First coefficient contributes nothing; second contributes 1 / d.
        row = [0, 1 / d]

        if term_limit > 2:
            row.append((f / d) + l)

        # For the fourth term
        if term_limit > 3:
            row.append(1)

        # Terms five and up. Odd terms 5, 7, ... use powers 1, 2, ... and so
        # do the even terms 6, 8, ...
        for term in range(5, term_limit + 1):
            if term % 2 != 0:
                power = (term - 3) // 2
                row.append((power + 1) * f ** power)
            else:
                power = (term - 4) // 2
                row.append(((f ** (power + 1)) / d) + (power + 1) * (f ** power) * l)

        rows.append(row)

    # Assemble once instead of growing the array row by row and column by column.
    Diff = np.array(rows, dtype=np.float64)

    # Interleave every column with its negation.
    new_Diff = np.empty((Diff.shape[0], 2 * Diff.shape[1]), dtype=np.float64)
    new_Diff[:, 0::2] = Diff
    new_Diff[:, 1::2] = -1 * Diff

    return pd.DataFrame(data=new_Diff)
294 | 
295 | 
def newtons_method_metalog(m, q, term, bounds=None, boundedness=None):
    """Finds the probability whose metalog quantile equals `q` by damped iteration.

    Starting at 0.5, the estimate is updated with
    y_next = y_now - 0.5 * (quantileMetalog(y_now) - q) * pdfMetalog(y_now)
    until two successive estimates differ by no more than 1e-10.

    Args:
      m: Either an object exposing `output_dict` (whose ['A']['a' + str(term)]
        entry holds the coefficients) or the coefficients themselves.

      q (:obj:`float`): Quantile value to invert.

      term (:obj:`int`): Number of metalog terms to use.

      bounds (:obj:`list`, optional): Read from m['params']['bounds'] when None.

      boundedness (:obj:`str`, optional): Read from m['params']['boundedness'] when None.

    Returns:
      y_now: (:obj:`float`): Probability estimate.

    Raises:
      StopIteration: No convergence within 10000 iterations.

    """
    # Identity tests on purpose: `== None` on a numpy array compares
    # element-wise and cannot be used as an `if` condition.
    if bounds is None:
        bounds = m["params"]["bounds"]
    if boundedness is None:
        boundedness = m["params"]["boundedness"]

    # if m is metalog, pull the coefficient vector out of it; otherwise m is
    # taken to be the coefficients already.
    try:
        m = m.output_dict
        avec = "a" + str(term)
        a = m["A"][avec]
    except (AttributeError, KeyError, TypeError, IndexError):
        a = m

    # TODO there should be setters for at least some of these hyperparameters
    alpha_step = 0.5
    err = 1e-10
    temp_err = 0.1
    y_now = 0.5

    i = 1
    while temp_err > err:
        quantile_gap = quantileMetalog(a, y_now, term, bounds, boundedness) - q
        density = pdfMetalog(a, y_now, term, bounds, boundedness)
        y_next = y_now - alpha_step * (quantile_gap * density)
        temp_err = abs((y_next - y_now))

        # Pull an overshooting estimate back inside (0, 1).
        if y_next > 1:
            y_next = 0.99999

        if y_next < 0:
            y_next = 0.000001

        y_now = y_next
        i = i + 1

        if i > 10000:
            raise StopIteration(
                "Approximation taking too long, quantile value: "
                + str(q)
                + " is to far from distribution median. Try plot() to see distribution."
            )

    return y_now
342 | 
343 | 
def pdfMetalog_density(m, t, y):
    """Evaluates the pdf of a fitted metalog object at probability `y`.

    Unlike `pdfMetalog`, neither `y` nor the result is clamped here.

    Args:
      m: Object exposing `output_dict` with the coefficients under
        ['A']['a' + str(t)] and the fit settings under ['params'].

      t (:obj:`int`): Number of metalog terms to use.

      y: Probability at which to evaluate the pdf.

    Returns:
      x: The pdf value.

    """
    fitted = m.output_dict
    a = fitted["A"]["a" + str(t)]
    params = fitted["params"]
    bounds = params["bounds"]
    boundedness = params["boundedness"]

    spread = y * (1 - y)
    centered = y - 0.5
    logit = np.log(y / (1 - y))

    # Derivative of the quantile function, accumulated term by term.
    # Terms two and three:
    slope = a[1] / spread
    if a[2] != 0:
        slope = slope + a[2] * ((centered / spread) + logit)

    # Term four:
    if t > 3:
        slope = slope + a[3]

    # Terms five and up. Odd terms 5, 7, ... use powers 1, 2, ... and so do
    # the even terms 6, 8, ...
    for term in range(5, t + 1):
        coeff = a[term - 1]
        if term % 2 != 0:
            power = (term - 3) // 2
            slope = slope + ((power + 1) * coeff * centered ** power)
        else:
            power = (term - 4) // 2
            slope = slope + coeff * (
                ((centered ** (power + 1)) / spread)
                + (power + 1) * (centered ** power) * logit
            )

    # The density is the reciprocal of the quantile derivative.
    x = slope ** (-1)

    # Change of variables for the bounded forms.
    if boundedness != "u":
        M = quantileMetalog(a, y, t, bounds=bounds, boundedness="u")

        if boundedness == "sl":
            x = x * np.exp(-M)
        elif boundedness == "su":
            x = x * np.exp(M)
        elif boundedness == "b":
            x = (x * (1 + np.exp(M)) ** 2) / ((bounds[1] - bounds[0]) * np.exp(M))

    return x
400 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="pymetalog",
 8 |     version="0.2.1",
 9 |     author="Colin Smith, Travis Jefferies, Isaac J. Faber",
10 |     description="A python package that generates functions for the metalog distribution. The metalog distribution is a highly flexible probability distribution that can be used to model data without traditional parameters.",
11 |     long_description=long_description,
12 |     long_description_content_type="text/markdown",
13 |     url="https://github.com/tjefferies/pymetalog",
14 |     packages=setuptools.find_packages(),
15 |     classifiers=[
16 |         "Programming Language :: Python :: 3",
17 |         "License :: OSI Approved :: MIT License",
18 |         "Operating System :: OS Independent",
19 |     ],
20 |     package_data={'pymetalog': ["examples/*"]},
21 |     install_requires=[
22 |         'numpy',
23 |         'pandas',
24 |         'scipy',
25 |         'seaborn',
26 |      ],
27 | )
28 | 


--------------------------------------------------------------------------------