├── data
│   ├── lak.RData
│   ├── compas1.RData
│   ├── svcensus.RData
│   └── mortgageSE.RData
├── vignettes
│   ├── z.png
│   ├── wibg.png
│   ├── RpartVert.png
│   ├── smallwage.png
│   ├── freqparcoord.png
│   ├── conditdisparity.png
│   └── Function_List.Rmd
├── .gitignore
├── R
│   ├── dsldBnlearn.R
│   ├── onAttach.R
│   ├── dsldFrequencybyS.R
│   ├── dsldFreqPCoord.R
│   ├── dsldConfounders.R
│   ├── dsldMatching.R
│   ├── dsldML.R
│   ├── dsldHunting.R
│   ├── dsldConditDisparity.R
│   ├── dsldDensitybyS.R
│   ├── dsldTakeALookAround.R
│   ├── dsldScatterPlot3D.R
│   ├── dsldFairML.R
│   ├── Utils.R
│   ├── dsldFairUtils.R
│   └── dsldLogit.R
├── man
│   ├── svcensus.Rd
│   ├── compas1.Rd
│   ├── mortgageSE.Rd
│   ├── Utils.Rd
│   ├── lak.Rd
│   ├── dsldBnlearn.Rd
│   ├── dsldConfounders.Rd
│   ├── dsldDensityByS.Rd
│   ├── dsldFrequencyByS.Rd
│   ├── dsldConditDisparity.Rd
│   ├── dsldPropens.Rd
│   ├── dsldML.Rd
│   ├── dsldTakeALookAround.Rd
│   ├── dsldFairUtils.Rd
│   ├── dsldHunting.Rd
│   ├── dsldLogit.Rd
│   ├── dsldScatterPlot3D.Rd
│   ├── dsldLinear.Rd
│   ├── dsldFreqPCoord.Rd
│   ├── dsldFairML.Rd
│   └── dsldEDFFair.Rd
├── inst
│   ├── src
│   │   └── dsldPy
│   │       ├── dsldPyBnLearn.py
│   │       ├── dsldPyFrequencybyS.py
│   │       ├── dsldPyDensitybyS.py
│   │       ├── dsldPyMatching.py
│   │       ├── LICENSE
│   │       ├── dsldPyHunting.py
│   │       ├── dsldPyConfounders.py
│   │       ├── dsldPyTakeALookAround.py
│   │       ├── dsldPyConditDisparity.py
│   │       ├── dsldPyScatterPlot3D.py
│   │       ├── dsldPyML.py
│   │       ├── dsldPyFairUtils.py
│   │       ├── dsldPyLogit.py
│   │       ├── dsldPyFreqPCoord.py
│   │       ├── dsldPyLinear.py
│   │       ├── Utils.py
│   │       ├── __init__.py
│   │       ├── dsldPyFairML.py
│   │       └── dsldPyQeFairML.py
│   ├── pyproject.toml
│   ├── README.md
│   └── examples
│       ├── graphical.ipynb
│       ├── tabular.ipynb
│       └── machine_learning.ipynb
├── DESCRIPTION
├── NAMESPACE
└── README.md

/data/lak.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/lak.RData
--------------------------------------------------------------------------------
/vignettes/z.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/z.png
--------------------------------------------------------------------------------
/data/compas1.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/compas1.RData
--------------------------------------------------------------------------------
/data/svcensus.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/svcensus.RData
--------------------------------------------------------------------------------
/vignettes/wibg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/wibg.png
--------------------------------------------------------------------------------
/data/mortgageSE.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/mortgageSE.RData
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # R session/user files
2 | .Rhistory
3 | .RData
4 |
5 | # macOS metadata
6 | .DS_Store
--------------------------------------------------------------------------------
/vignettes/RpartVert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/RpartVert.png
--------------------------------------------------------------------------------
/vignettes/smallwage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/smallwage.png
--------------------------------------------------------------------------------
/vignettes/freqparcoord.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/freqparcoord.png
--------------------------------------------------------------------------------
/vignettes/conditdisparity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/conditdisparity.png
--------------------------------------------------------------------------------
/R/dsldBnlearn.R:
--------------------------------------------------------------------------------
1 |
2 | # iamb method of causal discovery
3 |
4 | dsldIamb <- function(data)
5 | {
6 |
7 |    getSuggestedLib('bnlearn')
8 |    return(bnlearn::iamb(data))
9 |
10 | }
11 |
--------------------------------------------------------------------------------
/man/svcensus.Rd:
--------------------------------------------------------------------------------
1 | \name{svcensus}
2 | \alias{svcensus}
3 |
4 | \title{
5 | Silicon Valley programmers and engineers data
6 | }
7 |
8 | \description{
9 |
10 | Via qeML: This data set is adapted from the 2000 Census,
11 | restricted to programmers and engineers in the Silicon Valley area.
12 | }
--------------------------------------------------------------------------------
/man/compas1.Rd:
--------------------------------------------------------------------------------
1 | \name{compas1}
2 | \alias{compas1}
3 |
4 | \title{
5 | Criminal Offenders Screened in Florida
6 | }
7 |
8 | \description{
9 |
10 | A collection of criminal offenders screened in Florida (US) during
11 | 2013-14. This data was used to predict recidivism.
12 |
13 | Additional details for this dataset can be found via the \pkg{fairml} package.
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/R/onAttach.R:
--------------------------------------------------------------------------------
1 |
2 | .onAttach <- function(libname, pkgname) {
3 |    packageStartupMessage(
4 |       '\n\n\n\n\n*********************\n\n\n\n   Navigating dsld:\n
5 |       Type vignette("Quick_Start",package="dsld") for a quick overview!\n
6 |       Type vignette("Function_List",package="dsld") for a categorized function list\n
7 |       Latest version at https://github.com/matloff/dsld')
8 | }
9 |
--------------------------------------------------------------------------------
/man/mortgageSE.Rd:
--------------------------------------------------------------------------------
1 | \name{mortgageSE}
2 | \alias{mortgageSE}
3 |
4 | \title{
5 | Mortgage Denial
6 | }
7 |
8 | \description{
9 |
10 | The dataset provides applicant information (including race, income, loan
11 | information, etc.). The response variable indicates whether or not the
12 | applicant was approved for the loan. Additional details can be found in
13 | the \code{SortedEffects} package.
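The dataset can be loaded with \code{data(mortgageSE)}.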
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/man/Utils.Rd:
--------------------------------------------------------------------------------
1 | \name{utilities}
2 | \alias{getSuggestedLib}
3 |
4 | \title{
5 | Utilities
6 | }
7 |
8 | \usage{
9 | getSuggestedLib(pkgName)
10 | }
11 |
12 | \arguments{
13 |     \item{pkgName}{Name of the package to be checked/loaded.}
14 | }
15 | \description{
16 |
17 | Attempts to load the specified package, halting execution upon failure.
18 |
19 | }
20 |
21 | \value{
22 | No value, just side effects.
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/man/lak.Rd:
--------------------------------------------------------------------------------
1 | \name{lak}
2 | \alias{lak}
3 |
4 | \title{
5 | Labor Market Discrimination
6 | }
7 |
8 | \description{
9 | Fictional CVs sent to real employers to investigate discrimination via
10 | given names. See Bertrand and Mullainathan (2004).
11 | }
12 |
13 | \references{
14 | \itemize{
15 |    \item Bertrand, M. and Mullainathan, S. (2004). Are Emily and Greg More
16 |    Employable Than Lakisha and Jamal? A Field Experiment on
17 |    Labor Market Discrimination. American Economic Review, 94:991-1013.
18 | }
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyBnLearn.py:
--------------------------------------------------------------------------------
1 | from rpy2 import robjects
2 | from rpy2.robjects.packages import importr
3 | from IPython.display import Image, display
4 | from .Utils import get_dsld
5 |
6 | def dsldPyIamb(data, file="iamb.png", width=1200, height=900, res=150):
7 |     dsld = get_dsld()
8 |     a = dsld.dsldIamb(data)
9 |     grdevices = importr("grDevices")
10 |     grdevices.png(file=file, width=width, height=height, res=res)
11 |     robjects.r["plot"](a)
12 |     grdevices.dev_off()
13 |     display(Image(filename=file))
14 |     return file
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFrequencybyS.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 |
8 | def dsldPyFrequencybyS(data, cName, sName):
9 |
10 |     r_data = dsld_Rpy2_IsRDataframe(data)
11 |     cName_r = StrVector([cName])
12 |     sName_r = StrVector([sName])
13 |
14 |     dsld = get_dsld()
15 |     res = dsld.dsldFrequencyByS(r_data, cName_r, sName_r)
16 |     return dsld_Rpy2_RDataframeToPandas(res)
17 |
--------------------------------------------------------------------------------
/man/dsldBnlearn.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldBnlearn}
2 | \alias{dsldIamb}
3 |
4 | \title{dsldBnlearn}
5 |
6 | \description{
7 | Wrappers for functions in the \pkg{bnlearn} package. (Presently, just \code{iamb}.)
8 | }
9 |
10 | \usage{
11 | dsldIamb(data)
12 | }
13 |
14 | \arguments{
15 |     \item{data}{
16 |         Data frame.
17 |     }
18 | }
19 |
20 | \details{
21 |
22 | Under very stringent assumptions, \code{dsldIamb} performs causal
23 | discovery, i.e. fits a causal model to \code{data}.
24 |
25 | }
26 |
27 | \value{
28 | Object of class 'bn' (\pkg{bnlearn} object). The generic \code{plot}
29 | function is callable on this object.
30 | }
31 |
32 | \author{
33 | N. Matloff
34 | }
35 |
36 | \examples{
37 | \donttest{
38 | data(svcensus)
39 | # iamb does not accept integer data
40 | svcensus$wkswrkd <- as.numeric(svcensus$wkswrkd)
41 | svcensus$wageinc <- as.numeric(svcensus$wageinc)
42 | iambOut <- dsldIamb(svcensus)
43 | plot(iambOut)
44 | }
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/inst/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=77", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "dsldPy"
7 | version = "0.0.3"
8 | description = "Python wrappers around the R 'dsld' package via rpy2"
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | authors = [
12 |   {name = "A. Mittal, T. Abdullah, A. Ashok, B. Zarate Estrada, S. Martha, B. Ouattara, J. Tran, and N. Matloff"}
13 | ]
14 | license = "MIT"
15 | keywords = ["dsld", "rpy2", "fair machine learning", "statistics", "confounder analysis"]
16 | classifiers = [
17 |   "Programming Language :: Python :: 3",
18 |   "Operating System :: OS Independent",
19 |   "Topic :: Scientific/Engineering",
20 | ]
21 | dependencies = [
22 |   "pandas>=1.1",
23 |   "numpy>=1.20",
24 |   "Pillow>=8",
25 |   "pyreadr>=0.4",
26 |   "rpy2>=3.5",
27 |   "plotly>=5",
28 |   "ipython>=7",
29 |   "nbformat",
30 |   "scikit-learn",
31 | ]
32 |
33 | [tool.setuptools]
34 | package-dir = {"" = "src"}
35 |
36 | [tool.setuptools.packages.find]
37 | where = ["src"]
--------------------------------------------------------------------------------
/man/dsldConfounders.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldConfounders}
2 | \alias{dsldConfounders}
3 | \title{dsldConfounders}
4 |
5 | \description{
6 | Plots estimated densities of all continuous features X, conditioned on a
7 | specified categorical feature C.
8 | }
9 |
10 | \usage{
11 | dsldConfounders(data, sName, graphType = "plotly", fill = FALSE)
12 | }
13 |
14 | \arguments{
15 |     \item{data}{Dataframe, at least 2 columns.}
16 |     \item{sName}{
17 |         Name of the categorical column, an R factor. In discrimination
18 |         contexts, typically a sensitive variable.
19 |     }
20 |     \item{graphType}{
21 |         Either "plot" or "plotly", for static or interactive graphs.
22 |         The latter requires the \pkg{plotly} package.
23 |     }
24 |     \item{fill}{
25 |         Only applicable to the graphType = "plot" case. Setting to TRUE
26 |         will color each line down to the x-axis.
27 |     }
28 | }
29 |
30 | \author{
31 | N. Matloff, T. Abdullah, A. Ashok, J. Tran
32 | }
33 |
34 | \value{No value; plot.}
35 |
36 | \examples{
37 | \donttest{
38 | data(svcensus)
39 | dsldConfounders(svcensus, "educ")
40 | }
41 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyDensitybyS.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
6 | from rpy2.robjects.packages import importr
7 |
8 | def dsldPyDensitybyS(data, cName, sName, graphType = "plotly", fill = False):
9 |
10 |     r_data = dsld_Rpy2_IsRDataframe(data)
11 |     cName_r = StrVector([cName])
12 |     sName_r = StrVector([sName])
13 |     graphType_r = StrVector([graphType])
14 |     fill_r = BoolVector([fill])
15 |
16 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
17 |     grdevices = importr("grDevices")
18 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
19 |     try:
20 |         dsld = get_dsld()
21 |         res = dsld.dsldDensityByS(r_data, cName_r, sName_r, graphType_r, fill_r)
22 |         try: ro.r("print")(res)
23 |         except: pass
24 |     finally:
25 |         grdevices.dev_off()
26 |
27 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
28 |     return
29 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyMatching.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 | import rpy2.robjects as robjects
8 |
9 | def dsldPyMatchedATE(data, yName, sName, yesSVal, yesYVal=None, propensFtn=None, k=None):
10 |
11 |     r_data = dsld_Rpy2_IsRDataframe(data)
12 |     yName_r = robjects.StrVector([yName])
13 |     sName_r = robjects.StrVector([sName])
14 |     yesSVal_r = robjects.StrVector([yesSVal])
15 |
16 |     yesYVal_r = robjects.StrVector([yesYVal]) if yesYVal is not None else robjects.NULL
17 |     propensFtn_r = robjects.StrVector([propensFtn]) if propensFtn is not None else robjects.NULL
18 |     k_r = robjects.IntVector([k]) if k is not None else robjects.NULL
19 |
20 |     dsld = get_dsld()
21 |     res = dsld.dsldMatchedATE(r_data, yName_r, sName_r, yesSVal_r, yesYVal_r, propensFtn_r, k_r)
22 |
23 |     ro.r("summary")(res)
24 |
25 |     return
26 |
27 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 The Python Packaging Authority
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/man/dsldDensityByS.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldDensityByS}
2 | \alias{dsldDensityByS}
3 | \title{dsldDensityByS}
4 |
5 | \description{
6 | Graphs densities of a response variable, grouped by a sensitive variable.
7 | Similar to \code{dsldConfounders}, but includes sliders to control the
8 | bandwidth of the density estimate (analogous to controlling the bin
9 | width in a histogram).
10 | }
11 |
12 | \usage{
13 | dsldDensityByS(data, cName, sName, graphType = "plotly", fill = FALSE)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{
18 |         Dataset with at least 1 numerical column and 1 factor column.
19 |     }
20 |     \item{cName}{
21 |         Name of the possible confounding variable column, an R numeric.
22 |     }
23 |     \item{sName}{
24 |         Name of the sensitive variable column, an R factor.
25 |     }
26 |     \item{graphType}{
27 |         Type of graph created. Defaults to "plotly".
28 |     }
29 |     \item{fill}{
30 |         Whether to fill the area under each density curve. Defaults to FALSE.
31 |     }
32 | }
33 |
34 | \author{
35 | N. Matloff, T. Abdullah, A. Ashok, J. Tran
36 | }
37 |
38 | \value{No value; plot.}
39 |
40 | \examples{
41 | data(svcensus)
42 | dsldDensityByS(svcensus, cName = "wageinc", sName = "educ")
43 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyHunting.py:
--------------------------------------------------------------------------------
1 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
2 | import sys
3 | import os
4 | import pandas as pd
5 | from PIL import Image
6 | import rpy2.robjects as robjects
7 | from rpy2.robjects.packages import importr
8 | from rpy2.robjects import r
9 | import rpy2.robjects as ro
10 |
11 | ### dsldPyCHunting
12 | def dsldPyCHunting(data,yName,sName,intersectDepth=10):
13 |     r_data = dsld_Rpy2_IsRDataframe(data)
14 |     yName_r = robjects.StrVector([yName])
15 |     sName_r = robjects.StrVector([sName])
16 |     intersectDepth_r = robjects.IntVector([intersectDepth])
17 |
18 |     dsld = get_dsld()
19 |     res = dsld.dsldCHunting(r_data, yName_r, sName_r, intersectDepth_r)
20 |     result = {'impForY' : list(zip(list(res[0].names), list(res[0]))), 'impForS' : list(zip(list(res[1].names), list(res[1])))}
21 |     return result
22 |
23 | ### dsldPyOHunting
24 | def dsldPyOHunting(data,yName,sName):
25 |     r_data = dsld_Rpy2_IsRDataframe(data)
26 |     yName_r = robjects.StrVector([yName])
27 |     sName_r = robjects.StrVector([sName])
28 |     dsld = get_dsld()
29 |     res = dsld.dsldOHunting(r_data, yName_r, sName_r)
30 |
31 |     # print in R
32 |     ro.r("print")(res)
33 |     return res
--------------------------------------------------------------------------------
/vignettes/Function_List.Rmd:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Categorized Function List"
4 | output:
5 |    rmarkdown::html_vignette
6 | vignette: >
7 |   %\VignetteIndexEntry{Function List}
8 |   %\VignetteEngine{knitr::rmarkdown}
9 |   \usepackage[utf8]{inputenc}
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 |   collapse = TRUE,
15 |   comment = "#>"
16 | )
17 | ```
18 |
19 |
20 | # Categorized Function List
21 |
22 |
23 |
24 |
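As a quick orientation before the list, a minimal sketch of one function from
several of the categories below (assuming the package's bundled `svcensus`
data; all calls follow the signatures documented in the man pages):

```r
library(dsld)
data(svcensus)

# tabular confounder exploration: occupation distribution within each gender
dsldFrequencyByS(svcensus, cName = "occ", sName = "gender")

# graphical confounder exploration: wage density by gender
dsldDensityByS(svcensus, cName = "wageinc", sName = "gender")

# statistical analysis of group differences: gender effect on wage income
summary(dsldLinear(svcensus, yName = "wageinc", sName = "gender"))
```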
25 |
26 | *Tabular confounder exploration*
27 |
28 | * dsldCHunting
29 |
30 | * dsldDiffSLin
31 |
32 | * dsldDiffSLog
33 |
34 | * dsldTakeALookAround
35 |
36 | *Graphical confounder exploration*
37 |
38 | * dsldConditDisparity
39 |
40 | * dsldConfounders
41 |
42 | * dsldDensityByS
43 |
44 | * dsldScatterPlot3D
45 |
46 | *Statistical analysis for group differences*
47 |
48 | * dsldLinear
49 |
50 | * dsldLogit
51 |
52 | * dsldML
53 |
54 | *Causal analysis*
55 |
56 | * dsldIamb
57 |
58 | * dsldMatchedATE
59 |
60 | *Fairness in prediction*
61 |
62 | * dsldFgrrm
63 |
64 | * dsldFrrm
65 |
66 | * dsldNclm
67 |
68 | * dsldOHunting
69 |
70 | * dsldQeFairRF
71 |
72 | * dsldQeFairRidgeLin
73 |
74 | * dsldQeFairRidgeLog
75 |
76 | * dsldZlrm
77 |
78 | * dsldZlm
79 |
80 | * dsldTakeALookAround
81 |
82 |
83 |
--------------------------------------------------------------------------------
/R/dsldFrequencybyS.R:
--------------------------------------------------------------------------------
1 | dsldFrequencyByS <- function(data, cName, sName) {
2 |     # type validation #
3 |     # we're essentially just checking the value-type for the key columns
4 |     if (!class(data[, sName]) %in% c("factor", "character")) {
5 |         stop(paste(
6 |             "sName should be of factor or character data type."
7 |         ))
8 |     }
9 |     # helpful error message if the specified confounder column isn't factor
10 |     if (!class(data[, cName]) %in% c("factor", "character")) {
11 |         stop(paste(
12 |             "cName should be of factor or character data type. Consider",
13 |             " calling `dsldDensityByS(data, cName = ",
14 |             cName,
15 |             ", sName = ",
16 |             sName,
17 |             ")` instead",
18 |             sep = ""
19 |         ))
20 |     }
21 |
22 |     # sensitive variable frequencies #
23 |     # unique levels to ensure order
24 |     yGroups <- unique(data[[cName]])
25 |
26 |     # get a lookup for every s level against every ylevel
27 |     freqLookup <- table(data[[sName]], data[[cName]])
28 |
29 |     # convert counts to proportions
30 |     freqLookup <- freqLookup / rowSums(freqLookup)
31 |
32 |     # convert to dataframe
33 |     frequencies <- as.data.frame.matrix(freqLookup)
34 |     names(frequencies) <- c(
35 |         paste0("Frequency of ", yGroups)
36 |     )
37 |
38 |     return(frequencies)
39 | }
40 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyConfounders.py:
--------------------------------------------------------------------------------
1 |
2 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
3 | import sys
4 | import os
5 | import pandas as pd
6 | from PIL import Image
7 | import rpy2.robjects as robjects
8 | from rpy2.robjects.packages import importr
9 | from rpy2.robjects import r
10 | import os, tempfile
11 | import rpy2.robjects as ro
12 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
13 | from IPython.display import Image, display
14 | from .Utils import dsld_Rpy2_IsRDataframe
15 | from rpy2.robjects.packages import importr
16 |
17 | def dsldPyConfounders(data,sName, graphType = "plotly",fill=False):
18 |     r_data = dsld_Rpy2_IsRDataframe(data)
19 |     sName_r = robjects.StrVector([sName])
20 |     graphType_r = robjects.StrVector([graphType])
21 |     fill_r = robjects.BoolVector([fill])
22 |
23 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
24 |     grdevices = importr('grDevices')
25 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
26 |     try:
27 |         dsld = get_dsld()
28 |         res = dsld.dsldConfounders(r_data, sName_r, graphType_r, fill_r)
29 |         try: ro.r("print")(res)
30 |         except: pass
31 |     finally:
32 |         grdevices.dev_off()
33 |
34 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
35 |     return
36 |
37 |
--------------------------------------------------------------------------------
/R/dsldFreqPCoord.R:
--------------------------------------------------------------------------------
1 |
2 | dsldFreqPCoord <- function(data, m, sName = NULL,
3 |                            method = "maxdens", faceting = "vert", k = 50,
4 |                            klm = 5 * k, keepidxs = NULL, plotidxs = FALSE,
5 |                            cls = NULL, plot_filename = NULL) {
6 |
7 |     getSuggestedLib("freqparcoord")
8 |     getSuggestedLib("ggplot2")
9 |
10 |     if (!is.null(sName)) {
11 |         s <- data[[sName]]
12 |         scol <- which(names(data) == sName)
13 |         dms <- data[,-scol]
14 |         dms <- factorsToDummies(dms)
15 |         dms <- as.data.frame(dms)
16 |         data <- cbind(dms,s)
17 |         data <- as.data.frame(data)
18 |         scol <- ncol(data)
19 |         colnames(data)[scol] <- sName
20 |         columns <- 1:(scol-1)
21 |     } else {
22 |         data <- factorsToDummies(data)
23 |         columns <- 1:ncol(data)
24 |     }
25 |
26 |     fpcOut <- freqparcoord::freqparcoord(
27 |         data,
28 |         m,
29 |         dispcols = columns,
30 |         grpvar = sName,
31 |         method = method,
32 |         faceting = faceting,
33 |         k = k,
34 |         klm = klm,
35 |         keepidxs = keepidxs,
36 |         plotidxs = plotidxs,
37 |         cls = cls
38 |     )
39 |
40 |     if (!is.null(plot_filename)) {
41 |         ggplot2::ggsave(plot_filename, fpcOut) # Save as img
42 |     }
43 |
44 |     return(fpcOut)
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/R/dsldConfounders.R:
--------------------------------------------------------------------------------
1 | dsldConfounders <- function(data, sName, graphType = "plotly", fill = FALSE) {
2 |     # Error checking
3 |     if (is.null(sName)) {
4 |         stop(paste("sName must be provided as a quoted column name"))
5 |     }
6 |
7 |     # dispatch to appropriate auxiliary method
8 |     numCols <- ncol(data)
9 |     for (i in 1:numCols) {
10 |         # skip sName
11 |         if (colnames(data)[i] == sName) {
12 |             next
13 |         }
14 |
15 |         # if categorical
16 |         if (is.factor(data[, i])) {
17 |             print(dsldFrequencyByS(data, colnames(data)[i], sName))
18 |
19 |             # require input if there's a next
20 |             if (i != numCols) {
21 |                 cat("Press Enter to view next density graph / frequency dataframe...\n")
22 |                 tempInput <- readline()
23 |             }
24 |         # if numeric
25 |         } else if (is.numeric(data[, i])) {
26 |             print(dsldDensityByS(data, colnames(data)[i], sName, graphType, fill))
27 |
28 |             # require input if there's a next
29 |             if (i != numCols) {
30 |                 cat("Press Enter to view next density graph / frequency dataframe...\n")
31 |                 tempInput <- readline()
32 |             }
33 |         # throw error
34 |         } else {
35 |             stop(paste("Neither categorical nor numeric column; check the data frame"))
36 |         }
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyTakeALookAround.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldTakeALookAround from the dsld R package.
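A minimal usage sketch (assuming a pandas DataFrame df shaped like the
bundled svcensus data; maxFeatureSize and holdout are optional):
    res = dsldPyTakeALookAround(df, 'wageinc', 'gender', maxFeatureSize=4)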
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
7 | import sys
8 | import os
9 | import pandas as pd
10 | from PIL import Image
11 | import rpy2.robjects as robjects
12 | from rpy2.robjects.packages import importr
13 | from rpy2.robjects import r
14 | import math
15 |
16 | def dsldPyTakeALookAround(data, yName, sName, maxFeatureSize = None, holdout = None):
17 |     r_data = dsld_Rpy2_IsRDataframe(data)
18 |     yName_r = robjects.StrVector([yName])
19 |     sName_r = robjects.StrVector([sName])
20 |
21 |     if maxFeatureSize is not None:
22 |         maxFeatureSize_r = robjects.IntVector([maxFeatureSize])
23 |     else:
24 |         maxFeatureSize_r = robjects.IntVector([dsld_Rpy2_RDataframeToPandas(data).shape[1] - 2])
25 |
26 |     if holdout is not None:
27 |         holdout_r = robjects.IntVector([holdout])
28 |     else:
29 |         holdout_r = robjects.IntVector([math.floor(min(1000, 0.1 * dsld_Rpy2_RDataframeToPandas(data).shape[0]))])
30 |
31 |     dsld = get_dsld()
32 |     res = dsld.dsldTakeALookAround(r_data, yName_r, sName_r, maxFeatureSize_r, holdout_r)
33 |     return dsld_Rpy2_RDataframeToPandas(res)
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: dsld
2 | Version: 1.0.0
3 | Title: Data Science Looks at Discrimination
4 | Authors@R: c( person("Norm", "Matloff", email = "nsmatloff@ucdavis.edu",
5 |     role = c("aut"), comment = c(ORCID = "0000-0001-9179-6785")),
6 |     person("Taha", "Abdullah", email = "tmabdullah@ucdavis.edu",
7 |     role = c("aut")),
8 |     person("Arjun", "Ashok", email = "arjashok@ucdavis.edu",
9 |     role = c("aut")),
10 |     person("Shubhada", "Martha", email = "smartha@ucdavis.edu",
11 |     role = c("aut")),
12 |     person("Aditya", "Mittal", email = "adityamittal2031@gmail.com",
13 |     role = c("aut", "cre")),
14 |     person("Billy", "Ouattara", email = "btouattara@ucdavis.edu",
15 |     role = c("aut")),
16 |     person("Jonathan", "Tran", email = "jsttran@ucdavis.edu",
17 |     role = c("aut")),
18 |     person("Brandon", "Zarate Estrada", email = "bdzarate@ucdavis.edu",
19 |     role = c("aut"))
20 |     )
21 | Maintainer: Aditya Mittal <adityamittal2031@gmail.com>
22 | VignetteBuilder: knitr
23 | Imports: Kendall, ranger, ggplot2, plotly,
24 |     freqparcoord, fairness, sandwich
25 | Depends: R (>= 3.5.0), fairml, gtools, regtools, qeML, rmarkdown
26 | Suggests: knitr, bnlearn, Matching, randomForest
27 | License: GPL (>= 2)
28 | Description: Statistical and graphical tools for detecting and measuring
29 |     discrimination and bias, be it racial, gender, age or other.
30 |     Detection and remediation of bias in machine learning algorithms.
31 |     'Python' interfaces available.
32 | URL: https://github.com/matloff/dsld
33 | BugReports: https://github.com/matloff/dsld/issues
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | import('qeML')
2 | import('regtools')
3 | import('gtools')
4 | import('ranger')
5 | import('fairml')
6 | import('fairness')
7 | import('Kendall')
8 | import('freqparcoord')
9 | import('ggplot2')
10 | import('rmarkdown')
11 |
12 | importFrom("grDevices", "dev.copy", "dev.cur", "dev.off", "dev.set",
13 |     "jpeg", "pdf", "png", "rainbow")
14 | importFrom("graphics", "legend", "lines", "par", "polygon", "title")
15 | importFrom("stats", "as.formula", "binomial", "coef", "cor", "cov",
16 |     "density", "glm", "lm", "loess", "na.exclude", "pnorm",
17 |     "predict", "pt", "setNames", "vcov")
18 | importFrom("utils", "combn")
19 | importFrom("plotly", "plot_ly", "add_lines", "layout")
20 | importFrom("sandwich", "sandwich")
21 | importFrom("stats", "model.matrix")
22 |
23 | export(
24 |     getSuggestedLib,
25 |     dsldFreqPCoord,
26 |     dsldTakeALookAround, dsldConditDisparity,
27 |     dsldScatterPlot3D,
28 |     dsldLinear, dsldLogit,
29 |     dsldConfounders, dsldDensityByS, dsldFrequencyByS,
30 |     dsldFgrrm, dsldFrrm, dsldNclm, dsldZlm, dsldZlrm,
31 |     dsldQeFairRidgeLin, dsldQeFairRF, dsldQeFairRidgeLog, dsldQeFairKNN,
32 |     dsldML, dsldCHunting, dsldOHunting, dsldIamb, dsldMatchedATE,
33 |     dsldFairUtils
34 | )
35 |
36 | S3method(summary, dsldLM)
37 | S3method(coef, dsldLM)
38 | S3method(vcov, dsldLM)
39 | S3method(predict, dsldLM)
40 |
41 | S3method(summary, dsldGLM)
42 | S3method(coef, dsldGLM)
43 | S3method(vcov, dsldGLM)
44 | S3method(predict, dsldGLM)
45 |
46 | S3method(predict, dsldFairML)
47 | S3method(summary, dsldFairML)
48 |
49 | S3method(predict, dsldQeFair)
--------------------------------------------------------------------------------
/man/dsldFrequencyByS.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFrequencyByS}
2 | \alias{dsldFrequencyByS}
3 | \title{dsldFrequencyByS}
4 |
5 | \description{
6 |
7 | Informal assessment of C as a possible confounder in a relationship between a
8 | sensitive variable S and a variable Y.
9 |
10 | }
11 |
12 | \usage{
13 | dsldFrequencyByS(data, cName, sName)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{
18 |         Data frame or equivalent.
19 |     }
20 |     \item{cName}{
21 |         Name of the "C" column, an R factor.
22 |     }
23 |     \item{sName}{
24 |         Name of the sensitive variable column, an R factor.
25 |     }
26 | }
27 |
28 | \details{
29 |
30 | Essentially an informal assessment of the relation between S and C.
31 |
32 | Consider the \code{svcensus} dataset. If for instance we are studying
33 | the effect of gender S on wage income Y, say C is occupation. If
34 | different genders have different occupation patterns, then C is a
35 | potential confounder. (Y does not explicitly appear here.)
36 |
37 | }
38 |
39 |
40 | \value{Data frame, one row for each level of the sensitive variable S, and
41 | one column for each level of the confounder C. Each row sums to 1.0.}
42 |
43 | \examples{
44 | data(svcensus)
45 | dsldFrequencyByS(svcensus, cName = "educ", sName = "gender")
46 | # not much difference in education between genders
47 | dsldFrequencyByS(svcensus, cName = "occ", sName = "gender")
48 | # substantial difference in occupation between genders
49 | data(lsa)
50 | lsa$faminc <- as.factor(lsa$fam_inc)
51 | dsldFrequencyByS(lsa,'faminc','race1')
52 | # distribution of family income by race
53 | }
54 |
55 | \author{
56 | N. Matloff, T. Abdullah, A. Ashok, J. Tran, A. Mittal
57 | }
58 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyConditDisparity.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
6 | from rpy2.robjects.packages import importr
7 |
8 |
9 |
10 | def _maybe_strvec(x):
11 |     if x is None: return ro.NULL
12 |     return StrVector(list(x) if isinstance(x, (list, tuple)) else [str(x)])
13 |
14 | def dsldPyConditDisparity(data, yName, sName, xName, condits=None, qeFtn="qeKNN", minS=50, useLoess=True):
15 |     r_data = dsld_Rpy2_IsRDataframe(data)
16 |     yName_r = StrVector([yName])
17 |     sName_r = StrVector([sName])
18 |     xName_r = StrVector([xName])
19 |     condits_r = _maybe_strvec(condits)
20 |
21 |     qeML = importr("qeML")
22 |     if hasattr(qeML, qeFtn) and callable(getattr(qeML, qeFtn)):
23 |         qeFtn_r = getattr(qeML, qeFtn)
24 |     else:
25 |         print(f"ERROR: qeML does not have a function named '{qeFtn}'"); return
26 |
27 |     minS_r = IntVector([int(minS)])
28 |     useLoess_r = BoolVector([bool(useLoess)])
29 |
30 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
31 |     grdevices = importr("grDevices")
32 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
33 |     try:
34 |         dsld = get_dsld()
35 |         res = dsld.dsldConditDisparity(r_data, yName_r, sName_r, xName_r, condits_r, qeFtn_r, minS_r, useLoess_r)
36 |         try: ro.r("print")(res)
37 |         except: pass
38 |     finally:
39 |         grdevices.dev_off()
40 |
41 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
42 |     return
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyScatterPlot3D.py:
--------------------------------------------------------------------------------
1 | # OVERVIEW:
2 | # No need to access dsldScatterPlot3D from dsld-package.
3 | # The function uses the package plotly in R, which is also available in Python.
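# A minimal usage sketch (assuming a pandas DataFrame df with three numeric
# columns and a factor column, as in the bundled svcensus data):
#   fig = dsldPyScatterPlot3D(df, ["age", "wageinc", "wkswrkd"], "gender")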
4 | # This file requires packages: pandas, plotly, pyreadr
5 | #
6 |
7 | import pandas as pd
8 | import plotly.express as px
9 | from rpy2.robjects import pandas2ri
10 | from rpy2.robjects import vectors as rvectors
11 | from rpy2.robjects import conversion, default_converter
12 |
13 | def dsldPyScatterPlot3D(data, yNames, sName, *, bin_numeric_color=True, n_bins=5, renderer=None):
14 |     if not isinstance(yNames, (list, tuple)) or len(yNames) != 3:
15 |         raise ValueError("yNames must be a list of exactly 3 column names [x, y, z].")
16 |
17 |     # Accept either pandas DataFrame or R data.frame; convert R -> pandas
18 |     if isinstance(data, pd.DataFrame):
19 |         df = data.copy()
20 |     elif isinstance(data, rvectors.DataFrame):
21 |         with conversion.localconverter(default_converter + pandas2ri.converter):
22 |             df = conversion.rpy2py(data)
23 |     else:
24 |         raise TypeError("data must be a pandas DataFrame or an R data.frame")
25 |     color_col = sName
26 |
27 |     # If sName is numeric with many unique values, bin for discrete legend
28 |     if bin_numeric_color and pd.api.types.is_numeric_dtype(df[sName]) and df[sName].nunique() > 20:
29 |         color_col = f"{sName}_bin"
30 |         df[color_col] = pd.qcut(df[sName], q=n_bins, duplicates="drop")
31 |
32 |     fig = px.scatter_3d(
33 |         df,
34 |         x=yNames[0],
35 |         y=yNames[1],
36 |         z=yNames[2],
37 |         color=color_col,
38 |         opacity=0.75
39 |     )
40 |
41 |     fig.update_traces(marker=dict(size=4))
42 |     fig.update_layout(legend_title_text=sName)
43 |
44 |     if renderer:
45 |         fig.show(renderer=renderer)
46 |     else:
47 |         fig.show()
48 |
49 |     return fig
--------------------------------------------------------------------------------
/man/dsldConditDisparity.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldConditDisparity}
2 | \alias{dsldConditDisparity}
3 | \title{dsldConditDisparity}
4 |
5 | \description{
6 | Plots (estimated) mean Y against X, separately for each level of S,
7 | with restrictions \code{condits}. May reveal Simpson's Paradox-like
8 | differences not seen in merely plotting mean Y against X.
9 | }
10 |
11 | \usage{
12 | dsldConditDisparity(data, yName, sName, xName, condits = NULL,
13 |     qeFtn = qeKNN, minS = 50, useLoess = TRUE)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{Data frame or equivalent.}
18 |     \item{yName}{Name of predicted variable Y. Must be numeric
19 |         or a dichotomous R factor.}
20 |     \item{sName}{Name of the sensitive variable S, an R factor.}
21 |     \item{xName}{Name of a numeric column for the X-axis.}
22 |     \item{condits}{An R vector; each component is a character
23 |         string for an R logical expression representing a desired
24 |         condition involving \code{names(data)} other than S and Y.}
25 |     \item{qeFtn}{\code{qeML} predictive function (not quoted;
26 |         only default arguments will be used).}
27 |     \item{minS}{Minimum size for an S group to be retained in the analysis.}
28 |     \item{useLoess}{If TRUE, do loess smoothing on the fitted regression values.}
29 | }
30 |
31 | \author{
32 | N. Matloff, A. Ashok, S. Martha, A. Mittal
33 | }
34 |
35 | \value{No value; plot.}
36 |
37 | \examples{
38 | \donttest{
39 | data(compas1)
40 | # graph probability of recidivism by race given age, among those with at
41 | # most 4 prior convictions and COMPAS decile score at least 6
42 | compas1$two_year_recid <- as.numeric(compas1$two_year_recid == "Yes")
43 | dsldConditDisparity(compas1,"two_year_recid", "race", "age",
44 |     c("priors_count <= 4","decile_score>=6"), qeKNN)
45 |
46 | dsldConditDisparity(compas1,"two_year_recid", "race", "age",
47 |     "priors_count == 0", qeGBoost)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/R/dsldMatching.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | # finds the estimated mean difference between the matched Y pairs in the
4 | # treated/nontreated (exposed and non-exposed) groups, with covariates
5 | # X in 'data' other than the yName and sName columns
6 |
7 | # sName here is the "treatment" or "exposure," S
8 |
9 | # dsld wrapper for Matching::Match; optional propensFtn must be either
10 | # 'glm' for logit or 'knn' for qeKNN
11 |
12 | # in that optional case, we estimate P(S = 1 | X), either by a logistic
13 | # or k-NN model
14 |
15 | # due to the fact that various function calls require different argument
16 | # types, we may generate several different versions of a variable; e.g.
17 | # S is a factor but we also need logical and numeric versions
18 |
19 | dsldMatchedATE <- function(data,yName,sName,yesSVal,yesYVal=NULL,
20 |     propensFtn=NULL,k=NULL)
21 | {
22 |    getSuggestedLib("Matching")
23 |
24 |    ycol <- which(names(data) == yName)
25 |    y <- data[,ycol]
26 |
27 |    if (is.factor(y)) {
28 |       yLvls <- levels(y)
29 |       if (length(yLvls) != 2)
30 |          stop('factor Y can only be dichotomous')
31 |       yNum <- as.integer(y == yesYVal)
32 |       dichotY <- TRUE
33 |    } else {
34 |       yNum <- y
35 |       dichotY <- FALSE
36 |    }
37 |
38 |    scol <- which(names(data) == sName)
39 |    s <- data[,scol]
40 |    sLog <- (s == yesSVal)
41 |    sNum <- as.integer(sLog)
42 |
43 |    x <- data[,-c(ycol,scol)]
44 |    if (!allNumeric(x))
45 |       xNum <- factorsToDummies(x,omitLast=TRUE,dfOut=TRUE)
46 |    else xNum <- as.matrix(x)
47 |
48 |    if (!is.null(propensFtn)) {
49 |       if (propensFtn == 'glm') {
50 |          matchVals <- glm(sNum ~ xNum,family=binomial)$fitted.values
51 |       } else { # qeKNN
52 |          tmp <- qeKNN(data[,-ycol],sName,yesYVal=yesSVal,k=k,holdout=NULL)
53 |          matchVals <- tmp$regests
54 |       }
55 |       xNum <- matchVals
56 |    }
57 |
58 |    matchOut <- Matching::Match(Y=y,Tr=sLog,X=xNum,estimand='ATE',ties=FALSE)
59 |    matchOut
60 |
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/man/dsldPropens.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldMatchedATE}
2 | \alias{dsldMatchedATE}
3 |
4 | \title{dsldMatchedATE}
5 |
6 | \description{
7 | Causal inference via matching models.
8 | Wrapper for \code{Matching::Match}.
9 | }
10 |
11 | \usage{
12 | dsldMatchedATE(data,yName,sName,yesSVal,yesYVal=NULL,
13 |     propensFtn=NULL,k=NULL)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{Data frame.}
18 |     \item{yName}{Name of the response variable column.}
19 |     \item{sName}{Name of the sensitive attribute column. The
20 |         attribute must be dichotomous.}
21 |     \item{yesSVal}{S value to be considered "yes," to be coded
22 |         1 rather than 0.}
23 |     \item{yesYVal}{Y value to be considered "yes," to be coded
24 |         1 rather than 0.}
25 |     \item{propensFtn}{Either 'glm' (logistic), or 'knn'.}
26 |     \item{k}{Number of nearest neighbors if \code{propensFtn='knn'}.}
27 | }
28 |
29 | \value{
30 |
31 | Object of class 'Match'. See documentation in the
32 | \pkg{Matching} package.
33 | }
34 |
35 | \details{
36 |
37 | This is a \pkg{dsld} wrapper for \code{Matching::Match}.
38 |
39 | Matched analysis is typically applied to measuring "treatment effects,"
40 | but is often applied in situations in which the "treatment," S here, is
41 | an immutable attribute such as race or gender. The usual issues
42 | concerning observational studies apply.
43 |
44 | The function \code{dsldMatchedATE} finds the estimated mean difference
45 | between the matched Y pairs in the treated/nontreated (exposed and
46 | non-exposed) groups, with covariates X in \code{data} other than the
47 | \code{yName} and \code{sName} columns.
48 |
49 | In the propensity model case, we estimate P(S = 1 | X), either by a logistic
50 | or k-NN model.
51 | }
52 |
53 | \author{
54 | N. Matloff
55 | }
56 |
57 | \examples{
58 |
59 | data(lalonde,package='Matching')
60 | ll <- lalonde
61 | ll$treat <- as.factor(ll$treat)
62 | ll$re74 <- NULL
63 | ll$re75 <- NULL
64 | summary(dsldMatchedATE(ll,'re78','treat','1'))
65 | summary(dsldMatchedATE(ll,'re78','treat','1',propensFtn='glm'))
66 | summary(dsldMatchedATE(ll,'re78','treat','1',propensFtn='knn',k=15))
67 | }
68 |
--------------------------------------------------------------------------------
/R/dsldML.R:
--------------------------------------------------------------------------------
1 |
2 | # like dsldLinear and dsldLogit, but for machine learning (i.e.
3 | # nonparametric) prediction algorithms
4 |
5 | # args:
6 |
7 | #    data, yName, sName as usual
8 |
9 | #    sComparisonPts as in the with-interactions case of dsldLinear()
10 | #    (nonparametric case necessarily has interactions)
11 |
12 | #    qeMLftnName is, e.g. 'qeKNN'; opts is an R list of optional arguments
13 |    # for that function
14 |
15 | dsldML <- function(data,yName,sName,qeMLftnName,sComparisonPts='rand5',opts=NULL){
16 |
17 |    ycol <- which(names(data) == yName)
18 |    scol <- which(names(data) == sName)
19 |    slevels <- levels(data[,scol])
20 |
21 |    factors_info = factor_levels(data)
22 |
23 |    if (sComparisonPts=='rand5'){
24 |       rows <- sample(nrow(data), 5)
25 |       reducedData <- data[rows, ]
26 |       columns <- c(yName, sName)
27 |       sComparisonPts <- reducedData[, !(names(reducedData) %in% columns)]
28 |       sComparisonPts <- apply_factor_levels(sComparisonPts, factors_info)
29 |    }
30 |
31 |    sComparisonPts <- apply_factor_levels(sComparisonPts, factors_info)
32 |
33 |    # called from lapply(), calling the QE function on the subset of data
34 |    # corresponding to the specified level of the sensitive variable S
35 |    do1Slevel <- function(sLevel)
36 |    {
37 |       subData <- data[data[,scol]==sLevel,]
38 |       subData <- subData[,-scol]
39 |       opts[['data']] <- subData
40 |       opts[['yName']] <- yName
41 |       do.call(qeMLftnName,opts)
42 |    }
43 |
44 |    qeOut <- lapply(slevels,do1Slevel)
45 |    names(qeOut) <- slevels
46 |
47 |    testAccs <- sapply(qeOut,function(qeo) qeo$testAcc)
48 |    res <- list(testAccs = testAccs)
49 |
50 |    tmp <- sComparisonPts
51 |    for (sl in slevels) {
52 |       # predicted values are the values of the estimated regression
53 |       # function, just what we want
54 |       preds <- predict(qeOut[[sl]],sComparisonPts)
55 |       if (qeOut[[1]]$classif) {
56 |          if (is.null(preds$probs)) stop('ML function does not return "probs"')
57 |          preds <- preds$probs
58 |       } else preds <- as.vector(preds)
59 |       tmp[[sl]] <- preds
60 |    }
61 |
62 |    res$comparisons <- tmp
63 |
64 |    return(res)
65 | }
66 |
67 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyML.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldML from the dsld R package.
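A minimal usage sketch (assuming a pandas DataFrame df shaped like the
bundled svcensus data; opts is passed through to the qeML function):
    accs_dict, comparisons_df = dsldPyML(df, 'wageinc', 'gender', 'qeKNN', opts={'k': 50})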
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 |
7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
8 | import sys
9 | import pandas as pd
10 | import rpy2.robjects as robjects
11 | from rpy2.robjects import pandas2ri
12 | from rpy2.robjects import conversion
13 | from rpy2.robjects.packages import importr
14 | import rpy2.robjects as ro
15 | import math
16 | from rpy2.robjects.vectors import ListVector, FloatVector
17 | from .Utils import dsld_Rpy2_RDataframeToPandas
18 | import pandas as pd
19 |
20 | # Import R packages
21 |
22 | def map_last_k(df: pd.DataFrame, values: list):
23 |     k = len(values)
24 |     last_k_cols = df.columns[-k:]
25 |     # add "testAcc: " prefix to each column name
26 |     prefixed_cols = [f"testAcc: {col}" for col in last_k_cols]
27 |     return dict(zip(prefixed_cols, values))
28 |
29 | def dsldPyML(data, yName, sName, qeMLftnName, sComparisonPts='rand5', opts=None):
30 |
31 |     r_data = dsld_Rpy2_IsRDataframe(data)
32 |     yName = robjects.StrVector([yName])
33 |     sName = robjects.StrVector([sName])
34 |     qeMLftnName = robjects.StrVector([qeMLftnName])
35 |
36 |     if sComparisonPts != 'rand5':
37 |         if isinstance(sComparisonPts, pd.DataFrame):
38 |             sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts)
39 |         else:
40 |             sComparisonPts = robjects.StrVector([sComparisonPts])
41 |     else:
42 |         sComparisonPts = robjects.StrVector(['rand5'])
43 |
44 |     if opts is not None:
45 |         opts = ListVector({k: FloatVector([v]) for k, v in opts.items()})
46 |     else:
47 |         opts = robjects.NULL
48 |
49 |     # call dsldML in R
50 |     dsld = get_dsld()
51 |     model = dsld.dsldML(r_data, yName, sName, qeMLftnName, sComparisonPts, opts)
52 |
53 |     test_accuracies = model[0]
54 |     comparison_points = dsld_Rpy2_RDataframeToPandas(model[1])
55 |     comparison_points_dict = map_last_k(comparison_points, test_accuracies)
56 |
57 |     return comparison_points_dict, comparison_points
--------------------------------------------------------------------------------
/man/dsldML.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldML}
2 | \alias{dsldML}
3 | \title{dsldML}
4 |
5 | \description{
6 | Nonparametric comparison of sensitive groups.
7 | }
8 |
9 | \usage{
10 | dsldML(data,yName,sName,qeMLftnName,sComparisonPts='rand5',opts=NULL)
11 | }
12 |
13 | \arguments{
14 |     \item{data}{
15 |         A data frame.
16 |     }
17 |     \item{yName}{
18 |         Name of the response variable column.
19 |     }
20 |     \item{sName}{
21 |         Name(s) of the sensitive attribute column(s).
22 |     }
23 |     \item{qeMLftnName}{
24 |         Quoted name of a prediction function in the \code{qeML} package.
25 |     }
26 |     \item{sComparisonPts}{
27 |         Data frame of one or more data points at which the regression
28 |         function is to be estimated for each level of S. If this is
29 |         'rand5', then the said data points will consist of five randomly
30 |         chosen rows in the original dataset.
31 |     }
32 |     \item{opts}{
33 |         An R list specifying arguments for the above \code{qeML} function.
34 |     }
35 | }
36 |
37 | \author{
38 | N. Matloff
39 | }
40 |
41 | \examples{
42 |
43 | ## applying K-NN
44 | ## also works for: qeRF, qeRFranger, qeLASSO, qePolyLin/qePolyLog, qeXGBoost
45 |
46 | data(svcensus)
47 |
48 | w <- dsldML(svcensus,'wageinc','gender',qeMLftnName='qeKNN',
49 |     opts=list(k=50))
50 |
51 | # prints testAcc for each level in sName and the predictions on sComparisonPts
52 | print(w)
53 |
54 | }
55 |
56 | \details{
57 |
58 | In a linear model with no interactions, one can speak of "the"
59 | difference in mean Y given X across treatments, independent of X.
60 | In a nonparametric analysis, there is interaction by definition,
61 | and one can only speak of differences across treatments for a
62 | specific X value. Hence the need for the argument
63 | \code{sComparisonPts}.
64 |
65 | The specified \code{qeML} function will be called on the indicated data once
66 | for each level of the sensitive variable. For each such level, estimated
67 | regression function values will be obtained for each row in
68 | \code{sComparisonPts}.
69 | }
70 |
71 | \value{
72 |
73 | An R list. The first component consists of the holdout-set prediction
74 | accuracies, while the second is a data frame of predicted values for each
75 | sensitive group.
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFairUtils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Python interface for dsldFairUtils functions in the dsld R package.
3 | '''
4 |
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 | import rpy2.robjects as robjects
8 | import numpy as np
9 | import pandas as pd
10 |
11 | def _to_float_vector(x):
12 |     if x is None:
13 |         return robjects.NULL
14 |     if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
15 |         return robjects.FloatVector([float(v) for v in x])
16 |     return robjects.FloatVector([float(x)])
17 |
18 | def _to_int_vector_scalar(x):
19 |     if x is None:
20 |         return robjects.NULL
21 |     return robjects.IntVector([int(x)])
22 |
23 | def _to_scalar_vector(x):
24 |     if x is None:
25 |         return robjects.NULL
26 |     if isinstance(x, (int, np.integer)):
27 |         return robjects.IntVector([int(x)])
28 |     if isinstance(x, (float, np.floating)):
29 |         return robjects.FloatVector([float(x)])
30 |     return robjects.StrVector([str(x)])
31 |
32 | def _to_str_vector(x):
33 |     if x is None:
34 |         return robjects.NULL
35 |     if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
36 |         return robjects.StrVector([str(v) for v in x])
37 |     return robjects.StrVector([str(x)])
38 |
39 |
40 | def dsldPyFairUtils(data, yName, sName, dsldFTNname,
41 |                     unfairness=None, deweightPars=None,
42 |                     yesYVal=None, k_folds=5):
43 |
44 |     r_data = dsld_Rpy2_IsRDataframe(data)
45 |
46 |     yName_r = robjects.StrVector([yName])   # keep single y
47 |     sName_r = _to_str_vector(sName)         # str or list -> R character vector
48 |     dsldFTNname_r = robjects.StrVector([dsldFTNname])
49 |
50 |     unfairness_r = _to_float_vector(unfairness)
51 |
52 |     if deweightPars is not None:
53 |         deweightPars_r = robjects.ListVector(
54 |             {k: _to_float_vector(v) for k, v in deweightPars.items()}
55 |         )
56 |     else:
57 |         deweightPars_r = robjects.NULL
58 |
59 |     yesYVal_r = _to_scalar_vector(yesYVal)
60 |     k_folds_r = _to_int_vector_scalar(k_folds)
61 |
62 |     dsld = get_dsld()
63 |     model = dsld.dsldFairUtils(
64 |         r_data, yName_r, sName_r, dsldFTNname_r,
65 |         unfairness_r, deweightPars_r, yesYVal_r, k_folds_r, robjects.NULL
66 |     )
67 |     return dsld_Rpy2_RDataframeToPandas(model)
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyLogit.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldLogit from the dsld R package.
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 |
7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
8 | import rpy2.robjects as robjects
9 | from rpy2.robjects import pandas2ri
10 | from rpy2.robjects import conversion
11 | from rpy2.robjects.packages import importr
12 |
13 |
14 | # Import R packages
15 |
16 | def dsldPyLogit(data, yName, sName, sComparisonPts=None, interactions=False, yesYVal=None):
17 |
18 |     r_data = dsld_Rpy2_IsRDataframe(data)
19 |     yName = robjects.StrVector([yName])
20 |     sName = robjects.StrVector([sName])
21 |     interactions = robjects.BoolVector([interactions])
22 |     yesYVal = robjects.StrVector([yesYVal]) if yesYVal is not None else robjects.NULL
23 |
24 |     if sComparisonPts is not None:
25 |         sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts)
26 |     else:
27 |         sComparisonPts = robjects.NULL
28 |
29 |     # call dsldLogit in R
30 |     dsld = get_dsld()
31 |     model = dsld.dsldLogit(r_data, yName, sName, sComparisonPts, interactions, yesYVal)
32 |     return model
33 |
34 | def dsldPyLogitSummary(dsldLogit):
35 |     robjects.r.assign("dsldLogit", dsldLogit)
36 |     result = robjects.r('summary(dsldLogit)')
37 |     print(result)
38 |     return result
39 |
40 | def dsldPyLogitCoef(dsldLogit):
41 |     robjects.r.assign("dsldLogit", dsldLogit)
42 |     result = robjects.r('coef(dsldLogit)')
43 |     print(result)
44 |     return result
45 |
46 | def dsldPyLogitVcov(dsldLogit):
47 |     robjects.r.assign("dsldLogit", dsldLogit)
48 |     result = robjects.r('vcov(dsldLogit)')
49 |     print(result)
50 |     return result
51 |
52 | def dsldPyLogitGetData(dsldLogit):
53 |     robjects.r.assign("dsldLogit", dsldLogit)
54 |     result = robjects.r('dsldGetData(dsldLogit)')
55 |     print(result)
56 |     return result
57 |
58 | def dsldPyLogitPredict(dsldLogit, newData):
59 |     robjects.r.assign("dsldLogit", dsldLogit)
60 |     xNew = dsld_Rpy2_IsRDataframe(newData)
61 |     # xNew = dsld.convert_cols(newData, cat_features, num_features)
62 |     robjects.r.assign("xNew", xNew)
63 |     result = robjects.r('predict(dsldLogit, xNew)')
64 |     with conversion.localconverter(pandas2ri.converter):
65 |         result_py = conversion.rpy2py(result)
66 |     return result_py
--------------------------------------------------------------------------------
/man/dsldTakeALookAround.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldTakeALookAround}
2 | \alias{dsldTakeALookAround}
3 |
4 | \title{dsldTakeALookAround}
5 |
6 | \description{
7 |
8 | Evaluate feature sets for predicting Y while considering the
9 | Fairness-Utility Tradeoff.
10 | }
11 |
12 | \usage{
13 | dsldTakeALookAround(data, yName, sName, maxFeatureSetSize = (ncol(data) - 2),
14 |     holdout = floor(min(1000,0.1*nrow(data))))
15 | }
16 |
17 | \arguments{
18 |     \item{data}{
19 |         Data frame.
20 |     }
21 |     \item{yName}{
22 |         Name of the response variable column.
23 |     }
24 |     \item{sName}{
25 |         Name of the sensitive attribute column.
26 |     }
27 |     \item{maxFeatureSetSize}{
28 |         Maximum size of the feature combinations to be
29 |         evaluated.
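        For example, with \code{maxFeatureSetSize = 2}, all single features
        and all feature pairs are evaluated.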
30 |
31 |     }
32 |     \item{holdout}{
33 |         If not NULL, form a holdout set of the specified size. After fitting to the
34 |         remaining data, evaluate accuracy on the holdout set.
35 |     }
36 | }
37 |
38 | \details{
39 |
40 | This function provides a tool for exploring feature combinations to use
41 | in predicting an outcome Y from features X and a sensitive variable S.
42 |
43 | The features in X will first be considered singly, then doubly and so
44 | on, up through feature combination size \code{maxFeatureSetSize}. Y is
45 | predicted from X using either a linear model (numeric Y) or a logit model
46 | (dichotomous Y).
47 |
48 | The accuracy (based on qeML holdout) will be computed for each of these
49 | cases: (a) Y predicted from the given feature combination C, (b) Y
50 | predicted from the given feature combination C plus S, and (c) S predicted
51 | from C. The difference between columns 'a' and 'b' shows the sacrifice
52 | in utility stemming from not using S in our prediction of Y. (Due to
53 | sampling variation, it is possible for column 'b' to be larger than
54 | 'a'.) The value in column 'c' shows fairness, the smaller the fairer.
55 |
56 | }
57 |
58 | \author{
59 | N. Matloff, A. Ashok, S. Martha, A. Mittal
60 | }
61 |
62 | \examples{
63 | \donttest{
64 | # investigate predictive accuracy for a continuous Y,
65 | # 'wageinc', using the default arguments for maxFeatureSetSize = 4
66 | data(svcensus)
67 | dsldTakeALookAround(svcensus, 'wageinc', 'gender', 4)
68 |
69 | # investigate the predictive accuracy for a categorical Y,
70 | # 'educ', using the default arguments for maxFeatureSetSize = 4
71 | dsldTakeALookAround(svcensus, 'educ', 'gender')}
72 | }
73 |
74 | \value{Data frame whose first column consists of the variable names,
75 | followed by columns 'a', 'b' and 'c' as described in 'details'.}
76 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFreqPCoord.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from typing import Optional, Union, Sequence
4 | from IPython.display import Image, display
5 |
6 | import pandas as pd
7 | import rpy2.robjects as ro
8 | from rpy2.robjects.vectors import IntVector, StrVector, BoolVector
9 | from rpy2.robjects.packages import importr
10 |
11 | from .Utils import dsld_Rpy2_IsRDataframe, get_dsld
12 |
13 | def _maybe_intvec(x):
14 |     if x is None:
15 |         return ro.NULL
16 |     if isinstance(x, (list, tuple)):
17 |         return IntVector(list(x))
18 |     return IntVector([int(x)])
19 |
20 | def _maybe_strvec(x: Optional[Union[str, Sequence[str]]]):
21 |     if x is None:
22 |         return ro.NULL
23 |     if isinstance(x, (list, tuple)):
24 |         return StrVector(list(x))
25 |     return StrVector([str(x)])
26 |
27 | def dsldPyFreqPCoord(data, m, sName, method = "maxdens", faceting = "vert", k = 50, klm = None, keepidxs = None, plotidxs = False, cls = None, plot_filename = None, show = True):
28 |
29 |     # Prepare inputs (pass scalars where R expects scalars)
30 |     r_data = dsld_Rpy2_IsRDataframe(data)
31 |     r_m = int(m)
32 |     r_sName = _maybe_strvec(sName)
33 |     r_method = str(method)
34 |     r_faceting = str(faceting)
35 |     r_k = int(k)
36 |     if klm is None:
37 |         klm = 5 * k
38 |     r_klm = int(klm)
39 |     r_keepidxs = _maybe_intvec(keepidxs)
40 |     r_plotidxs = bool(plotidxs)
41 |     r_cls = _maybe_strvec(cls)
42 |
43 |     # Case A: user provided output filename
44 |     if plot_filename:
45 |         dsld = get_dsld()
46 |         res = dsld.dsldFreqPCoord(
47 |             r_data, r_m, r_sName, r_method, r_faceting, r_k, r_klm,
48 |             r_keepidxs, r_plotidxs, r_cls, plot_filename
49 |         )
50 |
51 |         try:
52 |             ro.r("print")(res)
53 |         except Exception:
54 |             pass
55 |         return
56 |
57 |     # Case B: capture to a temporary PNG and show
58 |     fd, tmpfile = tempfile.mkstemp(suffix=".png")
59 |     os.close(fd)
60 |     try:
61 |         grdevices = importr("grDevices")
62 |         grdevices.png(file=tmpfile, width=1200, height=800, res=150)
63 |         dsld = get_dsld()
64 |         res = dsld.dsldFreqPCoord(
65 |             r_data, r_m, r_sName, r_method, r_faceting, r_k, r_klm,
66 |             r_keepidxs, r_plotidxs, r_cls, ro.NULL
67 |         )
68 |
69 |         try:
70 |             ro.r("print")(res)
71 |         except Exception:
72 |             pass
73 |     finally:
74 |         grdevices.dev_off()
75 |
76 |     if os.path.exists(tmpfile):
77 |         display(Image(filename=tmpfile))
78 |
--------------------------------------------------------------------------------
/man/dsldFairUtils.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFairUtils}
2 | \alias{dsldFairUtils}
3 | \title{dsldFairUtils}
4 |
5 | \description{
6 | Exploration of the Fairness-Utility Tradeoff. Finds predictive accuracy
7 | and correlation between S and predicted Y.
8 | }
9 |
10 | \usage{
11 | dsldFairUtils(data, yName, sName, dsldFTNName, unfairness = NULL,
12 | deweightPars = NULL, yesYVal = NULL, k_folds = 5, model_args = NULL)
13 | }
14 |
15 | \arguments{
16 | \item{data}{
17 | Data frame.
18 | }
19 | \item{yName}{
20 | Name of the response variable Y column. Y must be numeric or
21 | binary (two-level R factor).
22 | }
23 | \item{sName}{
24 | Name of the sensitive attribute S column.
25 | }
26 | \item{dsldFTNName}{
27 | Quoted name of one of the \pkg{fairml} or EDF functions.
28 | }
29 | \item{unfairness}{
30 | Vector of unfairness values. Must be non-NULL for the \pkg{fairml} functions.
31 | }
32 | \item{deweightPars}{
33 | Named list giving a grid of deweighting values. Must be non-NULL for the EDF functions.
34 | }
35 | \item{yesYVal}{
36 | Y value to be treated as Y = 1 for binary Y.
37 | }
38 | \item{k_folds}{
39 | Number of folds to use in k-fold cross-validation.
40 | The final result is reported as the average across all folds.
41 | }
42 |
43 | \item{model_args}{
44 | A named list of additional arguments passed directly to \code{dsldFTNName}.
45 | For example, \code{model_args = list(k = 25)}.
46 | }
47 | }
48 |
49 | \author{
50 | A. Mittal, N. Matloff
51 | }
52 |
53 | \examples{
54 |
55 | \donttest{
56 | data(svcensus)
57 |
58 | ## regression examples shown --- also works for classification
59 | dsldFairUtils(svcensus,
60 | 'wageinc',
61 | 'gender',
62 | 'dsldQeFairKNN',
63 | k_folds = 5,
64 | model_args = list(k = 25),
65 | deweightPars = list('occ' = c(0.9,0.2), 'educ' = c(0.3, 0.9)))
66 |
67 | dsldFairUtils(svcensus,
68 | 'wageinc',
69 | 'gender',
70 | 'dsldFrrm',
71 | k_folds = 5,
72 | unfairness = c(0.9, 0.6, 0.1, 0.05, 0.005))
73 | }
74 | }
75 |
76 | \details{
77 |
78 | Tool for exploring the tradeoff between utility (predictive accuracy: Mean
79 | Absolute Prediction Error or overall probability of misclassification)
80 | and fairness. Roughly speaking, the latter is defined as the strength of
81 | relation between S and predicted Y (the smaller, the better).
82 |
83 | }
84 |
85 | \value{
86 |
87 | A data frame showing accuracy and correlation between predicted Y and S.
88 | 89 | } 90 | 91 | -------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyLinear.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file contains the interface code for calling the dsldLinear from dsld R package. 3 | The code uses rpy2 to handle dsld functions call from R and pandas library to check if 4 | users data input is in pandas data frame before doing any computation 5 | ''' 6 | 7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas 8 | import sys 9 | import pandas as pd 10 | import rpy2.robjects as robjects 11 | from rpy2.robjects import pandas2ri 12 | from rpy2.robjects import conversion 13 | from rpy2.robjects.packages import importr 14 | import rpy2.robjects as ro 15 | import math 16 | 17 | 18 | def dsldPyLinear(data, yName, sName, interactions=False, 19 | sComparisonPts=None, useSandwich=False): 20 | 21 | """Python wrapper for dsldLinear in the dsld R package""" 22 | 23 | r_data = dsld_Rpy2_IsRDataframe(data) 24 | yName = robjects.StrVector([yName]) 25 | sName = robjects.StrVector([sName]) 26 | interactions = robjects.BoolVector([interactions]) 27 | useSandwich = robjects.BoolVector([useSandwich]) 28 | 29 | if sComparisonPts is not None: 30 | sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts) 31 | else: 32 | sComparisonPts = robjects.NULL 33 | 34 | dsld = get_dsld() 35 | dsldLinearObj = dsld.dsldLinear(r_data, yName, sName, 36 | interactions, sComparisonPts, useSandwich) 37 | return dsldLinearObj 38 | 39 | def dsldPyLinearSummary(dsldLinear): 40 | robjects.r.assign("dsldLinear", dsldLinear) 41 | result = robjects.r('summary(dsldLinear)') 42 | print(result) 43 | return result 44 | 45 | def dsldPyLinearCoef(dsldLinear): 46 | robjects.r.assign("dsldLinear", dsldLinear) 47 | result = robjects.r('coef(dsldLinear)') 48 | print(result) 49 | return result 50 | 51 | def dsldPyLinearVcov(dsldLinear): 52 | robjects.r.assign("dsldLinear", dsldLinear) 53 | result = robjects.r('vcov(dsldLinear)') 54 | print(result) 55 | return result 56 | 57 | def dsldPyLinearGetData(dsldLinear): 58 | robjects.r.assign("dsldLinear", dsldLinear) 59 | result = robjects.r('dsldGetData(dsldLinear)') 60 | print(result) 61 | return result 62 | 63 | def dsldPyLinearPredict(dsldLinear, newData): 64 | robjects.r.assign("dsldLinear", dsldLinear) 65 | xNew = dsld_Rpy2_IsRDataframe(newData) 66 | # xNew = dsld.convert_cols(newData, cat_features, num_features) 67 | robjects.r.assign("xNew", xNew) 68 | result = robjects.r('predict(dsldLinear, xNew)') 69 | with conversion.localconverter(pandas2ri.converter): 70 | result_py = conversion.rpy2py(result) 71 | return result_py 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /man/dsldHunting.Rd: -------------------------------------------------------------------------------- 1 | \name{dsldCHunting and dsldOHunting} 2 | \alias{dsldCHunting} 3 | \alias{dsldOHunting} 4 | \title{Confounder and Proxy Hunting} 5 | 6 | \description{ 7 | Confounder hunting: searches for variables C that predict both Y and 8 | S. Proxy hunting: searches for variables O that predict S. 9 | } 10 | 11 | \usage{ 12 | dsldCHunting(data,yName,sName,intersectDepth=10) 13 | dsldOHunting(data,yName,sName) 14 | } 15 | 16 | \arguments{ 17 | \item{data}{ 18 | Data frame. 19 | } 20 | \item{yName}{ 21 | Name of the response variable column. 22 | } 23 | \item{sName}{ 24 | Name of the sensitive attribute column. 
25 | }
26 | \item{intersectDepth}{
27 | Maximum size of intersection of the Y predictor set and
28 | the S predictor set.
29 | }
30 | }
31 |
32 | \details{
33 |
34 | \code{dsldCHunting}: The random forests function
35 | \code{qeML::qeRF} will be run on the given data to assess feature
36 | importance in prediction of Y (without S) and S (without Y). Call
37 | these "important predictors" of Y and S.
38 |
39 | Then for each \code{i} from 1 to \code{intersectDepth}, the
40 | intersection of the top \code{i} important predictors of Y and
41 | the top \code{i} important predictors of S will be reported, thus
42 | suggesting possible confounders. Larger values of \code{i} will
43 | report more potential confounders, though including progressively
44 | weaker ones.
45 |
46 | The analyst may then consider omitting the variables C from
47 | models of the effect of S on Y.
48 |
49 | Note: Run times may be long.
50 |
51 | \code{dsldOHunting}: Factors, if any, will be converted to dummy
52 | variables, and then the Kendall Tau correlations will be calculated
53 | between S and potential proxy variables O, i.e. every column other
54 | than Y and S. (The Y column itself doesn't enter into computation.)
55 |
56 | In fairness analyses, in which one desires to either eliminate or
57 | reduce the impact of S, one must consider the indirect effect of S
58 | via O. One may wish to eliminate or reduce the role of O.
59 |
60 | }
61 |
62 | \author{
63 | N. Matloff
64 | }
65 |
66 | \value{
67 |
68 | The function \code{dsldCHunting} returns an R list, one component for
69 | each confounder set found.
70 |
71 | The function \code{dsldOHunting} returns an R matrix of correlations,
72 | one row for each level of S.
73 |
74 | }
75 |
76 | \examples{
77 | \donttest{
78 | data(lsa)
79 | dsldCHunting(lsa,'bar','race1')
80 | # e.g. suggests confounders 'decile3', 'lsat'
81 |
82 | data(mortgageSE)
83 | dsldOHunting(mortgageSE,'deny','black')
84 | # e.g. suggests using loan value and condo purchase as proxies
85 | }
86 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/Utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from PIL import Image
4 | import rpy2.robjects as robjects
5 | from rpy2.robjects import StrVector, FloatVector, ListVector
6 | from rpy2.robjects import pandas2ri
7 |
8 |
9 | import rpy2.robjects as ro
10 | from rpy2.robjects import default_converter
11 | from rpy2.robjects.conversion import localconverter
12 | from rpy2.robjects import pandas2ri
13 |
14 | import os
15 | import pyreadr
16 | from rpy2.robjects.packages import importr
17 |
18 |
19 | def get_dsld():
20 |     """Return the R 'dsld' package handle using the installed version in R."""
21 |     return importr("dsld")
22 |
23 | def get_dsld_version():
24 |     """Return the installed dsld version as a string (or None if unavailable)."""
25 |     try:
26 |         importr("utils")  # ensure utils is available
27 |         ver = ro.r('as.character(utils::packageVersion("dsld"))')[0]
28 |         return ver
29 |     except Exception:
30 |         return None
31 |
32 | ### data-frame conversion functions
33 | # This function converts a pandas data frame into an R data frame
34 | ## updated to remove deprecated function
35 | def dsld_Rpy2_IsRDataframe(data):
36 |     """
37 |     If data is an R data.frame, return it.
38 |     If data is a pandas DataFrame, convert to R and return.
39 |     Otherwise return -1.
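    Illustrative sketch (hypothetical data frame):

        import pandas as pd
        df = pd.DataFrame({'x': [1.0, 2.0], 'g': ['a', 'b']})
        r_df = dsld_Rpy2_IsRDataframe(df)  # rpy2 DataFrame, ready for dsld calls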
40 | """ 41 | if isinstance(data, ro.vectors.DataFrame): 42 | return data 43 | elif isinstance(data, pd.DataFrame): 44 | return dsld_Rpy2_PandasToRDataframe(data) 45 | else: 46 | print("Error: not Rdata or Pandas Dataframe") 47 | return -1 48 | 49 | ### helper functions for dsld_Rpy2_IsRDataframe 50 | def dsld_Rpy2_PandasToRDataframe(pandas_df: pd.DataFrame): 51 | """convert pandas -> R data.frame.""" 52 | with localconverter(default_converter + pandas2ri.converter): 53 | return ro.conversion.py2rpy(pandas_df) 54 | 55 | def dsld_Rpy2_RDataframeToPandas(r_df): 56 | """convert R data.frame -> pandas DataFrame.""" 57 | with localconverter(default_converter + pandas2ri.converter): 58 | return ro.conversion.rpy2py(r_df) 59 | 60 | ### reading data // data cleaning 61 | def read_data(filepath, **kwargs): 62 | 63 | ext = os.path.splitext(filepath)[1].lower() 64 | 65 | if ext == ".csv": 66 | return pd.read_csv(filepath, **kwargs) 67 | 68 | elif ext in [".rdata", ".rda"]: 69 | result = pyreadr.read_r(filepath) 70 | key = list(result.keys())[0] 71 | return result[key] 72 | 73 | else: 74 | raise ValueError(f"Unsupported file extension: {ext}") 75 | 76 | def preprocess_data(data, cat_features, num_features): 77 | r_data = dsld_Rpy2_IsRDataframe(data) 78 | dsld = get_dsld() 79 | r_data = dsld.convert_cols(r_data, cat_features, num_features) 80 | return r_data 81 | -------------------------------------------------------------------------------- /inst/src/dsldPy/__init__.py: -------------------------------------------------------------------------------- 1 | from .dsldPyQeFairML import ( 2 | dsldPyQeFairKNN, 3 | dsldPyQeFairRF, 4 | dsldPyQeFairRidgeLin, 5 | dsldPyQeFairRidgeLog, 6 | dsldPyQeFairML_Predict, 7 | ) 8 | from .dsldPyLinear import ( 9 | dsldPyLinear, 10 | dsldPyLinearSummary, 11 | dsldPyLinearCoef, 12 | dsldPyLinearVcov, 13 | dsldPyLinearGetData, 14 | dsldPyLinearPredict, 15 | ) 16 | from .dsldPyFairML import ( 17 | dsldPyFrrm, 18 | dsldPyFgrrm, 19 | dsldPyNclm, 20 | dsldPyZlm, 21 | dsldPyZlrm, 22 | dsldPyFairML_Summary, 23 | dsldPyFairML_Predict, 24 | ) 25 | from .dsldPyBnLearn import dsldPyIamb 26 | from .dsldPyScatterPlot3D import dsldPyScatterPlot3D 27 | from .dsldPyFreqPCoord import dsldPyFreqPCoord 28 | from .dsldPyConditDisparity import dsldPyConditDisparity 29 | from .dsldPyLogit import ( 30 | dsldPyLogit, 31 | dsldPyLogitSummary, 32 | dsldPyLogitCoef, 33 | dsldPyLogitVcov, 34 | dsldPyLogitGetData, 35 | dsldPyLogitPredict, 36 | ) 37 | from .dsldPyFrequencybyS import dsldPyFrequencybyS 38 | from .dsldPyConfounders import dsldPyConfounders 39 | from .dsldPyML import dsldPyML 40 | from .dsldPyTakeALookAround import dsldPyTakeALookAround 41 | from .dsldPyDensitybyS import dsldPyDensitybyS 42 | from .dsldPyMatching import dsldPyMatchedATE 43 | from .dsldPyHunting import dsldPyCHunting, dsldPyOHunting 44 | from .dsldPyFairUtils import dsldPyFairUtils 45 | from .Utils import ( 46 | dsld_Rpy2_IsRDataframe, 47 | dsld_Rpy2_PandasToRDataframe, 48 | dsld_Rpy2_RDataframeToPandas, 49 | read_data, 50 | preprocess_data, 51 | ) 52 | 53 | __all__ = [ 54 | 'dsldPyQeFairKNN', 55 | 'dsldPyQeFairRF', 56 | 'dsldPyQeFairRidgeLin', 57 | 'dsldPyQeFairRidgeLog', 58 | 'dsldPyQeFairML_Predict', 59 | 'dsldPyLinear', 60 | 'dsldPyLinearSummary', 61 | 'dsldPyLinearCoef', 62 | 'dsldPyLinearVcov', 63 | 'dsldPyLinearGetData', 64 | 'dsldPyLinearPredict', 65 | 'dsldPyFrrm', 66 | 'dsldPyFgrrm', 67 | 'dsldPyNclm', 68 | 'dsldPyZlm', 69 | 'dsldPyZlrm', 70 | 'dsldPyFairML_Summary', 71 | 'dsldPyFairML_Predict', 72 | 'dsldPyIamb', 
'dsldPyScatterPlot3D',
74 | 'dsldPyFreqPCoord',
75 | 'dsldPyConditDisparity',
76 | 'dsldPyLogit',
77 | 'dsldPyLogitSummary',
78 | 'dsldPyLogitCoef',
79 | 'dsldPyLogitVcov',
80 | 'dsldPyLogitGetData',
81 | 'dsldPyLogitPredict',
82 | 'dsldPyFrequencybyS',
83 | 'dsldPyConfounders',
84 | 'dsldPyML',
85 | 'dsldPyTakeALookAround',
86 | 'dsldPyDensitybyS',
87 | 'dsldPyMatchedATE',
88 | 'dsldPyCHunting',
89 | 'dsldPyOHunting',
90 | 'dsldPyFairUtils',
91 | 'dsld_Rpy2_IsRDataframe',
92 | 'dsld_Rpy2_PandasToRDataframe',
93 | 'dsld_Rpy2_RDataframeToPandas',
94 | 'read_data',
95 | 'preprocess_data',
96 | ]
97 |
98 | __version__ = '0.0.3'
99 |
--------------------------------------------------------------------------------
/R/dsldHunting.R:
--------------------------------------------------------------------------------
1 |
2 | # ad hoc aid in deciding which covariates one should treat as
3 | # confounders
4 |
5 | # we want to find variables C that are correlated with both Y and S
6 |
7 | # based on qeRF, which uses the 'randomForest' package; its output
8 | # includes a variable importance measure
9 |
10 | # importance here uses the permutation method, measuring deterioration
11 | # in prediction accuracy resulting from shuffling the given data column;
12 | # the greater the deterioration, the more important the variable
13 |
14 | # 'intersectDepth' specifies the number of prediction sets for each of Y
15 | # and S to examine for intersection; in datasets with many predictors,
16 | # this probably should be set to a larger value, or else each
17 | # intersection may be null
18 |
19 | dsldCHunting <- function(data, yName, sName, intersectDepth = 10) {
20 |
21 |   ycol <- which(names(data) == yName)
22 |   scol <- which(names(data) == sName)
23 |   y <- data[, ycol]
24 |   s <- data[, scol]
25 |
26 |   dataNoS <- data[, -scol]  # for predicting Y
27 |   dataNoY <- data[, -ycol]  # for predicting S
28 |
29 |   impY <- qeML::qeRF(dataNoS, yName)$importance
30 |   impS <- qeML::qeRF(dataNoY, sName)$importance
31 |
32 |   # the 'importance' output format has several different cases, which
33 |   # must be dealt with separately in extracting the actual importance
34 |   # vector
35 |   nlevsY <- length(levels(y))
36 |   if (is.numeric(y) || nlevsY == 2)
37 |     impY1 <- impY[, 1]
38 |   else if (is.factor(y)) {
39 |     impY1 <- impY[, nlevsY + 1]
40 |   }
41 |   else stop("Y must be numeric or an R factor")
42 |   if (!is.factor(s)) stop("S must be an R factor")
43 |   nlevsS <- length(levels(s))
44 |   if (nlevsS == 2) impS1 <- impS[, 1] else impS1 <- impS[, nlevsS + 1]
45 |
46 |   # larger values mean higher importance
47 |   impY1 <- sort(impY1, decreasing = TRUE)
48 |   impS1 <- sort(impS1, decreasing = TRUE)
49 |
50 |   # start assembling output
51 |   res <- list(impForY = impY1, impForS = impS1)
52 |   nmsY <- names(impY1)
53 |   nmsS <- names(impS1)
54 |   res$inCommon <- list()
55 |
56 |   # for each i, find the "top i" set of confounders, defined as being
57 |   # highly correlated with both Y and S
58 |   for (i in 1:min(intersectDepth, ncol(data) - 2)) {
59 |     res$inCommon[[i]] <- intersect(nmsY[1:i], nmsS[1:i])
60 |   }
61 |
62 |   return(res)
63 | }
64 |
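# illustrative sketch (not run), using the package's svcensus data; the
# column names are real svcensus columns, but the settings are arbitrary:
#    ch <- dsldCHunting(svcensus, 'wageinc', 'gender', intersectDepth = 5)
#    ch$inCommon   # candidate confounder sets, one per intersection depth i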
65 |
66 | # ad hoc aid in deciding which covariates one should treat as
67 | # proxies
68 |
69 | # we want to find variables O that are correlated with S; S need not be
70 | # binary/categorical
71 |
72 | # based on cor(), using Kendall's Tau in order to accommodate binary
73 | # variables (0,1 valued), and to mitigate effects of outliers
74 |
75 | dsldOHunting <- function(data,yName,sName)
76 | {
77 |
78 |    ycol <- which(names(data) == yName)
79 |    scol <- which(names(data) == sName)
80 |
81 |    sdumms <- regtools::factorsToDummies(data[,scol,drop=FALSE])
82 |    odumms <- regtools::factorsToDummies(data[,-c(ycol,scol),drop=FALSE])
83 |
84 |    cor(sdumms,odumms,method='kendall')
85 |
86 | }
87 |
88 |
89 |
--------------------------------------------------------------------------------
/man/dsldLogit.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldLogit}
2 | \alias{dsldLogit}
3 | \alias{predict.dsldGLM}
4 | \alias{coef.dsldGLM}
5 | \alias{vcov.dsldGLM}
6 | \alias{summary.dsldGLM}
7 | \title{dsldLogit}
8 |
9 | \description{
10 | Comparison of conditions for sensitive groups via logistic regression
11 | models, with or without interactions with the sensitive variable.
12 | }
13 |
14 | \usage{
15 | dsldLogit(data, yName, sName, sComparisonPts = NULL, interactions = FALSE,
16 | yesYVal)
17 | \method{summary}{dsldGLM}(object,...)
18 | \method{predict}{dsldGLM}(object,xNew,...)
19 | \method{coef}{dsldGLM}(object,...)
20 | \method{vcov}{dsldGLM}(object,...)
21 | }
22 |
23 | \arguments{
24 | \item{data}{
25 | Data frame used to train the logistic model; will be split according to
26 | each level of \code{sName} in output if \code{interactions} is TRUE.
27 | }
28 | \item{yName}{
29 | Name of the response variable column.
30 | }
31 | \item{sName}{
32 | Name of the sensitive attribute column.
33 | }
34 | \item{interactions}{
35 | If TRUE, fit interactions with the sensitive variable.
36 | }
37 | \item{sComparisonPts}{
38 | If \code{interactions} is TRUE, a
39 | data frame of new cases (minus Y,S) for which P(Y = 1| X)
40 | will be compared between each pair of S levels. Must be
41 | in the same format as the original data.
42 | }
43 | \item{yesYVal}{
44 | Y value to be considered 'yes', to be coded 1 rather than 0.
45 | }
46 | \item{object}{
47 | An object returned by \code{dsldLogit}.
48 | }
49 | \item{xNew}{
50 | Data frame of new cases to predict. Must be in the same format
51 | as \code{data}.
52 | }
53 | \item{...}{Further arguments.}
54 | }
55 |
56 | \author{
57 | N. Matloff, A. Mittal, A. Ashok
58 | }
59 |
60 | \examples{
61 |
62 | data(lsa)
63 |
64 | ### interactions case - exclude S and Y in newData
65 | newData <- lsa[c(2,22,222,2222),-c(8,11)]
66 | log1 <- dsldLogit(lsa,'bar','race1', newData, interactions = TRUE, 'TRUE')
67 |
68 | # extract results
69 | coef(log1)
70 | vcov(log1)
71 | summary(log1)
72 |
73 | # predict new data --- one prediction for each level of S per row
74 | predict(log1, newData)
75 |
76 | # no interaction case - exclude Y in newData
77 | newData <- lsa[c(2,22,222,2222),-c(11)]
78 | log2 <- dsldLogit(data = lsa, yName = 'bar',sName = 'gender',
79 | interactions = FALSE, yesYVal = 'TRUE')
80 |
81 | summary(log2)
82 |
83 | # predict on newData --- one prediction per row
84 | predict(log2, newData)
85 |
86 | }
87 |
88 | \details{
89 |
90 | The \code{dsldLogit} function fits a logistic
91 | regression model to the response variable. Interactions are handled
92 | as in \code{dsldLinear}.
93 |
94 | }
95 |
96 |
97 | \value{
98 |
99 | The \code{dsldLogit} function returns an S3 object of class 'dsldGLM',
100 | with one component for each level of S. Each component includes
101 | information about the fitted model.
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/man/dsldScatterPlot3D.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldScatterPlot3D}
2 | \alias{dsldScatterPlot3D}
3 | \title{ScatterPlot3D in dsld}
4 | \description{ Plotly 3D visualization of a dataset on 3 axes,
5 | with points color-coded on a 4th variable.}
6 | \usage{
7 | dsldScatterPlot3D(data, yNames, sName, sGroups = NULL, sortedBy =
8 | "Name", numGroups = 8, maxPoints = NULL, xlim = NULL,
9 | ylim = NULL, zlim = NULL, main = NULL, colors =
10 | "Paired", opacity = 1, pointSize = 8)}
11 |
12 | \arguments{
13 | \item{data}{
14 | Data frame with at least 4 columns.
15 | }
16 | \item{yNames}{
17 | Vector of the indices or names of the columns of the data frame to be
18 | graphed on the 3 axes.
19 | }
20 | \item{sName}{
21 | Index or name of the column containing the groups by which the data
22 | will be grouped. This will affect the colors of the points of the graph.
23 | This column must be an R factor.
24 | }
25 | \item{sGroups}{
26 | Vector of the names of the groups by which the data will be grouped.
27 | Every value in the vector must exist in the \code{sName} column of the data
28 | frame. If not supplied or NULL, the function will create this
29 | automatically according to the \code{sortedBy} and \code{numGroups} parameters.
30 | By default, the function uses the 8 alphabetically first distinct groups
31 | in the \code{sName} column.
32 | }
33 | \item{sortedBy}{
34 | Controls how \code{sGroups} is created automatically. If \code{sGroups}
35 | is supplied, this does nothing. One of three values: "Name", "Frequency",
36 | "Frequency-Descending".
37 |
38 | "Name" gets the first values alphabetically.
39 | "Frequency" gets the most frequently occurring values.
40 | "Frequency-Descending" gets the least frequently occurring values.
41 | }
42 | \item{numGroups}{
43 | Number of groups to be automatically generated by the function. If
44 | \code{sGroups} is supplied, this does nothing.
45 | }
46 | \item{maxPoints}{
47 | Limit to how many points may be displayed on the graph.
48 | There is no limit by default.
49 | }
50 | \item{xlim, ylim, zlim}{
51 | The x, y and z limits, each a vector with c(min, max).
52 | }
53 | \item{main}{
54 | The title of the graph. By default, the \code{yNames} columns
55 | followed by "by" and the \code{sName} column.
56 | }
57 | \item{colors}{
58 | Either a colorbrewer2.org palette name (e.g. "YlOrRd" or "Blues"),
59 | or a vector of colors to interpolate in hexadecimal "#RRGGBB" format,
60 | or a color interpolation function like colorRamp().
61 | }
62 | \item{opacity}{
63 | Point opacity, a value between 0 and 1.
64 | }
65 | \item{pointSize}{
66 | Point size, a value above 1.
67 | }
68 | }
69 |
70 | \details{
71 |
72 | An interactive Plotly visualization will be created, with the three
73 | variables specified in \code{yNames}. Points will be color-coded
74 | according to \code{sName}. The plot can be rotated etc. using the mouse.
75 |
76 | }
77 |
78 | \references{
79 | https://plotly.com/r/3d-scatter-plots/
80 | }
81 |
82 | \author{
83 | J. Tran and B.
Zarate
84 | }
85 |
86 | \examples{
87 | data(lsa)
88 | dsldScatterPlot3D(lsa,sName = "race1",
89 | yNames=c("ugpa", "lsat","age"), xlim=c(2,4))
90 | }
91 |
92 | \value{No value, plot.}
93 |
94 |
95 |
--------------------------------------------------------------------------------
/inst/README.md:
--------------------------------------------------------------------------------
1 | # dsldPy — Python Interface to DSLD
2 |
3 | Statistical and graphical tools for detecting and measuring discrimination and bias in datasets,
4 | with Python interfaces available via rpy2. **dsldPy** wraps the R package **dsld** with a Python-friendly API
5 | using the same underlying R implementations.
6 |
7 | **Relevant links:**
8 |
9 | - **Quarto Book**: [Book](https://htmlpreview.github.io/?https://github.com/matloff/dsldBook/blob/main/_book/index.html) - Important statistical principles and applications.
10 | - **Research Paper**: [Paper](https://arxiv.org/abs/2411.04228) - Package implementation details.
11 |
12 | ## Overview
13 |
14 | DSLD addresses two main types of bias analysis:
15 |
16 | - **Estimation analysis:** quantify possible discrimination by estimating effects of a sensitive variable S on an outcome Y, while adjusting for confounders C.
17 |
18 | - **Prediction analysis (fair ML):** build predictive models that limit the influence of S and its proxies O, trading off fairness and utility.
19 |
20 | **dsldPy** provides wrappers for all 24 R functions.
21 |
22 | ## Prerequisites
23 |
24 | - R installed and on PATH (R 4.x recommended)
25 | - R package dsld installed (CRAN or GitHub)
26 | - Python 3.8+
27 |
28 | Install dsld in R:
29 |
30 | ```r
31 | install.packages("dsld")
32 |
33 | ## or latest development version
34 | # install.packages("remotes")
35 | remotes::install_github("matloff/dsld", force = TRUE)
36 | ```
37 |
38 | Tip: Ensure rpy2 can find R. From a terminal, `R RHOME` should print your R home. If Python cannot find R, set `R_HOME` in your environment per rpy2's documentation.
39 |
40 | ## Installation
41 |
42 | Install the Python package (its source lives in subdirectory `inst` of this repository):
43 |
44 | ```bash
45 | pip install dsldPy
46 | ```
47 |
48 | This will install dsldPy and its Python dependencies (pandas, numpy, rpy2, etc.). The user still needs to install **R** and the **dsld** package manually, as noted above.
49 |
50 | ## Quickstart
51 |
52 | Please refer to the instructional Jupyter notebooks provided under the `examples/` folder. These illustrate all 24 **dsldPy** functions.
53 |
54 | Jupyter notebooks are available in this repository:
55 |
56 | - `inst/examples/graphical.ipynb`
57 | - `inst/examples/tabular.ipynb`
58 | - `inst/examples/machine_learning.ipynb`
59 |
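For orientation, a minimal call sequence looks like the sketch below; the data file path is illustrative, and the notebooks above contain the complete, tested workflows:

```python
from dsldPy import read_data, dsldPyLinear, dsldPyLinearSummary

# load a dataset; read_data() accepts .csv, .RData and .rda files
# (the path below is illustrative)
data = read_data("svcensus.RData")

# fit dsld's linear model, comparing wage income across gender levels
lin = dsldPyLinear(data, "wageinc", "gender")
dsldPyLinearSummary(lin)   # prints the R summary of the fitted model
```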
60 | ## Available Wrappers
61 |
62 | - Analytical: `dsldPyLinear`, `dsldPyLogit`, `dsldPyML`, `dsldPyMatchedATE`, `dsldPyTakeALookAround`, `dsldPyConfounders`, `dsldPyCHunting`, `dsldPyOHunting`
63 |
64 | - Fair ML: `dsldPyFrrm`, `dsldPyFgrrm`, `dsldPyNclm`, `dsldPyZlm`, `dsldPyZlrm`, `dsldPyQeFairKNN`, `dsldPyQeFairRF`, `dsldPyQeFairRidgeLin`, `dsldPyQeFairRidgeLog`, `dsldPyFairUtils`
65 |
66 | - Graphical: `dsldPyFreqPCoord`, `dsldPyScatterPlot3D`, `dsldPyConditDisparity`, `dsldPyDensitybyS`, `dsldPyFrequencybyS`, `dsldPyIamb`
67 |
68 | Function names mirror the R package. Arguments use standard Python types (pandas.DataFrame, dict, bool, etc.) with the same call forms as the R functions.
69 |
70 | ## Troubleshooting
71 |
72 | - rpy2 cannot find R: confirm `R RHOME` works; if not, add R to PATH or set `R_HOME`. See the rpy2 docs for your OS.
73 | - dsld not installed in R: run `install.packages("dsld")` in an R session.
74 |
75 | ## Authors
76 |
77 | - Norm Matloff
78 | - Aditya Mittal
79 | - Taha Abdullah
80 | - Arjun Ashok
81 | - Shubhada Martha
82 | - Billy Ouattara
83 | - Jonathan Tran
84 | - Brandon Zarate
85 |
86 | For issues, contact **Aditya Mittal** at mittalaa@uci.edu
--------------------------------------------------------------------------------
/man/dsldLinear.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldLinear}
2 | \alias{dsldLinear}
3 | \alias{predict.dsldLM}
4 | \alias{coef.dsldLM}
5 | \alias{vcov.dsldLM}
6 | \alias{summary.dsldLM}
7 | \title{dsldLinear}
8 |
9 | \description{
10 | Comparison of sensitive groups via linear models, with or
11 | without interactions with the sensitive variable.
12 | }
13 |
14 | \usage{
15 | dsldLinear(data, yName, sName, interactions = FALSE, sComparisonPts = NULL,
16 | useSandwich = FALSE)
17 | \method{summary}{dsldLM}(object,...)
18 | \method{predict}{dsldLM}(object,xNew,...)
19 | \method{coef}{dsldLM}(object,...)
20 | \method{vcov}{dsldLM}(object,...)
21 | }
22 |
23 | \arguments{
24 | \item{data}{
25 | Data frame.
26 | }
27 | \item{yName}{
28 | Name of the response variable Y column.
29 | }
30 | \item{sName}{
31 | Name of the sensitive attribute S column.
32 | }
33 | \item{interactions}{
34 | Logical value indicating whether or not to model interactions with the
35 | sensitive variable S.
36 | }
37 | \item{sComparisonPts}{
38 | If \code{interactions} is TRUE, a data frame of new
39 | cases for which mean Y | X will be compared across
40 | each pair of S levels. Must be in the same
41 | format as original data.
42 | }
43 | \item{useSandwich}{
44 | If TRUE, use the "sandwich" variance estimator.
45 | }
46 | \item{object}{
47 | An object returned by the \code{dsldLinear} function.
48 | }
49 | \item{xNew}{
50 | New data to be predicted. Must be in the same format as original data.
51 | }
52 | \item{...}{
53 | Further arguments.
54 | }
55 | }
56 |
57 | \author{
58 | N. Matloff, A. Mittal, A. Ashok
59 | }
60 |
61 | \examples{
62 | data(svcensus)
63 |
64 | ### interactions case - exclude S and Y in newData
65 | newData <- svcensus[c(1, 18), -c(4,6)]
66 | lin1 <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = TRUE,
67 | newData)
68 |
69 | # extract results
70 | coef(lin1)
71 | vcov(lin1)
72 | summary(lin1)
73 |
74 | # predict on newData --- one prediction for each level of S per row
75 | predict(lin1, newData)
76 |
77 | ### no interactions case - exclude Y in newData
78 | newData <- svcensus[c(1, 18), -c(4)]
79 | lin2 <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = FALSE)
80 | summary(lin2)
81 |
82 | # predict on newData --- one prediction per row
83 | predict(lin2, newData)
84 | }
85 |
86 | \details{
87 |
88 | The \code{dsldLinear} function fits a linear model to the response
89 | variable Y using all other variables in \code{data}. The user may
90 | select for interactions with the sensitive variable S.
91 |
92 | The function produces an instance of the \code{dsldLM} class (an S3
93 | object). Methods for the generic functions \code{summary}, \code{coef},
94 | \code{vcov} and \code{predict} are provided.
95 |
96 | If \code{interactions} is TRUE, the function will fit m separate
97 | models, where m is the number of levels of S. Then \code{summary}
98 | will contain m+1 data frames; the first m of which will be the
99 | outputs from the individual models.
100 |
101 | The m+1st data frame will compare the differences
102 | in conditional mean Y|X for each pair of S levels, and for each
103 | value of X in \code{sComparisonPts}.
104 | The intention is to allow users to see the comparisons
105 | of conditions for sensitive groups via linear models, with
106 | interactions with S.
107 |
108 | The \code{dsldDiffSLin} function allows users to compare mean Y at
109 | new, unseen X values, between each pair of S levels,
110 | using the model fitted by \code{dsldLinear}.
111 |
112 | }
113 |
114 | \value{
115 |
116 | The \code{dsldLinear} function returns an S3 object of class 'dsldLM',
117 | with one component for each level of S. Each component includes
118 | information about the fitted model.
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/man/dsldFreqPCoord.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFreqPCoord}
2 | \alias{dsldFreqPCoord}
3 |
4 | \title{dsldFreqPCoord}
5 |
6 | \description{
7 | Wrapper for the \code{freqparcoord} function from the \pkg{freqparcoord}
8 | package.
9 | }
10 | \usage{
11 | dsldFreqPCoord(data, m, sName = NULL, method
12 | = "maxdens", faceting = "vert", k = 50, klm = 5 * k, keepidxs = NULL,
13 | plotidxs = FALSE, cls = NULL, plot_filename = NULL)
14 | }
15 |
16 | \arguments{
17 | \item{data}{
18 | Data frame or matrix.
19 | }
20 | \item{m}{
21 | Number of lines to plot for each group. A negative value in conjunction
22 | with the method \code{maxdens} indicates that the
23 | lowest-density lines are to be plotted. If method is \code{locmax},
24 | then \code{m} is forced to 1.
25 | }
26 | \item{sName}{
27 | Column for the grouping variable, if any (if none, all the data
28 | is treated as a single group); the column must be a vector or factor.
29 | The column must not be among the displayed columns. If
30 | method is \code{locmax}, \code{sName} is forced to NULL.
31 | }
32 | \item{method}{
33 | What to display: 'maxdens' for plotting the most
34 | (or least) typical lines, 'locmax' for cluster hunting, or
35 | 'randsamp' for plotting a random sample of lines.
36 | }
37 | \item{faceting}{
38 | How to display groups, if present. Use 'vert' for
39 | vertical stacking of group plots, 'horiz' for horizontal ones, or
40 | 'none' to draw all lines in one plot, color-coding by group.
41 | }
42 | \item{k}{
43 | Number of nearest neighbors to use for density estimation.
44 | }
45 | \item{klm}{
46 | If method is "locmax", number of nearest neighbors to
47 | use for finding local maxima for cluster hunting. Generally needs
48 | to be much larger than \code{k}, to avoid "noise fitting."
49 | }
50 | \item{keepidxs}{
51 | If not NULL, the indices of the rows of \code{data} that
52 | are plotted will be stored in a component \code{idxs} of the
53 | return value. The rows themselves will be in a component
54 | \code{xdisp}, ordered by the first displayed column.
55 | }
56 | \item{plotidxs}{
57 | If TRUE, lines in the display will be annotated
58 | with their case numbers, i.e. their row numbers within \code{data}.
59 | Use only with small values of \code{m}, as overplotting may occur.
60 | }
61 | \item{cls}{
62 | Cluster, if any (see the \code{parallel} package), for
63 | parallel computation.
64 | }
65 | \item{plot_filename}{
66 | Name of the file that will hold the saved graph image.
67 | If NULL, the graph will be generated and displayed without being saved.
68 |
69 | If a filename is provided, the graph will not be displayed, only
70 | saved.
71 | }
72 | }
73 |
74 | \details{
75 | The \code{dsldFreqPCoord} function wraps \code{freqparcoord},
76 | which uses a frequency-based parallel coordinates method to
77 | visualize multiple variables simultaneously in graph form.
78 |
79 | This is done by plotting either the "most typical" or "least typical"
80 | (i.e. highest or lowest estimated multivariate density values, respectively)
81 | cases to discern relations between variables.
82 |
83 | The Y-axis represents the centered and scaled values of the columns.
84 | }
85 |
86 | \value{
87 | Object of type 'gg' (\pkg{ggplot2} object), with components \code{idxs}
88 | and \code{xdisp} added if \code{keepidxs} is not NULL (see argument
89 | \code{keepidxs} above).
90 | }
91 |
92 | \references{
93 | https://cran.r-project.org/web/packages/freqparcoord/index.html
94 | }
95 | \author{
96 | N. Matloff, T. Abdullah, B. Ouattara, J. Tran, B. Zarate
97 | }
98 |
99 | \examples{
100 | data(lsa)
101 | lsa1 <- lsa[,c('fam_inc','ugpa','gender','lsat','race1')]
102 | dsldFreqPCoord(lsa1,75,'race1')
103 | # a number of interesting trends among the most "typical" law students in the
104 | # dataset: remarkably little variation among typical
105 | # African-Americans; typical Hispanic men have low GPAs and poor LSAT
106 | # scores, with more variation; typical Asian and Black students were
107 | # female; Asians and Hispanics have the most variation in family income
108 | # background
109 | }
110 |
111 |
--------------------------------------------------------------------------------
/man/dsldFairML.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFairML Wrappers}
2 | \alias{dsldFrrm}
3 | \alias{dsldFgrrm}
4 | \alias{dsldNclm}
5 | \alias{dsldZlm}
6 | \alias{dsldZlrm}
7 | \alias{predict.dsldFairML}
8 | \alias{summary.dsldFairML}
9 |
10 | \title{dsldFairML Wrappers}
11 |
12 | \description{
13 | Fair machine learning models: estimation and prediction. The following
14 | functions provide wrappers for some functions in the \pkg{fairml}
15 | package.
16 | }
17 | \usage{
18 | dsldFrrm(data, yName, sName, unfairness, definition = "sp-komiyama",
19 | lambda = 0, save.auxiliary = FALSE)
20 | dsldFgrrm(data, yName, sName, unfairness, definition = "sp-komiyama",
21 | family = "binomial", lambda = 0, save.auxiliary = FALSE, yesYVal)
22 | dsldNclm(data, yName, sName, unfairness, covfun = cov, lambda = 0,
23 | save.auxiliary = FALSE)
24 | dsldZlm(data, yName, sName, unfairness)
25 | dsldZlrm(data, yName, sName, unfairness, yesYVal)
26 | }
27 |
28 | \arguments{
29 | \item{data}{
30 | Data frame.
31 | }
32 | \item{yName}{
33 | Name of the response variable column.
34 | }
35 | \item{sName}{
36 | Name(s) of the sensitive attribute column(s).
37 | }
38 | \item{unfairness}{
39 | A number in (0, 1]. Degree of unfairness allowed in
40 | the model. A value (very near) 0 means the model is completely
41 | fair, while a value of 1 means the model is not
42 | constrained to be fair at all.
43 | }
44 | \item{covfun}{
45 | A function computing covariance matrices.
46 | }
47 | \item{definition}{
48 | Character string, the label of the definition of fairness.
49 | Currently either 'sp-komiyama', 'eo-komiyama' or 'if-berk' (statistical parity, equality of opportunity, and individual fairness, respectively; see the \pkg{fairml} documentation).
50 | }
51 | \item{family}{
52 | A character string, either 'gaussian' to fit linear regression,
53 | 'binomial' for logistic regression, 'poisson' for
54 | log-linear regression, 'cox' for Cox proportional
55 | hazards regression, or 'multinomial' for
56 | multinomial logistic regression.
57 | }
58 | \item{lambda}{
59 | Non-negative number, a ridge-regression penalty coefficient.
60 | }
61 | \item{save.auxiliary}{
62 | A logical value, whether to save the fitted values and the residuals
63 | of the auxiliary model that constructs the debiased predictors.
64 | }
65 | \item{yesYVal}{
66 | Y value to be considered 'yes', to be coded 1 rather than 0.
67 | }
68 | }
69 |
70 | \details{
71 |
72 | See documentation for the \pkg{fairml} package.
73 |
74 | The DSLD package extends functionality by providing both accuracy
75 | (MAPE or misclassification rate) and fairness (correlation) on the
76 | training set when fitting the model.
77 |
78 | }
79 |
80 | \value{
81 | An object of class 'dsldFairML', which includes the model
82 | information, \code{yName}, \code{sName}, and model training details.
83 | }
84 |
85 | \author{
86 | A. Mittal, S. Martha, B. Ouattara, B. Zarate, J. Tran
87 | }
88 |
89 | \examples{
90 | \donttest{
91 | # regression example
92 | data(svcensus)
93 |
94 | # test/train splits
95 | n <- nrow(svcensus)
96 | train_idx <- sample(seq_len(n), size = 0.7 * n)
97 | train <- svcensus[train_idx, ]
98 | test <- svcensus[-train_idx, -4]
99 | test_y <- svcensus[-train_idx, 4]
100 |
101 | # train frrm model // also works with nclm and zlm
102 | frrmOut <- dsldFrrm(data = train, yName = 'wageinc', sName = 'gender',
103 | unfairness = 0.2, definition = "sp-komiyama")
104 |
105 | # training results
106 | summary(frrmOut)
107 | frrmOut$trainCorrs
108 | frrmOut$trainAcc
109 |
110 | # testing results
111 | res <- predict(frrmOut, test)
112 | res$correlations
113 | mean(abs(res$preds - test_y))
114 |
115 | # also works with dsldNclm, dsldZlm
116 |
117 | # classification example
118 | data(compas1)
119 |
120 | # test/train splits
121 | n <- nrow(compas1)
122 | train_idx <- sample(seq_len(n), size = 0.7 * n)
123 | train <- compas1[train_idx, ]
124 | test <- compas1[-train_idx, -8]
125 | test_y <- compas1[-train_idx, 8]
126 | test_y <- as.factor(as.integer(test_y == 'Yes'))
127 |
128 | # train fgrrm model // also works with zlrm
129 | fgrrmOut <- dsldFgrrm(train, yName = "two_year_recid",
130 | sName = "age", unfairness = 0.05,
131 | definition = "sp-komiyama",
132 | yesYVal = 'Yes')
133 | # training results
134 | summary(fgrrmOut)
135 | fgrrmOut$trainCorrs
136 | fgrrmOut$trainAcc
137 |
138 | # testing results
139 | res <- predict(fgrrmOut, test)
140 | res$correlations
141 | mean(test_y != round(res$preds))
142 |
143 | # also works with dsldZlrm
144 | }
145 |
146 | }
147 |
--------------------------------------------------------------------------------
/R/dsldConditDisparity.R:
--------------------------------------------------------------------------------
1 |
2 | # arguments
3 |
4 | # data: input data frame or equivalent
5 | # yName: response variable
6 | # sName: sensitive variable (R factor)
7 | # xName: horizontal axis variable (numeric)
8 | # condits: a vector of conditions, expressed in terms of
9 | #    names(data); if NULL, a trivial always-true condition is used
10 | # qeFtn: qeML predictive function
11 | # minS: if 'data' has fewer than this many rows for a given S level,
12 | #    don't use that level
13 | # useLoess: if TRUE, use loess smoothing
14 |
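# illustrative call (not run); compas1 ships with the package, and the
# condition string below is just one plausible setup (qeKNN is the default):
#    dsldConditDisparity(compas1, 'two_year_recid', 'race', 'age',
#       condits = 'decile_score <= 8', qeFtn = qeKNN)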
15 | dsldConditDisparity <- function(data, yName, sName, xName, condits = NULL,
16 |                                 qeFtn = qeKNN, minS = 50, useLoess = TRUE)
17 | {
18 |    getSuggestedLib('qeML')
19 |
20 |    # args type checking
21 |    if (!is.data.frame(data)) {
22 |       stop("data must be a dataframe or equivalent")
23 |    }
24 |
25 |    y <- data[[yName]]
26 |
27 |    dichotY <- inherits(y, "factor") && length(levels(y)) == 2
28 |
29 |    if (!inherits(y, "numeric") &&
30 |        !inherits(y, "integer") &&
31 |        !dichotY
32 |    ) {
33 |       stop("yName must refer to a numeric or 2-level factor column in data.")
34 |    }
35 |    if (!is.factor(data[[sName]])) {
36 |       stop("sName must refer to a factor column in data.")
37 |    }
38 |    if (!is.numeric(data[[xName]])) {
39 |       stop("xName must refer to a numeric column in data.")
40 |    }
41 |
42 |    # data engineering #
43 |    # restrict data to fit conditions
44 |    if (is.null(condits)) condits <- '1 > 0'
45 |    if (length(condits) > 1) {
46 |       # combine conditions
47 |       condits <- paste(condits, collapse = " & ")
48 |    }
49 |    restrictions <- sprintf("focusedData <- subset(data, %s)", condits)
50 |    eval(parse(text = restrictions))
51 |    focusedData <- focusedData[c(yName, xName, sName)]
52 |    sCol <- which(names(focusedData) == sName)
53 |
54 |    # group the data by S level & execute min size condition
55 |    s <- focusedData[[sName]]
56 |    groupByS <- split(focusedData, s)
57 |    sizes <- sapply(groupByS, nrow)
58 |    tiny <- which(sizes < minS)
59 |
60 |    # remove too-small groups
61 |    if (length(tiny) > 0) {
62 |       groupByS <- groupByS[-tiny]
63 |    }
64 |
65 |    # consider only the remaining S-levels
66 |    sLevels <- names(groupByS)
67 |    remainingS <- length(sLevels)
68 |
69 |
70 |    # prepare to plot each sensitive level against X; in this loop, fit
71 |    # the models, and then plot in the following loop
72 |    curXDataList <- list()
73 |    predsList <- list()
74 |    for (i in 1:remainingS) {
75 |
76 |       # setup data for training
77 |       curData <- groupByS[[i]][,-sCol]  # current s-level w/o sensitive column
78 |       curXData <- unique(curData[[xName]])  # only the numeric x column
79 |       curXDF <- as.data.frame(curXData)
80 |       names(curXDF) <- xName  # adjust column name
81 |
82 |       # fit ML model
83 |       model <- qeFtn(curData, yName, holdout = NULL)
84 |       preds <- predict(model, curXDF)
85 |       if (dichotY) preds <- preds$probs
86 |
87 |       # sort data so that lines() will make sense
88 |       curXData <- as.vector(curXData)
89 |       preds <- as.vector(preds)
90 |       orderedXData <- order(curXData)
91 |       curXData <- curXData[orderedXData]
92 |       preds <- preds[orderedXData]
93 |
94 |       # store dataframe w/ sorted data for plotting
95 |       # check Loess
96 |       plotdf <- data.frame(curXData, preds)
97 |       if (useLoess) {
98 |          preds <- loess(preds ~ curXData, plotdf)$fitted  # loess smoothing
99 |       }
100 |
101 |       # these 2 will be used in call to lines()
102 |       curXDataList[[i]] <- curXData
103 |       predsList[[i]] <- preds
104 |    }
105 |
106 |    # create plot
107 |    colors <- rainbow(remainingS)
108 |    predsMax <- max(sapply(predsList, max))
109 |    predsMin <- min(sapply(predsList, min))
110 |    ylow <- if (predsMin >= 0) 0.9 * predsMin else 1.1 * predsMin
111 |    yhigh <- if (predsMax >= 0) 1.1 * predsMax else 0.9 * predsMax
112 |    currXMax <- max(sapply(curXDataList, max))
113 |    currXMin <- min(sapply(curXDataList, min))
114 |
115 |    plot(
116 |       NULL,
117 |       ylim = c(ylow, yhigh),
118 |       xlim = c(currXMin, currXMax),
119 |       xlab = xName,
120 |       ylab = yName,
121 |       main = paste("Underlying Effects of ", sName, " on ",
122 |          yName, " wrt ", xName)
123 |    )
124 |
125 |    for (i in 1:remainingS) {
126 |       lines(
127 |          curXDataList[[i]],
128 |          predsList[[i]],
129 |          type = "l",
130 |          lty =
"solid", 131 | col = colors[i] 132 | ) 133 | } 134 | 135 | legend( 136 | x = "topright", 137 | lty = rep(1, remainingS), 138 | text.font = 4, 139 | col = colors, 140 | text.col = "black", 141 | legend = sLevels 142 | ) 143 | } 144 | 145 | -------------------------------------------------------------------------------- /R/dsldDensitybyS.R: -------------------------------------------------------------------------------- 1 | dsldDensityByS <- function(data, cName, sName, graphType = "plotly", fill = FALSE) { 2 | if (!class(data[, sName]) %in% c("factor", "character")) 3 | stop(paste("sName should be of factor or character data type. Consider setting this as a cName instead")) 4 | 5 | if (tolower(graphType) == "plot") 6 | plotDensity(data, cName, sName, fill) 7 | else if (tolower(graphType) == "plotly") 8 | plotlyDensity(data, cName, sName) 9 | } 10 | 11 | # ---- test ---- 12 | # library(dsld) 13 | # data(svcensus) 14 | # dsld::dsldDensityByS(svcensus, "wageinc", "educ") 15 | 16 | # non interactable version of density graph 17 | plotDensity <- function(data, cName, sName, fill) { 18 | getSuggestedLib('ggplot2') 19 | 20 | # the string of the columns to use for labels 21 | cNameStr <- names(data[cName]) 22 | sNameStr <- names(data[sName]) 23 | 24 | sGroups <- levels(unique(data[, sName])) 25 | for (i in 1:length(sGroups)) { 26 | den <- density(data[data[, sName] == sGroups[i], ][, cName]) 27 | 28 | if (i == 1) 29 | plot(den, col = i, xlab = cNameStr, main = paste("Density of", cNameStr, "by", sNameStr)) 30 | else 31 | lines(den, col = i) 32 | 33 | if (fill) polygon(den, col = i) 34 | } 35 | 36 | legend("topright", title = sNameStr, legend = sGroups, col = 1:length(sGroups), lty = 1) 37 | } 38 | 39 | # interactable plotly version 40 | plotlyDensity <- function(data, cName, sName) { 41 | getSuggestedLib('plotly') 42 | 43 | # the strategy for allowing a slider to control for density 44 | # is plot one graph for each possible bandwidth on the slider. 45 | # the slider will select one graph to be visible at a time 46 | 47 | numGroups <- length(levels(unique(data[, sName]))) 48 | # the string of the columns to use for labels 49 | cNameStr <- names(data[cName]) 50 | sNameStr <- names(data[sName]) 51 | 52 | bw <- seq(.25, 4, .25) # a vector of all the bandwidths we're using 53 | 54 | # aval <- a list of the arguments of all the lines we're going to graph 55 | aval <- list() 56 | for (i in 1:length(bw)) { 57 | # from plotly: creating a single group-separated density dataframe object to graph 58 | dens <- with(data, 59 | tapply(data[, cName], INDEX = data[, sName], density, adjust = bw[i])) 60 | df <- data.frame( 61 | x = unlist(lapply(dens, "[[", "x")), 62 | y = unlist(lapply(dens, "[[", "y")), 63 | group = rep(names(dens), each = length(dens[[1]]$x)) 64 | ) 65 | # all graphs are invisible by default 66 | aval[[i]] <- list(visible = FALSE, x = df$x, y = df$y) 67 | } 68 | # the default (notch 4 on the slider) is visible 69 | aval[[4]]$visible = TRUE 70 | 71 | # initial plot 72 | fig <- plotly::plot_ly(type = 'scatter', mode = 'lines', color = df$group) 73 | 74 | # each step changes the visible argument of each graph on the plot. 75 | steps <- list() 76 | # for every bandwith on the slider, add the different density graphs to the plot. 77 | for (i in 1:length(bw)) { 78 | fig <- plotly::add_lines(fig, x = aval[[i]]$x, y = aval[[i]]$y, 79 | visible = aval[[i]]$visible) 80 | # if there are 3 groups in sName, and there are 8 bandwidths, there 81 | # are 24 graphs. 
82 |       # we need to initially set all graphs' visibility to false
83 |       step <- list(
84 |          args = list('visible', rep(FALSE, length(aval) * numGroups)),
85 |          method = 'restyle', label = bw[i]
86 |       )
87 |       # and then set the corresponding graphs (1 for each level of sName,
88 |       # with the same bandwidth) to true
89 |       step$args[[2]][1:numGroups + numGroups * i] <- TRUE
90 |       steps[[i]] <- step
91 |    }
92 |    # buttons to select fill or no fill, by changing the fill argument
93 |    # of the plot we're graphing
94 |    buttons <- list(
95 |       list(
96 |          method = "restyle",
97 |          args = list("fill", "none"),
98 |          label = "no fill"
99 |       ),
100 |       list(
101 |          method = "restyle",
102 |          args = list("fill", "tozeroy"),
103 |          label = "fill"
104 |       )
105 |    )
106 |    # updatemenus is the button for fill/no fill
107 |    # sliders is the density slider
108 |    fig <- plotly::layout(fig,
109 |       updatemenus = list(list(
110 |          active = 0,
111 |          x = 0,
112 |          y = 1,
113 |          buttons = buttons
114 |       )),
115 |       sliders = list(list(
116 |          active = 3,
117 |          currentvalue = list(prefix = "Adjust: "),
118 |          steps = steps
119 |       )),
120 |       title = paste("Density of", cNameStr, "by", sNameStr),
121 |       xaxis = list(title = cNameStr),
122 |       yaxis = list(title = "Density"),
123 |       legend = list(title = list(text = sNameStr))
124 |    )
125 |    fig
126 | }
--------------------------------------------------------------------------------
/R/dsldTakeALookAround.R:
--------------------------------------------------------------------------------
1 | ### -------------------------- dsldTakeALookAround ---------------------------
2 | dsldTakeALookAround <- function(data, yName, sName,
3 |                                 maxFeatureSetSize = (ncol(data) - 2),
4 |                                 holdout = floor(min(1000, 0.1 * nrow(data)))) {
5 |    # load libraries
6 |    getSuggestedLib("qeML")
7 |
8 |    # args checking #
9 |    if (maxFeatureSetSize > (ncol(data) - 2)) {
10 |       stop("maxFeatureSetSize too large!")  # error on invalid size
11 |    }
12 |
13 |    if (!is.data.frame(data)) {
14 |       stop("data must be a dataframe or equivalent")  # error on types
15 |    }
16 |
17 |    # subset dataset to remove sName and yName
18 |    max_features_data <- data[, !names(data) %in% c(yName, sName)]
19 |
20 |    # get names of feature set
21 |    feature_names <- colnames(max_features_data)
22 |
23 |    # initialize empty vectors to populate with test accuracy scores
24 |    col_names <- c()
25 |    MSE_Y <- c()
26 |    MSE_YS <- c()
27 |    MSE_S <- c()
28 |
29 |    # run for loop to get all possible combinations of features up to maxFeatureSetSize
30 |    for (i in 1:maxFeatureSetSize) {
31 |       # create combination matrix containing i-features
32 |       combination_matrix <- combn(feature_names, i)
33 |
34 |       # run second for loop across each column of the combination_matrix
35 |       for (j in 1:dim(combination_matrix)[2]) {
36 |          # create vector of feature set names across each run - compute 1.
37 |          current_features <- combination_matrix[,j]  # get feature names on the jth loop
38 |          names <- toString(current_features)  # convert to string
39 |          names <- gsub(" ","",names)  # remove spaces between the characters
40 |          col_names <- c(col_names, names)  # append feature names string into vector
41 |
42 |          # create dataframes to compute test accuracies
43 |          feature_data_Y <- data[,c(current_features, yName)]  # dataframe with feature set and Y
44 |          feature_data_Y_S <- data[,c(current_features, yName,sName)]  # dataframe with feature set, S and Y
45 |          feature_data_S <- data[,c(current_features, sName)]  # dataframe with feature set and S
46 |
47 |          # get part 2. and 3.
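         # (the numbered steps: 1 = record the feature-set label; 2 = accuracy
         # predicting Y from the features alone, column 'a'; 3 = accuracy
         # predicting Y from the features plus S, column 'b'; 4 = accuracy
         # predicting S from the features, column 'c')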
48 |          # check whether Y is continuous
49 |          if (is.numeric(data[[yName]])) {
50 |             a <- qeLin(feature_data_Y, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set
51 |             MSE_Y <- c(MSE_Y, a)  # append test accuracy into vector
52 |
53 |             b <- qeLin(feature_data_Y_S, yName, holdout)$testAcc  # get prediction accuracy for Y of the feature set PLUS S
54 |             MSE_YS <- c(MSE_YS, b)
55 |          }
56 |          # Y is discrete
57 |          else {
58 |             a <- qeLogit(feature_data_Y, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set
59 |             MSE_Y <- c(MSE_Y, a)
60 |
61 |             b <- qeLogit(feature_data_Y_S, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set PLUS S
62 |             MSE_YS <- c(MSE_YS, b)
63 |          }
64 |
65 |          # get 4.
66 |          # check whether sName is continuous
67 |          if (is.numeric(data[[sName]])) {
68 |             c <- qeLin(feature_data_S, sName, holdout)$testAcc  # get prediction accuracy of S from the feature set
69 |             MSE_S <- c(MSE_S, c)
70 |          }
71 |          # if sName is discrete
72 |          else {
73 |             c <- qeLogit(feature_data_S, sName, holdout)$testAcc  # get prediction accuracy of S from the feature set
74 |             MSE_S <- c(MSE_S, c)
75 |          }
76 |       }
77 |    }
78 |
79 |    # create dataframe
80 |    df <- data.frame(col_names, MSE_Y, MSE_YS, MSE_S)
81 |    colnames(df)[1] <- "Feature Names"
82 |    colnames(df)[2] <- "a"
83 |    colnames(df)[3] <- "b"
84 |    colnames(df)[4] <- "c"
85 |    return(df)
86 | }
87 |
88 | # Test runs
89 | # Example 1: We investigate the predictive accuracy for a continuous Y, 'wageinc', using maxFeatureSetSize = 4
90 | # data(svcensus)
91 | # dsldTakeALookAround(svcensus, 'wageinc', 'gender', 4)
92 |
93 | # Example 2: We investigate the predictive accuracy for a categorical Y, 'educ', using the default maxFeatureSetSize
94 | # data(svcensus)
95 | # dsldTakeALookAround(svcensus, 'educ', 'occ')
96 |
97 | # Example 3: We investigate the predictive accuracy for a continuous Y, 'wageinc', using maxFeatureSetSize = 1
98 | # data(svcensus)
99 | # dsldTakeALookAround(svcensus, 'wageinc', 'gender', 1)
100 |
--------------------------------------------------------------------------------
/R/dsldScatterPlot3D.R:
--------------------------------------------------------------------------------
1 | dsldScatterPlot3D <- function(data, yNames, sName, sGroups = NULL,
2 |                               sortedBy = "Name", numGroups = 8,
3 |                               maxPoints = NULL, xlim = NULL, ylim = NULL,
4 |                               zlim = NULL, main = NULL, colors = "Paired",
5 |                               opacity = 1, pointSize = 8) {
6 |    # environment setup
7 |    getSuggestedLib("plotly")
8 |
9 |    # limit amount of data points
10 |    if (!is.null(maxPoints)) {
11 |       data <- data[1:maxPoints, ]
12 |    }
13 |
14 |    # args type-checking
15 |    if (!class(data[, sName]) %in% c("factor", "character"))
16 |       stop(
17 |          "sName should be of factor or character data type.
18 |          Consider setting this as yName instead"
19 |       )
20 |
21 |    # check 3D plot compatibility
22 |    if (length(yNames) != 3) {
23 |       stop("dsldScatterPlot3D requires 3 variables, one for each of the 3 axes")
24 |    }
25 |
26 |    # sGroups <- a vector of the individual group names in the 'data'.
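   # (e.g. with svcensus and sName = 'gender', sGroups would default to the
   # levels of that factor, such as 'female' and 'male')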
27 |    # the user can supply sGroups as a vector of names they want to look at
28 |    if (is.null(sGroups)) {
29 |       sGroups <- makeSGroups(data, sName, numGroups, sortedBy)
30 |    }
31 |
32 |    # limits dataset to include only rows whose group is in sGroups
33 |    data <- data[data[, sName] %in% sGroups, ]
34 |    data <- droplevels(data)
35 |
36 |    # limit values of data points
37 |    if (!is.null(xlim) | !is.null(ylim) | !is.null(zlim))
38 |       data <- limitRange(data, yNames, xlim, ylim, zlim)
39 |
40 |    # creates a title
41 |    if (is.null(main)) {
42 |       for (yName in names(data[yNames])) {
43 |          main <- paste(main, yName)
44 |       }
45 |
46 |       main <- paste(main, " by ", names(data[sName]))
47 |    }
48 |
49 |    # save this to print to the text of each point
50 |    original <- data
51 |
52 |    # numeric for a cleaner looking graph if the axis is factor type
53 |    data[, yNames] <- sapply(data[, yNames], as.numeric)
54 |
55 |    # info card for each data point
56 |    text <- paste("", sep = "")
57 |    for (i in 1:length(data)) {
58 |       text <- paste(
59 |          text,
60 |          names(data[i]),
61 |          ": ",
62 |          original[, i],
63 |          "<br>",
64 |          sep = ""
65 |       )
66 |    }
67 |
68 |    # plotting the points
69 |    fig <- plotly::plot_ly(
70 |       data,
71 |       x = data[, yNames[1]],
72 |       y = data[, yNames[2]],
73 |       z = data[, yNames[3]],
74 |       color = data[, sName],
75 |       colors = colors,
76 |       hovertemplate = text,
77 |       marker = list(
78 |          size = pointSize,
79 |          opacity = opacity
80 |       )
81 |    )
82 |
83 |    fig <- plotly::add_markers(fig)
84 |
85 |    # add labels and axes
86 |    fig <- plotly::layout(
87 |       fig,
88 |       title = main,
89 |       scene = list(
90 |          xaxis = list(title = paste(names(data[yNames[1]]), "(X)")),
91 |          yaxis = list(title = paste(names(data[yNames[2]]), "(Y)")),
92 |          zaxis = list(title = paste(names(data[yNames[3]]), "(Z)")),
93 |          legend = list(title = list(text = names(data[sName])))
94 |       )
95 |    )
96 |
97 |    return(fig)
98 | }
99 |
100 | # ---- Test Cases ----
101 | # library(dsld)
102 | # data(svcensus)
103 | # dsldScatterPlot3D(svcensus, yNames = c("educ", "wageinc", "occ"), sName = "gender")
104 |
105 | # Generates a list of groups that exist within a sName column of a data frame
106 | makeSGroups <- function(data, sName, numGroups = NULL, sortedBy = "Name") {
107 |    # If there are 8 possible values the group variable can take, the vector is 8 long.
108 |    # Sorted according to user
109 |    sGroups <- NULL
110 |    switch(sortedBy,
111 |       "Name" = sGroups <- levels(unique(data[, sName])),
112 |       "Frequency" = sGroups <-
113 |          names(sort(table(data[, sName]), decreasing = T)),
114 |       "Frequency-Descending" = sGroups <-
115 |          names(sort(table(data[, sName]), decreasing = F))
116 |    )
117 |
118 |    # otherwise the vector is cut off to only have numGroups number of sGroups
119 |    if (!is.null(numGroups) && length(sGroups) > numGroups) {
120 |       sGroups <- sGroups[1:numGroups]
121 |    }
122 |
123 |    return(sGroups)
124 | }
125 |
126 |
127 | # Restricts the values of a data frame to specified limits
128 | limitRange <-
129 |    function(data, yNames, xlim = NULL, ylim = NULL, zlim = NULL) {
130 |       # in case the user only gives lim as a single number
131 |       xlim <- rep(xlim, 2)
132 |       ylim <- rep(ylim, 2)
133 |       zlim <- rep(zlim, 2)
134 |       # limits the data frame
135 |       if (!is.null(xlim))
136 |          data <- data[data[, yNames[1]] >= xlim[1] & data[, yNames[1]] <= xlim[2],]
137 |       if (!is.null(ylim))
138 |          data <- data[data[, yNames[2]] >= ylim[1] & data[, yNames[2]] <= ylim[2],]
139 |       if (!is.null(zlim))
140 |          data <- data[data[, yNames[3]] >= zlim[1] & data[, yNames[3]] <= zlim[2],]
141 |
142 |       data
143 |    }
144 |
--------------------------------------------------------------------------------
/man/dsldEDFFair.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldEDFFair Wrappers}
2 | \alias{dsldQeFairKNN}
3 | \alias{dsldQeFairRF}
4 | \alias{dsldQeFairRidgeLin}
5 | \alias{dsldQeFairRidgeLog}
6 | \alias{predict.dsldQeFair}
7 |
8 | \title{dsldEDFFair Wrappers}
9 |
10 | \description{
11 | Explicitly Deweighted Features: control the effect of proxies
12 | related to sensitive variables for prediction.
13 | }
14 |
15 | \usage{
16 | dsldQeFairKNN(data, yName, sNames, deweightPars = NULL,
17 | yesYVal = NULL, k = 25, scaleX = TRUE)
18 | dsldQeFairRF(data, yName, sNames, deweightPars = NULL, nTree = 500,
19 | minNodeSize = 10, mtry = floor(sqrt(ncol(data))), yesYVal = NULL)
20 | dsldQeFairRidgeLin(data, yName, sNames, deweightPars = NULL)
21 | dsldQeFairRidgeLog(data, yName, sNames, deweightPars = NULL, yesYVal)
22 | \method{predict}{dsldQeFair}(object,newx,...)
23 | }
24 |
25 | \arguments{
26 | \item{data}{
27 | Data frame, training set.
28 | }
29 | \item{yName}{
30 | Name of the response variable column.
31 | }
32 | \item{sNames}{
33 | Name(s) of the sensitive attribute column(s).
34 | }
35 | \item{deweightPars}{
36 | Values for de-emphasizing variables in a split, e.g.
37 | 'list(age=0.2,gender=0.5)'. In the linear case,
38 | larger values mean more deweighting, i.e. less influence of the given
39 | variable on predictions. For KNN and random forests, smaller
40 | values mean more deweighting.
41 | }
42 | \item{scaleX}{
43 | Scale the features. Defaults to TRUE.
44 | }
45 | \item{yesYVal}{
46 | Y value to be considered "yes," to be coded 1 rather than 0.
47 | }
48 | \item{k}{
49 | Number of nearest neighbors. In functions other than
50 | \code{dsldQeFairKNN} for which this is an argument,
51 | it is the number of neighbors to use in finding
52 | conditional probabilities via knnCalib.
53 | }
54 | \item{nTree}{
55 | Number of trees.
56 | }
57 | \item{minNodeSize}{
58 | Minimum number of data points in a tree node.
59 | }
60 | \item{mtry}{
61 | Number of variables randomly tried at each split.
62 | }
63 | \item{object}{
64 | An object returned by one of the dsld EDF-Fair wrappers.
65 | }
66 | \item{newx}{
67 | New data to be predicted. Must be in the same format as the original data.
68 | }
69 | \item{...}{
70 | Further arguments.
71 | }
72 | }
73 | 
74 | \author{
75 | N. Matloff, A. Mittal, J. Tran
76 | }
77 | 
78 | \details{
79 | 
80 | The sensitive variables S are removed entirely, but there is concern
81 | that they still affect prediction indirectly, via a set C of proxy
82 | variables.
83 | 
84 | Linear EDF reduces the impact of the proxies through a shrinkage
85 | process similar to that of ridge regression. Specifically, instead
86 | of minimizing the sum of squared errors SSE with respect to a
87 | coefficient vector b, we minimize SSE + the squared norm of Db,
88 | where D is a diagonal matrix with nonzero elements corresponding to
89 | C. Large diagonal values penalize the variables in C more heavily, thus shrinking them.
90 | 
91 | KNN EDF reduces the weights in Euclidean distance for variables in
92 | C. The random forests version reduces the probabilities that a
93 | proxy will be used in splitting a node.
94 | 
95 | By using various values of the deweighting parameters, the user can
96 | choose a desired position in the Fairness-Utility Tradeoff.
97 | 
98 | More details can be found in the references.
99 | 
100 | The DSLD package extends this functionality by also reporting accuracy
101 | (MAPE or misclassification rate) and fairness (correlation with S) on the
102 | training set during model training.
103 | }
104 | 
105 | \value{
106 | 
107 | The EDF functions return objects of class 'dsldQeFair', which include
108 | components for test and base accuracy, summaries of inputs and so on.
109 | 
110 | }
111 | 
112 | \references{
113 | https://github.com/matloff/EDFfair
114 | }
115 | 
116 | \seealso{
117 | Matloff, Norman, and Wenxi Zhang. "A novel regularization approach to fair ML." \cr
118 | \code{arXiv preprint arXiv:2208.06557} (2022).
119 | }
120 | 
121 | \examples{
122 | \donttest{
123 | # regression example
124 | data(svcensus)
125 | 
126 | # test/train splits
127 | n <- nrow(svcensus)
128 | train_idx <- sample(seq_len(n), size = 0.7 * n)
129 | train <- svcensus[train_idx, ]
130 | test <- svcensus[-train_idx, -4]
131 | test_y <- svcensus[-train_idx, 4]
132 | 
133 | # dsldQeFairRidgeLin: deweight the "occ" and "age" columns
134 | ### also works for dsldQeFairKNN and dsldQeFairRF
135 | lin <- dsldQeFairRidgeLin(train, "wageinc", "gender", deweightPars =
136 | list(occ=.4, age=.2))
137 | 
138 | # training results
139 | lin$trainAcc
140 | lin$trainCorrs
141 | 
142 | # testing results
143 | res <- predict(lin, test)
144 | res$correlations
145 | mean(abs(res$preds - test_y))
146 | 
147 | # also works with dsldQeFairRF, dsldQeFairKNN
148 | 
149 | 
150 | # classification example
151 | data(compas1)
152 | 
153 | # test/train splits
154 | n <- nrow(compas1)
155 | train_idx <- sample(seq_len(n), size = 0.7 * n)
156 | train <- compas1[train_idx, ]
157 | test <- compas1[-train_idx, -8]
158 | test_y <- compas1[-train_idx, 8]
159 | test_y <- as.factor(as.integer(test_y == 'Yes'))
160 | 
161 | # dsldQeFairKNN: deweight the "decile_score" column, with "race" as the sensitive variable
162 | # also works for dsldQeFairRF, dsldQeFairRidgeLog
163 | knnOut <- dsldQeFairKNN(train, "two_year_recid", "race",
164 | list(decile_score=0.1), yesYVal = "Yes")
165 | 
166 | # training/testing results
167 | knnOut$trainAcc
168 | knnOut$trainCorrs
169 | res <- predict(knnOut, test)
170 | res$correlations
171 | mean(test_y != round(res$preds$probs))
172 | 
173 | # also works with dsldQeFairRF, dsldQeFairRidgeLog
174 | }
175 | 
176 | }
-------------------------------------------------------------------------------- /inst/examples/graphical.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "80ed4fdb",
6 | "metadata": {},
7 | "source": [
8 | "#### Examples for graphical methods provided by dsldPy\n",
9 | "\n",
10 | "The goal is to make each function call as simple as possible for the users. The following functions are illustrated:\n",
11 | "\n",
12 | "1. dsldPyScatterPlot3D\n",
13 | "2. dsldPyFreqPCoord\n",
14 | "3. dsldPyConditDisparity\n",
15 | "4. dsldPyBnLearn\n",
16 | "5. 
dsldConfounders / dsldDensityByS" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "f1c5d10e", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "## requires R and the dsld (R) package installed\n", 27 | "# !pip install dsldPy" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "03b32052", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# load necessary libraries\n", 38 | "from dsldPy import (\n", 39 | "# data reading and preprocessing\n", 40 | "preprocess_data, read_data,\n", 41 | "\n", 42 | "dsldPyScatterPlot3D, dsldPyFreqPCoord, dsldPyConditDisparity, dsldPyConfounders, dsldPyDensitybyS, dsldPyIamb\n", 43 | ")\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "953b287e", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "### data preprocessing\n", 54 | "\n", 55 | "### all dsldPy functions require a R data frame object as input (NOT pandas dataframe)\n", 56 | "### the preprocessing is done by the function preprocess_data\n", 57 | "### user needs to manually provide the categorical and numerical features (list)\n", 58 | "### the function preprocess_data returns a R data.frame object -> required input for the dsldPy functions\n", 59 | "\n", 60 | "# two datasets\n", 61 | "# svcensus data\n", 62 | "#### REPLACE WITH YOUR PATH TO svcensus.RData\n", 63 | "# df = read_data(\"\") \n", 64 | "\n", 65 | "# preprocess data\n", 66 | "cat_features = ['educ', 'occ', 'gender']\n", 67 | "num_features= ['age', 'wageinc', 'wkswrkd']\n", 68 | "svcensus = preprocess_data(df, cat_features, num_features)\n", 69 | "\n", 70 | "# compas1 data\n", 71 | "#### REPLACE WITH YOUR PATH TO compas1.RData\n", 72 | "# df = read_data(\"\")\n", 73 | "\n", 74 | "# preprocess data\n", 75 | "cat_features = [\"sex\", \"two_year_recid\", \"race\"]\n", 76 | "num_features = [\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 77 | "compas1 = preprocess_data(df, cat_features, num_features)\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "10d19951", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "### 1. ------------------------------ dsldPyScatterPlot3D ------------------------------\n", 88 | "dsldPyScatterPlot3D(data = svcensus, yNames= ['wageinc', 'wkswrkd', 'age'], sName = 'gender')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "ec5edc31", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "### 2. ------------------------------ dsldPyFreqPCoord ------------------------------\n", 99 | "dsldPyFreqPCoord(data = compas1, m = 100, sName = 'sex')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "d3c2dc1d", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "### 3. ------------------------------ dsldPyConditDisparity ------------------------------\n", 110 | "dsldPyConditDisparity(data = compas1, yName= \"two_year_recid\", sName= \"race\", xName=\"age\", condits=[\"priors_count <= 4\",\"decile_score>=6\"])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "89831a3d", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "### 4. 
------------------------------ dsldPyIamb ------------------------------\n",
121 | "dsldPyIamb(data = svcensus)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "4e960a57",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "### 5. ------------------------------ dsldPyConfounders/dsldPyDensitybyS ------------------------------\n",
132 | "### the plot opens in a new browser window // all other variables shown\n",
133 | "dsldPyConfounders(data = svcensus, sName='gender')"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "0af77c5a",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "### if you just want to focus on one specific plot, you can use the dsldPyDensitybyS function \n",
144 | "### the plot opens in a new browser window \n",
145 | "dsldPyDensitybyS(svcensus, cName = 'wageinc', sName='gender')\n"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "2eba7dba",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": []
155 | }
156 | ],
157 | "metadata": {
158 | "kernelspec": {
159 | "display_name": "dsld",
160 | "language": "python",
161 | "name": "python3"
162 | },
163 | "language_info": {
164 | "codemirror_mode": {
165 | "name": "ipython",
166 | "version": 3
167 | },
168 | "file_extension": ".py",
169 | "mimetype": "text/x-python",
170 | "name": "python",
171 | "nbconvert_exporter": "python",
172 | "pygments_lexer": "ipython3",
173 | "version": "3.12.4"
174 | }
175 | },
176 | "nbformat": 4,
177 | "nbformat_minor": 5
178 | }
179 | 
-------------------------------------------------------------------------------- /R/dsldFairML.R: --------------------------------------------------------------------------------
1 | ### dsld fairML wrappers
2 | 
3 | # base function for fairML wrappers --- they all follow the same format:
4 | # converts the data into a format that the fairml models accept
5 | # then puts the fairml model inside an object of the dsldFairML class which
6 | # has its own predict function
7 | 
8 | fairmlBase <- function(fairmlFUNC, data, yName, sName, unfairness, ...) {
9 | 
10 |   # data-prep
11 |   data <- toNumericFactor(data)
12 |   response <- data[,yName]
13 |   predictors <- data[,!colnames(data) %in% c(yName,sName)]
14 |   sensitive <- data[,sName]
15 | 
16 |   # calls a fairml model function as the base for the dsldFairML object
17 |   base <- fairmlFUNC(response = response, predictors = predictors,
18 |                      sensitive = sensitive, unfairness = unfairness, ...)
19 | 20 | # save yName and sName to use in predict() 21 | model <- list( 22 | base = base, 23 | yName = yName, 24 | sName = sName, 25 | FactorsInfo = factor_levels(data) 26 | ) 27 | 28 | class(model) <- c("dsldFairML") 29 | model 30 | } 31 | 32 | # wrapper for Frrm() 33 | dsldFrrm <- function(data, yName, sName, unfairness, 34 | definition = "sp-komiyama", lambda = 0, 35 | save.auxiliary = FALSE) { 36 | 37 | data <- toNumericFactor(data) 38 | 39 | suppressWarnings({ 40 | model = fairmlBase(fairml::frrm, data, yName, sName, unfairness, 41 | definition, lambda, save.auxiliary) 42 | }) 43 | 44 | # training preds/corrs 45 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 46 | sensitive <- data[,sName] 47 | model$trainPreds <- predict(model$base, predictors, sensitive) 48 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 49 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 50 | model 51 | } 52 | 53 | # wrapper for Fgrrm() 54 | dsldFgrrm <- function(data, yName, sName, unfairness, 55 | definition = "sp-komiyama", family = "binomial", 56 | lambda = 0, save.auxiliary = FALSE, yesYVal) { 57 | 58 | data <- toNumericFactor(data) 59 | data[[yName]] <- as.factor(as.integer(data[[yName]] == yesYVal)) 60 | 61 | suppressWarnings({ 62 | model <- fairmlBase(fairml::fgrrm, data, yName, sName, unfairness, 63 | definition, family, lambda, save.auxiliary) 64 | }) 65 | 66 | # training preds/corrs 67 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 68 | sensitive <- data[,sName] 69 | model$trainPreds <- predict(model$base, predictors, sensitive) 70 | test_y <- as.integer(data[[yName]] == 1) 71 | model$trainAcc <- mean(test_y != round(model$trainPreds)) 72 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 73 | model 74 | } 75 | 76 | # wrapper for Nclm() 77 | dsldNclm <- function(data, yName, sName, unfairness, covfun = cov, 78 | lambda = 0, save.auxiliary = FALSE) { 79 | 80 | getSuggestedLib('cccp') 81 | data <- toNumericFactor(data) 82 | 83 | suppressWarnings({ 84 | model <- fairmlBase(fairml::nclm, data, yName, sName, unfairness, covfun, 85 | lambda, save.auxiliary) 86 | }) 87 | 88 | # training preds/corrs 89 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 90 | sensitive <- data[,sName] 91 | model$trainPreds <- predict(model$base, predictors, sensitive) 92 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 93 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 94 | model 95 | } 96 | 97 | # wrapper for Zlm() 98 | dsldZlm <- function(data, yName, sName, unfairness) { 99 | 100 | getSuggestedLib('CVXR') 101 | data <- toNumericFactor(data) 102 | 103 | suppressWarnings({ 104 | model <- fairmlBase(fairml::zlm, data, yName, sName, unfairness) 105 | }) 106 | 107 | # training preds/corrs 108 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 109 | sensitive <- data[,sName] 110 | model$trainPreds <- predict(model$base, predictors) 111 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 112 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 113 | model 114 | } 115 | 116 | # wrapper for Zlrm() 117 | dsldZlrm <- function(data, yName, sName, unfairness, yesYVal) { 118 | 119 | getSuggestedLib('CVXR') 120 | data <- toNumericFactor(data) 121 | data[[yName]] <- as.factor(as.integer(data[[yName]] == yesYVal)) 122 | 123 | suppressWarnings({ 124 | model <- fairmlBase(fairml::zlrm, data, yName, sName, unfairness) 125 | }) 126 | 127 | # training preds/corrs 128 | predictors <- 
data[,!colnames(data) %in% c(yName, sName)] 129 | sensitive <- data[,sName] 130 | model$trainPreds <- predict(model$base, predictors) 131 | test_y <- as.integer(data[[yName]] == 1) 132 | model$trainAcc <- mean(test_y != round(model$trainPreds)) 133 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 134 | model 135 | } 136 | 137 | ### S3 methods summary() and predict() 138 | summary.dsldFairML <- function(object,...){ 139 | summary(object$base) 140 | } 141 | 142 | predict.dsldFairML <- function(object, newx,...) { 143 | suppressWarnings({ 144 | # data-prep 145 | newx <- toNumericFactor(newx) 146 | newx <- apply_factor_levels(newx, object$FactorsInfo) 147 | 148 | yName <- object$yName 149 | sName <- object$sName 150 | predictors <- newx[,!colnames(newx) %in% c(yName, sName)] 151 | sensitive <- newx[,sName] 152 | 153 | class <- class(object$base)[1] 154 | 155 | if (class %in% c("zlm", "zlrm")) { 156 | 157 | # zlm and zlrm have one less argument for prediction 158 | preds <- predict(object$base, predictors) 159 | cors <- s_correlations(newx, sName, preds) 160 | return(list(preds = preds, correlations = cors)) 161 | 162 | } else { 163 | 164 | preds <- predict(object$base, predictors, sensitive) 165 | cors <- s_correlations(newx, sName, preds) 166 | return(list(preds = preds, correlations = cors)) 167 | 168 | } 169 | }) 170 | } 171 | -------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyFairML.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python interface for dsldFairML functions in the dsld R package. 3 | ''' 4 | 5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe 6 | from rpy2.robjects.packages import importr 7 | import rpy2.robjects as robjects 8 | 9 | 10 | # ================== dsldFrrm ================== 11 | 12 | def dsldPyFrrm(data, yName, sName, unfairness, definition="sp-komiyama", lamda=0, save=False): 13 | r_data = dsld_Rpy2_IsRDataframe(data) 14 | yName_r = robjects.StrVector([yName]) 15 | sName_r = robjects.StrVector([sName]) 16 | unfair_r = robjects.FloatVector([unfairness]) 17 | def_r = robjects.StrVector([definition]) 18 | lamda_r = robjects.FloatVector([lamda]) 19 | save_r = robjects.BoolVector([save]) 20 | 21 | dsld = get_dsld() 22 | model = dsld.dsldFrrm(r_data, yName_r, sName_r, unfair_r, def_r, lamda_r, save_r) 23 | 24 | preds = model.rx2("trainPreds") 25 | acc = model.rx2("trainAcc") 26 | corrs = model.rx2("trainCorrs") 27 | 28 | result = { 29 | "model": model, 30 | "train_predictions": list(preds), 31 | "train_accuracy": float(acc[0]), 32 | "train_correlations": list(zip(list(corrs.rx2("feature")), 33 | list(corrs.rx2("correlation")))) 34 | } 35 | return result 36 | 37 | # ================== dsldFgrrm ================== 38 | 39 | def dsldPyFgrrm(data, yName, sName, unfairness, definition="sp-komiyama", family="binomial", lamda=0, save=False, yesYVal = None): 40 | r_data = dsld_Rpy2_IsRDataframe(data) 41 | yName_r = robjects.StrVector([yName]) 42 | sName_r = robjects.StrVector([sName]) 43 | 44 | unfair_r = robjects.FloatVector([unfairness]) 45 | 46 | def_r = robjects.StrVector([definition]) 47 | fam_r = robjects.StrVector([family]) 48 | lamda_r = robjects.FloatVector([lamda]) 49 | save_r = robjects.BoolVector([save]) 50 | yesYVal_r = robjects.StrVector([yesYVal]) 51 | 52 | 53 | dsld = get_dsld() 54 | model = dsld.dsldFgrrm(r_data, yName_r, sName_r, unfair_r, def_r, fam_r, lamda_r, save_r, yesYVal_r) 55 | 56 | preds = model.rx2("trainPreds") 57 | acc = 
model.rx2("trainAcc") 58 | corrs = model.rx2("trainCorrs") 59 | 60 | result = { 61 | "model": model, 62 | "train_predictions": list(preds), 63 | "train_accuracy": float(acc[0]), 64 | "train_correlations": list(zip(list(corrs.rx2("feature")), 65 | list(corrs.rx2("correlation")))) 66 | } 67 | return result 68 | 69 | # ================== dsldNclm ================== 70 | 71 | def dsldPyNclm(data, yName, sName, unfairness, covfun=robjects.r('cov'), lamda=0, save=False): 72 | r_data = dsld_Rpy2_IsRDataframe(data) 73 | yName_r = robjects.StrVector([yName]) 74 | sName_r = robjects.StrVector([sName]) 75 | unfair_r = robjects.FloatVector([unfairness]) 76 | lamda_r = robjects.FloatVector([lamda]) 77 | save_r = robjects.BoolVector([save]) 78 | 79 | dsld = get_dsld() 80 | model = dsld.dsldNclm(r_data, yName_r, sName_r, unfair_r, covfun, lamda_r, save_r) 81 | 82 | preds = model.rx2("trainPreds") 83 | acc = model.rx2("trainAcc") 84 | corrs = model.rx2("trainCorrs") 85 | 86 | result = { 87 | "model": model, 88 | "train_predictions": list(preds), 89 | "train_accuracy": float(acc[0]), 90 | "train_correlations": list(zip(list(corrs.rx2("feature")), 91 | list(corrs.rx2("correlation")))) 92 | } 93 | return result 94 | 95 | # ================== dsldZlm ================== 96 | 97 | def dsldPyZlm(data, yName, sName, unfairness): 98 | r_data = dsld_Rpy2_IsRDataframe(data) 99 | yName_r = robjects.StrVector([yName]) 100 | sName_r = robjects.StrVector([sName]) 101 | unfair_r = robjects.FloatVector([unfairness]) 102 | 103 | dsld = get_dsld() 104 | model = dsld.dsldZlm(r_data, yName_r, sName_r, unfair_r) 105 | 106 | preds = model.rx2("trainPreds") 107 | acc = model.rx2("trainAcc") 108 | corrs = model.rx2("trainCorrs") 109 | 110 | result = { 111 | "model": model, 112 | "train_predictions": list(preds), 113 | "train_accuracy": float(acc[0]), 114 | "train_correlations": list(zip(list(corrs.rx2("feature")), 115 | list(corrs.rx2("correlation")))) 116 | } 117 | return result 118 | 119 | # ================== dsldZlrm ================== 120 | 121 | def dsldPyZlrm(data, yName, sName, unfairness, yesYVal): 122 | r_data = dsld_Rpy2_IsRDataframe(data) 123 | yName_r = robjects.StrVector([yName]) 124 | sName_r = robjects.StrVector([sName]) 125 | unfair_r = robjects.FloatVector([unfairness]) 126 | yesYVal_r = robjects.StrVector([yesYVal]) 127 | 128 | dsld = get_dsld() 129 | model = dsld.dsldZlrm(r_data, yName_r, sName_r, unfair_r, yesYVal_r) 130 | 131 | preds = model.rx2("trainPreds") 132 | acc = model.rx2("trainAcc") 133 | corrs = model.rx2("trainCorrs") 134 | 135 | result = { 136 | "model": model, 137 | "train_predictions": list(preds), 138 | "train_accuracy": float(acc[0]), 139 | "train_correlations": list(zip(list(corrs.rx2("feature")), 140 | list(corrs.rx2("correlation")))) 141 | } 142 | return result 143 | 144 | 145 | # predict() and summary() method for all the models 146 | 147 | def dsldPyFairML_Summary(model): 148 | print(robjects.r['summary'](model['model'])) 149 | return robjects.r['summary'](model['model']) 150 | 151 | def dsldPyFairML_Predict(model, newData): 152 | robjects.r.assign("model", model['model']) 153 | xNew = dsld_Rpy2_IsRDataframe(newData) 154 | robjects.r.assign("xNew", xNew) 155 | result = robjects.r('predict(model, xNew)') 156 | names = list(result[1][0]) 157 | vals = [float(v) for v in result[1][1]] 158 | correlations = list(zip(names, vals)) 159 | output = {'test_predictions': list(result[0]), 160 | 'test_correlations': correlations} 161 | return output 162 | 
-------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyQeFairML.py: --------------------------------------------------------------------------------
1 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
2 | 
3 | import math
4 | 
5 | import rpy2.robjects as robjects
6 | from rpy2.robjects import conversion, default_converter, pandas2ri
7 | from rpy2.robjects.vectors import ListVector, FloatVector
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 
17 | # add pandas converter to the default rpy2 converter
18 | converter = default_converter + pandas2ri.converter
19 | 20 | 21 | 
22 | ### dsldPyQeFairKNN-------------------------------------------------------------------
23 | def dsldPyQeFairKNN(data, yName, sNames, deweightPars=None, yesYVal=None, k=25, scaleX=True):
24 |     r_data = dsld_Rpy2_IsRDataframe(data)
25 |     yName = robjects.StrVector([yName])
26 |     sNames = robjects.StrVector([sNames])
27 | 
28 |     if deweightPars is not None:
29 |         deweightPars = ListVector({key: FloatVector([val]) for key, val in deweightPars.items()})
30 |     else:
31 |         deweightPars = robjects.NULL
32 | 
33 |     if yesYVal is not None:
34 |         yesYVal = robjects.StrVector([yesYVal])
35 |     else:
36 |         yesYVal = robjects.NULL
37 | 
38 |     k = robjects.IntVector([k])
39 | 
40 |     scaleX = robjects.BoolVector([scaleX])
41 | 
42 |     dsld = get_dsld()
43 |     model = dsld.dsldQeFairKNN(r_data, yName, sNames, deweightPars, yesYVal, k, scaleX)
44 | 
45 |     preds = model.rx2("trainPreds")[0]
46 |     acc = model.rx2("trainAcc")
47 |     corrs = model.rx2("trainCorrs")
48 | 
49 |     with conversion.localconverter(converter):
50 |         corrs_df = conversion.rpy2py(corrs)
51 | 
52 |     result = {
53 |         "model": model,
54 |         "train_predictions": list(preds),
55 |         "train_accuracy": float(acc[0]),
56 |         "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"]))
57 |     }
58 |     return result
59 | 
60 | ### dsldPyQeFairRF-------------------------------------------------------------------
61 | def dsldPyQeFairRF(data, yName, sNames, deweightPars=None, nTree=500, minNodeSize=10, mtry=None, yesYVal=None):
62 | 
63 |     temp_data = dsld_Rpy2_RDataframeToPandas(data)
64 |     if mtry is None:
65 |         mtry = math.floor(math.sqrt(temp_data.shape[1]))
66 | 
67 |     r_data = dsld_Rpy2_IsRDataframe(data)
68 |     yName = robjects.StrVector([yName])
69 |     sNames = robjects.StrVector([sNames])
70 | 
71 |     if deweightPars is not None:
72 |         deweightPars = ListVector({key: FloatVector([val]) for key, val in deweightPars.items()})
73 |     else:
74 |         deweightPars = robjects.NULL
75 | 
76 |     if yesYVal is not None:
77 |         yesYVal = robjects.StrVector([yesYVal])
78 |     else:
79 |         yesYVal = robjects.NULL
80 | 
81 |     nTree = robjects.IntVector([nTree])
82 |     minNodeSize = robjects.IntVector([minNodeSize])
83 |     mtry = robjects.IntVector([mtry])
84 | 
85 |     dsld = get_dsld()
86 |     model = dsld.dsldQeFairRF(r_data, yName, sNames, deweightPars, nTree, minNodeSize, mtry, yesYVal)
87 | 
88 |     preds = model.rx2("trainPreds")[0]
89 |     acc = model.rx2("trainAcc")
90 |     corrs = model.rx2("trainCorrs")
91 | 
92 |     with conversion.localconverter(converter):
93 |         corrs_df = conversion.rpy2py(corrs)
94 | 
95 |     result = {
96 |         "model": model,
97 |         "train_predictions": list(preds),
98 |         "train_accuracy": float(acc[0]),
99 | 
"train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 100 | } 101 | return result 102 | 103 | ### dsldQeFairRidgeLin------------------------------------------------------------------- 104 | 105 | def dsldPyQeFairRidgeLin(data, yName, sNames, deweightPars=None): 106 | r_data = dsld_Rpy2_IsRDataframe(data) 107 | yName = robjects.StrVector([yName]) 108 | sNames = robjects.StrVector([sNames]) 109 | 110 | if deweightPars is not None: 111 | deweightPars = ListVector({k: FloatVector([v]) for k, v in deweightPars.items()}) 112 | else: 113 | deweightPars = robjects.NULL 114 | 115 | dsld = get_dsld() 116 | model = dsld.dsldQeFairRidgeLin(r_data, yName, sNames, deweightPars) 117 | 118 | preds = model.rx2("trainPreds")[0] 119 | acc = model.rx2("trainAcc") 120 | corrs = model.rx2("trainCorrs") 121 | 122 | with conversion.localconverter(converter): 123 | corrs_df = conversion.rpy2py(corrs) 124 | 125 | result = { 126 | "model": model, 127 | "train_predictions": list(preds), 128 | "train_accuracy": float(acc[0]), 129 | "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 130 | } 131 | return result 132 | 133 | ### dsldQeFairRidgeLog------------------------------------------------------------------- 134 | 135 | def dsldPyQeFairRidgeLog(data, yName, sNames, deweightPars=None, yesYVal=None): 136 | r_data = dsld_Rpy2_IsRDataframe(data) 137 | yName = robjects.StrVector([yName]) 138 | sNames = robjects.StrVector([sNames]) 139 | 140 | if deweightPars is not None: 141 | deweightPars = ListVector({k: FloatVector([v]) for k, v in deweightPars.items()}) 142 | else: 143 | deweightPars = robjects.NULL 144 | 145 | if yesYVal is not None: 146 | yesYVal = robjects.StrVector([yesYVal]) 147 | else: 148 | yesYVal = robjects.NULL 149 | 150 | dsld = get_dsld() 151 | model = dsld.dsldQeFairRidgeLog(r_data, yName, sNames, deweightPars, yesYVal) 152 | 153 | preds = model.rx2("trainPreds")[0] 154 | acc = model.rx2("trainAcc") 155 | corrs = model.rx2("trainCorrs") 156 | 157 | with conversion.localconverter(converter): 158 | corrs_df = conversion.rpy2py(corrs) 159 | 160 | result = { 161 | "model": model, 162 | "train_predictions": list(preds), 163 | "train_accuracy": float(acc[0]), 164 | "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 165 | } 166 | return result 167 | 168 | ### predict() method for all the models 169 | def dsldPyQeFairML_Predict(model, newData): 170 | robjects.r.assign("model", model['model']) 171 | xNew = dsld_Rpy2_IsRDataframe(newData) 172 | robjects.r.assign("xNew", xNew) 173 | result = robjects.r('predict(model, xNew)') 174 | names = list(result[1][0]) 175 | vals = [float(v) for v in result[1][1]] 176 | correlations = list(zip(names, vals)) 177 | output = {'test_predictions': list(result[0]), 178 | 'test_correlations': correlations} 179 | return output 180 | -------------------------------------------------------------------------------- /R/Utils.R: -------------------------------------------------------------------------------- 1 | 2 | # many functions in dsld are wrappers for functions in other packages; 3 | # in order to avoid "package bloat," we instead check for them as needed 4 | 5 | # e.g. say a dsld function f() wraps some function in package p; then 6 | # instead of listing p as imported etc. 
in the dsld DESCRIPTION file, 7 | # we write the top of f(), getSuggestedLib('p'); this loads p if it is 8 | # installed on the user's machine, otherwise so informs the user 9 | 10 | getSuggestedLib <- function(pkgName) { 11 | if (!requireNamespace(pkgName,quietly=TRUE)) 12 | stop(paste0(pkgName, ' not loaded')) 13 | } 14 | 15 | pr2file <- function(filename) 16 | { 17 | origdev <- dev.cur() 18 | parts <- strsplit(filename,".",fixed=TRUE) 19 | nparts <- length(parts[[1]]) 20 | suff <- parts[[1]][nparts] 21 | if (suff == "pdf") { 22 | pdf(filename) 23 | } 24 | else if (suff == "png") { 25 | png(filename,bg='white') 26 | } 27 | else jpeg(filename) 28 | devnum <- dev.cur() 29 | dev.set(origdev) 30 | dev.copy(which = devnum) 31 | dev.set(devnum) 32 | dev.off() 33 | dev.set(origdev) 34 | } 35 | 36 | # generates a "cartesian product" of factor levels from input factors 37 | cartFactorLvls <- function(factorNames) 38 | { 39 | theLevels <- lapply(factorNames,function(fName) levels(get(fName))) 40 | expand.grid(theLevels) 41 | } 42 | 43 | ## needed for dsldLinear, dsldLogit ------------------------------------------- 44 | ### selects 5 rows for comparison across each level of the sensitive variable 45 | ### randomly if the user doesn't supply data in the interactions case 46 | 47 | dsldGetRow5 <- function(data, yName, sName) { 48 | rows <- sample(nrow(data), 5) 49 | reducedData <- data[rows, ] 50 | columns <- c(yName, sName) 51 | newDat <- reducedData[, !(names(reducedData) %in% columns)] 52 | result <- sprintf("No user sComparisonPts supplied. The following rows 53 | are selected: %s,%s,%s,%s,%s", rows[1],rows[2],rows[3],rows[4], 54 | rows[5]); print(result) 55 | return(newDat) 56 | } 57 | 58 | ## needed for: python interfaces ---------------------------------------------- 59 | ### convert data to factors and numeric as per user input 60 | convert_cols <- function(data, cat_features = character(), num_features = character()) { 61 | # If both vectors are missing or empty, return original data unchanged 62 | if ((missing(cat_features) || length(cat_features) == 0) && 63 | (missing(num_features) || length(num_features) == 0)) { 64 | return(data) 65 | } 66 | 67 | data[] <- lapply(names(data), function(col) { 68 | if (col %in% cat_features) { 69 | factor(data[[col]]) 70 | } else if (col %in% num_features) { 71 | as.numeric(data[[col]]) 72 | } else { 73 | data[[col]] 74 | } 75 | }) 76 | 77 | names(data) <- names(data) # preserve original column names 78 | data 79 | } 80 | 81 | ### stores factors levels for each factor in dataset 82 | factor_levels <- function(data) { 83 | stopifnot(is.data.frame(data)) 84 | facs <- names(Filter(is.factor, data)) 85 | setNames(lapply(facs, function(nm) levels(data[[nm]])), facs) 86 | } 87 | 88 | ### applies factor levels from each factor in dataset 89 | apply_factor_levels <- function(test_data, train_levels, quiet = TRUE) { 90 | stopifnot(is.data.frame(test_data), is.list(train_levels)) 91 | cols <- intersect(names(train_levels), names(test_data)) 92 | if (!quiet) { 93 | skipped <- setdiff(names(train_levels), names(test_data)) 94 | if (length(skipped)) message("Skipping missing columns: ", paste(skipped, collapse = ", ")) 95 | } 96 | 97 | out <- test_data 98 | for (nm in cols) { 99 | levs <- train_levels[[nm]] 100 | v <- out[[nm]] 101 | v_chr <- as.character(v) # works for factor/char/anything coercible 102 | fac <- factor(v_chr, levels = levs) # unseen -> NA 103 | if (!quiet && all(is.na(fac))) { 104 | warning("Column '", nm, "' became all NA after applying training 
levels.") 105 | } 106 | out[[nm]] <- fac 107 | } 108 | out 109 | } 110 | 111 | ## needed for fairML and EDF-Fair functions ----------------------------------- 112 | ### converts integer cols to numeric and character cols to factors 113 | toNumericFactor <- function(data) { 114 | data[,unlist(lapply(data, is.integer))] <- 115 | lapply(data[,unlist(lapply(data, is.integer))], as.numeric) 116 | data[,unlist(lapply(data, is.character))] <- 117 | lapply(data[,unlist(lapply(data, is.character))], as.factor) 118 | data 119 | } 120 | 121 | ### computes correlation between predictions and one or more sensitive attributes 122 | s_correlations <- function(data, sNames, predictions, 123 | method = "pearson", 124 | sort_by_abs = TRUE) { 125 | stopifnot(is.data.frame(data)) 126 | 127 | # normalize sNames 128 | if (length(sNames) == 1L && is.character(sNames) && grepl(",", sNames, fixed = TRUE)) { 129 | sNames <- trimws(strsplit(sNames, ",", fixed = TRUE)[[1]]) 130 | } 131 | sNames <- unique(as.character(sNames[nzchar(sNames)])) 132 | 133 | if (length(predictions) != nrow(data)) { 134 | stop("`predictions` length (", length(predictions), 135 | ") must equal nrow(data) (", nrow(data), ").") 136 | } 137 | y <- as.numeric(predictions) 138 | 139 | blocks <- list() 140 | 141 | for (s in sNames) { 142 | if (!s %in% names(data)) { 143 | warning("Skipping missing column: ", s) 144 | next 145 | } 146 | v <- data[[s]] 147 | 148 | # coerce characters to factors 149 | if (is.character(v)) v <- factor(v) 150 | 151 | if (is.factor(v)) { 152 | v <- droplevels(v) 153 | # If all NA or fewer than 2 levels, skip to avoid contrasts error 154 | if (all(is.na(v))) { 155 | warning("Skipping '", s, "': all values are NA after level alignment.") 156 | next 157 | } 158 | if (nlevels(v) < 2L) { 159 | warning("Skipping '", s, "': factor has fewer than 2 observed levels.") 160 | next 161 | } 162 | mm <- model.matrix(~ v - 1) # safe now 163 | colnames(mm) <- paste0(s, "==", levels(v)) 164 | blocks[[s]] <- mm 165 | 166 | } else if (is.logical(v)) { 167 | blocks[[s]] <- matrix(as.numeric(v), ncol = 1, dimnames = list(NULL, s)) 168 | 169 | } else if (is.numeric(v) || is.integer(v)) { 170 | blocks[[s]] <- matrix(as.numeric(v), ncol = 1, dimnames = list(NULL, s)) 171 | 172 | } else { 173 | warning("Skipping unsupported type for ", s, 174 | " (class: ", paste(class(v), collapse = "/"), ")") 175 | } 176 | } 177 | 178 | if (!length(blocks)) { 179 | return(data.frame(feature = character(0), correlation = numeric(0))) 180 | } 181 | 182 | X <- do.call(cbind, blocks) 183 | 184 | cors <- vapply(seq_len(ncol(X)), 185 | function(j) cor(y, X[, j], use = "pairwise.complete.obs", method = method), 186 | numeric(1)) 187 | names(cors) <- colnames(X) 188 | 189 | if (isTRUE(sort_by_abs)) cors <- cors[order(abs(cors), decreasing = TRUE)] 190 | 191 | data.frame(feature = names(cors), correlation = as.numeric(cors), row.names = NULL) 192 | } 193 | -------------------------------------------------------------------------------- /R/dsldFairUtils.R: -------------------------------------------------------------------------------- 1 | 2 | # useful helpers 3 | ### create k-fold split 4 | make_folds <- function(n, k = 5) { 5 | stopifnot(n >= k, k >= 2) 6 | idx <- sample.int(n) # shuffle rows 7 | split(idx, cut(seq_along(idx), breaks = k, labels = FALSE)) 8 | } 9 | 10 | get_fold_split <- function(data, folds, i) { 11 | stopifnot(i >= 1, i <= length(folds)) 12 | test_idx <- folds[[i]] 13 | train_idx <- setdiff(seq_len(nrow(data)), test_idx) 14 | list( 15 | train = 
data[train_idx, , drop = FALSE], 16 | test = data[test_idx, , drop = FALSE] 17 | ) 18 | } 19 | 20 | # Helper: keep only arguments that the target function supports (by name) 21 | .filter_model_args <- function(ftn_name, user_args) { 22 | if (is.null(user_args) || !length(user_args)) return(list()) 23 | ftn <- match.fun(ftn_name) 24 | allowed <- names(formals(ftn)) 25 | keep <- names(user_args) %in% allowed 26 | if (any(!keep)) { 27 | dropped <- names(user_args)[!keep] 28 | warning(sprintf("Ignoring unsupported arg(s) for %s: %s", 29 | ftn_name, paste(dropped, collapse = ", ")), 30 | call. = FALSE) 31 | } 32 | user_args[keep] 33 | } 34 | 35 | dsldFairUtils <- function(data, yName, sName, dsldFTNName, unfairness = NULL, 36 | deweightPars = NULL, yesYVal = NULL, k_folds = 5, 37 | model_args = NULL) { 38 | 39 | valid_models <- c("dsldQeFairKNN", "dsldQeFairRF", "dsldQeFairRidgeLin", "dsldQeFairRidgeLog", 40 | "dsldFrrm", "dsldFgrrm", "dsldNclm", "dsldZlm", "dsldZlrm") 41 | if (!(dsldFTNName %in% valid_models)) stop("Invalid dsldFTNName specified") 42 | 43 | # classification gating 44 | if (is.factor(data[[yName]])) { 45 | if (is.null(yesYVal)) stop("missing yesYVal") 46 | data[[yName]] <- as.factor(as.numeric(data[[yName]] == yesYVal)) 47 | yesYVal <- "1" 48 | } 49 | 50 | if (dsldFTNName %in% c("dsldQeFairKNN","dsldQeFairRF","dsldQeFairRidgeLin","dsldQeFairRidgeLog")) { 51 | 52 | # --- build grid of deweightPars combos --- 53 | if (is.null(deweightPars) || !length(deweightPars)) 54 | stop("Provide deweightPars as a named list. For a grid, use vectors (e.g. list(occ=c(0.2,0.4), educ=c(0.4))).") 55 | 56 | # If any element has length > 1, treat as grid; else one single combo 57 | is_grid <- any(vapply(deweightPars, length, integer(1)) > 1) 58 | grid_df <- if (is_grid) { 59 | expand.grid(deweightPars, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) 60 | } else { 61 | # single row data.frame so we can reuse the same loop 62 | as.data.frame(as.list(deweightPars), stringsAsFactors = FALSE) 63 | } 64 | 65 | # Pre-filter user-supplied model args to only those supported by the target function 66 | extra <- .filter_model_args(dsldFTNName, model_args) 67 | 68 | rows <- vector("list", nrow(grid_df)) 69 | 70 | # Cache folds once (same across combos) 71 | folds <- make_folds(nrow(data), k = k_folds) 72 | 73 | # Inner CV runner for one deweight combination (named numeric list) 74 | run_one_combo_1 <- function(dw_list_named) { 75 | accs <- numeric(length(folds)) 76 | corr_sums <- NULL 77 | feat_names <- NULL 78 | 79 | for (i in seq_along(folds)) { 80 | split <- get_fold_split(data, folds, i) 81 | trn <- split$train 82 | tst <- split$test 83 | y_test <- tst[[yName]] 84 | tst_x <- tst[, setdiff(names(tst), yName), drop = FALSE] 85 | 86 | base_args <- list( 87 | data = trn, 88 | yName = yName, 89 | sNames = sName, 90 | deweightPars = dw_list_named 91 | ) 92 | 93 | if (!is.null(yesYVal)) base_args$yesYVal <- yesYVal 94 | 95 | call_args <- utils::modifyList(base_args, extra, keep.null = TRUE) 96 | fitted <- do.call(dsldFTNName, call_args) 97 | 98 | res <- predict(fitted, tst_x) 99 | corrs <- res$correlations # data.frame: feature, correlation 100 | 101 | if (is.null(feat_names)) { 102 | feat_names <- as.character(corrs$feature) 103 | corr_sums <- setNames(numeric(length(feat_names)), feat_names) 104 | } 105 | corr_sums[as.character(corrs$feature)] <- 106 | corr_sums[as.character(corrs$feature)] + corrs$correlation 107 | 108 | if (!is.null(yesYVal)) { 109 | accs[i] <- mean(y_test != as.integer(res$preds$probs > 
0.5)) 110 | } else { 111 | accs[i] <- mean(abs(res$preds - y_test)) 112 | } 113 | } 114 | 115 | # averages for this combo 116 | mean_acc <- mean(accs) 117 | mean_corrs <- corr_sums / length(folds) 118 | 119 | # return as named list: testAcc + correlation columns 120 | c(list(testAcc = mean_acc), as.list(mean_corrs)) 121 | } 122 | 123 | # Loop over each row of the grid 124 | for (r in seq_len(nrow(grid_df))) { 125 | 126 | dw_row <- lapply(grid_df[r, , drop = FALSE], function(x) as.numeric(x)[1]) 127 | names(dw_row) <- names(grid_df) 128 | 129 | metrics <- run_one_combo_1(dw_row) 130 | 131 | # Build a single row: params first, then metrics; keep raw names for correlation columns 132 | rows[[r]] <- as.data.frame(c(as.list(dw_row), metrics), check.names = FALSE) 133 | } 134 | 135 | # Bind all rows; ensure parameter columns come first 136 | out <- do.call(rbind, rows) 137 | rownames(out) <- NULL 138 | return(out) 139 | 140 | 141 | # fairML wrappers 142 | } else { 143 | if (is.null(unfairness) || !length(unfairness)) 144 | stop("Provide unfairness as a vector of numbers between (0,1]. For example: unfairness = c(0.2, 0.9).") 145 | 146 | # Pre-filter user-supplied model args to only those supported by the target function 147 | extra <- .filter_model_args(dsldFTNName, model_args) 148 | 149 | # Cache folds once (same across combos) 150 | folds <- make_folds(nrow(data), k = k_folds) 151 | 152 | # Inner CV runner for one unfairness value 153 | run_one_combo_2 <- function(u_val) { 154 | accs <- numeric(length(folds)) 155 | corr_sums <- NULL 156 | feat_names <- NULL 157 | 158 | for (i in seq_along(folds)) { 159 | split <- get_fold_split(data, folds, i) 160 | trn <- split$train 161 | tst <- split$test 162 | y_test <- tst[[yName]] 163 | tst_x <- tst[, setdiff(names(tst), yName), drop = FALSE] 164 | 165 | base_args <- list( 166 | data = trn, 167 | yName = yName, 168 | sName = sName, 169 | unfairness = u_val 170 | ) 171 | if (!is.null(yesYVal)) base_args$yesYVal <- yesYVal 172 | 173 | call_args <- utils::modifyList(base_args, extra, keep.null = TRUE) 174 | fitted <- do.call(dsldFTNName, call_args) 175 | 176 | res <- predict(fitted, tst_x) 177 | corrs <- res$correlations # data.frame: feature, correlation 178 | 179 | if (is.null(feat_names)) { 180 | feat_names <- as.character(corrs$feature) 181 | corr_sums <- setNames(numeric(length(feat_names)), feat_names) 182 | } 183 | corr_sums[as.character(corrs$feature)] <- 184 | corr_sums[as.character(corrs$feature)] + corrs$correlation 185 | 186 | if (!is.null(yesYVal)) { 187 | accs[i] <- mean(y_test != as.integer(res$preds > 0.5)) 188 | } else { 189 | accs[i] <- mean(abs(res$preds - y_test)) 190 | } 191 | } 192 | 193 | # averages for this unfairness value 194 | mean_acc <- mean(accs) 195 | mean_corrs <- corr_sums / length(folds) 196 | 197 | # return as named list: testAcc + correlation columns 198 | c(list(testAcc = mean_acc), as.list(mean_corrs)) 199 | } 200 | 201 | # Loop over the unfairness vector (no grid_df here) 202 | rows <- vector("list", length(unfairness)) 203 | for (u_idx in seq_along(unfairness)) { 204 | unfairVal <- as.numeric(unfairness[u_idx]) 205 | m <- as.list(run_one_combo_2(unfairVal)) # named list: testAcc + corr cols 206 | 207 | rows[[u_idx]] <- data.frame( 208 | c( 209 | list(unfairness = unfairVal, testAcc = m$testAcc), 210 | m[setdiff(names(m), "testAcc")] 211 | ), 212 | check.names = FALSE 213 | ) 214 | } 215 | 216 | out <- do.call(rbind, rows) 217 | rownames(out) <- NULL 218 | return(out) 219 | } 220 | } 221 | 222 | 223 | 224 | 225 | 
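# ---- Usage sketch (not run) ----
# Explore the fairness-utility tradeoff over a grid of deweighting
# parameters for one of the EDF wrappers; each row of the returned data
# frame holds one parameter combination, its cross-validated testAcc, and
# the correlations between predictions and the sensitive-variable levels.
#
#   data(svcensus)
#   dsldFairUtils(svcensus, 'wageinc', 'gender', 'dsldQeFairKNN',
#                 deweightPars = list(occ = c(0.1, 0.5), age = c(0.2, 0.6)))
#
# For the fairML wrappers, supply a vector of unfairness values instead:
#
#   dsldFairUtils(svcensus, 'wageinc', 'gender', 'dsldNclm',
#                 unfairness = c(0.2, 0.5, 0.9))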
-------------------------------------------------------------------------------- /inst/examples/tabular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "81e2b54a", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Examples for analytical/tabular methods provided by dsldPy\n", 9 | "\n", 10 | "The goal is for users to apply analytical/tabular methods with simple, intuitive interface. The following functions are included for python:\n", 11 | "\n", 12 | "1. dsldLinear, dsldLogit, and dsldML \n", 13 | "2. dsldTakeALookAround\n", 14 | "3. dsldHunting (both C/O hunting functions)\n", 15 | "4. dsldFrequencybyS \n", 16 | "5. dsldMatchedAte" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "17972c5f", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "## requires R and the dsld (R) package installed\n", 27 | "# !pip install dsldPy" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "675ca88a", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Load necessary libraries\n", 38 | "\n", 39 | "from dsldPy import (\n", 40 | "# data reading and preprocessing\n", 41 | "preprocess_data, read_data,\n", 42 | "\n", 43 | "# linear/logistic/ML comparisons\n", 44 | "dsldPyLinear, dsldPyLinearSummary, dsldPyLinearPredict, dsldPyLinearVcov, dsldPyLinearCoef, dsldPyLinearGetData,\n", 45 | "dsldPyLogit, dsldPyLogitSummary, dsldPyLogitPredict, dsldPyLogitVcov, dsldPyLogitCoef, dsldPyLogitGetData,\n", 46 | "dsldPyML,\n", 47 | "\n", 48 | "# takeALookAround\n", 49 | "dsldPyTakeALookAround, \n", 50 | "\n", 51 | "# hunting\n", 52 | "dsldPyCHunting, dsldPyOHunting, \n", 53 | "\n", 54 | "# frequency table\n", 55 | "dsldPyFrequencybyS,\n", 56 | "\n", 57 | "# causal inference\n", 58 | "dsldPyMatchedATE\n", 59 | ")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "c9c7ac54", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "### dsldLinear, dsldLogit, dsldML examples \n", 70 | "\n", 71 | "### data preprocessing\n", 72 | "\n", 73 | "### all dsldPy functions require a R data frame object as input (NOT pandas dataframe)\n", 74 | "### the preprocessing is done by the function preprocess_data\n", 75 | "### user needs to manually provide the categorical and numerical features (list)\n", 76 | "### the function preprocess_data returns a R data.frame object -> required input for the dsldPy functions\n", 77 | "\n", 78 | "# svcensus data\n", 79 | "# Replace with your own path to the svcensus.RData file\n", 80 | "# df = read_data(\"\")\n", 81 | "\n", 82 | "# preprocess data\n", 83 | "cat_features = ['educ', 'occ', 'gender']\n", 84 | "num_features= ['age', 'wageinc', 'wkswrkd']\n", 85 | "svcensus = preprocess_data(df, cat_features, num_features)\n", 86 | "\n", 87 | "df_10 = df.head(2)\n", 88 | "df_10 = df_10[['age', 'educ', 'occ', 'wkswrkd']]\n", 89 | "cat_features = ['educ', 'occ']\n", 90 | "num_features = ['age','wkswrkd']\n", 91 | "svcensus_comparisons_points = preprocess_data(df_10, cat_features, num_features)\n", 92 | "\n", 93 | "# compas1 data\n", 94 | "# Replace with your own path to the compas1.RData file\n", 95 | "# df = read_data(\"\")\n", 96 | "\n", 97 | "# preprocess data\n", 98 | "cat_features = [\"sex\", \"two_year_recid\", \"race\"]\n", 99 | "num_features = 
[\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 100 | "compas1 = preprocess_data(df, cat_features, num_features)\n", 101 | "\n", 102 | "df_10 = df.head(2)\n", 103 | "df_10 = df_10[[\"sex\", \"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]]\n", 104 | "cat_features = [\"sex\"]\n", 105 | "num_features = [\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 106 | "compas1_comparisons_points = preprocess_data(df_10, cat_features, num_features)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "ddfb760f", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "### 1. ------------------------------ dsldPyLinear/dsldPyLogit/dsldPyML ------------------------------\n", 117 | "\n", 118 | "## dsldPyLinear - interactions = True\n", 119 | "a = dsldPyLinear(data = svcensus, yName = 'wageinc', sName = 'gender', interactions = True)\n", 120 | "\n", 121 | "### the object a is a list of R objects --- can be accessed using the following functions \n", 122 | "### note that directly looking at 'a' might not be helpful --- use the following functions to access the results and use in python\n", 123 | "\n", 124 | "# uncomment to see the results of the functions\n", 125 | "# dsldPyLinearSummary(a) \n", 126 | "# dsldPyLinearCoef(a)\n", 127 | "# dsldPyLinearVcov(a)\n", 128 | "# dsldPyLinearGetData(a)\n", 129 | "\n", 130 | "# predict()\n", 131 | "preds = dsldPyLinearPredict(a, svcensus_comparisons_points)\n", 132 | "preds\n", 133 | "\n", 134 | "### can also work with interactions = False as well\n", 135 | "a2 = dsldPyLinear(data = svcensus, yName = 'wageinc', sName = 'gender', interactions = False)\n", 136 | "\n", 137 | "# dsldPyLinearSummary(a2) \n", 138 | "# dsldPyLinearCoef(a2)\n", 139 | "# dsldPyLinearVcov(a2)\n", 140 | "# dsldPyLinearGetData(a2)\n", 141 | "\n", 142 | "## the predict() method requires newData to include S (which is not done)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "70d2ed58", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# dsldPyLogit - interactions = True\n", 153 | "\n", 154 | "b = dsldPyLogit(data = compas1, yName = 'two_year_recid', sName = 'race', interactions = True, yesYVal = \"Yes\")\n", 155 | "\n", 156 | "### the object b is a list of R objects --- can be accessed using the following functions \n", 157 | "### note that directly looking at 'b' might not be helpful --- use the following functions to access the results and use in python\n", 158 | "\n", 159 | "# uncomment to see the results of the functions\n", 160 | "# dsldPyLogitSummary(b)\n", 161 | "# dsldPyLogitCoef(b)\n", 162 | "# dsldPyLogitVcov(b)\n", 163 | "# dsldPyLogitGetData(b)\n", 164 | "\n", 165 | "# predict()\n", 166 | "preds = dsldPyLogitPredict(b, compas1_comparisons_points)\n", 167 | "preds\n", 168 | "\n", 169 | "### can also work with interactions = False as well\n", 170 | "b2 = dsldPyLogit(data = compas1, yName = 'two_year_recid', sName = 'race', interactions = False, yesYVal = \"Yes\")\n", 171 | "\n", 172 | "# dsldPyLogitSummary(b2)\n", 173 | "# 
dsldPyLogitCoef(b2)\n", 174 | "# dsldPyLogitVcov(b2)\n", 175 | "# dsldPyLogitGetData(b2)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "395702ad", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "## dsldPyML - returns testAcc for each sLevel and dataframe (excluding yName and sName) of predictions\n", 186 | "### works for several qeML functions as far as I've tried\n", 187 | "c = dsldPyML(data = svcensus, yName = 'wageinc', sName = 'gender', qeMLftnName = 'qeKNN',sComparisonPts='rand5')\n", 188 | "print(c)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "ea67194c", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "### 2. ------------------------------ dsldTakeALookAround ------------------------------\n", 199 | "dsldPyTakeALookAround(data = svcensus, yName = 'wageinc', sName = 'gender', maxFeatureSize = 4) " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "db089aa9", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "### 3. ------------------------------ dsldHunting ------------------------------\n", 210 | "\n", 211 | "# dsldPyCHunting - C-Hunting\n", 212 | "a = dsldPyCHunting(data = svcensus, yName = 'wageinc',sName = 'gender')\n", 213 | "\n", 214 | "# # dsldPyOHunting - O-Hunting\n", 215 | "b = dsldPyOHunting(data = svcensus, yName = 'wageinc', sName = 'gender')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "22d1a166", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "print(a)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "cad52d06", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "print(b)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "32ab71e2", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "### 4. ------------------------------ dsldFrequencybyS ------------------------------\n", 246 | "dsldPyFrequencybyS(data = svcensus, cName = 'educ', sName= 'gender')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "99ef6117", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "### 5. 
------------------------------ dsldMatchedAte ------------------------------\n",
257 | "dsldPyMatchedATE(data = compas1, yName='two_year_recid', sName='race', yesSVal='Caucasian')"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "id": "f483cd46",
264 | "metadata": {},
265 | "outputs": [],
266 | "source": []
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "dsld",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.12.11"
286 | }
287 | },
288 | "nbformat": 4,
289 | "nbformat_minor": 5
290 | }
291 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | # DSLD: Data Science Looks at Discrimination
3 | 
4 | [![CRAN Status](https://www.r-pkg.org/badges/version/dsld)](https://cran.r-project.org/package=dsld)
5 | 
6 | > Statistical and graphical tools for detecting and measuring discrimination and bias in data
7 | 
8 | This is an R package with Python interfaces available.
9 | 
10 | ## Overview
11 | 
12 | Discrimination is a key social issue in the United States and in a
13 | number of other countries. There is a great deal of available data with which
14 | one might investigate possible discrimination. But how might such
15 | investigations be conducted?
16 | 
17 | Our **DSLD** package provides statistical and graphical tools for
18 | detecting and measuring discrimination and bias, be it racial, gender,
19 | age or other. It is widely applicable; here are just a few possible use
20 | cases:
21 | 
22 | - Quantitative analysis in instruction and research in the social sciences.
23 | - Corporate HR analysis and research.
24 | - Litigation involving discrimination and related issues.
25 | - Concerned citizenry.
26 | 
27 | This package is broadly aimed at users ranging from instructors of
28 | statistics classes to legal professionals, as it offers a powerful yet
29 | intuitive approach to discrimination analysis. It also includes an 80-page
30 | **Quarto book** that serves as a guide to the key statistical principles and
31 | their applications.
32 | 
33 | - **Quarto Book**: [Book](https://htmlpreview.github.io/?https://github.com/matloff/dsldBook/blob/main/_book/index.html) - Important statistical principles and applications.
34 | - **Research Paper**: [Paper](https://arxiv.org/abs/2411.04228) - Package implementation details.
35 | 
36 | ## Installation:
37 | 
38 | From CRAN:
39 | 
40 | ```r
41 | install.packages("dsld")
42 | ```
43 | 44 | 50 | 
51 | ## Analysis categories:
52 | 
53 | DSLD addresses two main types of bias analysis:
54 | 
55 | **Estimation Analysis**: Investigates possible discrimination by
56 | estimating effects while accounting for confounders. Confounders are
57 | variables that may affect the outcome variable other than through
58 | the sensitive variable. DSLD provides both analytical and graphical functions
59 | for this purpose.
60 | 
61 | **Prediction Analysis**: Addresses algorithmic bias in machine learning
62 | by excluding sensitive variables while controlling proxy effects.
63 | Proxies are variables strongly related to the sensitive variable that
64 | could indirectly introduce bias.
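In code, the two kinds of analysis look like this (a preview sketch using the package's own **svcensus** data; both functions are covered in detail below):

```r
data(svcensus)

# estimation: the sensitive variable (gender) is included, and we
# adjust for confounders such as age
z <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = FALSE)

# prediction: the sensitive variable is excluded, and the proxy 'occ'
# is deweighted
knnOut <- dsldQeFairKNN(svcensus, 'wageinc', 'gender',
                        deweightPars = list(occ = 0.2))
```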
65 | 
66 | The first case examines *societal* or *institutional bias*. The second case
67 | focuses on *algorithmic bias*.
68 | 
69 | 
70 | | Statistical Analysis | Fair Machine Learning |
71 | |---|---|
72 | | Estimate an effect | Predict an outcome |
73 | | Harm comes from society | Harm comes from algorithm |
74 | | Include sensitive variables | Exclude sensitive variables |
75 | | Adjust for covariates | Limit proxy impact |
76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 
91 | 
92 | We now tour a small subset of dsld's features using the **svcensus** data
93 | included in the package.
94 | 
95 | ### The data
96 | 
97 | The **svcensus** dataset records income across 6 different
98 | engineering occupations. Its features are 'age', 'education level',
99 | 'occupation', 'wage income', 'number of weeks worked', and 'gender'.
100 | 
101 | ```R
102 | > data(svcensus)
103 | > head(svcensus)
104 | age educ occ wageinc wkswrkd gender
105 | 1 50.30082 zzzOther 102 75000 52 female
106 | 2 41.10139 zzzOther 101 12300 20 male
107 | 3 24.67374 zzzOther 102 15400 52 female
108 | 4 50.19951 zzzOther 100 0 52 male
109 | 5 51.18112 zzzOther 100 160 1 female
110 | 6 57.70413 zzzOther 100 0 0 male
111 | ```
112 | 
113 | We will use only a few features to keep things simple. The *Quarto Book*
114 | provides a more extensive analysis of the examples shown below.
115 | 
116 | ## Part One: Adjustment for Confounders
117 | 
118 | We want to estimate the impact of a sensitive variable [S]
119 | on an outcome variable [Y], while accounting for confounders [C]. Let's
120 | call such analysis "confounder adjustment."
121 | 
122 | ### Estimation Example
123 | 
124 | We are investigating a possible gender pay gap between men and women.
125 | Here, [Y] is wage and [S] is gender. We will treat age as a confounder [C],
126 | using a linear model. For simplicity, no other confounders (such as occupation)
127 | or any other predictors [X] are included in this example.
128 | 
129 | **No interactions**
130 | 
131 | ```r
132 | > data(svcensus)
133 | > svcensus <- svcensus[,c(1,4,6)] # subset columns: age, wage, gender
134 | > z <- dsldLinear(svcensus,'wageinc','gender', interactions = FALSE)
135 | > summary(z) # show coefficients of linear model
136 | 
137 | $`Summary Coefficients`
138 | Covariate Estimate StandardError PValue
139 | 1 (Intercept) 31079.9174 1378.08158 0
140 | 2 age 489.5728 30.26461 0
141 | 3 gendermale 13098.2091 790.44515 0
142 | 
143 | $`Sensitive Factor Level Comparisons`
144 | Factors Compared Estimates Standard Errors P-Value
145 | Estimate male - female 13098.21 790.4451 0
146 | ```
147 | Our linear model can be written as:
148 | 
149 | > E(W) = $\beta_0$ + $\beta_1$ A + $\beta_2$ M
150 | 
151 | Here *W* denotes wage
152 | income, *A* is age, and *M* is an indicator variable, with M = 1 for men and
153 | M = 0 for women.
154 | 155 | 156 | 
157 | $\beta_2$ represents the gender wage gap at any age. The model estimates that men earn
158 | about $13,000 more than women across *all* ages. However, the wage gap might also vary by age.
159 | We test for such interactions by fitting separate models for men and women, for example comparing ages 36 and 43:
160 |
161 | **Interactions**
162 | ```R
163 | newData <- data.frame(age=c(36,43))
164 | z <- dsldLinear(svcensus,'wageinc','gender',interactions=TRUE, newData)
165 | summary(z)
166 |
167 | $female
168 | Covariate Estimate StandardError PValue
169 | 1 (Intercept) 30551.4302 2123.44361 0
170 | 2 age 502.9624 52.07742 0
171 |
172 | $male
173 | Covariate Estimate StandardError PValue
174 | 1 (Intercept) 44313.159 1484.82216 0
175 | 2 age 486.161 36.02116 0
176 |
177 | $`Sensitive Factor Level Comparisons`
178 | Factors Compared New Data Row Estimates Standard Errors
179 | 1 female - male 1 -13156.88 710.9696
180 | 2 female - male 2 -13039.27 710.7782
181 | ```
182 |
183 | The gender pay gap (female minus male) is -$13,157 at age 36 and -$13,039 at age 43, differing by only $118.
184 | This suggests minimal age-gender interaction. We focused only on age as a confounder,
185 | but other variables, such as occupation, could be included depending on the analysis goals.
186 |
187 | ## Part Two: Discovering/Mitigating Bias in Machine Learning
188 |
189 | We are predicting [Y] from a feature set [X] and a sensitive variable [S].
190 | We want to minimize the effect of [S], along with any proxies [O] in [X] that may
191 | be correlated with [S]. The inherent trade-off is that increasing fairness (minimizing the
192 | influence of [S] and [O]) reduces utility, i.e. predictive accuracy. The package
193 | provides wrappers for several fair machine learning methods (see the function list below).
194 |
195 | ### Prediction Example
196 |
197 | **Goal**: Predict wage income while minimizing gender bias by limiting the
198 | impact of occupation as a proxy variable.
199 |
200 | **Setup**:
201 | - **Outcome [Y]**: Wage income
202 | - **Sensitive Variable [S]**: Gender
203 | - **Proxy Variable [O]**: Occupation (deweighted to 0.2)
204 | - **Method**: Fair K-Nearest Neighbors using `dsldQeFairKNN()`; a call sketch follows the results table below
| Fairness/Utility Tradeoff | Fairness (correlation with S) | Accuracy (test error, $) |
|---------------------------|-------------------------------|--------------------------|
| K-Nearest Neighbors | 0.1943313 | 25452.08 |
| Fair K-NN (via EDFFair) | 0.0814919 | 26291.38 |
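A sketch of roughly how the comparison above can be reproduced. The `testAcc` and `corrs` fields are assumptions based on the qeML-style return convention, and the exact numbers will vary with the random holdout set:

```r
library(dsld)
data(svcensus)

# Fair K-NN with the occupation proxy deweighted to 0.2
fairKNN <- dsldQeFairKNN(svcensus, yName = 'wageinc', sNames = 'gender',
                         deweightPars = list(occ = 0.2))

fairKNN$testAcc  # mean absolute prediction error on the holdout set
fairKNN$corrs    # correlation between predicted wage and gender
```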
225 |
226 | The base K-NN model shows a 0.194 correlation between predicted wage and gender, with a $25,452 prediction error. Using `dsldQeFairKNN`, the correlation drops to 0.081, but test error increases by $839. This illustrates the fairness-utility trade-off. Users can test parameter combinations to find their optimal balance; the `dsldFairUtils` function facilitates this search.
227 |
228 | ## Function List
229 |
230 | 1. Graphical Functions

| Function | Description | Use Case |
|----------|-------------|----------|
| dsldFreqPCoord | Frequency-based parallel coordinates | Visualizing multivariate relationships |
| dsldScatterPlot3D | 3D scatter plots with color coding | Exploring 3D data relationships |
| dsldConditDisparity | Conditional disparity plots | Detecting Simpson's Paradox |
| dsldDensityByS | Density plots by sensitive variable | Comparing distributions across groups |
| dsldConfounders | Confounder analysis | Identifying confounding variables |
| dsldIamb | Constraint-based structure learning | Fitting a causal model to data |
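As a taste of the graphical functions, here is a sketch; the argument patterns follow the package's yName/sName convention, but consult each function's help page for the exact signature:

```r
library(dsld)
data(svcensus)

# Parallel-coordinates view of frequent patterns, grouped by gender
dsldFreqPCoord(svcensus, m = 10, sName = 'gender')

# Mean wage as a function of age, one curve per gender; diverging or
# crossing curves can flag Simpson's-Paradox-type effects
dsldConditDisparity(svcensus, yName = 'wageinc', sName = 'gender',
                    xName = 'age')
```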
269 |
270 | 2. Analytical Functions

| Function | Description | Use Case |
|----------|-------------|----------|
| dsldLinear | Linear regression with sensitive group comparisons | Regression outcome analysis |
| dsldLogit | Logistic regression with sensitive group comparisons | Binary outcome analysis |
| dsldML | Machine learning with sensitive group comparisons | Analysis via non-parametric models (KNN, RF) |
| dsldTakeALookAround | Feature set evaluation | Assessing prediction fairness |
| dsldCHunting | Confounder hunting | Finding variables that predict both Y and S |
| dsldOHunting | Proxy hunting | Finding variables that predict S |
| dsldMatchedAte | Causal inference via matching | Estimating treatment effects |
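A brief sketch of the hunting and matching tools. The calls follow the package's yName/sName convention; the matched-ATE call mirrors the `dsldPyMatchedATE` call in the package's Python examples, though the exact R signature may differ:

```r
library(dsld)
data(svcensus)
data(compas1)

# Confounder hunting: which variables predict both wage (Y) and gender (S)?
dsldCHunting(svcensus, yName = 'wageinc', sName = 'gender')

# Proxy hunting: which variables predict gender, and could leak it into a model?
dsldOHunting(svcensus, yName = 'wageinc', sName = 'gender')

# Matching-based average treatment effect of race on recidivism
dsldMatchedATE(compas1, yName = 'two_year_recid', sName = 'race',
               yesSVal = 'Caucasian')
```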
314 |
315 | 3. Fair Machine Learning Functions

| Function | Description | Package |
|----------|-------------|---------|
| dsldFairML | FairML algorithm wrappers | FairML |
| dsldQeFairML | EDFFair algorithm wrappers | EDFFair |
| dsldFairUtils | Grid search and parameter optimization for fair ML | |
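A sketch of one FairML wrapper and the tuning helper. The `dsldFrrm` arguments mirror `fairml::frrm()`, and the `dsldFairUtils` arguments mirror the `dsldPyFairUtils` call in the package's Python examples; treat both as illustrative rather than exact signatures:

```r
library(dsld)
data(svcensus)

# Fair ridge regression: cap the unfairness of the fit at 0.05
z <- dsldFrrm(svcensus, 'wageinc', 'gender', unfairness = 0.05,
              definition = 'sp-komiyama')
summary(z)

# Cross-validated grid search over the unfairness knob
dsldFairUtils(svcensus, yName = 'wageinc', sName = 'gender',
              dsldFTNname = 'dsldFrrm',
              unfairness = c(0.01, 0.05, 0.1, 0.2, 0.8))
```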
339 | 340 | **Available Algorithms**: 341 | - **FairML**: dsldFrrm, dsldFgrrm, dsldZlm, dsldNclm, dsldZlrm 342 | - **EDFFair**: dsldQeFairKNN, dsldQeFairRf, dsldQeFairRidgeLin, dsldQeFairRidgeLog 343 | 344 | ## Authors 345 | 346 | - **Norm Matloff** 347 | - **Aditya Mittal** 348 | - **Taha Abdullah** 349 | - **Arjun Ashok** 350 | - **Shubhada Martha** 351 | - **Billy Ouattara** 352 | - **Jonathan Tran** 353 | - **Brandon Zarate** 354 | 355 | For issues, contact **Aditya Mittal** at mittalaa@uci.edu -------------------------------------------------------------------------------- /R/dsldLogit.R: -------------------------------------------------------------------------------- 1 | ### ------------------------ DSLDLogit ----------------------------------------- 2 | ### creates the dsldLogit object 3 | dsldLogit <- function(data, yName, sName, sComparisonPts = NULL, interactions = FALSE, yesYVal) { 4 | 5 | dsldModel <- list() 6 | data[[yName]] <- ifelse(data[[yName]] == yesYVal, 1, 0) 7 | 8 | # user wants interactions # 9 | if (interactions) { 10 | 11 | # generate interactions data if not provided / stop if erroneous 12 | if (is.null(sComparisonPts)) { 13 | sComparisonPts <- dsldGetRow5(data,yName, sName) 14 | } else if (!is.data.frame(sComparisonPts)) { 15 | stop(paste("Error: sComparisonPts must be a dataframe or equivalent")) 16 | } 17 | 18 | # split data into list of dataframes by each level of sName # 19 | dataSplit <- split(data, data[[sName]]) 20 | dataNames <- names(dataSplit) 21 | 22 | # loop and create model for each level in sName # 23 | for (name in dataNames) { 24 | # initialize instance of dsldDiffModel # 25 | dsldDiffModel <- list() 26 | 27 | # get data for each specific S factor & drop sensitive column # 28 | diffData <- dataSplit[[name]] 29 | drop <- c(sName) 30 | diffData <- diffData[, !(names(diffData) %in% drop)] 31 | 32 | # create the model # 33 | diffModel <- glm(formula = as.formula(paste(yName, "~ .")), 34 | family = "binomial", data = diffData) 35 | 36 | # setup individual instance of dsldDiffModel 37 | dsldDiffModel <- c( 38 | dsldDiffModel, 39 | yName, 40 | sName, 41 | list(diffModel), 42 | list(sComparisonPts), 43 | list(summary(diffModel)), 44 | list(coef(diffModel)), 45 | list(diffData), 46 | list(factor_levels(data)) 47 | ) 48 | names(dsldDiffModel) <- c("yName", "sName", "model", "newData", 49 | "summary", "coef", "data", "FactorsInfo") 50 | class(dsldDiffModel) <- "dsldDiffModel" 51 | 52 | # add instance into output list: dsldModel # 53 | dsldModel[[name]] <- dsldDiffModel 54 | } 55 | } else { 56 | # initialize instance of dsldDiffModel # 57 | dsldDiffModel <- list() 58 | 59 | # create model # 60 | diffModel <- glm(formula = as.formula(paste(yName, "~ .")), 61 | family = "binomial", data = data) 62 | 63 | # setup instance of dsldDiffModel # 64 | dsldDiffModel <- c(dsldDiffModel, 65 | yName, 66 | sName, 67 | list(diffModel), 68 | list(summary(diffModel)), 69 | list(coef(diffModel)), 70 | list(data), 71 | list(factor_levels(data)) 72 | ) 73 | names(dsldDiffModel) <- c("yName", "sName", "model", "summary", 74 | "coef", "data", "FactorsInfo") 75 | 76 | # add instance into dsldModel 77 | dsldModel[[sName]] <- dsldDiffModel 78 | } 79 | class(dsldModel) <- "dsldGLM" 80 | return(dsldModel) 81 | } 82 | 83 | # ----------------------- Auxiliary Functions ---------------------------------# 84 | coef.dsldGLM <- function(object,...) 
{ 85 | # merge & return coefficients #
86 | mergedCoef <- lapply(object, function(x) x$coef)
87 | return(mergedCoef)
88 | }
89 |
90 | vcov.dsldGLM <- function(object,...) {
91 | # merge & return covariance matrices #
92 | mergedVcov <- lapply(object, function(x) vcov(x$model))
93 | return(mergedVcov)
94 | }
95 |
96 | dsldGetData <- function(object) {
97 | # merge & return datasets #
98 | mergedData <- lapply(object, function(x) x$data)
99 | return(mergedData)
100 | }
101 |
102 | ### #------------------------- dsldDiffSLog function --------------------------#
103 | dsldDiffSLog <- function(object, sComparisonPts = NULL) {
104 | # naming
105 | dsldGLM <- object
106 |
107 | # get sName and yName from the output of dsldLogit #
108 | sName <- dsldGLM[[1]]$sName
109 | yName <- dsldGLM[[1]]$yName
110 |
111 | # diffS results when interaction == FALSE in dsldLogit #
112 | if (length(dsldGLM) == 1) {
113 | # extract pairwise combination of [dummy level in glm - factor levels]
114 | # from summary output
115 | data <- dsldGetData(dsldGLM)[[1]]
116 | model <- dsldGLM[[1]]$model
117 | C <- vcov(model)
118 | c <- coef(model)
119 |
120 | # get all rows containing sName levels from summary(model) #
121 | rowsWithS <- grep(sName, rownames(coef(summary(model))))
122 | regularS <- summary(model)$coefficients[rowsWithS, ]
123 |
124 | # for the case when we have only two levels in S; ex: male/female #
125 | if (length(levels(data[[sName]])) == 2) {
126 | estimate <- regularS[1]
127 | standardError <- regularS[2]
128 | pVal <- regularS[4]
129 | sPairs <- combn(levels(data[[sName]]), 2)
130 | a <- sPairs[1]
131 | b <- sPairs[2]
132 | indexVal <- sprintf("%s - %s", b, a)
133 | df <- data.frame(indexVal, estimate, standardError, pVal)
134 | names(df) <- c("Factors Compared", "Estimates",
135 | "Standard Errors", "P-Value")
136 | return(df)
137 | }
138 |
139 | # extract estimates and standard errors #
140 | estimates <- regularS[, 1]
141 | standardErrors <- regularS[, 2]
142 | pVal <- regularS[, 4]
143 |
144 | # create dataframe #
145 | df <- data.frame(estimates, standardErrors, pVal)
146 | df$estimates <- -df$estimates
147 |
148 | # extract other pairwise combinations of levels (not including dummy) #
149 | featureNames <- colnames(vcov(model))
150 | combinationMatrix <- combn(featureNames, 2)
151 |
152 | # remove all columns that do not have sName #
153 | matchingCols <- which(apply(combinationMatrix, 2,
154 | function(col) all(grepl(sName, col))))
155 | finalResult <- combinationMatrix[, matchingCols, drop = FALSE]
156 |
157 | # loops through each pair #
158 | for (j in 1:dim(finalResult)[2]) {
159 | # get the j-th pair of the pairwise combinations #
160 | val <- finalResult[, j]
161 | a <- val[1]
162 | b <- val[2]
163 |
164 | # create vector of 0s, the length of the coefficient vector #
165 | vectorLength <- length(c)
166 | rt <- rep(0, vectorLength)
167 |
168 | # put 1 on the first element #
169 | aIndex <- which(names(c) == a)
170 | rt[aIndex] <- 1
171 |
172 | # put -1 on the second element #
173 | bIndex <- which(names(c) == b)
174 | rt[bIndex] <- -1
175 |
176 | aValue <- c[aIndex]
177 | bValue <- c[bIndex]
178 |
179 | # get estimates & standard errors #
180 | estimates <- aValue - bValue
181 | standardErrors <- sqrt((t(rt) %*% C %*% rt))
182 |
183 | tStatistic <- (estimates) / standardErrors
184 | degOfFreedom <- nrow(data) - 1 # degrees of freedom
185 | pVal <- 2 * pt(abs(tStatistic), df = degOfFreedom,
186 | lower.tail = FALSE)
187 |
188 | tempDF <- data.frame(estimates, standardErrors, pVal)
189 | df <- rbind(df, tempDF)
190 | } 191 | 192 | # get names of sName comparisons # 193 | sPairs <- combn(levels(data[[sName]]), 2) 194 | test <- c() 195 | for (i in 1:dim(sPairs)[2]) { 196 | val <- sPairs[,i] 197 | a <- val[1] 198 | b <- val[2] 199 | indexVal <- sprintf("%s - %s", a, b) 200 | test <- c(test, indexVal) 201 | } 202 | 203 | # create final data-frame # 204 | df <- cbind(test, df) 205 | df <- data.frame(df, row.names = NULL) 206 | names(df) <- c("Factors Compared", "Estimates", "Standard Errors", 207 | "P-Value") 208 | return(df) 209 | 210 | } else { 211 | # raise error if the user doesn't input new data # 212 | if (is.null(sComparisonPts)) { 213 | stop("Please enter the sComparisonPts input to compare for interactions") 214 | } 215 | 216 | if (!is.data.frame(sComparisonPts)) { 217 | stop(paste("Error: sComparisonPts must be a dataframe or equivalent")) 218 | } 219 | 220 | if (!is.null(sComparisonPts)) { 221 | sComparisonPts <- apply_factor_levels(sComparisonPts, object[[1]]$FactorsInfo) 222 | } 223 | 224 | # naming 225 | xNew <- sComparisonPts 226 | 227 | # get vector of all levels in sName # 228 | sNames <- names(dsldGLM) 229 | df <- data.frame() 230 | 231 | # loop through each level of S name to compute estimates and standard errors 232 | for (i in sNames) { 233 | data <- dsldGLM[[i]]$data 234 | model <- dsldGLM[[i]]$model 235 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE) 236 | pred <- predictions$fit 237 | se <- predictions$se.fit 238 | level <- row <- prediction <- standardError <- NULL 239 | tempDF <- data.frame(level = i, row = 1:nrow(xNew), 240 | prediction = pred, standardError = se) 241 | df <- rbind(df, tempDF) 242 | } 243 | 244 | # compute difference in estimates between each pair factor level 245 | # for each row 246 | uniqueElements <- sort(unique(df$row)) 247 | pairwiseDF <- data.frame() 248 | 249 | for (i in uniqueElements) { 250 | rowData <- subset(df, row == i) 251 | charVec <- as.character(rowData$level) 252 | combinationMatrix <- combn(charVec, 2) 253 | for (j in 1:dim(combinationMatrix)[2]) { 254 | val <- combinationMatrix[, j] 255 | a <- val[1] 256 | b <- val[2] 257 | aData <- subset(rowData, level == a) 258 | a3 <- aData[3] 259 | bData <- subset(rowData, level == b) 260 | b3 <- bData[3] 261 | indexVal <- sprintf("%s - %s", a, b) 262 | estimatedDiff <- aData$prediction - bData$prediction 263 | standardError <- sqrt(((aData$standardError) ^ 2) + 264 | ((bData$standardError) ^ 2)) 265 | tempDF <- data.frame(indexVal, i, a3,b3, estimatedDiff, 266 | standardError) 267 | names(tempDF) <- c("Factors Compared", "New Data Row", 268 | 'Factor A','Factor B', "Difference in Estimates", "Standard Errors") 269 | pairwiseDF <- rbind(pairwiseDF, tempDF) 270 | } 271 | } 272 | return(pairwiseDF) 273 | } 274 | } 275 | 276 | ## ------------------------------ summary() ------------------------------ 277 | summary.dsldGLM <- function(object,...) 
{ 278 | diffS <- list()
279 | # get sName and yName from the output of dsldLogit #
280 | sName <- object[[1]]$sName
281 | yName <- object[[1]]$yName
282 |
283 | sNames <- names(object)
284 | newData <- object[[1]]$newData
285 |
286 | if (length(object) == 1) {
287 | data <- dsldGetData(object)[[1]]
288 | summary_output <- summary(object[[1]]$model)
289 | coef <- summary_output$coefficients[, 1]
290 | std_err <- summary_output$coefficients[, 2]
291 | pValues <- summary_output$coefficients[, 4]
292 |
293 | # Create dataframe
294 | df <- data.frame(
295 | Covariate = row.names(summary_output$coefficients),
296 | Estimate = coef,
297 | `Standard Error` = std_err,
298 | PValue = pValues,
299 | stringsAsFactors = FALSE,
300 | row.names = NULL
301 | )
302 |
303 | diffS[['Summary Coefficients']] <- df
304 | diffS[['Sensitive Factor Level Comparisons']] <- dsldDiffSLog(object)
305 |
306 | return(diffS)
307 | } else {
308 | # loop through each level of S name to compute estimates and standard errors
309 | for (i in sNames) {
310 | data <- object[[i]]$data
311 | summaryOutput <- summary(object[[i]]$model)
312 | coef <- summaryOutput$coefficients[, 1]
313 | stdErr <- summaryOutput$coefficients[, 2]
314 | pValues <- summaryOutput$coefficients[, 4]
315 |
316 | df <- data.frame(
317 | Covariate = row.names(summaryOutput$coefficients),
318 | Estimate = coef,
319 | `Standard Error` = stdErr,
320 | PValue = pValues,
321 | stringsAsFactors = FALSE,
322 | row.names = NULL
323 | )
324 | diffS[[i]] <- df
325 | }
326 | diffS[['Sensitive Factor Level Comparisons']] <- dsldDiffSLog(object,
327 | newData)
328 | return(diffS)
329 | }
330 | }
331 |
332 | # ---------------------------- add predict() -----------------------------------
333 | predict.dsldGLM <- function(object, xNew,...){
334 | df <- data.frame()
335 | yName = object[[1]]$yName
336 | if (length(object) == 1) {
337 | data <- object[[1]]$data
338 | model <- object[[1]]$model
339 | xNew <- apply_factor_levels(xNew, object[[1]]$FactorsInfo)
340 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE)
341 | pred <- predictions$fit
342 | se <- predictions$se.fit
343 | tempDF <- data.frame(row = 1:nrow(xNew), prediction = pred, standardError = se)
344 | df <- rbind(df, tempDF)
345 | return (df)
346 | } else {
347 | sNames <- names(object)
348 | for (i in sNames) { # loop through each level of S name to compute estimates and standard errors
349 | data <- object[[i]]$data
350 | model <- object[[i]]$model
351 | xNew <- apply_factor_levels(xNew, object[[1]]$FactorsInfo)
352 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE)
353 | pred <- predictions$fit
354 | se <- predictions$se.fit
355 | tempDF <- data.frame(level = i, row = 1:nrow(xNew), prediction = pred, standardError = se)
356 | df <- rbind(df, tempDF)
357 | }
358 | return (df)
359 | }
360 | }
361 |
-------------------------------------------------------------------------------- /inst/examples/machine_learning.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "bd7878bc",
6 | "metadata": {},
7 | "source": [
8 | "#### Examples for machine learning algorithms with dsldPy\n",
9 | "\n",
10 | "The goal is for users to train models with a simple, intuitive interface and also to understand how hyperparameter selection affects the fairness-utility tradeoff.
Examples are shown on training/testing sets with cross-validation approaches.\n",
11 | "\n",
12 | "1) regression examples using dsldPyFairML and dsldPyQeFairML\n",
13 | "2) classification examples using dsldPyFairML and dsldPyQeFairML\n",
14 | "3) k-fold cross-validation to choose the best hyperparameters for the fairness-utility tradeoff"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "id": "da083337",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "## requires R and the dsld (R) package installed\n",
25 | "# !pip install dsldPy"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "id": "ccd05554",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# load libraries\n",
36 | "from dsldPy import (\n",
37 | "# data reading and preprocessing\n",
38 | "preprocess_data, read_data,\n",
39 | "\n",
40 | "# fairML wrappers\n",
41 | "dsldPyFrrm, dsldPyFgrrm, dsldPyNclm, dsldPyZlm, dsldPyZlrm, dsldPyFairML_Summary, dsldPyFairML_Predict,\n",
42 | "\n",
43 | "# qeFairML wrappers\n",
44 | "dsldPyQeFairKNN, dsldPyQeFairRF, dsldPyQeFairRidgeLin, dsldPyQeFairRidgeLog, dsldPyQeFairML_Predict,\n",
45 | "\n",
46 | "dsldPyFairUtils\n",
47 | ")\n",
48 | "\n",
49 | "from sklearn.model_selection import train_test_split\n",
50 | "from sklearn.metrics import mean_absolute_error, accuracy_score\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "3135dff7",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "### regression example --- frrm(), nclm(), zlm(), qeFairKNN(), qeFairRF(), qeFairRidgeLin()\n",
61 | "\n",
62 | "### read and preprocess data\n",
63 | "\n",
64 | "### data preprocessing\n",
65 | "\n",
66 | "### all dsldPy functions require an R data frame object as input (NOT a pandas DataFrame)\n",
67 | "### the preprocessing is done by the function preprocess_data\n",
68 | "### the user needs to manually provide the categorical and numerical features (lists)\n",
69 | "### the function preprocess_data returns an R data.frame object -> required input for the dsldPy functions\n",
70 | "\n",
71 | "# test and train split\n",
72 | "#### REPLACE WITH YOUR PATH TO svcensus.RData\n",
73 | "# df = read_data(\"\") \n",
74 | "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42) # train_test_split returns (train, test)\n",
75 | "test_y = test_df['wageinc']\n",
76 | "test_df = test_df.drop(columns=['wageinc'])\n",
77 | "\n",
78 | "# preprocess data\n",
79 | "cat_features_train = ['educ', 'occ', 'gender']\n",
80 | "num_features_train = ['age', 'wageinc', 'wkswrkd']\n",
81 | "svcensus_train = preprocess_data(train_df, cat_features_train, num_features_train)\n",
82 | "\n",
83 | "cat_features_test = ['educ', 'occ', 'gender']\n",
84 | "num_features_test = ['age', 'wkswrkd']\n",
85 | "svcensus_test = preprocess_data(test_df, cat_features_test, num_features_test)\n"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "a08603d4",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "### using dsldPyFairML() function\n",
96 | "\n",
97 | "### model training --- frrm() \n",
98 | "### unfairness = 0.05 // can also try different values for unfairness\n",
99 | "a = dsldPyFrrm(data=svcensus_train, yName='wageinc', sName='gender',unfairness= 0.05, definition = \"sp-komiyama\", lamda = 0, save = False)\n",
100 | "\n",
101 | "# print train accuracy and correlations\n",
102 | "print(f\"train predictions: {a['train_predictions']}\")\n",
103 | "print(f\"train accuracy: 
{a['train_accuracy']}\")\n", 104 | "print(f\"train correlations: {a['train_correlations']}\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "7502969a", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "### predict() on test data\n", 115 | "a_preds = dsldPyFairML_Predict(a, svcensus_test)\n", 116 | "\n", 117 | "# print test predictions and correlations\n", 118 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n", 119 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 120 | "\n", 121 | "# manuallycompute test accuracy (MAPE)\n", 122 | "test_accuracy = mean_absolute_error(test_y, a_preds['test_predictions'])\n", 123 | "print(f\"test accuracy: {test_accuracy}\")\n", 124 | "\n", 125 | "### the same can be done for other models --- nclm(), zlm() with dsldPyFairML_Predict() method" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "434a6e7b", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "### using dsldPyQeFairML() functions \n", 136 | "\n", 137 | "### model training --- dsldQeFairRF() \n", 138 | "### deweightPars = {'educ': 0.2, 'occ': 0.05} // try different values for proxies\n", 139 | "deweightPars = {'educ': 0.2, 'occ': 0.05}\n", 140 | "\n", 141 | "a = dsldPyQeFairRF(data=svcensus_train, yName='wageinc', sNames='gender', deweightPars=deweightPars)\n", 142 | "\n", 143 | "# print train accuracy and correlations\n", 144 | "print(f\"train predictions: {a['train_predictions']}\")\n", 145 | "print(f\"train accuracy: {a['train_accuracy']}\")\n", 146 | "print(f\"train correlations: {a['train_correlations']}\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "e63aaf39", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "### predict on test data\n", 157 | "a_preds = dsldPyQeFairML_Predict(a, svcensus_test)\n", 158 | "\n", 159 | "# print test predictions and correlations\n", 160 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n", 161 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 162 | "\n", 163 | "# manually compute test accuracy (MAPE)\n", 164 | "test_accuracy = mean_absolute_error(test_y, a_preds['test_predictions'])\n", 165 | "print(f\"test accuracy: {test_accuracy}\")\n", 166 | "\n", 167 | "### the same can be done for other models --- qeFairKNN(), qeFairRidgeLin() with dsldPyQeFairML_Predict() method" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "e12cdade", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "### classification examples --- fgrrm(), zlrm(), qeFairKNN(), qeFairRF(), qeFairRidgeLog()\n", 178 | "\n", 179 | "### read and preprocess data\n", 180 | "\n", 181 | "# test and train split\n", 182 | "#### REPLACE WITH YOUR PATH TO compas1.RData\n", 183 | "# df = read_data(\"\")\n", 184 | "test_df, train_df = train_test_split(df, test_size=0.3, random_state=42)\n", 185 | "test_y = test_df['two_year_recid']\n", 186 | "test_y = test_df['two_year_recid'].map({'Yes': 1, 'No': 0}) # convert to binary\n", 187 | "test_df = test_df.drop(columns=['two_year_recid'])\n", 188 | "\n", 189 | "# preprocess data\n", 190 | "cat_features = ['sex', 'race', 'two_year_recid']\n", 191 | "num_features = [\"age\", 
\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 192 | "compas1_train = preprocess_data(train_df, cat_features_train, num_features_train)\n", 193 | "\n", 194 | "cat_features = ['sex', 'race']\n", 195 | "num_features = [\"age\", \"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 196 | "compas1_test = preprocess_data(test_df, cat_features_test, num_features_test)\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "d2d728fd", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "### using dsldPyFairML() functions \n", 207 | "\n", 208 | "### model training --- fgrrm() \n", 209 | "### unfairness = 0.1 // try different values for unfairness\n", 210 | "a = dsldPyFgrrm(data=compas1_train, yName='two_year_recid', sName='race', unfairness=0.1, definition = \"sp-komiyama\", family = \"binomial\", lamda = 0, save = False, yesYVal = \"Yes\")\n", 211 | "\n", 212 | "# print train accuracy and correlations\n", 213 | "print(f\"train predictions: {a['train_predictions']}\") # returns prob = Yes\n", 214 | "print(f\"train accuracy (misclassification rate): {a['train_accuracy']}\")\n", 215 | "print(f\"train correlations: {a['train_correlations']}\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "226e4af8", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "### predict() on test set\n", 226 | "a_preds = dsldPyFairML_Predict(a, compas1_test)\n", 227 | "\n", 228 | "# print test predictions and correlations\n", 229 | "print(f\"test predictions: {a_preds['test_predictions']}\") # returns prob = Yes\n", 230 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 231 | "\n", 232 | "# manually compute test accuracy (MAPE)\n", 233 | "y_pred = [int(round(x)) for x in a_preds['test_predictions']]\n", 234 | "test_accuracy = accuracy_score(test_y, y_pred)\n", 235 | "misclass_rate = 1 - test_accuracy\n", 236 | "\n", 237 | "# print train accuracy and correlations\n", 238 | "print(f\"test accuracy (misclassification rate): {misclass_rate}\")\n", 239 | "\n", 240 | "### the same can be done for other models --- zlrm() with dsldPyFairML_Predict() method" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "3916ab41", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "### using dsldPyQeFairML() functions \n", 251 | "\n", 252 | "### model training --- dsldQeFairKNN() \n", 253 | "### deweightPars = {'decile_score': 0.2, 'priors_count': 0.5} // try different values for deweightPars\n", 254 | "deweightPars = {'decile_score': 0.2, 'priors_count': 0.5}\n", 255 | "\n", 256 | "a = dsldPyQeFairKNN(data=compas1_train, yName='two_year_recid',sNames= 'race', deweightPars=deweightPars, k = 10, scaleX = True, yesYVal = \"Yes\")\n", 257 | "\n", 258 | "# print train accuracy and correlations\n", 259 | "# in the case of classification, the train_predictions returns both predClasses and prob = Yes\n", 260 | "print(f\"train predictions: {a['train_predictions']}\") \n", 261 | "print(f\"train accuracy: {a['train_accuracy']}\")\n", 262 | "print(f\"train correlations: {a['train_correlations']}\")" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 
null,
268 | "id": "e293d0e3",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "### predict() on test set\n",
273 | "a_preds = dsldPyQeFairML_Predict(a, compas1_test)\n",
274 | "\n",
275 | "# print test predictions and correlations\n",
276 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n",
277 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n",
278 | "\n",
279 | "# compute test misclassification rate\n",
280 | "y_pred = [int(round(x)) for x in list(a_preds['test_predictions'][1])]\n",
281 | "test_accuracy = accuracy_score(test_y, y_pred)\n",
282 | "misclass_rate = 1 - test_accuracy\n",
283 | "\n",
284 | "# print test misclassification rate\n",
285 | "print(f\"test misclassification rate: {misclass_rate}\")\n",
286 | "\n",
287 | "### the same can be done for other models --- dsldQeFairRF(), dsldQeFairRidgeLog() with dsldPyQeFairML_Predict() method"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "398cbe9d",
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "### k-fold cross-validation to find the best model based on fairness and accuracy\n",
298 | "dsldPyFairUtils(data=svcensus_train, yName='wageinc', sName='gender', dsldFTNname = \"dsldFrrm\", unfairness = [0.01, 0.05, 0.1, 0.2, 0.8], k_folds = 10)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "id": "50a1c4de",
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "dsldPyFairUtils(data = svcensus_train, yName = 'wageinc', sName = 'gender', dsldFTNname = \"dsldQeFairKNN\", deweightPars = {'occ': [0.9 ,0.8 ,0.5 ,0.3 ,0.1 ,0.05 ,0.01]}, k_folds = 10)"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "id": "04839f09",
315 | "metadata": {},
316 | "outputs": [],
317 | "source": []
318 | }
319 | ],
320 | "metadata": {
321 | "kernelspec": {
322 | "display_name": "base",
323 | "language": "python",
324 | "name": "python3"
325 | },
326 | "language_info": {
327 | "codemirror_mode": {
328 | "name": "ipython",
329 | "version": 3
330 | },
331 | "file_extension": ".py",
332 | "mimetype": "text/x-python",
333 | "name": "python",
334 | "nbconvert_exporter": "python",
335 | "pygments_lexer": "ipython3",
336 | "version": "3.12.2"
337 | }
338 | },
339 | "nbformat": 4,
340 | "nbformat_minor": 5
341 | }
342 |
--------------------------------------------------------------------------------