├── data
│   ├── lak.RData
│   ├── compas1.RData
│   ├── svcensus.RData
│   └── mortgageSE.RData
├── vignettes
│   ├── z.png
│   ├── wibg.png
│   ├── RpartVert.png
│   ├── smallwage.png
│   ├── freqparcoord.png
│   ├── conditdisparity.png
│   └── Function_List.Rmd
├── .gitignore
├── R
│   ├── dsldBnlearn.R
│   ├── onAttach.R
│   ├── dsldFrequencybyS.R
│   ├── dsldFreqPCoord.R
│   ├── dsldConfounders.R
│   ├── dsldMatching.R
│   ├── dsldML.R
│   ├── dsldHunting.R
│   ├── dsldConditDisparity.R
│   ├── dsldDensitybyS.R
│   ├── dsldTakeALookAround.R
│   ├── dsldScatterPlot3D.R
│   ├── dsldFairML.R
│   ├── Utils.R
│   ├── dsldFairUtils.R
│   └── dsldLogit.R
├── man
│   ├── svcensus.Rd
│   ├── compas1.Rd
│   ├── mortgageSE.Rd
│   ├── Utils.Rd
│   ├── lak.Rd
│   ├── dsldBnlearn.Rd
│   ├── dsldConfounders.Rd
│   ├── dsldDensityByS.Rd
│   ├── dsldFrequencyByS.Rd
│   ├── dsldConditDisparity.Rd
│   ├── dsldPropens.Rd
│   ├── dsldML.Rd
│   ├── dsldTakeALookAround.Rd
│   ├── dsldFairUtils.Rd
│   ├── dsldHunting.Rd
│   ├── dsldLogit.Rd
│   ├── dsldScatterPlot3D.Rd
│   ├── dsldLinear.Rd
│   ├── dsldFreqPCoord.Rd
│   ├── dsldFairML.Rd
│   └── dsldEDFFair.Rd
├── inst
│   ├── src
│   │   └── dsldPy
│   │       ├── dsldPyBnLearn.py
│   │       ├── dsldPyFrequencybyS.py
│   │       ├── dsldPyDensitybyS.py
│   │       ├── dsldPyMatching.py
│   │       ├── LICENSE
│   │       ├── dsldPyHunting.py
│   │       ├── dsldPyConfounders.py
│   │       ├── dsldPyTakeALookAround.py
│   │       ├── dsldPyConditDisparity.py
│   │       ├── dsldPyScatterPlot3D.py
│   │       ├── dsldPyML.py
│   │       ├── dsldPyFairUtils.py
│   │       ├── dsldPyLogit.py
│   │       ├── dsldPyFreqPCoord.py
│   │       ├── dsldPyLinear.py
│   │       ├── Utils.py
│   │       ├── __init__.py
│   │       ├── dsldPyFairML.py
│   │       └── dsldPyQeFairML.py
│   ├── pyproject.toml
│   ├── README.md
│   └── examples
│       ├── graphical.ipynb
│       ├── tabular.ipynb
│       └── machine_learning.ipynb
├── DESCRIPTION
├── NAMESPACE
└── README.md

/data/lak.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/lak.RData
--------------------------------------------------------------------------------
/vignettes/z.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/z.png
--------------------------------------------------------------------------------
/data/compas1.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/compas1.RData
--------------------------------------------------------------------------------
/data/svcensus.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/svcensus.RData
--------------------------------------------------------------------------------
/vignettes/wibg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/wibg.png
--------------------------------------------------------------------------------
/data/mortgageSE.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/data/mortgageSE.RData
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # R session/user files
2 | .Rhistory
3 | .RData
4 |
5 | # macOS metadata
6 | .DS_Store
--------------------------------------------------------------------------------
/vignettes/RpartVert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/RpartVert.png
--------------------------------------------------------------------------------
/vignettes/smallwage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/smallwage.png
--------------------------------------------------------------------------------
/vignettes/freqparcoord.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/freqparcoord.png
--------------------------------------------------------------------------------
/vignettes/conditdisparity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matloff/dsld/HEAD/vignettes/conditdisparity.png
--------------------------------------------------------------------------------
/R/dsldBnlearn.R:
--------------------------------------------------------------------------------
1 |
2 | # iamb method of causal discovery
3 |
4 | dsldIamb <- function(data)
5 | {
6 |
7 |    getSuggestedLib('bnlearn')
8 |    return(bnlearn::iamb(data))
9 |
10 | }
11 |
--------------------------------------------------------------------------------
/man/svcensus.Rd:
--------------------------------------------------------------------------------
1 | \name{svcensus}
2 | \alias{svcensus}
3 |
4 | \title{
5 | Silicon Valley programmers and engineers data
6 | }
7 |
8 | \description{
9 |
10 | Via qeML: This data set is adapted from the 2000 Census,
11 | restricted to programmers and engineers in the Silicon Valley area.
12 | }
--------------------------------------------------------------------------------
/man/compas1.Rd:
--------------------------------------------------------------------------------
1 | \name{compas1}
2 | \alias{compas1}
3 |
4 | \title{
5 | Criminal Offenders Screened in Florida
6 | }
7 |
8 | \description{
9 |
10 | A collection of criminal offenders screened in Florida (US) during
11 | 2013-14. This data was used to predict recidivism.
12 |
13 | Additional details for this dataset can be found via the \pkg{fairml} package.
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/R/onAttach.R:
--------------------------------------------------------------------------------
1 |
2 | .onAttach <- function(libname, pkgname) {
3 |    packageStartupMessage(
4 |       '\n\n\n\n\n*********************\n\n\n\n   Navigating dsld:\n
5 |       Type vignette("Quick_Start",package="dsld") for a quick overview!\n
6 |       Type vignette("Function_List",package="dsld") for a categorized function list\n
7 |       Latest version at https://github.com/matloff/dsld')
8 | }
9 |
--------------------------------------------------------------------------------
/man/mortgageSE.Rd:
--------------------------------------------------------------------------------
1 | \name{mortgageSE}
2 | \alias{mortgageSE}
3 |
4 | \title{
5 | Mortgage Denial
6 | }
7 |
8 | \description{
9 |
10 | The dataset provides applicant information (including race, income, loan
11 | information, etc.). The response variable indicates whether or not the
12 | applicant was approved for the loan. Additional details can be found in
13 | the \code{SortedEffects} package.
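The dataset can be loaded with \code{data(mortgageSE)}.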
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/man/Utils.Rd:
--------------------------------------------------------------------------------
1 | \name{utilities}
2 | \alias{getSuggestedLib}
3 |
4 | \title{
5 | Utilities
6 | }
7 |
8 | \usage{
9 | getSuggestedLib(pkgName)
10 | }
11 |
12 | \arguments{
13 |     \item{pkgName}{Name of the package to be checked/loaded.}
14 | }
15 | \description{
16 |
17 | Attempts to load the specified package, halting execution upon failure.
18 |
19 | }
20 |
21 | \value{
22 | No value, just side effects.
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/man/lak.Rd:
--------------------------------------------------------------------------------
1 | \name{lak}
2 | \alias{lak}
3 |
4 | \title{
5 | Labor Market Discrimination
6 | }
7 |
8 | \description{
9 | Fictional CVs sent to real employers to investigate discrimination via
10 | given names. See Bertrand and Mullainathan (2004).
11 | }
12 |
13 | \references{
14 | \itemize{
15 |    \item Bertrand, M. and Mullainathan, S. (2004). Are Emily and Greg More
16 |    Employable Than Lakisha and Jamal? A Field Experiment on
17 |    Labor Market Discrimination. American Economic Review, 94:991-1013.
18 | }
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyBnLearn.py:
--------------------------------------------------------------------------------
1 | from rpy2 import robjects
2 | from rpy2.robjects.packages import importr
3 | from IPython.display import Image, display
4 | from .Utils import get_dsld
5 |
6 | def dsldPyIamb(data, file="iamb.png", width=1200, height=900, res=150):
7 |     dsld = get_dsld()
8 |     a = dsld.dsldIamb(data)
9 |     grdevices = importr("grDevices")
10 |     grdevices.png(file=file, width=width, height=height, res=res)
11 |     robjects.r["plot"](a)
12 |     grdevices.dev_off()
13 |     display(Image(filename=file))
14 |     return file
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFrequencybyS.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 |
8 | def dsldPyFrequencybyS(data, cName, sName):
9 |
10 |     r_data = dsld_Rpy2_IsRDataframe(data)
11 |     cName_r = StrVector([cName])
12 |     sName_r = StrVector([sName])
13 |
14 |     dsld = get_dsld()
15 |     res = dsld.dsldFrequencyByS(r_data, cName_r, sName_r)
16 |     return dsld_Rpy2_RDataframeToPandas(res)
17 |
--------------------------------------------------------------------------------
/man/dsldBnlearn.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldBnlearn}
2 | \alias{dsldIamb}
3 |
4 | \title{dsldBnlearn}
5 |
6 | \description{
7 | Wrappers for functions in the \pkg{bnlearn} package. (Presently, just \code{iamb}.)
8 | }
9 |
10 | \usage{
11 | dsldIamb(data)
12 | }
13 |
14 | \arguments{
15 |     \item{data}{
16 |         Data frame.
17 |     }
18 | }
19 |
20 | \details{
21 |
22 | Under very stringent assumptions, \code{dsldIamb} performs causal
23 | discovery, i.e. fits a causal model to \code{data}.
24 |
25 | }
26 |
27 | \value{
28 | Object of class 'bn' (\pkg{bnlearn} object). The generic \code{plot}
29 | function is callable on this object.
30 | }
31 |
32 | \author{
33 | N. Matloff
34 | }
35 |
36 | \examples{
37 | \donttest{
38 | data(svcensus)
39 | # iamb does not accept integer data
40 | svcensus$wkswrkd <- as.numeric(svcensus$wkswrkd)
41 | svcensus$wageinc <- as.numeric(svcensus$wageinc)
42 | iambOut <- dsldIamb(svcensus)
43 | plot(iambOut)
44 | }
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/inst/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=77", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "dsldPy"
7 | version = "0.0.3"
8 | description = "Python wrappers around the R 'dsld' package via rpy2"
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | authors = [
12 |   {name = "A. Mittal, T. Abdullah, A. Ashok, B. Zarate Estrada, S. Martha, B. Ouattara, J. Tran, and N. Matloff"}
13 | ]
14 | license = "MIT"
15 | keywords = ["dsld", "rpy2", "fair machine learning", "statistics", "confounder analysis"]
16 | classifiers = [
17 |   "Programming Language :: Python :: 3",
18 |   "Operating System :: OS Independent",
19 |   "Topic :: Scientific/Engineering",
20 | ]
21 | dependencies = [
22 |   "pandas>=1.1",
23 |   "numpy>=1.20",
24 |   "Pillow>=8",
25 |   "pyreadr>=0.4",
26 |   "rpy2>=3.5",
27 |   "plotly>=5",
28 |   "ipython>=7",
29 |   "nbformat",
30 |   "scikit-learn",
31 | ]
32 |
33 | [tool.setuptools]
34 | package-dir = {"" = "src"}
35 |
36 | [tool.setuptools.packages.find]
37 | where = ["src"]
--------------------------------------------------------------------------------
/man/dsldConfounders.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldConfounders}
2 | \alias{dsldConfounders}
3 | \title{dsldConfounders}
4 |
5 | \description{
6 | Plots estimated densities of all continuous features X, conditioned on a
7 | specified categorical feature C.
8 | }
9 |
10 | \usage{
11 | dsldConfounders(data, sName, graphType = "plotly", fill = FALSE)
12 | }
13 |
14 | \arguments{
15 |     \item{data}{Dataframe, at least 2 columns.}
16 |     \item{sName}{
17 |         Name of the categorical column, an R factor. In discrimination
18 |         contexts, typically a sensitive variable.
19 |     }
20 |     \item{graphType}{
21 |         Either "plot" or "plotly", for static or interactive graphs.
22 |         The latter requires the \pkg{plotly} package.
23 |     }
24 |     \item{fill}{
25 |         Only applicable to the graphType = "plot" case. Setting to TRUE
26 |         will color each line down to the x-axis.
27 |     }
28 | }
29 |
30 | \author{
31 | N. Matloff, T. Abdullah, A. Ashok, J. Tran
32 | }
33 |
34 | \value{No value; plot.}
35 |
36 | \examples{
37 | \donttest{
38 | data(svcensus)
39 | dsldConfounders(svcensus, "educ")
40 | }
41 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyDensitybyS.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
6 | from rpy2.robjects.packages import importr
7 |
8 | def dsldPyDensitybyS(data, cName, sName, graphType = "plotly", fill = False):
9 |
10 |     r_data = dsld_Rpy2_IsRDataframe(data)
11 |     cName_r = StrVector([cName])
12 |     sName_r = StrVector([sName])
13 |     graphType_r = StrVector([graphType])
14 |     fill_r = BoolVector([fill])
15 |
16 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
17 |     grdevices = importr("grDevices")
18 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
19 |     try:
20 |         dsld = get_dsld()
21 |         res = dsld.dsldDensityByS(r_data, cName_r, sName_r, graphType_r, fill_r)
22 |         try: ro.r("print")(res)
23 |         except: pass
24 |     finally:
25 |         grdevices.dev_off()
26 |
27 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
28 |     return
29 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyMatching.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 | import rpy2.robjects as robjects
8 |
9 | def dsldPyMatchedATE(data, yName, sName, yesSVal, yesYVal=None, propensFtn=None, k=None):
10 |
11 |     r_data = dsld_Rpy2_IsRDataframe(data)
12 |     yName_r = robjects.StrVector([yName])
13 |     sName_r = robjects.StrVector([sName])
14 |     yesSVal_r = robjects.StrVector([yesSVal])
15 |
16 |     yesYVal_r = robjects.StrVector([yesYVal]) if yesYVal is not None else robjects.NULL
17 |     propensFtn_r = robjects.StrVector([propensFtn]) if propensFtn is not None else robjects.NULL
18 |     k_r = robjects.IntVector([k]) if k is not None else robjects.NULL
19 |
20 |     dsld = get_dsld()
21 |     res = dsld.dsldMatchedATE(r_data, yName_r, sName_r, yesSVal_r, yesYVal_r, propensFtn_r, k_r)
22 |
23 |     ro.r("summary")(res)
24 |
25 |     return
26 |
27 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 The Python Packaging Authority
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/man/dsldDensityByS.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldDensityByS}
2 | \alias{dsldDensityByS}
3 | \title{dsldDensityByS}
4 |
5 | \description{
6 | Graphs densities of a response variable, grouped by a sensitive variable.
7 | Similar to \code{dsldConfounders}, but includes sliders to control the
8 | bandwidth of the density estimate (analogous to controlling the bin
9 | width in a histogram).
10 | }
11 |
12 | \usage{
13 | dsldDensityByS(data, cName, sName, graphType = "plotly", fill = FALSE)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{
18 |         Dataset with at least 1 numerical column and 1 factor column.
19 |     }
20 |     \item{cName}{
21 |         Name of the possible confounding variable column, an R numeric.
22 |     }
23 |     \item{sName}{
24 |         Name of the sensitive variable column, an R factor.
25 |     }
26 |     \item{graphType}{
27 |         Type of graph created. Defaults to "plotly".
28 |     }
29 |     \item{fill}{
30 |         Whether to fill the area under each density curve. Defaults to FALSE.
31 |     }
32 | }
33 |
34 | \author{
35 | N. Matloff, T. Abdullah, A. Ashok, J. Tran
36 | }
37 |
38 | \value{No value; plot.}
39 |
40 | \examples{
41 | data(svcensus)
42 | dsldDensityByS(svcensus, cName = "wageinc", sName = "educ")
43 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyHunting.py:
--------------------------------------------------------------------------------
1 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
2 | import sys
3 | import os
4 | import pandas as pd
5 | from PIL import Image
6 | import rpy2.robjects as robjects
7 | from rpy2.robjects.packages import importr
8 | from rpy2.robjects import r
9 | import rpy2.robjects as ro
10 |
11 | ### dsldPyCHunting
12 | def dsldPyCHunting(data,yName,sName,intersectDepth=10):
13 |     r_data = dsld_Rpy2_IsRDataframe(data)
14 |     yName_r = robjects.StrVector([yName])
15 |     sName_r = robjects.StrVector([sName])
16 |     intersectDepth_r = robjects.IntVector([intersectDepth])
17 |
18 |     dsld = get_dsld()
19 |     res = dsld.dsldCHunting(r_data, yName_r, sName_r, intersectDepth_r)
20 |     result = {'impForY' : list(zip(list(res[0].names), list(res[0]))), 'impForS' : list(zip(list(res[1].names), list(res[1])))}
21 |     return result
22 |
23 | ### dsldPyOHunting
24 | def dsldPyOHunting(data,yName,sName):
25 |     r_data = dsld_Rpy2_IsRDataframe(data)
26 |     yName_r = robjects.StrVector([yName])
27 |     sName_r = robjects.StrVector([sName])
28 |     dsld = get_dsld()
29 |     res = dsld.dsldOHunting(r_data, yName_r, sName_r)
30 |
31 |     # print in R
32 |     ro.r("print")(res)
33 |     return res
--------------------------------------------------------------------------------
/vignettes/Function_List.Rmd:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "Categorized Function List"
4 | output:
5 |    rmarkdown::html_vignette
6 | vignette: >
7 |   %\VignetteIndexEntry{Function List}
8 |   %\VignetteEngine{knitr::rmarkdown}
9 |   \usepackage[utf8]{inputenc}
10 | ---
11 |
12 | ```{r, include = FALSE}
13 | knitr::opts_chunk$set(
14 |   collapse = TRUE,
15 |   comment = "#>"
16 | )
17 | ```
18 |
19 |
20 | # Categorized Function List
21 |
22 |
23 |
24 |
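As a quick orientation before the list, a minimal sketch of one function from
several of the categories below (assuming the package's bundled `svcensus`
data; all calls follow the signatures documented in the man pages):

```r
library(dsld)
data(svcensus)

# tabular confounder exploration: occupation distribution within each gender
dsldFrequencyByS(svcensus, cName = "occ", sName = "gender")

# graphical confounder exploration: wage density by gender
dsldDensityByS(svcensus, cName = "wageinc", sName = "gender")

# statistical analysis of group differences: gender effect on wage income
summary(dsldLinear(svcensus, yName = "wageinc", sName = "gender"))
```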
25 |
26 | *Tabular confounder exploration*
27 |
28 | * dsldCHunting
29 |
30 | * dsldDiffSLin
31 |
32 | * dsldDiffSLog
33 |
34 | * dsldTakeALookAround
35 |
36 | *Graphical confounder exploration*
37 |
38 | * dsldConditDisparity
39 |
40 | * dsldConfounders
41 |
42 | * dsldDensityByS
43 |
44 | * dsldScatterPlot3D
45 |
46 | *Statistical analysis for group differences*
47 |
48 | * dsldLinear
49 |
50 | * dsldLogit
51 |
52 | * dsldML
53 |
54 | *Causal analysis*
55 |
56 | * dsldIamb
57 |
58 | * dsldMatchedATE
59 |
60 | *Fairness in prediction*
61 |
62 | * dsldFgrrm
63 |
64 | * dsldFrrm
65 |
66 | * dsldNclm
67 |
68 | * dsldOHunting
69 |
70 | * dsldQeFairRF
71 |
72 | * dsldQeFairRidgeLin
73 |
74 | * dsldQeFairRidgeLog
75 |
76 | * dsldZlrm
77 |
78 | * dsldZlm
79 |
80 | * dsldTakeALookAround
81 |
82 |
83 |
--------------------------------------------------------------------------------
/R/dsldFrequencybyS.R:
--------------------------------------------------------------------------------
1 | dsldFrequencyByS <- function(data, cName, sName) {
2 |     # type validation #
3 |     # we're essentially just checking the value-type for the key columns
4 |     if (!class(data[, sName]) %in% c("factor", "character")) {
5 |         stop(paste(
6 |             "sName should be of factor or character data type."
7 |         ))
8 |     }
9 |     # helpful error message if the specified confounder column isn't factor
10 |     if (!class(data[, cName]) %in% c("factor", "character")) {
11 |         stop(paste(
12 |             "cName should be of factor or character data type. Consider",
13 |             " calling `dsldDensityByS(data, cName = ",
14 |             cName,
15 |             ", sName = ",
16 |             sName,
17 |             ")` instead",
18 |             sep = ""
19 |         ))
20 |     }
21 |
22 |     # sensitive variable frequencies #
23 |     # unique levels to ensure order
24 |     yGroups <- unique(data[[cName]])
25 |
26 |     # get a lookup for every s level against every ylevel
27 |     freqLookup <- table(data[[sName]], data[[cName]])
28 |
29 |     # convert counts to proportions
30 |     freqLookup <- freqLookup / rowSums(freqLookup)
31 |
32 |     # convert to dataframe
33 |     frequencies <- as.data.frame.matrix(freqLookup)
34 |     names(frequencies) <- c(
35 |         paste0("Frequency of ", yGroups)
36 |     )
37 |
38 |     return(frequencies)
39 | }
40 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyConfounders.py:
--------------------------------------------------------------------------------
1 |
2 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
3 | import sys
4 | import os
5 | import pandas as pd
6 | from PIL import Image
7 | import rpy2.robjects as robjects
8 | from rpy2.robjects.packages import importr
9 | from rpy2.robjects import r
10 | import os, tempfile
11 | import rpy2.robjects as ro
12 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
13 | from IPython.display import Image, display
14 | from .Utils import dsld_Rpy2_IsRDataframe
15 | from rpy2.robjects.packages import importr
16 |
17 | def dsldPyConfounders(data,sName, graphType = "plotly",fill=False):
18 |     r_data = dsld_Rpy2_IsRDataframe(data)
19 |     sName_r = robjects.StrVector([sName])
20 |     graphType_r = robjects.StrVector([graphType])
21 |     fill_r = robjects.BoolVector([fill])
22 |
23 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
24 |     grdevices = importr('grDevices')
25 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
26 |     try:
27 |         dsld = get_dsld()
28 |         res = dsld.dsldConfounders(r_data, sName_r, graphType_r, fill_r)
29 |         try: ro.r("print")(res)
30 |         except: pass
31 |     finally:
32 |         grdevices.dev_off()
33 |
34 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
35 |     return
36 |
37 |
--------------------------------------------------------------------------------
/R/dsldFreqPCoord.R:
--------------------------------------------------------------------------------
1 |
2 | dsldFreqPCoord <- function(data, m, sName = NULL,
3 |                            method = "maxdens", faceting = "vert", k = 50,
4 |                            klm = 5 * k, keepidxs = NULL, plotidxs = FALSE,
5 |                            cls = NULL, plot_filename = NULL) {
6 |
7 |     getSuggestedLib("freqparcoord")
8 |     getSuggestedLib("ggplot2")
9 |
10 |     if (!is.null(sName)) {
11 |         s <- data[[sName]]
12 |         scol <- which(names(data) == sName)
13 |         dms <- data[,-scol]
14 |         dms <- factorsToDummies(dms)
15 |         dms <- as.data.frame(dms)
16 |         data <- cbind(dms,s)
17 |         data <- as.data.frame(data)
18 |         scol <- ncol(data)
19 |         colnames(data)[scol] <- sName
20 |         columns <- 1:(scol-1)
21 |     } else {
22 |         data <- factorsToDummies(data)
23 |         columns <- 1:ncol(data)
24 |     }
25 |
26 |     fpcOut <- freqparcoord::freqparcoord(
27 |         data,
28 |         m,
29 |         dispcols = columns,
30 |         grpvar = sName,
31 |         method = method,
32 |         faceting = faceting,
33 |         k = k,
34 |         klm = klm,
35 |         keepidxs = keepidxs,
36 |         plotidxs = plotidxs,
37 |         cls = cls
38 |     )
39 |
40 |     if (!is.null(plot_filename)) {
41 |         ggplot2::ggsave(plot_filename, fpcOut) # Save as img
42 |     }
43 |
44 |     return(fpcOut)
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/R/dsldConfounders.R:
--------------------------------------------------------------------------------
1 | dsldConfounders <- function(data, sName, graphType = "plotly", fill = FALSE) {
2 |     # Error checking
3 |     if (is.null(sName)) {
4 |         stop(paste("sName must be provided as a quoted column name"))
5 |     }
6 |
7 |     # dispatch to appropriate auxiliary method
8 |     numCols <- ncol(data)
9 |     for (i in 1:numCols) {
10 |         # skip sName
11 |         if (colnames(data)[i] == sName) {
12 |             next
13 |         }
14 |
15 |         # if categorical
16 |         if (is.factor(data[, i])) {
17 |             print(dsldFrequencyByS(data, colnames(data)[i], sName))
18 |
19 |             # require input if there's a next
20 |             if (i != numCols) {
21 |                 cat("Press Enter to view next density graph / frequency dataframe...\n")
22 |                 tempInput <- readline()
23 |             }
24 |         # if numeric
25 |         } else if (is.numeric(data[, i])) {
26 |             print(dsldDensityByS(data, colnames(data)[i], sName, graphType, fill))
27 |
28 |             # require input if there's a next
29 |             if (i != numCols) {
30 |                 cat("Press Enter to view next density graph / frequency dataframe...\n")
31 |                 tempInput <- readline()
32 |             }
33 |         # throw error
34 |         } else {
35 |             stop(paste("Neither categorical nor numeric column; check the data frame"))
36 |         }
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyTakeALookAround.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldTakeALookAround from the dsld R package.
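A minimal usage sketch (assuming a pandas DataFrame df shaped like the
bundled svcensus data; maxFeatureSize and holdout are optional):
    res = dsldPyTakeALookAround(df, 'wageinc', 'gender', maxFeatureSize=4)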
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
7 | import sys
8 | import os
9 | import pandas as pd
10 | from PIL import Image
11 | import rpy2.robjects as robjects
12 | from rpy2.robjects.packages import importr
13 | from rpy2.robjects import r
14 | import math
15 |
16 | def dsldPyTakeALookAround(data, yName, sName, maxFeatureSize = None, holdout = None):
17 |     r_data = dsld_Rpy2_IsRDataframe(data)
18 |     yName_r = robjects.StrVector([yName])
19 |     sName_r = robjects.StrVector([sName])
20 |
21 |     if maxFeatureSize is not None:
22 |         maxFeatureSize_r = robjects.IntVector([maxFeatureSize])
23 |     else:
24 |         maxFeatureSize_r = robjects.IntVector([dsld_Rpy2_RDataframeToPandas(data).shape[1] - 2])
25 |
26 |     if holdout is not None:
27 |         holdout_r = robjects.IntVector([holdout])
28 |     else:
29 |         holdout_r = robjects.IntVector([math.floor(min(1000, 0.1 * dsld_Rpy2_RDataframeToPandas(data).shape[0]))])
30 |
31 |     dsld = get_dsld()
32 |     res = dsld.dsldTakeALookAround(r_data, yName_r, sName_r, maxFeatureSize_r, holdout_r)
33 |     return dsld_Rpy2_RDataframeToPandas(res)
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: dsld
2 | Version: 1.0.0
3 | Title: Data Science Looks at Discrimination
4 | Authors@R: c( person("Norm", "Matloff", email = "nsmatloff@ucdavis.edu",
5 |     role = c("aut"), comment = c(ORCID = "0000-0001-9179-6785")),
6 |     person("Taha", "Abdullah", email = "tmabdullah@ucdavis.edu",
7 |     role = c("aut")),
8 |     person("Arjun", "Ashok", email = "arjashok@ucdavis.edu",
9 |     role = c("aut")),
10 |     person("Shubhada", "Martha", email = "smartha@ucdavis.edu",
11 |     role = c("aut")),
12 |     person("Aditya", "Mittal", email = "adityamittal2031@gmail.com",
13 |     role = c("aut", "cre")),
14 |     person("Billy", "Ouattara", email = "btouattara@ucdavis.edu",
15 |     role = c("aut")),
16 |     person("Jonathan", "Tran", email = "jsttran@ucdavis.edu",
17 |     role = c("aut")),
18 |     person("Brandon", "Zarate Estrada", email = "bdzarate@ucdavis.edu",
19 |     role = c("aut"))
20 |     )
21 | Maintainer: Aditya Mittal <adityamittal2031@gmail.com>
22 | VignetteBuilder: knitr
23 | Imports: Kendall, ranger, ggplot2, plotly,
24 |     freqparcoord, fairness, sandwich
25 | Depends: R (>= 3.5.0), fairml, gtools, regtools, qeML, rmarkdown
26 | Suggests: knitr, bnlearn, Matching, randomForest
27 | License: GPL (>= 2)
28 | Description: Statistical and graphical tools for detecting and measuring
29 |     discrimination and bias, be it racial, gender, age or other.
30 |     Detection and remediation of bias in machine learning algorithms.
31 |     'Python' interfaces available.
32 | URL: https://github.com/matloff/dsld
33 | BugReports: https://github.com/matloff/dsld/issues
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | import('qeML')
2 | import('regtools')
3 | import('gtools')
4 | import('ranger')
5 | import('fairml')
6 | import('fairness')
7 | import('Kendall')
8 | import('freqparcoord')
9 | import('ggplot2')
10 | import('rmarkdown')
11 |
12 | importFrom("grDevices", "dev.copy", "dev.cur", "dev.off", "dev.set",
13 |     "jpeg", "pdf", "png", "rainbow")
14 | importFrom("graphics", "legend", "lines", "par", "polygon", "title")
15 | importFrom("stats", "as.formula", "binomial", "coef", "cor", "cov",
16 |     "density", "glm", "lm", "loess", "na.exclude", "pnorm",
17 |     "predict", "pt", "setNames", "vcov")
18 | importFrom("utils", "combn")
19 | importFrom("plotly", "plot_ly", "add_lines", "layout")
20 | importFrom("sandwich", "sandwich")
21 | importFrom("stats", "model.matrix")
22 |
23 | export(
24 |     getSuggestedLib,
25 |     dsldFreqPCoord,
26 |     dsldTakeALookAround, dsldConditDisparity,
27 |     dsldScatterPlot3D,
28 |     dsldLinear, dsldLogit,
29 |     dsldConfounders, dsldDensityByS, dsldFrequencyByS,
30 |     dsldFgrrm, dsldFrrm, dsldNclm, dsldZlm, dsldZlrm,
31 |     dsldQeFairRidgeLin, dsldQeFairRF, dsldQeFairRidgeLog, dsldQeFairKNN,
32 |     dsldML, dsldCHunting, dsldOHunting, dsldIamb, dsldMatchedATE,
33 |     dsldFairUtils
34 | )
35 |
36 | S3method(summary, dsldLM)
37 | S3method(coef, dsldLM)
38 | S3method(vcov, dsldLM)
39 | S3method(predict, dsldLM)
40 |
41 | S3method(summary, dsldGLM)
42 | S3method(coef, dsldGLM)
43 | S3method(vcov, dsldGLM)
44 | S3method(predict, dsldGLM)
45 |
46 | S3method(predict, dsldFairML)
47 | S3method(summary, dsldFairML)
48 |
49 | S3method(predict, dsldQeFair)
--------------------------------------------------------------------------------
/man/dsldFrequencyByS.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFrequencyByS}
2 | \alias{dsldFrequencyByS}
3 | \title{dsldFrequencyByS}
4 |
5 | \description{
6 |
7 | Informal assessment of C as a possible confounder in a relationship between a
8 | sensitive variable S and a variable Y.
9 |
10 | }
11 |
12 | \usage{
13 | dsldFrequencyByS(data, cName, sName)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{
18 |         Data frame or equivalent.
19 |     }
20 |     \item{cName}{
21 |         Name of the "C" column, an R factor.
22 |     }
23 |     \item{sName}{
24 |         Name of the sensitive variable column, an R factor.
25 |     }
26 | }
27 |
28 | \details{
29 |
30 | Essentially an informal assessment of the relation between S and C.
31 |
32 | Consider the \code{svcensus} dataset. If for instance we are studying
33 | the effect of gender S on wage income Y, say C is occupation. If
34 | different genders have different occupation patterns, then C is a
35 | potential confounder. (Y does not explicitly appear here.)
36 |
37 | }
38 |
39 |
40 | \value{Data frame, one row for each level of the sensitive variable S, and
41 | one column for each level of the confounder C. Each row sums to 1.0.}
42 |
43 | \examples{
44 | data(svcensus)
45 | dsldFrequencyByS(svcensus, cName = "educ", sName = "gender")
46 | # not much difference in education between genders
47 | dsldFrequencyByS(svcensus, cName = "occ", sName = "gender")
48 | # substantial difference in occupation between genders
49 | data(lsa)
50 | lsa$faminc <- as.factor(lsa$fam_inc)
51 | dsldFrequencyByS(lsa,'faminc','race1')
52 | # distribution of family income by race
53 | }
54 |
55 | \author{
56 | N. Matloff, T. Abdullah, A. Ashok, J. Tran, A. Mittal
57 | }
58 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyConditDisparity.py:
--------------------------------------------------------------------------------
1 | import os, tempfile
2 | import rpy2.robjects as ro
3 | from rpy2.robjects.vectors import StrVector, IntVector, BoolVector
4 | from IPython.display import Image, display
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe
6 | from rpy2.robjects.packages import importr
7 |
8 |
9 |
10 | def _maybe_strvec(x):
11 |     if x is None: return ro.NULL
12 |     return StrVector(list(x) if isinstance(x, (list, tuple)) else [str(x)])
13 |
14 | def dsldPyConditDisparity(data, yName, sName, xName, condits=None, qeFtn="qeKNN", minS=50, useLoess=True):
15 |     r_data = dsld_Rpy2_IsRDataframe(data)
16 |     yName_r = StrVector([yName])
17 |     sName_r = StrVector([sName])
18 |     xName_r = StrVector([xName])
19 |     condits_r = _maybe_strvec(condits)
20 |
21 |     qeML = importr("qeML")
22 |     if hasattr(qeML, qeFtn) and callable(getattr(qeML, qeFtn)):
23 |         qeFtn_r = getattr(qeML, qeFtn)
24 |     else:
25 |         print(f"ERROR: qeML does not have a function named '{qeFtn}'"); return
26 |
27 |     minS_r = IntVector([int(minS)])
28 |     useLoess_r = BoolVector([bool(useLoess)])
29 |
30 |     fd, tmpfile = tempfile.mkstemp(suffix=".png"); os.close(fd)
31 |     grdevices = importr("grDevices")
32 |     grdevices.png(file=tmpfile, width=1200, height=800, res=150)
33 |     try:
34 |         dsld = get_dsld()
35 |         res = dsld.dsldConditDisparity(r_data, yName_r, sName_r, xName_r, condits_r, qeFtn_r, minS_r, useLoess_r)
36 |         try: ro.r("print")(res)
37 |         except: pass
38 |     finally:
39 |         grdevices.dev_off()
40 |
41 |     if os.path.exists(tmpfile): display(Image(filename=tmpfile))
42 |     return
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyScatterPlot3D.py:
--------------------------------------------------------------------------------
1 | # OVERVIEW:
2 | # No need to access dsldScatterPlot3D from dsld-package.
3 | # The function uses the package plotly in R, which is also available in Python.
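# A minimal usage sketch (assuming a pandas DataFrame df with three numeric
# columns and a factor column, as in the bundled svcensus data):
#   fig = dsldPyScatterPlot3D(df, ["age", "wageinc", "wkswrkd"], "gender")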
4 | # This file requires packages: pandas, plotly, pyreadr
5 | #
6 |
7 | import pandas as pd
8 | import plotly.express as px
9 | from rpy2.robjects import pandas2ri
10 | from rpy2.robjects import vectors as rvectors
11 | from rpy2.robjects import conversion, default_converter
12 |
13 | def dsldPyScatterPlot3D(data, yNames, sName, *, bin_numeric_color=True, n_bins=5, renderer=None):
14 |     if not isinstance(yNames, (list, tuple)) or len(yNames) != 3:
15 |         raise ValueError("yNames must be a list of exactly 3 column names [x, y, z].")
16 |
17 |     # Accept either pandas DataFrame or R data.frame; convert R -> pandas
18 |     if isinstance(data, pd.DataFrame):
19 |         df = data.copy()
20 |     elif isinstance(data, rvectors.DataFrame):
21 |         with conversion.localconverter(default_converter + pandas2ri.converter):
22 |             df = conversion.rpy2py(data)
23 |     else:
24 |         raise TypeError("data must be a pandas DataFrame or an R data.frame")
25 |     color_col = sName
26 |
27 |     # If sName is numeric with many unique values, bin for discrete legend
28 |     if bin_numeric_color and pd.api.types.is_numeric_dtype(df[sName]) and df[sName].nunique() > 20:
29 |         color_col = f"{sName}_bin"
30 |         df[color_col] = pd.qcut(df[sName], q=n_bins, duplicates="drop")
31 |
32 |     fig = px.scatter_3d(
33 |         df,
34 |         x=yNames[0],
35 |         y=yNames[1],
36 |         z=yNames[2],
37 |         color=color_col,
38 |         opacity=0.75
39 |     )
40 |
41 |     fig.update_traces(marker=dict(size=4))
42 |     fig.update_layout(legend_title_text=sName)
43 |
44 |     if renderer:
45 |         fig.show(renderer=renderer)
46 |     else:
47 |         fig.show()
48 |
49 |     return fig
--------------------------------------------------------------------------------
/man/dsldConditDisparity.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldConditDisparity}
2 | \alias{dsldConditDisparity}
3 | \title{dsldConditDisparity}
4 |
5 | \description{
6 | Plots (estimated) mean Y against X, separately for each level of S,
7 | with restrictions \code{condits}. May reveal Simpson's Paradox-like
8 | differences not seen in merely plotting mean Y against X.
9 | }
10 |
11 | \usage{
12 | dsldConditDisparity(data, yName, sName, xName, condits = NULL,
13 |     qeFtn = qeKNN, minS = 50, useLoess = TRUE)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{Data frame or equivalent.}
18 |     \item{yName}{Name of predicted variable Y. Must be numeric
19 |         or a dichotomous R factor.}
20 |     \item{sName}{Name of the sensitive variable S, an R factor.}
21 |     \item{xName}{Name of a numeric column for the X-axis.}
22 |     \item{condits}{An R vector; each component is a character
23 |         string for an R logical expression representing a desired
24 |         condition involving \code{names(data)} other than S and Y.}
25 |     \item{qeFtn}{\code{qeML} predictive function (not quoted;
26 |         only default arguments will be used).}
27 |     \item{minS}{Minimum size for an S group to be retained in the analysis.}
28 |     \item{useLoess}{If TRUE, do loess smoothing on the fitted regression values.}
29 | }
30 |
31 | \author{
32 | N. Matloff, A. Ashok, S. Martha, A. Mittal
33 | }
34 |
35 | \value{No value; plot.}
36 |
37 | \examples{
38 | \donttest{
39 | data(compas1)
40 | # graph probability of recidivism by race given age, among those with at
41 | # most 4 prior convictions and COMPAS decile score at least 6
42 | compas1$two_year_recid <- as.numeric(compas1$two_year_recid == "Yes")
43 | dsldConditDisparity(compas1,"two_year_recid", "race", "age",
44 |     c("priors_count <= 4","decile_score>=6"), qeKNN)
45 |
46 | dsldConditDisparity(compas1,"two_year_recid", "race", "age",
47 |     "priors_count == 0", qeGBoost)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/R/dsldMatching.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | # finds the estimated mean difference between the matched Y pairs in the
4 | # treated/nontreated (exposed and non-exposed) groups, with covariates
5 | # X in 'data' other than the yName and sName columns
6 |
7 | # sName here is the "treatment" or "exposure," S
8 |
9 | # dsld wrapper for Matching::Match; optional propensFtn must be either
10 | # 'glm' for logit or 'knn' for qeKNN
11 |
12 | # in that optional case, we estimate P(S = 1 | X), either by a logistic
13 | # or k-NN model
14 |
15 | # due to the fact that various function calls require different argument
16 | # types, we may generate several different versions of a variable; e.g.
17 | # S is a factor but we also need logical and numeric versions
18 |
19 | dsldMatchedATE <- function(data,yName,sName,yesSVal,yesYVal=NULL,
20 |     propensFtn=NULL,k=NULL)
21 | {
22 |    getSuggestedLib("Matching")
23 |
24 |    ycol <- which(names(data) == yName)
25 |    y <- data[,ycol]
26 |
27 |    if (is.factor(y)) {
28 |       yLvls <- levels(y)
29 |       if (length(yLvls) != 2)
30 |          stop('factor Y can only be dichotomous')
31 |       yNum <- as.integer(y == yesYVal)
32 |       dichotY <- TRUE
33 |    } else {
34 |       yNum <- y
35 |       dichotY <- FALSE
36 |    }
37 |
38 |    scol <- which(names(data) == sName)
39 |    s <- data[,scol]
40 |    sLog <- (s == yesSVal)
41 |    sNum <- as.integer(sLog)
42 |
43 |    x <- data[,-c(ycol,scol)]
44 |    if (!allNumeric(x))
45 |       xNum <- factorsToDummies(x,omitLast=TRUE,dfOut=TRUE)
46 |    else xNum <- as.matrix(x)
47 |
48 |    if (!is.null(propensFtn)) {
49 |       if (propensFtn == 'glm') {
50 |          matchVals <- glm(sNum ~ xNum,family=binomial)$fitted.values
51 |       } else { # qeKNN
52 |          tmp <- qeKNN(data[,-ycol],sName,yesYVal=yesSVal,k=k,holdout=NULL)
53 |          matchVals <- tmp$regests
54 |       }
55 |       xNum <- matchVals
56 |    }
57 |
58 |    matchOut <- Matching::Match(Y=y,Tr=sLog,X=xNum,estimand='ATE',ties=FALSE)
59 |    matchOut
60 |
61 | }
62 |
63 |
--------------------------------------------------------------------------------
/man/dsldPropens.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldMatchedATE}
2 | \alias{dsldMatchedATE}
3 |
4 | \title{dsldMatchedATE}
5 |
6 | \description{
7 | Causal inference via matching models.
8 | Wrapper for \code{Matching::Match}.
9 | }
10 |
11 | \usage{
12 | dsldMatchedATE(data,yName,sName,yesSVal,yesYVal=NULL,
13 |     propensFtn=NULL,k=NULL)
14 | }
15 |
16 | \arguments{
17 |     \item{data}{Data frame.}
18 |     \item{yName}{Name of the response variable column.}
19 |     \item{sName}{Name of the sensitive attribute column. The
20 |         attribute must be dichotomous.}
21 |     \item{yesSVal}{S value to be considered "yes," to be coded
22 |         1 rather than 0.}
23 |     \item{yesYVal}{Y value to be considered "yes," to be coded
24 |         1 rather than 0.}
25 |     \item{propensFtn}{Either 'glm' (logistic), or 'knn'.}
26 |     \item{k}{Number of nearest neighbors if \code{propensFtn='knn'}.}
27 | }
28 |
29 | \value{
30 |
31 | Object of class 'Match'. See documentation in the
32 | \pkg{Matching} package.
33 | }
34 |
35 | \details{
36 |
37 | This is a \pkg{dsld} wrapper for \code{Matching::Match}.
38 |
39 | Matched analysis is typically applied to measuring "treatment effects,"
40 | but is often applied in situations in which the "treatment," S here, is
41 | an immutable attribute such as race or gender. The usual issues
42 | concerning observational studies apply.
43 |
44 | The function \code{dsldMatchedATE} finds the estimated mean difference
45 | between the matched Y pairs in the treated/nontreated (exposed and
46 | non-exposed) groups, with covariates X in \code{data} other than the
47 | \code{yName} and \code{sName} columns.
48 |
49 | In the propensity model case, we estimate P(S = 1 | X), either by a logistic
50 | or k-NN model.
51 | }
52 |
53 | \author{
54 | N. Matloff
55 | }
56 |
57 | \examples{
58 |
59 | data(lalonde,package='Matching')
60 | ll <- lalonde
61 | ll$treat <- as.factor(ll$treat)
62 | ll$re74 <- NULL
63 | ll$re75 <- NULL
64 | summary(dsldMatchedATE(ll,'re78','treat','1'))
65 | summary(dsldMatchedATE(ll,'re78','treat','1',propensFtn='glm'))
66 | summary(dsldMatchedATE(ll,'re78','treat','1',propensFtn='knn',k=15))
67 | }
68 |
--------------------------------------------------------------------------------
/R/dsldML.R:
--------------------------------------------------------------------------------
1 |
2 | # like dsldLinear and dsldLogit, but for machine learning (i.e.
3 | # nonparametric) prediction algorithms
4 |
5 | # args:
6 |
7 | #    data, yName, sName as usual
8 |
9 | #    sComparisonPts as in the with-interactions case of dsldLinear()
10 | #    (nonparametric case necessarily has interactions)
11 |
12 | #    qeMLftnName is, e.g. 'qeKNN'; opts is an R list of optional arguments
13 |    # for that function
14 |
15 | dsldML <- function(data,yName,sName,qeMLftnName,sComparisonPts='rand5',opts=NULL){
16 |
17 |    ycol <- which(names(data) == yName)
18 |    scol <- which(names(data) == sName)
19 |    slevels <- levels(data[,scol])
20 |
21 |    factors_info = factor_levels(data)
22 |
23 |    if (sComparisonPts=='rand5'){
24 |       rows <- sample(nrow(data), 5)
25 |       reducedData <- data[rows, ]
26 |       columns <- c(yName, sName)
27 |       sComparisonPts <- reducedData[, !(names(reducedData) %in% columns)]
28 |       sComparisonPts <- apply_factor_levels(sComparisonPts, factors_info)
29 |    }
30 |
31 |    sComparisonPts <- apply_factor_levels(sComparisonPts, factors_info)
32 |
33 |    # called from lapply(), calling the QE function on the subset of data
34 |    # corresponding to the specified level of the sensitive variable S
35 |    do1Slevel <- function(sLevel)
36 |    {
37 |       subData <- data[data[,scol]==sLevel,]
38 |       subData <- subData[,-scol]
39 |       opts[['data']] <- subData
40 |       opts[['yName']] <- yName
41 |       do.call(qeMLftnName,opts)
42 |    }
43 |
44 |    qeOut <- lapply(slevels,do1Slevel)
45 |    names(qeOut) <- slevels
46 |
47 |    testAccs <- sapply(qeOut,function(qeo) qeo$testAcc)
48 |    res <- list(testAccs = testAccs)
49 |
50 |    tmp <- sComparisonPts
51 |    for (sl in slevels) {
52 |       # predicted values are the values of the estimated regression
53 |       # function, just what we want
54 |       preds <- predict(qeOut[[sl]],sComparisonPts)
55 |       if (qeOut[[1]]$classif) {
56 |          if (is.null(preds$probs)) stop('ML function does not return "probs"')
57 |          preds <- preds$probs
58 |       } else preds <- as.vector(preds)
59 |       tmp[[sl]] <- preds
60 |    }
61 |
62 |    res$comparisons <- tmp
63 |
64 |    return(res)
65 | }
66 |
67 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyML.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldML from the dsld R package.
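A minimal usage sketch (assuming a pandas DataFrame df shaped like the
bundled svcensus data; opts is passed through to the qeML function):
    accs_dict, comparisons_df = dsldPyML(df, 'wageinc', 'gender', 'qeKNN', opts={'k': 50})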
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 |
7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
8 | import sys
9 | import pandas as pd
10 | import rpy2.robjects as robjects
11 | from rpy2.robjects import pandas2ri
12 | from rpy2.robjects import conversion
13 | from rpy2.robjects.packages import importr
14 | import rpy2.robjects as ro
15 | import math
16 | from rpy2.robjects.vectors import ListVector, FloatVector
17 | from .Utils import dsld_Rpy2_RDataframeToPandas
18 | import pandas as pd
19 |
20 | # Import R packages
21 |
22 | def map_last_k(df: pd.DataFrame, values: list):
23 |     k = len(values)
24 |     last_k_cols = df.columns[-k:]
25 |     # add "testAcc: " prefix to each column name
26 |     prefixed_cols = [f"testAcc: {col}" for col in last_k_cols]
27 |     return dict(zip(prefixed_cols, values))
28 |
29 | def dsldPyML(data, yName, sName, qeMLftnName, sComparisonPts='rand5', opts=None):
30 |
31 |     r_data = dsld_Rpy2_IsRDataframe(data)
32 |     yName = robjects.StrVector([yName])
33 |     sName = robjects.StrVector([sName])
34 |     qeMLftnName = robjects.StrVector([qeMLftnName])
35 |
36 |     if sComparisonPts != 'rand5':
37 |         if isinstance(sComparisonPts, pd.DataFrame):
38 |             sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts)
39 |         else:
40 |             sComparisonPts = robjects.StrVector([sComparisonPts])
41 |     else:
42 |         sComparisonPts = robjects.StrVector(['rand5'])
43 |
44 |     if opts is not None:
45 |         opts = ListVector({k: FloatVector([v]) for k, v in opts.items()})
46 |     else:
47 |         opts = robjects.NULL
48 |
49 |     # call dsldML in R
50 |     dsld = get_dsld()
51 |     model = dsld.dsldML(r_data, yName, sName, qeMLftnName, sComparisonPts, opts)
52 |
53 |     test_accuracies = model[0]
54 |     comparison_points = dsld_Rpy2_RDataframeToPandas(model[1])
55 |     comparison_points_dict = map_last_k(comparison_points, test_accuracies)
56 |
57 |     return comparison_points_dict, comparison_points
--------------------------------------------------------------------------------
/man/dsldML.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldML}
2 | \alias{dsldML}
3 | \title{dsldML}
4 |
5 | \description{
6 | Nonparametric comparison of sensitive groups.
7 | }
8 |
9 | \usage{
10 | dsldML(data,yName,sName,qeMLftnName,sComparisonPts='rand5',opts=NULL)
11 | }
12 |
13 | \arguments{
14 |     \item{data}{
15 |         A data frame.
16 |     }
17 |     \item{yName}{
18 |         Name of the response variable column.
19 |     }
20 |     \item{sName}{
21 |         Name(s) of the sensitive attribute column(s).
22 |     }
23 |     \item{qeMLftnName}{
24 |         Quoted name of a prediction function in the \code{qeML} package.
25 |     }
26 |     \item{sComparisonPts}{
27 |         Data frame of one or more data points at which the regression
28 |         function is to be estimated for each level of S. If this is
29 |         'rand5', then the said data points will consist of five randomly
30 |         chosen rows in the original dataset.
31 |     }
32 |     \item{opts}{
33 |         An R list specifying arguments for the above \code{qeML} function.
34 |     }
35 | }
36 |
37 | \author{
38 | N. Matloff
39 | }
40 |
41 | \examples{
42 |
43 | ## applying K-NN
44 | ## also works for: qeRF, qeRFranger, qeLASSO, qePolyLin/qePolyLog, qeXGBoost
45 |
46 | data(svcensus)
47 |
48 | w <- dsldML(svcensus,'wageinc','gender',qeMLftnName='qeKNN',
49 |     opts=list(k=50))
50 |
51 | # prints testAcc for each level in sName and the predictions on sComparisonPts
52 | print(w)
53 |
54 | }
55 |
56 | \details{
57 |
58 | In a linear model with no interactions, one can speak of "the"
59 | difference in mean Y given X across treatments, independent of X.
60 | In a nonparametric analysis, there is interaction by definition,
61 | and one can only speak of differences across treatments for a
62 | specific X value. Hence the need for the argument
63 | \code{sComparisonPts}.
64 |
65 | The specified \code{qeML} function will be called on the indicated data once
66 | for each level of the sensitive variable. For each such level, estimated
67 | regression function values will be obtained for each row in
68 | \code{sComparisonPts}.
69 | }
70 |
71 | \value{
72 |
73 | An R list. The first component consists of the holdout-set prediction
74 | accuracies, while the second is a data frame of predicted values for each
75 | sensitive group.
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFairUtils.py:
--------------------------------------------------------------------------------
1 | '''
2 | Python interface for dsldFairUtils functions in the dsld R package.
3 | '''
4 |
5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
6 | from rpy2.robjects.packages import importr
7 | import rpy2.robjects as robjects
8 | import numpy as np
9 | import pandas as pd
10 |
11 | def _to_float_vector(x):
12 |     if x is None:
13 |         return robjects.NULL
14 |     if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
15 |         return robjects.FloatVector([float(v) for v in x])
16 |     return robjects.FloatVector([float(x)])
17 |
18 | def _to_int_vector_scalar(x):
19 |     if x is None:
20 |         return robjects.NULL
21 |     return robjects.IntVector([int(x)])
22 |
23 | def _to_scalar_vector(x):
24 |     if x is None:
25 |         return robjects.NULL
26 |     if isinstance(x, (int, np.integer)):
27 |         return robjects.IntVector([int(x)])
28 |     if isinstance(x, (float, np.floating)):
29 |         return robjects.FloatVector([float(x)])
30 |     return robjects.StrVector([str(x)])
31 |
32 | def _to_str_vector(x):
33 |     if x is None:
34 |         return robjects.NULL
35 |     if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
36 |         return robjects.StrVector([str(v) for v in x])
37 |     return robjects.StrVector([str(x)])
38 |
39 |
40 | def dsldPyFairUtils(data, yName, sName, dsldFTNname,
41 |                     unfairness=None, deweightPars=None,
42 |                     yesYVal=None, k_folds=5):
43 |
44 |     r_data = dsld_Rpy2_IsRDataframe(data)
45 |
46 |     yName_r = robjects.StrVector([yName])   # keep single y
47 |     sName_r = _to_str_vector(sName)         # str or list -> R character vector
48 |     dsldFTNname_r = robjects.StrVector([dsldFTNname])
49 |
50 |     unfairness_r = _to_float_vector(unfairness)
51 |
52 |     if deweightPars is not None:
53 |         deweightPars_r = robjects.ListVector(
54 |             {k: _to_float_vector(v) for k, v in deweightPars.items()}
55 |         )
56 |     else:
57 |         deweightPars_r = robjects.NULL
58 |
59 |     yesYVal_r = _to_scalar_vector(yesYVal)
60 |     k_folds_r = _to_int_vector_scalar(k_folds)
61 |
62 |     dsld = get_dsld()
63 |     model = dsld.dsldFairUtils(
64 |         r_data, yName_r, sName_r, dsldFTNname_r,
65 |         unfairness_r, deweightPars_r, yesYVal_r, k_folds_r, robjects.NULL
66 |     )
67 |     return dsld_Rpy2_RDataframeToPandas(model)
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyLogit.py:
--------------------------------------------------------------------------------
1 | '''
2 | This file contains the interface code for calling dsldLogit from the dsld R package.
3 | The code uses rpy2 to call the dsld function from R, and the pandas library to check that
4 | the user's data input is a pandas data frame before doing any computation.
5 | '''
6 |
7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
8 | import rpy2.robjects as robjects
9 | from rpy2.robjects import pandas2ri
10 | from rpy2.robjects import conversion
11 | from rpy2.robjects.packages import importr
12 |
13 |
14 | # Import R packages
15 |
16 | def dsldPyLogit(data, yName, sName, sComparisonPts=None, interactions=False, yesYVal=None):
17 |
18 |     r_data = dsld_Rpy2_IsRDataframe(data)
19 |     yName = robjects.StrVector([yName])
20 |     sName = robjects.StrVector([sName])
21 |     interactions = robjects.BoolVector([interactions])
22 |     yesYVal = robjects.StrVector([yesYVal]) if yesYVal is not None else robjects.NULL
23 |
24 |     if sComparisonPts is not None:
25 |         sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts)
26 |     else:
27 |         sComparisonPts = robjects.NULL
28 |
29 |     # call dsldLogit in R
30 |     dsld = get_dsld()
31 |     model = dsld.dsldLogit(r_data, yName, sName, sComparisonPts, interactions, yesYVal)
32 |     return model
33 |
34 | def dsldPyLogitSummary(dsldLogit):
35 |     robjects.r.assign("dsldLogit", dsldLogit)
36 |     result = robjects.r('summary(dsldLogit)')
37 |     print(result)
38 |     return result
39 |
40 | def dsldPyLogitCoef(dsldLogit):
41 |     robjects.r.assign("dsldLogit", dsldLogit)
42 |     result = robjects.r('coef(dsldLogit)')
43 |     print(result)
44 |     return result
45 |
46 | def dsldPyLogitVcov(dsldLogit):
47 |     robjects.r.assign("dsldLogit", dsldLogit)
48 |     result = robjects.r('vcov(dsldLogit)')
49 |     print(result)
50 |     return result
51 |
52 | def dsldPyLogitGetData(dsldLogit):
53 |     robjects.r.assign("dsldLogit", dsldLogit)
54 |     result = robjects.r('dsldGetData(dsldLogit)')
55 |     print(result)
56 |     return result
57 |
58 | def dsldPyLogitPredict(dsldLogit, newData):
59 |     robjects.r.assign("dsldLogit", dsldLogit)
60 |     xNew = dsld_Rpy2_IsRDataframe(newData)
61 |     # xNew = dsld.convert_cols(newData, cat_features, num_features)
62 |     robjects.r.assign("xNew", xNew)
63 |     result = robjects.r('predict(dsldLogit, xNew)')
64 |     with conversion.localconverter(pandas2ri.converter):
65 |         result_py = conversion.rpy2py(result)
66 |     return result_py
--------------------------------------------------------------------------------
/man/dsldTakeALookAround.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldTakeALookAround}
2 | \alias{dsldTakeALookAround}
3 |
4 | \title{dsldTakeALookAround}
5 |
6 | \description{
7 |
8 | Evaluate feature sets for predicting Y while considering the
9 | Fairness-Utility Tradeoff.
10 | }
11 |
12 | \usage{
13 | dsldTakeALookAround(data, yName, sName, maxFeatureSetSize = (ncol(data) - 2),
14 |     holdout = floor(min(1000,0.1*nrow(data))))
15 | }
16 |
17 | \arguments{
18 |     \item{data}{
19 |         Data frame.
20 |     }
21 |     \item{yName}{
22 |         Name of the response variable column.
23 |     }
24 |     \item{sName}{
25 |         Name of the sensitive attribute column.
26 |     }
27 |     \item{maxFeatureSetSize}{
28 |         Maximum size of the feature combinations to be
29 |         evaluated.
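        For example, with \code{maxFeatureSetSize = 2}, all single features
        and all feature pairs are evaluated.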
30 |
31 |     }
32 |     \item{holdout}{
33 |         If not NULL, form a holdout set of the specified size. After fitting to the
34 |         remaining data, evaluate accuracy on the holdout set.
35 |     }
36 | }
37 |
38 | \details{
39 |
40 | This function provides a tool for exploring feature combinations to use
41 | in predicting an outcome Y from features X and a sensitive variable S.
42 |
43 | The features in X will first be considered singly, then doubly and so
44 | on, up through feature combination size \code{maxFeatureSetSize}. Y is
45 | predicted from X using either a linear model (numeric Y) or a logit model
46 | (dichotomous Y).
47 |
48 | The accuracy (based on qeML holdout) will be computed for each of these
49 | cases: (a) Y predicted from the given feature combination C, (b) Y
50 | predicted from the given feature combination C plus S, and (c) S predicted
51 | from C. The difference between columns 'a' and 'b' shows the sacrifice
52 | in utility stemming from not using S in our prediction of Y. (Due to
53 | sampling variation, it is possible for column 'b' to be larger than
54 | 'a'.) The value in column 'c' shows fairness, the smaller the fairer.
55 |
56 | }
57 |
58 | \author{
59 | N. Matloff, A. Ashok, S. Martha, A. Mittal
60 | }
61 |
62 | \examples{
63 | \donttest{
64 | # investigate predictive accuracy for a continuous Y,
65 | # 'wageinc', using the default arguments for maxFeatureSetSize = 4
66 | data(svcensus)
67 | dsldTakeALookAround(svcensus, 'wageinc', 'gender', 4)
68 |
69 | # investigate the predictive accuracy for a categorical Y,
70 | # 'educ', using the default arguments for maxFeatureSetSize = 4
71 | dsldTakeALookAround(svcensus, 'educ', 'gender')}
72 | }
73 |
74 | \value{Data frame whose first column consists of the variable names,
75 | followed by columns 'a', 'b' and 'c' as described in 'details'.}
76 |
--------------------------------------------------------------------------------
/inst/src/dsldPy/dsldPyFreqPCoord.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | from typing import Optional, Union, Sequence
4 | from IPython.display import Image, display
5 |
6 | import pandas as pd
7 | import rpy2.robjects as ro
8 | from rpy2.robjects.vectors import IntVector, StrVector, BoolVector
9 | from rpy2.robjects.packages import importr
10 |
11 | from .Utils import dsld_Rpy2_IsRDataframe, get_dsld
12 |
13 | def _maybe_intvec(x):
14 |     if x is None:
15 |         return ro.NULL
16 |     if isinstance(x, (list, tuple)):
17 |         return IntVector(list(x))
18 |     return IntVector([int(x)])
19 |
20 | def _maybe_strvec(x: Optional[Union[str, Sequence[str]]]):
21 |     if x is None:
22 |         return ro.NULL
23 |     if isinstance(x, (list, tuple)):
24 |         return StrVector(list(x))
25 |     return StrVector([str(x)])
26 |
27 | def dsldPyFreqPCoord(data, m, sName, method = "maxdens", faceting = "vert", k = 50, klm = None, keepidxs = None, plotidxs = False, cls = None, plot_filename = None, show = True):
28 |
29 |     # Prepare inputs (pass scalars where R expects scalars)
30 |     r_data = dsld_Rpy2_IsRDataframe(data)
31 |     r_m = int(m)
32 |     r_sName = _maybe_strvec(sName)
33 |     r_method = str(method)
34 |     r_faceting = str(faceting)
35 |     r_k = int(k)
36 |     if klm is None:
37 |         klm = 5 * k
38 |     r_klm = int(klm)
39 |     r_keepidxs = _maybe_intvec(keepidxs)
40 |     r_plotidxs = bool(plotidxs)
41 |     r_cls = _maybe_strvec(cls)
42 |
43 |     # Case A: user provided output filename
44 |     if plot_filename:
45 |         dsld = get_dsld()
46 |         res = dsld.dsldFreqPCoord(
47 |             r_data, r_m, r_sName, r_method, r_faceting, r_k, r_klm,
48 |             r_keepidxs, r_plotidxs, r_cls, plot_filename
49 |         )
50 |
51 |         try:
52 |             ro.r("print")(res)
53 |         except Exception:
54 |             pass
55 |         return
56 |
57 |     # Case B: capture to a temporary PNG and show
58 |     fd, tmpfile = tempfile.mkstemp(suffix=".png")
59 |     os.close(fd)
60 |     try:
61 |         grdevices = importr("grDevices")
62 |         grdevices.png(file=tmpfile, width=1200, height=800, res=150)
63 |         dsld = get_dsld()
64 |         res = dsld.dsldFreqPCoord(
65 |             r_data, r_m, r_sName, r_method, r_faceting, r_k, r_klm,
66 |             r_keepidxs, r_plotidxs, r_cls, ro.NULL
67 |         )
68 |
69 |         try:
70 |             ro.r("print")(res)
71 |         except Exception:
72 |             pass
73 |     finally:
74 |         grdevices.dev_off()
75 |
76 |     if os.path.exists(tmpfile):
77 |         display(Image(filename=tmpfile))
78 |
--------------------------------------------------------------------------------
/man/dsldFairUtils.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFairUtils}
2 | \alias{dsldFairUtils}
3 | \title{dsldFairUtils}
4 |
5 | \description{
6 | Exploration of the Fairness-Utility Tradeoff. Finds predictive accuracy
7 | and correlation between S and predicted Y.
8 | }
9 |
10 | \usage{
11 | dsldFairUtils(data, yName, sName, dsldFTNName, unfairness = NULL,
12 | deweightPars = NULL, yesYVal = NULL, k_folds = 5, model_args = NULL)
13 | }
14 |
15 | \arguments{
16 | \item{data}{
17 | Data frame.
18 | }
19 | \item{yName}{
20 | Name of the response variable Y column. Y must be numeric or
21 | binary (two-level R factor).
22 | }
23 | \item{sName}{
24 | Name of the sensitive attribute S column.
25 | }
26 | \item{dsldFTNName}{
27 | Quoted name of one of the \pkg{fairml} or EDF functions.
28 | }
29 | \item{unfairness}{
30 | Vector of unfairness values. Must be non-NULL for the \pkg{fairml} functions.
31 | }
32 | \item{deweightPars}{
33 | Named list giving a grid of deweighting values. Must be non-NULL for the EDF functions.
34 | }
35 | \item{yesYVal}{
36 | Y value to be treated as Y = 1 for binary Y.
37 | }
38 | \item{k_folds}{
39 | Number of folds to use in k-fold cross-validation.
40 | The final result is reported as the average across all folds.
41 | }
42 |
43 | \item{model_args}{
44 | A named list of additional arguments passed directly to \code{dsldFTNName}.
45 | For example, \code{model_args = list(k = 25)}.
46 | }
47 | }
48 |
49 | \author{
50 | A. Mittal, N. Matloff
51 | }
52 |
53 | \examples{
54 |
55 | \donttest{
56 | data(svcensus)
57 |
58 | ## regression examples shown --- also works for classification
59 | dsldFairUtils(svcensus,
60 | 'wageinc',
61 | 'gender',
62 | 'dsldQeFairKNN',
63 | k_folds = 5,
64 | model_args = list(k = 25),
65 | deweightPars = list('occ' = c(0.9,0.2), 'educ' = c(0.3, 0.9)))
66 |
67 | dsldFairUtils(svcensus,
68 | 'wageinc',
69 | 'gender',
70 | 'dsldFrrm',
71 | k_folds = 5,
72 | unfairness = c(0.9, 0.6, 0.1, 0.05, 0.005))
73 | }
74 | }
75 |
76 | \details{
77 |
78 | Tool for exploring the tradeoff between utility (predictive accuracy: Mean
79 | Absolute Prediction Error or overall probability of misclassification)
80 | and fairness. Roughly speaking, the latter is defined as the strength of
81 | relation between S and predicted Y (the smaller, the better).
82 |
83 | }
84 |
85 | \value{
86 |
87 | A data frame showing accuracy and correlation between predicted Y and S.
88 | 89 | } 90 | 91 | -------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyLinear.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This file contains the interface code for calling the dsldLinear from dsld R package. 3 | The code uses rpy2 to handle dsld functions call from R and pandas library to check if 4 | users data input is in pandas data frame before doing any computation 5 | ''' 6 | 7 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas 8 | import sys 9 | import pandas as pd 10 | import rpy2.robjects as robjects 11 | from rpy2.robjects import pandas2ri 12 | from rpy2.robjects import conversion 13 | from rpy2.robjects.packages import importr 14 | import rpy2.robjects as ro 15 | import math 16 | 17 | 18 | def dsldPyLinear(data, yName, sName, interactions=False, 19 | sComparisonPts=None, useSandwich=False): 20 | 21 | """Python wrapper for dsldLinear in the dsld R package""" 22 | 23 | r_data = dsld_Rpy2_IsRDataframe(data) 24 | yName = robjects.StrVector([yName]) 25 | sName = robjects.StrVector([sName]) 26 | interactions = robjects.BoolVector([interactions]) 27 | useSandwich = robjects.BoolVector([useSandwich]) 28 | 29 | if sComparisonPts is not None: 30 | sComparisonPts = dsld_Rpy2_IsRDataframe(sComparisonPts) 31 | else: 32 | sComparisonPts = robjects.NULL 33 | 34 | dsld = get_dsld() 35 | dsldLinearObj = dsld.dsldLinear(r_data, yName, sName, 36 | interactions, sComparisonPts, useSandwich) 37 | return dsldLinearObj 38 | 39 | def dsldPyLinearSummary(dsldLinear): 40 | robjects.r.assign("dsldLinear", dsldLinear) 41 | result = robjects.r('summary(dsldLinear)') 42 | print(result) 43 | return result 44 | 45 | def dsldPyLinearCoef(dsldLinear): 46 | robjects.r.assign("dsldLinear", dsldLinear) 47 | result = robjects.r('coef(dsldLinear)') 48 | print(result) 49 | return result 50 | 51 | def dsldPyLinearVcov(dsldLinear): 52 | robjects.r.assign("dsldLinear", dsldLinear) 53 | result = robjects.r('vcov(dsldLinear)') 54 | print(result) 55 | return result 56 | 57 | def dsldPyLinearGetData(dsldLinear): 58 | robjects.r.assign("dsldLinear", dsldLinear) 59 | result = robjects.r('dsldGetData(dsldLinear)') 60 | print(result) 61 | return result 62 | 63 | def dsldPyLinearPredict(dsldLinear, newData): 64 | robjects.r.assign("dsldLinear", dsldLinear) 65 | xNew = dsld_Rpy2_IsRDataframe(newData) 66 | # xNew = dsld.convert_cols(newData, cat_features, num_features) 67 | robjects.r.assign("xNew", xNew) 68 | result = robjects.r('predict(dsldLinear, xNew)') 69 | with conversion.localconverter(pandas2ri.converter): 70 | result_py = conversion.rpy2py(result) 71 | return result_py 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /man/dsldHunting.Rd: -------------------------------------------------------------------------------- 1 | \name{dsldCHunting and dsldOHunting} 2 | \alias{dsldCHunting} 3 | \alias{dsldOHunting} 4 | \title{Confounder and Proxy Hunting} 5 | 6 | \description{ 7 | Confounder hunting: searches for variables C that predict both Y and 8 | S. Proxy hunting: searches for variables O that predict S. 9 | } 10 | 11 | \usage{ 12 | dsldCHunting(data,yName,sName,intersectDepth=10) 13 | dsldOHunting(data,yName,sName) 14 | } 15 | 16 | \arguments{ 17 | \item{data}{ 18 | Data frame. 19 | } 20 | \item{yName}{ 21 | Name of the response variable column. 22 | } 23 | \item{sName}{ 24 | Name of the sensitive attribute column. 
25 | }
26 | \item{intersectDepth}{
27 | Maximum size of intersection of the Y predictor set and
28 | the S predictor set.
29 | }
30 | }
31 |
32 | \details{
33 |
34 | \code{dsldCHunting}: The random forests function
35 | \code{qeML::qeRF} will be run on the given data to assess feature
36 | importance in prediction of Y (without S) and S (without Y). Call
37 | these "important predictors" of Y and S.
38 |
39 | Then for each \code{i} from 1 to \code{intersectDepth}, the
40 | intersection of the top \code{i} important predictors of Y and
41 | the top \code{i} important predictors of S will be reported, thus
42 | suggesting possible confounders. Larger values of \code{i} will
43 | report more potential confounders, though including progressively
44 | weaker ones.
45 |
46 | The analyst may then consider omitting the variables C from
47 | models of the effect of S on Y.
48 |
49 | Note: Run times may be long.
50 |
51 | \code{dsldOHunting}: Factors, if any, will be converted to dummy
52 | variables, and then the Kendall Tau correlations will be calculated
53 | between S and potential proxy variables O, i.e. every column other
54 | than Y and S. (The Y column itself doesn't enter into computation.)
55 |
56 | In fairness analyses, in which one desires to either eliminate or
57 | reduce the impact of S, one must consider the indirect effect of S
58 | via O. One may wish to eliminate or reduce the role of O.
59 |
60 | }
61 |
62 | \author{
63 | N. Matloff
64 | }
65 |
66 | \value{
67 |
68 | The function \code{dsldCHunting} returns an R list, one component for
69 | each confounder set found.
70 |
71 | The function \code{dsldOHunting} returns an R matrix of correlations,
72 | one row for each level of S.
73 |
74 | }
75 |
76 | \examples{
77 | \donttest{
78 | data(lsa)
79 | dsldCHunting(lsa,'bar','race1')
80 | # e.g. suggests confounders 'decile3', 'lsat'
81 |
82 | data(mortgageSE)
83 | dsldOHunting(mortgageSE,'deny','black')
84 | # e.g. suggests using loan value and condo purchase as proxies
85 | }
86 | }
--------------------------------------------------------------------------------
/inst/src/dsldPy/Utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from PIL import Image
4 | import rpy2.robjects as robjects
5 | from rpy2.robjects import StrVector, FloatVector, ListVector
6 | from rpy2.robjects import pandas2ri
7 |
8 |
9 | import rpy2.robjects as ro
10 | from rpy2.robjects import default_converter
11 | from rpy2.robjects.conversion import localconverter
12 | from rpy2.robjects import pandas2ri
13 |
14 | import os
15 | import pyreadr
16 | from rpy2.robjects.packages import importr
17 |
18 |
19 | def get_dsld():
20 |     """Return the R 'dsld' package handle using the installed version in R."""
21 |     return importr("dsld")
22 |
23 | def get_dsld_version():
24 |     """Return the installed dsld version as a string (or None if unavailable)."""
25 |     try:
26 |         importr("utils")  # ensure utils is available
27 |         ver = ro.r('as.character(utils::packageVersion("dsld"))')[0]
28 |         return ver
29 |     except Exception:
30 |         return None
31 |
32 | ### data-frame conversion functions
33 | # This function converts a pandas data frame into an R data frame
34 | ## updated to remove deprecated function
35 | def dsld_Rpy2_IsRDataframe(data):
36 |     """
37 |     If data is an R data.frame, return it.
38 |     If data is a pandas DataFrame, convert to R and return.
39 |     Otherwise return -1.
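    Illustrative sketch (hypothetical data frame):

        import pandas as pd
        df = pd.DataFrame({'x': [1.0, 2.0], 'g': ['a', 'b']})
        r_df = dsld_Rpy2_IsRDataframe(df)  # rpy2 DataFrame, ready for dsld calls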
40 | """ 41 | if isinstance(data, ro.vectors.DataFrame): 42 | return data 43 | elif isinstance(data, pd.DataFrame): 44 | return dsld_Rpy2_PandasToRDataframe(data) 45 | else: 46 | print("Error: not Rdata or Pandas Dataframe") 47 | return -1 48 | 49 | ### helper functions for dsld_Rpy2_IsRDataframe 50 | def dsld_Rpy2_PandasToRDataframe(pandas_df: pd.DataFrame): 51 | """convert pandas -> R data.frame.""" 52 | with localconverter(default_converter + pandas2ri.converter): 53 | return ro.conversion.py2rpy(pandas_df) 54 | 55 | def dsld_Rpy2_RDataframeToPandas(r_df): 56 | """convert R data.frame -> pandas DataFrame.""" 57 | with localconverter(default_converter + pandas2ri.converter): 58 | return ro.conversion.rpy2py(r_df) 59 | 60 | ### reading data // data cleaning 61 | def read_data(filepath, **kwargs): 62 | 63 | ext = os.path.splitext(filepath)[1].lower() 64 | 65 | if ext == ".csv": 66 | return pd.read_csv(filepath, **kwargs) 67 | 68 | elif ext in [".rdata", ".rda"]: 69 | result = pyreadr.read_r(filepath) 70 | key = list(result.keys())[0] 71 | return result[key] 72 | 73 | else: 74 | raise ValueError(f"Unsupported file extension: {ext}") 75 | 76 | def preprocess_data(data, cat_features, num_features): 77 | r_data = dsld_Rpy2_IsRDataframe(data) 78 | dsld = get_dsld() 79 | r_data = dsld.convert_cols(r_data, cat_features, num_features) 80 | return r_data 81 | -------------------------------------------------------------------------------- /inst/src/dsldPy/__init__.py: -------------------------------------------------------------------------------- 1 | from .dsldPyQeFairML import ( 2 | dsldPyQeFairKNN, 3 | dsldPyQeFairRF, 4 | dsldPyQeFairRidgeLin, 5 | dsldPyQeFairRidgeLog, 6 | dsldPyQeFairML_Predict, 7 | ) 8 | from .dsldPyLinear import ( 9 | dsldPyLinear, 10 | dsldPyLinearSummary, 11 | dsldPyLinearCoef, 12 | dsldPyLinearVcov, 13 | dsldPyLinearGetData, 14 | dsldPyLinearPredict, 15 | ) 16 | from .dsldPyFairML import ( 17 | dsldPyFrrm, 18 | dsldPyFgrrm, 19 | dsldPyNclm, 20 | dsldPyZlm, 21 | dsldPyZlrm, 22 | dsldPyFairML_Summary, 23 | dsldPyFairML_Predict, 24 | ) 25 | from .dsldPyBnLearn import dsldPyIamb 26 | from .dsldPyScatterPlot3D import dsldPyScatterPlot3D 27 | from .dsldPyFreqPCoord import dsldPyFreqPCoord 28 | from .dsldPyConditDisparity import dsldPyConditDisparity 29 | from .dsldPyLogit import ( 30 | dsldPyLogit, 31 | dsldPyLogitSummary, 32 | dsldPyLogitCoef, 33 | dsldPyLogitVcov, 34 | dsldPyLogitGetData, 35 | dsldPyLogitPredict, 36 | ) 37 | from .dsldPyFrequencybyS import dsldPyFrequencybyS 38 | from .dsldPyConfounders import dsldPyConfounders 39 | from .dsldPyML import dsldPyML 40 | from .dsldPyTakeALookAround import dsldPyTakeALookAround 41 | from .dsldPyDensitybyS import dsldPyDensitybyS 42 | from .dsldPyMatching import dsldPyMatchedATE 43 | from .dsldPyHunting import dsldPyCHunting, dsldPyOHunting 44 | from .dsldPyFairUtils import dsldPyFairUtils 45 | from .Utils import ( 46 | dsld_Rpy2_IsRDataframe, 47 | dsld_Rpy2_PandasToRDataframe, 48 | dsld_Rpy2_RDataframeToPandas, 49 | read_data, 50 | preprocess_data, 51 | ) 52 | 53 | __all__ = [ 54 | 'dsldPyQeFairKNN', 55 | 'dsldPyQeFairRF', 56 | 'dsldPyQeFairRidgeLin', 57 | 'dsldPyQeFairRidgeLog', 58 | 'dsldPyQeFairML_Predict', 59 | 'dsldPyLinear', 60 | 'dsldPyLinearSummary', 61 | 'dsldPyLinearCoef', 62 | 'dsldPyLinearVcov', 63 | 'dsldPyLinearGetData', 64 | 'dsldPyLinearPredict', 65 | 'dsldPyFrrm', 66 | 'dsldPyFgrrm', 67 | 'dsldPyNclm', 68 | 'dsldPyZlm', 69 | 'dsldPyZlrm', 70 | 'dsldPyFairML_Summary', 71 | 'dsldPyFairML_Predict', 72 | 'dsldPyIamb', 
'dsldPyScatterPlot3D',
74 | 'dsldPyFreqPCoord',
75 | 'dsldPyConditDisparity',
76 | 'dsldPyLogit',
77 | 'dsldPyLogitSummary',
78 | 'dsldPyLogitCoef',
79 | 'dsldPyLogitVcov',
80 | 'dsldPyLogitGetData',
81 | 'dsldPyLogitPredict',
82 | 'dsldPyFrequencybyS',
83 | 'dsldPyConfounders',
84 | 'dsldPyML',
85 | 'dsldPyTakeALookAround',
86 | 'dsldPyDensitybyS',
87 | 'dsldPyMatchedATE',
88 | 'dsldPyCHunting',
89 | 'dsldPyOHunting',
90 | 'dsldPyFairUtils',
91 | 'dsld_Rpy2_IsRDataframe',
92 | 'dsld_Rpy2_PandasToRDataframe',
93 | 'dsld_Rpy2_RDataframeToPandas',
94 | 'read_data',
95 | 'preprocess_data',
96 | ]
97 |
98 | __version__ = '0.0.3'
99 |
--------------------------------------------------------------------------------
/R/dsldHunting.R:
--------------------------------------------------------------------------------
1 |
2 | # ad hoc aid in deciding which covariates one should treat as
3 | # confounders
4 |
5 | # we want to find variables C that are correlated with both Y and S
6 |
7 | # based on qeRF, which uses the 'randomForest' package; its output
8 | # includes a variable importance measure
9 |
10 | # importance here uses the permutation method, measuring deterioration
11 | # in prediction accuracy resulting from shuffling the given data column;
12 | # the greater the deterioration, the more important the variable
13 |
14 | # 'intersectDepth' specifies the number of prediction sets for each of Y
15 | # and S to examine for intersection; in datasets with many predictors,
16 | # this probably should be set to a larger value, or else each
17 | # intersection may be null
18 |
19 | dsldCHunting <- function(data, yName, sName, intersectDepth = 10) {
20 |
21 |   ycol <- which(names(data) == yName)
22 |   scol <- which(names(data) == sName)
23 |   y <- data[, ycol]
24 |   s <- data[, scol]
25 |
26 |   dataNoS <- data[, -scol]  # for predicting Y
27 |   dataNoY <- data[, -ycol]  # for predicting S
28 |
29 |   impY <- qeML::qeRF(dataNoS, yName)$importance
30 |   impS <- qeML::qeRF(dataNoY, sName)$importance
31 |
32 |   # the 'importance' output format has several different cases, which
33 |   # must be dealt with separately in extracting the actual importance
34 |   # vector
35 |   nlevsY <- length(levels(y))
36 |   if (is.numeric(y) || nlevsY == 2)
37 |     impY1 <- impY[, 1]
38 |   else if (is.factor(y)) {
39 |     impY1 <- impY[, nlevsY + 1]
40 |   }
41 |   else stop("Y must be numeric or an R factor")
42 |   if (!is.factor(s)) stop("S must be an R factor")
43 |   nlevsS <- length(levels(s))
44 |   if (nlevsS == 2) impS1 <- impS[, 1] else impS1 <- impS[, nlevsS + 1]
45 |
46 |   # larger values mean higher importance
47 |   impY1 <- sort(impY1, decreasing = TRUE)
48 |   impS1 <- sort(impS1, decreasing = TRUE)
49 |
50 |   # start assembling output
51 |   res <- list(impForY = impY1, impForS = impS1)
52 |   nmsY <- names(impY1)
53 |   nmsS <- names(impS1)
54 |   res$inCommon <- list()
55 |
56 |   # for each i, find the "top i" set of confounders, defined as being
57 |   # highly correlated with both Y and S
58 |   for (i in 1:min(intersectDepth, ncol(data) - 2)) {
59 |     res$inCommon[[i]] <- intersect(nmsY[1:i], nmsS[1:i])
60 |   }
61 |
62 |   return(res)
63 | }
64 |
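# illustrative sketch (not run), using the package's svcensus data; the
# column names are real svcensus columns, but the settings are arbitrary:
#    ch <- dsldCHunting(svcensus, 'wageinc', 'gender', intersectDepth = 5)
#    ch$inCommon   # candidate confounder sets, one per intersection depth i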
65 |
66 | # ad hoc aid in deciding which covariates one should treat as
67 | # proxies
68 |
69 | # we want to find variables O that are correlated with S; S need not be
70 | # binary/categorical
71 |
72 | # based on cor(), using Kendall's Tau in order to accommodate binary
73 | # variables (0,1 valued), and to mitigate effects of outliers
74 |
75 | dsldOHunting <- function(data,yName,sName)
76 | {
77 |
78 |    ycol <- which(names(data) == yName)
79 |    scol <- which(names(data) == sName)
80 |
81 |    sdumms <- regtools::factorsToDummies(data[,scol,drop=FALSE])
82 |    odumms <- regtools::factorsToDummies(data[,-c(ycol,scol),drop=FALSE])
83 |
84 |    cor(sdumms,odumms,method='kendall')
85 |
86 | }
87 |
88 |
89 |
--------------------------------------------------------------------------------
/man/dsldLogit.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldLogit}
2 | \alias{dsldLogit}
3 | \alias{predict.dsldGLM}
4 | \alias{coef.dsldGLM}
5 | \alias{vcov.dsldGLM}
6 | \alias{summary.dsldGLM}
7 | \title{dsldLogit}
8 |
9 | \description{
10 | Comparison of conditions for sensitive groups via logistic regression
11 | models, with or without interactions with the sensitive variable.
12 | }
13 |
14 | \usage{
15 | dsldLogit(data, yName, sName, sComparisonPts = NULL, interactions = FALSE,
16 | yesYVal)
17 | \method{summary}{dsldGLM}(object,...)
18 | \method{predict}{dsldGLM}(object,xNew,...)
19 | \method{coef}{dsldGLM}(object,...)
20 | \method{vcov}{dsldGLM}(object,...)
21 | }
22 |
23 | \arguments{
24 | \item{data}{
25 | Data frame used to train the logistic model; will be split according to
26 | each level of \code{sName} in output if \code{interactions} is TRUE.
27 | }
28 | \item{yName}{
29 | Name of the response variable column.
30 | }
31 | \item{sName}{
32 | Name of the sensitive attribute column.
33 | }
34 | \item{interactions}{
35 | If TRUE, fit interactions with the sensitive variable.
36 | }
37 | \item{sComparisonPts}{
38 | If \code{interactions} is TRUE, a
39 | data frame of new cases (minus Y,S) for which P(Y = 1| X)
40 | will be compared between each pair of S levels. Must be
41 | in the same format as the original data.
42 | }
43 | \item{yesYVal}{
44 | Y value to be considered 'yes', to be coded 1 rather than 0.
45 | }
46 | \item{object}{
47 | An object returned by \code{dsldLogit}.
48 | }
49 | \item{xNew}{
50 | Data frame of new cases to predict. Must be in the same format
51 | as \code{data}.
52 | }
53 | \item{...}{Further arguments.}
54 | }
55 |
56 | \author{
57 | N. Matloff, A. Mittal, A. Ashok
58 | }
59 |
60 | \examples{
61 |
62 | data(lsa)
63 |
64 | ### interactions case - exclude S and Y in newData
65 | newData <- lsa[c(2,22,222,2222),-c(8,11)]
66 | log1 <- dsldLogit(lsa,'bar','race1', newData, interactions = TRUE, 'TRUE')
67 |
68 | # extract results
69 | coef(log1)
70 | vcov(log1)
71 | summary(log1)
72 |
73 | # predict new data --- one prediction for each level of S per row
74 | predict(log1, newData)
75 |
76 | # no interaction case - exclude Y in newData
77 | newData <- lsa[c(2,22,222,2222),-c(11)]
78 | log2 <- dsldLogit(data = lsa, yName = 'bar',sName = 'gender',
79 | interactions = FALSE, yesYVal = 'TRUE')
80 |
81 | summary(log2)
82 |
83 | # predict on newData --- one prediction per row
84 | predict(log2, newData)
85 |
86 | }
87 |
88 | \details{
89 |
90 | The \code{dsldLogit} function fits a logistic
91 | regression model to the response variable. Interactions are handled
92 | as in \code{dsldLinear}.
93 |
94 | }
95 |
96 |
97 | \value{
98 |
99 | The \code{dsldLogit} function returns an S3 object of class 'dsldGLM',
100 | with one component for each level of S. Each component includes
101 | information about the fitted model.
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/man/dsldScatterPlot3D.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldScatterPlot3D}
2 | \alias{dsldScatterPlot3D}
3 | \title{ScatterPlot3D in dsld}
4 | \description{ Plotly 3D visualization of a dataset on 3 axes,
5 | with points color-coded on a 4th variable.}
6 | \usage{
7 | dsldScatterPlot3D(data, yNames, sName, sGroups = NULL, sortedBy =
8 | "Name", numGroups = 8, maxPoints = NULL, xlim = NULL,
9 | ylim = NULL, zlim = NULL, main = NULL, colors =
10 | "Paired", opacity = 1, pointSize = 8)}
11 |
12 | \arguments{
13 | \item{data}{
14 | Data frame with at least 4 columns.
15 | }
16 | \item{yNames}{
17 | Vector of the indices or names of the columns of the data frame to be
18 | graphed on the 3 axes.
19 | }
20 | \item{sName}{
21 | Index or name of the column containing the groups by which the data
22 | will be grouped. This will affect the colors of the points of the graph.
23 | This column must be an R factor.
24 | }
25 | \item{sGroups}{
26 | Vector of the names of the groups by which the data will be grouped.
27 | Every value in the vector must exist in the \code{sName} column of the data
28 | frame. If not supplied or NULL, the function will create this
29 | automatically according to the \code{sortedBy} and \code{numGroups} parameters.
30 | By default, the function uses the 8 alphabetically first distinct groups
31 | in the \code{sName} column.
32 | }
33 | \item{sortedBy}{
34 | Controls how \code{sGroups} is created automatically. If \code{sGroups}
35 | is supplied, this does nothing. One of three values: "Name", "Frequency",
36 | "Frequency-Descending".
37 |
38 | "Name" gets the first values alphabetically.
39 | "Frequency" gets the most frequently occurring values.
40 | "Frequency-Descending" gets the least frequently occurring values.
41 | }
42 | \item{numGroups}{
43 | Number of groups to be automatically generated by the function. If
44 | \code{sGroups} is supplied, this does nothing.
45 | }
46 | \item{maxPoints}{
47 | Limit to how many points may be displayed on the graph.
48 | There is no limit by default.
49 | }
50 | \item{xlim, ylim, zlim}{
51 | The x, y and z limits, each a vector with c(min, max).
52 | }
53 | \item{main}{
54 | The title of the graph. By default, the \code{yNames} columns
55 | followed by "by" and the \code{sName} column.
56 | }
57 | \item{colors}{
58 | Either a colorbrewer2.org palette name (e.g. "YlOrRd" or "Blues"),
59 | or a vector of colors to interpolate in hexadecimal "#RRGGBB" format,
60 | or a color interpolation function like colorRamp().
61 | }
62 | \item{opacity}{
63 | Point opacity, a value between 0 and 1.
64 | }
65 | \item{pointSize}{
66 | Point size, a value above 1.
67 | }
68 | }
69 |
70 | \details{
71 |
72 | An interactive Plotly visualization will be created, with the three
73 | variables specified in \code{yNames}. Points will be color-coded
74 | according to \code{sName}. The plot can be rotated etc. using the mouse.
75 |
76 | }
77 |
78 | \references{
79 | https://plotly.com/r/3d-scatter-plots/
80 | }
81 |
82 | \author{
83 | J. Tran and B.
Zarate
84 | }
85 |
86 | \examples{
87 | data(lsa)
88 | dsldScatterPlot3D(lsa,sName = "race1",
89 | yNames=c("ugpa", "lsat","age"), xlim=c(2,4))
90 | }
91 |
92 | \value{No value, plot.}
93 |
94 |
95 |
--------------------------------------------------------------------------------
/inst/README.md:
--------------------------------------------------------------------------------
1 | # dsldPy — Python Interface to DSLD
2 |
3 | Statistical and graphical tools for detecting and measuring discrimination and bias in datasets,
4 | with Python interfaces available via rpy2. **dsldPy** wraps the R package **dsld** with a Python-friendly API
5 | using the same underlying R implementations.
6 |
7 | **Relevant links:**
8 |
9 | - **Quarto Book**: [Book](https://htmlpreview.github.io/?https://github.com/matloff/dsldBook/blob/main/_book/index.html) - Important statistical principles and applications.
10 | - **Research Paper**: [Paper](https://arxiv.org/abs/2411.04228) - Package implementation details.
11 |
12 | ## Overview
13 |
14 | DSLD addresses two main types of bias analysis:
15 |
16 | - **Estimation analysis:** quantify possible discrimination by estimating effects of a sensitive variable S on an outcome Y, while adjusting for confounders C.
17 |
18 | - **Prediction analysis (fair ML):** build predictive models that limit the influence of S and its proxies O, trading off fairness and utility.
19 |
20 | **dsldPy** provides wrappers for all 24 R functions.
21 |
22 | ## Prerequisites
23 |
24 | - R installed and on PATH (R 4.x recommended)
25 | - R package dsld installed (CRAN or GitHub)
26 | - Python 3.8+
27 |
28 | Install dsld in R:
29 |
30 | ```r
31 | install.packages("dsld")
32 |
33 | ## or latest development version
34 | # install.packages("remotes")
35 | remotes::install_github("matloff/dsld", force = TRUE)
36 | ```
37 |
38 | Tip: Ensure rpy2 can find R. From a terminal, `R RHOME` should print your R home. If Python cannot find R, set `R_HOME` in your environment per rpy2's documentation.
39 |
40 | ## Installation
41 |
42 | Install the Python package (its source lives in subdirectory `inst` of this repository):
43 |
44 | ```bash
45 | pip install dsldPy
46 | ```
47 |
48 | This will install dsldPy and its Python dependencies (pandas, numpy, rpy2, etc.). The user still needs to install **R** and the **dsld** package manually, as noted above.
49 |
50 | ## Quickstart
51 |
52 | Please refer to the instructional Jupyter notebooks provided under the `examples/` folder. These illustrate all 24 **dsldPy** functions.
53 |
54 | Jupyter notebooks are available in this repository:
55 |
56 | - `inst/examples/graphical.ipynb`
57 | - `inst/examples/tabular.ipynb`
58 | - `inst/examples/machine_learning.ipynb`
59 |
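For orientation, a minimal call sequence looks like the sketch below; the data file path is illustrative, and the notebooks above contain the complete, tested workflows:

```python
from dsldPy import read_data, dsldPyLinear, dsldPyLinearSummary

# load a dataset; read_data() accepts .csv, .RData and .rda files
# (the path below is illustrative)
data = read_data("svcensus.RData")

# fit dsld's linear model, comparing wage income across gender levels
lin = dsldPyLinear(data, "wageinc", "gender")
dsldPyLinearSummary(lin)   # prints the R summary of the fitted model
```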
60 | ## Available Wrappers
61 |
62 | - Analytical: `dsldPyLinear`, `dsldPyLogit`, `dsldPyML`, `dsldPyMatchedATE`, `dsldPyTakeALookAround`, `dsldPyConfounders`, `dsldPyCHunting`, `dsldPyOHunting`
63 |
64 | - Fair ML: `dsldPyFrrm`, `dsldPyFgrrm`, `dsldPyNclm`, `dsldPyZlm`, `dsldPyZlrm`, `dsldPyQeFairKNN`, `dsldPyQeFairRF`, `dsldPyQeFairRidgeLin`, `dsldPyQeFairRidgeLog`, `dsldPyFairUtils`
65 |
66 | - Graphical: `dsldPyFreqPCoord`, `dsldPyScatterPlot3D`, `dsldPyConditDisparity`, `dsldPyDensitybyS`, `dsldPyFrequencybyS`, `dsldPyIamb`
67 |
68 | Function names mirror the R package. Arguments use standard Python types (pandas.DataFrame, dict, bool, etc.) with the same call forms as the R functions.
69 |
70 | ## Troubleshooting
71 |
72 | - rpy2 cannot find R: confirm `R RHOME` works; if not, add R to PATH or set `R_HOME`. See the rpy2 docs for your OS.
73 | - dsld not installed in R: run `install.packages("dsld")` in an R session.
74 |
75 | ## Authors
76 |
77 | - Norm Matloff
78 | - Aditya Mittal
79 | - Taha Abdullah
80 | - Arjun Ashok
81 | - Shubhada Martha
82 | - Billy Ouattara
83 | - Jonathan Tran
84 | - Brandon Zarate
85 |
86 | For issues, contact **Aditya Mittal** at mittalaa@uci.edu
--------------------------------------------------------------------------------
/man/dsldLinear.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldLinear}
2 | \alias{dsldLinear}
3 | \alias{predict.dsldLM}
4 | \alias{coef.dsldLM}
5 | \alias{vcov.dsldLM}
6 | \alias{summary.dsldLM}
7 | \title{dsldLinear}
8 |
9 | \description{
10 | Comparison of sensitive groups via linear models, with or
11 | without interactions with the sensitive variable.
12 | }
13 |
14 | \usage{
15 | dsldLinear(data, yName, sName, interactions = FALSE, sComparisonPts = NULL,
16 | useSandwich = FALSE)
17 | \method{summary}{dsldLM}(object,...)
18 | \method{predict}{dsldLM}(object,xNew,...)
19 | \method{coef}{dsldLM}(object,...)
20 | \method{vcov}{dsldLM}(object,...)
21 | }
22 |
23 | \arguments{
24 | \item{data}{
25 | Data frame.
26 | }
27 | \item{yName}{
28 | Name of the response variable Y column.
29 | }
30 | \item{sName}{
31 | Name of the sensitive attribute S column.
32 | }
33 | \item{interactions}{
34 | Logical value indicating whether or not to model interactions with the
35 | sensitive variable S.
36 | }
37 | \item{sComparisonPts}{
38 | If \code{interactions} is TRUE, a data frame of new
39 | cases for which mean Y | X will be compared across
40 | each pair of S levels. Must be in the same
41 | format as original data.
42 | }
43 | \item{useSandwich}{
44 | If TRUE, use the "sandwich" variance estimator.
45 | }
46 | \item{object}{
47 | An object returned by the \code{dsldLinear} function.
48 | }
49 | \item{xNew}{
50 | New data to be predicted. Must be in the same format as original data.
51 | }
52 | \item{...}{
53 | Further arguments.
54 | }
55 | }
56 |
57 | \author{
58 | N. Matloff, A. Mittal, A. Ashok
59 | }
60 |
61 | \examples{
62 | data(svcensus)
63 |
64 | ### interactions case - exclude S and Y in newData
65 | newData <- svcensus[c(1, 18), -c(4,6)]
66 | lin1 <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = TRUE,
67 | newData)
68 |
69 | # extract results
70 | coef(lin1)
71 | vcov(lin1)
72 | summary(lin1)
73 |
74 | # predict on newData --- one prediction for each level of S per row
75 | predict(lin1, newData)
76 |
77 | ### no interactions case - exclude Y in newData
78 | newData <- svcensus[c(1, 18), -c(4)]
79 | lin2 <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = FALSE)
80 | summary(lin2)
81 |
82 | # predict on newData --- one prediction per row
83 | predict(lin2, newData)
84 | }
85 |
86 | \details{
87 |
88 | The \code{dsldLinear} function fits a linear model to the response
89 | variable Y using all other variables in \code{data}. The user may
90 | select for interactions with the sensitive variable S.
91 |
92 | The function produces an instance of the \code{dsldLM} class (an S3
93 | object). Methods for the generic functions \code{summary}, \code{coef},
94 | \code{vcov} and \code{predict} are provided.
95 |
96 | If \code{interactions} is TRUE, the function will fit m separate
97 | models, where m is the number of levels of S. Then \code{summary}
98 | will contain m+1 data frames; the first m of which will be the
99 | outputs from the individual models.
100 |
101 | The m+1st data frame will compare the differences
102 | in conditional mean Y|X for each pair of S levels, and for each
103 | value of X in \code{sComparisonPts}.
104 | The intention is to allow users to see the comparisons
105 | of conditions for sensitive groups via linear models, with
106 | interactions with S.
107 |
108 | The \code{dsldDiffSLin} function allows users to compare mean Y at
109 | new, unseen X values, between each pair of S levels,
110 | using the model fitted by \code{dsldLinear}.
111 |
112 | }
113 |
114 | \value{
115 |
116 | The \code{dsldLinear} function returns an S3 object of class 'dsldLM',
117 | with one component for each level of S. Each component includes
118 | information about the fitted model.
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/man/dsldFreqPCoord.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFreqPCoord}
2 | \alias{dsldFreqPCoord}
3 |
4 | \title{dsldFreqPCoord}
5 |
6 | \description{
7 | Wrapper for the \code{freqparcoord} function from the \pkg{freqparcoord}
8 | package.
9 | }
10 | \usage{
11 | dsldFreqPCoord(data, m, sName = NULL, method
12 | = "maxdens", faceting = "vert", k = 50, klm = 5 * k, keepidxs = NULL,
13 | plotidxs = FALSE, cls = NULL, plot_filename = NULL)
14 | }
15 |
16 | \arguments{
17 | \item{data}{
18 | Data frame or matrix.
19 | }
20 | \item{m}{
21 | Number of lines to plot for each group. A negative value in conjunction
22 | with the method \code{maxdens} indicates that the
23 | lowest-density lines are to be plotted. If method is \code{locmax},
24 | then \code{m} is forced to 1.
25 | }
26 | \item{sName}{
27 | Column for the grouping variable, if any (if none, all the data
28 | is treated as a single group); the column must be a vector or factor.
29 | The column must not be among the displayed columns. If
30 | method is \code{locmax}, \code{sName} is forced to NULL.
31 | }
32 | \item{method}{
33 | What to display: 'maxdens' for plotting the most
34 | (or least) typical lines, 'locmax' for cluster hunting, or
35 | 'randsamp' for plotting a random sample of lines.
36 | }
37 | \item{faceting}{
38 | How to display groups, if present. Use 'vert' for
39 | vertical stacking of group plots, 'horiz' for horizontal ones, or
40 | 'none' to draw all lines in one plot, color-coding by group.
41 | }
42 | \item{k}{
43 | Number of nearest neighbors to use for density estimation.
44 | }
45 | \item{klm}{
46 | If method is "locmax", number of nearest neighbors to
47 | use for finding local maxima for cluster hunting. Generally needs
48 | to be much larger than \code{k}, to avoid "noise fitting."
49 | }
50 | \item{keepidxs}{
51 | If not NULL, the indices of the rows of \code{data} that
52 | are plotted will be stored in a component \code{idxs} of the
53 | return value. The rows themselves will be in a component
54 | \code{xdisp}, ordered by the first displayed column.
55 | }
56 | \item{plotidxs}{
57 | If TRUE, lines in the display will be annotated
58 | with their case numbers, i.e. their row numbers within \code{data}.
59 | Use only with small values of \code{m}, as overplotting may occur.
60 | }
61 | \item{cls}{
62 | Cluster, if any (see the \code{parallel} package), for
63 | parallel computation.
64 | }
65 | \item{plot_filename}{
66 | Name of the file that will hold the saved graph image.
67 | If NULL, the graph will be generated and displayed without being saved.
68 |
69 | If a filename is provided, the graph will not be displayed, only
70 | saved.
71 | }
72 | }
73 |
74 | \details{
75 | The \code{dsldFreqPCoord} function wraps \code{freqparcoord},
76 | which uses a frequency-based parallel coordinates method to
77 | visualize multiple variables simultaneously in graph form.
78 |
79 | This is done by plotting either the "most typical" or "least typical"
80 | (i.e. highest or lowest estimated multivariate density values, respectively)
81 | cases to discern relations between variables.
82 |
83 | The Y-axis represents the centered and scaled values of the columns.
84 | }
85 |
86 | \value{
87 | Object of type 'gg' (\pkg{ggplot2} object), with components \code{idxs}
88 | and \code{xdisp} added if \code{keepidxs} is not NULL (see argument
89 | \code{keepidxs} above).
90 | }
91 |
92 | \references{
93 | https://cran.r-project.org/web/packages/freqparcoord/index.html
94 | }
95 | \author{
96 | N. Matloff, T. Abdullah, B. Ouattara, J. Tran, B. Zarate
97 | }
98 |
99 | \examples{
100 | data(lsa)
101 | lsa1 <- lsa[,c('fam_inc','ugpa','gender','lsat','race1')]
102 | dsldFreqPCoord(lsa1,75,'race1')
103 | # a number of interesting trends among the most "typical" law students in the
104 | # dataset: remarkably little variation among typical
105 | # African-Americans; typical Hispanic men have low GPAs and poor LSAT
106 | # scores, with more variation; typical Asian and Black students were
107 | # female; Asians and Hispanics have the most variation in family income
108 | # background
109 | }
110 |
111 |
--------------------------------------------------------------------------------
/man/dsldFairML.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldFairML Wrappers}
2 | \alias{dsldFrrm}
3 | \alias{dsldFgrrm}
4 | \alias{dsldNclm}
5 | \alias{dsldZlm}
6 | \alias{dsldZlrm}
7 | \alias{predict.dsldFairML}
8 | \alias{summary.dsldFairML}
9 |
10 | \title{dsldFairML Wrappers}
11 |
12 | \description{
13 | Fair machine learning models: estimation and prediction. The following
14 | functions provide wrappers for some functions in the \pkg{fairml}
15 | package.
16 | }
17 | \usage{
18 | dsldFrrm(data, yName, sName, unfairness, definition = "sp-komiyama",
19 | lambda = 0, save.auxiliary = FALSE)
20 | dsldFgrrm(data, yName, sName, unfairness, definition = "sp-komiyama",
21 | family = "binomial", lambda = 0, save.auxiliary = FALSE, yesYVal)
22 | dsldNclm(data, yName, sName, unfairness, covfun = cov, lambda = 0,
23 | save.auxiliary = FALSE)
24 | dsldZlm(data, yName, sName, unfairness)
25 | dsldZlrm(data, yName, sName, unfairness, yesYVal)
26 | }
27 |
28 | \arguments{
29 | \item{data}{
30 | Data frame.
31 | }
32 | \item{yName}{
33 | Name of the response variable column.
34 | }
35 | \item{sName}{
36 | Name(s) of the sensitive attribute column(s).
37 | }
38 | \item{unfairness}{
39 | A number in (0, 1]. Degree of unfairness allowed in
40 | the model. A value (very near) 0 means the model is completely
41 | fair, while a value of 1 means the model is not
42 | constrained to be fair at all.
43 | }
44 | \item{covfun}{
45 | A function computing covariance matrices.
46 | }
47 | \item{definition}{
48 | Character string, the label of the definition of fairness.
49 | Currently either 'sp-komiyama', 'eo-komiyama' or 'if-berk' (statistical parity, equality of opportunity, and individual fairness, respectively; see the \pkg{fairml} documentation).
50 | }
51 | \item{family}{
52 | A character string, either 'gaussian' to fit linear regression,
53 | 'binomial' for logistic regression, 'poisson' for
54 | log-linear regression, 'cox' for Cox proportional
55 | hazards regression, or 'multinomial' for
56 | multinomial logistic regression.
57 | }
58 | \item{lambda}{
59 | Non-negative number, a ridge-regression penalty coefficient.
60 | }
61 | \item{save.auxiliary}{
62 | A logical value, whether to save the fitted values and the residuals
63 | of the auxiliary model that constructs the debiased predictors.
64 | }
65 | \item{yesYVal}{
66 | Y value to be considered 'yes', to be coded 1 rather than 0.
67 | }
68 | }
69 |
70 | \details{
71 |
72 | See documentation for the \pkg{fairml} package.
73 |
74 | The DSLD package extends functionality by providing both accuracy
75 | (MAPE or misclassification rate) and fairness (correlation) on the
76 | training set when fitting the model.
77 |
78 | }
79 |
80 | \value{
81 | An object of class 'dsldFairML', which includes the model
82 | information, \code{yName}, \code{sName}, and model training details.
83 | }
84 |
85 | \author{
86 | A. Mittal, S. Martha, B. Ouattara, B. Zarate, J. Tran
87 | }
88 |
89 | \examples{
90 | \donttest{
91 | # regression example
92 | data(svcensus)
93 |
94 | # test/train splits
95 | n <- nrow(svcensus)
96 | train_idx <- sample(seq_len(n), size = 0.7 * n)
97 | train <- svcensus[train_idx, ]
98 | test <- svcensus[-train_idx, -4]
99 | test_y <- svcensus[-train_idx, 4]
100 |
101 | # train frrm model // also works with nclm and zlm
102 | frrmOut <- dsldFrrm(data = train, yName = 'wageinc', sName = 'gender',
103 | unfairness = 0.2, definition = "sp-komiyama")
104 |
105 | # training results
106 | summary(frrmOut)
107 | frrmOut$trainCorrs
108 | frrmOut$trainAcc
109 |
110 | # testing results
111 | res <- predict(frrmOut, test)
112 | res$correlations
113 | mean(abs(res$preds - test_y))
114 |
115 | # also works with dsldNclm, dsldZlm
116 |
117 | # classification example
118 | data(compas1)
119 |
120 | # test/train splits
121 | n <- nrow(compas1)
122 | train_idx <- sample(seq_len(n), size = 0.7 * n)
123 | train <- compas1[train_idx, ]
124 | test <- compas1[-train_idx, -8]
125 | test_y <- compas1[-train_idx, 8]
126 | test_y <- as.factor(as.integer(test_y == 'Yes'))
127 |
128 | # train fgrrm model // also works with zlrm
129 | fgrrmOut <- dsldFgrrm(train, yName = "two_year_recid",
130 | sName = "age", unfairness = 0.05,
131 | definition = "sp-komiyama",
132 | yesYVal = 'Yes')
133 | # training results
134 | summary(fgrrmOut)
135 | fgrrmOut$trainCorrs
136 | fgrrmOut$trainAcc
137 |
138 | # testing results
139 | res <- predict(fgrrmOut, test)
140 | res$correlations
141 | mean(test_y != round(res$preds))
142 |
143 | # also works with dsldZlrm
144 | }
145 |
146 | }
147 |
--------------------------------------------------------------------------------
/R/dsldConditDisparity.R:
--------------------------------------------------------------------------------
1 |
2 | # arguments
3 |
4 | # data: input data frame or equivalent
5 | # yName: response variable
6 | # sName: sensitive variable (R factor)
7 | # xName: horizontal axis variable (numeric)
8 | # condits: a vector of conditions, expressed in terms of
9 | #    names(data); if NULL, a trivial always-true condition is used
10 | # qeFtn: qeML predictive function
11 | # minS: if 'data' has fewer than this many rows for a given S level,
12 | #    don't use that level
13 | # useLoess: if TRUE, use loess smoothing
14 |
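# illustrative call (not run); compas1 ships with the package, and the
# condition string below is just one plausible setup (qeKNN is the default):
#    dsldConditDisparity(compas1, 'two_year_recid', 'race', 'age',
#       condits = 'decile_score <= 8', qeFtn = qeKNN)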
15 | dsldConditDisparity <- function(data, yName, sName, xName, condits = NULL,
16 |                                 qeFtn = qeKNN, minS = 50, useLoess = TRUE)
17 | {
18 |    getSuggestedLib('qeML')
19 |
20 |    # args type checking
21 |    if (!is.data.frame(data)) {
22 |       stop("data must be a dataframe or equivalent")
23 |    }
24 |
25 |    y <- data[[yName]]
26 |
27 |    dichotY <- inherits(y, "factor") && length(levels(y)) == 2
28 |
29 |    if (!inherits(y, "numeric") &&
30 |        !inherits(y, "integer") &&
31 |        !dichotY
32 |    ) {
33 |       stop("yName must refer to a numeric or 2-level factor column in data.")
34 |    }
35 |    if (!is.factor(data[[sName]])) {
36 |       stop("sName must refer to a factor column in data.")
37 |    }
38 |    if (!is.numeric(data[[xName]])) {
39 |       stop("xName must refer to a numeric column in data.")
40 |    }
41 |
42 |    # data engineering #
43 |    # restrict data to fit conditions
44 |    if (is.null(condits)) condits <- '1 > 0'
45 |    if (length(condits) > 1) {
46 |       # combine conditions
47 |       condits <- paste(condits, collapse = " & ")
48 |    }
49 |    restrictions <- sprintf("focusedData <- subset(data, %s)", condits)
50 |    eval(parse(text = restrictions))
51 |    focusedData <- focusedData[c(yName, xName, sName)]
52 |    sCol <- which(names(focusedData) == sName)
53 |
54 |    # group the data by S level & execute min size condition
55 |    s <- focusedData[[sName]]
56 |    groupByS <- split(focusedData, s)
57 |    sizes <- sapply(groupByS, nrow)
58 |    tiny <- which(sizes < minS)
59 |
60 |    # remove too-small groups
61 |    if (length(tiny) > 0) {
62 |       groupByS <- groupByS[-tiny]
63 |    }
64 |
65 |    # consider only the remaining S-levels
66 |    sLevels <- names(groupByS)
67 |    remainingS <- length(sLevels)
68 |
69 |
70 |    # prepare to plot each sensitive level against X; in this loop, fit
71 |    # the models, and then plot in the following loop
72 |    curXDataList <- list()
73 |    predsList <- list()
74 |    for (i in 1:remainingS) {
75 |
76 |       # setup data for training
77 |       curData <- groupByS[[i]][,-sCol]  # current s-level w/o sensitive column
78 |       curXData <- unique(curData[[xName]])  # only the numeric x column
79 |       curXDF <- as.data.frame(curXData)
80 |       names(curXDF) <- xName  # adjust column name
81 |
82 |       # fit ML model
83 |       model <- qeFtn(curData, yName, holdout = NULL)
84 |       preds <- predict(model, curXDF)
85 |       if (dichotY) preds <- preds$probs
86 |
87 |       # sort data so that lines() will make sense
88 |       curXData <- as.vector(curXData)
89 |       preds <- as.vector(preds)
90 |       orderedXData <- order(curXData)
91 |       curXData <- curXData[orderedXData]
92 |       preds <- preds[orderedXData]
93 |
94 |       # store dataframe w/ sorted data for plotting
95 |       # check Loess
96 |       plotdf <- data.frame(curXData, preds)
97 |       if (useLoess) {
98 |          preds <- loess(preds ~ curXData, plotdf)$fitted  # loess smoothing
99 |       }
100 |
101 |       # these 2 will be used in call to lines()
102 |       curXDataList[[i]] <- curXData
103 |       predsList[[i]] <- preds
104 |    }
105 |
106 |    # create plot
107 |    colors <- rainbow(remainingS)
108 |    predsMax <- max(sapply(predsList, max))
109 |    predsMin <- min(sapply(predsList, min))
110 |    ylow <- if (predsMin >= 0) 0.9 * predsMin else 1.1 * predsMin
111 |    yhigh <- if (predsMax >= 0) 1.1 * predsMax else 0.9 * predsMax
112 |    currXMax <- max(sapply(curXDataList, max))
113 |    currXMin <- min(sapply(curXDataList, min))
114 |
115 |    plot(
116 |       NULL,
117 |       ylim = c(ylow, yhigh),
118 |       xlim = c(currXMin, currXMax),
119 |       xlab = xName,
120 |       ylab = yName,
121 |       main = paste("Underlying Effects of ", sName, " on ",
122 |          yName, " wrt ", xName)
123 |    )
124 |
125 |    for (i in 1:remainingS) {
126 |       lines(
127 |          curXDataList[[i]],
128 |          predsList[[i]],
129 |          type = "l",
130 |          lty =
"solid", 131 | col = colors[i] 132 | ) 133 | } 134 | 135 | legend( 136 | x = "topright", 137 | lty = rep(1, remainingS), 138 | text.font = 4, 139 | col = colors, 140 | text.col = "black", 141 | legend = sLevels 142 | ) 143 | } 144 | 145 | -------------------------------------------------------------------------------- /R/dsldDensitybyS.R: -------------------------------------------------------------------------------- 1 | dsldDensityByS <- function(data, cName, sName, graphType = "plotly", fill = FALSE) { 2 | if (!class(data[, sName]) %in% c("factor", "character")) 3 | stop(paste("sName should be of factor or character data type. Consider setting this as a cName instead")) 4 | 5 | if (tolower(graphType) == "plot") 6 | plotDensity(data, cName, sName, fill) 7 | else if (tolower(graphType) == "plotly") 8 | plotlyDensity(data, cName, sName) 9 | } 10 | 11 | # ---- test ---- 12 | # library(dsld) 13 | # data(svcensus) 14 | # dsld::dsldDensityByS(svcensus, "wageinc", "educ") 15 | 16 | # non interactable version of density graph 17 | plotDensity <- function(data, cName, sName, fill) { 18 | getSuggestedLib('ggplot2') 19 | 20 | # the string of the columns to use for labels 21 | cNameStr <- names(data[cName]) 22 | sNameStr <- names(data[sName]) 23 | 24 | sGroups <- levels(unique(data[, sName])) 25 | for (i in 1:length(sGroups)) { 26 | den <- density(data[data[, sName] == sGroups[i], ][, cName]) 27 | 28 | if (i == 1) 29 | plot(den, col = i, xlab = cNameStr, main = paste("Density of", cNameStr, "by", sNameStr)) 30 | else 31 | lines(den, col = i) 32 | 33 | if (fill) polygon(den, col = i) 34 | } 35 | 36 | legend("topright", title = sNameStr, legend = sGroups, col = 1:length(sGroups), lty = 1) 37 | } 38 | 39 | # interactable plotly version 40 | plotlyDensity <- function(data, cName, sName) { 41 | getSuggestedLib('plotly') 42 | 43 | # the strategy for allowing a slider to control for density 44 | # is plot one graph for each possible bandwidth on the slider. 45 | # the slider will select one graph to be visible at a time 46 | 47 | numGroups <- length(levels(unique(data[, sName]))) 48 | # the string of the columns to use for labels 49 | cNameStr <- names(data[cName]) 50 | sNameStr <- names(data[sName]) 51 | 52 | bw <- seq(.25, 4, .25) # a vector of all the bandwidths we're using 53 | 54 | # aval <- a list of the arguments of all the lines we're going to graph 55 | aval <- list() 56 | for (i in 1:length(bw)) { 57 | # from plotly: creating a single group-separated density dataframe object to graph 58 | dens <- with(data, 59 | tapply(data[, cName], INDEX = data[, sName], density, adjust = bw[i])) 60 | df <- data.frame( 61 | x = unlist(lapply(dens, "[[", "x")), 62 | y = unlist(lapply(dens, "[[", "y")), 63 | group = rep(names(dens), each = length(dens[[1]]$x)) 64 | ) 65 | # all graphs are invisible by default 66 | aval[[i]] <- list(visible = FALSE, x = df$x, y = df$y) 67 | } 68 | # the default (notch 4 on the slider) is visible 69 | aval[[4]]$visible = TRUE 70 | 71 | # initial plot 72 | fig <- plotly::plot_ly(type = 'scatter', mode = 'lines', color = df$group) 73 | 74 | # each step changes the visible argument of each graph on the plot. 75 | steps <- list() 76 | # for every bandwith on the slider, add the different density graphs to the plot. 77 | for (i in 1:length(bw)) { 78 | fig <- plotly::add_lines(fig, x = aval[[i]]$x, y = aval[[i]]$y, 79 | visible = aval[[i]]$visible) 80 | # if there are 3 groups in sName, and there are 8 bandwidths, there 81 | # are 24 graphs. 
82 |       # we need to initially set all graphs' visibility to false
83 |       step <- list(
84 |          args = list('visible', rep(FALSE, length(aval) * numGroups)),
85 |          method = 'restyle', label = bw[i]
86 |       )
87 |       # and then set the corresponding graphs (1 for each level of sName,
88 |       # with the same bandwidth) to true
89 |       step$args[[2]][1:numGroups + numGroups * i] <- TRUE
90 |       steps[[i]] <- step
91 |    }
92 |    # buttons to select fill or no fill, by changing the fill argument
93 |    # of the plot we're graphing
94 |    buttons <- list(
95 |       list(
96 |          method = "restyle",
97 |          args = list("fill", "none"),
98 |          label = "no fill"
99 |       ),
100 |       list(
101 |          method = "restyle",
102 |          args = list("fill", "tozeroy"),
103 |          label = "fill"
104 |       )
105 |    )
106 |    # updatemenus is the button for fill/no fill
107 |    # sliders is the density slider
108 |    fig <- plotly::layout(fig,
109 |       updatemenus = list(list(
110 |          active = 0,
111 |          x = 0,
112 |          y = 1,
113 |          buttons = buttons
114 |       )),
115 |       sliders = list(list(
116 |          active = 3,
117 |          currentvalue = list(prefix = "Adjust: "),
118 |          steps = steps
119 |       )),
120 |       title = paste("Density of", cNameStr, "by", sNameStr),
121 |       xaxis = list(title = cNameStr),
122 |       yaxis = list(title = "Density"),
123 |       legend = list(title = list(text = sNameStr))
124 |    )
125 |    fig
126 | }
--------------------------------------------------------------------------------
/R/dsldTakeALookAround.R:
--------------------------------------------------------------------------------
1 | ### -------------------------- dsldTakeALookAround ---------------------------
2 | dsldTakeALookAround <- function(data, yName, sName,
3 |                                 maxFeatureSetSize = (ncol(data) - 2),
4 |                                 holdout = floor(min(1000, 0.1 * nrow(data)))) {
5 |    # load libraries
6 |    getSuggestedLib("qeML")
7 |
8 |    # args checking #
9 |    if (maxFeatureSetSize > (ncol(data) - 2)) {
10 |       stop("maxFeatureSetSize too large!")  # error on invalid size
11 |    }
12 |
13 |    if (!is.data.frame(data)) {
14 |       stop("data must be a dataframe or equivalent")  # error on types
15 |    }
16 |
17 |    # subset dataset to remove sName and yName
18 |    max_features_data <- data[, !names(data) %in% c(yName, sName)]
19 |
20 |    # get names of feature set
21 |    feature_names <- colnames(max_features_data)
22 |
23 |    # initialize empty vectors to populate with test accuracy scores
24 |    col_names <- c()
25 |    MSE_Y <- c()
26 |    MSE_YS <- c()
27 |    MSE_S <- c()
28 |
29 |    # run for loop to get all possible combinations of features up to maxFeatureSetSize
30 |    for (i in 1:maxFeatureSetSize) {
31 |       # create combination matrix containing i-features
32 |       combination_matrix <- combn(feature_names, i)
33 |
34 |       # run second for loop across each column of the combination_matrix
35 |       for (j in 1:dim(combination_matrix)[2]) {
36 |          # create vector of feature set names across each run - compute 1.
37 |          current_features <- combination_matrix[,j]  # get feature names on the jth loop
38 |          names <- toString(current_features)  # convert to string
39 |          names <- gsub(" ","",names)  # remove spaces between the characters
40 |          col_names <- c(col_names, names)  # append feature names string into vector
41 |
42 |          # create dataframes to compute test accuracies
43 |          feature_data_Y <- data[,c(current_features, yName)]  # dataframe with feature set and Y
44 |          feature_data_Y_S <- data[,c(current_features, yName,sName)]  # dataframe with feature set, S and Y
45 |          feature_data_S <- data[,c(current_features, sName)]  # dataframe with feature set and S
46 |
47 |          # get part 2. and 3.
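         # (the numbered steps: 1 = record the feature-set label; 2 = accuracy
         # predicting Y from the features alone, column 'a'; 3 = accuracy
         # predicting Y from the features plus S, column 'b'; 4 = accuracy
         # predicting S from the features, column 'c')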
48 |          # check whether Y is continuous
49 |          if (is.numeric(data[[yName]])) {
50 |             a <- qeLin(feature_data_Y, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set
51 |             MSE_Y <- c(MSE_Y, a)  # append test accuracy into vector
52 |
53 |             b <- qeLin(feature_data_Y_S, yName, holdout)$testAcc  # get prediction accuracy for Y of the feature set PLUS S
54 |             MSE_YS <- c(MSE_YS, b)
55 |          }
56 |          # Y is discrete
57 |          else {
58 |             a <- qeLogit(feature_data_Y, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set
59 |             MSE_Y <- c(MSE_Y, a)
60 |
61 |             b <- qeLogit(feature_data_Y_S, yName, holdout)$testAcc  # get prediction accuracy for Y of this feature set PLUS S
62 |             MSE_YS <- c(MSE_YS, b)
63 |          }
64 |
65 |          # get 4.
66 |          # check whether sName is continuous
67 |          if (is.numeric(data[[sName]])) {
68 |             c <- qeLin(feature_data_S, sName, holdout)$testAcc  # get prediction accuracy of S from the feature set
69 |             MSE_S <- c(MSE_S, c)
70 |          }
71 |          # if sName is discrete
72 |          else {
73 |             c <- qeLogit(feature_data_S, sName, holdout)$testAcc  # get prediction accuracy of S from the feature set
74 |             MSE_S <- c(MSE_S, c)
75 |          }
76 |       }
77 |    }
78 |
79 |    # create dataframe
80 |    df <- data.frame(col_names, MSE_Y, MSE_YS, MSE_S)
81 |    colnames(df)[1] <- "Feature Names"
82 |    colnames(df)[2] <- "a"
83 |    colnames(df)[3] <- "b"
84 |    colnames(df)[4] <- "c"
85 |    return(df)
86 | }
87 |
88 | # Test runs
89 | # Example 1: We investigate the predictive accuracy for a continuous Y, 'wageinc', using maxFeatureSetSize = 4
90 | # data(svcensus)
91 | # dsldTakeALookAround(svcensus, 'wageinc', 'gender', 4)
92 |
93 | # Example 2: We investigate the predictive accuracy for a categorical Y, 'educ', using the default maxFeatureSetSize
94 | # data(svcensus)
95 | # dsldTakeALookAround(svcensus, 'educ', 'occ')
96 |
97 | # Example 3: We investigate the predictive accuracy for a continuous Y, 'wageinc', using maxFeatureSetSize = 1
98 | # data(svcensus)
99 | # dsldTakeALookAround(svcensus, 'wageinc', 'gender', 1)
100 |
--------------------------------------------------------------------------------
/R/dsldScatterPlot3D.R:
--------------------------------------------------------------------------------
1 | dsldScatterPlot3D <- function(data, yNames, sName, sGroups = NULL,
2 |                               sortedBy = "Name", numGroups = 8,
3 |                               maxPoints = NULL, xlim = NULL, ylim = NULL,
4 |                               zlim = NULL, main = NULL, colors = "Paired",
5 |                               opacity = 1, pointSize = 8) {
6 |    # environment setup
7 |    getSuggestedLib("plotly")
8 |
9 |    # limit amount of data points
10 |    if (!is.null(maxPoints)) {
11 |       data <- data[1:maxPoints, ]
12 |    }
13 |
14 |    # args type-checking
15 |    if (!class(data[, sName]) %in% c("factor", "character"))
16 |       stop(
17 |          "sName should be of factor or character data type.
18 |          Consider setting this as yName instead"
19 |       )
20 |
21 |    # check 3D plot compatibility
22 |    if (length(yNames) != 3) {
23 |       stop("dsldScatterPlot3D requires 3 variables, one for each of the 3 axes")
24 |    }
25 |
26 |    # sGroups <- a vector of the individual group names in the 'data'.
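   # (e.g. with svcensus and sName = 'gender', sGroups would default to the
   # levels of that factor, such as 'female' and 'male')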
27 |    # the user can supply sGroups as a vector of names they want to look at
28 |    if (is.null(sGroups)) {
29 |       sGroups <- makeSGroups(data, sName, numGroups, sortedBy)
30 |    }
31 |
32 |    # limits dataset to include only rows whose group is in sGroups
33 |    data <- data[data[, sName] %in% sGroups, ]
34 |    data <- droplevels(data)
35 |
36 |    # limit values of data points
37 |    if (!is.null(xlim) | !is.null(ylim) | !is.null(zlim))
38 |       data <- limitRange(data, yNames, xlim, ylim, zlim)
39 |
40 |    # creates a title
41 |    if (is.null(main)) {
42 |       for (yName in names(data[yNames])) {
43 |          main <- paste(main, yName)
44 |       }
45 |
46 |       main <- paste(main, " by ", names(data[sName]))
47 |    }
48 |
49 |    # save this to print to the text of each point
50 |    original <- data
51 |
52 |    # numeric for a cleaner looking graph if the axis is factor type
53 |    data[, yNames] <- sapply(data[, yNames], as.numeric)
54 |
55 |    # info card for each data point
56 |    text <- paste("", sep = "")
57 |    for (i in 1:length(data)) {
58 |       text <- paste(
59 |          text,
60 |          names(data[i]),
61 |          ": ",
62 |          original[, i],
63 |          "<br>",
64 |          sep = ""
65 |       )
66 |    }
67 |
68 |    # plotting the points
69 |    fig <- plotly::plot_ly(
70 |       data,
71 |       x = data[, yNames[1]],
72 |       y = data[, yNames[2]],
73 |       z = data[, yNames[3]],
74 |       color = data[, sName],
75 |       colors = colors,
76 |       hovertemplate = text,
77 |       marker = list(
78 |          size = pointSize,
79 |          opacity = opacity
80 |       )
81 |    )
82 |
83 |    fig <- plotly::add_markers(fig)
84 |
85 |    # add labels and axes
86 |    fig <- plotly::layout(
87 |       fig,
88 |       title = main,
89 |       scene = list(
90 |          xaxis = list(title = paste(names(data[yNames[1]]), "(X)")),
91 |          yaxis = list(title = paste(names(data[yNames[2]]), "(Y)")),
92 |          zaxis = list(title = paste(names(data[yNames[3]]), "(Z)")),
93 |          legend = list(title = list(text = names(data[sName])))
94 |       )
95 |    )
96 |
97 |    return(fig)
98 | }
99 |
100 | # ---- Test Cases ----
101 | # library(dsld)
102 | # data(svcensus)
103 | # dsldScatterPlot3D(svcensus, yNames = c("educ", "wageinc", "occ"), sName = "gender")
104 |
105 | # Generates a list of groups that exist within a sName column of a data frame
106 | makeSGroups <- function(data, sName, numGroups = NULL, sortedBy = "Name") {
107 |    # If there are 8 possible values the group variable can take, the vector is 8 long.
108 |    # Sorted according to user
109 |    sGroups <- NULL
110 |    switch(sortedBy,
111 |       "Name" = sGroups <- levels(unique(data[, sName])),
112 |       "Frequency" = sGroups <-
113 |          names(sort(table(data[, sName]), decreasing = T)),
114 |       "Frequency-Descending" = sGroups <-
115 |          names(sort(table(data[, sName]), decreasing = F))
116 |    )
117 |
118 |    # otherwise the vector is cut off to only have numGroups number of sGroups
119 |    if (!is.null(numGroups) && length(sGroups) > numGroups) {
120 |       sGroups <- sGroups[1:numGroups]
121 |    }
122 |
123 |    return(sGroups)
124 | }
125 |
126 |
127 | # Restricts the values of a data frame to specified limits
128 | limitRange <-
129 |    function(data, yNames, xlim = NULL, ylim = NULL, zlim = NULL) {
130 |       # in case the user only gives lim as a single number
131 |       xlim <- rep(xlim, 2)
132 |       ylim <- rep(ylim, 2)
133 |       zlim <- rep(zlim, 2)
134 |       # limits the data frame
135 |       if (!is.null(xlim))
136 |          data <- data[data[, yNames[1]] >= xlim[1] & data[, yNames[1]] <= xlim[2],]
137 |       if (!is.null(ylim))
138 |          data <- data[data[, yNames[2]] >= ylim[1] & data[, yNames[2]] <= ylim[2],]
139 |       if (!is.null(zlim))
140 |          data <- data[data[, yNames[3]] >= zlim[1] & data[, yNames[3]] <= zlim[2],]
141 |
142 |       data
143 |    }
144 |
--------------------------------------------------------------------------------
/man/dsldEDFFair.Rd:
--------------------------------------------------------------------------------
1 | \name{dsldEDFFair Wrappers}
2 | \alias{dsldQeFairKNN}
3 | \alias{dsldQeFairRF}
4 | \alias{dsldQeFairRidgeLin}
5 | \alias{dsldQeFairRidgeLog}
6 | \alias{predict.dsldQeFair}
7 |
8 | \title{dsldEDFFair Wrappers}
9 |
10 | \description{
11 | Explicitly Deweighted Features: control the effect of proxies
12 | related to sensitive variables for prediction.
13 | }
14 |
15 | \usage{
16 | dsldQeFairKNN(data, yName, sNames, deweightPars = NULL,
17 | yesYVal = NULL, k = 25, scaleX = TRUE)
18 | dsldQeFairRF(data, yName, sNames, deweightPars = NULL, nTree = 500,
19 | minNodeSize = 10, mtry = floor(sqrt(ncol(data))), yesYVal = NULL)
20 | dsldQeFairRidgeLin(data, yName, sNames, deweightPars = NULL)
21 | dsldQeFairRidgeLog(data, yName, sNames, deweightPars = NULL, yesYVal)
22 | \method{predict}{dsldQeFair}(object,newx,...)
23 | }
24 |
25 | \arguments{
26 | \item{data}{
27 | Data frame, training set.
28 | }
29 | \item{yName}{
30 | Name of the response variable column.
31 | }
32 | \item{sNames}{
33 | Name(s) of the sensitive attribute column(s).
34 | }
35 | \item{deweightPars}{
36 | Values for de-emphasizing variables in a split, e.g.
37 | 'list(age=0.2,gender=0.5)'. In the linear case,
38 | larger values mean more deweighting, i.e. less influence of the given
39 | variable on predictions. For KNN and random forests, smaller
40 | values mean more deweighting.
41 | }
42 | \item{scaleX}{
43 | Scale the features. Defaults to TRUE.
44 | }
45 | \item{yesYVal}{
46 | Y value to be considered "yes," to be coded 1 rather than 0.
47 | }
48 | \item{k}{
49 | Number of nearest neighbors. In functions other than
50 | \code{dsldQeFairKNN} for which this is an argument,
51 | it is the number of neighbors to use in finding
52 | conditional probabilities via knnCalib.
53 | }
54 | \item{nTree}{
55 | Number of trees.
56 | }
57 | \item{minNodeSize}{
58 | Minimum number of data points in a tree node.
59 | }
60 | \item{mtry}{
61 | Number of variables randomly tried at each split.
62 | }
63 | \item{object}{
64 | An object returned by one of the dsld EDF-Fair wrappers.
65 | }
66 | \item{newx}{
67 | New data to be predicted. Must be in the same format as the original data.
68 | }
69 | \item{...}{
70 | Further arguments.
71 | }
72 | }
73 | 
74 | \author{
75 | N. Matloff, A. Mittal, J. Tran
76 | }
77 | 
78 | \details{
79 | 
80 | The sensitive variables S are removed entirely, but there is concern
81 | that they still affect prediction indirectly, via a set C of proxy
82 | variables.
83 | 
84 | Linear EDF reduces the impact of the proxies through a shrinkage
85 | process similar to that of ridge regression. Specifically, instead
86 | of minimizing the sum of squared errors SSE with respect to a
87 | coefficient vector b, we minimize SSE + the squared norm of Db,
88 | where D is a diagonal matrix with nonzero elements corresponding to
89 | C. Large diagonal values penalize the variables in C more heavily, thus shrinking them.
90 | 
91 | KNN EDF reduces the weights in Euclidean distance for variables in
92 | C. The random forests version reduces the probabilities that a
93 | proxy will be used in splitting a node.
94 | 
95 | By using various values of the deweighting parameters, the user can
96 | choose a desired position in the Fairness-Utility Tradeoff.
97 | 
98 | More details can be found in the references.
99 | 
100 | The DSLD package extends this functionality by also reporting accuracy
101 | (MAPE or misclassification rate) and fairness (correlation with S) on the
102 | training set during model training.
103 | }
104 | 
105 | \value{
106 | 
107 | The EDF functions return objects of class 'dsldQeFair', which include
108 | components for test and base accuracy, summaries of inputs and so on.
109 | 
110 | }
111 | 
112 | \references{
113 | https://github.com/matloff/EDFfair
114 | }
115 | 
116 | \seealso{
117 | Matloff, Norman, and Wenxi Zhang. "A novel regularization approach to fair ML." \cr
118 | \code{arXiv preprint arXiv:2208.06557} (2022).
119 | }
120 | 
121 | \examples{
122 | \donttest{
123 | # regression example
124 | data(svcensus)
125 | 
126 | # test/train splits
127 | n <- nrow(svcensus)
128 | train_idx <- sample(seq_len(n), size = 0.7 * n)
129 | train <- svcensus[train_idx, ]
130 | test <- svcensus[-train_idx, -4]
131 | test_y <- svcensus[-train_idx, 4]
132 | 
133 | # dsldQeFairRidgeLin: deweight the "occ" and "age" columns
134 | ### also works for dsldQeFairKNN and dsldQeFairRF
135 | lin <- dsldQeFairRidgeLin(train, "wageinc", "gender", deweightPars =
136 | list(occ=.4, age=.2))
137 | 
138 | # training results
139 | lin$trainAcc
140 | lin$trainCorrs
141 | 
142 | # testing results
143 | res <- predict(lin, test)
144 | res$correlations
145 | mean(abs(res$preds - test_y))
146 | 
147 | # also works with dsldQeFairRF, dsldQeFairKNN
148 | 
149 | 
150 | # classification example
151 | data(compas1)
152 | 
153 | # test/train splits
154 | n <- nrow(compas1)
155 | train_idx <- sample(seq_len(n), size = 0.7 * n)
156 | train <- compas1[train_idx, ]
157 | test <- compas1[-train_idx, -8]
158 | test_y <- compas1[-train_idx, 8]
159 | test_y <- as.factor(as.integer(test_y == 'Yes'))
160 | 
161 | # dsldQeFairKNN: deweight the "decile_score" column, with "race" as the sensitive variable
162 | # also works for dsldQeFairRF, dsldQeFairRidgeLog
163 | knnOut <- dsldQeFairKNN(train, "two_year_recid", "race",
164 | list(decile_score=0.1), yesYVal = "Yes")
165 | 
166 | # training/testing results
167 | knnOut$trainAcc
168 | knnOut$trainCorrs
169 | res <- predict(knnOut, test)
170 | res$correlations
171 | mean(test_y != round(res$preds$probs))
172 | 
173 | # also works with dsldQeFairRF, dsldQeFairRidgeLog
174 | }
175 | 
176 | }
-------------------------------------------------------------------------------- /inst/examples/graphical.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "80ed4fdb",
6 | "metadata": {},
7 | "source": [
8 | "#### Examples for graphical methods provided by dsldPy\n",
9 | "\n",
10 | "The goal is to make each function call as simple as possible for the users. The following functions are illustrated:\n",
11 | "\n",
12 | "1. dsldPyScatterPlot3D\n",
13 | "2. dsldPyFreqPCoord\n",
14 | "3. dsldPyConditDisparity\n",
15 | "4. dsldPyBnLearn\n",
16 | "5. 
dsldConfounders / dsldDensityByS" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "f1c5d10e", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "## requires R and the dsld (R) package installed\n", 27 | "# !pip install dsldPy" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "03b32052", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# load necessary libraries\n", 38 | "from dsldPy import (\n", 39 | "# data reading and preprocessing\n", 40 | "preprocess_data, read_data,\n", 41 | "\n", 42 | "dsldPyScatterPlot3D, dsldPyFreqPCoord, dsldPyConditDisparity, dsldPyConfounders, dsldPyDensitybyS, dsldPyIamb\n", 43 | ")\n" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "953b287e", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "### data preprocessing\n", 54 | "\n", 55 | "### all dsldPy functions require a R data frame object as input (NOT pandas dataframe)\n", 56 | "### the preprocessing is done by the function preprocess_data\n", 57 | "### user needs to manually provide the categorical and numerical features (list)\n", 58 | "### the function preprocess_data returns a R data.frame object -> required input for the dsldPy functions\n", 59 | "\n", 60 | "# two datasets\n", 61 | "# svcensus data\n", 62 | "#### REPLACE WITH YOUR PATH TO svcensus.RData\n", 63 | "# df = read_data(\"\") \n", 64 | "\n", 65 | "# preprocess data\n", 66 | "cat_features = ['educ', 'occ', 'gender']\n", 67 | "num_features= ['age', 'wageinc', 'wkswrkd']\n", 68 | "svcensus = preprocess_data(df, cat_features, num_features)\n", 69 | "\n", 70 | "# compas1 data\n", 71 | "#### REPLACE WITH YOUR PATH TO compas1.RData\n", 72 | "# df = read_data(\"\")\n", 73 | "\n", 74 | "# preprocess data\n", 75 | "cat_features = [\"sex\", \"two_year_recid\", \"race\"]\n", 76 | "num_features = [\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 77 | "compas1 = preprocess_data(df, cat_features, num_features)\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "10d19951", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "### 1. ------------------------------ dsldPyScatterPlot3D ------------------------------\n", 88 | "dsldPyScatterPlot3D(data = svcensus, yNames= ['wageinc', 'wkswrkd', 'age'], sName = 'gender')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "ec5edc31", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "### 2. ------------------------------ dsldPyFreqPCoord ------------------------------\n", 99 | "dsldPyFreqPCoord(data = compas1, m = 100, sName = 'sex')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "d3c2dc1d", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "### 3. ------------------------------ dsldPyConditDisparity ------------------------------\n", 110 | "dsldPyConditDisparity(data = compas1, yName= \"two_year_recid\", sName= \"race\", xName=\"age\", condits=[\"priors_count <= 4\",\"decile_score>=6\"])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "89831a3d", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "### 4. 
------------------------------ dsldPyIamb ------------------------------\n",
121 | "dsldPyIamb(data = svcensus)"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "4e960a57",
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "### 5. ------------------------------ dsldPyConfounders/dsldPyDensitybyS ------------------------------\n",
132 | "### the plot opens in a new browser window // all other variables shown\n",
133 | "dsldPyConfounders(data = svcensus, sName='gender')"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "0af77c5a",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "### if you just want to focus on one specific plot, you can use the dsldPyDensitybyS function \n",
144 | "### the plot opens in a new browser window \n",
145 | "dsldPyDensitybyS(svcensus, cName = 'wageinc', sName='gender')\n"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "id": "2eba7dba",
152 | "metadata": {},
153 | "outputs": [],
154 | "source": []
155 | }
156 | ],
157 | "metadata": {
158 | "kernelspec": {
159 | "display_name": "dsld",
160 | "language": "python",
161 | "name": "python3"
162 | },
163 | "language_info": {
164 | "codemirror_mode": {
165 | "name": "ipython",
166 | "version": 3
167 | },
168 | "file_extension": ".py",
169 | "mimetype": "text/x-python",
170 | "name": "python",
171 | "nbconvert_exporter": "python",
172 | "pygments_lexer": "ipython3",
173 | "version": "3.12.4"
174 | }
175 | },
176 | "nbformat": 4,
177 | "nbformat_minor": 5
178 | }
179 | 
-------------------------------------------------------------------------------- /R/dsldFairML.R: --------------------------------------------------------------------------------
1 | ### dsld fairML wrappers
2 | 
3 | # base function for fairML wrappers --- they all follow the same format:
4 | # converts the data into a format that the fairml models accept
5 | # then puts the fairml model inside an object of the dsldFairML class which
6 | # has its own predict function
7 | 
8 | fairmlBase <- function(fairmlFUNC, data, yName, sName, unfairness, ...) {
9 | 
10 |   # data-prep
11 |   data <- toNumericFactor(data)
12 |   response <- data[,yName]
13 |   predictors <- data[,!colnames(data) %in% c(yName,sName)]
14 |   sensitive <- data[,sName]
15 | 
16 |   # calls a fairml model function as the base for the dsldFairML object
17 |   base <- fairmlFUNC(response = response, predictors = predictors,
18 |                      sensitive = sensitive, unfairness = unfairness, ...)
19 | 20 | # save yName and sName to use in predict() 21 | model <- list( 22 | base = base, 23 | yName = yName, 24 | sName = sName, 25 | FactorsInfo = factor_levels(data) 26 | ) 27 | 28 | class(model) <- c("dsldFairML") 29 | model 30 | } 31 | 32 | # wrapper for Frrm() 33 | dsldFrrm <- function(data, yName, sName, unfairness, 34 | definition = "sp-komiyama", lambda = 0, 35 | save.auxiliary = FALSE) { 36 | 37 | data <- toNumericFactor(data) 38 | 39 | suppressWarnings({ 40 | model = fairmlBase(fairml::frrm, data, yName, sName, unfairness, 41 | definition, lambda, save.auxiliary) 42 | }) 43 | 44 | # training preds/corrs 45 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 46 | sensitive <- data[,sName] 47 | model$trainPreds <- predict(model$base, predictors, sensitive) 48 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 49 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 50 | model 51 | } 52 | 53 | # wrapper for Fgrrm() 54 | dsldFgrrm <- function(data, yName, sName, unfairness, 55 | definition = "sp-komiyama", family = "binomial", 56 | lambda = 0, save.auxiliary = FALSE, yesYVal) { 57 | 58 | data <- toNumericFactor(data) 59 | data[[yName]] <- as.factor(as.integer(data[[yName]] == yesYVal)) 60 | 61 | suppressWarnings({ 62 | model <- fairmlBase(fairml::fgrrm, data, yName, sName, unfairness, 63 | definition, family, lambda, save.auxiliary) 64 | }) 65 | 66 | # training preds/corrs 67 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 68 | sensitive <- data[,sName] 69 | model$trainPreds <- predict(model$base, predictors, sensitive) 70 | test_y <- as.integer(data[[yName]] == 1) 71 | model$trainAcc <- mean(test_y != round(model$trainPreds)) 72 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 73 | model 74 | } 75 | 76 | # wrapper for Nclm() 77 | dsldNclm <- function(data, yName, sName, unfairness, covfun = cov, 78 | lambda = 0, save.auxiliary = FALSE) { 79 | 80 | getSuggestedLib('cccp') 81 | data <- toNumericFactor(data) 82 | 83 | suppressWarnings({ 84 | model <- fairmlBase(fairml::nclm, data, yName, sName, unfairness, covfun, 85 | lambda, save.auxiliary) 86 | }) 87 | 88 | # training preds/corrs 89 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 90 | sensitive <- data[,sName] 91 | model$trainPreds <- predict(model$base, predictors, sensitive) 92 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 93 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 94 | model 95 | } 96 | 97 | # wrapper for Zlm() 98 | dsldZlm <- function(data, yName, sName, unfairness) { 99 | 100 | getSuggestedLib('CVXR') 101 | data <- toNumericFactor(data) 102 | 103 | suppressWarnings({ 104 | model <- fairmlBase(fairml::zlm, data, yName, sName, unfairness) 105 | }) 106 | 107 | # training preds/corrs 108 | predictors <- data[,!colnames(data) %in% c(yName, sName)] 109 | sensitive <- data[,sName] 110 | model$trainPreds <- predict(model$base, predictors) 111 | model$trainAcc <- mean(abs(model$trainPreds - data[[yName]])) 112 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 113 | model 114 | } 115 | 116 | # wrapper for Zlrm() 117 | dsldZlrm <- function(data, yName, sName, unfairness, yesYVal) { 118 | 119 | getSuggestedLib('CVXR') 120 | data <- toNumericFactor(data) 121 | data[[yName]] <- as.factor(as.integer(data[[yName]] == yesYVal)) 122 | 123 | suppressWarnings({ 124 | model <- fairmlBase(fairml::zlrm, data, yName, sName, unfairness) 125 | }) 126 | 127 | # training preds/corrs 128 | predictors <- 
data[,!colnames(data) %in% c(yName, sName)] 129 | sensitive <- data[,sName] 130 | model$trainPreds <- predict(model$base, predictors) 131 | test_y <- as.integer(data[[yName]] == 1) 132 | model$trainAcc <- mean(test_y != round(model$trainPreds)) 133 | model$trainCorrs <- s_correlations(data, sName, model$trainPreds) 134 | model 135 | } 136 | 137 | ### S3 methods summary() and predict() 138 | summary.dsldFairML <- function(object,...){ 139 | summary(object$base) 140 | } 141 | 142 | predict.dsldFairML <- function(object, newx,...) { 143 | suppressWarnings({ 144 | # data-prep 145 | newx <- toNumericFactor(newx) 146 | newx <- apply_factor_levels(newx, object$FactorsInfo) 147 | 148 | yName <- object$yName 149 | sName <- object$sName 150 | predictors <- newx[,!colnames(newx) %in% c(yName, sName)] 151 | sensitive <- newx[,sName] 152 | 153 | class <- class(object$base)[1] 154 | 155 | if (class %in% c("zlm", "zlrm")) { 156 | 157 | # zlm and zlrm have one less argument for prediction 158 | preds <- predict(object$base, predictors) 159 | cors <- s_correlations(newx, sName, preds) 160 | return(list(preds = preds, correlations = cors)) 161 | 162 | } else { 163 | 164 | preds <- predict(object$base, predictors, sensitive) 165 | cors <- s_correlations(newx, sName, preds) 166 | return(list(preds = preds, correlations = cors)) 167 | 168 | } 169 | }) 170 | } 171 | -------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyFairML.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Python interface for dsldFairML functions in the dsld R package. 3 | ''' 4 | 5 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe 6 | from rpy2.robjects.packages import importr 7 | import rpy2.robjects as robjects 8 | 9 | 10 | # ================== dsldFrrm ================== 11 | 12 | def dsldPyFrrm(data, yName, sName, unfairness, definition="sp-komiyama", lamda=0, save=False): 13 | r_data = dsld_Rpy2_IsRDataframe(data) 14 | yName_r = robjects.StrVector([yName]) 15 | sName_r = robjects.StrVector([sName]) 16 | unfair_r = robjects.FloatVector([unfairness]) 17 | def_r = robjects.StrVector([definition]) 18 | lamda_r = robjects.FloatVector([lamda]) 19 | save_r = robjects.BoolVector([save]) 20 | 21 | dsld = get_dsld() 22 | model = dsld.dsldFrrm(r_data, yName_r, sName_r, unfair_r, def_r, lamda_r, save_r) 23 | 24 | preds = model.rx2("trainPreds") 25 | acc = model.rx2("trainAcc") 26 | corrs = model.rx2("trainCorrs") 27 | 28 | result = { 29 | "model": model, 30 | "train_predictions": list(preds), 31 | "train_accuracy": float(acc[0]), 32 | "train_correlations": list(zip(list(corrs.rx2("feature")), 33 | list(corrs.rx2("correlation")))) 34 | } 35 | return result 36 | 37 | # ================== dsldFgrrm ================== 38 | 39 | def dsldPyFgrrm(data, yName, sName, unfairness, definition="sp-komiyama", family="binomial", lamda=0, save=False, yesYVal = None): 40 | r_data = dsld_Rpy2_IsRDataframe(data) 41 | yName_r = robjects.StrVector([yName]) 42 | sName_r = robjects.StrVector([sName]) 43 | 44 | unfair_r = robjects.FloatVector([unfairness]) 45 | 46 | def_r = robjects.StrVector([definition]) 47 | fam_r = robjects.StrVector([family]) 48 | lamda_r = robjects.FloatVector([lamda]) 49 | save_r = robjects.BoolVector([save]) 50 | yesYVal_r = robjects.StrVector([yesYVal]) 51 | 52 | 53 | dsld = get_dsld() 54 | model = dsld.dsldFgrrm(r_data, yName_r, sName_r, unfair_r, def_r, fam_r, lamda_r, save_r, yesYVal_r) 55 | 56 | preds = model.rx2("trainPreds") 57 | acc = 
model.rx2("trainAcc") 58 | corrs = model.rx2("trainCorrs") 59 | 60 | result = { 61 | "model": model, 62 | "train_predictions": list(preds), 63 | "train_accuracy": float(acc[0]), 64 | "train_correlations": list(zip(list(corrs.rx2("feature")), 65 | list(corrs.rx2("correlation")))) 66 | } 67 | return result 68 | 69 | # ================== dsldNclm ================== 70 | 71 | def dsldPyNclm(data, yName, sName, unfairness, covfun=robjects.r('cov'), lamda=0, save=False): 72 | r_data = dsld_Rpy2_IsRDataframe(data) 73 | yName_r = robjects.StrVector([yName]) 74 | sName_r = robjects.StrVector([sName]) 75 | unfair_r = robjects.FloatVector([unfairness]) 76 | lamda_r = robjects.FloatVector([lamda]) 77 | save_r = robjects.BoolVector([save]) 78 | 79 | dsld = get_dsld() 80 | model = dsld.dsldNclm(r_data, yName_r, sName_r, unfair_r, covfun, lamda_r, save_r) 81 | 82 | preds = model.rx2("trainPreds") 83 | acc = model.rx2("trainAcc") 84 | corrs = model.rx2("trainCorrs") 85 | 86 | result = { 87 | "model": model, 88 | "train_predictions": list(preds), 89 | "train_accuracy": float(acc[0]), 90 | "train_correlations": list(zip(list(corrs.rx2("feature")), 91 | list(corrs.rx2("correlation")))) 92 | } 93 | return result 94 | 95 | # ================== dsldZlm ================== 96 | 97 | def dsldPyZlm(data, yName, sName, unfairness): 98 | r_data = dsld_Rpy2_IsRDataframe(data) 99 | yName_r = robjects.StrVector([yName]) 100 | sName_r = robjects.StrVector([sName]) 101 | unfair_r = robjects.FloatVector([unfairness]) 102 | 103 | dsld = get_dsld() 104 | model = dsld.dsldZlm(r_data, yName_r, sName_r, unfair_r) 105 | 106 | preds = model.rx2("trainPreds") 107 | acc = model.rx2("trainAcc") 108 | corrs = model.rx2("trainCorrs") 109 | 110 | result = { 111 | "model": model, 112 | "train_predictions": list(preds), 113 | "train_accuracy": float(acc[0]), 114 | "train_correlations": list(zip(list(corrs.rx2("feature")), 115 | list(corrs.rx2("correlation")))) 116 | } 117 | return result 118 | 119 | # ================== dsldZlrm ================== 120 | 121 | def dsldPyZlrm(data, yName, sName, unfairness, yesYVal): 122 | r_data = dsld_Rpy2_IsRDataframe(data) 123 | yName_r = robjects.StrVector([yName]) 124 | sName_r = robjects.StrVector([sName]) 125 | unfair_r = robjects.FloatVector([unfairness]) 126 | yesYVal_r = robjects.StrVector([yesYVal]) 127 | 128 | dsld = get_dsld() 129 | model = dsld.dsldZlrm(r_data, yName_r, sName_r, unfair_r, yesYVal_r) 130 | 131 | preds = model.rx2("trainPreds") 132 | acc = model.rx2("trainAcc") 133 | corrs = model.rx2("trainCorrs") 134 | 135 | result = { 136 | "model": model, 137 | "train_predictions": list(preds), 138 | "train_accuracy": float(acc[0]), 139 | "train_correlations": list(zip(list(corrs.rx2("feature")), 140 | list(corrs.rx2("correlation")))) 141 | } 142 | return result 143 | 144 | 145 | # predict() and summary() method for all the models 146 | 147 | def dsldPyFairML_Summary(model): 148 | print(robjects.r['summary'](model['model'])) 149 | return robjects.r['summary'](model['model']) 150 | 151 | def dsldPyFairML_Predict(model, newData): 152 | robjects.r.assign("model", model['model']) 153 | xNew = dsld_Rpy2_IsRDataframe(newData) 154 | robjects.r.assign("xNew", xNew) 155 | result = robjects.r('predict(model, xNew)') 156 | names = list(result[1][0]) 157 | vals = [float(v) for v in result[1][1]] 158 | correlations = list(zip(names, vals)) 159 | output = {'test_predictions': list(result[0]), 160 | 'test_correlations': correlations} 161 | return output 162 | 
-------------------------------------------------------------------------------- /inst/src/dsldPy/dsldPyQeFairML.py: --------------------------------------------------------------------------------
1 | from .Utils import get_dsld, dsld_Rpy2_IsRDataframe, dsld_Rpy2_RDataframeToPandas
2 | 
3 | import math
4 | 
5 | import rpy2.robjects as robjects
6 | from rpy2.robjects import conversion, default_converter, pandas2ri
7 | from rpy2.robjects.vectors import ListVector, FloatVector
8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 
17 | # add pandas converter to the default rpy2 converter
18 | converter = default_converter + pandas2ri.converter
19 | 20 | 21 | 
22 | ### dsldPyQeFairKNN-------------------------------------------------------------------
23 | def dsldPyQeFairKNN(data, yName, sNames, deweightPars=None, yesYVal=None, k=25, scaleX=True):
24 |     r_data = dsld_Rpy2_IsRDataframe(data)
25 |     yName = robjects.StrVector([yName])
26 |     sNames = robjects.StrVector([sNames])
27 | 
28 |     if deweightPars is not None:
29 |         deweightPars = ListVector({key: FloatVector([val]) for key, val in deweightPars.items()})
30 |     else:
31 |         deweightPars = robjects.NULL
32 | 
33 |     if yesYVal is not None:
34 |         yesYVal = robjects.StrVector([yesYVal])
35 |     else:
36 |         yesYVal = robjects.NULL
37 | 
38 |     k = robjects.IntVector([k])
39 | 
40 |     scaleX = robjects.BoolVector([scaleX])
41 | 
42 |     dsld = get_dsld()
43 |     model = dsld.dsldQeFairKNN(r_data, yName, sNames, deweightPars, yesYVal, k, scaleX)
44 | 
45 |     preds = model.rx2("trainPreds")[0]
46 |     acc = model.rx2("trainAcc")
47 |     corrs = model.rx2("trainCorrs")
48 | 
49 |     with conversion.localconverter(converter):
50 |         corrs_df = conversion.rpy2py(corrs)
51 | 
52 |     result = {
53 |         "model": model,
54 |         "train_predictions": list(preds),
55 |         "train_accuracy": float(acc[0]),
56 |         "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"]))
57 |     }
58 |     return result
59 | 
60 | ### dsldPyQeFairRF-------------------------------------------------------------------
61 | def dsldPyQeFairRF(data, yName, sNames, deweightPars=None, nTree=500, minNodeSize=10, mtry=None, yesYVal=None):
62 | 
63 |     temp_data = dsld_Rpy2_RDataframeToPandas(data)
64 |     if mtry is None:
65 |         mtry = math.floor(math.sqrt(temp_data.shape[1]))
66 | 
67 |     r_data = dsld_Rpy2_IsRDataframe(data)
68 |     yName = robjects.StrVector([yName])
69 |     sNames = robjects.StrVector([sNames])
70 | 
71 |     if deweightPars is not None:
72 |         deweightPars = ListVector({key: FloatVector([val]) for key, val in deweightPars.items()})
73 |     else:
74 |         deweightPars = robjects.NULL
75 | 
76 |     if yesYVal is not None:
77 |         yesYVal = robjects.StrVector([yesYVal])
78 |     else:
79 |         yesYVal = robjects.NULL
80 | 
81 |     nTree = robjects.IntVector([nTree])
82 |     minNodeSize = robjects.IntVector([minNodeSize])
83 |     mtry = robjects.IntVector([mtry])
84 | 
85 |     dsld = get_dsld()
86 |     model = dsld.dsldQeFairRF(r_data, yName, sNames, deweightPars, nTree, minNodeSize, mtry, yesYVal)
87 | 
88 |     preds = model.rx2("trainPreds")[0]
89 |     acc = model.rx2("trainAcc")
90 |     corrs = model.rx2("trainCorrs")
91 | 
92 |     with conversion.localconverter(converter):
93 |         corrs_df = conversion.rpy2py(corrs)
94 | 
95 |     result = {
96 |         "model": model,
97 |         "train_predictions": list(preds),
98 |         "train_accuracy": float(acc[0]),
99 | 
"train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 100 | } 101 | return result 102 | 103 | ### dsldQeFairRidgeLin------------------------------------------------------------------- 104 | 105 | def dsldPyQeFairRidgeLin(data, yName, sNames, deweightPars=None): 106 | r_data = dsld_Rpy2_IsRDataframe(data) 107 | yName = robjects.StrVector([yName]) 108 | sNames = robjects.StrVector([sNames]) 109 | 110 | if deweightPars is not None: 111 | deweightPars = ListVector({k: FloatVector([v]) for k, v in deweightPars.items()}) 112 | else: 113 | deweightPars = robjects.NULL 114 | 115 | dsld = get_dsld() 116 | model = dsld.dsldQeFairRidgeLin(r_data, yName, sNames, deweightPars) 117 | 118 | preds = model.rx2("trainPreds")[0] 119 | acc = model.rx2("trainAcc") 120 | corrs = model.rx2("trainCorrs") 121 | 122 | with conversion.localconverter(converter): 123 | corrs_df = conversion.rpy2py(corrs) 124 | 125 | result = { 126 | "model": model, 127 | "train_predictions": list(preds), 128 | "train_accuracy": float(acc[0]), 129 | "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 130 | } 131 | return result 132 | 133 | ### dsldQeFairRidgeLog------------------------------------------------------------------- 134 | 135 | def dsldPyQeFairRidgeLog(data, yName, sNames, deweightPars=None, yesYVal=None): 136 | r_data = dsld_Rpy2_IsRDataframe(data) 137 | yName = robjects.StrVector([yName]) 138 | sNames = robjects.StrVector([sNames]) 139 | 140 | if deweightPars is not None: 141 | deweightPars = ListVector({k: FloatVector([v]) for k, v in deweightPars.items()}) 142 | else: 143 | deweightPars = robjects.NULL 144 | 145 | if yesYVal is not None: 146 | yesYVal = robjects.StrVector([yesYVal]) 147 | else: 148 | yesYVal = robjects.NULL 149 | 150 | dsld = get_dsld() 151 | model = dsld.dsldQeFairRidgeLog(r_data, yName, sNames, deweightPars, yesYVal) 152 | 153 | preds = model.rx2("trainPreds")[0] 154 | acc = model.rx2("trainAcc") 155 | corrs = model.rx2("trainCorrs") 156 | 157 | with conversion.localconverter(converter): 158 | corrs_df = conversion.rpy2py(corrs) 159 | 160 | result = { 161 | "model": model, 162 | "train_predictions": list(preds), 163 | "train_accuracy": float(acc[0]), 164 | "train_correlations": list(zip(corrs_df["feature"], corrs_df["correlation"])) 165 | } 166 | return result 167 | 168 | ### predict() method for all the models 169 | def dsldPyQeFairML_Predict(model, newData): 170 | robjects.r.assign("model", model['model']) 171 | xNew = dsld_Rpy2_IsRDataframe(newData) 172 | robjects.r.assign("xNew", xNew) 173 | result = robjects.r('predict(model, xNew)') 174 | names = list(result[1][0]) 175 | vals = [float(v) for v in result[1][1]] 176 | correlations = list(zip(names, vals)) 177 | output = {'test_predictions': list(result[0]), 178 | 'test_correlations': correlations} 179 | return output 180 | -------------------------------------------------------------------------------- /R/Utils.R: -------------------------------------------------------------------------------- 1 | 2 | # many functions in dsld are wrappers for functions in other packages; 3 | # in order to avoid "package bloat," we instead check for them as needed 4 | 5 | # e.g. say a dsld function f() wraps some function in package p; then 6 | # instead of listing p as imported etc. 
in the dsld DESCRIPTION file, 7 | # we write the top of f(), getSuggestedLib('p'); this loads p if it is 8 | # installed on the user's machine, otherwise so informs the user 9 | 10 | getSuggestedLib <- function(pkgName) { 11 | if (!requireNamespace(pkgName,quietly=TRUE)) 12 | stop(paste0(pkgName, ' not loaded')) 13 | } 14 | 15 | pr2file <- function(filename) 16 | { 17 | origdev <- dev.cur() 18 | parts <- strsplit(filename,".",fixed=TRUE) 19 | nparts <- length(parts[[1]]) 20 | suff <- parts[[1]][nparts] 21 | if (suff == "pdf") { 22 | pdf(filename) 23 | } 24 | else if (suff == "png") { 25 | png(filename,bg='white') 26 | } 27 | else jpeg(filename) 28 | devnum <- dev.cur() 29 | dev.set(origdev) 30 | dev.copy(which = devnum) 31 | dev.set(devnum) 32 | dev.off() 33 | dev.set(origdev) 34 | } 35 | 36 | # generates a "cartesian product" of factor levels from input factors 37 | cartFactorLvls <- function(factorNames) 38 | { 39 | theLevels <- lapply(factorNames,function(fName) levels(get(fName))) 40 | expand.grid(theLevels) 41 | } 42 | 43 | ## needed for dsldLinear, dsldLogit ------------------------------------------- 44 | ### selects 5 rows for comparison across each level of the sensitive variable 45 | ### randomly if the user doesn't supply data in the interactions case 46 | 47 | dsldGetRow5 <- function(data, yName, sName) { 48 | rows <- sample(nrow(data), 5) 49 | reducedData <- data[rows, ] 50 | columns <- c(yName, sName) 51 | newDat <- reducedData[, !(names(reducedData) %in% columns)] 52 | result <- sprintf("No user sComparisonPts supplied. The following rows 53 | are selected: %s,%s,%s,%s,%s", rows[1],rows[2],rows[3],rows[4], 54 | rows[5]); print(result) 55 | return(newDat) 56 | } 57 | 58 | ## needed for: python interfaces ---------------------------------------------- 59 | ### convert data to factors and numeric as per user input 60 | convert_cols <- function(data, cat_features = character(), num_features = character()) { 61 | # If both vectors are missing or empty, return original data unchanged 62 | if ((missing(cat_features) || length(cat_features) == 0) && 63 | (missing(num_features) || length(num_features) == 0)) { 64 | return(data) 65 | } 66 | 67 | data[] <- lapply(names(data), function(col) { 68 | if (col %in% cat_features) { 69 | factor(data[[col]]) 70 | } else if (col %in% num_features) { 71 | as.numeric(data[[col]]) 72 | } else { 73 | data[[col]] 74 | } 75 | }) 76 | 77 | names(data) <- names(data) # preserve original column names 78 | data 79 | } 80 | 81 | ### stores factors levels for each factor in dataset 82 | factor_levels <- function(data) { 83 | stopifnot(is.data.frame(data)) 84 | facs <- names(Filter(is.factor, data)) 85 | setNames(lapply(facs, function(nm) levels(data[[nm]])), facs) 86 | } 87 | 88 | ### applies factor levels from each factor in dataset 89 | apply_factor_levels <- function(test_data, train_levels, quiet = TRUE) { 90 | stopifnot(is.data.frame(test_data), is.list(train_levels)) 91 | cols <- intersect(names(train_levels), names(test_data)) 92 | if (!quiet) { 93 | skipped <- setdiff(names(train_levels), names(test_data)) 94 | if (length(skipped)) message("Skipping missing columns: ", paste(skipped, collapse = ", ")) 95 | } 96 | 97 | out <- test_data 98 | for (nm in cols) { 99 | levs <- train_levels[[nm]] 100 | v <- out[[nm]] 101 | v_chr <- as.character(v) # works for factor/char/anything coercible 102 | fac <- factor(v_chr, levels = levs) # unseen -> NA 103 | if (!quiet && all(is.na(fac))) { 104 | warning("Column '", nm, "' became all NA after applying training 
levels.") 105 | } 106 | out[[nm]] <- fac 107 | } 108 | out 109 | } 110 | 111 | ## needed for fairML and EDF-Fair functions ----------------------------------- 112 | ### converts integer cols to numeric and character cols to factors 113 | toNumericFactor <- function(data) { 114 | data[,unlist(lapply(data, is.integer))] <- 115 | lapply(data[,unlist(lapply(data, is.integer))], as.numeric) 116 | data[,unlist(lapply(data, is.character))] <- 117 | lapply(data[,unlist(lapply(data, is.character))], as.factor) 118 | data 119 | } 120 | 121 | ### computes correlation between predictions and one or more sensitive attributes 122 | s_correlations <- function(data, sNames, predictions, 123 | method = "pearson", 124 | sort_by_abs = TRUE) { 125 | stopifnot(is.data.frame(data)) 126 | 127 | # normalize sNames 128 | if (length(sNames) == 1L && is.character(sNames) && grepl(",", sNames, fixed = TRUE)) { 129 | sNames <- trimws(strsplit(sNames, ",", fixed = TRUE)[[1]]) 130 | } 131 | sNames <- unique(as.character(sNames[nzchar(sNames)])) 132 | 133 | if (length(predictions) != nrow(data)) { 134 | stop("`predictions` length (", length(predictions), 135 | ") must equal nrow(data) (", nrow(data), ").") 136 | } 137 | y <- as.numeric(predictions) 138 | 139 | blocks <- list() 140 | 141 | for (s in sNames) { 142 | if (!s %in% names(data)) { 143 | warning("Skipping missing column: ", s) 144 | next 145 | } 146 | v <- data[[s]] 147 | 148 | # coerce characters to factors 149 | if (is.character(v)) v <- factor(v) 150 | 151 | if (is.factor(v)) { 152 | v <- droplevels(v) 153 | # If all NA or fewer than 2 levels, skip to avoid contrasts error 154 | if (all(is.na(v))) { 155 | warning("Skipping '", s, "': all values are NA after level alignment.") 156 | next 157 | } 158 | if (nlevels(v) < 2L) { 159 | warning("Skipping '", s, "': factor has fewer than 2 observed levels.") 160 | next 161 | } 162 | mm <- model.matrix(~ v - 1) # safe now 163 | colnames(mm) <- paste0(s, "==", levels(v)) 164 | blocks[[s]] <- mm 165 | 166 | } else if (is.logical(v)) { 167 | blocks[[s]] <- matrix(as.numeric(v), ncol = 1, dimnames = list(NULL, s)) 168 | 169 | } else if (is.numeric(v) || is.integer(v)) { 170 | blocks[[s]] <- matrix(as.numeric(v), ncol = 1, dimnames = list(NULL, s)) 171 | 172 | } else { 173 | warning("Skipping unsupported type for ", s, 174 | " (class: ", paste(class(v), collapse = "/"), ")") 175 | } 176 | } 177 | 178 | if (!length(blocks)) { 179 | return(data.frame(feature = character(0), correlation = numeric(0))) 180 | } 181 | 182 | X <- do.call(cbind, blocks) 183 | 184 | cors <- vapply(seq_len(ncol(X)), 185 | function(j) cor(y, X[, j], use = "pairwise.complete.obs", method = method), 186 | numeric(1)) 187 | names(cors) <- colnames(X) 188 | 189 | if (isTRUE(sort_by_abs)) cors <- cors[order(abs(cors), decreasing = TRUE)] 190 | 191 | data.frame(feature = names(cors), correlation = as.numeric(cors), row.names = NULL) 192 | } 193 | -------------------------------------------------------------------------------- /R/dsldFairUtils.R: -------------------------------------------------------------------------------- 1 | 2 | # useful helpers 3 | ### create k-fold split 4 | make_folds <- function(n, k = 5) { 5 | stopifnot(n >= k, k >= 2) 6 | idx <- sample.int(n) # shuffle rows 7 | split(idx, cut(seq_along(idx), breaks = k, labels = FALSE)) 8 | } 9 | 10 | get_fold_split <- function(data, folds, i) { 11 | stopifnot(i >= 1, i <= length(folds)) 12 | test_idx <- folds[[i]] 13 | train_idx <- setdiff(seq_len(nrow(data)), test_idx) 14 | list( 15 | train = 
data[train_idx, , drop = FALSE], 16 | test = data[test_idx, , drop = FALSE] 17 | ) 18 | } 19 | 20 | # Helper: keep only arguments that the target function supports (by name) 21 | .filter_model_args <- function(ftn_name, user_args) { 22 | if (is.null(user_args) || !length(user_args)) return(list()) 23 | ftn <- match.fun(ftn_name) 24 | allowed <- names(formals(ftn)) 25 | keep <- names(user_args) %in% allowed 26 | if (any(!keep)) { 27 | dropped <- names(user_args)[!keep] 28 | warning(sprintf("Ignoring unsupported arg(s) for %s: %s", 29 | ftn_name, paste(dropped, collapse = ", ")), 30 | call. = FALSE) 31 | } 32 | user_args[keep] 33 | } 34 | 35 | dsldFairUtils <- function(data, yName, sName, dsldFTNName, unfairness = NULL, 36 | deweightPars = NULL, yesYVal = NULL, k_folds = 5, 37 | model_args = NULL) { 38 | 39 | valid_models <- c("dsldQeFairKNN", "dsldQeFairRF", "dsldQeFairRidgeLin", "dsldQeFairRidgeLog", 40 | "dsldFrrm", "dsldFgrrm", "dsldNclm", "dsldZlm", "dsldZlrm") 41 | if (!(dsldFTNName %in% valid_models)) stop("Invalid dsldFTNName specified") 42 | 43 | # classification gating 44 | if (is.factor(data[[yName]])) { 45 | if (is.null(yesYVal)) stop("missing yesYVal") 46 | data[[yName]] <- as.factor(as.numeric(data[[yName]] == yesYVal)) 47 | yesYVal <- "1" 48 | } 49 | 50 | if (dsldFTNName %in% c("dsldQeFairKNN","dsldQeFairRF","dsldQeFairRidgeLin","dsldQeFairRidgeLog")) { 51 | 52 | # --- build grid of deweightPars combos --- 53 | if (is.null(deweightPars) || !length(deweightPars)) 54 | stop("Provide deweightPars as a named list. For a grid, use vectors (e.g. list(occ=c(0.2,0.4), educ=c(0.4))).") 55 | 56 | # If any element has length > 1, treat as grid; else one single combo 57 | is_grid <- any(vapply(deweightPars, length, integer(1)) > 1) 58 | grid_df <- if (is_grid) { 59 | expand.grid(deweightPars, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) 60 | } else { 61 | # single row data.frame so we can reuse the same loop 62 | as.data.frame(as.list(deweightPars), stringsAsFactors = FALSE) 63 | } 64 | 65 | # Pre-filter user-supplied model args to only those supported by the target function 66 | extra <- .filter_model_args(dsldFTNName, model_args) 67 | 68 | rows <- vector("list", nrow(grid_df)) 69 | 70 | # Cache folds once (same across combos) 71 | folds <- make_folds(nrow(data), k = k_folds) 72 | 73 | # Inner CV runner for one deweight combination (named numeric list) 74 | run_one_combo_1 <- function(dw_list_named) { 75 | accs <- numeric(length(folds)) 76 | corr_sums <- NULL 77 | feat_names <- NULL 78 | 79 | for (i in seq_along(folds)) { 80 | split <- get_fold_split(data, folds, i) 81 | trn <- split$train 82 | tst <- split$test 83 | y_test <- tst[[yName]] 84 | tst_x <- tst[, setdiff(names(tst), yName), drop = FALSE] 85 | 86 | base_args <- list( 87 | data = trn, 88 | yName = yName, 89 | sNames = sName, 90 | deweightPars = dw_list_named 91 | ) 92 | 93 | if (!is.null(yesYVal)) base_args$yesYVal <- yesYVal 94 | 95 | call_args <- utils::modifyList(base_args, extra, keep.null = TRUE) 96 | fitted <- do.call(dsldFTNName, call_args) 97 | 98 | res <- predict(fitted, tst_x) 99 | corrs <- res$correlations # data.frame: feature, correlation 100 | 101 | if (is.null(feat_names)) { 102 | feat_names <- as.character(corrs$feature) 103 | corr_sums <- setNames(numeric(length(feat_names)), feat_names) 104 | } 105 | corr_sums[as.character(corrs$feature)] <- 106 | corr_sums[as.character(corrs$feature)] + corrs$correlation 107 | 108 | if (!is.null(yesYVal)) { 109 | accs[i] <- mean(y_test != as.integer(res$preds$probs > 
0.5)) 110 | } else { 111 | accs[i] <- mean(abs(res$preds - y_test)) 112 | } 113 | } 114 | 115 | # averages for this combo 116 | mean_acc <- mean(accs) 117 | mean_corrs <- corr_sums / length(folds) 118 | 119 | # return as named list: testAcc + correlation columns 120 | c(list(testAcc = mean_acc), as.list(mean_corrs)) 121 | } 122 | 123 | # Loop over each row of the grid 124 | for (r in seq_len(nrow(grid_df))) { 125 | 126 | dw_row <- lapply(grid_df[r, , drop = FALSE], function(x) as.numeric(x)[1]) 127 | names(dw_row) <- names(grid_df) 128 | 129 | metrics <- run_one_combo_1(dw_row) 130 | 131 | # Build a single row: params first, then metrics; keep raw names for correlation columns 132 | rows[[r]] <- as.data.frame(c(as.list(dw_row), metrics), check.names = FALSE) 133 | } 134 | 135 | # Bind all rows; ensure parameter columns come first 136 | out <- do.call(rbind, rows) 137 | rownames(out) <- NULL 138 | return(out) 139 | 140 | 141 | # fairML wrappers 142 | } else { 143 | if (is.null(unfairness) || !length(unfairness)) 144 | stop("Provide unfairness as a vector of numbers between (0,1]. For example: unfairness = c(0.2, 0.9).") 145 | 146 | # Pre-filter user-supplied model args to only those supported by the target function 147 | extra <- .filter_model_args(dsldFTNName, model_args) 148 | 149 | # Cache folds once (same across combos) 150 | folds <- make_folds(nrow(data), k = k_folds) 151 | 152 | # Inner CV runner for one unfairness value 153 | run_one_combo_2 <- function(u_val) { 154 | accs <- numeric(length(folds)) 155 | corr_sums <- NULL 156 | feat_names <- NULL 157 | 158 | for (i in seq_along(folds)) { 159 | split <- get_fold_split(data, folds, i) 160 | trn <- split$train 161 | tst <- split$test 162 | y_test <- tst[[yName]] 163 | tst_x <- tst[, setdiff(names(tst), yName), drop = FALSE] 164 | 165 | base_args <- list( 166 | data = trn, 167 | yName = yName, 168 | sName = sName, 169 | unfairness = u_val 170 | ) 171 | if (!is.null(yesYVal)) base_args$yesYVal <- yesYVal 172 | 173 | call_args <- utils::modifyList(base_args, extra, keep.null = TRUE) 174 | fitted <- do.call(dsldFTNName, call_args) 175 | 176 | res <- predict(fitted, tst_x) 177 | corrs <- res$correlations # data.frame: feature, correlation 178 | 179 | if (is.null(feat_names)) { 180 | feat_names <- as.character(corrs$feature) 181 | corr_sums <- setNames(numeric(length(feat_names)), feat_names) 182 | } 183 | corr_sums[as.character(corrs$feature)] <- 184 | corr_sums[as.character(corrs$feature)] + corrs$correlation 185 | 186 | if (!is.null(yesYVal)) { 187 | accs[i] <- mean(y_test != as.integer(res$preds > 0.5)) 188 | } else { 189 | accs[i] <- mean(abs(res$preds - y_test)) 190 | } 191 | } 192 | 193 | # averages for this unfairness value 194 | mean_acc <- mean(accs) 195 | mean_corrs <- corr_sums / length(folds) 196 | 197 | # return as named list: testAcc + correlation columns 198 | c(list(testAcc = mean_acc), as.list(mean_corrs)) 199 | } 200 | 201 | # Loop over the unfairness vector (no grid_df here) 202 | rows <- vector("list", length(unfairness)) 203 | for (u_idx in seq_along(unfairness)) { 204 | unfairVal <- as.numeric(unfairness[u_idx]) 205 | m <- as.list(run_one_combo_2(unfairVal)) # named list: testAcc + corr cols 206 | 207 | rows[[u_idx]] <- data.frame( 208 | c( 209 | list(unfairness = unfairVal, testAcc = m$testAcc), 210 | m[setdiff(names(m), "testAcc")] 211 | ), 212 | check.names = FALSE 213 | ) 214 | } 215 | 216 | out <- do.call(rbind, rows) 217 | rownames(out) <- NULL 218 | return(out) 219 | } 220 | } 221 | 222 | 223 | 224 | 225 | 
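# ---- Usage sketch (not run) ----
# Explore the fairness-utility tradeoff over a grid of deweighting
# parameters for one of the EDF wrappers; each row of the returned data
# frame holds one parameter combination, its cross-validated testAcc, and
# the correlations between predictions and the sensitive-variable levels.
#
#   data(svcensus)
#   dsldFairUtils(svcensus, 'wageinc', 'gender', 'dsldQeFairKNN',
#                 deweightPars = list(occ = c(0.1, 0.5), age = c(0.2, 0.6)))
#
# For the fairML wrappers, supply a vector of unfairness values instead:
#
#   dsldFairUtils(svcensus, 'wageinc', 'gender', 'dsldNclm',
#                 unfairness = c(0.2, 0.5, 0.9))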
-------------------------------------------------------------------------------- /inst/examples/tabular.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "81e2b54a", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Examples for analytical/tabular methods provided by dsldPy\n", 9 | "\n", 10 | "The goal is for users to apply analytical/tabular methods with simple, intuitive interface. The following functions are included for python:\n", 11 | "\n", 12 | "1. dsldLinear, dsldLogit, and dsldML \n", 13 | "2. dsldTakeALookAround\n", 14 | "3. dsldHunting (both C/O hunting functions)\n", 15 | "4. dsldFrequencybyS \n", 16 | "5. dsldMatchedAte" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "17972c5f", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "## requires R and the dsld (R) package installed\n", 27 | "# !pip install dsldPy" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "675ca88a", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Load necessary libraries\n", 38 | "\n", 39 | "from dsldPy import (\n", 40 | "# data reading and preprocessing\n", 41 | "preprocess_data, read_data,\n", 42 | "\n", 43 | "# linear/logistic/ML comparisons\n", 44 | "dsldPyLinear, dsldPyLinearSummary, dsldPyLinearPredict, dsldPyLinearVcov, dsldPyLinearCoef, dsldPyLinearGetData,\n", 45 | "dsldPyLogit, dsldPyLogitSummary, dsldPyLogitPredict, dsldPyLogitVcov, dsldPyLogitCoef, dsldPyLogitGetData,\n", 46 | "dsldPyML,\n", 47 | "\n", 48 | "# takeALookAround\n", 49 | "dsldPyTakeALookAround, \n", 50 | "\n", 51 | "# hunting\n", 52 | "dsldPyCHunting, dsldPyOHunting, \n", 53 | "\n", 54 | "# frequency table\n", 55 | "dsldPyFrequencybyS,\n", 56 | "\n", 57 | "# causal inference\n", 58 | "dsldPyMatchedATE\n", 59 | ")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "c9c7ac54", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "### dsldLinear, dsldLogit, dsldML examples \n", 70 | "\n", 71 | "### data preprocessing\n", 72 | "\n", 73 | "### all dsldPy functions require a R data frame object as input (NOT pandas dataframe)\n", 74 | "### the preprocessing is done by the function preprocess_data\n", 75 | "### user needs to manually provide the categorical and numerical features (list)\n", 76 | "### the function preprocess_data returns a R data.frame object -> required input for the dsldPy functions\n", 77 | "\n", 78 | "# svcensus data\n", 79 | "# Replace with your own path to the svcensus.RData file\n", 80 | "# df = read_data(\"\")\n", 81 | "\n", 82 | "# preprocess data\n", 83 | "cat_features = ['educ', 'occ', 'gender']\n", 84 | "num_features= ['age', 'wageinc', 'wkswrkd']\n", 85 | "svcensus = preprocess_data(df, cat_features, num_features)\n", 86 | "\n", 87 | "df_10 = df.head(2)\n", 88 | "df_10 = df_10[['age', 'educ', 'occ', 'wkswrkd']]\n", 89 | "cat_features = ['educ', 'occ']\n", 90 | "num_features = ['age','wkswrkd']\n", 91 | "svcensus_comparisons_points = preprocess_data(df_10, cat_features, num_features)\n", 92 | "\n", 93 | "# compas1 data\n", 94 | "# Replace with your own path to the compas1.RData file\n", 95 | "# df = read_data(\"\")\n", 96 | "\n", 97 | "# preprocess data\n", 98 | "cat_features = [\"sex\", \"two_year_recid\", \"race\"]\n", 99 | "num_features = 
[\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 100 | "compas1 = preprocess_data(df, cat_features, num_features)\n", 101 | "\n", 102 | "df_10 = df.head(2)\n", 103 | "df_10 = df_10[[\"sex\", \"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]]\n", 104 | "cat_features = [\"sex\"]\n", 105 | "num_features = [\"age\",\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 106 | "compas1_comparisons_points = preprocess_data(df_10, cat_features, num_features)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "ddfb760f", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "### 1. ------------------------------ dsldPyLinear/dsldPyLogit/dsldPyML ------------------------------\n", 117 | "\n", 118 | "## dsldPyLinear - interactions = True\n", 119 | "a = dsldPyLinear(data = svcensus, yName = 'wageinc', sName = 'gender', interactions = True)\n", 120 | "\n", 121 | "### the object a is a list of R objects --- can be accessed using the following functions \n", 122 | "### note that directly looking at 'a' might not be helpful --- use the following functions to access the results and use in python\n", 123 | "\n", 124 | "# uncomment to see the results of the functions\n", 125 | "# dsldPyLinearSummary(a) \n", 126 | "# dsldPyLinearCoef(a)\n", 127 | "# dsldPyLinearVcov(a)\n", 128 | "# dsldPyLinearGetData(a)\n", 129 | "\n", 130 | "# predict()\n", 131 | "preds = dsldPyLinearPredict(a, svcensus_comparisons_points)\n", 132 | "preds\n", 133 | "\n", 134 | "### can also work with interactions = False as well\n", 135 | "a2 = dsldPyLinear(data = svcensus, yName = 'wageinc', sName = 'gender', interactions = False)\n", 136 | "\n", 137 | "# dsldPyLinearSummary(a2) \n", 138 | "# dsldPyLinearCoef(a2)\n", 139 | "# dsldPyLinearVcov(a2)\n", 140 | "# dsldPyLinearGetData(a2)\n", 141 | "\n", 142 | "## the predict() method requires newData to include S (which is not done)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "70d2ed58", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# dsldPyLogit - interactions = True\n", 153 | "\n", 154 | "b = dsldPyLogit(data = compas1, yName = 'two_year_recid', sName = 'race', interactions = True, yesYVal = \"Yes\")\n", 155 | "\n", 156 | "### the object b is a list of R objects --- can be accessed using the following functions \n", 157 | "### note that directly looking at 'b' might not be helpful --- use the following functions to access the results and use in python\n", 158 | "\n", 159 | "# uncomment to see the results of the functions\n", 160 | "# dsldPyLogitSummary(b)\n", 161 | "# dsldPyLogitCoef(b)\n", 162 | "# dsldPyLogitVcov(b)\n", 163 | "# dsldPyLogitGetData(b)\n", 164 | "\n", 165 | "# predict()\n", 166 | "preds = dsldPyLogitPredict(b, compas1_comparisons_points)\n", 167 | "preds\n", 168 | "\n", 169 | "### can also work with interactions = False as well\n", 170 | "b2 = dsldPyLogit(data = compas1, yName = 'two_year_recid', sName = 'race', interactions = False, yesYVal = \"Yes\")\n", 171 | "\n", 172 | "# dsldPyLogitSummary(b2)\n", 173 | "# 
dsldPyLogitCoef(b2)\n", 174 | "# dsldPyLogitVcov(b2)\n", 175 | "# dsldPyLogitGetData(b2)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "395702ad", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "## dsldPyML - returns testAcc for each sLevel and dataframe (excluding yName and sName) of predictions\n", 186 | "### works for several qeML functions as far as I've tried\n", 187 | "c = dsldPyML(data = svcensus, yName = 'wageinc', sName = 'gender', qeMLftnName = 'qeKNN',sComparisonPts='rand5')\n", 188 | "print(c)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "ea67194c", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "### 2. ------------------------------ dsldTakeALookAround ------------------------------\n", 199 | "dsldPyTakeALookAround(data = svcensus, yName = 'wageinc', sName = 'gender', maxFeatureSize = 4) " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "db089aa9", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "### 3. ------------------------------ dsldHunting ------------------------------\n", 210 | "\n", 211 | "# dsldPyCHunting - C-Hunting\n", 212 | "a = dsldPyCHunting(data = svcensus, yName = 'wageinc',sName = 'gender')\n", 213 | "\n", 214 | "# # dsldPyOHunting - O-Hunting\n", 215 | "b = dsldPyOHunting(data = svcensus, yName = 'wageinc', sName = 'gender')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "22d1a166", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "print(a)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "cad52d06", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "print(b)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "32ab71e2", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "### 4. ------------------------------ dsldFrequencybyS ------------------------------\n", 246 | "dsldPyFrequencybyS(data = svcensus, cName = 'educ', sName= 'gender')" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "99ef6117", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "### 5. 
------------------------------ dsldMatchedAte ------------------------------\n",
257 | "dsldPyMatchedATE(data = compas1, yName='two_year_recid', sName='race', yesSVal='Caucasian')"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "id": "f483cd46",
264 | "metadata": {},
265 | "outputs": [],
266 | "source": []
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "dsld",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.12.11"
286 | }
287 | },
288 | "nbformat": 4,
289 | "nbformat_minor": 5
290 | }
291 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | # DSLD: Data Science Looks at Discrimination
3 | 
4 | [![CRAN Status](https://www.r-pkg.org/badges/version/dsld)](https://cran.r-project.org/package=dsld)
5 | 
6 | > Statistical and graphical tools for detecting and measuring discrimination and bias in data
7 | 
8 | This is an R package with Python interfaces available.
9 | 
10 | ## Overview
11 | 
12 | Discrimination is a key social issue in the United States and in a
13 | number of other countries. There is a great deal of available data with which
14 | one might investigate possible discrimination. But how might such
15 | investigations be conducted?
16 | 
17 | Our **DSLD** package provides statistical and graphical tools for
18 | detecting and measuring discrimination and bias, be it racial, gender,
19 | age or other. It is widely applicable; here are just a few possible use
20 | cases:
21 | 
22 | - Quantitative analysis in instruction and research in the social sciences.
23 | - Corporate HR analysis and research.
24 | - Litigation involving discrimination and related issues.
25 | - Concerned citizenry.
26 | 
27 | This package is broadly aimed at users ranging from instructors of
28 | statistics classes to legal professionals, as it offers a powerful yet
29 | intuitive approach to discrimination analysis. It also includes an 80-page
30 | **Quarto book** that serves as a guide to the key statistical principles and
31 | their applications.
32 | 
33 | - **Quarto Book**: [Book](https://htmlpreview.github.io/?https://github.com/matloff/dsldBook/blob/main/_book/index.html) - Important statistical principles and applications.
34 | - **Research Paper**: [Paper](https://arxiv.org/abs/2411.04228) - Package implementation details.
35 | 
36 | ## Installation:
37 | 
38 | From CRAN:
39 | 
40 | ```r
41 | install.packages("dsld")
42 | ```
43 | 44 | 50 | 
51 | ## Analysis categories:
52 | 
53 | DSLD addresses two main types of bias analysis:
54 | 
55 | **Estimation Analysis**: Investigates possible discrimination by
56 | estimating effects while accounting for confounders. Confounders are
57 | variables that may affect the outcome variable other than through
58 | the sensitive variable. DSLD provides both analytical and graphical functions
59 | for this purpose.
60 | 
61 | **Prediction Analysis**: Addresses algorithmic bias in machine learning
62 | by excluding sensitive variables while controlling proxy effects.
63 | Proxies are variables strongly related to the sensitive variable that
64 | could indirectly introduce bias.
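In code, the two kinds of analysis look like this (a preview sketch using the package's own **svcensus** data; both functions are covered in detail below):

```r
data(svcensus)

# estimation: the sensitive variable (gender) is included, and we
# adjust for confounders such as age
z <- dsldLinear(svcensus, 'wageinc', 'gender', interactions = FALSE)

# prediction: the sensitive variable is excluded, and the proxy 'occ'
# is deweighted
knnOut <- dsldQeFairKNN(svcensus, 'wageinc', 'gender',
                        deweightPars = list(occ = 0.2))
```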
65 | 
66 | The first case examines *societal* or *institutional bias*. The second case
67 | focuses on *algorithmic bias*.
68 | 
69 | 
70 | | Statistical Analysis | Fair Machine Learning |
71 | |---|---|
72 | | Estimate an effect | Predict an outcome |
73 | | Harm comes from society | Harm comes from algorithm |
74 | | Include sensitive variables | Exclude sensitive variables |
75 | | Adjust for covariates | Limit proxy impact |
76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 
91 | 
92 | We now tour a small subset of dsld's features using the **svcensus** data
93 | included in the package.
94 | 
95 | ### The data
96 | 
97 | The **svcensus** dataset records income across 6 different
98 | engineering occupations. Its features are 'age', 'education level',
99 | 'occupation', 'wage income', 'number of weeks worked', and 'gender'.
100 | 
101 | ```R
102 | > data(svcensus)
103 | > head(svcensus)
104 | age educ occ wageinc wkswrkd gender
105 | 1 50.30082 zzzOther 102 75000 52 female
106 | 2 41.10139 zzzOther 101 12300 20 male
107 | 3 24.67374 zzzOther 102 15400 52 female
108 | 4 50.19951 zzzOther 100 0 52 male
109 | 5 51.18112 zzzOther 100 160 1 female
110 | 6 57.70413 zzzOther 100 0 0 male
111 | ```
112 | 
113 | We will use only a few features to keep things simple. The *Quarto Book*
114 | provides a more extensive analysis of the examples shown below.
115 | 
116 | ## Part One: Adjustment for Confounders
117 | 
118 | We want to estimate the impact of a sensitive variable [S]
119 | on an outcome variable [Y], while accounting for confounders [C]. Let's
120 | call such analysis "confounder adjustment."
121 | 
122 | ### Estimation Example
123 | 
124 | We are investigating a possible gender pay gap between men and women.
125 | Here, [Y] is wage and [S] is gender. We will treat age as a confounder [C],
126 | using a linear model. For simplicity, no other confounders (such as occupation)
127 | or any other predictors [X] are included in this example.
128 | 
129 | **No interactions**
130 | 
131 | ```r
132 | > data(svcensus)
133 | > svcensus <- svcensus[,c(1,4,6)] # subset columns: age, wage, gender
134 | > z <- dsldLinear(svcensus,'wageinc','gender', interactions = FALSE)
135 | > summary(z) # show coefficients of linear model
136 | 
137 | $`Summary Coefficients`
138 | Covariate Estimate StandardError PValue
139 | 1 (Intercept) 31079.9174 1378.08158 0
140 | 2 age 489.5728 30.26461 0
141 | 3 gendermale 13098.2091 790.44515 0
142 | 
143 | $`Sensitive Factor Level Comparisons`
144 | Factors Compared Estimates Standard Errors P-Value
145 | Estimate male - female 13098.21 790.4451 0
146 | ```
147 | Our linear model can be written as:
148 | 
149 | > E(W) = $\beta_0$ + $\beta_1$ A + $\beta_2$ M
150 | 
151 | Here *W* denotes wage
152 | income, *A* is age, and *M* is an indicator variable, with M = 1 for men and
153 | M = 0 for women.
154 | 155 | 156 | 
157 | $\beta_2$ represents the gender wage gap at any age. The model estimates that men earn
158 | about $13,000 more than women across *all* ages. However, the wage gap might also vary by age.
159 | We test for such interactions by fitting separate models for men and women, for example comparing ages 36 and 43:
160 |
161 | **Interactions**
162 | ```R
163 | newData <- data.frame(age=c(36,43))
164 | z <- dsldLinear(svcensus,'wageinc','gender',interactions=TRUE, newData)
165 | summary(z)
166 |
167 | $female
168 | Covariate Estimate StandardError PValue
169 | 1 (Intercept) 30551.4302 2123.44361 0
170 | 2 age 502.9624 52.07742 0
171 |
172 | $male
173 | Covariate Estimate StandardError PValue
174 | 1 (Intercept) 44313.159 1484.82216 0
175 | 2 age 486.161 36.02116 0
176 |
177 | $`Sensitive Factor Level Comparisons`
178 | Factors Compared New Data Row Estimates Standard Errors
179 | 1 female - male 1 -13156.88 710.9696
180 | 2 female - male 2 -13039.27 710.7782
181 | ```
182 |
183 | The gender pay gap (female minus male) is -$13,157 at age 36 and -$13,039 at age 43, differing by only $118.
184 | This suggests minimal age-gender interaction. We focused only on age as a confounder,
185 | but other variables, such as occupation, could be included depending on the analysis goals.
186 |
187 | ## Part Two: Discovering/Mitigating Bias in Machine Learning
188 |
189 | We are predicting [Y] from a feature set [X] and a sensitive variable [S].
190 | We want to minimize the effect of [S], along with any proxies [O] in [X] that may
191 | be correlated with [S]. The inherent trade-off is that increasing fairness (minimizing the
192 | influence of [S] and [O]) reduces utility, i.e. predictive accuracy. The package
193 | provides wrappers for several fair machine learning methods (see the function list below).
194 |
195 | ### Prediction Example
196 |
197 | **Goal**: Predict wage income while minimizing gender bias by limiting the
198 | impact of occupation as a proxy variable.
199 |
200 | **Setup**:
201 | - **Outcome [Y]**: Wage income
202 | - **Sensitive Variable [S]**: Gender
203 | - **Proxy Variable [O]**: Occupation (deweighted to 0.2)
204 | - **Method**: Fair K-Nearest Neighbors using `dsldQeFairKNN()`; a call sketch follows the results table below
| Fairness/Utility Tradeoff | Fairness (correlation with S) | Accuracy (test error, $) |
|---------------------------|-------------------------------|--------------------------|
| K-Nearest Neighbors | 0.1943313 | 25452.08 |
| Fair K-NN (via EDFFair) | 0.0814919 | 26291.38 |
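A sketch of roughly how the comparison above can be reproduced. The `testAcc` and `corrs` fields are assumptions based on the qeML-style return convention, and the exact numbers will vary with the random holdout set:

```r
library(dsld)
data(svcensus)

# Fair K-NN with the occupation proxy deweighted to 0.2
fairKNN <- dsldQeFairKNN(svcensus, yName = 'wageinc', sNames = 'gender',
                         deweightPars = list(occ = 0.2))

fairKNN$testAcc  # mean absolute prediction error on the holdout set
fairKNN$corrs    # correlation between predicted wage and gender
```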
225 |
226 | The base K-NN model shows a 0.194 correlation between predicted wage and gender, with a $25,452 prediction error. Using `dsldQeFairKNN`, the correlation drops to 0.081, but test error increases by $839. This illustrates the fairness-utility trade-off. Users can test parameter combinations to find their optimal balance; the `dsldFairUtils` function facilitates this search.
227 |
228 | ## Function List
229 |
230 | 1. Graphical Functions

| Function | Description | Use Case |
|----------|-------------|----------|
| dsldFreqPCoord | Frequency-based parallel coordinates | Visualizing multivariate relationships |
| dsldScatterPlot3D | 3D scatter plots with color coding | Exploring 3D data relationships |
| dsldConditDisparity | Conditional disparity plots | Detecting Simpson's Paradox |
| dsldDensityByS | Density plots by sensitive variable | Comparing distributions across groups |
| dsldConfounders | Confounder analysis | Identifying confounding variables |
| dsldIamb | Constraint-based structure learning | Fitting a causal model to data |
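As a taste of the graphical functions, here is a sketch; the argument patterns follow the package's yName/sName convention, but consult each function's help page for the exact signature:

```r
library(dsld)
data(svcensus)

# Parallel-coordinates view of frequent patterns, grouped by gender
dsldFreqPCoord(svcensus, m = 10, sName = 'gender')

# Mean wage as a function of age, one curve per gender; diverging or
# crossing curves can flag Simpson's-Paradox-type effects
dsldConditDisparity(svcensus, yName = 'wageinc', sName = 'gender',
                    xName = 'age')
```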
269 |
270 | 2. Analytical Functions

| Function | Description | Use Case |
|----------|-------------|----------|
| dsldLinear | Linear regression with sensitive group comparisons | Regression outcome analysis |
| dsldLogit | Logistic regression with sensitive group comparisons | Binary outcome analysis |
| dsldML | Machine learning with sensitive group comparisons | Analysis via non-parametric models (KNN, RF) |
| dsldTakeALookAround | Feature set evaluation | Assessing prediction fairness |
| dsldCHunting | Confounder hunting | Finding variables that predict both Y and S |
| dsldOHunting | Proxy hunting | Finding variables that predict S |
| dsldMatchedAte | Causal inference via matching | Estimating treatment effects |
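A brief sketch of the hunting and matching tools. The calls follow the package's yName/sName convention; the matched-ATE call mirrors the `dsldPyMatchedATE` call in the package's Python examples, though the exact R signature may differ:

```r
library(dsld)
data(svcensus)
data(compas1)

# Confounder hunting: which variables predict both wage (Y) and gender (S)?
dsldCHunting(svcensus, yName = 'wageinc', sName = 'gender')

# Proxy hunting: which variables predict gender, and could leak it into a model?
dsldOHunting(svcensus, yName = 'wageinc', sName = 'gender')

# Matching-based average treatment effect of race on recidivism
dsldMatchedATE(compas1, yName = 'two_year_recid', sName = 'race',
               yesSVal = 'Caucasian')
```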
314 |
315 | 3. Fair Machine Learning Functions

| Function | Description | Package |
|----------|-------------|---------|
| dsldFairML | FairML algorithm wrappers | FairML |
| dsldQeFairML | EDFFair algorithm wrappers | EDFFair |
| dsldFairUtils | Grid search and parameter optimization for fair ML | |
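A sketch of one FairML wrapper and the tuning helper. The `dsldFrrm` arguments mirror `fairml::frrm()`, and the `dsldFairUtils` arguments mirror the `dsldPyFairUtils` call in the package's Python examples; treat both as illustrative rather than exact signatures:

```r
library(dsld)
data(svcensus)

# Fair ridge regression: cap the unfairness of the fit at 0.05
z <- dsldFrrm(svcensus, 'wageinc', 'gender', unfairness = 0.05,
              definition = 'sp-komiyama')
summary(z)

# Cross-validated grid search over the unfairness knob
dsldFairUtils(svcensus, yName = 'wageinc', sName = 'gender',
              dsldFTNname = 'dsldFrrm',
              unfairness = c(0.01, 0.05, 0.1, 0.2, 0.8))
```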
339 | 340 | **Available Algorithms**: 341 | - **FairML**: dsldFrrm, dsldFgrrm, dsldZlm, dsldNclm, dsldZlrm 342 | - **EDFFair**: dsldQeFairKNN, dsldQeFairRf, dsldQeFairRidgeLin, dsldQeFairRidgeLog 343 | 344 | ## Authors 345 | 346 | - **Norm Matloff** 347 | - **Aditya Mittal** 348 | - **Taha Abdullah** 349 | - **Arjun Ashok** 350 | - **Shubhada Martha** 351 | - **Billy Ouattara** 352 | - **Jonathan Tran** 353 | - **Brandon Zarate** 354 | 355 | For issues, contact **Aditya Mittal** at mittalaa@uci.edu -------------------------------------------------------------------------------- /R/dsldLogit.R: -------------------------------------------------------------------------------- 1 | ### ------------------------ DSLDLogit ----------------------------------------- 2 | ### creates the dsldLogit object 3 | dsldLogit <- function(data, yName, sName, sComparisonPts = NULL, interactions = FALSE, yesYVal) { 4 | 5 | dsldModel <- list() 6 | data[[yName]] <- ifelse(data[[yName]] == yesYVal, 1, 0) 7 | 8 | # user wants interactions # 9 | if (interactions) { 10 | 11 | # generate interactions data if not provided / stop if erroneous 12 | if (is.null(sComparisonPts)) { 13 | sComparisonPts <- dsldGetRow5(data,yName, sName) 14 | } else if (!is.data.frame(sComparisonPts)) { 15 | stop(paste("Error: sComparisonPts must be a dataframe or equivalent")) 16 | } 17 | 18 | # split data into list of dataframes by each level of sName # 19 | dataSplit <- split(data, data[[sName]]) 20 | dataNames <- names(dataSplit) 21 | 22 | # loop and create model for each level in sName # 23 | for (name in dataNames) { 24 | # initialize instance of dsldDiffModel # 25 | dsldDiffModel <- list() 26 | 27 | # get data for each specific S factor & drop sensitive column # 28 | diffData <- dataSplit[[name]] 29 | drop <- c(sName) 30 | diffData <- diffData[, !(names(diffData) %in% drop)] 31 | 32 | # create the model # 33 | diffModel <- glm(formula = as.formula(paste(yName, "~ .")), 34 | family = "binomial", data = diffData) 35 | 36 | # setup individual instance of dsldDiffModel 37 | dsldDiffModel <- c( 38 | dsldDiffModel, 39 | yName, 40 | sName, 41 | list(diffModel), 42 | list(sComparisonPts), 43 | list(summary(diffModel)), 44 | list(coef(diffModel)), 45 | list(diffData), 46 | list(factor_levels(data)) 47 | ) 48 | names(dsldDiffModel) <- c("yName", "sName", "model", "newData", 49 | "summary", "coef", "data", "FactorsInfo") 50 | class(dsldDiffModel) <- "dsldDiffModel" 51 | 52 | # add instance into output list: dsldModel # 53 | dsldModel[[name]] <- dsldDiffModel 54 | } 55 | } else { 56 | # initialize instance of dsldDiffModel # 57 | dsldDiffModel <- list() 58 | 59 | # create model # 60 | diffModel <- glm(formula = as.formula(paste(yName, "~ .")), 61 | family = "binomial", data = data) 62 | 63 | # setup instance of dsldDiffModel # 64 | dsldDiffModel <- c(dsldDiffModel, 65 | yName, 66 | sName, 67 | list(diffModel), 68 | list(summary(diffModel)), 69 | list(coef(diffModel)), 70 | list(data), 71 | list(factor_levels(data)) 72 | ) 73 | names(dsldDiffModel) <- c("yName", "sName", "model", "summary", 74 | "coef", "data", "FactorsInfo") 75 | 76 | # add instance into dsldModel 77 | dsldModel[[sName]] <- dsldDiffModel 78 | } 79 | class(dsldModel) <- "dsldGLM" 80 | return(dsldModel) 81 | } 82 | 83 | # ----------------------- Auxiliary Functions ---------------------------------# 84 | coef.dsldGLM <- function(object,...) 
{ 85 | # merge & return coefficients #
86 | mergedCoef <- lapply(object, function(x) x$coef)
87 | return(mergedCoef)
88 | }
89 |
90 | vcov.dsldGLM <- function(object,...) {
91 | # merge & return covariance matrices #
92 | mergedVcov <- lapply(object, function(x) vcov(x$model))
93 | return(mergedVcov)
94 | }
95 |
96 | dsldGetData <- function(object) {
97 | # merge & return datasets #
98 | mergedData <- lapply(object, function(x) x$data)
99 | return(mergedData)
100 | }
101 |
102 | ### #------------------------- dsldDiffSLog function --------------------------#
103 | dsldDiffSLog <- function(object, sComparisonPts = NULL) {
104 | # naming
105 | dsldGLM <- object
106 |
107 | # get sName and yName from the output of dsldLogit #
108 | sName <- dsldGLM[[1]]$sName
109 | yName <- dsldGLM[[1]]$yName
110 |
111 | # diffS results when interaction == FALSE in dsldLogit #
112 | if (length(dsldGLM) == 1) {
113 | # extract pairwise combination of [dummy level in glm - factor levels]
114 | # from summary output
115 | data <- dsldGetData(dsldGLM)[[1]]
116 | model <- dsldGLM[[1]]$model
117 | C <- vcov(model)
118 | c <- coef(model)
119 |
120 | # get all rows containing sName levels from summary(model) #
121 | rowsWithS <- grep(sName, rownames(coef(summary(model))))
122 | regularS <- summary(model)$coefficients[rowsWithS, ]
123 |
124 | # for the case when we have only two levels in S; ex: male/female #
125 | if (length(levels(data[[sName]])) == 2) {
126 | estimate <- regularS[1]
127 | standardError <- regularS[2]
128 | pVal <- regularS[4]
129 | sPairs <- combn(levels(data[[sName]]), 2)
130 | a <- sPairs[1]
131 | b <- sPairs[2]
132 | indexVal <- sprintf("%s - %s", b, a)
133 | df <- data.frame(indexVal, estimate, standardError, pVal)
134 | names(df) <- c("Factors Compared", "Estimates",
135 | "Standard Errors", "P-Value")
136 | return(df)
137 | }
138 |
139 | # extract estimates and standard errors #
140 | estimates <- regularS[, 1]
141 | standardErrors <- regularS[, 2]
142 | pVal <- regularS[, 4]
143 |
144 | # create dataframe #
145 | df <- data.frame(estimates, standardErrors, pVal)
146 | df$estimates <- -df$estimates
147 |
148 | # extract other pairwise combinations of levels (not including dummy) #
149 | featureNames <- colnames(vcov(model))
150 | combinationMatrix <- combn(featureNames, 2)
151 |
152 | # remove all columns that do not have sName #
153 | matchingCols <- which(apply(combinationMatrix, 2,
154 | function(col) all(grepl(sName, col))))
155 | finalResult <- combinationMatrix[, matchingCols, drop = FALSE]
156 |
157 | # loops through each pair #
158 | for (j in 1:dim(finalResult)[2]) {
159 | # get the j-th pair of the pairwise combinations #
160 | val <- finalResult[, j]
161 | a <- val[1]
162 | b <- val[2]
163 |
164 | # create vector of 0s, the length of the coefficient vector #
165 | vectorLength <- length(c)
166 | rt <- rep(0, vectorLength)
167 |
168 | # put 1 on the first element #
169 | aIndex <- which(names(c) == a)
170 | rt[aIndex] <- 1
171 |
172 | # put -1 on the second element #
173 | bIndex <- which(names(c) == b)
174 | rt[bIndex] <- -1
175 |
176 | aValue <- c[aIndex]
177 | bValue <- c[bIndex]
178 |
179 | # get estimates & standard errors #
180 | estimates <- aValue - bValue
181 | standardErrors <- sqrt((t(rt) %*% C %*% rt))
182 |
183 | tStatistic <- (estimates) / standardErrors
184 | degOfFreedom <- nrow(data) - 1 # degrees of freedom
185 | pVal <- 2 * pt(abs(tStatistic), df = degOfFreedom,
186 | lower.tail = FALSE)
187 |
188 | tempDF <- data.frame(estimates, standardErrors, pVal)
189 | df <- rbind(df, tempDF)
190 | } 191 | 192 | # get names of sName comparisons # 193 | sPairs <- combn(levels(data[[sName]]), 2) 194 | test <- c() 195 | for (i in 1:dim(sPairs)[2]) { 196 | val <- sPairs[,i] 197 | a <- val[1] 198 | b <- val[2] 199 | indexVal <- sprintf("%s - %s", a, b) 200 | test <- c(test, indexVal) 201 | } 202 | 203 | # create final data-frame # 204 | df <- cbind(test, df) 205 | df <- data.frame(df, row.names = NULL) 206 | names(df) <- c("Factors Compared", "Estimates", "Standard Errors", 207 | "P-Value") 208 | return(df) 209 | 210 | } else { 211 | # raise error if the user doesn't input new data # 212 | if (is.null(sComparisonPts)) { 213 | stop("Please enter the sComparisonPts input to compare for interactions") 214 | } 215 | 216 | if (!is.data.frame(sComparisonPts)) { 217 | stop(paste("Error: sComparisonPts must be a dataframe or equivalent")) 218 | } 219 | 220 | if (!is.null(sComparisonPts)) { 221 | sComparisonPts <- apply_factor_levels(sComparisonPts, object[[1]]$FactorsInfo) 222 | } 223 | 224 | # naming 225 | xNew <- sComparisonPts 226 | 227 | # get vector of all levels in sName # 228 | sNames <- names(dsldGLM) 229 | df <- data.frame() 230 | 231 | # loop through each level of S name to compute estimates and standard errors 232 | for (i in sNames) { 233 | data <- dsldGLM[[i]]$data 234 | model <- dsldGLM[[i]]$model 235 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE) 236 | pred <- predictions$fit 237 | se <- predictions$se.fit 238 | level <- row <- prediction <- standardError <- NULL 239 | tempDF <- data.frame(level = i, row = 1:nrow(xNew), 240 | prediction = pred, standardError = se) 241 | df <- rbind(df, tempDF) 242 | } 243 | 244 | # compute difference in estimates between each pair factor level 245 | # for each row 246 | uniqueElements <- sort(unique(df$row)) 247 | pairwiseDF <- data.frame() 248 | 249 | for (i in uniqueElements) { 250 | rowData <- subset(df, row == i) 251 | charVec <- as.character(rowData$level) 252 | combinationMatrix <- combn(charVec, 2) 253 | for (j in 1:dim(combinationMatrix)[2]) { 254 | val <- combinationMatrix[, j] 255 | a <- val[1] 256 | b <- val[2] 257 | aData <- subset(rowData, level == a) 258 | a3 <- aData[3] 259 | bData <- subset(rowData, level == b) 260 | b3 <- bData[3] 261 | indexVal <- sprintf("%s - %s", a, b) 262 | estimatedDiff <- aData$prediction - bData$prediction 263 | standardError <- sqrt(((aData$standardError) ^ 2) + 264 | ((bData$standardError) ^ 2)) 265 | tempDF <- data.frame(indexVal, i, a3,b3, estimatedDiff, 266 | standardError) 267 | names(tempDF) <- c("Factors Compared", "New Data Row", 268 | 'Factor A','Factor B', "Difference in Estimates", "Standard Errors") 269 | pairwiseDF <- rbind(pairwiseDF, tempDF) 270 | } 271 | } 272 | return(pairwiseDF) 273 | } 274 | } 275 | 276 | ## ------------------------------ summary() ------------------------------ 277 | summary.dsldGLM <- function(object,...) 
{ 278 | diffS <- list()
279 | # get sName and yName from the output of dsldLogit #
280 | sName <- object[[1]]$sName
281 | yName <- object[[1]]$yName
282 |
283 | sNames <- names(object)
284 | newData <- object[[1]]$newData
285 |
286 | if (length(object) == 1) {
287 | data <- dsldGetData(object)[[1]]
288 | summary_output <- summary(object[[1]]$model)
289 | coef <- summary_output$coefficients[, 1]
290 | std_err <- summary_output$coefficients[, 2]
291 | pValues <- summary_output$coefficients[, 4]
292 |
293 | # Create dataframe
294 | df <- data.frame(
295 | Covariate = row.names(summary_output$coefficients),
296 | Estimate = coef,
297 | `Standard Error` = std_err,
298 | PValue = pValues,
299 | stringsAsFactors = FALSE,
300 | row.names = NULL
301 | )
302 |
303 | diffS[['Summary Coefficients']] <- df
304 | diffS[['Sensitive Factor Level Comparisons']] <- dsldDiffSLog(object)
305 |
306 | return(diffS)
307 | } else {
308 | # loop through each level of S name to compute estimates and standard errors
309 | for (i in sNames) {
310 | data <- object[[i]]$data
311 | summaryOutput <- summary(object[[i]]$model)
312 | coef <- summaryOutput$coefficients[, 1]
313 | stdErr <- summaryOutput$coefficients[, 2]
314 | pValues <- summaryOutput$coefficients[, 4]
315 |
316 | df <- data.frame(
317 | Covariate = row.names(summaryOutput$coefficients),
318 | Estimate = coef,
319 | `Standard Error` = stdErr,
320 | PValue = pValues,
321 | stringsAsFactors = FALSE,
322 | row.names = NULL
323 | )
324 | diffS[[i]] <- df
325 | }
326 | diffS[['Sensitive Factor Level Comparisons']] <- dsldDiffSLog(object,
327 | newData)
328 | return(diffS)
329 | }
330 | }
331 |
332 | # ---------------------------- add predict() -----------------------------------
333 | predict.dsldGLM <- function(object, xNew,...){
334 | df <- data.frame()
335 | yName = object[[1]]$yName
336 | if (length(object) == 1) {
337 | data <- object[[1]]$data
338 | model <- object[[1]]$model
339 | xNew <- apply_factor_levels(xNew, object[[1]]$FactorsInfo)
340 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE)
341 | pred <- predictions$fit
342 | se <- predictions$se.fit
343 | tempDF <- data.frame(row = 1:nrow(xNew), prediction = pred, standardError = se)
344 | df <- rbind(df, tempDF)
345 | return (df)
346 | } else {
347 | sNames <- names(object)
348 | for (i in sNames) { # loop through each level of S name to compute estimates and standard errors
349 | data <- object[[i]]$data
350 | model <- object[[i]]$model
351 | xNew <- apply_factor_levels(xNew, object[[1]]$FactorsInfo)
352 | predictions <- predict(model, xNew, type = "response", se.fit = TRUE)
353 | pred <- predictions$fit
354 | se <- predictions$se.fit
355 | tempDF <- data.frame(level = i, row = 1:nrow(xNew), prediction = pred, standardError = se)
356 | df <- rbind(df, tempDF)
357 | }
358 | return (df)
359 | }
360 | }
361 |
-------------------------------------------------------------------------------- /inst/examples/machine_learning.ipynb: --------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "bd7878bc",
6 | "metadata": {},
7 | "source": [
8 | "#### Examples for machine learning algorithms with dsldPy\n",
9 | "\n",
10 | "The goal is for users to train models with a simple, intuitive interface and also to understand how hyperparameter selection affects the fairness-utility tradeoff.
Examples are shown on training/testing sets with cross-validation approaches.\n",
11 | "\n",
12 | "1) regression examples using dsldPyFairML and dsldPyQeFairML\n",
13 | "2) classification examples using dsldPyFairML and dsldPyQeFairML\n",
14 | "3) k-fold cross-validation to choose the best hyperparameters for the fairness-utility tradeoff"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "id": "da083337",
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "## requires R and the dsld (R) package installed\n",
25 | "# !pip install dsldPy"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "id": "ccd05554",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "# load libraries\n",
36 | "from dsldPy import (\n",
37 | "# data reading and preprocessing\n",
38 | "preprocess_data, read_data,\n",
39 | "\n",
40 | "# fairML wrappers\n",
41 | "dsldPyFrrm, dsldPyFgrrm, dsldPyNclm, dsldPyZlm, dsldPyZlrm, dsldPyFairML_Summary, dsldPyFairML_Predict,\n",
42 | "\n",
43 | "# qeFairML wrappers\n",
44 | "dsldPyQeFairKNN, dsldPyQeFairRF, dsldPyQeFairRidgeLin, dsldPyQeFairRidgeLog, dsldPyQeFairML_Predict,\n",
45 | "\n",
46 | "dsldPyFairUtils\n",
47 | ")\n",
48 | "\n",
49 | "from sklearn.model_selection import train_test_split\n",
50 | "from sklearn.metrics import mean_absolute_error, accuracy_score\n"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "3135dff7",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "### regression example --- frrm(), nclm(), zlm(), qeFairKNN(), qeFairRF(), qeFairRidgeLin()\n",
61 | "\n",
62 | "### read and preprocess data\n",
63 | "\n",
64 | "### data preprocessing\n",
65 | "\n",
66 | "### all dsldPy functions require an R data frame object as input (NOT a pandas DataFrame)\n",
67 | "### the preprocessing is done by the function preprocess_data\n",
68 | "### the user needs to manually provide the categorical and numerical features (lists)\n",
69 | "### the function preprocess_data returns an R data.frame object -> required input for the dsldPy functions\n",
70 | "\n",
71 | "# test and train split\n",
72 | "#### REPLACE WITH YOUR PATH TO svcensus.RData\n",
73 | "# df = read_data(\"\") \n",
74 | "train_df, test_df = train_test_split(df, test_size=0.3, random_state=42) # train_test_split returns (train, test)\n",
75 | "test_y = test_df['wageinc']\n",
76 | "test_df = test_df.drop(columns=['wageinc'])\n",
77 | "\n",
78 | "# preprocess data\n",
79 | "cat_features_train = ['educ', 'occ', 'gender']\n",
80 | "num_features_train = ['age', 'wageinc', 'wkswrkd']\n",
81 | "svcensus_train = preprocess_data(train_df, cat_features_train, num_features_train)\n",
82 | "\n",
83 | "cat_features_test = ['educ', 'occ', 'gender']\n",
84 | "num_features_test = ['age', 'wkswrkd']\n",
85 | "svcensus_test = preprocess_data(test_df, cat_features_test, num_features_test)\n"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "a08603d4",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "### using dsldPyFairML() function\n",
96 | "\n",
97 | "### model training --- frrm() \n",
98 | "### unfairness = 0.05 // can also try different values for unfairness\n",
99 | "a = dsldPyFrrm(data=svcensus_train, yName='wageinc', sName='gender',unfairness= 0.05, definition = \"sp-komiyama\", lamda = 0, save = False)\n",
100 | "\n",
101 | "# print train accuracy and correlations\n",
102 | "print(f\"train predictions: {a['train_predictions']}\")\n",
103 | "print(f\"train accuracy: 
{a['train_accuracy']}\")\n", 104 | "print(f\"train correlations: {a['train_correlations']}\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "7502969a", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "### predict() on test data\n", 115 | "a_preds = dsldPyFairML_Predict(a, svcensus_test)\n", 116 | "\n", 117 | "# print test predictions and correlations\n", 118 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n", 119 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 120 | "\n", 121 | "# manuallycompute test accuracy (MAPE)\n", 122 | "test_accuracy = mean_absolute_error(test_y, a_preds['test_predictions'])\n", 123 | "print(f\"test accuracy: {test_accuracy}\")\n", 124 | "\n", 125 | "### the same can be done for other models --- nclm(), zlm() with dsldPyFairML_Predict() method" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "434a6e7b", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "### using dsldPyQeFairML() functions \n", 136 | "\n", 137 | "### model training --- dsldQeFairRF() \n", 138 | "### deweightPars = {'educ': 0.2, 'occ': 0.05} // try different values for proxies\n", 139 | "deweightPars = {'educ': 0.2, 'occ': 0.05}\n", 140 | "\n", 141 | "a = dsldPyQeFairRF(data=svcensus_train, yName='wageinc', sNames='gender', deweightPars=deweightPars)\n", 142 | "\n", 143 | "# print train accuracy and correlations\n", 144 | "print(f\"train predictions: {a['train_predictions']}\")\n", 145 | "print(f\"train accuracy: {a['train_accuracy']}\")\n", 146 | "print(f\"train correlations: {a['train_correlations']}\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "e63aaf39", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "### predict on test data\n", 157 | "a_preds = dsldPyQeFairML_Predict(a, svcensus_test)\n", 158 | "\n", 159 | "# print test predictions and correlations\n", 160 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n", 161 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 162 | "\n", 163 | "# manually compute test accuracy (MAPE)\n", 164 | "test_accuracy = mean_absolute_error(test_y, a_preds['test_predictions'])\n", 165 | "print(f\"test accuracy: {test_accuracy}\")\n", 166 | "\n", 167 | "### the same can be done for other models --- qeFairKNN(), qeFairRidgeLin() with dsldPyQeFairML_Predict() method" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "e12cdade", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "### classification examples --- fgrrm(), zlrm(), qeFairKNN(), qeFairRF(), qeFairRidgeLog()\n", 178 | "\n", 179 | "### read and preprocess data\n", 180 | "\n", 181 | "# test and train split\n", 182 | "#### REPLACE WITH YOUR PATH TO compas1.RData\n", 183 | "# df = read_data(\"\")\n", 184 | "test_df, train_df = train_test_split(df, test_size=0.3, random_state=42)\n", 185 | "test_y = test_df['two_year_recid']\n", 186 | "test_y = test_df['two_year_recid'].map({'Yes': 1, 'No': 0}) # convert to binary\n", 187 | "test_df = test_df.drop(columns=['two_year_recid'])\n", 188 | "\n", 189 | "# preprocess data\n", 190 | "cat_features = ['sex', 'race', 'two_year_recid']\n", 191 | "num_features = [\"age\", 
\"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 192 | "compas1_train = preprocess_data(train_df, cat_features_train, num_features_train)\n", 193 | "\n", 194 | "cat_features = ['sex', 'race']\n", 195 | "num_features = [\"age\", \"juv_fel_count\",\"decile_score\",\"juv_misd_count\",\"juv_other_count\",\"priors_count\",\"c_jail_in\",\"c_jail_out\",\"c_offense_date\",\"screening_date\",\"in_custody\",\"out_custody\"]\n", 196 | "compas1_test = preprocess_data(test_df, cat_features_test, num_features_test)\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "d2d728fd", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "### using dsldPyFairML() functions \n", 207 | "\n", 208 | "### model training --- fgrrm() \n", 209 | "### unfairness = 0.1 // try different values for unfairness\n", 210 | "a = dsldPyFgrrm(data=compas1_train, yName='two_year_recid', sName='race', unfairness=0.1, definition = \"sp-komiyama\", family = \"binomial\", lamda = 0, save = False, yesYVal = \"Yes\")\n", 211 | "\n", 212 | "# print train accuracy and correlations\n", 213 | "print(f\"train predictions: {a['train_predictions']}\") # returns prob = Yes\n", 214 | "print(f\"train accuracy (misclassification rate): {a['train_accuracy']}\")\n", 215 | "print(f\"train correlations: {a['train_correlations']}\")" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "226e4af8", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "### predict() on test set\n", 226 | "a_preds = dsldPyFairML_Predict(a, compas1_test)\n", 227 | "\n", 228 | "# print test predictions and correlations\n", 229 | "print(f\"test predictions: {a_preds['test_predictions']}\") # returns prob = Yes\n", 230 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n", 231 | "\n", 232 | "# manually compute test accuracy (MAPE)\n", 233 | "y_pred = [int(round(x)) for x in a_preds['test_predictions']]\n", 234 | "test_accuracy = accuracy_score(test_y, y_pred)\n", 235 | "misclass_rate = 1 - test_accuracy\n", 236 | "\n", 237 | "# print train accuracy and correlations\n", 238 | "print(f\"test accuracy (misclassification rate): {misclass_rate}\")\n", 239 | "\n", 240 | "### the same can be done for other models --- zlrm() with dsldPyFairML_Predict() method" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "id": "3916ab41", 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "### using dsldPyQeFairML() functions \n", 251 | "\n", 252 | "### model training --- dsldQeFairKNN() \n", 253 | "### deweightPars = {'decile_score': 0.2, 'priors_count': 0.5} // try different values for deweightPars\n", 254 | "deweightPars = {'decile_score': 0.2, 'priors_count': 0.5}\n", 255 | "\n", 256 | "a = dsldPyQeFairKNN(data=compas1_train, yName='two_year_recid',sNames= 'race', deweightPars=deweightPars, k = 10, scaleX = True, yesYVal = \"Yes\")\n", 257 | "\n", 258 | "# print train accuracy and correlations\n", 259 | "# in the case of classification, the train_predictions returns both predClasses and prob = Yes\n", 260 | "print(f\"train predictions: {a['train_predictions']}\") \n", 261 | "print(f\"train accuracy: {a['train_accuracy']}\")\n", 262 | "print(f\"train correlations: {a['train_correlations']}\")" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 
null,
268 | "id": "e293d0e3",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "### predict() on test set\n",
273 | "a_preds = dsldPyQeFairML_Predict(a, compas1_test)\n",
274 | "\n",
275 | "# print test predictions and correlations\n",
276 | "print(f\"test predictions: {a_preds['test_predictions']}\")\n",
277 | "print(f\"test correlations: {a_preds['test_correlations']}\")\n",
278 | "\n",
279 | "# compute test misclassification rate\n",
280 | "y_pred = [int(round(x)) for x in list(a_preds['test_predictions'][1])]\n",
281 | "test_accuracy = accuracy_score(test_y, y_pred)\n",
282 | "misclass_rate = 1 - test_accuracy\n",
283 | "\n",
284 | "# print test misclassification rate\n",
285 | "print(f\"test misclassification rate: {misclass_rate}\")\n",
286 | "\n",
287 | "### the same can be done for other models --- dsldQeFairRF(), dsldQeFairRidgeLog() with dsldPyQeFairML_Predict() method"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "398cbe9d",
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "### k-fold cross-validation to find the best model based on fairness and accuracy\n",
298 | "dsldPyFairUtils(data=svcensus_train, yName='wageinc', sName='gender', dsldFTNname = \"dsldFrrm\", unfairness = [0.01, 0.05, 0.1, 0.2, 0.8], k_folds = 10)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "id": "50a1c4de",
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "dsldPyFairUtils(data = svcensus_train, yName = 'wageinc', sName = 'gender', dsldFTNname = \"dsldQeFairKNN\", deweightPars = {'occ': [0.9 ,0.8 ,0.5 ,0.3 ,0.1 ,0.05 ,0.01]}, k_folds = 10)"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "id": "04839f09",
315 | "metadata": {},
316 | "outputs": [],
317 | "source": []
318 | }
319 | ],
320 | "metadata": {
321 | "kernelspec": {
322 | "display_name": "base",
323 | "language": "python",
324 | "name": "python3"
325 | },
326 | "language_info": {
327 | "codemirror_mode": {
328 | "name": "ipython",
329 | "version": 3
330 | },
331 | "file_extension": ".py",
332 | "mimetype": "text/x-python",
333 | "name": "python",
334 | "nbconvert_exporter": "python",
335 | "pygments_lexer": "ipython3",
336 | "version": "3.12.2"
337 | }
338 | },
339 | "nbformat": 4,
340 | "nbformat_minor": 5
341 | }
342 |
--------------------------------------------------------------------------------