├── .gitmodules ├── .gitignore ├── project.clj ├── scripts └── get-data.sh ├── README.md ├── LICENSE ├── pandas_explore.py └── src └── clj_ml_wkg └── ames_house_prices.clj /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "clojupyter"] 2 | path = clojupyter 3 | url = git@github.com:techascent/clojupyter 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | *nippy 13 | data 14 | .ipynb_checkpoints -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject clj-ml-workgroup/ames-house-prices "0.1.0-SNAPSHOT" 2 | :description "Ames house prices kaggle comp" 3 | :url "http://github.com/clj-ml-workgroup" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.10.1"] 7 | [techascent/tech.ml "1.71"]] 8 | :profiles {:dev {:dependencies [[org.clojure/tools.logging "0.3.1"] 9 | [ch.qos.logback/logback-classic "1.1.3"]]}}) 10 | -------------------------------------------------------------------------------- /scripts/get-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p data/ames-house-prices 3 | 4 | pushd data 5 | 6 | wget https://s3.us-east-2.amazonaws.com/tech.public.data/house-prices-advanced-regression-techniques.zip 7 | 8 | unzip house-prices-advanced-regression-techniques.zip -d ames-house-prices 9 | 10 | pushd ames-house-prices 11 | 12 | # Of course the files have incorrect permissions... 
13 | 14 | chmod 644 * 15 | 16 | popd 17 | 18 | popd 19 | 20 | 21 | wget https://s3.us-east-2.amazonaws.com/tech.public.data/ames-final-results.nippy 22 | wget https://s3.us-east-2.amazonaws.com/tech.public.data/ames-one-hot-results.nippy 23 | wget https://s3.us-east-2.amazonaws.com/tech.public.data/ames-skew-fix-results.nippy 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ames-house-prices 2 | 3 | Exploration of the Kaggle Ames house prices competition. 4 | 5 | Check out the [nbviewer version](https://nbviewer.jupyter.org/github/cnuernber/ames-house-prices/blob/82e3ce1679b3e6e31c0128290f60ef7ae16947b0/ames-housing-prices-clojure.ipynb). 6 | 7 | ## Usage 8 | 9 | From the top-level directory: 10 | ``` 11 | scripts/get-data.sh 12 | ``` 13 | 14 | The data is placed under `data/ames-house-prices`. 15 | 16 | 17 | Make sure you have OpenBLAS or ATLAS installed, as well as libsvm. 18 | 19 | If SVM doesn't work, just remove it from the gridsearch pathways. libsvm is an old C 20 | library that can be temperamental. On Ubuntu it works fine on JVM 8. 21 | 22 | 23 | There is a dependency conflict with tablesaw (the CSV subsystem), so use the following workflow: 24 | 25 | ```clojure 26 | lein repl 27 | 28 | ;;load the ames namespace and do the things 29 | (require '[clj-ml-wkg.ames-house-prices]) 30 | 31 | ``` 32 | 33 | ## License 34 | 35 | Copyright © 2019 Clojure ML Working Group 36 | 37 | Distributed under the Eclipse Public License either version 1.0 or (at 38 | your option) any later version. 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT").
ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor to control, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 
215 | -------------------------------------------------------------------------------- /pandas_explore.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.model_selection import cross_val_score, train_test_split 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV 6 | from sklearn.metrics import mean_squared_error, make_scorer 7 | from scipy.stats import skew 8 | from IPython.display import display 9 | 10 | train = pd.read_csv("data/ames-house-prices/train.csv") 11 | 12 | 13 | print("train : " + str(train.shape)) 14 | 15 | train.drop("Id", axis = 1, inplace = True) 16 | train = train[train.GrLivArea < 4000] 17 | train.SalePrice = np.log1p(train.SalePrice) 18 | 19 | # Handle missing values for features where median/mean or most common value doesn't make sense 20 | 21 | # Alley : data description says NA means "no alley access" 22 | train.loc[:, "Alley"] = train.loc[:, "Alley"].fillna("None") 23 | # BedroomAbvGr : NA most likely means 0 24 | train.loc[:, "BedroomAbvGr"] = train.loc[:, "BedroomAbvGr"].fillna(0) 25 | # BsmtQual etc : data description says NA for basement features is "no basement" 26 | train.loc[:, "BsmtQual"] = train.loc[:, "BsmtQual"].fillna("No") 27 | train.loc[:, "BsmtCond"] = train.loc[:, "BsmtCond"].fillna("No") 28 | train.loc[:, "BsmtExposure"] = train.loc[:, "BsmtExposure"].fillna("No") 29 | train.loc[:, "BsmtFinType1"] = train.loc[:, "BsmtFinType1"].fillna("No") 30 | train.loc[:, "BsmtFinType2"] = train.loc[:, "BsmtFinType2"].fillna("No") 31 | train.loc[:, "BsmtFullBath"] = train.loc[:, "BsmtFullBath"].fillna(0) 32 | train.loc[:, "BsmtHalfBath"] = train.loc[:, "BsmtHalfBath"].fillna(0) 33 | train.loc[:, "BsmtUnfSF"] = train.loc[:, "BsmtUnfSF"].fillna(0) 34 | # CentralAir : NA most likely means No 35 | train.loc[:, "CentralAir"] = train.loc[:, 
"CentralAir"].fillna("N") 36 | # Condition : NA most likely means Normal 37 | train.loc[:, "Condition1"] = train.loc[:, "Condition1"].fillna("Norm") 38 | train.loc[:, "Condition2"] = train.loc[:, "Condition2"].fillna("Norm") 39 | # EnclosedPorch : NA most likely means no enclosed porch 40 | train.loc[:, "EnclosedPorch"] = train.loc[:, "EnclosedPorch"].fillna(0) 41 | # External stuff : NA most likely means average 42 | train.loc[:, "ExterCond"] = train.loc[:, "ExterCond"].fillna("TA") 43 | train.loc[:, "ExterQual"] = train.loc[:, "ExterQual"].fillna("TA") 44 | # Fence : data description says NA means "no fence" 45 | train.loc[:, "Fence"] = train.loc[:, "Fence"].fillna("No") 46 | # FireplaceQu : data description says NA means "no fireplace" 47 | train.loc[:, "FireplaceQu"] = train.loc[:, "FireplaceQu"].fillna("No") 48 | train.loc[:, "Fireplaces"] = train.loc[:, "Fireplaces"].fillna(0) 49 | # Functional : data description says NA means typical 50 | train.loc[:, "Functional"] = train.loc[:, "Functional"].fillna("Typ") 51 | # GarageType etc : data description says NA for garage features is "no garage" 52 | train.loc[:, "GarageType"] = train.loc[:, "GarageType"].fillna("No") 53 | train.loc[:, "GarageFinish"] = train.loc[:, "GarageFinish"].fillna("No") 54 | train.loc[:, "GarageQual"] = train.loc[:, "GarageQual"].fillna("No") 55 | train.loc[:, "GarageCond"] = train.loc[:, "GarageCond"].fillna("No") 56 | train.loc[:, "GarageArea"] = train.loc[:, "GarageArea"].fillna(0) 57 | train.loc[:, "GarageCars"] = train.loc[:, "GarageCars"].fillna(0) 58 | # HalfBath : NA most likely means no half baths above grade 59 | train.loc[:, "HalfBath"] = train.loc[:, "HalfBath"].fillna(0) 60 | # HeatingQC : NA most likely means typical 61 | train.loc[:, "HeatingQC"] = train.loc[:, "HeatingQC"].fillna("TA") 62 | # KitchenAbvGr : NA most likely means 0 63 | train.loc[:, "KitchenAbvGr"] = train.loc[:, "KitchenAbvGr"].fillna(0) 64 | # KitchenQual : NA most likely means typical 65 | train.loc[:, 
"KitchenQual"] = train.loc[:, "KitchenQual"].fillna("TA") 66 | # LotFrontage : NA most likely means no lot frontage 67 | train.loc[:, "LotFrontage"] = train.loc[:, "LotFrontage"].fillna(0) 68 | # LotShape : NA most likely means regular 69 | train.loc[:, "LotShape"] = train.loc[:, "LotShape"].fillna("Reg") 70 | # MasVnrType : NA most likely means no veneer 71 | train.loc[:, "MasVnrType"] = train.loc[:, "MasVnrType"].fillna("None") 72 | train.loc[:, "MasVnrArea"] = train.loc[:, "MasVnrArea"].fillna(0) 73 | # MiscFeature : data description says NA means "no misc feature" 74 | train.loc[:, "MiscFeature"] = train.loc[:, "MiscFeature"].fillna("No") 75 | train.loc[:, "MiscVal"] = train.loc[:, "MiscVal"].fillna(0) 76 | # OpenPorchSF : NA most likely means no open porch 77 | train.loc[:, "OpenPorchSF"] = train.loc[:, "OpenPorchSF"].fillna(0) 78 | # PavedDrive : NA most likely means not paved 79 | train.loc[:, "PavedDrive"] = train.loc[:, "PavedDrive"].fillna("N") 80 | # PoolQC : data description says NA means "no pool" 81 | train.loc[:, "PoolQC"] = train.loc[:, "PoolQC"].fillna("No") 82 | train.loc[:, "PoolArea"] = train.loc[:, "PoolArea"].fillna(0) 83 | # SaleCondition : NA most likely means normal sale 84 | train.loc[:, "SaleCondition"] = train.loc[:, "SaleCondition"].fillna("Normal") 85 | # ScreenPorch : NA most likely means no screen porch 86 | train.loc[:, "ScreenPorch"] = train.loc[:, "ScreenPorch"].fillna(0) 87 | # TotRmsAbvGrd : NA most likely means 0 88 | train.loc[:, "TotRmsAbvGrd"] = train.loc[:, "TotRmsAbvGrd"].fillna(0) 89 | # Utilities : NA most likely means all public utilities 90 | train.loc[:, "Utilities"] = train.loc[:, "Utilities"].fillna("AllPub") 91 | # WoodDeckSF : NA most likely means no wood deck 92 | train.loc[:, "WoodDeckSF"] = train.loc[:, "WoodDeckSF"].fillna(0) 93 | 94 | train = train.replace({"MSSubClass" : {20 : "SC20", 30 : "SC30", 40 : "SC40", 45 : "SC45", 95 | 50 : "SC50", 60 : "SC60", 70 : "SC70", 75 : "SC75", 96 | 80 : "SC80", 85 : 
"SC85", 90 : "SC90", 120 : "SC120", 97 | 150 : "SC150", 160 : "SC160", 180 : "SC180", 190 : "SC190"}, 98 | "MoSold" : {1 : "Jan", 2 : "Feb", 3 : "Mar", 4 : "Apr", 5 : "May", 6 : "Jun", 99 | 7 : "Jul", 8 : "Aug", 9 : "Sep", 10 : "Oct", 11 : "Nov", 12 : "Dec"} 100 | }) 101 | 102 | train = train.replace({"Alley" : {"Grvl" : 1, "Pave" : 2}, 103 | "BsmtCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 104 | "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3}, 105 | "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 106 | "ALQ" : 5, "GLQ" : 6}, 107 | "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4, 108 | "ALQ" : 5, "GLQ" : 6}, 109 | "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5}, 110 | "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5}, 111 | "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5}, 112 | "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 113 | "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5, 114 | "Min2" : 6, "Min1" : 7, "Typ" : 8}, 115 | "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 116 | "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 117 | "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 118 | "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5}, 119 | "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3}, 120 | "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4}, 121 | "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2}, 122 | "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4}, 123 | "Street" : {"Grvl" : 1, "Pave" : 2}, 124 | "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}} 125 | ) 126 | 127 | 128 | # Create new features 129 | # 1* Simplifications of existing features 130 | train["SimplOverallQual"] = train.OverallQual.replace({1 : 1, 2 : 1, 3 : 1, # bad 131 | 4 : 2, 5 : 2, 6 : 2, 
# average 132 | 7 : 3, 8 : 3, 9 : 3, 10 : 3 # good 133 | }) 134 | train["SimplOverallCond"] = train.OverallCond.replace({1 : 1, 2 : 1, 3 : 1, # bad 135 | 4 : 2, 5 : 2, 6 : 2, # average 136 | 7 : 3, 8 : 3, 9 : 3, 10 : 3 # good 137 | }) 138 | train["SimplPoolQC"] = train.PoolQC.replace({1 : 1, 2 : 1, # average 139 | 3 : 2, 4 : 2 # good 140 | }) 141 | train["SimplGarageCond"] = train.GarageCond.replace({1 : 1, # bad 142 | 2 : 1, 3 : 1, # average 143 | 4 : 2, 5 : 2 # good 144 | }) 145 | train["SimplGarageQual"] = train.GarageQual.replace({1 : 1, # bad 146 | 2 : 1, 3 : 1, # average 147 | 4 : 2, 5 : 2 # good 148 | }) 149 | train["SimplFireplaceQu"] = train.FireplaceQu.replace({1 : 1, # bad 150 | 2 : 1, 3 : 1, # average 151 | 4 : 2, 5 : 2 # good 152 | }) 153 | train["SimplFireplaceQu"] = train.FireplaceQu.replace({1 : 1, # bad 154 | 2 : 1, 3 : 1, # average 155 | 4 : 2, 5 : 2 # good 156 | }) 157 | train["SimplFunctional"] = train.Functional.replace({1 : 1, 2 : 1, # bad 158 | 3 : 2, 4 : 2, # major 159 | 5 : 3, 6 : 3, 7 : 3, # minor 160 | 8 : 4 # typical 161 | }) 162 | train["SimplKitchenQual"] = train.KitchenQual.replace({1 : 1, # bad 163 | 2 : 1, 3 : 1, # average 164 | 4 : 2, 5 : 2 # good 165 | }) 166 | train["SimplHeatingQC"] = train.HeatingQC.replace({1 : 1, # bad 167 | 2 : 1, 3 : 1, # average 168 | 4 : 2, 5 : 2 # good 169 | }) 170 | train["SimplBsmtFinType1"] = train.BsmtFinType1.replace({1 : 1, # unfinished 171 | 2 : 1, 3 : 1, # rec room 172 | 4 : 2, 5 : 2, 6 : 2 # living quarters 173 | }) 174 | train["SimplBsmtFinType2"] = train.BsmtFinType2.replace({1 : 1, # unfinished 175 | 2 : 1, 3 : 1, # rec room 176 | 4 : 2, 5 : 2, 6 : 2 # living quarters 177 | }) 178 | train["SimplBsmtCond"] = train.BsmtCond.replace({1 : 1, # bad 179 | 2 : 1, 3 : 1, # average 180 | 4 : 2, 5 : 2 # good 181 | }) 182 | train["SimplBsmtQual"] = train.BsmtQual.replace({1 : 1, # bad 183 | 2 : 1, 3 : 1, # average 184 | 4 : 2, 5 : 2 # good 185 | }) 186 | train["SimplExterCond"] = 
train.ExterCond.replace({1 : 1, # bad 187 | 2 : 1, 3 : 1, # average 188 | 4 : 2, 5 : 2 # good 189 | }) 190 | train["SimplExterQual"] = train.ExterQual.replace({1 : 1, # bad 191 | 2 : 1, 3 : 1, # average 192 | 4 : 2, 5 : 2 # good 193 | }) 194 | 195 | # 2* Combinations of existing features 196 | # Overall quality of the house 197 | train["OverallGrade"] = train["OverallQual"] * train["OverallCond"] 198 | # Overall quality of the garage 199 | train["GarageGrade"] = train["GarageQual"] * train["GarageCond"] 200 | # Overall quality of the exterior 201 | train["ExterGrade"] = train["ExterQual"] * train["ExterCond"] 202 | # Overall kitchen score 203 | train["KitchenScore"] = train["KitchenAbvGr"] * train["KitchenQual"] 204 | # Overall fireplace score 205 | train["FireplaceScore"] = train["Fireplaces"] * train["FireplaceQu"] 206 | # Overall garage score 207 | train["GarageScore"] = train["GarageArea"] * train["GarageQual"] 208 | # Overall pool score 209 | train["PoolScore"] = train["PoolArea"] * train["PoolQC"] 210 | # Simplified overall quality of the house 211 | train["SimplOverallGrade"] = train["SimplOverallQual"] * train["SimplOverallCond"] 212 | # Simplified overall quality of the exterior 213 | train["SimplExterGrade"] = train["SimplExterQual"] * train["SimplExterCond"] 214 | # Simplified overall pool score 215 | train["SimplPoolScore"] = train["PoolArea"] * train["SimplPoolQC"] 216 | # Simplified overall garage score 217 | train["SimplGarageScore"] = train["GarageArea"] * train["SimplGarageQual"] 218 | # Simplified overall fireplace score 219 | train["SimplFireplaceScore"] = train["Fireplaces"] * train["SimplFireplaceQu"] 220 | # Simplified overall kitchen score 221 | train["SimplKitchenScore"] = train["KitchenAbvGr"] * train["SimplKitchenQual"] 222 | # Total number of bathrooms 223 | train["TotalBath"] = train["BsmtFullBath"] + (0.5 * train["BsmtHalfBath"]) + \ 224 | train["FullBath"] + (0.5 * train["HalfBath"]) 225 | # Total SF for house (incl. 
basement) 226 | train["AllSF"] = train["GrLivArea"] + train["TotalBsmtSF"] 227 | # Total SF for 1st + 2nd floors 228 | train["AllFlrsSF"] = train["1stFlrSF"] + train["2ndFlrSF"] 229 | # Total SF for porch 230 | train["AllPorchSF"] = train["OpenPorchSF"] + train["EnclosedPorch"] + \ 231 | train["3SsnPorch"] + train["ScreenPorch"] 232 | # Has masonry veneer or not 233 | train["HasMasVnr"] = train.MasVnrType.replace({"BrkCmn" : 1, "BrkFace" : 1, "CBlock" : 1, 234 | "Stone" : 1, "None" : 0}) 235 | # House completed before sale or not 236 | train["BoughtOffPlan"] = train.SaleCondition.replace({"Abnorml" : 0, "Alloca" : 0, "AdjLand" : 0, 237 | "Family" : 0, "Normal" : 0, "Partial" : 1}) 238 | 239 | 240 | 241 | # Create new features 242 | # 3* Polynomials on the top 10 existing features 243 | train["OverallQual-s2"] = train["OverallQual"] ** 2 244 | train["OverallQual-s3"] = train["OverallQual"] ** 3 245 | train["OverallQual-Sq"] = np.sqrt(train["OverallQual"]) 246 | train["AllSF-2"] = train["AllSF"] ** 2 247 | train["AllSF-3"] = train["AllSF"] ** 3 248 | train["AllSF-Sq"] = np.sqrt(train["AllSF"]) 249 | train["AllFlrsSF-2"] = train["AllFlrsSF"] ** 2 250 | train["AllFlrsSF-3"] = train["AllFlrsSF"] ** 3 251 | train["AllFlrsSF-Sq"] = np.sqrt(train["AllFlrsSF"]) 252 | train["GrLivArea-2"] = train["GrLivArea"] ** 2 253 | train["GrLivArea-3"] = train["GrLivArea"] ** 3 254 | train["GrLivArea-Sq"] = np.sqrt(train["GrLivArea"]) 255 | train["SimplOverallQual-s2"] = train["SimplOverallQual"] ** 2 256 | train["SimplOverallQual-s3"] = train["SimplOverallQual"] ** 3 257 | train["SimplOverallQual-Sq"] = np.sqrt(train["SimplOverallQual"]) 258 | train["ExterQual-2"] = train["ExterQual"] ** 2 259 | train["ExterQual-3"] = train["ExterQual"] ** 3 260 | train["ExterQual-Sq"] = np.sqrt(train["ExterQual"]) 261 | train["GarageCars-2"] = train["GarageCars"] ** 2 262 | train["GarageCars-3"] = train["GarageCars"] ** 3 263 | train["GarageCars-Sq"] = np.sqrt(train["GarageCars"]) 264 | 
train["TotalBath-2"] = train["TotalBath"] ** 2 265 | train["TotalBath-3"] = train["TotalBath"] ** 3 266 | train["TotalBath-Sq"] = np.sqrt(train["TotalBath"]) 267 | train["KitchenQual-2"] = train["KitchenQual"] ** 2 268 | train["KitchenQual-3"] = train["KitchenQual"] ** 3 269 | train["KitchenQual-Sq"] = np.sqrt(train["KitchenQual"]) 270 | train["GarageScore-2"] = train["GarageScore"] ** 2 271 | train["GarageScore-3"] = train["GarageScore"] ** 3 272 | train["GarageScore-Sq"] = np.sqrt(train["GarageScore"]) 273 | 274 | 275 | categorical_features = train.select_dtypes(include = ["object"]).columns 276 | numerical_features = train.select_dtypes(exclude = ["object"]).columns 277 | 278 | 279 | print(categorical_features) 280 | -------------------------------------------------------------------------------- /src/clj_ml_wkg/ames_house_prices.clj: -------------------------------------------------------------------------------- 1 | (ns clj-ml-wkg.ames-house-prices 2 | "See reference article: 3 | https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset 4 | " 5 | (:require [tech.ml.dataset.pipeline 6 | :refer [col int-map] 7 | :as dsp] 8 | [tech.v2.datatype :as dtype] 9 | [tech.v2.datatype.functional :as dfn] 10 | [tech.ml.dataset.pipeline.column-filters :as cf] 11 | [tech.ml.dataset.pipeline.base 12 | :refer [with-ds]] 13 | [tech.ml.dataset.pipeline.pipeline-operators 14 | :refer [without-recording 15 | pipeline-train-context 16 | pipeline-inference-context]] 17 | [tech.ml.dataset :as ds] 18 | [tech.ml.dataset.column :as ds-col] 19 | [tech.ml :as ml] 20 | [tech.ml.loss :as loss] 21 | [tech.ml.utils :as ml-utils] 22 | 23 | ;;use tablesaw as dataset backing store 24 | [tech.libs.tablesaw :as tablesaw] 25 | [tech.libs.smile.regression] 26 | [tech.libs.xgboost] 27 | [tech.ml.regression :as ml-regression] 28 | [tech.ml.visualization.vega :as vega-viz] 29 | 30 | ;;put/get nippy 31 | [tech.io :as io] 32 | 33 | [clojure.pprint :as pp] 34 | [clojure.set :as 
c-set]) 35 | 36 | (:import [java.io File])) 37 | 38 | ;; (time (require '[clj-ml-wkg.ames-house-prices])) 39 | 40 | ;; old tech.ml 34224.952879 msec 41 | ;; new (no core.matrix, less metaprogramming, precompiled) tech.ml 24067.88257 42 | 43 | (set! *warn-on-reflection* true) 44 | (set! *unchecked-math* :warn-on-boxed) 45 | 46 | 47 | (comment 48 | 49 | 50 | (def src-dataset (ds/->dataset "data/ames-house-prices/train.csv")) 51 | 52 | 53 | (def filtered-ds (dsp/filter src-dataset "GrLivArea" #(dfn/< (dsp/col) 4000))) 54 | 55 | 56 | (defn initial-pipeline-from-article 57 | [dataset] 58 | (-> dataset 59 | ;;Convert any numeric or boolean columns to be all of one datatype. 60 | (dsp/remove-columns ["Id"]) 61 | (dsp/->datatype) 62 | (dsp/m= "SalePrice" #(dfn/log1p (dsp/col))) 63 | (ds/set-inference-target "SalePrice"))) 64 | 65 | 66 | (defn more-categorical 67 | [dataset] 68 | (dsp/assoc-metadata dataset ["MSSubClass" "OverallQual" "OverallCond"] :categorical? true)) 69 | 70 | (println "pre-categorical-count" (count (cf/categorical? filtered-ds))) 71 | 72 | (def post-categorical-fix (-> filtered-ds 73 | initial-pipeline-from-article 74 | more-categorical)) 75 | 76 | (println "post-categorical-count" (count (cf/categorical? post-categorical-fix))) 77 | 78 | 79 | ;; Impressive patience to come up with this list!! 
;; Fills missing values column by column with hand-picked defaults, for the
;; features where a median/mean or most-common-value fill would not make sense.
;; Takes a dataset and returns the dataset with those columns filled.
(defn initial-missing-entries
  [dataset]
  (-> dataset
      ;; Handle missing values for features where median/mean or most common value doesn't
      ;; make sense

      ;; Alley : data description says NA means "no alley access"
      (dsp/replace-missing "Alley" "None")
      ;; Numeric columns where NA is replaced with 0.
      ;; BedroomAbvGr : NA most likely means 0
      (dsp/replace-missing ["BedroomAbvGr"
                            "BsmtFullBath"
                            "BsmtHalfBath"
                            "BsmtUnfSF"
                            ;; EnclosedPorch : NA most likely means no enclosed porch
                            "EnclosedPorch"
                            "Fireplaces"
                            "GarageArea"
                            "GarageCars"
                            "HalfBath"
                            ;; KitchenAbvGr : NA most likely means 0
                            "KitchenAbvGr"
                            "LotFrontage"
                            "MasVnrArea"
                            "MiscVal"
                            ;; OpenPorchSF : NA most likely means no open porch
                            "OpenPorchSF"
                            "PoolArea"
                            ;; ScreenPorch : NA most likely means no screen porch
                            "ScreenPorch"
                            ;; TotRmsAbvGrd : NA most likely means 0
                            "TotRmsAbvGrd"
                            ;; WoodDeckSF : NA most likely means no wood deck
                            "WoodDeckSF"
                            ]
                           0)
      ;; String columns where NA is replaced with "No".
      ;; BsmtQual etc : data description says NA for basement features is "no basement"
      (dsp/replace-missing ["BsmtQual"
                            "BsmtCond"
                            "BsmtExposure"
                            "BsmtFinType1"
                            "BsmtFinType2"
                            ;; Fence : data description says NA means "no fence"
                            "Fence"
                            ;; FireplaceQu : data description says NA means "no
                            ;; fireplace"

                            "FireplaceQu"
                            ;; GarageType etc : data description says NA for garage
                            ;; features is "no garage"
                            "GarageType"
                            "GarageFinish"
                            "GarageQual"
                            "GarageCond"
                            ;; MiscFeature : data description says NA means "no misc
                            ;; feature"
                            "MiscFeature"
                            ;; PoolQC : data description says NA means "no pool"
                            "PoolQC"
                            ]
                           "No")
      (dsp/replace-missing "CentralAir" "N")
      ;; Condition : NA most likely means Normal
      (dsp/replace-missing ["Condition1"
                            "Condition2"]
                           "Norm")
      ;; External stuff : NA most likely means average
      (dsp/replace-missing ["ExterCond"
                            "ExterQual"
                            ;; HeatingQC : NA most likely means typical
                            "HeatingQC"
                            ;; KitchenQual : NA most likely means typical
                            "KitchenQual"
                            ]
                           "TA")
      ;; Functional : data description says NA means typical
      (dsp/replace-missing "Functional" "Typ")
      ;; LotShape : NA most likely means regular
      (dsp/replace-missing "LotShape" "Reg")
      ;; MasVnrType : NA most likely means no veneer
      (dsp/replace-missing "MasVnrType" "None")
      ;; PavedDrive : NA most likely means not paved
      (dsp/replace-missing "PavedDrive" "N")
      (dsp/replace-missing "SaleCondition" "Normal")
      (dsp/replace-missing "Utilities" "AllPub")))

;; Show which columns still have missing values before and after the fill.
(println "pre missing fix #1")
(pp/pprint (ds/columns-with-missing-seq post-categorical-fix))

(def post-missing (initial-missing-entries post-categorical-fix))

(println "post missing fix #1")

(pp/pprint (ds/columns-with-missing-seq post-missing))


;; Column name -> {string value -> integer code} for the string columns that are
;; given an explicit numeric encoding.  The "No" / "None" keys match the fill
;; values written by initial-missing-entries.
(def str->number-initial-map
  {
   "Alley" {"Grvl" 1 "Pave" 2 "None" 0}
   "BsmtCond" {"No" 0 "Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "BsmtExposure" {"No" 0 "Mn" 1 "Av" 2 "Gd" 3}
   "BsmtFinType1" {"No" 0 "Unf" 1 "LwQ" 2 "Rec" 3 "BLQ" 4
                   "ALQ" 5 "GLQ" 6}
   "BsmtFinType2" {"No" 0 "Unf" 1 "LwQ" 2 "Rec" 3 "BLQ" 4
                   "ALQ" 5 "GLQ" 6}
   "BsmtQual" {"No" 0 "Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "ExterCond" {"Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "ExterQual" {"Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "FireplaceQu" {"No" 0 "Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "Functional" {"Sal" 1 "Sev" 2 "Maj2" 3 "Maj1" 4 "Mod" 5
                 "Min2" 6 "Min1" 7 "Typ" 8}
   "GarageCond" {"No" 0 "Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "GarageQual" {"No" 0 "Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "HeatingQC" {"Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "KitchenQual" {"Po" 1 "Fa" 2 "TA" 3 "Gd" 4 "Ex" 5}
   "LandSlope" {"Sev" 1 "Mod" 2 "Gtl" 3}
   "LotShape" {"IR3" 1 "IR2" 2 "IR1" 3 "Reg" 4}
   "PavedDrive" {"N" 0 "P" 1 "Y" 2}
   "PoolQC" {"No" 0 "Fa" 1 "TA" 2 "Gd" 3 "Ex" 4}
   "Street" {"Grvl" 1 "Pave" 2}
   "Utilities" {"ELO" 1 "NoSeWa" 2 "NoSewr" 3 "AllPub" 4}
   })


;; Applies dsp/string->number once per entry of str->number-initial-map.  Each
;; map entry is a [column-name value-map] pair, which `apply` spreads into the
;; call as (dsp/string->number dataset column-name value-map).
(defn str->number-pipeline
  [dataset]
  (->> str->number-initial-map
       (reduce (fn [dataset str-num-entry]
                 (apply dsp/string->number dataset str-num-entry))
               dataset)))

(def str-num-dataset (str->number-pipeline post-missing))

(pp/pprint (ds/dataset-label-map str-num-dataset))


;; New column name -> {source column name -> {source value -> simplified value}}.
;; Each inner map has exactly one entry; `simplifications` reads it with `first`.
(def replace-maps
  {
   ;; Create new features
   ;; 1* Simplifications of existing features
   ;; The author implicitly leaves values at zero to be zero, so these maps
   ;; are intentionally incomplete
   "SimplOverallQual" {"OverallQual" {1 1, 2 1, 3 1, ;; bad
                                      4 2, 5 2, 6 2, ;; average
                                      7 3, 8 3, 9 3, 10 3 ;; good
                                      }}
   "SimplOverallCond" {"OverallCond" {1 1, 2 1, 3 1, ;; bad
                                      4 2, 5 2, 6 2, ;; average
                                      7 3, 8 3, 9 3, 10 3 ;; good
                                      }}
   "SimplPoolQC" {"PoolQC" {1 1, 2 1, ;; average
                            3 2, 4 2 ;; good
                            }}
   "SimplGarageCond" {"GarageCond" {1 1, ;; bad
                                    2 1, 3 1, ;; average
                                    4 2, 5 2 ;; good
                                    }}
   "SimplGarageQual" {"GarageQual" {1 1, ;; bad
                                    2 1, 3 1, ;; average
                                    4 2, 5 2 ;; good
                                    }}
   "SimplFireplaceQu" {"FireplaceQu" {1 1, ;; bad
                                      2 1, 3 1, ;; average
                                      4 2, 5 2 ;; good
                                      }}
   "SimplFunctional" {"Functional" {1 1, 2 1, ;; bad
                                    3 2, 4 2, ;; major
                                    5 3, 6 3, 7 3, ;; minor
                                    8 4 ;; typical
                                    }}
   "SimplKitchenQual" {"KitchenQual" {1 1, ;; bad
                                      2 1, 3 1, ;; average
                                      4 2, 5 2 ;; good
                                      }}
   "SimplHeatingQC" {"HeatingQC" {1 1, ;; bad
                                  2 1, 3 1, ;; average
                                  4 2, 5 2 ;; good
                                  }}
   "SimplBsmtFinType1" {"BsmtFinType1" {1 1, ;; unfinished
                                        2 1, 3 1, ;; rec room
                                        4 2, 5 2, 6 2 ;; living quarters
                                        }}
   "SimplBsmtFinType2" {"BsmtFinType2" {1 1, ;; unfinished
                                        2 1, 3 1, ;; rec room
                                        4 2, 5 2, 6 2 ;; living quarters
                                        }}
   "SimplBsmtCond" {"BsmtCond" {1 1, ;; bad
                                2 1, 3 1, ;; average
                                4 2, 5 2 ;; good
                                }}
   "SimplBsmtQual" {"BsmtQual" {1 1, ;; bad
                                2 1, 3 1, ;; average
                                4 2, 5 2 ;; good
                                }}
   "SimplExterCond" {"ExterCond" {1 1, ;; bad
                                  2 1, 3 1, ;; average
                                  4 2, 5 2 ;; good
                                  }}
   "SimplExterQual" {"ExterQual" {1 1, ;; bad
                                  2 1, 3 1, ;; average
                                  4 2, 5 2 ;; good
                                  }}}) 


;; Adds one "Simpl*" column per entry of replace-maps, computed by passing the
;; source column through dsp/int-map with that entry's value map.
;; NOTE(review): `:not-strict? true` presumably lets values absent from the map
;; (such as 0) pass through unchanged -- confirm against dsp/int-map.
(defn simplifications
  [dataset]
  (->> replace-maps
       (reduce (fn [dataset [target-name coldata-map]]
                 ;; coldata-map has a single [source-column value-map] entry.
                 (let [[col-name replace-data] (first coldata-map)]
                   (dsp/m= dataset target-name
                           #(dsp/int-map replace-data (dsp/col col-name)
                                         :not-strict? true))))
               dataset)))

(def replace-dataset (simplifications str-num-dataset))

;; Spot check: distinct values of a source column and of its simplified column.
(pp/pprint (-> (ds/column str-num-dataset "KitchenQual")
               (ds-col/unique)))

(pp/pprint (-> (ds/column replace-dataset "SimplKitchenQual")
               (ds-col/unique)))


;; Adds the article's combined features: products of related quality/size
;; columns, sums of bathroom and area columns, and a "HasMasVnr" flag.
;; Takes a dataset and returns the dataset with the extra columns.
(defn linear-combinations
  [dataset]
  (-> dataset
      ;; Overall quality of the house
      (dsp/m= "OverallGrade" #(dfn/* (col "OverallQual") (col "OverallCond")))
      ;; Overall quality of the garage
      (dsp/m= "GarageGrade" #(dfn/* (col "GarageQual") (col "GarageCond")))
      ;; Overall quality of the exterior
      (dsp/m= "ExterGrade"#(dfn/* (col "ExterQual") (col "ExterCond")))
      ;; Overall kitchen score
      (dsp/m= "KitchenScore" #(dfn/* (col "KitchenAbvGr") (col "KitchenQual")))
      ;; Overall fireplace score
      (dsp/m= "FireplaceScore" #(dfn/* (col "Fireplaces") (col "FireplaceQu")))
      ;; Overall garage score
      (dsp/m= "GarageScore" #(dfn/* (col "GarageArea") (col "GarageQual")))
      ;; Overall pool score
      (dsp/m= "PoolScore" #(dfn/* (col "PoolArea") (col "PoolQC")))
      ;; Simplified overall quality of the house
      (dsp/m= "SimplOverallGrade" #(dfn/* (col "SimplOverallQual")
                                          (col "SimplOverallCond")))
      ;; Simplified overall quality of the exterior
      (dsp/m= "SimplExterGrade" #(dfn/* (col "SimplExterQual") (col "SimplExterCond")))
      ;; Simplified overall pool score
      (dsp/m= "SimplPoolScore" #(dfn/* (col "PoolArea") (col "SimplPoolQC")))
      ;; Simplified overall garage score
      (dsp/m= "SimplGarageScore" #(dfn/* (col "GarageArea") (col "SimplGarageQual")))
      ;; Simplified overall fireplace score
      (dsp/m= "SimplFireplaceScore" #(dfn/* (col "Fireplaces") (col "SimplFireplaceQu")))
      ;; Simplified overall kitchen score
      (dsp/m= "SimplKitchenScore" #(dfn/* (col "KitchenAbvGr" )
                                          (col "SimplKitchenQual")))
      ;; Total number of bathrooms (half baths count as 0.5)
      (dsp/m= "TotalBath" #(dfn/+ (col "BsmtFullBath") (dfn/* 0.5 (col "BsmtHalfBath"))
                                  (col "FullBath") (dfn/* 0.5 (col "HalfBath"))))
      ;; Total SF for house (incl. basement)
      (dsp/m= "AllSF" #(dfn/+ (col "GrLivArea") (col "TotalBsmtSF")))
      ;; Total SF for 1st + 2nd floors
      (dsp/m= "AllFlrsSF" #(dfn/+ (col "1stFlrSF") (col "2ndFlrSF")))
      ;; Total SF for porch
      (dsp/m= "AllPorchSF" #(dfn/+ (col "OpenPorchSF") (col "EnclosedPorch")
                                   (col "3SsnPorch") (col "ScreenPorch")))
      ;; Encode MasVnrType
      ;; NOTE(review): the HasMasVnr test below compares against 0, so this
      ;; presumably numbers the values by position with "None" -> 0 -- confirm.
      (dsp/string->number "MasVnrType" ["None" "BrkCmn" "BrkFace" "CBlock" "Stone"])
      (dsp/m= "HasMasVnr" #(dfn/not-eq (col "MasVnrType") 0))))


(def linear-combined-ds (linear-combinations replace-dataset))



;; Spot checks: print the first ten rows of two derived columns next to the
;; columns they were computed from.
(let [print-columns ["TotalBath" "BsmtFullBath" "BsmtHalfBath"
                     "FullBath" "HalfBath"]]
  (println (ds/select linear-combined-ds print-columns (range 10))))

(let [print-columns ["AllSF" "GrLivArea" "TotalBsmtSF"]]
  (println (ds/select linear-combined-ds print-columns (range 10))))


;; Correlations with SalePrice copied from the reference article, as a sequence
;; of [column-name correlation] pairs sorted by correlation, highest first.
(def article-correlations
  ;;Default for pandas is pearson.
  ;; Find most important features relative to target
  (->>
   {"SalePrice" 1.000
    "OverallQual" 0.819
    "AllSF" 0.817
    "AllFlrsSF" 0.729
    "GrLivArea" 0.719
    "SimplOverallQual" 0.708
    "ExterQual" 0.681
    "GarageCars" 0.680
    "TotalBath" 0.673
    "KitchenQual" 0.667
    "GarageScore" 0.657
    "GarageArea" 0.655
    "TotalBsmtSF" 0.642
    "SimplExterQual" 0.636
    "SimplGarageScore" 0.631
    "BsmtQual" 0.615
    "1stFlrSF" 0.614
    "SimplKitchenQual" 0.610
    "OverallGrade" 0.604
    "SimplBsmtQual" 0.594
    "FullBath" 0.591
    "YearBuilt" 0.589
    "ExterGrade" 0.587
    "YearRemodAdd" 0.569
    "FireplaceQu" 0.547
    "GarageYrBlt" 0.544
    "TotRmsAbvGrd" 0.533
    "SimplOverallGrade" 0.527
    "SimplKitchenScore" 0.523
    "FireplaceScore" 0.518
    "SimplBsmtCond" 0.204
    "BedroomAbvGr" 0.204
    "AllPorchSF" 0.199
    "LotFrontage" 0.174
    "SimplFunctional" 0.137
    "Functional" 0.136
    "ScreenPorch" 0.124
    "SimplBsmtFinType2" 0.105
    "Street" 0.058
    "3SsnPorch" 0.056
    "ExterCond" 0.051
    "PoolArea" 0.041
    "SimplPoolScore" 0.040
    "SimplPoolQC" 0.040
    "PoolScore" 0.040
    "PoolQC" 0.038
    "BsmtFinType2" 0.016
    "Utilities" 0.013
    "BsmtFinSF2" 0.006
    "BsmtHalfBath" -0.015
    "MiscVal" -0.020
    "SimplOverallCond" -0.028
    "YrSold" -0.034
    "OverallCond" -0.037
    "LowQualFinSF" -0.038
    "LandSlope" -0.040
    "SimplExterCond" -0.042
    "KitchenAbvGr" -0.148
    "EnclosedPorch" -0.149
    "LotShape" -0.286
    }
   (sort-by second >)))


;; The "SalePrice" entry of this library's correlation table for the same data.
(def tech-ml-correlations (get (ds/correlation-table linear-combined-ds)
                               "SalePrice"))

;; Side-by-side table of the first 20 entries from the article and from tech.ml.
(pp/print-table (map #(zipmap [:pandas :tech.ml.dataset]
                              [%1 %2])
                     (take 20 article-correlations)
                     (take 20 tech-ml-correlations)))



;; For ten columns taken from `correlation-table`, adds three derived columns
;; each: "<name>-s2" (square), "<name>-s3" (cube) and "<name>-sqrt".
;;   dataset           - dataset to extend
;;   correlation-table - sequence of [column-name correlation] pairs
;; The first entry is skipped and the next ten are used.
;; NOTE(review): this presumably relies on the table being sorted descending so
;; that the skipped entry is SalePrice itself -- confirm against
;; ds/correlation-table.
(defn polynomial-combinations
  [dataset correlation-table]
  (let [correlation-colnames (->> correlation-table
                                  (drop 1)
                                  (take 10)
                                  (map first))]
    (->> correlation-colnames
         (reduce (fn [dataset colname]
                   (-> dataset
                       (dsp/m= (str colname "-s2") #(dfn/pow (col colname) 2))
                       (dsp/m= (str colname "-s3") #(dfn/pow (col colname) 3))
                       (dsp/m= (str colname "-sqrt") #(dfn/sqrt (col colname)))))
                 dataset))))

(def poly-data (polynomial-combinations linear-combined-ds tech-ml-correlations))


(println (ds/select poly-data
                    ["OverallQual"
                     "OverallQual-s2"
                     "OverallQual-s3"
                     "OverallQual-sqrt"]
                    (range 10)))

(def target-column-name "SalePrice")

;; Split the columns: numeric, non-categorical, non-target columns on one side;
;; every remaining non-target column on the other.
(def numerical-features (cf/numeric-and-non-categorical-and-not-target poly-data))
(def categorical-features (with-ds poly-data
                            (cf/and #(cf/not cf/target?)
                                    #(cf/not numerical-features))))


(println "numeric-features" (count numerical-features))

(println "categorical-features" (count categorical-features))

(println "inference targets" (cf/target? poly-data))

;; I printed out the categorical features when using pandas.  This prints the
;; metadata of every column pandas treated as categorical that is missing from
;; categorical-features here.
(pp/pprint (->> (c-set/difference
                 (set ["MSSubClass", "MSZoning", "Alley", "LandContour", "LotConfig",
                       "Neighborhood", "Condition1", "Condition2", "BldgType",
                       "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
                       "Exterior2nd", "MasVnrType", "Foundation", "Heating",
                       "CentralAir",
                       "Electrical", "GarageType", "GarageFinish", "Fence",
                       "MiscFeature",
                       "MoSold", "SaleType", "SaleCondition"])
                 (set categorical-features))
                (map (comp ds-col/metadata (partial ds/column poly-data)))))

;; Catch-all cleanup after the hand-written fills: median for numeric columns,
;; the string "NA" for string columns, then dsp/string->number with no column
;; argument.
;; NOTE(review): the argument-less string->number presumably encodes every
;; remaining string column -- confirm.
(defn fix-all-missing
  [dataset]
  (-> dataset
      ;;Fix any remaining numeric columns by using the median.
      (dsp/replace-missing cf/numeric? #(dfn/median (col)))
      ;;Fix any string columns by using 'NA'.
      (dsp/replace-missing cf/string? "NA")
      (dsp/string->number)))


(def missing-fixed (fix-all-missing poly-data))

(pp/pprint (ds/columns-with-missing-seq missing-fixed))

;; Column filter selecting numeric, non-categorical columns other than
;; "SalePrice" whose absolute skewness is greater than 0.5.
;; `dataset` is optional; whatever is passed goes through cf/check-dataset.
;; NOTE(review): with no argument this presumably falls back to the dataset
;; bound by the enclosing pipeline operation -- confirm.
(defn skew-column-filter
  [& [dataset]]
  (with-ds (cf/check-dataset dataset)
    (cf/and cf/numeric?
            #(cf/not "SalePrice")
            #(cf/not cf/categorical?)
            (fn []
              (cf/> #(dfn/abs (dfn/skewness (col)))
                    0.5)))))

;; Apply log1p to every column selected by skew-column-filter.  The
;; single-argument `->` expands to its argument, so it has no effect.
(def skew-fixed (-> (dsp/m= missing-fixed
                            skew-column-filter
                            #(dfn/log1p (col)))))

(println "Pre-fix skew counts" (count (skew-column-filter missing-fixed)))

(println "Post-fix skew counts" (count (skew-column-filter skew-fixed)))


;; Standard-scaled variants of the two datasets, without and with the skew fix.
(def poly-std-scale-ds (dsp/std-scale missing-fixed))

(def std-scale-ds (dsp/std-scale skew-fixed))



;; Print mean and variance of the first ten numerical features before and
;; after scaling.
(println "Before std-scaler")

(->> (ds/select skew-fixed (take 10 numerical-features) :all)
     (ds/columns)
     (map (fn [col]
            (merge (ds-col/stats col [:mean :variance])
                   {:column-name (ds-col/column-name col)})))
     (ds/->>dataset)
     (println))

(println "\n\nAfter std-scaler")

(->> (ds/select std-scale-ds (take 10 numerical-features) :all)
     (ds/columns)
     (map (fn [col]
            (merge (ds-col/stats col [:mean :variance])
                   {:column-name (ds-col/column-name col)})))
     (ds/->>dataset)
     (println))


;; Hiccup fragment: an <h3> with `title` followed by the accuracy graph of the
;; gridsearch results, with the y axis fixed to [0.10, 0.20].
(defn render-results
  [title gridsearch-results]
  [:div
   [:h3 title]
   (vega-viz/accuracy-graph gridsearch-results :y-scale [0.10, 0.20])])


;; Gridsearches the elastic-net and xgboost regressors on `dataset` with
;; `loss-fn`, prints how many results came back, and returns them as a vector.
;;   dataset-name - accepted for symmetry with train-graph-regressors; not used
;;                  in this body
;;   options      - optional map passed straight to ml-regression/train-regressors
(defn train-regressors
  [dataset-name dataset loss-fn & [options]]
  (let [base-gridsearch-systems [:smile.regression/elasticnetlasso
                                 :xgboost/regression]
        trained-results (ml-regression/train-regressors
                         dataset options
                         :loss-fn loss-fn
                         :gridsearch-regression-systems base-gridsearch-systems)]
    (println "Got" (count trained-results) "Trained results")
    (vec trained-results)))


;; Trains via train-regressors and returns one hiccup [:div ...] holding the
;; accuracy graph followed by, for each model in ascending :average-loss order,
;; a heading "<model-type>-<loss>" with prediction and residual graphs.
(defn train-graph-regressors
  [dataset-name dataset loss-fn & [options]]
  (let [trained-results (train-regressors dataset-name dataset loss-fn options)]
    ;; Each mapped item is a one-element vector, so `apply concat` flattens the
    ;; accuracy graph and the per-model divs into a single sequence of children.
    (->> (apply concat [(render-results dataset-name trained-results)]
                (->> trained-results
                     (sort-by :average-loss)
                     (map (fn [model-result]
                            [[:div
                              [:h3 (format "%s-%.4f"
                                           (get-in model-result [:options :model-type])
                                           (:average-loss model-result))]
                              [:div
                               [:span
                                [:h4 "Predictions"]
                                (vega-viz/graph-regression-verification-results
                                 model-result :target-key :predictions
                                 :y-scale [10 14]
                                 :x-scale [10 14])]
                               [:span
                                [:h4 "Residuals"]
                                (vega-viz/graph-regression-verification-results
                                 model-result :target-key :residuals
                                 :y-scale [10 14]
                                 :x-scale [-1 1])]]]]))))
         (into [:div]))))


;; Train and display all four dataset variants.
;; NOTE(review): the `oz` alias is not among the namespaces required by this
;; file's ns form, so it must be required in the REPL (and oz must be on the
;; classpath) before this form is evaluated -- confirm the intended setup.
(oz/view! [:div
           (train-graph-regressors "Missing" missing-fixed loss/rmse)
           (train-graph-regressors "Skew" skew-fixed loss/rmse)
           (train-graph-regressors "Missing + StdScale" poly-std-scale-ds loss/rmse)
           (train-graph-regressors "Skew + StdScale" std-scale-ds loss/rmse)])



;; The exploration steps above, chained into one function for use inside
;; pipeline-train-context / pipeline-inference-context.
;;   dataset   - source dataset; must contain "SalePrice" when training? is true
;;   training? - when true, the log1p "SalePrice" column is computed up front,
;;               removed while the feature pipeline runs, then added back and
;;               marked as the inference target; when false the dataset is
;;               returned with no target column
(defn data-pipeline
  "Now you have a model and you want to go to production."
  [dataset training?]
  (let [;; Computed inside without-recording; nil when training? is false.
        sale-price-col (when training?
                         (without-recording
                          (-> dataset
                              ;;Sale price is originally an integer
                              (dsp/m= "SalePrice" #(-> (dsp/col)
                                                       (dtype/->reader :float64)
                                                       dfn/log1p))
                              (ds/column "SalePrice"))))

        dataset (if training?
                  (ds/remove-columns dataset ["SalePrice"])
                  dataset)
        dataset
        (-> dataset
            (dsp/remove-columns ["Id"])
            (dsp/->datatype)
            more-categorical
            initial-missing-entries
            str->number-pipeline
            simplifications
            linear-combinations
            ;; The correlation table needs SalePrice, so the column is added
            ;; back temporarily inside the stored-variable function.
            ;; NOTE(review): sale-price-col is nil at inference time; presumably
            ;; store-variables then reads the saved value from the context
            ;; instead of calling this function -- confirm.
            (dsp/store-variables #(hash-map :correlation-table
                                            (-> (ds/add-column % sale-price-col)
                                                (ds/correlation-table)
                                                (get "SalePrice"))))
            (polynomial-combinations (dsp/read-var :correlation-table))
            fix-all-missing
            dsp/std-scale)]
    (if training?
      (-> (ds/add-column dataset sale-price-col)
          (ds/set-inference-target "SalePrice"))
      dataset)))



;; Run the pipeline in training mode; the result map holds both the transformed
;; :dataset and the :context captured while building it.
;; NOTE(review): this uses src-dataset rather than filtered-ds, so the GrLivArea
;; row filter from the exploration above is not applied here -- confirm that is
;; intended.
(def inference-pipeline-data (pipeline-train-context
                              (data-pipeline src-dataset true)))

(def pipeline-train-dataset (:dataset inference-pipeline-data))


(def inference-pipeline-context (:context inference-pipeline-data))


;;At inference time we wouldn't have the saleprice column
(def test-inference-src-dataset (dsp/remove-columns src-dataset ["SalePrice"]))


;;Now we can build the same dataset easily using context built during
;;the training system.  This means any string tables generated or any range
;;k-means, stdscale, etc are all in the context.
(def pipeline-inference-dataset (:dataset
                                 (pipeline-inference-context
                                  inference-pipeline-context
                                  (data-pipeline test-inference-src-dataset false))))


;; Print the same derived columns from the training and inference datasets so
;; the two runs can be compared by eye.
(println (ds/select pipeline-train-dataset ["OverallQual"
                                            "OverallQual-s2"
                                            "OverallQual-s3"
                                            "OverallQual-sqrt"]
                    (range 10)))


(println (ds/select pipeline-inference-dataset ["OverallQual"
                                                "OverallQual-s2"
                                                "OverallQual-s3"
                                                "OverallQual-sqrt"]
                    (range 10)))
)
--------------------------------------------------------------------------------