├── Lecture 1 - Linear Regression and Gradient Descent
│   ├── LinReg_and_Gradient_Descent.pdf
│   ├── auto-mpg.data
│   ├── gradient.py
│   └── linreg-sklearn.py
├── Lecture 2 - Gradient Descent and Normal Equations
│   ├── Gradient_Descent_and_Normal_Equations.pdf
│   ├── descent-normal-autompg.py
│   ├── error_surface.py
│   ├── gradient.py
│   └── linreg-normal_equations.py
├── Lecture 3 - Curve Fitting and Model Validation
│   ├── Lecture3.pdf
│   ├── README.md
│   ├── polyfit-auto-mpg-cv.py
│   ├── polyfit-auto-mpg-t-test.py
│   ├── polyfit-auto-mpg.py
│   ├── polyfit-generalisation.py
│   ├── polyfit.py
│   ├── residuals-auto-mpg.py
│   ├── residuals-random.py
│   └── residuals-vs-fitted.py
├── Lecture 4 - Decision Trees
│   ├── Decision_Trees.pdf
│   ├── README.md
│   ├── auto-mpg-modified.data
│   ├── dt-credit.py
│   ├── entropy.py
│   ├── overfit_demo.py
│   └── scikit-dt-auto-mpg.py
├── Lecture 5 - Probabilities and Logistic Regression
│   ├── Probabilities_and_Logistic_Regression.pdf
│   ├── README.md
│   ├── auto-mpg.data
│   ├── linreg-normal_equations.py
│   ├── logreg-hp-origin.py
│   ├── logreg_gradient.py
│   ├── logreg_gradient_2_variables.py
│   └── logreg_gradient_2_variables_iris.py
├── Lecture 6 - Naive Bayes
│   ├── Naive_Bayes.pdf
│   ├── README.md
│   ├── auto-mpg.data
│   ├── gender_height_weight.csv
│   ├── naive_bayes_autompg.py
│   └── naive_bayes_mf.py
├── Lecture 7 - Text Classification
│   ├── README.md
│   ├── data
│   │   └── SMSSpamCollection
│   ├── predict.py
│   └── transform.py
└── README.md

/Lecture 1 - Linear Regression and Gradient Descent/LinReg_and_Gradient_Descent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 1 - Linear Regression and Gradient Descent/LinReg_and_Gradient_Descent.pdf
--------------------------------------------------------------------------------
/Lecture 1 - Linear Regression and Gradient Descent/auto-mpg.data:
--------------------------------------------------------------------------------
1 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" 2 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" 3 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" 4 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" 5 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" 6 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500" 7 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala" 8 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii" 9 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina" 10 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl" 11 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se" 12 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340" 13 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo" 14 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)" 15 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii" 16 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster" 17 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet" 18 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick" 19 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510" 20 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan" 21 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504" 22 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls" 23 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e" 24 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002" 25 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin" 26 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250" 27 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20" 28 | 11.0 8 318.0 210.0 4382.
13.5 70 1 "dodge d200" 29 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d" 30 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510" 31 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300" 32 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona" 33 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto" 34 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin" 35 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom" 36 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu" 37 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500" 38 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador" 39 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala" 40 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham" 41 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500" 42 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii" 43 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)" 44 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)" 45 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)" 46 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)" 47 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)" 48 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird" 49 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang" 50 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000" 51 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900" 52 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304" 53 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b" 54 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200" 55 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200" 56 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111" 57 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket" 58 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop" 59 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop" 60 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3" 61 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega" 62 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout" 63 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala" 64 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina" 65 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii" 66 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500" 67 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst" 68 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis" 69 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom" 70 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale" 71 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal" 72 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe" 73 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)" 74 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)" 75 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)" 76 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)" 77 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)" 78 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)" 79 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)" 80 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)" 81 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)" 82 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)" 83 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)" 84 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)" 85 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)" 86 | 13.0 8 350.0 175.0 4100. 
13.0 73 1 "buick century 350" 87 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador" 88 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu" 89 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino" 90 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom" 91 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham" 92 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic" 93 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd" 94 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan" 95 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham" 96 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom" 97 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham" 98 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant" 99 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom" 100 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet" 101 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick" 102 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster" 103 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle" 104 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala" 105 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country" 106 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb" 107 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser" 108 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin" 109 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina" 110 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega" 111 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610" 112 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3" 113 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto" 114 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6" 115 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe" 116 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s" 117 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix" 118 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128" 119 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta" 120 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls" 121 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea" 122 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom" 123 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le" 124 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii" 125 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega" 126 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster" 127 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick" 128 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet" 129 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova" 130 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210" 131 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto" 132 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200" 133 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega" 134 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic" 135 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador" 136 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring" 137 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino" 138 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)" 139 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)" 140 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)" 141 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)" 142 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox" 143 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher" 144 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta" 145 | 31.0 4 76.00 52.00 1649. 
16.5 74 3 "toyota corona" 146 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710" 147 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt" 148 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128" 149 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc" 150 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic" 151 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru" 152 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9" 153 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom" 154 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova" 155 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch" 156 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick" 157 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina" 158 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air" 159 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury" 160 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd" 161 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century" 162 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu" 163 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador" 164 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury" 165 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk" 166 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2" 167 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii" 168 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla" 169 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto" 170 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin" 171 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro" 172 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona" 173 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher" 174 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710" 175 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto" 176 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit" 177 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer" 178 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls" 179 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504" 180 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl" 181 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le" 182 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc" 183 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131" 184 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900" 185 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii" 186 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt" 187 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl" 188 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic" 189 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham" 190 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador" 191 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino" 192 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant" 193 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova" 194 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick" 195 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet" 196 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette" 197 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody" 198 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit" 199 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic" 200 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se" 201 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia" 202 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj" 203 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l" 204 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit" 205 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210" 206 | 28.0 4 97.00 75.00 2155. 
16.4 76 3 "toyota corolla" 207 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto" 208 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245" 209 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8" 210 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504" 211 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii" 212 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s" 213 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville" 214 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10" 215 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108" 216 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100" 217 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc" 218 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe" 219 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl" 220 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs" 221 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback" 222 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic" 223 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme" 224 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham" 225 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham" 226 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours" 227 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark" 228 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom" 229 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada" 230 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj" 231 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau" 232 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba" 233 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird" 234 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom" 235 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe" 236 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback" 237 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2" 238 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette" 239 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m" 240 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl" 241 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher" 242 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810" 243 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i" 244 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4" 245 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel" 246 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta" 247 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe" 248 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx" 249 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc" 250 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham" 251 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat" 252 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia" 253 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj" 254 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu" 255 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)" 256 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)" 257 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare" 258 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord" 259 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special" 260 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr" 261 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen" 262 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l" 263 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau" 264 | 17.7 6 231.0 165.0 3445. 
13.4 78 1 "buick regal sport coupe (turbo)" 265 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura" 266 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe" 267 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette" 268 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona" 269 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510" 270 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni" 271 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback" 272 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo" 273 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx" 274 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx" 275 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000" 276 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl" 277 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle" 278 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl" 279 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco" 280 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx" 281 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6" 282 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6" 283 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4" 284 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6" 285 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6" 286 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic" 287 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau" 288 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis" 289 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis" 290 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)" 291 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)" 292 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)" 293 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)" 294 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom" 295 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe" 296 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom" 297 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl" 298 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d" 299 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado" 300 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504" 301 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham" 302 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon" 303 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3" 304 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210" 305 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom" 306 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited" 307 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation" 308 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham" 309 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix" 310 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit" 311 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel" 312 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette" 313 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310" 314 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation" 315 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont" 316 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord" 317 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen" 318 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000" 319 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback" 320 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626" 321 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback" 322 | 32.2 4 108.0 75.00 2265. 
15.2 80 3 "toyota corolla" 323 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc" 324 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt" 325 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210" 326 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)" 327 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)" 328 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)" 329 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d" 330 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl" 331 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe" 332 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl" 333 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit" 334 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx" 335 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs" 336 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe" 337 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra" 338 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord" 339 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant" 340 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark" 341 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)" 342 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation" 343 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant" 344 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet" 345 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ" 346 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300" 347 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru" 348 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg" 349 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel" 350 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4" 351 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4" 352 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w" 353 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h" 354 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta" 355 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i" 356 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude" 357 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla" 358 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx" 359 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626" 360 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel" 361 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel" 362 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida" 363 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima" 364 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century" 365 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls" 366 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl" 367 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon" 368 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier" 369 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon" 370 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door" 371 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback" 372 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se" 373 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix" 374 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura" 375 | 23.0 4 151.0 ? 3035. 20.5 82 1 "amc concord dl" 376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l" 377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l" 378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom" 379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser" 380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l" 381 | 36.0 4 120.0 88.00 2160. 
14.5 82 3 "nissan stanza xe" 382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord" 383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla" 384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic" 385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)" 386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx" 387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited" 388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)" 389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion" 390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l" 391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt" 392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2" 393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro" 394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl" 395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup" 396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage" 397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger" 398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10" 399 |
--------------------------------------------------------------------------------
/Lecture 1 - Linear Regression and Gradient Descent/gradient.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt

import numpy as np

def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
    """
    Generates a data set of (x,y) pairs with an underlying regularity. y is a
    function of x in the form of

    y = f(x) + e

    Where f(x) is specified by the *func* argument and e is a random Gaussian
    noise specified by *mu* and *sigma*.
    """

    # Generate x
    x = np.arange(start, stop, step)

    # Generate random noise
    e = np.random.normal(mu, sigma, x.size)

    # Generate y values as y = func(x) + e
    y = np.zeros(x.size)

    for ind in range(x.size):
        y[ind] = func(x[ind]) + e[ind]

    return (x,y)

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """

    return x.dot(w)

def gradient_descent(x, y, w, max_iter, alpha = 0.05):
    """
    Performs gradient descent to optimise w.

    Keyword arguments:

    *x* : Numpy array
        matrix of independent variables

    *y* : Numpy array
        columnar vector of target values

    *w* : Numpy array
        initial model parameters

    *max_iter* : int
        maximum number of iterations

    *alpha* : float, optional
        learning rate (defaults to 0.05)

    Returns:

    *J_hist* : Numpy array
        values of J(w) at each iteration

    *w* : Numpy array
        estimated model parameters
    """

    N = y.shape[0]

    J_hist = np.zeros(max_iter)

    print("\nGradient descent starts\n")

    for i in range(0, max_iter):

        J = np.sum( (y_hat(x, w) - y) ** 2 ) / (2 * N)

        J_hist[i] = J

        print("Iteration %d, J(w): %f\n" % (i, J))

        # Gradient of the cost function: X^T (Xw - y) / N
        gradient = np.dot(x.T, y_hat(x, w) - y) / N

        w = w - alpha * gradient

    print("Gradient descent finished.\n")

    return (J_hist, w)

def main():

    # Initialise the data set

    np.random.seed(100)

    (x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x))

    x = np.array([x]).T
    y = np.array([y]).T

    # Add ones for w_0
    mat_ones = np.ones(shape=(x.shape[0], 2))
    mat_ones[:,1] = x[:,0]
    x = mat_ones

    # Print the X and y
    print("X:")
    print(x)

    print("\nY:")
    print(y)

    m,n = np.shape(x)

    # Initialise w with ones
    w = np.array([np.ones(n)]).T

    # Perform gradient descent
    (j_hist, w) = gradient_descent(x, y, w, 10)

    print("Model parameters:\n")
    print(w)

    # Plot X and y
    f, (ax1,ax2) = plt.subplots(1, 2, figsize=(7,7))
    ax1.scatter(x[:,1], y)

    # Plot the regression line
    ax1.plot(x[:,1], y_hat(x, w), color='r')
    ax1.grid(True)

    # Plot the change of J(w)
    x = np.arange(1,j_hist.size + 1)
    y = j_hist

    ax2.plot(x, j_hist)
    ax2.grid(True)

    plt.show()

if __name__ == "__main__":
    main()
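
The update implemented by gradient_descent() above is w <- w - alpha * grad J(w), with grad J(w) = X^T (Xw - y) / N. Because J is quadratic, a central finite-difference approximation should agree with the analytic gradient to machine precision. A minimal self-contained sketch (not part of the lecture code) that checks this on a toy problem:

import numpy as np

# Toy problem: 5 points, intercept + slope
X = np.hstack((np.ones((5, 1)), np.arange(5.0).reshape(5, 1)))
y = np.array([[1.0], [3.0], [5.0], [7.0], [9.0]])
w = np.array([[0.5], [0.5]])
N = y.shape[0]

J = lambda w: np.sum((X.dot(w) - y) ** 2) / (2 * N)

# Analytic gradient, exactly as in gradient_descent()
analytic = X.T.dot(X.dot(w) - y) / N

# Central finite differences
eps = 1e-6
numeric = np.zeros_like(w)
for i in range(w.size):
    d = np.zeros_like(w)
    d[i] = eps
    numeric[i] = (J(w + d) - J(w - d)) / (2 * eps)

print(np.allclose(analytic, numeric))   # expect True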
--------------------------------------------------------------------------------
/Lecture 1 - Linear Regression and Gradient Descent/linreg-sklearn.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import numpy as np

from sklearn import linear_model

def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
    """
    Generates a data set of (x,y) pairs with an underlying regularity. y is a
    function of x in the form of

    y = f(x) + e

    Where f(x) is specified by the *func* argument and e is a random Gaussian
    noise specified by *mu* and *sigma*.
    """

    # Generate x
    x = np.arange(start, stop, step)

    # Generate random noise
    e = np.random.normal(mu, sigma, x.size)

    # Generate y values as y = func(x) + e
    y = np.zeros(x.size)

    for ind in range(x.size):
        y[ind] = func(x[ind]) + e[ind]

    return (x,y)

def main():

    # Initialise the data set
    np.random.seed(100)

    (x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x))

    x = np.array([x]).T
    y = np.array([y]).T

    # Fit a scikit-learn linear model
    regr = linear_model.LinearRegression()

    regr.fit(x, y)

    # Print model parameters
    print("Model parameters:\n")
    print(regr.intercept_)
    print(regr.coef_)

if __name__ == "__main__":
    main()
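
scikit-learn's LinearRegression solves the same ordinary least squares problem, so its intercept_ and coef_ can be cross-checked against numpy's least-squares solver. A short sketch that regenerates the same synthetic data (same seed and same recipe as corr_vars above):

import numpy as np

np.random.seed(100)
x = np.arange(1, 10)
y = 4 * np.log2(x) + np.random.normal(0, 2, x.size)

# Design matrix with a leading column of ones for the intercept
X = np.hstack((np.ones((x.size, 1)), x.reshape(-1, 1)))

w, *_ = np.linalg.lstsq(X, y, rcond=None)
print(w)   # w[0] should match regr.intercept_ and w[1] regr.coef_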
--------------------------------------------------------------------------------
/Lecture 2 - Gradient Descent and Normal Equations/Gradient_Descent_and_Normal_Equations.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 2 - Gradient Descent and Normal Equations/Gradient_Descent_and_Normal_Equations.pdf
--------------------------------------------------------------------------------
/Lecture 2 - Gradient Descent and Normal Equations/descent-normal-autompg.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt

import numpy as np

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """
    return x.dot(w)

def gradient_descent(x, y, w, max_iter, alpha = 0.05):

    N = y.shape[0]

    J_hist = np.zeros(max_iter)

    print("\nGradient descent starts\n")

    for i in range(0, max_iter):

        J = np.sum( (y_hat(x, w) - y) ** 2 ) / (2 * N)

        J_hist[i] = J

        print("Iteration %d, J(w): %f\n" % (i, J))

        gradient = np.dot(x.T, y_hat(x, w) - y) / N

        w = w - alpha * gradient

    print("Gradient descent finished.\n")

    return (J_hist, w)

def main():

    np.random.seed(100)

    # Load the data set
    # We use Auto MPG from UCI Machine Learning Repository
    # https://archive.ics.uci.edu/ml/datasets/Auto+MPG

    car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))

    # Drop rows with missing values ("?" becomes NaN)
    car_data = car_data[~np.isnan(car_data).any(axis=1)]

    # Assign Horsepower attribute to x and MPG to y
    x = car_data[:,1]
    y = car_data[:,0]

    x = np.array([x]).T
    y = np.array([y]).T

    # Normalize (z-score standardisation)
    x = (x - np.mean(x)) / np.std(x)

    # Add ones for w_0
    x = np.hstack((np.array([np.ones(x.shape[0])]).T, x))

    # Initialise model parameters
    w = np.array([np.zeros(x.shape[1])]).T

    (j_hist, w) = gradient_descent(x, y, w, 20, 0.5)

    print("Gradient Descent Model parameters:\n")
    print(w, '\n')

    # Normal equations method: w = (X^T X)^-1 X^T y
    xTx = np.linalg.inv(x.T.dot(x))
    xTy = x.T.dot(y)
    w = xTx.dot(xTy)

    print("Normal Equations Model parameters:\n")
    print(w)

    f, (ax1,ax2) = plt.subplots(1, 2, figsize=(12,8))
    ax1.scatter(x[:,1], y)
    ax1.plot(x[:,1], y_hat(x, w), color='r')
    ax1.set_title("Horsepower vs MPG")
    ax1.grid(True)

    x = np.arange(1,j_hist.size + 1)
    y = j_hist

    ax2.plot(x, j_hist)
    ax2.set_title("J(w)")
    ax2.grid(True)

    plt.show()

if __name__ == "__main__":
    main()
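
The z-score standardisation above is what lets the relatively large learning rate of 0.5 converge: it puts the columns of X on a comparable scale, so the error surface is far less elongated. One way to see this, sketched below on synthetic values standing in for the raw horsepower column (an assumption for illustration, not the actual data load), is the condition number of X^T X before and after standardisation:

import numpy as np

np.random.seed(0)
hp = np.random.uniform(50, 230, size=(200, 1))   # stand-in for horsepower

def design(col):
    # Prepend the column of ones for w_0
    return np.hstack((np.ones((col.shape[0], 1)), col))

raw = design(hp)
std = design((hp - hp.mean()) / hp.std())

print(np.linalg.cond(raw.T.dot(raw)))   # large: requires a tiny alpha
print(np.linalg.cond(std.T.dot(std)))   # close to 1: alpha = 0.5 is fine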
--------------------------------------------------------------------------------
/Lecture 2 - Gradient Descent and Normal Equations/error_surface.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt

import numpy as np

from mpl_toolkits.mplot3d import Axes3D

def J(x,y,w_0,w_1):
    """
    Sum of squared errors of the model y_hat = w_0 + w_1 * x, averaged over
    the N samples (the scale only affects the z axis of the plot).
    """
    N = y.shape[0]
    J = 0
    for i in range(0, len(x)):
        J = J + ((w_0 + w_1 * x[i]) - y[i] ) ** 2
    return J / N

def main():
    # Initialise the data set

    car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))
    car_data = car_data[~np.isnan(car_data).any(axis=1)]

    # Assign Horsepower attribute to x and MPG to y
    x = car_data[:,1]
    y = car_data[:,0]

    x = np.array([x]).T
    y = np.array([y]).T

    # Normalize
    x = (x - np.mean(x)) / np.std(x)

    # Evaluate J(w_0, w_1) over a 50x50 grid of parameter values
    w_0 = np.linspace(-300.0, 300.0, 50)
    w_1 = np.linspace(-300.0, 300.0, 50)

    W0, W1 = np.meshgrid(w_0, w_1)

    E = np.array([J(x, y, w_0, w_1) for w_0, w_1 in zip(np.ravel(W0), np.ravel(W1))])

    E = E.reshape(W0.shape)

    fig = plt.figure(figsize=(7,6))
    ax = fig.add_subplot(111, projection='3d')

    ax.plot_surface(W0, W1, E, rstride=1, cstride=1, color='b', alpha=0.5)

    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticklabels([])

    ax.set_xlabel('$w_0$', fontsize=16)
    ax.set_ylabel('$w_1$', fontsize=16)
    ax.set_zlabel('$J(w_0, w_1)$', fontsize=16)

    plt.show()

    '''
    # Optional: contour plot of the same error surface
    plt.figure()
    CS = plt.contour(W0, W1, E)
    plt.clabel(CS, inline=1, fontsize=10)
    plt.title('Error surface contours')
    '''

if __name__ == "__main__":
    main()
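
The bottom of the bowl plotted by error_surface.py coincides with the least-squares solution, so the grid cell with the smallest error should land near the normal-equations parameters (up to the 50x50 grid resolution). A quick check, assuming the W0, W1 and E arrays from main() are available, for example in an interactive session:

import numpy as np

# Assumes W0, W1 and E from error_surface.py are in scope
i, j = np.unravel_index(np.argmin(E), E.shape)
print("Grid minimum near w_0 = %.1f, w_1 = %.1f" % (W0[i, j], W1[i, j]))
# With standardised x the optimum is w_0 = mean(y) and w_1 = cov(x, y) / var(x)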
--------------------------------------------------------------------------------
/Lecture 2 - Gradient Descent and Normal Equations/gradient.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt

import numpy as np

def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
    """
    Generates a data set of (x,y) pairs with an underlying regularity. y is a
    function of x in the form of

    y = f(x) + e

    Where f(x) is specified by the *func* argument and e is a random Gaussian
    noise specified by *mu* and *sigma*.
    """

    # Generate x
    x = np.arange(start, stop, step)

    # Generate random noise
    e = np.random.normal(mu, sigma, x.size)

    # Generate y values as y = func(x) + e
    y = np.zeros(x.size)

    for ind in range(x.size):
        y[ind] = func(x[ind]) + e[ind]

    return (x,y)

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """

    return x.dot(w)

def gradient_descent(x, y, w, max_iter, alpha = 0.001):
    """
    Performs gradient descent to optimise w.

    Keyword arguments:

    *x* : Numpy array
        matrix of independent variables

    *y* : Numpy array
        columnar vector of target values

    *w* : Numpy array
        initial model parameters

    *max_iter* : int
        maximum number of iterations

    *alpha* : float, optional
        learning rate (defaults to 0.001)

    Returns:

    *J_hist* : Numpy array
        values of J(w) at each iteration

    *w* : Numpy array
        estimated model parameters
    """

    N = y.shape[0]

    J_hist = np.zeros(max_iter)

    print("\nGradient descent starts\n")

    for i in range(0, max_iter):

        J = np.sum( (y_hat(x, w) - y) ** 2 ) / (2 * N)

        J_hist[i] = J

        print("Iteration %d, J(w): %f\n" % (i, J))

        gradient = np.dot(x.T, y_hat(x, w) - y) / N

        w = w - alpha * gradient

    print("Gradient descent finished.\n")

    return (J_hist, w)

def main():

    # Initialise the data set

    np.random.seed(100)

    (x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x))

    x = np.array([x]).T
    y = np.array([y]).T

    # Add ones for w_0
    mat_ones = np.ones(shape=(x.shape[0], 2))
    mat_ones[:,1] = x[:,0]
    x = mat_ones

    # Print the X and y
    print("X:")
    print(x)

    print("\nY:")
    print(y)

    m,n = np.shape(x)

    # Initialise w with ones
    w = np.array([np.ones(n)]).T

    # Perform gradient descent with a larger learning rate
    # (watch how J(w) evolves in the right-hand plot)
    (j_hist, w) = gradient_descent(x, y, w, 10, 0.1)

    print("Model parameters:\n")
    print(w)

    # Plot X and y
    f, (ax1,ax2) = plt.subplots(1, 2, figsize=(7,7))
    ax1.scatter(x[:,1], y)

    # Plot the regression line
    ax1.plot(x[:,1], y_hat(x, w), color='r')
    ax1.grid(True)

    # Plot the change of J(w)
    x = np.arange(1,j_hist.size + 1)
    y = j_hist

    ax2.plot(x, j_hist)
    ax2.grid(True)

    plt.show()

if __name__ == "__main__":
    main()
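
The only functional difference from the Lecture 1 copy of this script is the learning rate: the default drops from 0.05 to 0.001, and the call in main() uses 0.1. On this data the largest eigenvalue of X^T X / N is roughly 32, so the descent is only stable for alpha below about 2/32 = 0.06; 0.1 overshoots and J(w) grows. A small self-contained sweep (not part of the lecture code) that makes the sensitivity visible:

import numpy as np

np.random.seed(100)
x = np.arange(1, 10)
y = (4 * np.log2(x) + np.random.normal(0, 2, x.size)).reshape(-1, 1)

X = np.hstack((np.ones((x.size, 1)), x.reshape(-1, 1)))
N = y.shape[0]

for alpha in (0.001, 0.01, 0.05, 0.1):
    w = np.ones((2, 1))
    for _ in range(10):
        w = w - alpha * X.T.dot(X.dot(w) - y) / N
    J = np.sum((X.dot(w) - y) ** 2) / (2 * N)
    print("alpha = %.3f  ->  J(w) after 10 iterations: %g" % (alpha, J))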
--------------------------------------------------------------------------------
/Lecture 2 - Gradient Descent and Normal Equations/linreg-normal_equations.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import numpy as np

def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
    """
    Generates a data set of (x,y) pairs with an underlying regularity. y is a
    function of x in the form of

    y = f(x) + e

    Where f(x) is specified by the *func* argument and e is a random Gaussian
    noise specified by *mu* and *sigma*.
    """

    # Generate x
    x = np.arange(start, stop, step)

    # Generate random noise
    e = np.random.normal(mu, sigma, x.size)

    # Generate y values as y = func(x) + e
    y = np.zeros(x.size)

    for ind in range(x.size):
        y[ind] = func(x[ind]) + e[ind]

    return (x,y)

def main():

    # Initialise the data set
    np.random.seed(100)

    (x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x))

    x = np.array([x]).T
    y = np.array([y]).T

    # Add ones for w_0
    mat_ones = np.ones(shape=(x.shape[0], 2))
    mat_ones[:,1] = x[:,0]
    x = mat_ones

    # Normal equations method: w = (X^T X)^-1 X^T y
    xTx_inv = np.linalg.inv(x.T.dot(x))
    xTy = x.T.dot(y)
    w = xTx_inv.dot(xTy)

    print("Model parameters:\n")
    print(w)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/Lecture3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 3 - Curve Fitting and Model Validation/Lecture3.pdf
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/README.md:
--------------------------------------------------------------------------------
## Curve fitting and cross validation examples

Code examples used in Lecture 3. The scripts were written against scikit-learn 0.17; a compatibility note for newer versions follows this README.

* polyfit.py - Polynomial regression against linear data with Gaussian noise
* polyfit-auto-mpg.py - Polynomial regression against the Auto MPG data set
* polyfit-auto-mpg-cv.py - k-fold Cross Validation for a polynomial regression model using the Auto MPG data set
* polyfit-auto-mpg-t-test.py - T test for a polynomial regression model using the Auto MPG data set
* polyfit-generalisation.py - Hold out validation for a polynomial regression model using the Auto MPG data set
* residuals-auto-mpg.py - Residuals plots for a polynomial regression model using the Auto MPG data set
* residuals-random.py - Residuals histogram for linear data with Gaussian noise
* residuals-vs-fitted.py - Fitted vs. residuals plot for linear data with Gaussian noise

This repository contains materials from the London Machine Learning Study Group Meetups.

The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group).

(C) 2016 Nikolay Manchev, London Machine Learning Study Group

This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0).
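
A compatibility note for the scripts in this lecture: they import from sklearn.cross_validation, the module path in scikit-learn 0.17. That module was renamed to sklearn.model_selection in scikit-learn 0.18 and removed in 0.20, and the KFold constructor changed shape. A minimal sketch of the modern equivalents of the two calls used in this lecture:

import numpy as np
from sklearn.model_selection import KFold, train_test_split

x = np.arange(20).reshape(-1, 1)
y = np.arange(20)

# Old API: KFold(n, n_folds=10, shuffle=True), iterated directly
kf = KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(x):
    pass   # x[train_index], x[test_index], ...

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)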
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/polyfit-auto-mpg-cv.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import numpy as np

# (scikit-learn 0.17 API; in 0.18+ this module is sklearn.model_selection)
from sklearn.cross_validation import KFold


def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """
    return x.dot(w)

def polyMatrix(v, order):
    """
    Given a nx1 vector v, the function generates a matrix of the form:

    [ v[0] v[0]^2 ... v[0]^order ]
    [ v[1] v[1]^2 ... v[1]^order ]
    [ ...                        ]
    [ v[n] v[n]^2 ... v[n]^order ]

    """
    vector = v
    v_pow = 2

    while v_pow <= order:
        v = np.hstack((v, np.power(vector, v_pow)))
        v_pow = v_pow + 1

    return v

# Load the data set
# We use Auto MPG from UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))
car_data = car_data[~np.isnan(car_data).any(axis=1)]

# Assign Horsepower attribute to x and MPG to y
x = car_data[:,1]
y = car_data[:,0]

x = np.array([x]).T
y = np.array([y]).T

# Set the order of the model and generate the X matrix
k = 1
x = polyMatrix(x, k)

# Set the number of folds
folds = 10

# Get the fold indices
kf = KFold(x.shape[0], n_folds = folds, shuffle = True)

# Initialise an array to keep the errors from each iteration
sse = np.zeros(folds)

fold_index = 0

# Perform k-fold cross validation
for train_index, test_index in kf:

    # Get the training and test subsets
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Add ones for w_0
    x_train = np.hstack((np.array([np.ones(x_train.shape[0])]).T, x_train))
    x_test = np.hstack((np.array([np.ones(x_test.shape[0])]).T, x_test))

    # Initialise model parameters
    w = np.array([np.zeros(x_train.shape[1])]).T

    # Normal equations method
    xTx = np.linalg.inv(x_train.T.dot(x_train))
    xTy = x_train.T.dot(y_train)
    w = xTx.dot(xTy)

    # Compute error sum of squares on the held-out fold
    sse[fold_index] = np.sum( (y_hat(x_test, w) - y_test) ** 2)
    print("SSE[%i]: %.2f" % (fold_index, sse[fold_index]))

    fold_index = fold_index + 1

# Print the average error from all folds
print("Average SSE : %.2f" % (np.average(sse)))
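
The same 10-fold estimate can be produced with much less plumbing through scikit-learn's estimator API. A sketch, assuming a modern scikit-learn (LinearRegression adds the intercept itself, so no column of ones is needed):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))
car_data = car_data[~np.isnan(car_data).any(axis=1)]
x = car_data[:, 1].reshape(-1, 1)
y = car_data[:, 0]

# Scores are negated per-sample MSEs rather than per-fold SSEs
scores = cross_val_score(LinearRegression(), x, y, cv=10,
                         scoring="neg_mean_squared_error")
print("Average MSE: %.2f" % -scores.mean())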
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/polyfit-auto-mpg-t-test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt
import numpy as np

from numpy.polynomial.polynomial import polyval
from scipy import stats

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """
    return x.dot(w)

def polyMatrix(v, order):
    """
    Given a nx1 vector v, the function generates a matrix of the form:

    [ v[0] v[0]^2 ... v[0]^order ]
    [ v[1] v[1]^2 ... v[1]^order ]
    [ ...                        ]
    [ v[n] v[n]^2 ... v[n]^order ]

    """
    vector = v
    v_pow = 2

    while v_pow <= order:
        v = np.hstack((v, np.power(vector, v_pow)))
        v_pow = v_pow + 1

    return v

# Load the data set
# We use Auto MPG from UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))
car_data = car_data[~np.isnan(car_data).any(axis=1)]

# Assign Horsepower attribute to x and MPG to y
x = car_data[:,1]
y = car_data[:,0]

x = np.array([x]).T
y = np.array([y]).T

# Set the order of the model and get the X matrix
k = 1
x = polyMatrix(x, k)

# Add ones for w_0
x = np.hstack((np.array([np.ones(x.shape[0])]).T, x))

# Initialise model parameters
w = np.array([np.zeros(x.shape[1])]).T

# Normal equations method
xTx = np.linalg.inv(x.T.dot(x))
xTy = x.T.dot(y)
w = xTx.dot(xTy)

print("Normal Equations Model parameters:\n")
print(w)

# Plot the data points
f, ax1 = plt.subplots(1, 1, figsize=(7,7))
ax1.scatter(x[:,1], y)

# Plot a smooth curve using the fitted coefficients
x_smooth = np.linspace(x[:,1].min(), x[:,1].max(), 200)
y_smooth = np.squeeze(polyval(x_smooth, w))
ax1.plot(x_smooth, y_smooth, color='r')

ax1.set_title('Auto MPG - MPG vs Horsepower')
ax1.grid(True)

# Compute the p-values
print("\n")
print("Individual Regression Coefficients t Test:\n")

# Compute sum of squared errors
sse = np.sum( (y_hat(x, w) - y) ** 2)
print("SSE:", sse)

# Compute the sample variance s^2 (SSE / degrees of freedom)
n = x.shape[0]
sigma = sse / (n - (k+1))
print("sigma:", sigma)

# Covariance matrix of the coefficient estimates
C = sigma * np.linalg.inv(np.dot(x.T, x))

# Standard errors and test statistics for the coefficients
se = np.sqrt(C.diagonal())

t0w = np.zeros(len(se))
for i in range(len(se)):
    t0w[i] = w[i] / se[i]

# Two-sided p-values for H0: w_i = 0
p_values = 2 * (1 - stats.t.cdf(abs(t0w), float(n - (k+1))))

print("P values:")
for p in p_values:
    print('%f' % p)

plt.show()
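
The t statistics and two-sided p-values computed above can be cross-checked against statsmodels, which reports the same quantities in its OLS results. A sketch, assuming statsmodels is installed and that x and y are the arrays built above (x already includes the column of ones, so no add_constant() call is needed):

import statsmodels.api as sm

model = sm.OLS(y, x).fit()
print(model.tvalues)   # should match t0w
print(model.pvalues)   # two-sided p-values for H0: w_i = 0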
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/polyfit-auto-mpg.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt
import numpy as np

from numpy.polynomial.polynomial import polyval

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """
    return x.dot(w)

def polyMatrix(v, order):
    """
    Given a nx1 vector v, the function generates a matrix of the form:

    [ v[0] v[0]^2 ... v[0]^order ]
    [ v[1] v[1]^2 ... v[1]^order ]
    [ ...                        ]
    [ v[n] v[n]^2 ... v[n]^order ]

    """
    vector = v
    v_pow = 2

    while v_pow <= order:
        v = np.hstack((v, np.power(vector, v_pow)))
        v_pow = v_pow + 1

    return v

# Load the data set
# We use Auto MPG from UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3))
car_data = car_data[~np.isnan(car_data).any(axis=1)]

# Assign Horsepower attribute to x and MPG to y
x = car_data[:,1]
y = car_data[:,0]

x = np.array([x]).T
y = np.array([y]).T

# Set the order of the model and get the X matrix
k = 1
x = polyMatrix(x, k)

# Add ones for w_0
x = np.hstack((np.array([np.ones(x.shape[0])]).T, x))

# Initialise model parameters
w = np.array([np.zeros(x.shape[1])]).T

# Normal equations method
xTx = np.linalg.inv(x.T.dot(x))
xTy = x.T.dot(y)
w = xTx.dot(xTy)

print("Normal Equations Model parameters:\n")
print(w)

# Plot the data points
f, ax1 = plt.subplots(1, 1, figsize=(7,7))
ax1.scatter(x[:,1], y)

# Plot a smooth curve using the fitted coefficients
x_smooth = np.linspace(x[:,1].min(), x[:,1].max(), 200)
y_smooth = np.squeeze(polyval(x_smooth, w))
ax1.plot(x_smooth, y_smooth, color='r')

ax1.set_title('Auto MPG - MPG vs Horsepower')
ax1.grid(True)

plt.show()
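
polyMatrix() plus the manually prepended column of ones builds an ordinary Vandermonde design matrix, which numpy can generate in one call. A small equivalence sketch:

import numpy as np

v = np.array([[2.0], [3.0], [4.0]])

# polyMatrix(v, 3) with a ones column prepended...
manual = np.hstack((np.ones((3, 1)), v, v ** 2, v ** 3))

# ...is np.vander with increasing powers
vander = np.vander(v.ravel(), 4, increasing=True)

print(np.array_equal(manual, vander))   # True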
--------------------------------------------------------------------------------
/Lecture 3 - Curve Fitting and Model Validation/polyfit-generalisation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
(C) 2016 Nikolay Manchev, London Machine Learning Study Group

http://www.meetup.com/London-Machine-Learning-Study-Group/

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import matplotlib.pyplot as plt
import numpy as np

# (scikit-learn 0.17 API; in 0.18+ this module is sklearn.model_selection)
from sklearn.cross_validation import train_test_split

def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):

    # Generate x
    x = np.arange(start, stop, step)

    # Generate random noise
    e = np.random.normal(mu, sigma, x.size)

    # Generate y values as y = func(x) + e
    y = np.zeros(x.size)

    for ind in range(x.size):
        y[ind] = func(x[ind]) + e[ind]

    return (x,y)

def y_hat(x, w):
    """
    Linear regression hypothesis: y_hat = X.w
    """
    return x.dot(w)

def polyMatrix(v, order):
    """
    Given a nx1 vector v, the function generates a matrix of the form:

    [ v[0] v[0]^2 ... v[0]^order ]
    [ v[1] v[1]^2 ... v[1]^order ]
    [ ...                        ]
    [ v[n] v[n]^2 ... v[n]^order ]

    """
    vector = v
    v_pow = 2

    while v_pow <= order:
        v = np.hstack((v, np.power(vector, v_pow)))
        v_pow = v_pow + 1

    return v

def trainPolyFit(x, y, order):
    x = polyMatrix(x, order)

    # Add ones for w_0
    x = np.hstack((np.array([np.ones(x.shape[0])]).T, x))

    # Hold out 20% of the data for testing
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

    # Initialise model parameters
    w = np.array([np.zeros(x.shape[1])]).T

    # Normal equations method
    xTx = np.linalg.inv(x_train.T.dot(x_train))
    xTy = x_train.T.dot(y_train)
    w = xTx.dot(xTy)

    # Compute the errors, normalising each by its own sample count
    N_train = y_train.shape[0]
    N_test = y_test.shape[0]

    train_err = np.sum( (y_hat(x_train, w) - y_train) ** 2 ) / (2 * N_train)
    test_err = np.sum( (y_hat(x_test, w) - y_test) ** 2 ) / (2 * N_test)

    return train_err, test_err


np.random.seed(100)

(x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x))

x = np.array([x]).T
y = np.array([y]).T

# Vary the order of the model and compute the
# training and test errors
errors = np.zeros([5,2])
for order in range(1,len(errors) + 1):
    errors[order-1, ] = trainPolyFit(x, y, order)

print("Training and test errors:\n")
print(errors)

# Set up the figure
f, ax1 = plt.subplots(figsize=(7,7))

# Plot the training and test error curves
x_plot = np.arange(1, len(errors)+1)
ax1.plot(x_plot, errors[:,0], color='b', label="Training")
ax1.plot(x_plot, errors[:,1], color='r', label="Test")
ax1.grid(True)

ax1.legend(loc="upper right", shadow=True)

plt.show()
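
The same vary-the-degree experiment can be phrased with scikit-learn's PolynomialFeatures inside a pipeline, which avoids building the design matrix by hand. A sketch under the same synthetic-data assumptions as the script above:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(100)
x = np.arange(1, 10).reshape(-1, 1)
y = 4 * np.log2(x.ravel()) + np.random.normal(0, 2, x.shape[0])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

for order in range(1, 6):
    model = make_pipeline(PolynomialFeatures(order), LinearRegression())
    model.fit(x_train, y_train)
    print(order,
          mean_squared_error(y_train, model.predict(x_train)),
          mean_squared_error(y_test, model.predict(x_test)))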
x[n]^order ] 48 | 49 | """ 50 | vector = x 51 | x_pow = 2 52 | 53 | while x_pow <= order: 54 | x = np.hstack((x, np.power(vector, x_pow))) 55 | x_pow = x_pow + 1 56 | 57 | return x 58 | 59 | np.random.seed(100) 60 | 61 | (x,y) = corr_vars(sigma=2, func=lambda x: 4*np.log2(x)) 62 | 63 | x = np.array([x]).T 64 | y = np.array([y]).T 65 | 66 | # Set the order of the model and get the X matrix 67 | k = 1 68 | x = polyMatrix(x, k) 69 | 70 | # Add ones for w_0 71 | x = np.hstack((np.array([np.ones(x.shape[0])]).T, x)) 72 | 73 | # Initialise model parameters 74 | w = np.array([np.zeros(x.shape[1])]).T 75 | 76 | # Print X and y 77 | print(x,'\n') 78 | print(y,'\n') 79 | 80 | # Normal equations method 81 | xTx = np.linalg.inv(x.T.dot(x)) 82 | xTy = x.T.dot(y) 83 | w = xTx.dot(xTy) 84 | 85 | # Print the model parameters 86 | print("Normal Equations Model parameters:\n") 87 | print(w) 88 | 89 | # Plot the data points 90 | f, ax1 = plt.subplots(1, 1, figsize=(7,7)) 91 | ax1.scatter(x[:,1], y) 92 | 93 | # Plot a smooth curve using the fitted coefficients 94 | x_smooth = np.linspace(x[:,1].min(), x[:,1].max(), 200) 95 | f = np.squeeze(polyval(x_smooth, w)) 96 | ax1.plot(x_smooth, f, color='r') 97 | 98 | ax1.set_title('y = 4*log2(x) + e') 99 | ax1.grid(True) 100 | 101 | -------------------------------------------------------------------------------- /Lecture 3 - Curve Fitting and Model Validation/residuals-auto-mpg.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2016 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 
8 | """ 9 | 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | # Load the data set 15 | # We use Auto MPG from UCI Machine Learning Repository 16 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG 17 | 18 | car_data = np.genfromtxt("auto-mpg.data", usecols=(0, 3)) 19 | car_data = car_data[~np.isnan(car_data).any(axis=1)] 20 | 21 | # Assign Horsepower attribute to x and MPG to y 22 | x = car_data[:,1] 23 | y = car_data[:,0] 24 | 25 | x = np.array([x]).T 26 | y = np.array([y]).T 27 | 28 | # Add ones for w_0 29 | mat_ones = np.ones(shape=(x.shape[0], 2)) 30 | mat_ones[:,1] = x[:,0] 31 | x = mat_ones 32 | 33 | # Normal equations method 34 | xTx_inv = np.linalg.inv(x.T.dot(x)) 35 | xTy = x.T.dot(y) 36 | w = xTx_inv.dot(xTy) 37 | 38 | # Print intercept and slope 39 | print("Model parameters:\n") 40 | print(w) 41 | 42 | # Make predictions on the training set 43 | y_hat = w[0] + w[1]*x[:,1] 44 | 45 | # Get the residuals 46 | y_hat = y_hat.reshape(y_hat.shape[0],-1) 47 | residuals = np.subtract(y_hat, y) 48 | 49 | # Plot a histogram of the residuals 50 | plt.figure(figsize=(10,8)) 51 | n, bins, patches = plt.hist(residuals, 30, facecolor='green', alpha=0.75) 52 | 53 | plt.title("Histogram of Residuals") 54 | plt.grid(True) 55 | plt.show() 56 | 57 | # Plot residuals vs predictions 58 | plt.rcParams.update({'font.size': 15}) 59 | 60 | f, ax = plt.subplots( figsize=(10,8)) 61 | 62 | ax.scatter(y_hat, residuals, s=10) 63 | ax.axhline(0, color='red') 64 | 65 | ax.set_title("Residuals vs fitted") 66 | ax.set_ylabel("Residuals") 67 | ax.set_xlabel("$\hat y$",fontsize=20) 68 | ax.grid(True) 69 | -------------------------------------------------------------------------------- /Lecture 3 - Curve Fitting and Model Validation/residuals-random.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2016 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 
8 | """
9 |
10 | import numpy as np
11 |
12 | import matplotlib.pyplot as plt
13 | from scipy.stats import norm
14 |
15 | def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
16 |
17 | # Generate x
18 | x = np.arange(start, stop, step)
19 |
20 | # Generate random noise
21 | e = np.random.normal(mu, sigma, x.size)
22 |
23 | # Generate y values as y = func(x) + e
24 | y = np.zeros(x.size)
25 |
26 | for ind in range(x.size):
27 | y[ind] = func(x[ind]) + e[ind]
28 |
29 | return (x,y)
30 |
31 | # Populate the data set
32 | np.random.seed(100)
33 |
34 | # Select a linear or non-linear function for the data generation
35 | (x,y) = corr_vars(start=-3, stop=3, sigma=5, step = 0.005, func=lambda x:np.power(x,4) - 3*np.power(x,3) +8*np.power(x,2) + 7*x)
36 | #(x,y) = corr_vars(start=-3, stop=3, sigma=5, step = 0.005, func=lambda x:7+6*x)
37 |
38 | x = np.array([x]).T
39 | y = np.array([y]).T
40 |
41 | # Add ones for w_0
42 | mat_ones = np.ones(shape=(x.size, 2))
43 | mat_ones[:,1] = x[:,0]
44 | x = mat_ones
45 |
46 | # Normal equations method
47 | xTx_inv = np.linalg.inv(x.T.dot(x))
48 | xTy = x.T.dot(y)
49 | w = xTx_inv.dot(xTy)
50 |
51 | print("Model parameters:\n")
52 | print(w)
53 |
54 | # Make predictions on the training set
55 | y_hat = w[0] + w[1]*x[:,1]
56 |
57 | # Get the residuals
58 | y_hat = y_hat.reshape(y_hat.shape[0],-1)
59 | residuals = np.subtract(y_hat, y)
60 |
61 | # Plot the data and the fitted line
62 |
63 | f, (ax1, ax2) = plt.subplots(2, figsize=(10,8))
64 | f.subplots_adjust(hspace=.5)
65 |
66 | ax1.scatter(x[:,1], y, s=10)
67 | ax1.plot(x[:,1], y_hat, color='r')
68 |
69 | ax1.set_title("Data and fitted line")
70 | ax1.set_xlabel("$x$")
71 | ax1.set_ylabel("$y$")
72 | ax1.grid(True)
73 |
74 | # Plot residuals
75 |
76 | n, bins, patches = ax2.hist(residuals, 20, density=True, facecolor='green', alpha=0.75)
77 |
78 | # Plot expected distribution
79 | x_exp = np.linspace(-100, 100)
80 | y_exp = norm.pdf(x_exp, np.mean(residuals), np.std(residuals))
81 | l = ax2.plot(x_exp, y_exp, 'r--', linewidth=1)
82 |
83 | ax2.set_title("Histogram of Residuals")
84 | ax2.grid(True)
85 | ax2.set_xlim([-100,100])
86 |
87 |
88 |
89 |
90 |
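91 | # NOTE (addition, not part of the original lecture script): a quick numerical
92 | # companion to the histogram -- for normally distributed residuals roughly 68%
93 | # should fall within one standard deviation of zero:
94 | within_one_sd = np.mean(np.abs(residuals) <= np.std(residuals))
95 | print("Share of residuals within one std (normal ~ 0.68):", within_one_sd)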
8 | """
9 |
10 | import numpy as np
11 |
12 | import matplotlib.pyplot as plt
13 |
14 | def corr_vars( start=1, stop=10, step=1, mu=0, sigma=3, func=lambda x: x ):
15 |
16 | # Generate x
17 | x = np.arange(start, stop, step)
18 |
19 | # Generate random noise
20 | e = np.random.normal(mu, sigma, x.size)
21 |
22 | # Generate y values as y = func(x) + e
23 | y = np.zeros(x.size)
24 |
25 | for ind in range(x.size):
26 | y[ind] = func(x[ind]) + e[ind]
27 |
28 | return (x,y)
29 |
30 | # Initialise the data set
31 | np.random.seed(100)
32 |
33 | # Generate data using one of the two functions (linear and non-linear)
34 | #(x,y) = corr_vars(start=-3, stop=3, sigma=5, step = 0.005, func=lambda x:np.power(x,4) - 3*np.power(x,3) +8*np.power(x,2) + 7*x)
35 | (x,y) = corr_vars(start=-3, stop=3, sigma=5, step = 0.005, func=lambda x:7+6*x)
36 |
37 | x = np.array([x]).T
38 | y = np.array([y]).T
39 |
40 | # Add ones for w_0
41 | mat_ones = np.ones(shape=(x.size, 2))
42 | mat_ones[:,1] = x[:,0]
43 | x = mat_ones
44 |
45 | # Normal equations method
46 | xTx_inv = np.linalg.inv(x.T.dot(x))
47 | xTy = x.T.dot(y)
48 | w = xTx_inv.dot(xTy)
49 |
50 | print("Model parameters:\n")
51 | print(w)
52 |
53 | # Make predictions on the training set
54 | y_hat = w[0] + w[1]*x[:,1]
55 |
56 | # Get the residuals
57 | y_hat = y_hat.reshape(y_hat.shape[0],-1)
58 | residuals = np.subtract(y_hat, y)
59 |
60 | # Plot residuals vs predictions
61 | plt.rcParams.update({'font.size': 15})
62 |
63 | f, ax = plt.subplots( figsize=(10,8))
64 |
65 | ax.scatter(y_hat, residuals, s=10)
66 | ax.axhline(0, color='red')
67 |
68 | ax.set_title("Residuals vs fitted")
69 | ax.set_ylabel("Residuals")
70 | ax.set_xlabel("$\hat y$",fontsize=20)
71 | ax.grid(True)
72 |
--------------------------------------------------------------------------------
/Lecture 4 - Decision Trees/Decision_Trees.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 4 - Decision Trees/Decision_Trees.pdf
--------------------------------------------------------------------------------
/Lecture 4 - Decision Trees/README.md:
--------------------------------------------------------------------------------
1 | ## Decision Trees
2 |
3 | Code examples used in Lecture 4
4 |
5 | * auto-mpg-modified.data - A modified version of the Auto MPG data set from UCI Machine Learning Repository, with the continuous MPG attribute partitioned as follows:
6 | * [9;19) - BAD
7 | * [19;26] - OK
8 | * (26;47] - GOOD
9 | * entropy.py - Splits a data set by attribute and threshold value and computes the entropy for each split
10 | * dt-credit.py - An implementation of a decision tree algorithm against the credit rating data set
11 | * scikit-dt-auto-mpg.py - A decision tree trained on the modified Auto MPG data set, using DecisionTreeClassifier from scikit-learn
12 | * overfit_demo.py - Accuracy score for training/test subset against the modified Auto MPG data set, using DecisionTreeClassifier from scikit-learn
13 |
14 | This repository contains materials from the London Machine Learning Study Group Meetups
15 |
16 | The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group).
17 |
18 | (C) 2016 Nikolay Manchev, London Machine Learning Study Group
19 |
20 | This work is licensed under the Creative Commons Attribution 4.0 International License.
To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0). 21 | -------------------------------------------------------------------------------- /Lecture 4 - Decision Trees/auto-mpg-modified.data: -------------------------------------------------------------------------------- 1 | 0 8 304 193 4732 18.5 70 1 hi 1200d 2 | 0 8 360 215 4615 14 70 1 ford f250 3 | 0 8 307 200 4376 15 70 1 chevy c20 4 | 0 8 318 210 4382 13.5 70 1 dodge d200 5 | 0 8 429 208 4633 11 72 1 mercury marquis 6 | 0 8 400 150 4997 14 73 1 chevrolet impala 7 | 0 8 350 180 3664 11 73 1 oldsmobile omega 8 | 0 8 383 180 4955 11.5 71 1 dodge monaco (sw) 9 | 0 8 350 160 4456 13.5 72 1 oldsmobile delta 88 royale 10 | 0 8 429 198 4952 11.5 73 1 mercury marquis brougham 11 | 0 8 455 225 4951 11 73 1 buick electra 225 custom 12 | 0 8 400 167 4906 12.5 73 1 ford country 13 | 0 8 350 180 4499 12.5 73 1 oldsmobile vista cruiser 14 | 0 8 400 170 4746 12 71 1 ford country squire (sw) 15 | 0 8 400 175 5140 12 71 1 pontiac safari (sw) 16 | 0 8 350 165 4274 12 72 1 chevrolet impala 17 | 0 8 350 155 4502 13.5 72 1 buick lesabre custom 18 | 0 8 400 190 4422 12.5 72 1 chrysler newport royal 19 | 0 8 307 130 4098 14 72 1 chevrolet chevelle concours (sw) 20 | 0 8 302 140 4294 16 72 1 ford gran torino (sw) 21 | 0 8 350 175 4100 13 73 1 buick century 350 22 | 0 8 350 145 3988 13 73 1 chevrolet malibu 23 | 0 8 400 150 4464 12 73 1 chevrolet caprice classic 24 | 0 8 351 158 4363 13 73 1 ford ltd 25 | 0 8 440 215 4735 11 73 1 chrysler new yorker brougham 26 | 0 8 360 175 3821 11 73 1 amc ambassador brougham 27 | 0 8 360 170 4654 13 73 1 plymouth custom suburb 28 | 0 8 350 150 4699 14.5 74 1 buick century luxus (sw) 29 | 0 8 302 129 3169 12 75 1 ford mustang ii 30 | 0 8 318 150 3940 13.2 76 1 plymouth volare premier v8 31 | 0 8 350 145 4055 12 76 1 chevy c10 32 | 0 8 302 130 3870 15 76 1 ford f108 33 | 0 8 318 150 3755 14 76 1 dodge d100 34 | 0 8 454 220 4354 9 70 1 chevrolet impala 35 | 0 8 440 215 4312 8.5 70 1 plymouth fury iii 36 | 0 8 455 225 4425 10 70 1 pontiac catalina 37 | 0 8 340 160 3609 8 70 1 plymouth 'cuda 340 38 | 0 8 455 225 3086 10 70 1 buick estate wagon (sw) 39 | 0 8 350 165 4209 12 71 1 chevrolet impala 40 | 0 8 400 175 4464 11.5 71 1 pontiac catalina brougham 41 | 0 8 351 153 4154 13.5 71 1 ford galaxie 500 42 | 0 8 318 150 4096 13 71 1 plymouth fury iii 43 | 0 8 400 175 4385 12 72 1 pontiac catalina 44 | 0 8 351 153 4129 13 72 1 ford galaxie 500 45 | 0 8 318 150 4077 14 72 1 plymouth satellite custom (sw) 46 | 0 8 304 150 3672 11.5 73 1 amc matador 47 | 0 8 302 137 4042 14.5 73 1 ford gran torino 48 | 0 8 318 150 4237 14.5 73 1 plymouth fury gran sedan 49 | 0 8 318 150 4457 13.5 74 1 dodge coronet custom (sw) 50 | 0 8 302 140 4638 16 74 1 ford gran torino (sw) 51 | 0 8 304 150 4257 15.5 74 1 amc matador (sw) 52 | 0 8 351 148 4657 13.5 75 1 ford ltd 53 | 0 8 351 152 4215 12.8 76 1 ford gran torino 54 | 0 8 350 165 3693 11.5 70 1 buick skylark 320 55 | 0 8 429 198 4341 10 70 1 ford galaxie 500 56 | 0 8 390 190 3850 8.5 70 1 amc ambassador dpl 57 | 0 8 383 170 3563 10 70 1 dodge challenger se 58 | 0 8 400 150 3761 9.5 70 1 chevrolet monte carlo 59 | 0 8 318 150 4135 13.5 72 1 plymouth fury iii 60 | 0 8 304 150 3892 12.5 72 1 amc matador (sw) 61 | 0 8 318 150 3777 12.5 73 1 dodge coronet custom 62 | 0 8 350 145 4082 13 73 1 chevrolet monte carlo s 63 | 0 8 318 150 3399 11 73 1 dodge dart custom 64 | 0 6 250 100 3336 17 74 1 chevrolet nova 65 | 0 6 250 72 3432 21 75 
1 mercury monarch 66 | 0 6 250 72 3158 19.5 75 1 ford maverick 67 | 0 8 350 145 4440 14 75 1 chevrolet bel air 68 | 0 6 258 110 3730 19 75 1 amc matador 69 | 0 8 302 130 4295 14.9 77 1 mercury cougar brougham 70 | 0 8 304 120 3962 13.9 76 1 amc matador 71 | 0 8 318 145 4140 13.7 77 1 dodge monaco brougham 72 | 0 8 350 170 4165 11.4 77 1 chevrolet monte carlo landau 73 | 0 8 400 190 4325 12.2 77 1 chrysler cordoba 74 | 0 8 351 142 4054 14.3 79 1 ford country squire (sw) 75 | 0 8 304 150 3433 12 70 1 amc rebel sst 76 | 0 6 225 105 3439 15.5 71 1 plymouth satellite custom 77 | 0 6 250 100 3278 18 73 1 chevrolet nova custom 78 | 0 8 400 230 4278 9.5 73 1 pontiac grand prix 79 | 0 6 250 100 3781 17 74 1 chevrolet chevelle malibu classic 80 | 0 6 258 110 3632 18 74 1 amc matador 81 | 0 8 302 140 4141 14 74 1 ford gran torino 82 | 0 8 400 170 4668 11.5 75 1 pontiac catalina 83 | 0 8 318 150 4498 14.5 75 1 plymouth grand fury 84 | 0 6 250 105 3897 18.5 75 1 chevroelt chevelle malibu 85 | 0 8 318 150 4190 13 76 1 dodge coronet brougham 86 | 0 8 400 180 4220 11.1 77 1 pontiac grand prix lj 87 | 0 8 351 149 4335 14.5 77 1 ford thunderbird 88 | 0 6 163 133 3410 15.8 78 2 peugeot 604sl 89 | 0 6 168 120 3820 16.7 76 2 mercedes-benz 280s 90 | 0 8 350 180 4380 12.1 76 1 cadillac seville 91 | 0 8 351 138 3955 13.2 79 1 mercury grand marquis 92 | 0 8 350 155 4360 14.9 79 1 buick estate wagon (sw) 93 | 0 8 302 140 3449 10.5 70 1 ford torino 94 | 0 6 250 100 3329 15.5 71 1 chevrolet chevelle malibu 95 | 0 8 304 150 3672 11.5 72 1 amc ambassador sst 96 | 0 6 231 110 3907 21 75 1 buick century 97 | 0 8 260 110 4060 19 77 1 oldsmobile cutlass supreme 98 | 0 6 163 125 3140 13.6 78 2 volvo 264gl 99 | 0 8 305 130 3840 15.4 79 1 chevrolet caprice classic 100 | 0 8 305 140 4215 13 76 1 chevrolet chevelle malibu classic 101 | 0 6 258 95 3193 17.8 76 1 amc pacer d/l 102 | 0 8 305 145 3880 12.5 77 1 chevrolet caprice classic 103 | 0 6 250 110 3520 16.4 77 1 chevrolet concours 104 | 0 8 318 140 4080 13.7 78 1 dodge magnum xe 105 | 0 8 302 129 3725 13.4 79 1 ford ltd landau 106 | 0 6 225 85 3465 16.6 81 1 chrysler lebaron salon 107 | 0 6 231 165 3445 13.4 78 1 buick regal sport coupe (turbo) 108 | 0 8 307 130 3504 12 70 1 chevrolet chevelle malibu 109 | 0 8 318 150 3436 11 70 1 plymouth satellite 110 | 0 6 199 97 2774 15.5 70 1 amc hornet 111 | 0 6 232 100 3288 15.5 71 1 amc matador 112 | 0 6 258 110 2962 13.5 71 1 amc hornet sportabout (sw) 113 | 0 6 250 88 3139 14.5 71 1 ford mustang 114 | 0 4 121 112 2933 14.5 72 2 volvo 145e (sw) 115 | 0 6 225 105 3121 16.5 73 1 plymouth valiant 116 | 0 6 232 100 2945 16 73 1 amc hornet 117 | 0 6 250 88 3021 16.5 73 1 ford maverick 118 | 0 6 232 100 2789 15 73 1 amc gremlin 119 | 0 3 70 90 2124 13.5 73 3 maxda rx3 120 | 0 6 225 105 3613 16.5 74 1 plymouth satellite sebring 121 | 0 6 250 105 3459 16 75 1 chevrolet nova 122 | 0 6 225 95 3785 19 75 1 plymouth fury 123 | 0 6 171 97 2984 14.5 75 1 ford pinto 124 | 0 6 250 78 3574 21 76 1 ford granada ghia 125 | 0 6 258 120 3410 15.1 78 1 amc concord d/l 126 | 0 8 302 139 3205 11.2 78 1 ford futura 127 | 0 8 318 135 3830 15.2 79 1 dodge st. 
regis 128 | 0 6 250 110 3645 16.2 76 1 pontiac ventura sj 129 | 0 6 250 98 3525 19 77 1 ford granada 130 | 0 8 360 150 3940 13 79 1 chrysler lebaron town @ country (sw) 131 | 0 6 225 110 3620 18.7 78 1 dodge aspen 132 | 1 6 232 100 2634 13 71 1 amc gremlin 133 | 1 6 250 88 3302 15.5 71 1 ford torino 500 134 | 1 6 250 100 3282 15 71 1 pontiac firebird 135 | 1 3 70 97 2330 13.5 72 3 mazda rx2 coupe 136 | 1 4 122 85 2310 18.5 73 1 ford pinto 137 | 1 4 121 112 2868 15.5 73 2 volvo 144ea 138 | 1 6 232 100 2901 16 74 1 amc hornet 139 | 1 6 225 95 3264 16 75 1 plymouth valiant custom 140 | 1 6 232 90 3211 17 75 1 amc pacer 141 | 1 4 120 88 3270 21.9 76 2 peugeot 504 142 | 1 6 156 108 2930 15.5 76 3 toyota mark ii 143 | 1 6 225 100 3630 17.7 77 1 plymouth volare custom 144 | 1 6 225 90 3381 18.7 80 1 dodge aspen 145 | 1 6 231 105 3535 19.2 78 1 pontiac phoenix lj 146 | 1 8 305 145 3425 13.2 78 1 chevrolet monte carlo landau 147 | 1 8 267 125 3605 15 79 1 chevrolet malibu classic (sw) 148 | 1 8 318 140 3735 13.2 78 1 dodge diplomat 149 | 1 6 232 90 3210 17.2 78 1 amc concord 150 | 1 6 200 85 2990 18.2 79 1 mercury zephyr 6 151 | 1 8 260 110 3365 15.5 78 1 oldsmobile cutlass salon brougham 152 | 1 4 140 90 2408 19.5 72 1 chevrolet vega 153 | 1 4 97 88 2279 19 73 3 toyota carina 154 | 1 4 114 91 2582 14 73 2 audi 100ls 155 | 1 6 156 122 2807 13.5 73 3 toyota mark ii 156 | 1 6 198 95 3102 16.5 74 1 plymouth duster 157 | 1 8 262 110 3221 13.5 75 1 chevrolet monza 2+2 158 | 1 6 232 100 2914 16 75 1 amc gremlin 159 | 1 6 225 100 3651 17.7 76 1 dodge aspen se 160 | 1 4 130 102 3150 15.7 76 2 volvo 245 161 | 1 8 302 139 3570 12.8 78 1 mercury monarch ghia 162 | 1 6 200 85 2965 15.8 78 1 ford fairmont (auto) 163 | 1 6 232 90 3265 18.2 79 1 amc concord dl 6 164 | 1 6 200 88 3060 17.1 81 1 ford granada gl 165 | 1 5 131 103 2830 15.9 78 2 audi 5000 166 | 1 6 231 105 3425 16.9 77 1 buick skylark 167 | 1 6 200 95 3155 18.2 78 1 chevrolet malibu 168 | 1 6 225 100 3430 17.2 78 1 plymouth volare 169 | 1 6 231 105 3380 15.8 78 1 buick century special 170 | 1 6 225 110 3360 16.6 79 1 dodge aspen 6 171 | 1 6 200 85 3070 16.7 78 1 mercury zephyr 172 | 1 6 200 85 2587 16 70 1 ford maverick 173 | 1 6 199 90 2648 15 70 1 amc gremlin 174 | 1 4 122 86 2226 16.5 72 1 ford pinto runabout 175 | 1 4 120 87 2979 19.5 72 2 peugeot 504 (sw) 176 | 1 4 140 72 2401 19.5 73 1 chevrolet vega 177 | 1 6 155 107 2472 14 73 1 mercury capri v6 178 | 1 6 200 ? 
2875 17 74 1 ford maverick 179 | 1 6 231 110 3039 15 75 1 buick skyhawk 180 | 1 4 134 95 2515 14.8 78 3 toyota celica gt liftback 181 | 1 4 121 110 2600 12.8 77 2 bmw 320i 182 | 1 3 80 110 2720 13.5 77 3 mazda rx-4 183 | 1 6 231 115 3245 15.4 79 1 pontiac lemans v6 184 | 1 4 121 115 2795 15.7 78 2 saab 99gle 185 | 1 6 198 95 2833 15.5 70 1 plymouth duster 186 | 1 4 140 72 2408 19 71 1 chevrolet vega (sw) 187 | 1 4 121 76 2511 18 72 2 volkswagen 411 (sw) 188 | 1 4 122 86 2395 16 72 1 ford pinto (sw) 189 | 1 4 108 94 2379 16.5 73 3 datsun 610 190 | 1 4 121 98 2945 14.5 75 2 volvo 244dl 191 | 1 6 225 100 3233 15.4 76 1 plymouth valiant 192 | 1 6 250 105 3353 14.5 76 1 chevrolet nova 193 | 1 6 146 97 2815 14.5 77 3 datsun 810 194 | 1 6 232 112 2835 14.7 82 1 ford granada l 195 | 1 4 140 88 2890 17.3 79 1 ford fairmont 4 196 | 1 6 231 110 3415 15.8 81 1 buick century 197 | 1 6 232 90 3085 17.6 76 1 amc hornet 198 | 1 4 122 86 2220 14 71 1 mercury capri 2000 199 | 1 4 97 54 2254 23.5 72 2 volkswagen type 3 200 | 1 4 120 97 2506 14.5 72 3 toyouta corona mark ii (sw) 201 | 1 6 198 95 2904 16 73 1 plymouth duster 202 | 1 4 140 83 2639 17 75 1 ford pinto 203 | 1 4 140 78 2592 18.5 75 1 pontiac astro 204 | 1 4 115 95 2694 15 75 2 audi 100ls 205 | 1 4 120 88 2957 17 75 2 peugeot 504 206 | 1 8 350 125 3900 17.4 79 1 cadillac eldorado 207 | 1 4 151 ? 3035 20.5 82 1 amc concord dl 208 | 1 4 156 105 2745 16.7 78 1 plymouth sapporo 209 | 1 6 173 110 2725 12.6 81 1 chevrolet citation 210 | 1 4 140 ? 2905 14.3 80 1 ford mustang cobra 211 | 1 3 70 100 2420 12.5 80 3 mazda rx-7 gs 212 | 1 4 151 85 2855 17.6 78 1 oldsmobile starfire sx 213 | 1 4 119 97 2405 14.9 78 3 datsun 200-sx 214 | 1 8 260 90 3420 22.2 79 1 oldsmobile cutlass salon brougham 215 | 1 4 113 95 2372 15 70 3 toyota corona mark ii 216 | 1 4 107 90 2430 14.5 70 2 audi 100 ls 217 | 1 4 113 95 2278 15.5 72 3 toyota corona hardtop 218 | 1 4 116 75 2158 15.5 73 2 opel manta 219 | 1 4 121 110 2660 14 73 2 saab 99le 220 | 1 4 90 75 2108 15.5 74 2 fiat 128 221 | 1 4 120 97 2489 15 74 3 honda civic 222 | 1 4 134 96 2702 13.5 75 3 toyota corona 223 | 1 4 119 97 2545 17 75 3 datsun 710 224 | 1 6 200 81 3012 17.6 76 1 ford maverick 225 | 1 4 140 92 2865 16.4 82 1 ford fairmont futura 226 | 1 6 146 120 2930 13.8 81 3 datsun 810 maxima 227 | 1 4 151 90 3003 20.1 80 1 amc concord 228 | 1 4 98 60 2164 22.1 76 1 chevrolet woody 229 | 1 4 151 88 2740 16 77 1 pontiac sunbird coupe 230 | 1 4 110 87 2672 17.5 70 2 peugeot 504 231 | 1 4 104 95 2375 17.5 70 2 saab 99e 232 | 1 4 113 95 2228 14 71 3 toyota corona 233 | 1 4 98 ? 
2046 19 71 1 ford pinto 234 | 1 4 97.5 80 2126 17 72 1 dodge colt hardtop 235 | 1 4 140 75 2542 17 74 1 chevrolet vega 236 | 1 4 90 71 2223 16.5 75 2 volkswagen dasher 237 | 1 4 121 115 2671 13.5 75 2 saab 99le 238 | 1 4 116 81 2220 16.9 76 2 opel 1900 239 | 1 4 140 92 2572 14.9 76 1 capri ii 240 | 1 6 181 110 2945 16.4 82 1 buick century limited 241 | 1 4 140 88 2720 15.4 78 1 ford fairmont (man) 242 | 1 5 183 77 3530 20.1 79 2 mercedes benz 300d 243 | 1 6 168 116 2900 12.6 81 3 toyota cressida 244 | 1 4 122 96 2300 15.5 77 1 plymouth arrow gs 245 | 1 4 140 89 2755 15.8 77 1 ford mustang ii 2+2 246 | 1 4 156 92 2620 14.4 81 1 dodge aries wagon (sw) 247 | 1 4 97 46 1835 20.5 70 2 volkswagen 1131 deluxe sedan 248 | 1 4 121 113 2234 12.5 70 2 bmw 2002 249 | 1 4 91 70 1955 20.5 71 1 plymouth cricket 250 | 1 4 96 69 2189 18 72 2 renault 12 (sw) 251 | 1 4 97 46 1950 21 73 2 volkswagen super beetle 252 | 1 4 98 90 2265 15.5 73 2 fiat 124 sport coupe 253 | 1 4 122 80 2451 16.5 74 1 ford pinto 254 | 1 4 79 67 1963 15.5 74 2 volkswagen dasher 255 | 1 4 97 78 2300 14.5 74 2 opel manta 256 | 1 4 116 75 2246 14 74 2 fiat 124 tc 257 | 1 4 108 93 2391 15.5 74 3 subaru 258 | 1 4 98 79 2255 17.7 76 1 dodge colt 259 | 1 4 97 75 2265 18.2 77 3 toyota corolla liftback 260 | 1 4 156 92 2585 14.5 82 1 chrysler lebaron medallion 261 | 2 4 140 88 2870 18.1 80 1 ford fairmont 262 | 2 4 140 72 2565 13.6 76 1 ford pinto 263 | 2 4 151 84 2635 16.4 81 1 buick skylark 264 | 2 8 350 105 3725 19 81 1 oldsmobile cutlass ls 265 | 2 6 173 115 2700 12.9 79 1 oldsmobile omega brougham 266 | 2 4 97 88 2130 14.5 70 3 datsun pl510 267 | 2 4 97 88 2130 14.5 71 3 datsun pl510 268 | 2 4 97 60 1834 19 71 2 volkswagen model 111 269 | 2 4 97 88 2100 16.5 72 3 toyota corolla 1600 (sw) 270 | 2 4 101 83 2202 15.3 76 2 renault 12tl 271 | 2 4 112 88 2640 18.6 82 1 chevrolet cavalier wagon 272 | 2 4 151 90 2735 18 82 1 pontiac phoenix 273 | 2 4 151 90 2950 17.3 82 1 chevrolet camaro 274 | 2 4 140 86 2790 15.6 82 1 ford mustang gl 275 | 2 4 119 97 2300 14.7 78 3 datsun 510 276 | 2 4 141 71 3190 24.8 79 2 peugeot 504 277 | 2 4 135 84 2490 15.7 81 1 plymouth reliant 278 | 2 4 121 80 2670 15 79 1 amc spirit dl 279 | 2 4 134 95 2560 14.2 78 3 toyota corona 280 | 2 4 156 105 2800 14.4 80 1 dodge colt 281 | 2 4 140 90 2264 15.5 71 1 chevrolet vega 2300 282 | 2 4 116 90 2123 14 71 2 opel 1900 283 | 2 4 97 92 2288 17 72 3 datsun 510 (sw) 284 | 2 4 98 80 2164 15 72 1 dodge colt (sw) 285 | 2 4 90 75 2125 14.5 74 1 dodge colt 286 | 2 4 107 86 2464 15.5 76 2 fiat 131 287 | 2 4 97 75 2155 16.4 76 3 toyota corolla 288 | 2 4 151 90 2678 16.5 80 1 chevrolet citation 289 | 2 4 112 88 2605 19.6 82 1 chevrolet cavalier 290 | 2 4 120 79 2625 18.6 82 1 ford ranger 291 | 2 4 141 80 3230 20.4 81 2 peugeot 505s turbo diesel 292 | 2 4 151 90 2670 16 79 1 buick skylark limited 293 | 2 6 173 115 2595 11.3 79 1 chevrolet citation 294 | 2 4 68 49 1867 19.5 73 2 fiat 128 295 | 2 4 98 83 2219 16.5 74 2 audi fox 296 | 2 4 97 75 2171 16 75 3 toyota corolla 297 | 2 4 90 70 1937 14 75 2 volkswagen rabbit 298 | 2 4 85 52 2035 22.2 76 1 chevrolet chevette 299 | 2 4 90 70 1937 14.2 76 2 vw rabbit 300 | 2 4 97 78 1940 14.5 77 2 volkswagen rabbit custom 301 | 2 4 135 84 2525 16 82 1 dodge aries se 302 | 2 4 97 71 1825 12.2 76 2 volkswagen rabbit 303 | 2 4 98 68 2135 16.6 78 3 honda accord lx 304 | 2 4 134 90 2711 15.5 80 3 toyota corona liftback 305 | 2 4 89 62 1845 15.3 80 2 vokswagen rabbit 306 | 2 4 98 65 2380 20.7 81 1 ford escort 2h 307 | 2 4 79 70 2074 19.5 71 2 peugeot 304 
308 | 2 4 88 76 2065 14.5 71 2 fiat 124b 309 | 2 4 111 80 2155 14.8 77 1 buick opel isuzu deluxe 310 | 2 4 97 67 1985 16.4 77 3 subaru dl 311 | 2 4 98 68 2155 16.5 78 1 chevrolet chevette 312 | 2 4 146 67 3250 21.8 80 2 mercedes-benz 240d 313 | 2 4 135 84 2385 12.9 81 1 plymouth reliant 314 | 2 4 98 63 2051 17 77 1 chevrolet chevette 315 | 2 4 97 78 2190 14.1 77 2 volkswagen dasher 316 | 2 6 145 76 3160 19.6 81 2 volvo diesel 317 | 2 4 105 75 2230 14.5 78 1 dodge omni 318 | 2 4 71 65 1773 19 71 3 toyota corolla 1200 319 | 2 4 79 67 1950 19 74 3 datsun b210 320 | 2 4 76 52 1649 16.5 74 3 toyota corona 321 | 2 4 79 67 2000 16 74 2 fiat x1.9 322 | 2 4 112 85 2575 16.2 82 1 pontiac j2000 se hatchback 323 | 2 4 91 68 1970 17.6 82 3 mazda glc custom 324 | 2 4 119 82 2720 19.4 82 1 chevy s-10 325 | 2 4 120 75 2542 17.5 80 3 mazda 626 326 | 2 4 98 68 2045 18.5 77 3 honda accord cvcc 327 | 2 4 89 71 1990 14.9 78 2 volkswagen scirocco 328 | 2 4 120 74 2635 18.3 81 3 mazda 626 329 | 2 4 85 65 2020 19.2 79 3 datsun 210 330 | 2 4 89 71 1925 14 79 2 vw rabbit custom 331 | 2 4 71 65 1836 21 74 3 toyota corolla 1200 332 | 2 4 83 61 2003 19 74 3 datsun 710 333 | 2 4 85 70 1990 17 76 3 datsun b-210 334 | 2 4 91 67 1965 15.7 82 3 honda civic (auto) 335 | 2 4 144 96 2665 13.9 82 3 toyota celica gt 336 | 2 4 135 84 2295 11.6 82 1 dodge rampage 337 | 2 4 98 70 2120 15.5 80 1 chevrolet chevette 338 | 2 4 108 75 2265 15.2 80 3 toyota corolla 339 | 2 4 97 67 2065 17.8 81 3 subaru 340 | 2 4 107 72 2290 17 80 3 honda accord 341 | 2 4 108 75 2350 16.8 81 3 toyota corolla 342 | 2 6 168 132 2910 11.4 80 3 datsun 280-zx 343 | 2 4 78 52 1985 19.4 78 3 mazda glc deluxe 344 | 2 4 119 100 2615 14.8 81 3 datsun 200sx 345 | 2 4 91 53 1795 17.5 75 3 honda civic cvcc 346 | 2 4 91 53 1795 17.4 76 3 honda civic 347 | 2 4 105 74 2190 14.2 81 2 volkswagen jetta 348 | 2 4 85 70 1945 16.8 77 3 datsun f-10 hatchback 349 | 2 4 98 83 2075 15.9 77 1 dodge colt m/m 350 | 2 4 151 90 2556 13.2 79 1 pontiac phoenix 351 | 2 4 107 75 2210 14.4 81 3 honda prelude 352 | 2 4 97 67 2145 18 80 3 subaru dl 353 | 2 4 112 88 2395 18 82 1 chevrolet cavalier 2-door 354 | 2 4 108 70 2245 16.9 82 3 toyota corolla 355 | 2 4 86 65 1975 15.2 79 3 maxda glc deluxe 356 | 2 4 91 68 1985 16 81 3 mazda glc 4 357 | 2 4 105 70 2200 13.2 79 1 plymouth horizon 358 | 2 4 97 78 2188 15.8 80 2 audi 4000 359 | 2 4 98 65 2045 16.2 81 1 ford escort 4w 360 | 2 4 105 70 2150 14.9 79 1 plymouth horizon tc3 361 | 2 4 100 ? 
2320 15.8 81 2 renault 18i
362 | 2 4 105 63 2215 14.9 81 1 plymouth horizon 4
363 | 2 4 72 69 1613 18 71 3 datsun 1200
364 | 2 4 122 88 2500 15.1 80 2 triumph tr7 coupe
365 | 2 4 81 60 1760 16.1 81 3 honda civic 1300
366 | 2 4 98 80 1915 14.4 79 1 dodge colt hatchback custom
367 | 2 4 79 58 1825 18.6 77 2 renault 5 gtl
368 | 2 4 105 74 1980 15.3 82 2 volkswagen rabbit l
369 | 2 4 98 70 2125 17.3 82 1 mercury lynx l
370 | 2 4 120 88 2160 14.5 82 3 nissan stanza xe
371 | 2 4 107 75 2205 14.5 82 3 honda accord
372 | 2 4 135 84 2370 13 82 1 dodge charger 2.2
373 | 2 4 98 66 1800 14.4 78 1 ford fiesta
374 | 2 4 91 60 1800 16.4 78 3 honda civic cvcc
375 | 2 5 121 67 2950 19.9 80 2 audi 5000s (diesel)
376 | 2 4 119 92 2434 15 80 3 datsun 510 hatchback
377 | 2 4 85 65 1975 19.4 81 3 datsun 210 mpg
378 | 2 4 91 68 2025 18.2 82 3 mazda glc custom l
379 | 2 4 86 65 2019 16.4 80 3 datsun 310
380 | 2 4 91 69 2130 14.7 79 2 fiat strada custom
381 | 2 4 89 62 2050 17.3 81 3 toyota tercel
382 | 2 4 105 63 2125 14.7 82 1 plymouth horizon miser
383 | 2 4 91 67 1965 15 82 3 honda civic
384 | 2 4 91 67 1995 16.2 82 3 datsun 310 gx
385 | 2 6 262 85 3015 17 82 1 oldsmobile cutlass ciera (diesel)
386 | 2 4 89 60 1968 18.8 80 3 toyota corolla tercel
387 | 2 4 86 64 1875 16.4 81 1 plymouth champ
388 | 2 4 79 58 1755 16.9 81 3 toyota starlet
389 | 2 4 85 70 2070 18.6 78 3 datsun b210 gx
390 | 2 4 85 65 2110 19.2 80 3 datsun 210
391 | 2 4 85 ? 1835 17.3 80 2 renault lecar deluxe
392 | 2 4 98 76 2144 14.7 80 2 vw rabbit
393 | 2 4 90 48 1985 21.5 78 2 volkswagen rabbit custom diesel
394 | 2 4 90 48 2335 23.7 80 2 vw dasher (diesel)
395 | 2 4 97 52 2130 24.6 82 2 vw pickup
396 | 2 4 90 48 2085 21.7 80 2 vw rabbit c (diesel)
397 | 2 4 91 67 1850 13.8 80 3 honda civic 1500 gl
398 | 2 4 86 65 2110 17.9 80 3 mazda glc
399 |
--------------------------------------------------------------------------------
/Lecture 4 - Decision Trees/dt-credit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | (C) 2016 Nikolay Manchev, London Machine Learning Study Group
4 |
5 | http://www.meetup.com/London-Machine-Learning-Study-Group/
6 |
7 | This work is licensed under the Creative Commons Attribution 4.0 International
8 | License. To view a copy of this license, visit
9 | http://creativecommons.org/licenses/by/4.0/.
10 | """
11 |
12 | import numpy as np
13 | import math
14 |
15 |
16 | def split(dataset, attribute, value):
17 | """
18 | Split a dataset in two by value of an attribute
19 |
20 | Parameters
21 | ----------
22 | dataset : dataset for the split
23 | attribute : attribute to split on
24 | value : threshold value for the split
25 |
26 | Returns
27 | -------
28 | a tuple containing the two splits
29 | """
30 | set_one = dataset[dataset[:, attribute] > value]
31 | set_two = dataset[dataset[:, attribute] <= value]
32 | return (set_one, set_two)
33 |
34 |
35 | def entropy(dataset):
36 | """
37 | Computes the entropy for a dataset. The entropy is computed as
38 |
39 | H = - sum_{i} p(x_i) log_2 p(x_i)
40 |
41 | The sum is taken over all unique values in the set. The
42 | probability p(x_i) is computed as
43 |
44 | p(x_i) = (frequency of occurrence of x_i) / (size of the dataset)
45 |
46 | Parameters
47 | ----------
48 | dataset : a list of values
49 |
50 | Returns
51 | -------
52 | the entropy of the set
53 | """
54 | H = 0
55 |
56 | for freq in count_distinct(dataset).values():
57 | H += (-freq/len(dataset)) * math.log(freq/len(dataset), 2)
58 |
59 | return H
60 |
61 |
62 | def mode(dataset):
63 | """
64 | Computes the mode (i.e. most frequent value) of the dataset
65 |
66 | Parameters
67 | ----------
68 | dataset : a list of values
69 |
70 | Returns
71 | -------
72 | the distinct value with highest frequency of occurrence
73 | """
74 | counts = count_distinct(dataset)
75 | return max(counts, key=counts.get)
76 |
77 |
78 | def count_distinct(dataset):
79 | """
80 | Gets a list of unique values in a dataset and computes the
81 | frequency of occurrence for each unique value.
82 |
83 | Parameters
84 | ----------
85 | dataset : a list of values
86 |
87 | Returns
88 | -------
89 | a dictionary of unique values and their respective frequency
90 | of occurrence
91 | """
92 | counts = {}
93 |
94 | # Loop over all elements of the dataset
95 | for item in dataset:
96 | if (item in counts):
97 | # This value is already in the dictionary.
98 | # Increase its count.
99 | counts[item] = counts[item] + 1
100 | else:
101 | # This is the first occurrence of the value.
102 | # Add it to the dictionary and set its count to 1
103 | counts[item] = 1
104 | return counts
105 |
106 | def IG(dataset, attr_index, labels):
107 | """
108 | Computes the expected reduction of entropy if the dataset is
109 | split by a specific attribute.
110 |
111 | IG(dataset, attribute) = H(labels) - H(labels|attribute)
112 |
113 | Parameters
114 | ----------
115 | dataset : a list of values
116 | attr_index : index of an attribute to split on
117 | labels : class labels for the examples in dataset
118 |
119 | Returns
120 | -------
121 | IG(dataset, attribute)
122 | """
123 | # Get the dataset distinct values and their respective
124 | # frequency of occurrence
125 | dataset_attributes = count_distinct(dataset[:,attr_index])
126 |
127 | # Start with 0 entropy
128 | I = 0
129 |
130 | # Compute the entropy of the split
131 | # I(X, A) = \sum_{i=1}^{m} \frac{|X_i|}{|X|} \times H(X_i)
132 | for key in dataset_attributes.keys():
133 |
134 | # Compute the weighted average \frac{|X_i|}{|X|}
135 | p = dataset_attributes[key] / sum(dataset_attributes.values())
136 |
137 | # Get the class labels for X_i
138 | subset_labels = labels[dataset[:,attr_index] == key]
139 |
140 | # Add \frac{|X_i|}{|X|} \times H(X_i) to I(X,A)
141 | I = I + p * entropy(subset_labels)
142 |
143 | # Return H(labels) - I(X, A), i.e. the gain with respect to the class labels
144 | return entropy(labels) - I
145 |
146 |
147 | def select_best(dataset, attributes, labels):
148 | """
149 | Selects the best attribute to split on based on reduction of entropy.
150 | 151 | Parameters 152 | ---------- 153 | dataset : a list of values 154 | attributes : names of the attributes in the dataset 155 | labels : class labels for the examples in dataset 156 | 157 | Returns 158 | ------- 159 | The attribute that maximizes the decrease of entropy after 160 | splitting 161 | """ 162 | best_IG = 0 163 | best_attr = None 164 | 165 | # Go over all attributes of the set 166 | for attr in attributes: 167 | # Compute the expected Information Gain if we split on 168 | # that attribute 169 | gain = IG(dataset, attributes.index(attr), labels) 170 | # If the gain is higher than what we have so far select that attribute 171 | if (gain >= best_IG): 172 | best_IG = gain 173 | best_attr = attr 174 | 175 | # Return the attribute that produces the highest gain 176 | return best_attr 177 | 178 | def build_tree(dataset, attributes, labels, default, verbose = False): 179 | 180 | if verbose: 181 | print("*****************") 182 | print("INPUT ATTRIBUTES:", attributes) 183 | 184 | # No data? Return default classification 185 | if dataset.size == 0: 186 | return default 187 | 188 | # All examples have the same classification? Return this label 189 | if len(set(labels)) <= 1: 190 | 191 | if verbose: 192 | print("SAME CLASS :", labels[0]) 193 | print("*****************") 194 | 195 | return labels[0] 196 | 197 | # Attributes empty? Return MODE 198 | if len(attributes) <= 1: 199 | return default 200 | 201 | # Choose best attribute 202 | attr = select_best(dataset, attributes, labels) 203 | 204 | if (attr == None): 205 | if verbose: 206 | print("NO ATTRIBUTE TO SPLIT ON") 207 | print("************************") 208 | return default 209 | 210 | if verbose: 211 | print("SPLITTING ON :", attr) 212 | print("*****************") 213 | 214 | 215 | # Get distinct attribute index and values 216 | attr_index = attributes.index(attr) 217 | attr_values = count_distinct(dataset[:,attributes.index(attr)]).keys() 218 | 219 | # Remove the selected attribute from the list of remaining attributes 220 | attributes = [x for x in attributes if x != attr] 221 | 222 | # Add a node for that attribute 223 | tree = {attr:{}} 224 | 225 | for v in attr_values: 226 | 227 | # Get the indexes of all examples that have value v for the 228 | # chosen attribute 229 | indexes = dataset[:, attr_index] == v 230 | 231 | # Get all examples and their respective labels 232 | subtree_dataset = dataset[indexes] 233 | subtree_labels = labels[indexes] 234 | 235 | # Build a subtree using the selected examples 236 | subtree = build_tree(subtree_dataset, attributes, 237 | subtree_labels, mode(subtree_labels)) 238 | 239 | # Attach the subtree 240 | tree[attr][v] = subtree 241 | 242 | return tree 243 | 244 | def predict(tree, attributes, example): 245 | """ 246 | Traverse a tree to make a prediction. 247 | 248 | Parameters 249 | ---------- 250 | tree : a dictionary containing a decision tree 251 | attributes : names of the attributes in the dataset 252 | example : example to classify 253 | 254 | Returns 255 | ------- 256 | The class label for this example. 257 | If the example cannot be classified, this function returns None. 258 | """ 259 | # Get the attribute at the tree root 260 | for attr, value in tree.items(): 261 | attr_index = attributes.index(attr) 262 | try: 263 | # Get the node that has the same value as in the example 264 | node = tree[attr][example[attr_index]] 265 | except KeyError: 266 | # No such node exists? 
We can't classify the example then 267 | return None 268 | if isinstance(node, dict): 269 | # Node exists, but it is a subtree. Traverse recursively. 270 | return predict(node, attributes, example) 271 | else: 272 | # Node exists and is a terminal node. Its value is the class label. 273 | return node 274 | 275 | def printTree(tree, attributes, offset = "|->"): 276 | """ 277 | Prints a decision tree from dictionary. 278 | 279 | Parameters 280 | ---------- 281 | tree : a dictionary containing a decision tree 282 | attributes : names of the attributes in the dataset 283 | """ 284 | for attr, value in tree.items(): 285 | node = tree[attr] 286 | if isinstance(node, dict): 287 | print(offset,attr) 288 | printTree(node, attributes, (" " + offset)) 289 | else: 290 | print(offset,attr, "->", value) 291 | 292 | 293 | # Load the data set 294 | 295 | data = np.array([[0,0,0],[1,0,1],[0,0,0],[0,0,0],[0,1,1],[1,0,0],[0,1,0],[1,1,1],[1,0,0],[1,0,0]]) 296 | #data = np.array([[1,1,1],[2,1,2],[1,1,1],[1,1,1],[1,2,2],[2,1,1],[1,2,1],[2,2,2],[2,1,1],[2,1,1]]) 297 | #data = np.array([[0,0,0],['A',0,'A'],[0,0,0],[0,0,0],['A','A','A'],['A',0,0],[0,'A',0],[0,'A','A'],['A',0,0],['A',0,0]]) 298 | labels = np.array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0]) 299 | 300 | #noise = np.random.normal(0, 0.5, len(data)) 301 | #data[:,1] += noise.astype(int) 302 | 303 | # Set attribute names 304 | attributes = ["<2 YRS JOB", "MISSED PMNTS", "DEFAULTED"] 305 | class_labels = ["GOOD", "BAD"] 306 | 307 | # Get the most frequent label 308 | default = mode(labels) 309 | 310 | tree = build_tree(data, attributes, labels, default) 311 | 312 | printTree(tree, attributes) 313 | #print(predict(tree, attributes, [1,0,1])) 314 | -------------------------------------------------------------------------------- /Lecture 4 - Decision Trees/entropy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | (C) 2016 Nikolay Manchev, London Machine Learning Study Group 4 | 5 | http://www.meetup.com/London-Machine-Learning-Study-Group/ 6 | 7 | This work is licensed under the Creative Commons Attribution 4.0 International 8 | License. To view a copy of this license, visit 9 | http://creativecommons.org/licenses/by/4.0/. 10 | """ 11 | 12 | import numpy as np 13 | import math 14 | 15 | 16 | def split(dataset, attribute, value): 17 | """ 18 | Split a dataset in two by value of an attribute 19 | 20 | Parameters 21 | ---------- 22 | dataset : dataset for the split 23 | attribute : attribute to split on 24 | value : threshold value for the split 25 | 26 | Returns 27 | ------- 28 | a tuple containing the two splits 29 | """ 30 | set_one = dataset[dataset[:, attribute] > value] 31 | set_two = dataset[dataset[:, attribute] <= value] 32 | return (set_one, set_two) 33 | 34 | 35 | def count_distinct(dataset): 36 | """ 37 | Gets a list of unique values in a dataset and computes the 38 | frequency of occurrence for each unique value. 39 | 40 | Parameters 41 | ---------- 42 | dataset : a list of values 43 | 44 | Returns 45 | ------- 46 | a dictionary of unique values and their respective frequency 47 | of occurrence 48 | """ 49 | counts = {} 50 | 51 | # Loop over all elements of the dataset 52 | for item in dataset: 53 | if (item in counts): 54 | # This value is already in the dictionary. 55 | # Increase its count. 56 | counts[item] = counts[item] + 1 57 | else: 58 | # This is the first occurrence of the word. 
59 | # Add it to the dictionary and set its count to 1
60 | counts[item] = 1
61 | return counts
62 |
63 |
64 | def entropy(dataset):
65 | """
66 | Computes the entropy for a dataset. The entropy is computed as
67 |
68 | H = - sum_{i} p(x_i) log_2 p(x_i)
69 |
70 | The sum is taken over all unique values in the set. The
71 | probability p(x_i) is computed as
72 |
73 | p(x_i) = (frequency of occurrence of x_i) / (size of the dataset)
74 |
75 | Parameters
76 | ----------
77 | dataset : a list of values
78 |
79 | Returns
80 | -------
81 | the entropy of the set
82 | """
83 | H = 0
84 |
85 | for freq in count_distinct(dataset).values():
86 | H += (-freq/len(dataset)) * math.log(freq/len(dataset), 2)
87 |
88 | return H
89 |
90 |
91 | def show_split_entropy(dataset, attr_index, split_value):
92 | """
93 | Splits a dataset on attribute and prints the frequency of occurrence
94 | and the entropy for each split.
95 |
96 | Parameters
97 | ----------
98 | dataset : a list of values
99 | attr_index : index of an attribute to split on
100 | split_value : threshold value for the split
101 |
102 | """
103 | # Split the dataset in two subsets
104 | (x1, x2) = split(dataset,attr_index,split_value)
105 |
106 | # Print the frequencies and entropy for the first subset
107 | print("First split")
108 | print("**************")
109 | print("Value counts: ", count_distinct(x1[:,attr_index]))
110 | print("Entropy: ", entropy(x1[:,attr_index]), "\n")
111 |
112 | # Print the frequencies and entropy for the second subset
113 | print("Second split")
114 | print("**************")
115 | print("Value counts: ", count_distinct(x2[:,attr_index]))
116 | print("Entropy: ", entropy(x2[:,attr_index]))
117 |
118 |
119 | # Load the data set
120 | # We use a modified version of the Auto MPG from UCI Machine Learning
121 | # Repository where the continuous MPG attribute has been converted to
122 | # categorical as follows:
123 | #
124 | # [9;19) - BAD
125 | # [19;26] - OK
126 | # (26;47] - GOOD
127 | #
128 | # The original dataset is available at
129 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG
130 |
131 | car_data = np.genfromtxt("auto-mpg-modified.data", usecols = range(8))
132 | car_data = car_data[~np.isnan(car_data).any(axis = 1)]
133 |
134 | # Assign MPG to y and all other attributes to x
135 | data = car_data[:,1:]
136 | labels = car_data[:,0]
137 |
138 | # Set attribute names
139 | attributes = ["CYLINDERS", "DISPLACEMENT", "HORSEPOWER", "WEIGHT", "ACCELERATION", "MODEL_YEAR", "ORIGIN"]
140 | class_labels = ["BAD", "OK", "GOOD"]
141 |
142 | # Look at the unique values for the MODEL_YEAR attribute
143 | print("Unique values for MODEL_YEAR: ", count_distinct(data[:,5]), "\n")
144 |
145 | # Split the dataset at the value 75
146 | show_split_entropy(data, 5, 75)
147 |
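148 | # NOTE (addition, not part of the original lecture script): the information
149 | # gain of this split is the entropy of the class labels minus the
150 | # size-weighted average entropy of the labels in each subset:
151 | p1 = np.sum(data[:,5] > 75) / len(data)
152 | I = p1 * entropy(labels[data[:,5] > 75]) + (1 - p1) * entropy(labels[data[:,5] <= 75])
153 | print("\nInformation gain for the split:", entropy(labels) - I)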
--------------------------------------------------------------------------------
/Lecture 4 - Decision Trees/overfit_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | (C) 2016 Nikolay Manchev, London Machine Learning Study Group
4 |
5 | http://www.meetup.com/London-Machine-Learning-Study-Group/
6 |
7 | This work is licensed under the Creative Commons Attribution 4.0 International
8 | License. To view a copy of this license, visit
9 | http://creativecommons.org/licenses/by/4.0/.
10 | """
11 |
12 | import numpy as np
13 |
14 | from sklearn import tree
15 |
16 | from sklearn.model_selection import train_test_split
17 | from sklearn.metrics import accuracy_score
18 |
19 | # Load the data set
20 | # We use a modified version of the Auto MPG from UCI Machine Learning
21 | # Repository where the continuous MPG attribute has been converted to
22 | # categorical as follows:
23 | #
24 | # [9;19) - BAD
25 | # [19;26] - OK
26 | # (26;47] - GOOD
27 | #
28 | # The original dataset is available at
29 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG
30 |
31 | car_data = np.genfromtxt("auto-mpg-modified.data", usecols = range(8))
32 | car_data = car_data[~np.isnan(car_data).any(axis = 1)]
33 |
34 | # Assign MPG to y and all other attributes to x
35 | data = car_data[:,1:]
36 | labels = car_data[:,0]
37 |
38 | # Split the data into test/train subsets
39 | x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.1)
40 |
41 | # Train an unconstrained decision tree (no depth limit), which will overfit
42 | dt = tree.DecisionTreeClassifier(criterion='entropy')
43 | dt = dt.fit(x_train, y_train)
44 | pred_train = dt.predict(x_train)
45 | pred_test = dt.predict(x_test)
46 | print("Prediction on training data :", accuracy_score(y_train, pred_train))
47 | print("Prediction on test data :", accuracy_score(y_test, pred_test))
48 |
49 |
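50 | # NOTE (addition, not part of the original lecture script): for comparison,
51 | # a depth-limited tree usually shows a much smaller gap between training and
52 | # test accuracy, which is the overfitting point this demo makes:
53 | dt_small = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
54 | dt_small = dt_small.fit(x_train, y_train)
55 | print("max_depth=3 on training data :", accuracy_score(y_train, dt_small.predict(x_train)))
56 | print("max_depth=3 on test data :", accuracy_score(y_test, dt_small.predict(x_test)))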
--------------------------------------------------------------------------------
/Lecture 4 - Decision Trees/scikit-dt-auto-mpg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | (C) 2016 Nikolay Manchev, London Machine Learning Study Group
4 |
5 | http://www.meetup.com/London-Machine-Learning-Study-Group/
6 |
7 | This work is licensed under the Creative Commons Attribution 4.0 International
8 | License. To view a copy of this license, visit
9 | http://creativecommons.org/licenses/by/4.0/.
10 | """
11 |
12 | import numpy as np
13 | import pydotplus
14 |
15 | from sklearn import tree
16 | from io import StringIO
17 |
18 | # Load the data set
19 | # We use a modified version of the Auto MPG from UCI Machine Learning
20 | # Repository where the continuous MPG attribute has been converted to
21 | # categorical as follows:
22 | #
23 | # [9;19) - BAD
24 | # [19;26] - OK
25 | # (26;47] - GOOD
26 | #
27 | # The original dataset is available at
28 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG
29 |
30 | car_data = np.genfromtxt("auto-mpg-modified.data", usecols = range(8))
31 | car_data = car_data[~np.isnan(car_data).any(axis = 1)]
32 |
33 | # Assign MPG to y and all other attributes to x
34 | data = car_data[:,1:]
35 | labels = car_data[:,0]
36 |
37 | # Uncomment to add some noise to the data
38 | #noise = np.random.normal(0, 10, len(data))
39 | #data[:,5] += noise.astype(int)
40 |
41 | dt = tree.DecisionTreeClassifier(criterion = "entropy", max_depth=3)
42 | dt = dt.fit(data, labels)
43 |
44 | attributes = ["CYLINDERS", "DISPLACEMENT", "HORSEPOWER", "WEIGHT", "ACCELERATION", "MODEL_YEAR", "ORIGIN"]
45 | class_labels = ["BAD", "OK", "GOOD"]
46 |
47 | out = StringIO()
48 | tree.export_graphviz(dt,out_file=out,
49 | feature_names = attributes,
50 | class_names = class_labels,
51 | filled=True,
52 | impurity = False)
53 |
54 | pydotplus.graph_from_dot_data(out.getvalue()).write_png("dtree.png")
55 |
--------------------------------------------------------------------------------
/Lecture 5 - Probabilities and Logistic Regression/Probabilities_and_Logistic_Regression.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 5 - Probabilities and Logistic Regression/Probabilities_and_Logistic_Regression.pdf
--------------------------------------------------------------------------------
/Lecture 5 - Probabilities and Logistic Regression/README.md:
--------------------------------------------------------------------------------
1 | ## Probabilities and Logistic Regression
2 |
3 | Code examples used in Lecture 5
4 |
5 | * auto-mpg.data - The [Auto MPG](https://archive.ics.uci.edu/ml/datasets/Auto+MPG) from UCI Machine Learning Repository
6 | * logreg_gradient.py - Binary Logistic Regression with made up data (see the sketch at the end of this README)
7 | * linreg-normal_equations.py - Linear Regression (normal equations) with the made up data
8 | * logreg-hp-origin.py - Binary Logistic Regression using the Auto MPG dataset (one input variable)
9 | * logreg_gradient_2_variables.py - Binary Logistic Regression using the Auto MPG dataset (two input variables)
10 | * logreg_gradient_2_variables_iris.py - Binary Logistic Regression using linearly separable classes from the Iris dataset
11 |
12 | This repository contains materials from the London Machine Learning Study Group Meetups
13 |
14 | The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group).
15 |
16 | (C) 2017 Nikolay Manchev, London Machine Learning Study Group
17 |
18 | This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0).
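
### The gradient update at a glance

A minimal sketch (an addition to this README, not one of the lecture scripts) of the update rule the logreg_gradient*.py examples are built around: with the sigmoid hypothesis h = sigmoid(Xw), batch gradient descent on the negative log-likelihood moves the weights by w := w - alpha * X^T (h - y). The data below is made up purely for illustration.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Made-up data: a column of ones for the bias term plus one input variable
x = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 3.0], [1.0, 4.5]])
y = np.array([[0.0], [0.0], [1.0], [1.0]])

w = np.zeros((2, 1))   # initial parameters
alpha = 0.1            # learning rate

for _ in range(5000):
    # w := w - alpha * X^T (sigmoid(Xw) - y)
    w -= alpha * x.T.dot(sigmoid(x.dot(w)) - y)

print(w)                  # fitted parameters
print(sigmoid(x.dot(w)))  # predicted class probabilities
```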
19 | -------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/auto-mpg.data: -------------------------------------------------------------------------------- 1 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" 2 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" 3 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" 4 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" 5 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" 6 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500" 7 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala" 8 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii" 9 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina" 10 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl" 11 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se" 12 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340" 13 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo" 14 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)" 15 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii" 16 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster" 17 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet" 18 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick" 19 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510" 20 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan" 21 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504" 22 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls" 23 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e" 24 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002" 25 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin" 26 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250" 27 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20" 28 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200" 29 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d" 30 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510" 31 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300" 32 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona" 33 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto" 34 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin" 35 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom" 36 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu" 37 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500" 38 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador" 39 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala" 40 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham" 41 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500" 42 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii" 43 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)" 44 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)" 45 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)" 46 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)" 47 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)" 48 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird" 49 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang" 50 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000" 51 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900" 52 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304" 53 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b" 54 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200" 55 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200" 56 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111" 57 | 26.0 4 91.00 70.00 1955. 
20.5 71 1 "plymouth cricket" 58 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop" 59 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop" 60 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3" 61 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega" 62 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout" 63 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala" 64 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina" 65 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii" 66 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500" 67 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst" 68 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis" 69 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom" 70 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale" 71 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal" 72 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe" 73 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)" 74 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)" 75 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)" 76 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)" 77 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)" 78 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)" 79 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)" 80 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)" 81 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)" 82 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)" 83 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)" 84 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)" 85 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)" 86 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350" 87 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador" 88 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu" 89 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino" 90 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom" 91 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham" 92 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic" 93 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd" 94 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan" 95 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham" 96 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom" 97 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham" 98 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant" 99 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom" 100 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet" 101 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick" 102 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster" 103 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle" 104 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala" 105 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country" 106 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb" 107 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser" 108 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin" 109 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina" 110 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega" 111 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610" 112 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3" 113 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto" 114 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6" 115 | 26.0 4 98.00 90.00 2265. 
15.5 73 2 "fiat 124 sport coupe" 116 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s" 117 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix" 118 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128" 119 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta" 120 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls" 121 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea" 122 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom" 123 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le" 124 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii" 125 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega" 126 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster" 127 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick" 128 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet" 129 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova" 130 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210" 131 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto" 132 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200" 133 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega" 134 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic" 135 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador" 136 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring" 137 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino" 138 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)" 139 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)" 140 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)" 141 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)" 142 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox" 143 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher" 144 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta" 145 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona" 146 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710" 147 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt" 148 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128" 149 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc" 150 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic" 151 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru" 152 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9" 153 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom" 154 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova" 155 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch" 156 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick" 157 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina" 158 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air" 159 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury" 160 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd" 161 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century" 162 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu" 163 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador" 164 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury" 165 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk" 166 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2" 167 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii" 168 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla" 169 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto" 170 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin" 171 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro" 172 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona" 173 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher" 174 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710" 175 | 18.0 6 171.0 97.00 2984. 
14.5 75 1 "ford pinto" 176 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit" 177 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer" 178 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls" 179 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504" 180 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl" 181 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le" 182 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc" 183 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131" 184 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900" 185 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii" 186 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt" 187 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl" 188 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic" 189 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham" 190 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador" 191 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino" 192 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant" 193 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova" 194 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick" 195 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet" 196 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette" 197 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody" 198 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit" 199 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic" 200 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se" 201 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia" 202 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj" 203 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l" 204 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit" 205 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210" 206 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla" 207 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto" 208 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245" 209 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8" 210 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504" 211 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii" 212 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s" 213 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville" 214 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10" 215 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108" 216 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100" 217 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc" 218 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe" 219 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl" 220 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs" 221 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback" 222 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic" 223 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme" 224 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham" 225 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham" 226 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours" 227 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark" 228 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom" 229 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada" 230 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj" 231 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau" 232 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba" 233 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird" 234 | 29.0 4 97.00 78.00 1940. 
14.5 77 2 "volkswagen rabbit custom" 235 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe" 236 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback" 237 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2" 238 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette" 239 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m" 240 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl" 241 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher" 242 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810" 243 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i" 244 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4" 245 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel" 246 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta" 247 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe" 248 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx" 249 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc" 250 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham" 251 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat" 252 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia" 253 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj" 254 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu" 255 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)" 256 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)" 257 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare" 258 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord" 259 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special" 260 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr" 261 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen" 262 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l" 263 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau" 264 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)" 265 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura" 266 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe" 267 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette" 268 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona" 269 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510" 270 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni" 271 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback" 272 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo" 273 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx" 274 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx" 275 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000" 276 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl" 277 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle" 278 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl" 279 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco" 280 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx" 281 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6" 282 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6" 283 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4" 284 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6" 285 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6" 286 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic" 287 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau" 288 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis" 289 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis" 290 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)" 291 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)" 292 | 19.2 8 267.0 125.0 3605. 
15.0 79 1 "chevrolet malibu classic (sw)" 293 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)" 294 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom" 295 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe" 296 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom" 297 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl" 298 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d" 299 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado" 300 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504" 301 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham" 302 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon" 303 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3" 304 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210" 305 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom" 306 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited" 307 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation" 308 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham" 309 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix" 310 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit" 311 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel" 312 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette" 313 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310" 314 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation" 315 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont" 316 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord" 317 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen" 318 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000" 319 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback" 320 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626" 321 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback" 322 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla" 323 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc" 324 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt" 325 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210" 326 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)" 327 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)" 328 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)" 329 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d" 330 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl" 331 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe" 332 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl" 333 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit" 334 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx" 335 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs" 336 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe" 337 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra" 338 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord" 339 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant" 340 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark" 341 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)" 342 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation" 343 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant" 344 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet" 345 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ" 346 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300" 347 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru" 348 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg" 349 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel" 350 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4" 351 | 34.7 4 105.0 63.00 2215. 
14.9 81 1 "plymouth horizon 4" 352 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w" 353 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h" 354 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta" 355 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i" 356 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude" 357 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla" 358 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx" 359 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626" 360 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel" 361 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel" 362 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida" 363 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima" 364 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century" 365 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls" 366 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl" 367 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon" 368 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier" 369 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon" 370 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door" 371 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback" 372 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se" 373 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix" 374 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura" 375 | 23.0 4 151.0 ? 3035. 20.5 82 1 "amc concord dl" 376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l" 377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l" 378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom" 379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser" 380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l" 381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe" 382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord" 383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla" 384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic" 385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)" 386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx" 387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited" 388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)" 389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion" 390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l" 391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt" 392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2" 393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro" 394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl" 395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup" 396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage" 397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger" 398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10" 399 | -------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/linreg-normal_equations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | (C) 2017 Nikolay Manchev, London Machine Learning Study Group 4 | 5 | http://www.meetup.com/London-Machine-Learning-Study-Group/ 6 | 7 | This work is licensed under the Creative Commons Attribution 4.0 International 8 | License. To view a copy of this license, visit 9 | http://creativecommons.org/licenses/by/4.0/. 
10 | """ 11 | 12 | import numpy as np 13 | 14 | import matplotlib.pyplot as plt 15 | 16 | x = np.array([[1,2,3,4,5,6,7,8,9,10]]).T 17 | y = np.array([[0,0,0,0,0,1,1,1,1,1]]).T 18 | 19 | # Normalize the inputs 20 | x = (x - np.mean(x)) / np.std(x) 21 | 22 | # Add ones for w_0 23 | mat_ones = np.ones(shape=(x.shape[0], 2)) 24 | mat_ones[:,1] = x[:,0] 25 | x = mat_ones 26 | 27 | # Normal equations method 28 | xTx = np.linalg.inv(x.T.dot(x)) 29 | xTy = x.T.dot(y) 30 | w = xTx.dot(xTy) 31 | 32 | print("Model parameters:\n") 33 | print(w) 34 | 35 | # Plot X and y 36 | f, ax1 = plt.subplots(1, 1, figsize=(7,7)) 37 | ax1.scatter(x[:,1], y) 38 | 39 | # Make predictions on the training set 40 | y_hat = w[0] + w[1]*x[:,1] 41 | 42 | # Plot the regression line 43 | ax1.plot(x[:,1], y_hat, color='r') 44 | ax1.grid(True) -------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/logreg-hp-origin.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2017 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 8 | """ 9 | 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | def y_hat(x, w): 15 | """ 16 | Logistic regression hypothesis: y_hat = 1 / (1 + e^(-x*w)) 17 | """ 18 | 19 | return (1/(1+np.exp(-x.dot(w)))) 20 | 21 | def gradient_ascent(x, y, w, max_iter, alpha = 0.01): 22 | """ 23 | Performs gradient ascent to optimise L(w). 24 | 25 | Keyword arguments: 26 | 27 | *x* : Numpy array 28 | matrix of independent variables 29 | 30 | *y* : Numpy array 31 | columnar vector of target values 32 | 33 | *w* : Numpy array 34 | initial model parameters 35 | 36 | *max_iter* : int 37 | maximum number of iterations 38 | 39 | *alpha* : int, optional 40 | learning rate (defaults to 0.01) 41 | 42 | Returns: 43 | 44 | *L_hist* : Numpy array 45 | values of L(w) at each iteration 46 | 47 | *w* : Numpy array 48 | estimated model parameters 49 | """ 50 | 51 | L_hist = np.zeros(max_iter) 52 | 53 | print("\nGradient ascent starts.\n") 54 | 55 | for i in range(0, max_iter): 56 | 57 | # Likelihood function 58 | L = np.sum(y.T.dot(np.log(y_hat(x, w))) + (1-y.T).dot(np.log(1-y_hat(x, w)))) 59 | 60 | # Keep L(w) for each iteration (for the final plot) 61 | L_hist[i] = L 62 | 63 | print("Iteration %d, L(w): %f\n" % (i, L)) 64 | 65 | # Compute the gradient and adjust the model parameters 66 | gradient = np.dot(x.T, y - y_hat(x, w) ) 67 | 68 | w = w + alpha * gradient 69 | 70 | print("Gradient ascent finished.\n") 71 | 72 | return (L_hist, w) 73 | 74 | 75 | # Load the data set 76 | # We use Auto MPG from UCI Machine Learning Repository 77 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG 78 | 79 | car_data = np.genfromtxt("auto-mpg.data", usecols=(3, 7)) 80 | car_data = car_data[~np.isnan(car_data).any(axis=1)] 81 | 82 | # Remove the data for Japan and recode US and Europe as 0 and 1 83 | car_data = car_data[car_data[:,1]!=3] 84 | car_data[:,1][car_data[:,1] == 1] = 0 85 | car_data[:,1][car_data[:,1] == 2] = 1 86 | 87 | # Assign Horsepower attribute to x and Origin to y 88 | x = car_data[:,0] 89 | y = car_data[:,1] 90 | 91 | x = np.array([x]).T 92 | y = np.array([y]).T 93 | 94 | # Normalize the inputs 95 | hp_mean = np.mean(x) 96 | hp_std 
= np.std(x) 97 | x = (x - hp_mean) / hp_std 98 | 99 | # Initialise w with ones 100 | m,n=np.shape(x) 101 | w = np.array([np.ones(n)]).T 102 | 103 | # Perform gradient ascent 104 | (l_hist, w) = gradient_ascent(x, y, w, 10) 105 | 106 | print("Model parameters:\n") 107 | print(w) 108 | 109 | # Plot X and y 110 | f, (ax1,ax2) = plt.subplots(1, 2, figsize=(7,7)) 111 | ax1.scatter(x, y) 112 | 113 | # Plot the decision boundary 114 | x = np.arange(-5, 5, 1)[np.newaxis].T 115 | ax1.plot(x, y_hat(x, w), color='r') 116 | ax1.grid(True) 117 | 118 | # Plot the change of L(w) 119 | x = np.arange(1,l_hist.size + 1) 120 | y = l_hist 121 | 122 | ax2.plot(x, l_hist) 123 | ax2.grid(True) 124 | 125 | # To make predictions use 126 | # y_hat(np.array([(...hp_input...-hp_mean)/(hp_std)]), w) -------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/logreg_gradient.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2017 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 8 | """ 9 | 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | def y_hat(x, w): 15 | """ 16 | Logistic regression hypothesis: y_hat = 1 / (1 + e^(-x*w)) 17 | """ 18 | 19 | return (1/(1+np.exp(-x.dot(w)))) 20 | 21 | def gradient_ascent(x, y, w, max_iter, alpha = 0.01): 22 | """ 23 | Performs gradient ascent to optimise L(w). 24 | 25 | Keyword arguments: 26 | 27 | *x* : Numpy array 28 | matrix of independent variables 29 | 30 | *y* : Numpy array 31 | columnar vector of target values 32 | 33 | *w* : Numpy array 34 | initial model parameters 35 | 36 | *max_iter* : int 37 | maximum number of iterations 38 | 39 | *alpha* : int, optional 40 | learning rate (defaults to 0.01) 41 | 42 | Returns: 43 | 44 | *L_hist* : Numpy array 45 | values of L(w) at each iteration 46 | 47 | *w* : Numpy array 48 | estimated model parameters 49 | """ 50 | 51 | L_hist = np.zeros(max_iter) 52 | 53 | print("\nGradient ascent starts.\n") 54 | 55 | for i in range(0, max_iter): 56 | 57 | # Likelihood function 58 | L = np.sum(y.T.dot(np.log(y_hat(x, w))) + (1-y.T).dot(np.log(1-y_hat(x, w)))) 59 | 60 | # Keep L(w) for each iteration (for the final plot) 61 | L_hist[i] = L 62 | 63 | print("Iteration %d, L(w): %f\n" % (i, L)) 64 | 65 | # Compute the gradient and adjust the model parameters 66 | gradient = np.dot(x.T, y - y_hat(x, w) ) 67 | 68 | w = w + alpha * gradient 69 | 70 | print("Gradient ascent finished.\n") 71 | 72 | return (L_hist, w) 73 | 74 | 75 | # Load the data set 76 | x = np.array([[1,2,3,4,5,6,7,8,9,10]]).T 77 | y = np.array([[0,0,0,0,0,1,1,1,1,1]]).T 78 | 79 | # Normalize the inputs 80 | x = (x - np.mean(x)) / np.std(x) 81 | 82 | # Initialise w with ones 83 | m,n=np.shape(x) 84 | w = np.array([np.ones(n)]).T 85 | 86 | # Perform gradient ascent for 25 iterations 87 | (l_hist, w) = gradient_ascent(x, y, w, 25) 88 | 89 | print("Model parameters:\n") 90 | print(w) 91 | 92 | # Plot X and y 93 | f, (ax1,ax2) = plt.subplots(1, 2, figsize=(7,7)) 94 | ax1.scatter(x, y) 95 | 96 | # Plot the decision boundary 97 | ax1.plot(x, y_hat(x, w), color='r') 98 | ax1.grid(True) 99 | 100 | # Plot the change of L(w) 101 | x = np.arange(1,l_hist.size + 1) 102 | y = l_hist 103 | 104 | 
ax2.plot(x, l_hist) 105 | ax2.grid(True) 106 | 107 | -------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/logreg_gradient_2_variables.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2017 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 8 | """ 9 | 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | def y_hat(x, w): 15 | """ 16 | Logistic regression hypothesis: y_hat = 1 / (1 + e^(-x*w)) 17 | """ 18 | 19 | return (1/(1+np.exp(-x.dot(w)))) 20 | 21 | def gradient_ascent(x, y, w, max_iter, alpha = 0.01): 22 | """ 23 | Performs gradient ascent to optimise L(w). 24 | 25 | Keyword arguments: 26 | 27 | *x* : Numpy array 28 | matrix of independent variables 29 | 30 | *y* : Numpy array 31 | columnar vector of target values 32 | 33 | *w* : Numpy array 34 | initial model parameters 35 | 36 | *max_iter* : int 37 | maximum number of iterations 38 | 39 | *alpha* : float, optional 40 | learning rate (defaults to 0.01) 41 | 42 | Returns: 43 | 44 | *L_hist* : Numpy array 45 | values of L(w) at each iteration 46 | 47 | *w* : Numpy array 48 | estimated model parameters 49 | """ 50 | 51 | L_hist = np.zeros(max_iter) 52 | 53 | print("\nGradient ascent starts.\n") 54 | 55 | for i in range(0, max_iter): 56 | 57 | # Log-likelihood function 58 | L = np.sum(y.T.dot(np.log(y_hat(x, w))) + (1-y.T).dot(np.log(1-y_hat(x, w)))) 59 | 60 | # Keep L(w) for each iteration (for the final plot) 61 | L_hist[i] = L 62 | 63 | print("Iteration %d, L(w): %f\n" % (i, L)) 64 | 65 | # Compute the gradient and adjust the model parameters 66 | gradient = np.dot(x.T, y - y_hat(x, w)) 67 | 68 | w = w + alpha * gradient 69 | 70 | print("Gradient ascent finished.\n") 71 | 72 | return (L_hist, w) 73 | 74 | 75 | # Load the data set 76 | # We use Auto MPG from UCI Machine Learning Repository 77 | # https://archive.ics.uci.edu/ml/datasets/Auto+MPG 78 | 79 | car_data = np.genfromtxt("auto-mpg.data", usecols=(4, 3, 7)) 80 | car_data = car_data[~np.isnan(car_data).any(axis=1)] 81 | 82 | # Remove the data for Japan and recode US and Europe as 0 and 1 (origin is column 2 after usecols) 83 | car_data = car_data[car_data[:,2]!=3] 84 | car_data[:,2][car_data[:,2] == 1] = 0 85 | car_data[:,2][car_data[:,2] == 2] = 1 86 | 87 | # Assign Weight and Horsepower attributes to x and Origin to y 88 | x = car_data[:,[0,1]] 89 | y = car_data[:,2] 90 | 91 | y = np.array([y]).T 92 | 93 | # Normalize the inputs 94 | weight_mean = np.mean(x[:,0]) 95 | weight_std = np.std(x[:,0]) 96 | hp_mean = np.mean(x[:,1]) 97 | hp_std = np.std(x[:,1]) 98 | x[:,0] = (x[:,0] - weight_mean) / weight_std 99 | x[:,1] = (x[:,1] - hp_mean) / hp_std 100 | 101 | # Initialise w with ones 102 | m,n=np.shape(x) 103 | w = np.array([np.ones(n)]).T 104 | 105 | # Perform gradient ascent 106 | (l_hist, w) = gradient_ascent(x, y, w, 50) 107 | 108 | print("Model parameters:\n") 109 | print(w) 110 | 111 | # Plot the data points and a gradient for the probability 112 | # given by y_hat() 113 | f, ax1 = plt.subplots(1, 1, figsize=(7,7)) 114 | 115 | x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1 116 | y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1 117 | 118 | xx, yy = np.meshgrid(np.arange(x_min, x_max, .1), 119 | np.arange(y_min, y_max, .1)) 120 | 121 | Z = y_hat(np.c_[xx.ravel(), yy.ravel()], w) 122 | Z = Z.reshape(xx.shape) 123 | 124 | ax1.contourf(xx, yy, Z, cmap=plt.cm.Blues) 125 | ax1.scatter(x[:,0], x[:,1], c=y, cmap=plt.cm.bwr) 126 | 127 | # To make predictions use 128 | # y_hat(np.array([((...weight_input...)-weight_mean)/weight_std,((...hp_input...)-hp_mean)/hp_std]), w) 129 |
-------------------------------------------------------------------------------- /Lecture 5 - Probabilities and Logistic Regression/logreg_gradient_2_variables_iris.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2017 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 8 | """ 9 | 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | from sklearn import datasets 15 | 16 | 17 | def y_hat(x, w): 18 | """ 19 | Logistic regression hypothesis: y_hat = 1 / (1 + e^(-x*w)) 20 | """ 21 | 22 | return (1/(1+np.exp(-x.dot(w)))) 23 | 24 | def gradient_ascent(x, y, w, max_iter, alpha = 0.01): 25 | """ 26 | Performs gradient ascent to optimise L(w). 27 | 28 | Keyword arguments: 29 | 30 | *x* : Numpy array 31 | matrix of independent variables 32 | 33 | *y* : Numpy array 34 | columnar vector of target values 35 | 36 | *w* : Numpy array 37 | initial model parameters 38 | 39 | *max_iter* : int 40 | maximum number of iterations 41 | 42 | *alpha* : float, optional 43 | learning rate (defaults to 0.01) 44 | 45 | Returns: 46 | 47 | *L_hist* : Numpy array 48 | values of L(w) at each iteration 49 | 50 | *w* : Numpy array 51 | estimated model parameters 52 | """ 53 | 54 | L_hist = np.zeros(max_iter) 55 | 56 | print("\nGradient ascent starts.\n") 57 | 58 | for i in range(0, max_iter): 59 | 60 | # Log-likelihood function 61 | L = np.sum(y.T.dot(np.log(y_hat(x, w))) + (1-y.T).dot(np.log(1-y_hat(x, w)))) 62 | 63 | # Keep L(w) for each iteration (for the final plot) 64 | L_hist[i] = L 65 | 66 | print("Iteration %d, L(w): %f\n" % (i, L)) 67 | 68 | # Compute the gradient and adjust the model parameters 69 | gradient = np.dot(x.T, y - y_hat(x, w)) 70 | 71 | w = w + alpha * gradient 72 | 73 | print("Gradient ascent finished.\n") 74 | 75 | return (L_hist, w) 76 | 77 | # Load the IRIS dataset 78 | iris = datasets.load_iris() 79 | x = iris.data[:99, :2] # we only take the first two features.
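# --- Added note (editorial sketch, not part of the original lecture script) ---
# iris.data is ordered by class, so the first 99 rows taken above contain only
# the first two classes (setosa and versicolor) and the problem stays binary.
# As an optional cross-check of the hand-rolled gradient ascent below, a
# minimal sketch using scikit-learn (already a dependency of this script) is:
#
#   from sklearn.linear_model import LogisticRegression
#   ref = LogisticRegression()
#   ref.fit(iris.data[:99, :2], iris.target[:99])
#   print(ref.coef_, ref.intercept_)
#
# LogisticRegression fits an intercept and applies L2 regularisation by
# default, while the code below fits w without a bias term on standardised
# inputs, so only the signs and rough relative magnitudes of the two weights
# are directly comparable.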
80 | y = iris.target[:99] # assign the class variable to y 81 | 82 | y = np.array([y]).T 83 | 84 | # Normalize the inputs 85 | x[:,0] = (x[:,0] - np.mean(x[:,0])) / np.std(x[:,0]) 86 | x[:,1] = (x[:,1] - np.mean(x[:,1])) / np.std(x[:,1]) 87 | 88 | # Initialise w with ones 89 | m,n=np.shape(x) 90 | w = np.array([np.ones(n)]).T 91 | 92 | # Perform gradient ascent 93 | (l_hist, w) = gradient_ascent(x, y, w, 25) 94 | 95 | print("Model parameters:\n") 96 | print(w) 97 | 98 | # Plot the classes and the probability given by y_hat() 99 | f, ax1 = plt.subplots(1, 1, figsize=(7,7)) 100 | 101 | x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1 102 | y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1 103 | 104 | xx, yy = np.meshgrid(np.arange(x_min, x_max, .1), 105 | np.arange(y_min, y_max, .1)) 106 | 107 | Z = y_hat(np.c_[xx.ravel(), yy.ravel()], w) 108 | Z = Z.reshape(xx.shape) 109 | 110 | ax1.contourf(xx, yy, Z, cmap=plt.cm.Blues) 111 | ax1.scatter(x[:,0], x[:,1], c=y, cmap=plt.cm.bwr) 112 | 113 | -------------------------------------------------------------------------------- /Lecture 6 - Naive Bayes/Naive_Bayes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nmanchev/MachineLearningStudyGroup/092f642d888f3dfb105aa8768d4a6927c93a4278/Lecture 6 - Naive Bayes/Naive_Bayes.pdf -------------------------------------------------------------------------------- /Lecture 6 - Naive Bayes/README.md: -------------------------------------------------------------------------------- 1 | ## Naive Bayes 2 | 3 | Code examples used in Lecture 6 4 | 5 | * auto-mpg.data - The [Auto MPG](https://archive.ics.uci.edu/ml/datasets/Auto+MPG) from UCI Machine Learning Repository 6 | * gender_height_weight.csv - Data from [National Longitudinal Youth Survey](http://www.bls.gov/nls/nlsy97.htm), Bureau of Labor Statistics, United States Department of Labor 7 | * naive_bayes_mf.py - Naive Bayes classification using the National Longitudinal Youth Survey 8 | * naive_bayes_autompg.py - Naive Bayes classification using the Auto MPG dataset (three target classes) 9 | 10 | This repository contains materials from the London Machine Learning Study Group Meetups 11 | 12 | The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group). 13 | 14 | (C) 2017 Nikolay Manchev, London Machine Learning Study Group 15 | 16 | This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0). 17 | -------------------------------------------------------------------------------- /Lecture 6 - Naive Bayes/auto-mpg.data: -------------------------------------------------------------------------------- 1 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" 2 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" 3 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" 4 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" 5 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" 6 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500" 7 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala" 8 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii" 9 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina" 10 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl" 11 | 15.0 8 383.0 170.0 3563. 
10.0 70 1 "dodge challenger se" 12 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340" 13 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo" 14 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)" 15 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii" 16 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster" 17 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet" 18 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick" 19 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510" 20 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan" 21 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504" 22 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls" 23 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e" 24 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002" 25 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin" 26 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250" 27 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20" 28 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200" 29 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d" 30 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510" 31 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300" 32 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona" 33 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto" 34 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin" 35 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom" 36 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu" 37 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500" 38 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador" 39 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala" 40 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham" 41 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500" 42 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii" 43 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)" 44 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)" 45 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)" 46 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)" 47 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)" 48 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird" 49 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang" 50 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000" 51 | 28.0 4 116.0 90.00 2123. 14.0 71 2 "opel 1900" 52 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304" 53 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b" 54 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200" 55 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200" 56 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111" 57 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket" 58 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop" 59 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop" 60 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3" 61 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega" 62 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout" 63 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala" 64 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina" 65 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii" 66 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500" 67 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst" 68 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis" 69 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom" 70 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale" 71 | 13.0 8 400.0 190.0 4422. 
12.5 72 1 "chrysler newport royal" 72 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe" 73 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)" 74 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)" 75 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)" 76 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)" 77 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)" 78 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)" 79 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)" 80 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)" 81 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)" 82 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)" 83 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)" 84 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)" 85 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)" 86 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350" 87 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador" 88 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu" 89 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino" 90 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom" 91 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham" 92 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic" 93 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd" 94 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan" 95 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham" 96 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom" 97 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham" 98 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant" 99 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom" 100 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet" 101 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick" 102 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster" 103 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle" 104 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala" 105 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country" 106 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb" 107 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser" 108 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin" 109 | 20.0 4 97.00 88.00 2279. 19.0 73 3 "toyota carina" 110 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega" 111 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610" 112 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3" 113 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto" 114 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6" 115 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe" 116 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s" 117 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix" 118 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128" 119 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta" 120 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls" 121 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea" 122 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom" 123 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le" 124 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii" 125 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega" 126 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster" 127 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick" 128 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet" 129 | 15.0 6 250.0 100.0 3336. 
17.0 74 1 "chevrolet nova" 130 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210" 131 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto" 132 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200" 133 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega" 134 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic" 135 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador" 136 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring" 137 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino" 138 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)" 139 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)" 140 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)" 141 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)" 142 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox" 143 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher" 144 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta" 145 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona" 146 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710" 147 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt" 148 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128" 149 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc" 150 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic" 151 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru" 152 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9" 153 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom" 154 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova" 155 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch" 156 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick" 157 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina" 158 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air" 159 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury" 160 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd" 161 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century" 162 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu" 163 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador" 164 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury" 165 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk" 166 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2" 167 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii" 168 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla" 169 | 23.0 4 140.0 83.00 2639. 17.0 75 1 "ford pinto" 170 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin" 171 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro" 172 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona" 173 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher" 174 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710" 175 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto" 176 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit" 177 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer" 178 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls" 179 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504" 180 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl" 181 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le" 182 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc" 183 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131" 184 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900" 185 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii" 186 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt" 187 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl" 188 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic" 189 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham" 190 | 15.5 8 304.0 120.0 3962. 
13.9 76 1 "amc matador" 191 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino" 192 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant" 193 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova" 194 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick" 195 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet" 196 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette" 197 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody" 198 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit" 199 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic" 200 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se" 201 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia" 202 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj" 203 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l" 204 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit" 205 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210" 206 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla" 207 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto" 208 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245" 209 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8" 210 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504" 211 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii" 212 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s" 213 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville" 214 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10" 215 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108" 216 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100" 217 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc" 218 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe" 219 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl" 220 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs" 221 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback" 222 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic" 223 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme" 224 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham" 225 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham" 226 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours" 227 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark" 228 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom" 229 | 18.5 6 250.0 98.00 3525. 19.0 77 1 "ford granada" 230 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj" 231 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau" 232 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba" 233 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird" 234 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom" 235 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe" 236 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback" 237 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2" 238 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette" 239 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m" 240 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl" 241 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher" 242 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810" 243 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i" 244 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4" 245 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel" 246 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta" 247 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe" 248 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx" 249 | 36.1 4 91.00 60.00 1800. 
16.4 78 3 "honda civic cvcc" 250 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham" 251 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat" 252 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia" 253 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj" 254 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu" 255 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)" 256 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)" 257 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare" 258 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord" 259 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special" 260 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr" 261 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen" 262 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l" 263 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau" 264 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)" 265 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura" 266 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe" 267 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette" 268 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona" 269 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510" 270 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni" 271 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback" 272 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo" 273 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx" 274 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx" 275 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000" 276 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl" 277 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle" 278 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl" 279 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco" 280 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx" 281 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6" 282 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6" 283 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4" 284 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6" 285 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6" 286 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic" 287 | 17.6 8 302.0 129.0 3725. 13.4 79 1 "ford ltd landau" 288 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis" 289 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis" 290 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)" 291 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)" 292 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)" 293 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)" 294 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom" 295 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe" 296 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom" 297 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl" 298 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d" 299 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado" 300 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504" 301 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham" 302 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon" 303 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3" 304 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210" 305 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom" 306 | 28.4 4 151.0 90.00 2670. 
16.0 79 1 "buick skylark limited" 307 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation" 308 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham" 309 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix" 310 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit" 311 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel" 312 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette" 313 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310" 314 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation" 315 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont" 316 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord" 317 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen" 318 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000" 319 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback" 320 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626" 321 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback" 322 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla" 323 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc" 324 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt" 325 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210" 326 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)" 327 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)" 328 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)" 329 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d" 330 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl" 331 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe" 332 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl" 333 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit" 334 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx" 335 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs" 336 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe" 337 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra" 338 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord" 339 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant" 340 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark" 341 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)" 342 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation" 343 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant" 344 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet" 345 | 39.0 4 86.00 64.00 1875. 16.4 81 1 "plymouth champ" 346 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300" 347 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru" 348 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg" 349 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel" 350 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4" 351 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4" 352 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w" 353 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h" 354 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta" 355 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i" 356 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude" 357 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla" 358 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx" 359 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626" 360 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel" 361 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel" 362 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida" 363 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima" 364 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century" 365 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls" 366 | 20.2 6 200.0 88.00 3060. 
17.1 81 1 "ford granada gl" 367 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon" 368 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier" 369 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon" 370 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door" 371 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback" 372 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se" 373 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix" 374 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura" 375 | 23.0 4 151.0 ? 3035. 20.5 82 1 "amc concord dl" 376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l" 377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l" 378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom" 379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser" 380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l" 381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe" 382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord" 383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla" 384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic" 385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)" 386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx" 387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited" 388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)" 389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion" 390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l" 391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt" 392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2" 393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro" 394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl" 395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup" 396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage" 397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger" 398 | 31.0 4 119.0 82.00 2720. 19.4 82 1 "chevy s-10" 399 | -------------------------------------------------------------------------------- /Lecture 6 - Naive Bayes/naive_bayes_autompg.py: -------------------------------------------------------------------------------- 1 | """ 2 | (C) 2017 Nikolay Manchev 3 | [London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/) 4 | 5 | This work is licensed under the Creative Commons Attribution 4.0 International 6 | License. To view a copy of this license, visit 7 | http://creativecommons.org/licenses/by/4.0/. 8 | """ 9 | 10 | import numpy as np 11 | 12 | from sklearn.model_selection import train_test_split 13 | from sklearn.metrics import accuracy_score 14 | from sklearn.metrics import confusion_matrix 15 | 16 | from math import sqrt 17 | from math import pi 18 | from math import exp 19 | 20 | 21 | def getPriors(labels): 22 | """ 23 | Get the class priors by calculating the class probability from the 24 | provided set. The prior is computed as 25 | 26 | (prior for class A) = (number of class A samples) / (total number of samples) 27 | 28 | Parameters 29 | ---------- 30 | labels : target class values 31 | 32 | Returns 33 | ------- 34 | priors : A dictionary with the class priors. 35 | E.g. 
{ ClassA: prior, ClassB: prior, ...} 36 | """ 37 | priors = {} 38 | for className in labels: 39 | N = labels.size 40 | class_occurrence = (labels == className).sum() 41 | priors[className] = class_occurrence/N 42 | return priors 43 | 44 | 45 | def fit(features, labels): 46 | """ 47 | Fits coefficients for a Gaussian Naive Bayes. This method computes and 48 | returns the in-class mean and stadnard deviation for each feature in 49 | the training vectors. 50 | 51 | Parameters 52 | ---------- 53 | featires : training vectors 54 | labels : target class values 55 | 56 | Returns 57 | ------- 58 | priors : A dictionary with with the in-class mean/std for each attribute 59 | 60 | {ClassA: [(attribute1_mean, attribute1_std]), 61 | (attribute2_mean, attribute2_std],...) 62 | ClassB: [(attribute1_mean, attribute1_std]), 63 | (attribute2_mean, attribute2_std],... 64 | ...} 65 | """ 66 | # Get the unique classes from the sample 67 | uniqueClasses = np.unique(labels) 68 | coeffs = {} 69 | # Loop over the unique classes to compute the mean/std statistics 70 | for className in uniqueClasses: 71 | featuresInClass = features[labels == className] 72 | # Compute the mean/std for each input feature 73 | statsInClass = [(np.mean(feature), np.std(feature)) for feature in zip(*featuresInClass)] 74 | coeffs[className] = statsInClass 75 | 76 | return coeffs 77 | 78 | def getLikelihood(x, featureIndex, model, className): 79 | """ 80 | Computes the likelihood (i.e. the probability of the evidence given the 81 | model parameters) for a single value/class combination. The likelihood 82 | is computed using a Gaussian probability desnity function 83 | 84 | f(x|mu, sigma) = 85 | 1 / sqrt( 2 * pi * sigma^2 ) * exp ( - ( x-mu )^2 / (2 * sigma^2) ) 86 | 87 | Parameters 88 | ---------- 89 | x : observation value 90 | featureIndex : position of this attribute in the input vector. If the 91 | model was fitted against an N-dimenisonal input vector 92 | [x_0, x_1, ..., x_N], featureIndex should point to the 93 | position of x in the original vector (e.g. 0,1,...,N) 94 | model : a dictionary with with the in-class mean/std for each 95 | attribute. See the fit(features, labels) method 96 | className : class to asses the observation against 97 | 98 | 99 | Returns 100 | ------- 101 | f : the (x|className) likelihood based on the Guassian PDF 102 | """ 103 | classStats = model[className] 104 | mean = classStats[featureIndex][0] 105 | std = classStats[featureIndex][1] 106 | f = (1/(sqrt(2*pi*pow(std,2))) * exp(-pow((x-mean),2)/(2*pow(std,2)))) 107 | return f 108 | 109 | def getPosterior(x, model, priors): 110 | """ 111 | Computes the posterior using a Gaussian Naive Bayes. 112 | 113 | P(class|x = [x_1, x_2, ..., x_N]) = likelihood(x|class) * prior(class) 114 | 115 | We use the naive assumption of conditional independence between the features, 116 | which means that 117 | 118 | P([x_1, x_2, ..., x_N]|class) = P(x_1|class) * P(x_2|class) * ... * P(x_N|class) 119 | 120 | Parameters 121 | ---------- 122 | x : input vector 123 | model : a dictionary with with the in-class mean/std for each 124 | attribute. See the fit(features, labels) method 125 | priors : a dictionary with with the in-class mean/std for each attribute 126 | 127 | 128 | Returns 129 | ------- 130 | p : the posterior for all classes in priors given the input vector 131 | """ 132 | posteriors = {} 133 | # Loop over all observed classes 134 | for className in priors: 135 | # Compute p(x_1|class) * p(x_2|class) * ... 
def getPosterior(x, model, priors):
    """
    Computes the posterior using a Gaussian Naive Bayes.

    P(class|x = [x_1, x_2, ..., x_N]) = likelihood(x|class) * prior(class)

    We use the naive assumption of conditional independence between the features,
    which means that

    P([x_1, x_2, ..., x_N]|class) = P(x_1|class) * P(x_2|class) * ... * P(x_N|class)

    Parameters
    ----------
    x      : input vector
    model  : a dictionary with the in-class mean/std for each
             attribute. See the fit(features, labels) method
    priors : a dictionary with the class priors. See the getPriors(labels)
             method

    Returns
    -------
    posteriors : a dictionary with the unnormalised posterior
                 (likelihood * prior) for each class, given the input vector
    """
    posteriors = {}
    # Loop over all observed classes
    for className in priors:
        # Compute p(x_1|class) * p(x_2|class) * ... * p(x_N|class) using the
        # likelihood function, then multiply by the prior to get
        # p(class|x = [x_1, x_2, ..., x_N])
        p = 1
        for featureIndex in range(x.size):
            p = p * getLikelihood(x[featureIndex], featureIndex, model, className)
        # Multiply by the prior once, outside the loop. Multiplying inside
        # the loop would raise the prior to the power of the number of
        # features and skew the posterior.
        p = p * priors[className]
        posteriors[className] = p
    return posteriors

def classify(x, model, priors):
    """
    This method uses Maximum a posteriori (MAP) estimation to make a class
    prediction on an unseen observation.

    Class_MAP = argmax_c posterior(c|x) = argmax_c likelihood(x|c) * prior(c)

    Parameters
    ----------
    x      : input vector
    model  : a dictionary with the in-class mean/std for each
             attribute. See the fit(features, labels) method
    priors : a dictionary with the class priors. See the getPriors(labels)
             method

    Returns
    -------
    The name of the class that maximizes the posterior value
    """
    posteriors = getPosterior(x, model, priors)
    return max(posteriors, key=lambda key: posteriors[key])


# Load the data set
# We use Auto MPG from the UCI Machine Learning Repository
# https://archive.ics.uci.edu/ml/datasets/Auto+MPG

car_data = np.genfromtxt("auto-mpg.data", usecols=(4, 3, 7))
car_data = car_data[~np.isnan(car_data).any(axis=1)]
features = car_data[:,[0,1]]
labels = car_data[:,2]

# Split the data into test/train subsets
features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                             labels, test_size=0.1,
                                                                             random_state = 100)
# Fit the model
priors = getPriors(labels_train)
model = fit(features_train, labels_train)

# Make predictions on the test data
predictions = [classify(x, model, priors) for x in features_test]

# Measure accuracy
print("Prediction accuracy: %.2f\n" % accuracy_score(labels_test, predictions))
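# Optional check (a sketch; confusion_matrix is imported above but unused in
# this script - the classes here are the three origin codes from the data set):
#
#   print(confusion_matrix(labels_test, predictions))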
--------------------------------------------------------------------------------
/Lecture 6 - Naive Bayes/naive_bayes_mf.py:
--------------------------------------------------------------------------------
"""
(C) 2017 Nikolay Manchev
[London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/)

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from math import sqrt
from math import pi
from math import exp


def getPriors(labels):
    """
    Get the class priors by calculating the class probability from the
    provided set. The prior is computed as

    (prior for class A) = (number of class A samples) / (total number of samples)

    Parameters
    ----------
    labels : target class values

    Returns
    -------
    priors : A dictionary with the class priors. E.g.

             { ClassA: prior, ClassB: prior, ...}
    """
    priors = {}
    N = labels.size
    # Iterate over the unique classes only - looping over every label would
    # needlessly recompute the same prior once per sample
    for className in np.unique(labels):
        class_occurrence = (labels == className).sum()
        priors[className] = class_occurrence / N
    return priors


def fit(features, labels):
    """
    Fits the coefficients for a Gaussian Naive Bayes model. This method
    computes and returns the in-class mean and standard deviation for each
    feature in the training vectors.

    Parameters
    ----------
    features : training vectors
    labels   : target class values

    Returns
    -------
    coeffs : A dictionary with the in-class mean/std for each attribute

             {ClassA: [(attribute1_mean, attribute1_std),
                       (attribute2_mean, attribute2_std), ...],
              ClassB: [(attribute1_mean, attribute1_std),
                       (attribute2_mean, attribute2_std), ...],
              ...}
    """
    # Get the unique classes from the sample
    uniqueClasses = np.unique(labels)
    coeffs = {}
    # Loop over the unique classes to compute the mean/std statistics
    for className in uniqueClasses:
        featuresInClass = features[labels == className]
        # Compute the mean/std for each input feature
        statsInClass = [(np.mean(feature), np.std(feature)) for feature in zip(*featuresInClass)]
        coeffs[className] = statsInClass

    return coeffs

def getLikelihood(x, featureIndex, model, className):
    """
    Computes the likelihood (i.e. the probability of the evidence given the
    model parameters) for a single value/class combination. The likelihood
    is computed using a Gaussian probability density function

    f(x|mu, sigma) =
        1 / sqrt( 2 * pi * sigma^2 ) * exp ( - ( x-mu )^2 / (2 * sigma^2) )

    Parameters
    ----------
    x            : observation value
    featureIndex : position of this attribute in the input vector. If the
                   model was fitted against an N-dimensional input vector
                   [x_0, x_1, ..., x_N], featureIndex should point to the
                   position of x in the original vector (e.g. 0,1,...,N)
    model        : a dictionary with the in-class mean/std for each
                   attribute. See the fit(features, labels) method
    className    : class to assess the observation against

    Returns
    -------
    f : the (x|className) likelihood based on the Gaussian PDF
    """
    classStats = model[className]
    mean = classStats[featureIndex][0]
    std = classStats[featureIndex][1]
    f = (1/(sqrt(2*pi*pow(std,2))) * exp(-pow((x-mean),2)/(2*pow(std,2))))
    return f

def getPosterior(x, model, priors):
    """
    Computes the posterior using a Gaussian Naive Bayes.

    P(class|x = [x_1, x_2, ..., x_N]) = likelihood(x|class) * prior(class)

    We use the naive assumption of conditional independence between the features,
    which means that

    P([x_1, x_2, ..., x_N]|class) = P(x_1|class) * P(x_2|class) * ... * P(x_N|class)

    Parameters
    ----------
    x      : input vector
    model  : a dictionary with the in-class mean/std for each
             attribute. See the fit(features, labels) method
    priors : a dictionary with the class priors. See the getPriors(labels)
             method

    Returns
    -------
    posteriors : a dictionary with the unnormalised posterior
                 (likelihood * prior) for each class, given the input vector
    """
    posteriors = {}
    # Loop over all observed classes
    for className in priors:
        # Compute p(x_1|class) * p(x_2|class) * ... * p(x_N|class) using the
        # likelihood function, then multiply by the prior to get
        # p(class|x = [x_1, x_2, ..., x_N])
        p = 1
        for featureIndex in range(x.size):
            p = p * getLikelihood(x[featureIndex], featureIndex, model, className)
        # Multiply by the prior once, outside the loop. Multiplying inside
        # the loop would raise the prior to the power of the number of
        # features and skew the posterior.
        p = p * priors[className]
        posteriors[className] = p
    return posteriors

def classify(x, model, priors):
    """
    This method uses Maximum a posteriori (MAP) estimation to make a class
    prediction on an unseen observation.

    Class_MAP = argmax_c posterior(c|x) = argmax_c likelihood(x|c) * prior(c)

    Parameters
    ----------
    x      : input vector
    model  : a dictionary with the in-class mean/std for each
             attribute. See the fit(features, labels) method
    priors : a dictionary with the class priors. See the getPriors(labels)
             method

    Returns
    -------
    The name of the class that maximizes the posterior value
    """
    posteriors = getPosterior(x, model, priors)
    return max(posteriors, key=lambda key: posteriors[key])


# Data from the National Longitudinal Youth Survey, Bureau of Labor
# Statistics, United States Department of Labor
# http://www.bls.gov/nls/nlsy97.htm
data = np.genfromtxt("gender_height_weight.csv", delimiter=",", skip_header=1)

# Assign [height(inches), weight(lbs)] to features and [gender] to labels
features = data[:,[1,2]]
labels = data[:,0]

# Split the data into test/train subsets
features_train, features_test, labels_train, labels_test = train_test_split(features,
                                                                             labels, test_size=0.1,
                                                                             random_state = 100)
# Fit the model
priors = getPriors(labels_train)
model = fit(features_train, labels_train)
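# Cross-check sketch (an aside, not part of the lecture code; assumes
# scikit-learn is available - GaussianNB is the library equivalent of the
# hand-rolled functions above):
#
#   from sklearn.naive_bayes import GaussianNB
#   clf = GaussianNB().fit(features_train, labels_train)
#   print(clf.score(features_test, labels_test))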
# To see the likelihood for a certain attribute per class we can do:
# x = np.array([69])
# getLikelihood(x, 0, model, 0) <- likelihood for class 0 : 0.1171193286800898
# getLikelihood(x, 0, model, 1) <- likelihood for class 1 : 0.04168934664951199

# Make predictions on the test data
predictions = [classify(x, model, priors) for x in features_test]

# Measure accuracy
print("Prediction accuracy: %.2f\n" % accuracy_score(labels_test, predictions))

# Print confusion matrix
print("Confusion matrix:\n")
print(confusion_matrix(labels_test, predictions))
--------------------------------------------------------------------------------
/Lecture 7 - Text Classification/README.md:
--------------------------------------------------------------------------------
## Text Classification

Code examples used in Lecture 7

* data/SMSSpamCollection - The [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/sms+spam+collection) from the UCI Machine Learning Repository
* transform.py - Generates an L2-normalised tf-idf matrix from the SMS dataset
* predict.py - Uses Multinomial Naive Bayes to predict the ham/spam class from the stored tf-idf matrix (a scikit-learn sketch of the same pipeline is shown below)
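For orientation only, here is a minimal sketch of the same two-step pipeline built on scikit-learn's `TfidfVectorizer` (an assumed comparison, not how the lecture code works - transform.py builds the tf-idf matrix by hand, and scikit-learn's idf weighting differs in detail, e.g. smoothing and natural logarithms):

```python
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

data = pd.read_csv("data/SMSSpamCollection", sep="\t", names=["Label", "Text"])

# L2-normalised tf-idf matrix (norm="l2" is the default)
X = TfidfVectorizer(norm="l2").fit_transform(data["Text"])
y = (data["Label"] == "spam").astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = MultinomialNB().fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))
```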

This repository contains materials from the London Machine Learning Study Group Meetups

The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group).

Lecture recordings are available on [YouTube](https://www.youtube.com/c/NikolayManchev)

(C) 2017 Nikolay Manchev, London Machine Learning Study Group

This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0).
--------------------------------------------------------------------------------
/Lecture 7 - Text Classification/predict.py:
--------------------------------------------------------------------------------
"""
(C) 2017 Nikolay Manchev
[London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/)

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

import numpy as np

# Import scipy.io explicitly - a bare "import scipy" does not make the
# scipy.io submodule available
import scipy.io

import timeit

np.random.seed(1234)

labels = np.fromfile("data/labels.csv", sep='\n')

tf_idf_matrix = scipy.io.mmread("data/training.mtx").todense()

X_train, X_test, y_train, y_test = train_test_split(tf_idf_matrix, labels, test_size=0.20, random_state=1234)

start_time = timeit.default_timer()

clf = MultinomialNB()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Elapsed time: %f sec" % (timeit.default_timer() - start_time))

print(accuracy_score(y_test, y_pred))
--------------------------------------------------------------------------------
/Lecture 7 - Text Classification/transform.py:
--------------------------------------------------------------------------------
"""
(C) 2017 Nikolay Manchev
[London Machine Learning Study Group](http://www.meetup.com/London-Machine-Learning-Study-Group/members/)

This work is licensed under the Creative Commons Attribution 4.0 International
License. To view a copy of this license, visit
http://creativecommons.org/licenses/by/4.0/.
"""

import math
import os
import pandas as pd
import numpy as np

import scipy.sparse
import scipy.io

import nltk.data
import nltk.tokenize
import nltk.stem

from nltk.corpus import stopwords

from collections import Counter

def extract_words(text, stemmer = None, remove_stopwords = False):
    """
    Extracts all words from a document. The document is first tokenized,
    morphological affixes are removed from the words, and stop words
    are excluded from the resulting list of words.

    Parameters
    ----------
    text    : input document (String)
    stemmer : NLTK stemmer for the stemming process. Must be an NLTK
              stem package class. E.g.:

              nltk.stem.porter.PorterStemmer()
              nltk.stem.lancaster.LancasterStemmer()
              nltk.stem.snowball.EnglishStemmer()

              If set to None, no stemming is performed on the input text
    remove_stopwords : If set to True, removes any stop words from the output,
                       using the nltk.corpus.stopwords corpus (English)

    Returns
    -------
    A list of words extracted from the input text.
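    Example (illustrative - the exact output depends on the chosen stemmer):

    >>> extract_words("Cats chased running mice", nltk.stem.porter.PorterStemmer())
    ['cat', 'chase', 'run', 'mice']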
    """

    # Get the stopwords corpus
    if "stopwords" not in os.listdir(nltk.data.find("corpora")):
        nltk.download("stopwords")

    # Tokenize the document
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    if stemmer is None:
        # No stemmer? Just convert to lower case.
        words = [token.lower() for token in tokens]
    else:
        # Apply stemming
        words = [stemmer.stem(word.lower()) for word in tokens]

    # Remove stop words
    if remove_stopwords:
        words = [word for word in words if word not in stopwords.words('english')]

    return words

def build_vocabulary(documents):
    """
    Builds a vocabulary based on all documents in the corpus.

    Parameters
    ----------
    documents : document corpus

    Returns
    -------
    A list containing all unique words from the corpus

    """
    vocabulary = set()

    # Iterate over each document in the corpus
    for doc in documents:
        # Iterate over all words in the current document and
        # add each word to the vocabulary set
        vocabulary.update([word for word in doc])

    # Convert the vocabulary to a list
    vocabulary = list(vocabulary)

    return vocabulary


def get_idfs_dict(vocabulary, documents):
    """
    Gets a dictionary containing the vocabulary words and their respective
    IDFs. This method is used for debugging purposes only.

    Parameters
    ----------
    vocabulary : vocabulary of the corpus
    documents  : all documents in the corpus

    Returns
    -------
    A dictionary in the form of {word1: word1_IDF, word2: word2_IDF, ...}

    """

    # Get the number of documents where each word from the vocabulary appears
    counts = Counter()

    # Iterate over the vocabulary and count the occurrence of each word
    for word in vocabulary:
        for doc in documents:
            if word in doc:
                counts[word] += 1

    # Get the number of documents in the corpus
    number_of_docs = len(documents)

    # Create an empty dictionary
    idfs = dict()

    # Iterate over the counts
    for term in list(counts.items()):

        # Normalise the count by the number of documents, and take the log
        # Add the (word, IDF) pair to the dictionary
        idfs[term[0]] = math.log(number_of_docs / term[1], 2)

    return idfs
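# Worked example (illustrative): in a corpus of 8 documents, a term that
# appears in 2 of them receives idf = log2(8 / 2) = 2.0, while a term that
# appears in every document receives idf = log2(8 / 8) = 0.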
def get_idfs(vocabulary, documents):
    """
    Gets a sparse diagonal matrix containing the IDFs for all words in the
    vocabulary. The IDFs are computed as the logarithmically scaled inverse
    fraction of the documents that contain the word, obtained by dividing
    the total number of documents by the number of documents containing the
    term, and then taking the logarithm of that quotient.

    Parameters
    ----------
    vocabulary : vocabulary of the corpus
    documents  : all documents in the corpus

    Returns
    -------
    A diagonal matrix of size len(vocabulary) x len(vocabulary), where the
    words' IDFs are located on the main diagonal, and all other elements in
    the matrix are 0. E.g.:

    index in the vocabulary  1           2           3           ...  N

    1                        idf(word1)  0           0           ...  0
    2                        0           idf(word2)  0           ...  0
    3                        0           0           idf(word3)  ...  0
    ...                      ...         ...         ...         ...  ...
    N                        0           0           0           ...  idf(wordN)

    where N = len(vocabulary)

    """

    # Get the number of documents where each word from the vocabulary appears
    counts = dict()

    for word in vocabulary:
        for doc in documents:
            if word in doc:
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1

    # Compute the inverse document frequency
    number_of_docs = len(documents)

    # Create a list to hold all the IDFs
    idfs = []

    # Iterate over the counts
    for word in vocabulary:

        # Normalise the count by the number of documents, and take the log
        # Add the value to the list of IDFs
        idfs.append(math.log(number_of_docs / counts[word], 2))

    # Create a sparse diagonal matrix with the values from the IDFs list
    # located on the main diagonal
    idf_matrix = scipy.sparse.diags(np.squeeze(np.asarray(idfs)))

    return idf_matrix

def get_tf_vectors(vocabulary, documents):
    """
    Computes the term frequency vectors for all documents. This method uses
    the raw count of a term in a document, i.e. the number of times that
    term t occurs in document d.

    Parameters
    ----------
    vocabulary : vocabulary of the corpus
    documents  : all documents in the corpus

    Returns
    -------
    A sparse matrix of size len(documents) x len(vocabulary), containing the
    raw counts for each term. Entries in the matrix can be viewed using
    the print_sparse_row(matrix, row_index) method. Ex:

    tf_matrix.shape
    (6918, 1869)

    print_sparse_row(tf_matrix, 0)
    col[106] 1
    col[289] 1
    col[482] 1
    col[815] 1
    col[1074] 1
    col[1145] 1
    col[1232] 1
    col[1565] 1
    """

    # Document / sparse matrix row index
    row_index = 0

    # Values and indices for the sparse matrix
    rows = []
    cols = []
    values = []

    # Iterate over all documents in the corpus
    for doc in documents:
        col_index = 0

        # Iterate over all words in the vocabulary
        for word in vocabulary:

            # Is the current word in the current document?
            if word in doc:
                # Record the term frequency for this word
                rows.append(row_index)
                cols.append(col_index)
                values.append(doc.count(word))
            # Move to the next word in the vocabulary
            col_index += 1

        # Move to the next document
        row_index += 1

    # Compose a sparse matrix of size len(documents) x len(vocabulary) with
    # all term frequencies
    tf_matrix = scipy.sparse.csr_matrix((values, (rows, cols)), shape=(row_index, len(vocabulary)))

    return tf_matrix
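# Dimension check (illustrative, using the shapes quoted in the docstrings):
# with 6918 documents and a 1869-word vocabulary, tf_matrix is (6918 x 1869)
# and idf_matrix is (1869 x 1869), so tf_matrix * idf_matrix simply scales
# each term column by its IDF and keeps the shape (6918 x 1869).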
def print_sparse_row(matrix, row_index):
    """
    Prints the indices and their respective values for a sparse matrix row.
    This method is used for debugging purposes.

    Ex:

    print_sparse_row(tf_matrix, 0)
    col[106] 1
    col[289] 1
    col[482] 1
    col[815] 1
    col[1074] 1
    col[1145] 1
    col[1232] 1
    col[1565] 1

    Parameters
    ----------
    matrix    : a sparse matrix
    row_index : index of a row from the sparse matrix

    Returns
    -------

    """
    # Convert the row of interest to a Numpy array
    row = np.asarray(matrix[row_index].todense()).flatten()

    # Iterate over all columns of the row
    col = 0
    for el in row:
        if el != 0:
            # Print the column index and the respective value
            print("col[%i] %s" % (col, el))
        col += 1


def print_tfidf(matrix, row_index, idfs):
    """
    For a given row from a TF matrix, this method prints a table containing
    all words, their term frequency, IDF, and TFxIDF values. Ex:

    >>> idfs = get_idfs_dict(vocabulary, dataDF["Words"])
    >>> print_tfidf(tf_matrix, 0, idfs)

    Column  Word    TF  IDF                 TFxIDF
    ------  ----    --  ---                 ------
    106     is      1   2.2355206178166482  2.23552061782
    289     just    1   4.616587945974135   4.61658794597
    482     the     1   1.448369266482225   1.44836926648
    815     vinc    1   1.8033980511864398  1.80339805119
    1074    da      1   1.8033980511864398  1.80339805119
    1145    book    1   5.434211203485566   5.43421120349
    1232    code    1   1.801942987986053   1.80194298799
    1565    awesom  1   2.6320179865437403  2.63201798654

    Parameters
    ----------
    matrix    : matrix containing document term frequencies (see the
                get_tf_vectors method)
    row_index : index of a row from the TF matrix
    idfs      : a dictionary of per-word IDFs (see the get_idfs_dict method)

    Returns
    -------
    """

    # Get the row of interest as a Numpy array
    row = np.asarray(matrix[row_index].todense()).flatten()
    col = 0

    # Set the output header
    output = [["Column", "Word", "TF", "IDF", "TFxIDF"],
              ["------", "----", "--", "---", "------"]]

    # Go over each element of the row (i.e. each word from the document)
    for el in row:
        if el != 0:
            # Append the column index, the word, and the TF, IDF, and TFxIDF
            # values to the output
            output.append([str(col), vocabulary[col], str(el), str(idfs[vocabulary[col]]),
                           str(idfs[vocabulary[col]] * el)])
        col += 1

    # Print the output as a table
    col_width = max(len(word) for row in output for word in row) + 2  # padding
    for row in output:
        print("".join(word.ljust(col_width) for word in row))
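# Worked example (illustrative): the row [3, 4] has L2 norm
# sqrt(3^2 + 4^2) = 5, so after normalisation it becomes [0.6, 0.8].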
def l2_normalized_matrix(matrix):
    """
    Normalises a sparse matrix by scaling its rows individually to L2 unit norm

    The norm of each row is computed as

    ||x|| = sqrt(sum(x^2))

    For efficiency, the resulting new matrix is formed by computing

    normalized_matrix = transpose( transpose(matrix) * l2_norm )

    where matrix is the original sparse matrix and l2_norm is a diagonal
    matrix of the reciprocals of sqrt(sum(x^2))

    Parameters
    ----------
    matrix : a sparse matrix to be normalized

    Returns
    -------
    An L2 normalised sparse matrix based on the input matrix

    """
    # Compute the L2 norms
    l2_norm = np.sqrt(matrix.power(2).sum(axis=1))

    # Get the reciprocals
    with np.errstate(divide="ignore", invalid="ignore"):
        l2_norm = np.reciprocal(l2_norm)
        # Treat infinity and NaN as 0
        l2_norm[~np.isfinite(l2_norm)] = 0  # -inf inf NaN

    # Form a diagonal matrix of the reciprocals
    l2_norm = scipy.sparse.diags(np.squeeze(np.asarray(l2_norm)))

    # Compute the normalised matrix
    normalized_matrix = (matrix.T * l2_norm).T

    return normalized_matrix

def mtx_save(file_name, matrix):
    """
    Writes a sparse matrix to a file in Matrix Market format.

    Parameters
    ----------
    file_name : target file name
    matrix    : a sparse matrix

    Returns
    -------

    """
    scipy.io.mmwrite(file_name, matrix)

def encode_labels(labelsDF):
    """
    Encodes a string set of target classes to a Numpy array of label indices

    Parameters
    ----------
    labelsDF : a Pandas DataFrame or Numpy array containing the labels

    Returns
    -------
    An encoded Numpy array

    Ex:

    >>> A = np.array(["a", "a", "b", "a"])
    >>> encode_labels(A)
    array([0, 0, 1, 0], dtype=int8)

    """
    # Factorize the labels
    labelsDF = pd.Categorical(labelsDF)
    catLabelsDF = labelsDF.codes

    return catLabelsDF

def labels_save(file_name, labels):
    """
    Saves the target class labels to an external file.

    Parameters
    ----------
    file_name : target file name
    labels    : a Numpy array containing the labels

    Returns
    -------

    """
    labels.tofile(file_name, sep='\n')

def hash_vectors(tf_idf_matrix, vocabulary, N=8000):
    """
    Applies feature hashing (the "hashing trick") to a sparse matrix. This
    method turns features into indices in a vector or matrix. It works by
    applying a hash function to the features and using their hash values as
    indices directly, rather than looking the indices up in an associative
    array.

    Parameters
    ----------
    tf_idf_matrix : a sparse matrix of TFxIDF values
    vocabulary    : vocabulary of the corpus
    N             : size of the hashed vector

    Returns
    -------
    A sparse matrix of size tf_idf_matrix.shape[0] x N, containing the hashed
    features

    >>> tf_idf_matrix.shape
    (6918, 1869)

    >>> hash_vectors(tf_idf_matrix, vocabulary, 100).shape
    (6918, 100)

    """
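    # Illustrative arithmetic: with N = 100, a word whose hash value is 1234
    # is mapped to column 1234 % 100 = 34. Distinct words can collide on the
    # same column - the accepted trade-off for the fixed dimensionality.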

    # Make sure the input is a csr_matrix (we need to access the sparse
    # matrix elements directly)
    if not isinstance(tf_idf_matrix, scipy.sparse.csr.csr_matrix):
        print("WARN: Input is not a Compressed Sparse Row matrix. Converting...")
        tf_idf_matrix = tf_idf_matrix.tocsr()


    row_count = tf_idf_matrix.shape[0]

    hashed_rows = []
    hashed_cols = []
    hashed_data = []

    # Iterate over the matrix rows
    for row_index in range(row_count):

        # Get the current row indices
        row = tf_idf_matrix.getrow(row_index)
        col_indices = row.indices

        # Iterate over the columns
        for col_index in range(len(col_indices)):
            # Get the word and its corresponding TFxIDF value
            tf_idf_value = tf_idf_matrix[row_index, col_indices[col_index]]
            word = vocabulary[col_indices[col_index]]

            # Apply a hash function h to the features (e.g., words), then use
            # the hash values directly as feature indices and update the
            # resulting vector at those indices

            h = hash(word)
            hashed_rows.append(row_index)
            hashed_cols.append(h % N)
            hashed_data.append(tf_idf_value)

    # Create a new sparse matrix with the hashed features
    hashed_features_matrix = scipy.sparse.csr_matrix((hashed_data,
                                                      (hashed_rows, hashed_cols)),
                                                     shape=(row_count, N))

    return hashed_features_matrix


# Read the data set
dataDF = pd.read_csv("data/SMSSpamCollection",
                     sep='\t', lineterminator='\n', names = ["Label", "Text"])

# Initialise a stemmer

#porter = nltk.stem.porter.PorterStemmer()
#lancaster = nltk.stem.lancaster.LancasterStemmer()
snowball = nltk.stem.snowball.EnglishStemmer()

# Apply stemming
print("Stemming...")
dataDF["Words"] = dataDF.apply(lambda row: extract_words(row['Text'], snowball), axis=1)

# Remove empty rows - messages like ":)" are reduced to an empty word list
# by the tokenizer/stemmer
dataDF = dataDF[dataDF.astype(str)["Words"] != '[]']
dataDF = dataDF.reset_index(drop=True)
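# At this point each row of dataDF["Words"] holds a list of stemmed tokens.
# Illustrative (the exact stems depend on the stemmer): a message such as
# "Sorry, I'll call later" becomes ['sorri', 'i', 'll', 'call', 'later'].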

# Build a vocabulary
print("Building vocabulary...")
vocabulary = build_vocabulary(dataDF["Words"])

# Get the TF vectors
print("Forming the TF matrix...")
tf_matrix = get_tf_vectors(vocabulary, dataDF["Words"])

# Get the IDF matrix
print("Forming the IDF matrix...")
idf_matrix = get_idfs(vocabulary, dataDF["Words"])

# Compute the TFxIDF values
print("Computing the TFxIDF matrix...")
tf_idf_matrix = (tf_matrix * idf_matrix)
tf_idf_matrix = l2_normalized_matrix(tf_idf_matrix)

#tf_idf_matrix = hash_vectors(tf_idf_matrix, vocabulary, 125)

# Encode the labels
print("Encoding labels...")
labels = encode_labels(dataDF["Label"])

# Save the TFxIDF matrix and the corresponding labels
print("Saving features and labels...")
mtx_save("data/training.mtx", tf_idf_matrix)
labels_save("data/labels.csv", labels)

print("All done!")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Machine Learning Study Group

This repository contains materials from the London Machine Learning Study Group Meetups

The meetup page is available at [http://www.meetup.com/London-Machine-Learning-Study-Group](http://www.meetup.com/London-Machine-Learning-Study-Group).

(C) 2016 Nikolay Manchev, London Machine Learning Study Group

This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit [http://creativecommons.org/licenses/by/4.0](http://creativecommons.org/licenses/by/4.0).
--------------------------------------------------------------------------------