├── .all-contributorsrc ├── .devcontainer ├── devcontainer.json └── startup.sh ├── .github └── workflows │ ├── release.yml │ └── test.yml ├── .gitignore ├── Auto.csv ├── Auto.data ├── Ch02-statlearn-lab.Rmd ├── Ch02-statlearn-lab.ipynb ├── Ch03-linreg-lab.Rmd ├── Ch03-linreg-lab.ipynb ├── Ch04-classification-lab.Rmd ├── Ch04-classification-lab.ipynb ├── Ch05-resample-lab.Rmd ├── Ch05-resample-lab.ipynb ├── Ch06-varselect-lab.Rmd ├── Ch06-varselect-lab.ipynb ├── Ch07-nonlin-lab.Rmd ├── Ch07-nonlin-lab.ipynb ├── Ch08-baggboost-lab.Rmd ├── Ch08-baggboost-lab.ipynb ├── Ch09-svm-lab.Rmd ├── Ch09-svm-lab.ipynb ├── Ch10-deeplearning-lab.Rmd ├── Ch10-deeplearning-lab.ipynb ├── Ch11-surv-lab.Rmd ├── Ch11-surv-lab.ipynb ├── Ch12-unsup-lab.Rmd ├── Ch12-unsup-lab.ipynb ├── Ch13-multiple-lab.Rmd ├── Ch13-multiple-lab.ipynb ├── LICENSE ├── Makefile ├── README.md ├── book_images ├── Cape_Weaver.jpg ├── Flamingo.jpg ├── Hawk_Fountain.jpg ├── Hawk_cropped.jpg ├── Lhasa_Apso.jpg └── Sleeping_Cat.jpg ├── imagenet_class_index.json └── requirements.txt /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "commitType": "docs", 8 | "commitConvention": "angular", 9 | "contributors": [ 10 | { 11 | "login": "tibshirani", 12 | "name": "tibshirani", 13 | "avatar_url": "https://avatars.githubusercontent.com/u/2848609?v=4", 14 | "profile": "https://github.com/tibshirani", 15 | "contributions": [ 16 | "code", 17 | "content" 18 | ] 19 | }, 20 | { 21 | "login": "trevorhastie", 22 | "name": "trevorhastie", 23 | "avatar_url": "https://avatars.githubusercontent.com/u/13293253?v=4", 24 | "profile": "https://web.stanford.edu/~hastie/", 25 | "contributions": [ 26 | "code", 27 | "content" 28 | ] 29 | }, 30 | { 31 | "login": "danielawitten", 32 | "name": "danielawitten", 33 | "avatar_url": "https://avatars.githubusercontent.com/u/12654191?v=4", 34 | "profile": "https://github.com/danielawitten", 35 | "contributions": [ 36 | "code", 37 | "content" 38 | ] 39 | }, 40 | { 41 | "login": "jonathan-taylor", 42 | "name": "Jonathan Taylor", 43 | "avatar_url": "https://avatars.githubusercontent.com/u/341611?v=4", 44 | "profile": "http://statweb.stanford.edu/~jtaylo", 45 | "contributions": [ 46 | "code", 47 | "content" 48 | ] 49 | }, 50 | { 51 | "login": "tschm", 52 | "name": "Thomas Schmelzer", 53 | "avatar_url": "https://avatars.githubusercontent.com/u/2046079?v=4", 54 | "profile": "https://github.com/tschm", 55 | "contributions": [ 56 | "code" 57 | ] 58 | } 59 | ], 60 | "contributorsPerLine": 7, 61 | "skipCi": true, 62 | "repoType": "github", 63 | "repoHost": "https://github.com", 64 | "projectName": "ISLP_labs", 65 | "projectOwner": "intro-stat-learning" 66 | } 67 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Jupyter Environment", 3 | "image": "mcr.microsoft.com/devcontainers/python:3", 4 | "features": { 5 | "ghcr.io/devcontainers/features/python:1": {} 6 | }, 7 | "customizations": { 8 | "vscode": { 9 | "extensions": [ 10 | "ms-python.python", 11 | "ms-toolsai.jupyter", 12 | "ms-toolsai.jupyter-keymap", 13 | "ms-toolsai.jupyter-renderers", 14 | "ms-toolsai.vscode-jupyter-cell-tags", 15 | "ms-toolsai.vscode-jupyter-slideshow" 16 | ] 17 | } 18 | }, 19 | "onCreateCommand": ".devcontainer/startup.sh", 20 | "forwardPorts": [8888], 21 | 
"postStartCommand": "uv run jupyter lab --no-browser --ip=0.0.0.0 --port=8888 --NotebookApp.token='' --NotebookApp.password=''" 22 | } 23 | -------------------------------------------------------------------------------- /.devcontainer/startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | curl -LsSf https://astral.sh/uv/install.sh | sh 3 | uv venv --python 3.12 4 | uv pip install --no-cache-dir jupyterlab 5 | uv pip install --no-cache-dir -r requirements.txt 6 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Bump version and publish 2 | 3 | on: 4 | #push: 5 | workflow_dispatch 6 | 7 | permissions: 8 | contents: write 9 | 10 | jobs: 11 | tagging: 12 | runs-on: ubuntu-latest 13 | outputs: 14 | new_tag: ${{ steps.tag_step.outputs.new_tag }} 15 | 16 | steps: 17 | - name: Generate Tag 18 | id: tag_step 19 | uses: tschm/cradle/actions/tag@v0.1.57 20 | with: 21 | github_token: ${{ secrets.GITHUB_TOKEN }} 22 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # testing all notebooks 2 | name: TEST 3 | 4 | on: 5 | - push 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | matrix: 13 | python-version: ['3.11', '3.12'] 14 | notebook: [ Ch02-statlearn-lab.ipynb, 15 | Ch03-linreg-lab.ipynb, 16 | Ch04-classification-lab.ipynb, 17 | Ch05-resample-lab.ipynb, 18 | Ch06-varselect-lab.ipynb, 19 | Ch07-nonlin-lab.ipynb, 20 | Ch08-baggboost-lab.ipynb, 21 | Ch09-svm-lab.ipynb, 22 | Ch10-deeplearning-lab.ipynb, 23 | Ch11-surv-lab.ipynb, 24 | Ch12-unsup-lab.ipynb, 25 | Ch13-multiple-lab.ipynb] 26 | exclude: 27 | - python-version: '3.11' 28 | notebook: Ch10-deeplearning-lab.ipynb 29 | 30 | fail-fast: false 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - name: Install uv 36 | uses: astral-sh/setup-uv@v5 37 | with: 38 | version: "0.5.15" 39 | 40 | - name: Set up Python 41 | shell: bash 42 | run: | 43 | uv python install ${{ matrix.python-version }} 44 | 45 | - name: Create venv 46 | shell: bash 47 | run: uv venv --python ${{ matrix.python-version }} 48 | 49 | - name: Install requirements 50 | shell: bash 51 | run: | 52 | uv pip install --upgrade pip 53 | uv pip install -r requirements.txt 54 | uv pip install pytest nbmake 55 | 56 | - name: Test 57 | shell: bash 58 | run: | 59 | uv run pytest --nbmake --nbmake-timeout=3600 -vv ${{ matrix.notebook }} 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Jupyter Notebook 7 | .ipynb_checkpoints/ 8 | *.ipynb_checkpoints/ 9 | *.ipynb_meta 10 | 11 | # Python environments 12 | env/ 13 | venv/ 14 | .venv/ 15 | ENV/ 16 | env.bak/ 17 | venv.bak/ 18 | .spyderproject 19 | .spyproject 20 | .ropeproject 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | pip-wheel-metadata/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # PyInstaller 43 | *.manifest 44 | *.spec 45 | 46 | # 
Operating System 47 | .DS_Store 48 | 49 | # IDEs 50 | .vscode/ 51 | .idea/ 52 | 53 | -------------------------------------------------------------------------------- /Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name 2 | 18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu 3 | 15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320 4 | 18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite 5 | 16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst 6 | 17.0,8,302.0,140,3449,10.5,70,1,ford torino 7 | 15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500 8 | 14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala 9 | 14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii 10 | 14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina 11 | 15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl 12 | 15.0,8,383.0,170,3563,10.0,70,1,dodge challenger se 13 | 14.0,8,340.0,160,3609,8.0,70,1,plymouth 'cuda 340 14 | 15.0,8,400.0,150,3761,9.5,70,1,chevrolet monte carlo 15 | 14.0,8,455.0,225,3086,10.0,70,1,buick estate wagon (sw) 16 | 24.0,4,113.0,95,2372,15.0,70,3,toyota corona mark ii 17 | 22.0,6,198.0,95,2833,15.5,70,1,plymouth duster 18 | 18.0,6,199.0,97,2774,15.5,70,1,amc hornet 19 | 21.0,6,200.0,85,2587,16.0,70,1,ford maverick 20 | 27.0,4,97.0,88,2130,14.5,70,3,datsun pl510 21 | 26.0,4,97.0,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 22 | 25.0,4,110.0,87,2672,17.5,70,2,peugeot 504 23 | 24.0,4,107.0,90,2430,14.5,70,2,audi 100 ls 24 | 25.0,4,104.0,95,2375,17.5,70,2,saab 99e 25 | 26.0,4,121.0,113,2234,12.5,70,2,bmw 2002 26 | 21.0,6,199.0,90,2648,15.0,70,1,amc gremlin 27 | 10.0,8,360.0,215,4615,14.0,70,1,ford f250 28 | 10.0,8,307.0,200,4376,15.0,70,1,chevy c20 29 | 11.0,8,318.0,210,4382,13.5,70,1,dodge d200 30 | 9.0,8,304.0,193,4732,18.5,70,1,hi 1200d 31 | 27.0,4,97.0,88,2130,14.5,71,3,datsun pl510 32 | 28.0,4,140.0,90,2264,15.5,71,1,chevrolet vega 2300 33 | 25.0,4,113.0,95,2228,14.0,71,3,toyota corona 34 | 19.0,6,232.0,100,2634,13.0,71,1,amc gremlin 35 | 16.0,6,225.0,105,3439,15.5,71,1,plymouth satellite custom 36 | 17.0,6,250.0,100,3329,15.5,71,1,chevrolet chevelle malibu 37 | 19.0,6,250.0,88,3302,15.5,71,1,ford torino 500 38 | 18.0,6,232.0,100,3288,15.5,71,1,amc matador 39 | 14.0,8,350.0,165,4209,12.0,71,1,chevrolet impala 40 | 14.0,8,400.0,175,4464,11.5,71,1,pontiac catalina brougham 41 | 14.0,8,351.0,153,4154,13.5,71,1,ford galaxie 500 42 | 14.0,8,318.0,150,4096,13.0,71,1,plymouth fury iii 43 | 12.0,8,383.0,180,4955,11.5,71,1,dodge monaco (sw) 44 | 13.0,8,400.0,170,4746,12.0,71,1,ford country squire (sw) 45 | 13.0,8,400.0,175,5140,12.0,71,1,pontiac safari (sw) 46 | 18.0,6,258.0,110,2962,13.5,71,1,amc hornet sportabout (sw) 47 | 22.0,4,140.0,72,2408,19.0,71,1,chevrolet vega (sw) 48 | 19.0,6,250.0,100,3282,15.0,71,1,pontiac firebird 49 | 18.0,6,250.0,88,3139,14.5,71,1,ford mustang 50 | 23.0,4,122.0,86,2220,14.0,71,1,mercury capri 2000 51 | 28.0,4,116.0,90,2123,14.0,71,2,opel 1900 52 | 30.0,4,79.0,70,2074,19.5,71,2,peugeot 304 53 | 30.0,4,88.0,76,2065,14.5,71,2,fiat 124b 54 | 31.0,4,71.0,65,1773,19.0,71,3,toyota corolla 1200 55 | 35.0,4,72.0,69,1613,18.0,71,3,datsun 1200 56 | 27.0,4,97.0,60,1834,19.0,71,2,volkswagen model 111 57 | 26.0,4,91.0,70,1955,20.5,71,1,plymouth cricket 58 | 24.0,4,113.0,95,2278,15.5,72,3,toyota corona hardtop 59 | 25.0,4,97.5,80,2126,17.0,72,1,dodge colt hardtop 60 | 23.0,4,97.0,54,2254,23.5,72,2,volkswagen type 3 61 | 20.0,4,140.0,90,2408,19.5,72,1,chevrolet vega 62 | 
21.0,4,122.0,86,2226,16.5,72,1,ford pinto runabout 63 | 13.0,8,350.0,165,4274,12.0,72,1,chevrolet impala 64 | 14.0,8,400.0,175,4385,12.0,72,1,pontiac catalina 65 | 15.0,8,318.0,150,4135,13.5,72,1,plymouth fury iii 66 | 14.0,8,351.0,153,4129,13.0,72,1,ford galaxie 500 67 | 17.0,8,304.0,150,3672,11.5,72,1,amc ambassador sst 68 | 11.0,8,429.0,208,4633,11.0,72,1,mercury marquis 69 | 13.0,8,350.0,155,4502,13.5,72,1,buick lesabre custom 70 | 12.0,8,350.0,160,4456,13.5,72,1,oldsmobile delta 88 royale 71 | 13.0,8,400.0,190,4422,12.5,72,1,chrysler newport royal 72 | 19.0,3,70.0,97,2330,13.5,72,3,mazda rx2 coupe 73 | 15.0,8,304.0,150,3892,12.5,72,1,amc matador (sw) 74 | 13.0,8,307.0,130,4098,14.0,72,1,chevrolet chevelle concours (sw) 75 | 13.0,8,302.0,140,4294,16.0,72,1,ford gran torino (sw) 76 | 14.0,8,318.0,150,4077,14.0,72,1,plymouth satellite custom (sw) 77 | 18.0,4,121.0,112,2933,14.5,72,2,volvo 145e (sw) 78 | 22.0,4,121.0,76,2511,18.0,72,2,volkswagen 411 (sw) 79 | 21.0,4,120.0,87,2979,19.5,72,2,peugeot 504 (sw) 80 | 26.0,4,96.0,69,2189,18.0,72,2,renault 12 (sw) 81 | 22.0,4,122.0,86,2395,16.0,72,1,ford pinto (sw) 82 | 28.0,4,97.0,92,2288,17.0,72,3,datsun 510 (sw) 83 | 23.0,4,120.0,97,2506,14.5,72,3,toyouta corona mark ii (sw) 84 | 28.0,4,98.0,80,2164,15.0,72,1,dodge colt (sw) 85 | 27.0,4,97.0,88,2100,16.5,72,3,toyota corolla 1600 (sw) 86 | 13.0,8,350.0,175,4100,13.0,73,1,buick century 350 87 | 14.0,8,304.0,150,3672,11.5,73,1,amc matador 88 | 13.0,8,350.0,145,3988,13.0,73,1,chevrolet malibu 89 | 14.0,8,302.0,137,4042,14.5,73,1,ford gran torino 90 | 15.0,8,318.0,150,3777,12.5,73,1,dodge coronet custom 91 | 12.0,8,429.0,198,4952,11.5,73,1,mercury marquis brougham 92 | 13.0,8,400.0,150,4464,12.0,73,1,chevrolet caprice classic 93 | 13.0,8,351.0,158,4363,13.0,73,1,ford ltd 94 | 14.0,8,318.0,150,4237,14.5,73,1,plymouth fury gran sedan 95 | 13.0,8,440.0,215,4735,11.0,73,1,chrysler new yorker brougham 96 | 12.0,8,455.0,225,4951,11.0,73,1,buick electra 225 custom 97 | 13.0,8,360.0,175,3821,11.0,73,1,amc ambassador brougham 98 | 18.0,6,225.0,105,3121,16.5,73,1,plymouth valiant 99 | 16.0,6,250.0,100,3278,18.0,73,1,chevrolet nova custom 100 | 18.0,6,232.0,100,2945,16.0,73,1,amc hornet 101 | 18.0,6,250.0,88,3021,16.5,73,1,ford maverick 102 | 23.0,6,198.0,95,2904,16.0,73,1,plymouth duster 103 | 26.0,4,97.0,46,1950,21.0,73,2,volkswagen super beetle 104 | 11.0,8,400.0,150,4997,14.0,73,1,chevrolet impala 105 | 12.0,8,400.0,167,4906,12.5,73,1,ford country 106 | 13.0,8,360.0,170,4654,13.0,73,1,plymouth custom suburb 107 | 12.0,8,350.0,180,4499,12.5,73,1,oldsmobile vista cruiser 108 | 18.0,6,232.0,100,2789,15.0,73,1,amc gremlin 109 | 20.0,4,97.0,88,2279,19.0,73,3,toyota carina 110 | 21.0,4,140.0,72,2401,19.5,73,1,chevrolet vega 111 | 22.0,4,108.0,94,2379,16.5,73,3,datsun 610 112 | 18.0,3,70.0,90,2124,13.5,73,3,maxda rx3 113 | 19.0,4,122.0,85,2310,18.5,73,1,ford pinto 114 | 21.0,6,155.0,107,2472,14.0,73,1,mercury capri v6 115 | 26.0,4,98.0,90,2265,15.5,73,2,fiat 124 sport coupe 116 | 15.0,8,350.0,145,4082,13.0,73,1,chevrolet monte carlo s 117 | 16.0,8,400.0,230,4278,9.5,73,1,pontiac grand prix 118 | 29.0,4,68.0,49,1867,19.5,73,2,fiat 128 119 | 24.0,4,116.0,75,2158,15.5,73,2,opel manta 120 | 20.0,4,114.0,91,2582,14.0,73,2,audi 100ls 121 | 19.0,4,121.0,112,2868,15.5,73,2,volvo 144ea 122 | 15.0,8,318.0,150,3399,11.0,73,1,dodge dart custom 123 | 24.0,4,121.0,110,2660,14.0,73,2,saab 99le 124 | 20.0,6,156.0,122,2807,13.5,73,3,toyota mark ii 125 | 11.0,8,350.0,180,3664,11.0,73,1,oldsmobile omega 126 | 
20.0,6,198.0,95,3102,16.5,74,1,plymouth duster 127 | 19.0,6,232.0,100,2901,16.0,74,1,amc hornet 128 | 15.0,6,250.0,100,3336,17.0,74,1,chevrolet nova 129 | 31.0,4,79.0,67,1950,19.0,74,3,datsun b210 130 | 26.0,4,122.0,80,2451,16.5,74,1,ford pinto 131 | 32.0,4,71.0,65,1836,21.0,74,3,toyota corolla 1200 132 | 25.0,4,140.0,75,2542,17.0,74,1,chevrolet vega 133 | 16.0,6,250.0,100,3781,17.0,74,1,chevrolet chevelle malibu classic 134 | 16.0,6,258.0,110,3632,18.0,74,1,amc matador 135 | 18.0,6,225.0,105,3613,16.5,74,1,plymouth satellite sebring 136 | 16.0,8,302.0,140,4141,14.0,74,1,ford gran torino 137 | 13.0,8,350.0,150,4699,14.5,74,1,buick century luxus (sw) 138 | 14.0,8,318.0,150,4457,13.5,74,1,dodge coronet custom (sw) 139 | 14.0,8,302.0,140,4638,16.0,74,1,ford gran torino (sw) 140 | 14.0,8,304.0,150,4257,15.5,74,1,amc matador (sw) 141 | 29.0,4,98.0,83,2219,16.5,74,2,audi fox 142 | 26.0,4,79.0,67,1963,15.5,74,2,volkswagen dasher 143 | 26.0,4,97.0,78,2300,14.5,74,2,opel manta 144 | 31.0,4,76.0,52,1649,16.5,74,3,toyota corona 145 | 32.0,4,83.0,61,2003,19.0,74,3,datsun 710 146 | 28.0,4,90.0,75,2125,14.5,74,1,dodge colt 147 | 24.0,4,90.0,75,2108,15.5,74,2,fiat 128 148 | 26.0,4,116.0,75,2246,14.0,74,2,fiat 124 tc 149 | 24.0,4,120.0,97,2489,15.0,74,3,honda civic 150 | 26.0,4,108.0,93,2391,15.5,74,3,subaru 151 | 31.0,4,79.0,67,2000,16.0,74,2,fiat x1.9 152 | 19.0,6,225.0,95,3264,16.0,75,1,plymouth valiant custom 153 | 18.0,6,250.0,105,3459,16.0,75,1,chevrolet nova 154 | 15.0,6,250.0,72,3432,21.0,75,1,mercury monarch 155 | 15.0,6,250.0,72,3158,19.5,75,1,ford maverick 156 | 16.0,8,400.0,170,4668,11.5,75,1,pontiac catalina 157 | 15.0,8,350.0,145,4440,14.0,75,1,chevrolet bel air 158 | 16.0,8,318.0,150,4498,14.5,75,1,plymouth grand fury 159 | 14.0,8,351.0,148,4657,13.5,75,1,ford ltd 160 | 17.0,6,231.0,110,3907,21.0,75,1,buick century 161 | 16.0,6,250.0,105,3897,18.5,75,1,chevroelt chevelle malibu 162 | 15.0,6,258.0,110,3730,19.0,75,1,amc matador 163 | 18.0,6,225.0,95,3785,19.0,75,1,plymouth fury 164 | 21.0,6,231.0,110,3039,15.0,75,1,buick skyhawk 165 | 20.0,8,262.0,110,3221,13.5,75,1,chevrolet monza 2+2 166 | 13.0,8,302.0,129,3169,12.0,75,1,ford mustang ii 167 | 29.0,4,97.0,75,2171,16.0,75,3,toyota corolla 168 | 23.0,4,140.0,83,2639,17.0,75,1,ford pinto 169 | 20.0,6,232.0,100,2914,16.0,75,1,amc gremlin 170 | 23.0,4,140.0,78,2592,18.5,75,1,pontiac astro 171 | 24.0,4,134.0,96,2702,13.5,75,3,toyota corona 172 | 25.0,4,90.0,71,2223,16.5,75,2,volkswagen dasher 173 | 24.0,4,119.0,97,2545,17.0,75,3,datsun 710 174 | 18.0,6,171.0,97,2984,14.5,75,1,ford pinto 175 | 29.0,4,90.0,70,1937,14.0,75,2,volkswagen rabbit 176 | 19.0,6,232.0,90,3211,17.0,75,1,amc pacer 177 | 23.0,4,115.0,95,2694,15.0,75,2,audi 100ls 178 | 23.0,4,120.0,88,2957,17.0,75,2,peugeot 504 179 | 22.0,4,121.0,98,2945,14.5,75,2,volvo 244dl 180 | 25.0,4,121.0,115,2671,13.5,75,2,saab 99le 181 | 33.0,4,91.0,53,1795,17.5,75,3,honda civic cvcc 182 | 28.0,4,107.0,86,2464,15.5,76,2,fiat 131 183 | 25.0,4,116.0,81,2220,16.9,76,2,opel 1900 184 | 25.0,4,140.0,92,2572,14.9,76,1,capri ii 185 | 26.0,4,98.0,79,2255,17.7,76,1,dodge colt 186 | 27.0,4,101.0,83,2202,15.3,76,2,renault 12tl 187 | 17.5,8,305.0,140,4215,13.0,76,1,chevrolet chevelle malibu classic 188 | 16.0,8,318.0,150,4190,13.0,76,1,dodge coronet brougham 189 | 15.5,8,304.0,120,3962,13.9,76,1,amc matador 190 | 14.5,8,351.0,152,4215,12.8,76,1,ford gran torino 191 | 22.0,6,225.0,100,3233,15.4,76,1,plymouth valiant 192 | 22.0,6,250.0,105,3353,14.5,76,1,chevrolet nova 193 | 24.0,6,200.0,81,3012,17.6,76,1,ford 
maverick 194 | 22.5,6,232.0,90,3085,17.6,76,1,amc hornet 195 | 29.0,4,85.0,52,2035,22.2,76,1,chevrolet chevette 196 | 24.5,4,98.0,60,2164,22.1,76,1,chevrolet woody 197 | 29.0,4,90.0,70,1937,14.2,76,2,vw rabbit 198 | 33.0,4,91.0,53,1795,17.4,76,3,honda civic 199 | 20.0,6,225.0,100,3651,17.7,76,1,dodge aspen se 200 | 18.0,6,250.0,78,3574,21.0,76,1,ford granada ghia 201 | 18.5,6,250.0,110,3645,16.2,76,1,pontiac ventura sj 202 | 17.5,6,258.0,95,3193,17.8,76,1,amc pacer d/l 203 | 29.5,4,97.0,71,1825,12.2,76,2,volkswagen rabbit 204 | 32.0,4,85.0,70,1990,17.0,76,3,datsun b-210 205 | 28.0,4,97.0,75,2155,16.4,76,3,toyota corolla 206 | 26.5,4,140.0,72,2565,13.6,76,1,ford pinto 207 | 20.0,4,130.0,102,3150,15.7,76,2,volvo 245 208 | 13.0,8,318.0,150,3940,13.2,76,1,plymouth volare premier v8 209 | 19.0,4,120.0,88,3270,21.9,76,2,peugeot 504 210 | 19.0,6,156.0,108,2930,15.5,76,3,toyota mark ii 211 | 16.5,6,168.0,120,3820,16.7,76,2,mercedes-benz 280s 212 | 16.5,8,350.0,180,4380,12.1,76,1,cadillac seville 213 | 13.0,8,350.0,145,4055,12.0,76,1,chevy c10 214 | 13.0,8,302.0,130,3870,15.0,76,1,ford f108 215 | 13.0,8,318.0,150,3755,14.0,76,1,dodge d100 216 | 31.5,4,98.0,68,2045,18.5,77,3,honda accord cvcc 217 | 30.0,4,111.0,80,2155,14.8,77,1,buick opel isuzu deluxe 218 | 36.0,4,79.0,58,1825,18.6,77,2,renault 5 gtl 219 | 25.5,4,122.0,96,2300,15.5,77,1,plymouth arrow gs 220 | 33.5,4,85.0,70,1945,16.8,77,3,datsun f-10 hatchback 221 | 17.5,8,305.0,145,3880,12.5,77,1,chevrolet caprice classic 222 | 17.0,8,260.0,110,4060,19.0,77,1,oldsmobile cutlass supreme 223 | 15.5,8,318.0,145,4140,13.7,77,1,dodge monaco brougham 224 | 15.0,8,302.0,130,4295,14.9,77,1,mercury cougar brougham 225 | 17.5,6,250.0,110,3520,16.4,77,1,chevrolet concours 226 | 20.5,6,231.0,105,3425,16.9,77,1,buick skylark 227 | 19.0,6,225.0,100,3630,17.7,77,1,plymouth volare custom 228 | 18.5,6,250.0,98,3525,19.0,77,1,ford granada 229 | 16.0,8,400.0,180,4220,11.1,77,1,pontiac grand prix lj 230 | 15.5,8,350.0,170,4165,11.4,77,1,chevrolet monte carlo landau 231 | 15.5,8,400.0,190,4325,12.2,77,1,chrysler cordoba 232 | 16.0,8,351.0,149,4335,14.5,77,1,ford thunderbird 233 | 29.0,4,97.0,78,1940,14.5,77,2,volkswagen rabbit custom 234 | 24.5,4,151.0,88,2740,16.0,77,1,pontiac sunbird coupe 235 | 26.0,4,97.0,75,2265,18.2,77,3,toyota corolla liftback 236 | 25.5,4,140.0,89,2755,15.8,77,1,ford mustang ii 2+2 237 | 30.5,4,98.0,63,2051,17.0,77,1,chevrolet chevette 238 | 33.5,4,98.0,83,2075,15.9,77,1,dodge colt m/m 239 | 30.0,4,97.0,67,1985,16.4,77,3,subaru dl 240 | 30.5,4,97.0,78,2190,14.1,77,2,volkswagen dasher 241 | 22.0,6,146.0,97,2815,14.5,77,3,datsun 810 242 | 21.5,4,121.0,110,2600,12.8,77,2,bmw 320i 243 | 21.5,3,80.0,110,2720,13.5,77,3,mazda rx-4 244 | 43.1,4,90.0,48,1985,21.5,78,2,volkswagen rabbit custom diesel 245 | 36.1,4,98.0,66,1800,14.4,78,1,ford fiesta 246 | 32.8,4,78.0,52,1985,19.4,78,3,mazda glc deluxe 247 | 39.4,4,85.0,70,2070,18.6,78,3,datsun b210 gx 248 | 36.1,4,91.0,60,1800,16.4,78,3,honda civic cvcc 249 | 19.9,8,260.0,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 250 | 19.4,8,318.0,140,3735,13.2,78,1,dodge diplomat 251 | 20.2,8,302.0,139,3570,12.8,78,1,mercury monarch ghia 252 | 19.2,6,231.0,105,3535,19.2,78,1,pontiac phoenix lj 253 | 20.5,6,200.0,95,3155,18.2,78,1,chevrolet malibu 254 | 20.2,6,200.0,85,2965,15.8,78,1,ford fairmont (auto) 255 | 25.1,4,140.0,88,2720,15.4,78,1,ford fairmont (man) 256 | 20.5,6,225.0,100,3430,17.2,78,1,plymouth volare 257 | 19.4,6,232.0,90,3210,17.2,78,1,amc concord 258 | 20.6,6,231.0,105,3380,15.8,78,1,buick 
century special 259 | 20.8,6,200.0,85,3070,16.7,78,1,mercury zephyr 260 | 18.6,6,225.0,110,3620,18.7,78,1,dodge aspen 261 | 18.1,6,258.0,120,3410,15.1,78,1,amc concord d/l 262 | 19.2,8,305.0,145,3425,13.2,78,1,chevrolet monte carlo landau 263 | 17.7,6,231.0,165,3445,13.4,78,1,buick regal sport coupe (turbo) 264 | 18.1,8,302.0,139,3205,11.2,78,1,ford futura 265 | 17.5,8,318.0,140,4080,13.7,78,1,dodge magnum xe 266 | 30.0,4,98.0,68,2155,16.5,78,1,chevrolet chevette 267 | 27.5,4,134.0,95,2560,14.2,78,3,toyota corona 268 | 27.2,4,119.0,97,2300,14.7,78,3,datsun 510 269 | 30.9,4,105.0,75,2230,14.5,78,1,dodge omni 270 | 21.1,4,134.0,95,2515,14.8,78,3,toyota celica gt liftback 271 | 23.2,4,156.0,105,2745,16.7,78,1,plymouth sapporo 272 | 23.8,4,151.0,85,2855,17.6,78,1,oldsmobile starfire sx 273 | 23.9,4,119.0,97,2405,14.9,78,3,datsun 200-sx 274 | 20.3,5,131.0,103,2830,15.9,78,2,audi 5000 275 | 17.0,6,163.0,125,3140,13.6,78,2,volvo 264gl 276 | 21.6,4,121.0,115,2795,15.7,78,2,saab 99gle 277 | 16.2,6,163.0,133,3410,15.8,78,2,peugeot 604sl 278 | 31.5,4,89.0,71,1990,14.9,78,2,volkswagen scirocco 279 | 29.5,4,98.0,68,2135,16.6,78,3,honda accord lx 280 | 21.5,6,231.0,115,3245,15.4,79,1,pontiac lemans v6 281 | 19.8,6,200.0,85,2990,18.2,79,1,mercury zephyr 6 282 | 22.3,4,140.0,88,2890,17.3,79,1,ford fairmont 4 283 | 20.2,6,232.0,90,3265,18.2,79,1,amc concord dl 6 284 | 20.6,6,225.0,110,3360,16.6,79,1,dodge aspen 6 285 | 17.0,8,305.0,130,3840,15.4,79,1,chevrolet caprice classic 286 | 17.6,8,302.0,129,3725,13.4,79,1,ford ltd landau 287 | 16.5,8,351.0,138,3955,13.2,79,1,mercury grand marquis 288 | 18.2,8,318.0,135,3830,15.2,79,1,dodge st. regis 289 | 16.9,8,350.0,155,4360,14.9,79,1,buick estate wagon (sw) 290 | 15.5,8,351.0,142,4054,14.3,79,1,ford country squire (sw) 291 | 19.2,8,267.0,125,3605,15.0,79,1,chevrolet malibu classic (sw) 292 | 18.5,8,360.0,150,3940,13.0,79,1,chrysler lebaron town @ country (sw) 293 | 31.9,4,89.0,71,1925,14.0,79,2,vw rabbit custom 294 | 34.1,4,86.0,65,1975,15.2,79,3,maxda glc deluxe 295 | 35.7,4,98.0,80,1915,14.4,79,1,dodge colt hatchback custom 296 | 27.4,4,121.0,80,2670,15.0,79,1,amc spirit dl 297 | 25.4,5,183.0,77,3530,20.1,79,2,mercedes benz 300d 298 | 23.0,8,350.0,125,3900,17.4,79,1,cadillac eldorado 299 | 27.2,4,141.0,71,3190,24.8,79,2,peugeot 504 300 | 23.9,8,260.0,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 301 | 34.2,4,105.0,70,2200,13.2,79,1,plymouth horizon 302 | 34.5,4,105.0,70,2150,14.9,79,1,plymouth horizon tc3 303 | 31.8,4,85.0,65,2020,19.2,79,3,datsun 210 304 | 37.3,4,91.0,69,2130,14.7,79,2,fiat strada custom 305 | 28.4,4,151.0,90,2670,16.0,79,1,buick skylark limited 306 | 28.8,6,173.0,115,2595,11.3,79,1,chevrolet citation 307 | 26.8,6,173.0,115,2700,12.9,79,1,oldsmobile omega brougham 308 | 33.5,4,151.0,90,2556,13.2,79,1,pontiac phoenix 309 | 41.5,4,98.0,76,2144,14.7,80,2,vw rabbit 310 | 38.1,4,89.0,60,1968,18.8,80,3,toyota corolla tercel 311 | 32.1,4,98.0,70,2120,15.5,80,1,chevrolet chevette 312 | 37.2,4,86.0,65,2019,16.4,80,3,datsun 310 313 | 28.0,4,151.0,90,2678,16.5,80,1,chevrolet citation 314 | 26.4,4,140.0,88,2870,18.1,80,1,ford fairmont 315 | 24.3,4,151.0,90,3003,20.1,80,1,amc concord 316 | 19.1,6,225.0,90,3381,18.7,80,1,dodge aspen 317 | 34.3,4,97.0,78,2188,15.8,80,2,audi 4000 318 | 29.8,4,134.0,90,2711,15.5,80,3,toyota corona liftback 319 | 31.3,4,120.0,75,2542,17.5,80,3,mazda 626 320 | 37.0,4,119.0,92,2434,15.0,80,3,datsun 510 hatchback 321 | 32.2,4,108.0,75,2265,15.2,80,3,toyota corolla 322 | 46.6,4,86.0,65,2110,17.9,80,3,mazda glc 323 | 
27.9,4,156.0,105,2800,14.4,80,1,dodge colt 324 | 40.8,4,85.0,65,2110,19.2,80,3,datsun 210 325 | 44.3,4,90.0,48,2085,21.7,80,2,vw rabbit c (diesel) 326 | 43.4,4,90.0,48,2335,23.7,80,2,vw dasher (diesel) 327 | 36.4,5,121.0,67,2950,19.9,80,2,audi 5000s (diesel) 328 | 30.0,4,146.0,67,3250,21.8,80,2,mercedes-benz 240d 329 | 44.6,4,91.0,67,1850,13.8,80,3,honda civic 1500 gl 330 | 33.8,4,97.0,67,2145,18.0,80,3,subaru dl 331 | 29.8,4,89.0,62,1845,15.3,80,2,vokswagen rabbit 332 | 32.7,6,168.0,132,2910,11.4,80,3,datsun 280-zx 333 | 23.7,3,70.0,100,2420,12.5,80,3,mazda rx-7 gs 334 | 35.0,4,122.0,88,2500,15.1,80,2,triumph tr7 coupe 335 | 32.4,4,107.0,72,2290,17.0,80,3,honda accord 336 | 27.2,4,135.0,84,2490,15.7,81,1,plymouth reliant 337 | 26.6,4,151.0,84,2635,16.4,81,1,buick skylark 338 | 25.8,4,156.0,92,2620,14.4,81,1,dodge aries wagon (sw) 339 | 23.5,6,173.0,110,2725,12.6,81,1,chevrolet citation 340 | 30.0,4,135.0,84,2385,12.9,81,1,plymouth reliant 341 | 39.1,4,79.0,58,1755,16.9,81,3,toyota starlet 342 | 39.0,4,86.0,64,1875,16.4,81,1,plymouth champ 343 | 35.1,4,81.0,60,1760,16.1,81,3,honda civic 1300 344 | 32.3,4,97.0,67,2065,17.8,81,3,subaru 345 | 37.0,4,85.0,65,1975,19.4,81,3,datsun 210 mpg 346 | 37.7,4,89.0,62,2050,17.3,81,3,toyota tercel 347 | 34.1,4,91.0,68,1985,16.0,81,3,mazda glc 4 348 | 34.7,4,105.0,63,2215,14.9,81,1,plymouth horizon 4 349 | 34.4,4,98.0,65,2045,16.2,81,1,ford escort 4w 350 | 29.9,4,98.0,65,2380,20.7,81,1,ford escort 2h 351 | 33.0,4,105.0,74,2190,14.2,81,2,volkswagen jetta 352 | 33.7,4,107.0,75,2210,14.4,81,3,honda prelude 353 | 32.4,4,108.0,75,2350,16.8,81,3,toyota corolla 354 | 32.9,4,119.0,100,2615,14.8,81,3,datsun 200sx 355 | 31.6,4,120.0,74,2635,18.3,81,3,mazda 626 356 | 28.1,4,141.0,80,3230,20.4,81,2,peugeot 505s turbo diesel 357 | 30.7,6,145.0,76,3160,19.6,81,2,volvo diesel 358 | 25.4,6,168.0,116,2900,12.6,81,3,toyota cressida 359 | 24.2,6,146.0,120,2930,13.8,81,3,datsun 810 maxima 360 | 22.4,6,231.0,110,3415,15.8,81,1,buick century 361 | 26.6,8,350.0,105,3725,19.0,81,1,oldsmobile cutlass ls 362 | 20.2,6,200.0,88,3060,17.1,81,1,ford granada gl 363 | 17.6,6,225.0,85,3465,16.6,81,1,chrysler lebaron salon 364 | 28.0,4,112.0,88,2605,19.6,82,1,chevrolet cavalier 365 | 27.0,4,112.0,88,2640,18.6,82,1,chevrolet cavalier wagon 366 | 34.0,4,112.0,88,2395,18.0,82,1,chevrolet cavalier 2-door 367 | 31.0,4,112.0,85,2575,16.2,82,1,pontiac j2000 se hatchback 368 | 29.0,4,135.0,84,2525,16.0,82,1,dodge aries se 369 | 27.0,4,151.0,90,2735,18.0,82,1,pontiac phoenix 370 | 24.0,4,140.0,92,2865,16.4,82,1,ford fairmont futura 371 | 36.0,4,105.0,74,1980,15.3,82,2,volkswagen rabbit l 372 | 37.0,4,91.0,68,2025,18.2,82,3,mazda glc custom l 373 | 31.0,4,91.0,68,1970,17.6,82,3,mazda glc custom 374 | 38.0,4,105.0,63,2125,14.7,82,1,plymouth horizon miser 375 | 36.0,4,98.0,70,2125,17.3,82,1,mercury lynx l 376 | 36.0,4,120.0,88,2160,14.5,82,3,nissan stanza xe 377 | 36.0,4,107.0,75,2205,14.5,82,3,honda accord 378 | 34.0,4,108.0,70,2245,16.9,82,3,toyota corolla 379 | 38.0,4,91.0,67,1965,15.0,82,3,honda civic 380 | 32.0,4,91.0,67,1965,15.7,82,3,honda civic (auto) 381 | 38.0,4,91.0,67,1995,16.2,82,3,datsun 310 gx 382 | 25.0,6,181.0,110,2945,16.4,82,1,buick century limited 383 | 38.0,6,262.0,85,3015,17.0,82,1,oldsmobile cutlass ciera (diesel) 384 | 26.0,4,156.0,92,2585,14.5,82,1,chrysler lebaron medallion 385 | 22.0,6,232.0,112,2835,14.7,82,1,ford granada l 386 | 32.0,4,144.0,96,2665,13.9,82,3,toyota celica gt 387 | 36.0,4,135.0,84,2370,13.0,82,1,dodge charger 2.2 388 | 
27.0,4,151.0,90,2950,17.3,82,1,chevrolet camaro 389 | 27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl 390 | 44.0,4,97.0,52,2130,24.6,82,2,vw pickup 391 | 32.0,4,135.0,84,2295,11.6,82,1,dodge rampage 392 | 28.0,4,120.0,79,2625,18.6,82,1,ford ranger 393 | 31.0,4,119.0,82,2720,19.4,82,1,chevy s-10 394 | -------------------------------------------------------------------------------- /Auto.data: -------------------------------------------------------------------------------- 1 | mpg cylinders displacement horsepower weight acceleration year origin name 2 | 18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" 3 | 15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" 4 | 18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" 5 | 16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" 6 | 17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" 7 | 15.0 8 429.0 198.0 4341. 10.0 70 1 "ford galaxie 500" 8 | 14.0 8 454.0 220.0 4354. 9.0 70 1 "chevrolet impala" 9 | 14.0 8 440.0 215.0 4312. 8.5 70 1 "plymouth fury iii" 10 | 14.0 8 455.0 225.0 4425. 10.0 70 1 "pontiac catalina" 11 | 15.0 8 390.0 190.0 3850. 8.5 70 1 "amc ambassador dpl" 12 | 15.0 8 383.0 170.0 3563. 10.0 70 1 "dodge challenger se" 13 | 14.0 8 340.0 160.0 3609. 8.0 70 1 "plymouth 'cuda 340" 14 | 15.0 8 400.0 150.0 3761. 9.5 70 1 "chevrolet monte carlo" 15 | 14.0 8 455.0 225.0 3086. 10.0 70 1 "buick estate wagon (sw)" 16 | 24.0 4 113.0 95.00 2372. 15.0 70 3 "toyota corona mark ii" 17 | 22.0 6 198.0 95.00 2833. 15.5 70 1 "plymouth duster" 18 | 18.0 6 199.0 97.00 2774. 15.5 70 1 "amc hornet" 19 | 21.0 6 200.0 85.00 2587. 16.0 70 1 "ford maverick" 20 | 27.0 4 97.00 88.00 2130. 14.5 70 3 "datsun pl510" 21 | 26.0 4 97.00 46.00 1835. 20.5 70 2 "volkswagen 1131 deluxe sedan" 22 | 25.0 4 110.0 87.00 2672. 17.5 70 2 "peugeot 504" 23 | 24.0 4 107.0 90.00 2430. 14.5 70 2 "audi 100 ls" 24 | 25.0 4 104.0 95.00 2375. 17.5 70 2 "saab 99e" 25 | 26.0 4 121.0 113.0 2234. 12.5 70 2 "bmw 2002" 26 | 21.0 6 199.0 90.00 2648. 15.0 70 1 "amc gremlin" 27 | 10.0 8 360.0 215.0 4615. 14.0 70 1 "ford f250" 28 | 10.0 8 307.0 200.0 4376. 15.0 70 1 "chevy c20" 29 | 11.0 8 318.0 210.0 4382. 13.5 70 1 "dodge d200" 30 | 9.0 8 304.0 193.0 4732. 18.5 70 1 "hi 1200d" 31 | 27.0 4 97.00 88.00 2130. 14.5 71 3 "datsun pl510" 32 | 28.0 4 140.0 90.00 2264. 15.5 71 1 "chevrolet vega 2300" 33 | 25.0 4 113.0 95.00 2228. 14.0 71 3 "toyota corona" 34 | 25.0 4 98.00 ? 2046. 19.0 71 1 "ford pinto" 35 | 19.0 6 232.0 100.0 2634. 13.0 71 1 "amc gremlin" 36 | 16.0 6 225.0 105.0 3439. 15.5 71 1 "plymouth satellite custom" 37 | 17.0 6 250.0 100.0 3329. 15.5 71 1 "chevrolet chevelle malibu" 38 | 19.0 6 250.0 88.00 3302. 15.5 71 1 "ford torino 500" 39 | 18.0 6 232.0 100.0 3288. 15.5 71 1 "amc matador" 40 | 14.0 8 350.0 165.0 4209. 12.0 71 1 "chevrolet impala" 41 | 14.0 8 400.0 175.0 4464. 11.5 71 1 "pontiac catalina brougham" 42 | 14.0 8 351.0 153.0 4154. 13.5 71 1 "ford galaxie 500" 43 | 14.0 8 318.0 150.0 4096. 13.0 71 1 "plymouth fury iii" 44 | 12.0 8 383.0 180.0 4955. 11.5 71 1 "dodge monaco (sw)" 45 | 13.0 8 400.0 170.0 4746. 12.0 71 1 "ford country squire (sw)" 46 | 13.0 8 400.0 175.0 5140. 12.0 71 1 "pontiac safari (sw)" 47 | 18.0 6 258.0 110.0 2962. 13.5 71 1 "amc hornet sportabout (sw)" 48 | 22.0 4 140.0 72.00 2408. 19.0 71 1 "chevrolet vega (sw)" 49 | 19.0 6 250.0 100.0 3282. 15.0 71 1 "pontiac firebird" 50 | 18.0 6 250.0 88.00 3139. 14.5 71 1 "ford mustang" 51 | 23.0 4 122.0 86.00 2220. 14.0 71 1 "mercury capri 2000" 52 | 28.0 4 116.0 90.00 2123. 
14.0 71 2 "opel 1900" 53 | 30.0 4 79.00 70.00 2074. 19.5 71 2 "peugeot 304" 54 | 30.0 4 88.00 76.00 2065. 14.5 71 2 "fiat 124b" 55 | 31.0 4 71.00 65.00 1773. 19.0 71 3 "toyota corolla 1200" 56 | 35.0 4 72.00 69.00 1613. 18.0 71 3 "datsun 1200" 57 | 27.0 4 97.00 60.00 1834. 19.0 71 2 "volkswagen model 111" 58 | 26.0 4 91.00 70.00 1955. 20.5 71 1 "plymouth cricket" 59 | 24.0 4 113.0 95.00 2278. 15.5 72 3 "toyota corona hardtop" 60 | 25.0 4 97.50 80.00 2126. 17.0 72 1 "dodge colt hardtop" 61 | 23.0 4 97.00 54.00 2254. 23.5 72 2 "volkswagen type 3" 62 | 20.0 4 140.0 90.00 2408. 19.5 72 1 "chevrolet vega" 63 | 21.0 4 122.0 86.00 2226. 16.5 72 1 "ford pinto runabout" 64 | 13.0 8 350.0 165.0 4274. 12.0 72 1 "chevrolet impala" 65 | 14.0 8 400.0 175.0 4385. 12.0 72 1 "pontiac catalina" 66 | 15.0 8 318.0 150.0 4135. 13.5 72 1 "plymouth fury iii" 67 | 14.0 8 351.0 153.0 4129. 13.0 72 1 "ford galaxie 500" 68 | 17.0 8 304.0 150.0 3672. 11.5 72 1 "amc ambassador sst" 69 | 11.0 8 429.0 208.0 4633. 11.0 72 1 "mercury marquis" 70 | 13.0 8 350.0 155.0 4502. 13.5 72 1 "buick lesabre custom" 71 | 12.0 8 350.0 160.0 4456. 13.5 72 1 "oldsmobile delta 88 royale" 72 | 13.0 8 400.0 190.0 4422. 12.5 72 1 "chrysler newport royal" 73 | 19.0 3 70.00 97.00 2330. 13.5 72 3 "mazda rx2 coupe" 74 | 15.0 8 304.0 150.0 3892. 12.5 72 1 "amc matador (sw)" 75 | 13.0 8 307.0 130.0 4098. 14.0 72 1 "chevrolet chevelle concours (sw)" 76 | 13.0 8 302.0 140.0 4294. 16.0 72 1 "ford gran torino (sw)" 77 | 14.0 8 318.0 150.0 4077. 14.0 72 1 "plymouth satellite custom (sw)" 78 | 18.0 4 121.0 112.0 2933. 14.5 72 2 "volvo 145e (sw)" 79 | 22.0 4 121.0 76.00 2511. 18.0 72 2 "volkswagen 411 (sw)" 80 | 21.0 4 120.0 87.00 2979. 19.5 72 2 "peugeot 504 (sw)" 81 | 26.0 4 96.00 69.00 2189. 18.0 72 2 "renault 12 (sw)" 82 | 22.0 4 122.0 86.00 2395. 16.0 72 1 "ford pinto (sw)" 83 | 28.0 4 97.00 92.00 2288. 17.0 72 3 "datsun 510 (sw)" 84 | 23.0 4 120.0 97.00 2506. 14.5 72 3 "toyouta corona mark ii (sw)" 85 | 28.0 4 98.00 80.00 2164. 15.0 72 1 "dodge colt (sw)" 86 | 27.0 4 97.00 88.00 2100. 16.5 72 3 "toyota corolla 1600 (sw)" 87 | 13.0 8 350.0 175.0 4100. 13.0 73 1 "buick century 350" 88 | 14.0 8 304.0 150.0 3672. 11.5 73 1 "amc matador" 89 | 13.0 8 350.0 145.0 3988. 13.0 73 1 "chevrolet malibu" 90 | 14.0 8 302.0 137.0 4042. 14.5 73 1 "ford gran torino" 91 | 15.0 8 318.0 150.0 3777. 12.5 73 1 "dodge coronet custom" 92 | 12.0 8 429.0 198.0 4952. 11.5 73 1 "mercury marquis brougham" 93 | 13.0 8 400.0 150.0 4464. 12.0 73 1 "chevrolet caprice classic" 94 | 13.0 8 351.0 158.0 4363. 13.0 73 1 "ford ltd" 95 | 14.0 8 318.0 150.0 4237. 14.5 73 1 "plymouth fury gran sedan" 96 | 13.0 8 440.0 215.0 4735. 11.0 73 1 "chrysler new yorker brougham" 97 | 12.0 8 455.0 225.0 4951. 11.0 73 1 "buick electra 225 custom" 98 | 13.0 8 360.0 175.0 3821. 11.0 73 1 "amc ambassador brougham" 99 | 18.0 6 225.0 105.0 3121. 16.5 73 1 "plymouth valiant" 100 | 16.0 6 250.0 100.0 3278. 18.0 73 1 "chevrolet nova custom" 101 | 18.0 6 232.0 100.0 2945. 16.0 73 1 "amc hornet" 102 | 18.0 6 250.0 88.00 3021. 16.5 73 1 "ford maverick" 103 | 23.0 6 198.0 95.00 2904. 16.0 73 1 "plymouth duster" 104 | 26.0 4 97.00 46.00 1950. 21.0 73 2 "volkswagen super beetle" 105 | 11.0 8 400.0 150.0 4997. 14.0 73 1 "chevrolet impala" 106 | 12.0 8 400.0 167.0 4906. 12.5 73 1 "ford country" 107 | 13.0 8 360.0 170.0 4654. 13.0 73 1 "plymouth custom suburb" 108 | 12.0 8 350.0 180.0 4499. 12.5 73 1 "oldsmobile vista cruiser" 109 | 18.0 6 232.0 100.0 2789. 15.0 73 1 "amc gremlin" 110 | 20.0 4 97.00 88.00 2279. 
19.0 73 3 "toyota carina" 111 | 21.0 4 140.0 72.00 2401. 19.5 73 1 "chevrolet vega" 112 | 22.0 4 108.0 94.00 2379. 16.5 73 3 "datsun 610" 113 | 18.0 3 70.00 90.00 2124. 13.5 73 3 "maxda rx3" 114 | 19.0 4 122.0 85.00 2310. 18.5 73 1 "ford pinto" 115 | 21.0 6 155.0 107.0 2472. 14.0 73 1 "mercury capri v6" 116 | 26.0 4 98.00 90.00 2265. 15.5 73 2 "fiat 124 sport coupe" 117 | 15.0 8 350.0 145.0 4082. 13.0 73 1 "chevrolet monte carlo s" 118 | 16.0 8 400.0 230.0 4278. 9.50 73 1 "pontiac grand prix" 119 | 29.0 4 68.00 49.00 1867. 19.5 73 2 "fiat 128" 120 | 24.0 4 116.0 75.00 2158. 15.5 73 2 "opel manta" 121 | 20.0 4 114.0 91.00 2582. 14.0 73 2 "audi 100ls" 122 | 19.0 4 121.0 112.0 2868. 15.5 73 2 "volvo 144ea" 123 | 15.0 8 318.0 150.0 3399. 11.0 73 1 "dodge dart custom" 124 | 24.0 4 121.0 110.0 2660. 14.0 73 2 "saab 99le" 125 | 20.0 6 156.0 122.0 2807. 13.5 73 3 "toyota mark ii" 126 | 11.0 8 350.0 180.0 3664. 11.0 73 1 "oldsmobile omega" 127 | 20.0 6 198.0 95.00 3102. 16.5 74 1 "plymouth duster" 128 | 21.0 6 200.0 ? 2875. 17.0 74 1 "ford maverick" 129 | 19.0 6 232.0 100.0 2901. 16.0 74 1 "amc hornet" 130 | 15.0 6 250.0 100.0 3336. 17.0 74 1 "chevrolet nova" 131 | 31.0 4 79.00 67.00 1950. 19.0 74 3 "datsun b210" 132 | 26.0 4 122.0 80.00 2451. 16.5 74 1 "ford pinto" 133 | 32.0 4 71.00 65.00 1836. 21.0 74 3 "toyota corolla 1200" 134 | 25.0 4 140.0 75.00 2542. 17.0 74 1 "chevrolet vega" 135 | 16.0 6 250.0 100.0 3781. 17.0 74 1 "chevrolet chevelle malibu classic" 136 | 16.0 6 258.0 110.0 3632. 18.0 74 1 "amc matador" 137 | 18.0 6 225.0 105.0 3613. 16.5 74 1 "plymouth satellite sebring" 138 | 16.0 8 302.0 140.0 4141. 14.0 74 1 "ford gran torino" 139 | 13.0 8 350.0 150.0 4699. 14.5 74 1 "buick century luxus (sw)" 140 | 14.0 8 318.0 150.0 4457. 13.5 74 1 "dodge coronet custom (sw)" 141 | 14.0 8 302.0 140.0 4638. 16.0 74 1 "ford gran torino (sw)" 142 | 14.0 8 304.0 150.0 4257. 15.5 74 1 "amc matador (sw)" 143 | 29.0 4 98.00 83.00 2219. 16.5 74 2 "audi fox" 144 | 26.0 4 79.00 67.00 1963. 15.5 74 2 "volkswagen dasher" 145 | 26.0 4 97.00 78.00 2300. 14.5 74 2 "opel manta" 146 | 31.0 4 76.00 52.00 1649. 16.5 74 3 "toyota corona" 147 | 32.0 4 83.00 61.00 2003. 19.0 74 3 "datsun 710" 148 | 28.0 4 90.00 75.00 2125. 14.5 74 1 "dodge colt" 149 | 24.0 4 90.00 75.00 2108. 15.5 74 2 "fiat 128" 150 | 26.0 4 116.0 75.00 2246. 14.0 74 2 "fiat 124 tc" 151 | 24.0 4 120.0 97.00 2489. 15.0 74 3 "honda civic" 152 | 26.0 4 108.0 93.00 2391. 15.5 74 3 "subaru" 153 | 31.0 4 79.00 67.00 2000. 16.0 74 2 "fiat x1.9" 154 | 19.0 6 225.0 95.00 3264. 16.0 75 1 "plymouth valiant custom" 155 | 18.0 6 250.0 105.0 3459. 16.0 75 1 "chevrolet nova" 156 | 15.0 6 250.0 72.00 3432. 21.0 75 1 "mercury monarch" 157 | 15.0 6 250.0 72.00 3158. 19.5 75 1 "ford maverick" 158 | 16.0 8 400.0 170.0 4668. 11.5 75 1 "pontiac catalina" 159 | 15.0 8 350.0 145.0 4440. 14.0 75 1 "chevrolet bel air" 160 | 16.0 8 318.0 150.0 4498. 14.5 75 1 "plymouth grand fury" 161 | 14.0 8 351.0 148.0 4657. 13.5 75 1 "ford ltd" 162 | 17.0 6 231.0 110.0 3907. 21.0 75 1 "buick century" 163 | 16.0 6 250.0 105.0 3897. 18.5 75 1 "chevroelt chevelle malibu" 164 | 15.0 6 258.0 110.0 3730. 19.0 75 1 "amc matador" 165 | 18.0 6 225.0 95.00 3785. 19.0 75 1 "plymouth fury" 166 | 21.0 6 231.0 110.0 3039. 15.0 75 1 "buick skyhawk" 167 | 20.0 8 262.0 110.0 3221. 13.5 75 1 "chevrolet monza 2+2" 168 | 13.0 8 302.0 129.0 3169. 12.0 75 1 "ford mustang ii" 169 | 29.0 4 97.00 75.00 2171. 16.0 75 3 "toyota corolla" 170 | 23.0 4 140.0 83.00 2639. 
17.0 75 1 "ford pinto" 171 | 20.0 6 232.0 100.0 2914. 16.0 75 1 "amc gremlin" 172 | 23.0 4 140.0 78.00 2592. 18.5 75 1 "pontiac astro" 173 | 24.0 4 134.0 96.00 2702. 13.5 75 3 "toyota corona" 174 | 25.0 4 90.00 71.00 2223. 16.5 75 2 "volkswagen dasher" 175 | 24.0 4 119.0 97.00 2545. 17.0 75 3 "datsun 710" 176 | 18.0 6 171.0 97.00 2984. 14.5 75 1 "ford pinto" 177 | 29.0 4 90.00 70.00 1937. 14.0 75 2 "volkswagen rabbit" 178 | 19.0 6 232.0 90.00 3211. 17.0 75 1 "amc pacer" 179 | 23.0 4 115.0 95.00 2694. 15.0 75 2 "audi 100ls" 180 | 23.0 4 120.0 88.00 2957. 17.0 75 2 "peugeot 504" 181 | 22.0 4 121.0 98.00 2945. 14.5 75 2 "volvo 244dl" 182 | 25.0 4 121.0 115.0 2671. 13.5 75 2 "saab 99le" 183 | 33.0 4 91.00 53.00 1795. 17.5 75 3 "honda civic cvcc" 184 | 28.0 4 107.0 86.00 2464. 15.5 76 2 "fiat 131" 185 | 25.0 4 116.0 81.00 2220. 16.9 76 2 "opel 1900" 186 | 25.0 4 140.0 92.00 2572. 14.9 76 1 "capri ii" 187 | 26.0 4 98.00 79.00 2255. 17.7 76 1 "dodge colt" 188 | 27.0 4 101.0 83.00 2202. 15.3 76 2 "renault 12tl" 189 | 17.5 8 305.0 140.0 4215. 13.0 76 1 "chevrolet chevelle malibu classic" 190 | 16.0 8 318.0 150.0 4190. 13.0 76 1 "dodge coronet brougham" 191 | 15.5 8 304.0 120.0 3962. 13.9 76 1 "amc matador" 192 | 14.5 8 351.0 152.0 4215. 12.8 76 1 "ford gran torino" 193 | 22.0 6 225.0 100.0 3233. 15.4 76 1 "plymouth valiant" 194 | 22.0 6 250.0 105.0 3353. 14.5 76 1 "chevrolet nova" 195 | 24.0 6 200.0 81.00 3012. 17.6 76 1 "ford maverick" 196 | 22.5 6 232.0 90.00 3085. 17.6 76 1 "amc hornet" 197 | 29.0 4 85.00 52.00 2035. 22.2 76 1 "chevrolet chevette" 198 | 24.5 4 98.00 60.00 2164. 22.1 76 1 "chevrolet woody" 199 | 29.0 4 90.00 70.00 1937. 14.2 76 2 "vw rabbit" 200 | 33.0 4 91.00 53.00 1795. 17.4 76 3 "honda civic" 201 | 20.0 6 225.0 100.0 3651. 17.7 76 1 "dodge aspen se" 202 | 18.0 6 250.0 78.00 3574. 21.0 76 1 "ford granada ghia" 203 | 18.5 6 250.0 110.0 3645. 16.2 76 1 "pontiac ventura sj" 204 | 17.5 6 258.0 95.00 3193. 17.8 76 1 "amc pacer d/l" 205 | 29.5 4 97.00 71.00 1825. 12.2 76 2 "volkswagen rabbit" 206 | 32.0 4 85.00 70.00 1990. 17.0 76 3 "datsun b-210" 207 | 28.0 4 97.00 75.00 2155. 16.4 76 3 "toyota corolla" 208 | 26.5 4 140.0 72.00 2565. 13.6 76 1 "ford pinto" 209 | 20.0 4 130.0 102.0 3150. 15.7 76 2 "volvo 245" 210 | 13.0 8 318.0 150.0 3940. 13.2 76 1 "plymouth volare premier v8" 211 | 19.0 4 120.0 88.00 3270. 21.9 76 2 "peugeot 504" 212 | 19.0 6 156.0 108.0 2930. 15.5 76 3 "toyota mark ii" 213 | 16.5 6 168.0 120.0 3820. 16.7 76 2 "mercedes-benz 280s" 214 | 16.5 8 350.0 180.0 4380. 12.1 76 1 "cadillac seville" 215 | 13.0 8 350.0 145.0 4055. 12.0 76 1 "chevy c10" 216 | 13.0 8 302.0 130.0 3870. 15.0 76 1 "ford f108" 217 | 13.0 8 318.0 150.0 3755. 14.0 76 1 "dodge d100" 218 | 31.5 4 98.00 68.00 2045. 18.5 77 3 "honda accord cvcc" 219 | 30.0 4 111.0 80.00 2155. 14.8 77 1 "buick opel isuzu deluxe" 220 | 36.0 4 79.00 58.00 1825. 18.6 77 2 "renault 5 gtl" 221 | 25.5 4 122.0 96.00 2300. 15.5 77 1 "plymouth arrow gs" 222 | 33.5 4 85.00 70.00 1945. 16.8 77 3 "datsun f-10 hatchback" 223 | 17.5 8 305.0 145.0 3880. 12.5 77 1 "chevrolet caprice classic" 224 | 17.0 8 260.0 110.0 4060. 19.0 77 1 "oldsmobile cutlass supreme" 225 | 15.5 8 318.0 145.0 4140. 13.7 77 1 "dodge monaco brougham" 226 | 15.0 8 302.0 130.0 4295. 14.9 77 1 "mercury cougar brougham" 227 | 17.5 6 250.0 110.0 3520. 16.4 77 1 "chevrolet concours" 228 | 20.5 6 231.0 105.0 3425. 16.9 77 1 "buick skylark" 229 | 19.0 6 225.0 100.0 3630. 17.7 77 1 "plymouth volare custom" 230 | 18.5 6 250.0 98.00 3525. 
19.0 77 1 "ford granada" 231 | 16.0 8 400.0 180.0 4220. 11.1 77 1 "pontiac grand prix lj" 232 | 15.5 8 350.0 170.0 4165. 11.4 77 1 "chevrolet monte carlo landau" 233 | 15.5 8 400.0 190.0 4325. 12.2 77 1 "chrysler cordoba" 234 | 16.0 8 351.0 149.0 4335. 14.5 77 1 "ford thunderbird" 235 | 29.0 4 97.00 78.00 1940. 14.5 77 2 "volkswagen rabbit custom" 236 | 24.5 4 151.0 88.00 2740. 16.0 77 1 "pontiac sunbird coupe" 237 | 26.0 4 97.00 75.00 2265. 18.2 77 3 "toyota corolla liftback" 238 | 25.5 4 140.0 89.00 2755. 15.8 77 1 "ford mustang ii 2+2" 239 | 30.5 4 98.00 63.00 2051. 17.0 77 1 "chevrolet chevette" 240 | 33.5 4 98.00 83.00 2075. 15.9 77 1 "dodge colt m/m" 241 | 30.0 4 97.00 67.00 1985. 16.4 77 3 "subaru dl" 242 | 30.5 4 97.00 78.00 2190. 14.1 77 2 "volkswagen dasher" 243 | 22.0 6 146.0 97.00 2815. 14.5 77 3 "datsun 810" 244 | 21.5 4 121.0 110.0 2600. 12.8 77 2 "bmw 320i" 245 | 21.5 3 80.00 110.0 2720. 13.5 77 3 "mazda rx-4" 246 | 43.1 4 90.00 48.00 1985. 21.5 78 2 "volkswagen rabbit custom diesel" 247 | 36.1 4 98.00 66.00 1800. 14.4 78 1 "ford fiesta" 248 | 32.8 4 78.00 52.00 1985. 19.4 78 3 "mazda glc deluxe" 249 | 39.4 4 85.00 70.00 2070. 18.6 78 3 "datsun b210 gx" 250 | 36.1 4 91.00 60.00 1800. 16.4 78 3 "honda civic cvcc" 251 | 19.9 8 260.0 110.0 3365. 15.5 78 1 "oldsmobile cutlass salon brougham" 252 | 19.4 8 318.0 140.0 3735. 13.2 78 1 "dodge diplomat" 253 | 20.2 8 302.0 139.0 3570. 12.8 78 1 "mercury monarch ghia" 254 | 19.2 6 231.0 105.0 3535. 19.2 78 1 "pontiac phoenix lj" 255 | 20.5 6 200.0 95.00 3155. 18.2 78 1 "chevrolet malibu" 256 | 20.2 6 200.0 85.00 2965. 15.8 78 1 "ford fairmont (auto)" 257 | 25.1 4 140.0 88.00 2720. 15.4 78 1 "ford fairmont (man)" 258 | 20.5 6 225.0 100.0 3430. 17.2 78 1 "plymouth volare" 259 | 19.4 6 232.0 90.00 3210. 17.2 78 1 "amc concord" 260 | 20.6 6 231.0 105.0 3380. 15.8 78 1 "buick century special" 261 | 20.8 6 200.0 85.00 3070. 16.7 78 1 "mercury zephyr" 262 | 18.6 6 225.0 110.0 3620. 18.7 78 1 "dodge aspen" 263 | 18.1 6 258.0 120.0 3410. 15.1 78 1 "amc concord d/l" 264 | 19.2 8 305.0 145.0 3425. 13.2 78 1 "chevrolet monte carlo landau" 265 | 17.7 6 231.0 165.0 3445. 13.4 78 1 "buick regal sport coupe (turbo)" 266 | 18.1 8 302.0 139.0 3205. 11.2 78 1 "ford futura" 267 | 17.5 8 318.0 140.0 4080. 13.7 78 1 "dodge magnum xe" 268 | 30.0 4 98.00 68.00 2155. 16.5 78 1 "chevrolet chevette" 269 | 27.5 4 134.0 95.00 2560. 14.2 78 3 "toyota corona" 270 | 27.2 4 119.0 97.00 2300. 14.7 78 3 "datsun 510" 271 | 30.9 4 105.0 75.00 2230. 14.5 78 1 "dodge omni" 272 | 21.1 4 134.0 95.00 2515. 14.8 78 3 "toyota celica gt liftback" 273 | 23.2 4 156.0 105.0 2745. 16.7 78 1 "plymouth sapporo" 274 | 23.8 4 151.0 85.00 2855. 17.6 78 1 "oldsmobile starfire sx" 275 | 23.9 4 119.0 97.00 2405. 14.9 78 3 "datsun 200-sx" 276 | 20.3 5 131.0 103.0 2830. 15.9 78 2 "audi 5000" 277 | 17.0 6 163.0 125.0 3140. 13.6 78 2 "volvo 264gl" 278 | 21.6 4 121.0 115.0 2795. 15.7 78 2 "saab 99gle" 279 | 16.2 6 163.0 133.0 3410. 15.8 78 2 "peugeot 604sl" 280 | 31.5 4 89.00 71.00 1990. 14.9 78 2 "volkswagen scirocco" 281 | 29.5 4 98.00 68.00 2135. 16.6 78 3 "honda accord lx" 282 | 21.5 6 231.0 115.0 3245. 15.4 79 1 "pontiac lemans v6" 283 | 19.8 6 200.0 85.00 2990. 18.2 79 1 "mercury zephyr 6" 284 | 22.3 4 140.0 88.00 2890. 17.3 79 1 "ford fairmont 4" 285 | 20.2 6 232.0 90.00 3265. 18.2 79 1 "amc concord dl 6" 286 | 20.6 6 225.0 110.0 3360. 16.6 79 1 "dodge aspen 6" 287 | 17.0 8 305.0 130.0 3840. 15.4 79 1 "chevrolet caprice classic" 288 | 17.6 8 302.0 129.0 3725. 
13.4 79 1 "ford ltd landau" 289 | 16.5 8 351.0 138.0 3955. 13.2 79 1 "mercury grand marquis" 290 | 18.2 8 318.0 135.0 3830. 15.2 79 1 "dodge st. regis" 291 | 16.9 8 350.0 155.0 4360. 14.9 79 1 "buick estate wagon (sw)" 292 | 15.5 8 351.0 142.0 4054. 14.3 79 1 "ford country squire (sw)" 293 | 19.2 8 267.0 125.0 3605. 15.0 79 1 "chevrolet malibu classic (sw)" 294 | 18.5 8 360.0 150.0 3940. 13.0 79 1 "chrysler lebaron town @ country (sw)" 295 | 31.9 4 89.00 71.00 1925. 14.0 79 2 "vw rabbit custom" 296 | 34.1 4 86.00 65.00 1975. 15.2 79 3 "maxda glc deluxe" 297 | 35.7 4 98.00 80.00 1915. 14.4 79 1 "dodge colt hatchback custom" 298 | 27.4 4 121.0 80.00 2670. 15.0 79 1 "amc spirit dl" 299 | 25.4 5 183.0 77.00 3530. 20.1 79 2 "mercedes benz 300d" 300 | 23.0 8 350.0 125.0 3900. 17.4 79 1 "cadillac eldorado" 301 | 27.2 4 141.0 71.00 3190. 24.8 79 2 "peugeot 504" 302 | 23.9 8 260.0 90.00 3420. 22.2 79 1 "oldsmobile cutlass salon brougham" 303 | 34.2 4 105.0 70.00 2200. 13.2 79 1 "plymouth horizon" 304 | 34.5 4 105.0 70.00 2150. 14.9 79 1 "plymouth horizon tc3" 305 | 31.8 4 85.00 65.00 2020. 19.2 79 3 "datsun 210" 306 | 37.3 4 91.00 69.00 2130. 14.7 79 2 "fiat strada custom" 307 | 28.4 4 151.0 90.00 2670. 16.0 79 1 "buick skylark limited" 308 | 28.8 6 173.0 115.0 2595. 11.3 79 1 "chevrolet citation" 309 | 26.8 6 173.0 115.0 2700. 12.9 79 1 "oldsmobile omega brougham" 310 | 33.5 4 151.0 90.00 2556. 13.2 79 1 "pontiac phoenix" 311 | 41.5 4 98.00 76.00 2144. 14.7 80 2 "vw rabbit" 312 | 38.1 4 89.00 60.00 1968. 18.8 80 3 "toyota corolla tercel" 313 | 32.1 4 98.00 70.00 2120. 15.5 80 1 "chevrolet chevette" 314 | 37.2 4 86.00 65.00 2019. 16.4 80 3 "datsun 310" 315 | 28.0 4 151.0 90.00 2678. 16.5 80 1 "chevrolet citation" 316 | 26.4 4 140.0 88.00 2870. 18.1 80 1 "ford fairmont" 317 | 24.3 4 151.0 90.00 3003. 20.1 80 1 "amc concord" 318 | 19.1 6 225.0 90.00 3381. 18.7 80 1 "dodge aspen" 319 | 34.3 4 97.00 78.00 2188. 15.8 80 2 "audi 4000" 320 | 29.8 4 134.0 90.00 2711. 15.5 80 3 "toyota corona liftback" 321 | 31.3 4 120.0 75.00 2542. 17.5 80 3 "mazda 626" 322 | 37.0 4 119.0 92.00 2434. 15.0 80 3 "datsun 510 hatchback" 323 | 32.2 4 108.0 75.00 2265. 15.2 80 3 "toyota corolla" 324 | 46.6 4 86.00 65.00 2110. 17.9 80 3 "mazda glc" 325 | 27.9 4 156.0 105.0 2800. 14.4 80 1 "dodge colt" 326 | 40.8 4 85.00 65.00 2110. 19.2 80 3 "datsun 210" 327 | 44.3 4 90.00 48.00 2085. 21.7 80 2 "vw rabbit c (diesel)" 328 | 43.4 4 90.00 48.00 2335. 23.7 80 2 "vw dasher (diesel)" 329 | 36.4 5 121.0 67.00 2950. 19.9 80 2 "audi 5000s (diesel)" 330 | 30.0 4 146.0 67.00 3250. 21.8 80 2 "mercedes-benz 240d" 331 | 44.6 4 91.00 67.00 1850. 13.8 80 3 "honda civic 1500 gl" 332 | 40.9 4 85.00 ? 1835. 17.3 80 2 "renault lecar deluxe" 333 | 33.8 4 97.00 67.00 2145. 18.0 80 3 "subaru dl" 334 | 29.8 4 89.00 62.00 1845. 15.3 80 2 "vokswagen rabbit" 335 | 32.7 6 168.0 132.0 2910. 11.4 80 3 "datsun 280-zx" 336 | 23.7 3 70.00 100.0 2420. 12.5 80 3 "mazda rx-7 gs" 337 | 35.0 4 122.0 88.00 2500. 15.1 80 2 "triumph tr7 coupe" 338 | 23.6 4 140.0 ? 2905. 14.3 80 1 "ford mustang cobra" 339 | 32.4 4 107.0 72.00 2290. 17.0 80 3 "honda accord" 340 | 27.2 4 135.0 84.00 2490. 15.7 81 1 "plymouth reliant" 341 | 26.6 4 151.0 84.00 2635. 16.4 81 1 "buick skylark" 342 | 25.8 4 156.0 92.00 2620. 14.4 81 1 "dodge aries wagon (sw)" 343 | 23.5 6 173.0 110.0 2725. 12.6 81 1 "chevrolet citation" 344 | 30.0 4 135.0 84.00 2385. 12.9 81 1 "plymouth reliant" 345 | 39.1 4 79.00 58.00 1755. 16.9 81 3 "toyota starlet" 346 | 39.0 4 86.00 64.00 1875. 
16.4 81 1 "plymouth champ" 347 | 35.1 4 81.00 60.00 1760. 16.1 81 3 "honda civic 1300" 348 | 32.3 4 97.00 67.00 2065. 17.8 81 3 "subaru" 349 | 37.0 4 85.00 65.00 1975. 19.4 81 3 "datsun 210 mpg" 350 | 37.7 4 89.00 62.00 2050. 17.3 81 3 "toyota tercel" 351 | 34.1 4 91.00 68.00 1985. 16.0 81 3 "mazda glc 4" 352 | 34.7 4 105.0 63.00 2215. 14.9 81 1 "plymouth horizon 4" 353 | 34.4 4 98.00 65.00 2045. 16.2 81 1 "ford escort 4w" 354 | 29.9 4 98.00 65.00 2380. 20.7 81 1 "ford escort 2h" 355 | 33.0 4 105.0 74.00 2190. 14.2 81 2 "volkswagen jetta" 356 | 34.5 4 100.0 ? 2320. 15.8 81 2 "renault 18i" 357 | 33.7 4 107.0 75.00 2210. 14.4 81 3 "honda prelude" 358 | 32.4 4 108.0 75.00 2350. 16.8 81 3 "toyota corolla" 359 | 32.9 4 119.0 100.0 2615. 14.8 81 3 "datsun 200sx" 360 | 31.6 4 120.0 74.00 2635. 18.3 81 3 "mazda 626" 361 | 28.1 4 141.0 80.00 3230. 20.4 81 2 "peugeot 505s turbo diesel" 362 | 30.7 6 145.0 76.00 3160. 19.6 81 2 "volvo diesel" 363 | 25.4 6 168.0 116.0 2900. 12.6 81 3 "toyota cressida" 364 | 24.2 6 146.0 120.0 2930. 13.8 81 3 "datsun 810 maxima" 365 | 22.4 6 231.0 110.0 3415. 15.8 81 1 "buick century" 366 | 26.6 8 350.0 105.0 3725. 19.0 81 1 "oldsmobile cutlass ls" 367 | 20.2 6 200.0 88.00 3060. 17.1 81 1 "ford granada gl" 368 | 17.6 6 225.0 85.00 3465. 16.6 81 1 "chrysler lebaron salon" 369 | 28.0 4 112.0 88.00 2605. 19.6 82 1 "chevrolet cavalier" 370 | 27.0 4 112.0 88.00 2640. 18.6 82 1 "chevrolet cavalier wagon" 371 | 34.0 4 112.0 88.00 2395. 18.0 82 1 "chevrolet cavalier 2-door" 372 | 31.0 4 112.0 85.00 2575. 16.2 82 1 "pontiac j2000 se hatchback" 373 | 29.0 4 135.0 84.00 2525. 16.0 82 1 "dodge aries se" 374 | 27.0 4 151.0 90.00 2735. 18.0 82 1 "pontiac phoenix" 375 | 24.0 4 140.0 92.00 2865. 16.4 82 1 "ford fairmont futura" 376 | 36.0 4 105.0 74.00 1980. 15.3 82 2 "volkswagen rabbit l" 377 | 37.0 4 91.00 68.00 2025. 18.2 82 3 "mazda glc custom l" 378 | 31.0 4 91.00 68.00 1970. 17.6 82 3 "mazda glc custom" 379 | 38.0 4 105.0 63.00 2125. 14.7 82 1 "plymouth horizon miser" 380 | 36.0 4 98.00 70.00 2125. 17.3 82 1 "mercury lynx l" 381 | 36.0 4 120.0 88.00 2160. 14.5 82 3 "nissan stanza xe" 382 | 36.0 4 107.0 75.00 2205. 14.5 82 3 "honda accord" 383 | 34.0 4 108.0 70.00 2245 16.9 82 3 "toyota corolla" 384 | 38.0 4 91.00 67.00 1965. 15.0 82 3 "honda civic" 385 | 32.0 4 91.00 67.00 1965. 15.7 82 3 "honda civic (auto)" 386 | 38.0 4 91.00 67.00 1995. 16.2 82 3 "datsun 310 gx" 387 | 25.0 6 181.0 110.0 2945. 16.4 82 1 "buick century limited" 388 | 38.0 6 262.0 85.00 3015. 17.0 82 1 "oldsmobile cutlass ciera (diesel)" 389 | 26.0 4 156.0 92.00 2585. 14.5 82 1 "chrysler lebaron medallion" 390 | 22.0 6 232.0 112.0 2835 14.7 82 1 "ford granada l" 391 | 32.0 4 144.0 96.00 2665. 13.9 82 3 "toyota celica gt" 392 | 36.0 4 135.0 84.00 2370. 13.0 82 1 "dodge charger 2.2" 393 | 27.0 4 151.0 90.00 2950. 17.3 82 1 "chevrolet camaro" 394 | 27.0 4 140.0 86.00 2790. 15.6 82 1 "ford mustang gl" 395 | 44.0 4 97.00 52.00 2130. 24.6 82 2 "vw pickup" 396 | 32.0 4 135.0 84.00 2295. 11.6 82 1 "dodge rampage" 397 | 28.0 4 120.0 79.00 2625. 18.6 82 1 "ford ranger" 398 | 31.0 4 119.0 82.00 2720. 
19.4 82 1 "chevy s-10" 399 | -------------------------------------------------------------------------------- /Ch03-linreg-lab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: -all 5 | formats: ipynb,Rmd 6 | main_language: python 7 | text_representation: 8 | extension: .Rmd 9 | format_name: rmarkdown 10 | format_version: '1.2' 11 | jupytext_version: 1.16.7 12 | --- 13 | 14 | # Linear Regression 15 | 16 | 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch03-linreg-lab.ipynb) 21 | 22 | 23 | 24 | ## Importing packages 25 | We import our standard libraries at this top 26 | level. 27 | 28 | ```{python} 29 | import numpy as np 30 | import pandas as pd 31 | from matplotlib.pyplot import subplots 32 | 33 | ``` 34 | 35 | 36 | ### New imports 37 | Throughout this lab we will introduce new functions and libraries. However, 38 | we will import them here to emphasize these are the new 39 | code objects in this lab. Keeping imports near the top 40 | of a notebook makes the code more readable, since scanning the first few 41 | lines tells us what libraries are used. 42 | 43 | ```{python} 44 | import statsmodels.api as sm 45 | 46 | ``` 47 | We will provide relevant details about the 48 | functions below as they are needed. 49 | 50 | Besides importing whole modules, it is also possible 51 | to import only a few items from a given module. This 52 | will help keep the *namespace* clean. 53 | We will use a few specific objects from the `statsmodels` package 54 | which we import here. 55 | 56 | ```{python} 57 | from statsmodels.stats.outliers_influence \ 58 | import variance_inflation_factor as VIF 59 | from statsmodels.stats.anova import anova_lm 60 | 61 | ``` 62 | 63 | As one of the import statements above is quite a long line, we inserted a line break `\` to 64 | ease readability. 65 | 66 | We will also use some functions written for the labs in this book in the `ISLP` 67 | package. 68 | 69 | ```{python} 70 | from ISLP import load_data 71 | from ISLP.models import (ModelSpec as MS, 72 | summarize, 73 | poly) 74 | 75 | ``` 76 | 77 | ### Inspecting Objects and Namespaces 78 | The 79 | function `dir()` 80 | provides a list of 81 | objects in a namespace. 82 | 83 | ```{python} 84 | dir() 85 | 86 | ``` 87 | This shows you everything that `Python` can find at the top level. 88 | There are certain objects like `__builtins__` that contain references to built-in 89 | functions like `print()`. 90 | 91 | Every python object has its own notion of 92 | namespace, also accessible with `dir()`. This will include 93 | both the attributes of the object 94 | as well as any methods associated with it. For instance, we see `'sum'` in the listing for an 95 | array. 96 | 97 | ```{python} 98 | A = np.array([3,5,11]) 99 | dir(A) 100 | 101 | ``` 102 | This indicates that the object `A.sum` exists. In this case it is a method 103 | that can be used to compute the sum of the array `A` as can be seen by typing `A.sum?`. 104 | 105 | ```{python} 106 | A.sum() 107 | 108 | ``` 109 | 110 | 111 | 112 | ## Simple Linear Regression 113 | In this section we will construct model 114 | matrices (also called design matrices) using the `ModelSpec()` transform from `ISLP.models`. 115 | 116 | We will use the `Boston` housing data set, which is contained in the `ISLP` package. 
The `Boston` dataset records `medv` (median house value) for $506$ neighborhoods 117 | around Boston. We will build a regression model to predict `medv` using $13$ 118 | predictors such as `rm` (average number of rooms per house), 119 | `age` (proportion of owner-occupied units built prior to 1940), and `lstat` (percent of 120 | households with low socioeconomic status). We will use `statsmodels` for this 121 | task, a `Python` package that implements several commonly used 122 | regression methods. 123 | 124 | We have included a simple loading function `load_data()` in the 125 | `ISLP` package: 126 | 127 | ```{python} 128 | Boston = load_data("Boston") 129 | Boston.columns 130 | 131 | ``` 132 | 133 | Type `Boston?` to find out more about these data. 134 | 135 | We start by using the `sm.OLS()` function to fit a 136 | simple linear regression model. Our response will be 137 | `medv` and `lstat` will be the single predictor. 138 | For this model, we can create the model matrix by hand. 139 | 140 | 141 | ```{python} 142 | X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]), 143 | 'lstat': Boston['lstat']}) 144 | X[:4] 145 | 146 | ``` 147 | 148 | We extract the response, and fit the model. 149 | 150 | ```{python} 151 | y = Boston['medv'] 152 | model = sm.OLS(y, X) 153 | results = model.fit() 154 | 155 | ``` 156 | Note that `sm.OLS()` does 157 | not fit the model; it specifies the model, and then `model.fit()` does the actual fitting. 158 | 159 | Our `ISLP` function `summarize()` produces a simple table of the parameter estimates, 160 | their standard errors, t-statistics and p-values. 161 | The function takes a single argument, such as the object `results` 162 | returned here by the `fit` 163 | method, and returns such a summary. 164 | 165 | ```{python} 166 | summarize(results) 167 | 168 | ``` 169 | 170 | 171 | Before we describe other methods for working with fitted models, we outline a more useful and general framework for constructing a model matrix~`X`. 172 | ### Using Transformations: Fit and Transform 173 | Our model above has a single predictor, and constructing `X` was straightforward. 174 | In practice we often fit models with more than one predictor, typically selected from an array or data frame. 175 | We may wish to introduce transformations to the variables before fitting the model, specify interactions between variables, and expand some particular variables into sets of variables (e.g. polynomials). 176 | The `sklearn` package has a particular notion 177 | for this type of task: a *transform*. A transform is an object 178 | that is created with some parameters as arguments. The 179 | object has two main methods: `fit()` and `transform()`. 180 | 181 | We provide a general approach for specifying models and constructing 182 | the model matrix through the transform `ModelSpec()` in the `ISLP` library. 183 | `ModelSpec()` 184 | (renamed `MS()` in the preamble) creates a 185 | transform object, and then a pair of methods 186 | `transform()` and `fit()` are used to construct a 187 | corresponding model matrix. 188 | 189 | We first describe this process for our simple regression model using a single predictor `lstat` in 190 | the `Boston` data frame, but will use it repeatedly in more 191 | complex tasks in this and other labs in this book. 192 | In our case the transform is created by the expression 193 | `design = MS(['lstat'])`. 194 | 195 | The `fit()` method takes the original array and may do some 196 | initial computations on it, as specified in the transform object. 
197 | For example, it may compute means and standard deviations for centering and scaling. 198 | The `transform()` 199 | method applies the fitted transformation to the array of data, and produces the model matrix. 200 | 201 | 202 | ```{python} 203 | design = MS(['lstat']) 204 | design = design.fit(Boston) 205 | X = design.transform(Boston) 206 | X[:4] 207 | ``` 208 | In this simple case, the `fit()` method does very little; it simply checks that the variable `'lstat'` specified in `design` exists in `Boston`. Then `transform()` constructs the model matrix with two columns: an `intercept` and the variable `lstat`. 209 | 210 | These two operations can be combined with the 211 | `fit_transform()` method. 212 | 213 | ```{python} 214 | design = MS(['lstat']) 215 | X = design.fit_transform(Boston) 216 | X[:4] 217 | ``` 218 | Note that, as in the previous code chunk when the two steps were done separately, the `design` object is changed as a result of the `fit()` operation. The power of this pipeline will become clearer when we fit more complex models that involve interactions and transformations. 219 | 220 | 221 | Let's return to our fitted regression model. 222 | The object 223 | `results` has several methods that can be used for inference. 224 | We already presented a function `summarize()` for showing the essentials of the fit. 225 | For a full and somewhat exhaustive summary of the fit, we can use the `summary()` 226 | method. 227 | 228 | ```{python} 229 | results.summary() 230 | 231 | ``` 232 | 233 | The fitted coefficients can also be retrieved as the 234 | `params` attribute of `results`. 235 | 236 | ```{python} 237 | results.params 238 | 239 | ``` 240 | 241 | 242 | The `get_prediction()` method can be used to obtain predictions, and produce confidence intervals and 243 | prediction intervals for the prediction of `medv` for given values of `lstat`. 244 | 245 | We first create a new data frame, in this case containing only the variable `lstat`, with the values for this variable at which we wish to make predictions. 246 | We then use the `transform()` method of `design` to create the corresponding model matrix. 247 | 248 | ```{python} 249 | new_df = pd.DataFrame({'lstat':[5, 10, 15]}) 250 | newX = design.transform(new_df) 251 | newX 252 | 253 | ``` 254 | 255 | Next we compute the predictions at `newX`, and view them by extracting the `predicted_mean` attribute. 256 | 257 | ```{python} 258 | new_predictions = results.get_prediction(newX); 259 | new_predictions.predicted_mean 260 | 261 | ``` 262 | We can produce confidence intervals for the predicted values. 263 | 264 | ```{python} 265 | new_predictions.conf_int(alpha=0.05) 266 | 267 | ``` 268 | Prediction intervals are computed by setting `obs=True`: 269 | 270 | ```{python} 271 | new_predictions.conf_int(obs=True, alpha=0.05) 272 | 273 | ``` 274 | For instance, the 95% confidence interval associated with an 275 | `lstat` value of 10 is (24.47, 25.63), and the 95% prediction 276 | interval is (12.82, 37.28). As expected, the confidence and 277 | prediction intervals are centered around the same point (a predicted 278 | value of 25.05 for `medv` when `lstat` equals 279 | 10), but the latter are substantially wider. 280 | 281 | Next we will plot `medv` and `lstat` 282 | using `DataFrame.plot.scatter()`, \definelongblankMR{plot.scatter()}{plot.slashslashscatter()} 283 | and wish to 284 | add the regression line to the resulting plot. 
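Before doing so, here is an optional aside: the interval machinery above can also be used to draw confidence and prediction bands around the fitted line. The sketch below reuses `design` and `results` from above; the grid of `lstat` values and the plotting choices are our own illustrative assumptions rather than part of the original lab.

```{python}
# Sketch (aside): evaluate the fit on a grid of lstat values and shade
# 95% confidence and prediction bands. Grid size and alpha levels are
# illustrative assumptions.
lstat_grid = pd.DataFrame({'lstat': np.linspace(Boston['lstat'].min(),
                                                Boston['lstat'].max(),
                                                100)})
grid_pred = results.get_prediction(design.transform(lstat_grid))
conf_band = grid_pred.conf_int(alpha=0.05)            # confidence band
pred_band = grid_pred.conf_int(obs=True, alpha=0.05)  # prediction band
ax = Boston.plot.scatter('lstat', 'medv', alpha=0.3)
ax.plot(lstat_grid['lstat'], grid_pred.predicted_mean, 'r')
ax.fill_between(lstat_grid['lstat'], conf_band[:,0], conf_band[:,1], alpha=0.4)
ax.fill_between(lstat_grid['lstat'], pred_band[:,0], pred_band[:,1], alpha=0.2);
```
As in the numbers reported above, the prediction band is substantially wider than the confidence band.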
285 | 286 | 287 | ### Defining Functions 288 | While there is a function 289 | within the `ISLP` package that adds a line to an existing plot, we take this opportunity 290 | to define our first function to do so. 291 | 292 | ```{python} 293 | def abline(ax, b, m): 294 | "Add a line with slope m and intercept b to ax" 295 | xlim = ax.get_xlim() 296 | ylim = [m * xlim[0] + b, m * xlim[1] + b] 297 | ax.plot(xlim, ylim) 298 | 299 | ``` 300 | A few things are illustrated above. First we see the syntax for defining a function: 301 | `def funcname(...)`. The function has arguments `ax, b, m` 302 | where `ax` is an axis object for an existing plot, `b` is the intercept and 303 | `m` is the slope of the desired line. Other plotting options can be passed on to 304 | `ax.plot` by including additional optional arguments as follows: 305 | 306 | ```{python} 307 | def abline(ax, b, m, *args, **kwargs): 308 | "Add a line with slope m and intercept b to ax" 309 | xlim = ax.get_xlim() 310 | ylim = [m * xlim[0] + b, m * xlim[1] + b] 311 | ax.plot(xlim, ylim, *args, **kwargs) 312 | 313 | ``` 314 | The addition of `*args` allows any number of 315 | non-named arguments to `abline`, while `**kwargs` allows any 316 | number of named arguments (such as `linewidth=3`) to `abline`. 317 | In our function, we pass 318 | these arguments verbatim to `ax.plot` above. Readers 319 | interested in learning more about 320 | functions are referred to the section on 321 | defining functions in [docs.python.org/tutorial](https://docs.python.org/3/tutorial/controlflow.html#defining-functions). 322 | 323 | Let’s use our new function to add this regression line to a plot of 324 | `medv` vs. `lstat`. 325 | 326 | ```{python} 327 | ax = Boston.plot.scatter('lstat', 'medv') 328 | abline(ax, 329 | results.params[0], 330 | results.params[1], 331 | 'r--', 332 | linewidth=3) 333 | 334 | ``` 335 | Thus, the final call to `ax.plot()` is `ax.plot(xlim, ylim, 'r--', linewidth=3)`. 336 | We have used the argument `'r--'` to produce a red dashed line, and added 337 | an argument to make it of width 3. 338 | There is some evidence for non-linearity in the relationship between `lstat` and `medv`. We will explore this issue later in this lab. 339 | 340 | As mentioned above, there is an existing function to add a line to a plot --- `ax.axline()` --- but knowing how to write such functions empowers us to create more expressive displays. 341 | 342 | 343 | 344 | 345 | Next we examine some diagnostic plots, several of which were discussed 346 | in Section 3.3.3. 347 | We can find the fitted values and residuals 348 | of the fit as attributes of the `results` object. 349 | Various influence measures describing the regression model 350 | are computed with the `get_influence()` method. 351 | As we will not use the `fig` component returned 352 | as the first value from `subplots()`, we simply 353 | capture the second returned value in `ax` below. 354 | 355 | ```{python} 356 | ax = subplots(figsize=(8,8))[1] 357 | ax.scatter(results.fittedvalues, results.resid) 358 | ax.set_xlabel('Fitted value') 359 | ax.set_ylabel('Residual') 360 | ax.axhline(0, c='k', ls='--'); 361 | 362 | ``` 363 | We add a horizontal line at 0 for reference using the 364 | `ax.axhline()` method, indicating 365 | it should be black (`c='k'`) and have a dashed linestyle (`ls='--'`). 366 | 367 | On the basis of the residual plot, there is some evidence of non-linearity. 
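A closely related diagnostic plots *studentized* residuals against the fitted values, which puts the residuals on a common scale. This is an aside; it assumes the influence object returned by `get_influence()` exposes a `resid_studentized_internal` attribute.

```{python}
# Sketch (aside): studentized residuals vs. fitted values; observations
# far outside roughly +/-3 would merit a closer look.
stud_res = results.get_influence().resid_studentized_internal
ax = subplots(figsize=(8,8))[1]
ax.scatter(results.fittedvalues, stud_res)
ax.set_xlabel('Fitted value')
ax.set_ylabel('Studentized residual')
ax.axhline(0, c='k', ls='--');
```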
368 | Leverage statistics can be computed for any number of predictors using the 369 | `hat_matrix_diag` attribute of the value returned by the 370 | `get_influence()` method. 371 | 372 | ```{python} 373 | infl = results.get_influence() 374 | ax = subplots(figsize=(8,8))[1] 375 | ax.scatter(np.arange(X.shape[0]), infl.hat_matrix_diag) 376 | ax.set_xlabel('Index') 377 | ax.set_ylabel('Leverage') 378 | np.argmax(infl.hat_matrix_diag) 379 | 380 | ``` 381 | The `np.argmax()` function identifies the index of the largest element of an array, optionally computed over an axis of the array. 382 | In this case, we maximized over the entire array 383 | to determine which observation has the largest leverage statistic. 384 | 385 | 386 | ## Multiple Linear Regression 387 | In order to fit a multiple linear regression model using least squares, we again use 388 | the `ModelSpec()` transform to construct the required 389 | model matrix and response. The arguments 390 | to `ModelSpec()` can be quite general, but in this case 391 | a list of column names suffice. We consider a fit here with 392 | the two variables `lstat` and `age`. 393 | 394 | ```{python} 395 | X = MS(['lstat', 'age']).fit_transform(Boston) 396 | model1 = sm.OLS(y, X) 397 | results1 = model1.fit() 398 | summarize(results1) 399 | ``` 400 | Notice how we have compacted the first line into a succinct expression describing the construction of `X`. 401 | 402 | The `Boston` data set contains 12 variables, and so it would be cumbersome 403 | to have to type all of these in order to perform a regression using all of the predictors. 404 | Instead, we can use the following short-hand:\definelongblankMR{columns.drop()}{columns.slashslashdrop()} 405 | 406 | ```{python} 407 | terms = Boston.columns.drop('medv') 408 | terms 409 | 410 | ``` 411 | 412 | We can now fit the model with all the variables in `terms` using 413 | the same model matrix builder. 414 | 415 | ```{python} 416 | X = MS(terms).fit_transform(Boston) 417 | model = sm.OLS(y, X) 418 | results = model.fit() 419 | summarize(results) 420 | 421 | ``` 422 | 423 | What if we would like to perform a regression using all of the variables but one? For 424 | example, in the above regression output, `age` has a high $p$-value. 425 | So we may wish to run a regression excluding this predictor. 426 | The following syntax results in a regression using all predictors except `age`. 427 | 428 | ```{python} 429 | minus_age = Boston.columns.drop(['medv', 'age']) 430 | Xma = MS(minus_age).fit_transform(Boston) 431 | model1 = sm.OLS(y, Xma) 432 | summarize(model1.fit()) 433 | 434 | ``` 435 | 436 | ## Multivariate Goodness of Fit 437 | We can access the individual components of `results` by name 438 | (`dir(results)` shows us what is available). Hence 439 | `results.rsquared` gives us the $R^2$, 440 | and 441 | `np.sqrt(results.scale)` gives us the RSE. 442 | 443 | Variance inflation factors (section 3.3.3) are sometimes useful 444 | to assess the effect of collinearity in the model matrix of a regression model. 445 | We will compute the VIFs in our multiple regression fit, and use the opportunity to introduce the idea of *list comprehension*. 446 | 447 | ### List Comprehension 448 | Often we encounter a sequence of objects which we would like to transform 449 | for some other task. Below, we compute the VIF for each 450 | feature in our `X` matrix and produce a data frame 451 | whose index agrees with the columns of `X`. 452 | The notion of list comprehension can often make such 453 | a task easier. 
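As a minimal illustration of the syntax (a toy aside before the VIF computation below), a list comprehension builds a list by evaluating an expression for each element of a sequence:

```{python}
# Toy aside: the squares of the integers 0 through 4.
[x**2 for x in range(5)]
```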
454 | 455 | List comprehensions are simple and powerful ways to form 456 | lists of `Python` objects. The language also supports 457 | dictionary and *generator* comprehension, though these are 458 | beyond our scope here. Let's look at an example. We compute the VIF for each of the variables 459 | in the model matrix `X`, using the function `variance_inflation_factor()`. 460 | 461 | 462 | ```{python} 463 | vals = [VIF(X, i) 464 | for i in range(1, X.shape[1])] 465 | vif = pd.DataFrame({'vif':vals}, 466 | index=X.columns[1:]) 467 | vif 468 | 469 | ``` 470 | The function `VIF()` takes two arguments: a dataframe or array, 471 | and a variable column index. In the code above we call `VIF()` on the fly for all columns in `X`. 472 | We have excluded column 0 above (the intercept), which is not of interest. In this case the VIFs are not that exciting. 473 | 474 | The object `vals` above could have been constructed with the following for loop: 475 | 476 | ```{python} 477 | vals = [] 478 | for i in range(1, X.values.shape[1]): 479 | vals.append(VIF(X.values, i)) 480 | 481 | ``` 482 | List comprehension allows us to perform such repetitive operations in a more straightforward way. 483 | ## Interaction Terms 484 | It is easy to include interaction terms in a linear model using `ModelSpec()`. 485 | Including a tuple `("lstat","age")` tells the model 486 | matrix builder to include an interaction term between 487 | `lstat` and `age`. 488 | 489 | ```{python} 490 | X = MS(['lstat', 491 | 'age', 492 | ('lstat', 'age')]).fit_transform(Boston) 493 | model2 = sm.OLS(y, X) 494 | summarize(model2.fit()) 495 | 496 | ``` 497 | 498 | 499 | ## Non-linear Transformations of the Predictors 500 | The model matrix builder can include terms beyond 501 | just column names and interactions. For instance, 502 | the `poly()` function supplied in `ISLP` specifies that 503 | columns representing polynomial functions 504 | of its first argument are added to the model matrix. 505 | 506 | ```{python} 507 | X = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston) 508 | model3 = sm.OLS(y, X) 509 | results3 = model3.fit() 510 | summarize(results3) 511 | 512 | ``` 513 | The effectively zero *p*-value associated with the quadratic term 514 | (i.e. the third row above) suggests that it leads to an improved model. 515 | 516 | By default, `poly()` creates a basis matrix for inclusion in the 517 | model matrix whose 518 | columns are *orthogonal polynomials*, which are designed for stable 519 | least squares computations. {Actually, `poly()` is a wrapper for the workhorse and standalone function `Poly()` that does the work in building the model matrix.} 520 | Alternatively, had we included an argument 521 | `raw=True` in the above call to `poly()`, the basis matrix would consist simply of 522 | `lstat` and `lstat**2`. Since either of these bases 523 | represent quadratic polynomials, the fitted values would not 524 | change in this case, just the polynomial coefficients. Also by default, the columns 525 | created by `poly()` do not include an intercept column as 526 | that is automatically added by `MS()`. 527 | 528 | We use the `anova_lm()` function to further quantify the extent to which the quadratic fit is 529 | superior to the linear fit. 530 | 531 | ```{python} 532 | anova_lm(results1, results3) 533 | 534 | ``` 535 | Here `results1` represents the linear submodel containing 536 | predictors `lstat` and `age`, 537 | while `results3` corresponds to the larger model above with a quadratic 538 | term in `lstat`. 
539 | The `anova_lm()` function performs a hypothesis test 540 | comparing the two models. The null hypothesis is that the quadratic 541 | term in the bigger model is not needed, and the alternative hypothesis is that the 542 | bigger model is superior. Here the *F*-statistic is 177.28 and 543 | the associated *p*-value is zero. 544 | In this case the *F*-statistic is the square of the 545 | *t*-statistic for the quadratic term in the linear model summary 546 | for `results3` --- a consequence of the fact that these nested 547 | models differ by one degree of freedom. 548 | This provides very clear evidence that the quadratic polynomial in 549 | `lstat` improves the linear model. 550 | This is not surprising, since earlier we saw evidence for non-linearity in the relationship between `medv` 551 | and `lstat`. 552 | 553 | The function `anova_lm()` can take more than two nested models 554 | as input, in which case it compares every successive pair of models. 555 | That also explains why there are `NaN`s in the first row above, since 556 | there is no previous model with which to compare the first. 557 | 558 | 559 | ```{python} 560 | ax = subplots(figsize=(8,8))[1] 561 | ax.scatter(results3.fittedvalues, results3.resid) 562 | ax.set_xlabel('Fitted value') 563 | ax.set_ylabel('Residual') 564 | ax.axhline(0, c='k', ls='--'); 565 | 566 | ``` 567 | We see that when the quadratic term is included in the model, 568 | there is little discernible pattern in the residuals. 569 | In order to create a cubic or higher-degree polynomial fit, we can simply change the degree argument 570 | to `poly()`. 571 | 572 | 573 | 574 | ## Qualitative Predictors 575 | Here we use the `Carseats` data, which is included in the 576 | `ISLP` package. We will attempt to predict `Sales` 577 | (child car seat sales) in 400 locations based on a number of 578 | predictors. 579 | 580 | ```{python} 581 | Carseats = load_data('Carseats') 582 | Carseats.columns 583 | 584 | ``` 585 | The `Carseats` 586 | data includes qualitative predictors such as 587 | `ShelveLoc`, an indicator of the quality of the shelving 588 | location --- that is, 589 | the space within a store in which the car seat is displayed. The predictor 590 | `ShelveLoc` takes on three possible values, `Bad`, `Medium`, and `Good`. 591 | Given a qualitative variable such as `ShelveLoc`, `ModelSpec()` generates dummy 592 | variables automatically. 593 | These variables are often referred to as a *one-hot encoding* of the categorical 594 | feature. Their columns sum to one, so to avoid collinearity with an intercept, the first column is dropped. Below we see 595 | the column `ShelveLoc[Bad]` has been dropped, since `Bad` is the first level of `ShelveLoc`. 596 | Below we fit a multiple regression model that includes some interaction terms. 597 | 598 | ```{python} 599 | allvars = list(Carseats.columns.drop('Sales')) 600 | y = Carseats['Sales'] 601 | final = allvars + [('Income', 'Advertising'), 602 | ('Price', 'Age')] 603 | X = MS(final).fit_transform(Carseats) 604 | model = sm.OLS(y, X) 605 | summarize(model.fit()) 606 | 607 | ``` 608 | In the first line above, we made `allvars` a list, so that we 609 | could add the interaction terms two lines down. 610 | Our model-matrix builder has created a `ShelveLoc[Good]` 611 | dummy variable that takes on a value of 1 if the 612 | shelving location is good, and 0 otherwise. It has also created a `ShelveLoc[Medium]` 613 | dummy variable that equals 1 if the shelving location is medium, and 0 otherwise. 
614 | A bad shelving location corresponds to a zero for each of the two dummy variables. 615 | The fact that the coefficient for `ShelveLoc[Good]` in the regression output is 616 | positive indicates that a good shelving location is associated with high sales (relative to a bad location). 617 | And `ShelveLoc[Medium]` has a smaller positive coefficient, 618 | indicating that a medium shelving location leads to higher sales than a bad 619 | shelving location, but lower sales than a good shelving location. 620 | 621 | 622 | -------------------------------------------------------------------------------- /Ch05-resample-lab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: -all 5 | formats: ipynb,Rmd 6 | main_language: python 7 | text_representation: 8 | extension: .Rmd 9 | format_name: rmarkdown 10 | format_version: '1.2' 11 | jupytext_version: 1.16.7 12 | --- 13 | 14 | # Cross-Validation and the Bootstrap 15 | 16 | 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch05-resample-lab.ipynb) 21 | 22 | 23 | In this lab, we explore the resampling techniques covered in this 24 | chapter. Some of the commands in this lab may take a while to run on 25 | your computer. 26 | 27 | We again begin by placing most of our imports at this top level. 28 | 29 | ```{python} 30 | import numpy as np 31 | import statsmodels.api as sm 32 | from ISLP import load_data 33 | from ISLP.models import (ModelSpec as MS, 34 | summarize, 35 | poly) 36 | from sklearn.model_selection import train_test_split 37 | 38 | ``` 39 | 40 | 41 | There are several new imports needed for this lab. 42 | 43 | ```{python} 44 | from functools import partial 45 | from sklearn.model_selection import \ 46 | (cross_validate, 47 | KFold, 48 | ShuffleSplit) 49 | from sklearn.base import clone 50 | from ISLP.models import sklearn_sm 51 | 52 | ``` 53 | 54 | 55 | ## The Validation Set Approach 56 | We explore the use of the validation set approach in order to estimate 57 | the test error rates that result from fitting various linear models on 58 | the `Auto` data set. 59 | 60 | We use the function `train_test_split()` to split 61 | the data into training and validation sets. As there are 392 observations, 62 | we split into two equal sets of size 196 using the 63 | argument `test_size=196`. It is generally a good idea to set a random seed 64 | when performing operations like this that contain an 65 | element of randomness, so that the results obtained can be reproduced 66 | precisely at a later time. We set the random seed of the splitter 67 | with the argument `random_state=0`. 68 | 69 | ```{python} 70 | Auto = load_data('Auto') 71 | Auto_train, Auto_valid = train_test_split(Auto, 72 | test_size=196, 73 | random_state=0) 74 | 75 | ``` 76 | 77 | Now we can fit a linear regression using only the observations corresponding to the training set `Auto_train`. 78 | 79 | ```{python} 80 | hp_mm = MS(['horsepower']) 81 | X_train = hp_mm.fit_transform(Auto_train) 82 | y_train = Auto_train['mpg'] 83 | model = sm.OLS(y_train, X_train) 84 | results = model.fit() 85 | 86 | ``` 87 | 88 | We now use the `predict()` method of `results` evaluated on the model matrix for this model 89 | created using the validation data set. We also calculate the validation MSE of our model. 
90 | 91 | ```{python} 92 | X_valid = hp_mm.transform(Auto_valid) 93 | y_valid = Auto_valid['mpg'] 94 | valid_pred = results.predict(X_valid) 95 | np.mean((y_valid - valid_pred)**2) 96 | 97 | ``` 98 | 99 | Hence our estimate for the validation MSE of the linear regression 100 | fit is $23.62$. 101 | 102 | We can also estimate the validation error for 103 | higher-degree polynomial regressions. We first provide a function `evalMSE()` that takes a model string as well 104 | as training and test sets and returns the MSE on the test set. 105 | 106 | ```{python} 107 | def evalMSE(terms, 108 | response, 109 | train, 110 | test): 111 | 112 | mm = MS(terms) 113 | X_train = mm.fit_transform(train) 114 | y_train = train[response] 115 | 116 | X_test = mm.transform(test) 117 | y_test = test[response] 118 | 119 | results = sm.OLS(y_train, X_train).fit() 120 | test_pred = results.predict(X_test) 121 | 122 | return np.mean((y_test - test_pred)**2) 123 | 124 | ``` 125 | 126 | Let’s use this function to estimate the validation MSE 127 | using linear, quadratic and cubic fits. We use the `enumerate()` function 128 | here, which gives both the values and indices of objects as one iterates 129 | over a for loop. 130 | 131 | ```{python} 132 | MSE = np.zeros(3) 133 | for idx, degree in enumerate(range(1, 4)): 134 | MSE[idx] = evalMSE([poly('horsepower', degree)], 135 | 'mpg', 136 | Auto_train, 137 | Auto_valid) 138 | MSE 139 | 140 | ``` 141 | 142 | These error rates are $23.62, 18.76$, and $18.80$, respectively. If we 143 | choose a different training/validation split instead, then we 144 | can expect somewhat different errors on the validation set. 145 | 146 | ```{python} 147 | Auto_train, Auto_valid = train_test_split(Auto, 148 | test_size=196, 149 | random_state=3) 150 | MSE = np.zeros(3) 151 | for idx, degree in enumerate(range(1, 4)): 152 | MSE[idx] = evalMSE([poly('horsepower', degree)], 153 | 'mpg', 154 | Auto_train, 155 | Auto_valid) 156 | MSE 157 | ``` 158 | 159 | Using this split of the observations into a training set and a validation set, 160 | we find that the validation set error rates for the models with linear, quadratic, and cubic terms are $20.76$, $16.95$, and $16.97$, respectively. 161 | 162 | These results are consistent with our previous findings: a model that 163 | predicts `mpg` using a quadratic function of `horsepower` 164 | performs better than a model that involves only a linear function of 165 | `horsepower`, and there is no evidence of an improvement in using a cubic function of `horsepower`. 166 | 167 | 168 | ## Cross-Validation 169 | In theory, the cross-validation estimate can be computed for any generalized 170 | linear model. {} 171 | In practice, however, the simplest way to cross-validate in 172 | Python is to use `sklearn`, which has a different interface or API 173 | than `statsmodels`, the code we have been using to fit GLMs. 174 | 175 | This is a problem which often confronts data scientists: "I have a function to do task $A$, and need to feed it into something that performs task $B$, so that I can compute $B(A(D))$, where $D$ is my data." When $A$ and $B$ don’t naturally speak to each other, this 176 | requires the use of a *wrapper*. 177 | In the `ISLP` package, 178 | we provide 179 | a wrapper, `sklearn_sm()`, that enables us to easily use the cross-validation tools of `sklearn` with 180 | models fit by `statsmodels`. 181 | 182 | The class `sklearn_sm()` 183 | has as its first argument 184 | a model from `statsmodels`. 
It can take two additional 185 | optional arguments: `model_str` which can be 186 | used to specify a formula, and `model_args` which should 187 | be a dictionary of additional arguments used when fitting 188 | the model. For example, to fit a logistic regression model 189 | we have to specify a `family` argument. This 190 | is passed as `model_args={'family':sm.families.Binomial()}`. 191 | 192 | Here is our wrapper in action: 193 | 194 | ```{python} 195 | hp_model = sklearn_sm(sm.OLS, 196 | MS(['horsepower'])) 197 | X, Y = Auto.drop(columns=['mpg']), Auto['mpg'] 198 | cv_results = cross_validate(hp_model, 199 | X, 200 | Y, 201 | cv=Auto.shape[0]) 202 | cv_err = np.mean(cv_results['test_score']) 203 | cv_err 204 | 205 | ``` 206 | The arguments to `cross_validate()` are as follows: an 207 | object with the appropriate `fit()`, `predict()`, 208 | and `score()` methods, an 209 | array of features `X` and a response `Y`. 210 | We also included an additional argument `cv` to `cross_validate()`; specifying an integer 211 | $k$ results in $k$-fold cross-validation. We have provided a value 212 | corresponding to the total number of observations, which results in 213 | leave-one-out cross-validation (LOOCV). The `cross_validate()` function produces a dictionary with several components; 214 | we simply want the cross-validated test score here (MSE), which is estimated to be 24.23. 215 | 216 | 217 | We can repeat this procedure for increasingly complex polynomial fits. 218 | To automate the process, we again 219 | use a for loop which iteratively fits polynomial 220 | regressions of degree 1 to 5, computes the 221 | associated cross-validation error, and stores it in the $i$th element 222 | of the vector `cv_error`. The variable `d` in the for loop 223 | corresponds to the degree of the polynomial. We begin by initializing the 224 | vector. This command may take a couple of seconds to run. 225 | 226 | ```{python} 227 | cv_error = np.zeros(5) 228 | H = np.array(Auto['horsepower']) 229 | M = sklearn_sm(sm.OLS) 230 | for i, d in enumerate(range(1,6)): 231 | X = np.power.outer(H, np.arange(d+1)) 232 | M_CV = cross_validate(M, 233 | X, 234 | Y, 235 | cv=Auto.shape[0]) 236 | cv_error[i] = np.mean(M_CV['test_score']) 237 | cv_error 238 | 239 | ``` 240 | As in Figure 5.4, we see a sharp drop in the estimated test MSE between the linear and 241 | quadratic fits, but then no clear improvement from using higher-degree polynomials. 242 | 243 | Above we introduced the `outer()` method of the `np.power()` 244 | function. The `outer()` method is applied to an operation 245 | that has two arguments, such as `add()`, `min()`, or 246 | `power()`. 247 | It has two arrays as 248 | arguments, and then forms a larger 249 | array where the operation is applied to each pair of elements of the 250 | two arrays. 251 | 252 | ```{python} 253 | A = np.array([3, 5, 9]) 254 | B = np.array([2, 4]) 255 | np.add.outer(A, B) 256 | 257 | ``` 258 | 259 | In the CV example above, we used $k=n$, but of course we can also use $k 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch08-baggboost-lab.ipynb) 21 | 22 | 23 | We import some of our usual libraries at this top 24 | level. 
25 | 26 | ```{python} 27 | import numpy as np 28 | import pandas as pd 29 | from matplotlib.pyplot import subplots 30 | import sklearn.model_selection as skm 31 | from ISLP import load_data, confusion_table 32 | from ISLP.models import ModelSpec as MS 33 | 34 | ``` 35 | We also collect the new imports 36 | needed for this lab. 37 | 38 | ```{python} 39 | from sklearn.tree import (DecisionTreeClassifier as DTC, 40 | DecisionTreeRegressor as DTR, 41 | plot_tree, 42 | export_text) 43 | from sklearn.metrics import (accuracy_score, 44 | log_loss) 45 | from sklearn.ensemble import \ 46 | (RandomForestRegressor as RF, 47 | GradientBoostingRegressor as GBR) 48 | from ISLP.bart import BART 49 | 50 | ``` 51 | 52 | 53 | ## Fitting Classification Trees 54 | 55 | 56 | We first use classification trees to analyze the `Carseats` data set. 57 | In these data, `Sales` is a continuous variable, and so we begin 58 | by recoding it as a binary variable. We use the `where()` 59 | function to create a variable, called `High`, which takes on a 60 | value of `Yes` if the `Sales` variable exceeds 8, and takes 61 | on a value of `No` otherwise. 62 | 63 | ```{python} 64 | Carseats = load_data('Carseats') 65 | High = np.where(Carseats.Sales > 8, 66 | "Yes", 67 | "No") 68 | 69 | ``` 70 | 71 | We now use `DecisionTreeClassifier()` to fit a classification tree in 72 | order to predict `High` using all variables but `Sales`. 73 | To do so, we must form a model matrix as we did when fitting regression 74 | models. 75 | 76 | ```{python} 77 | model = MS(Carseats.columns.drop('Sales'), intercept=False) 78 | D = model.fit_transform(Carseats) 79 | feature_names = list(D.columns) 80 | X = np.asarray(D) 81 | 82 | ``` 83 | We have converted `D` from a data frame to an array `X`, which is needed in some of the analysis below. We also need the `feature_names` for annotating our plots later. 84 | 85 | There are several options needed to specify the classifier, 86 | such as `max_depth` (how deep to grow the tree), `min_samples_split` 87 | (minimum number of observations in a node to be eligible for splitting) 88 | and `criterion` (whether to use Gini or cross-entropy as the split criterion). 89 | We also set `random_state` for reproducibility; ties in the split criterion are broken at random. 90 | 91 | ```{python} 92 | clf = DTC(criterion='entropy', 93 | max_depth=3, 94 | random_state=0) 95 | clf.fit(X, High) 96 | 97 | ``` 98 | 99 | 100 | In our discussion of qualitative features in Section 3.3, 101 | we noted that for a linear regression model such a feature could be 102 | represented by including a matrix of dummy variables (one-hot-encoding) in the model 103 | matrix, using the formula notation of `statsmodels`. 104 | As mentioned in Section 8.1, there is a more 105 | natural way to handle qualitative features when building a decision 106 | tree, that does not require such dummy variables; each split amounts to partitioning the levels into two groups. 107 | However, 108 | the `sklearn` implementation of decision trees does not take 109 | advantage of this approach; instead it simply treats the one-hot-encoded levels as separate variables. 110 | 111 | ```{python} 112 | accuracy_score(High, clf.predict(X)) 113 | 114 | ``` 115 | 116 | 117 | With only the default arguments, the training error rate is 118 | 21%. 
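Equivalently, the training error rate can be computed directly as the fraction of mismatched labels; this one-liner is a small aside that reuses `clf`, `X` and `High` from above.

```{python}
# Aside: error rate = one minus accuracy = fraction of misclassified
# training observations.
np.mean(clf.predict(X) != High)
```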
119 | For classification trees, we can 120 | access the value of the deviance using `log_loss()`, 121 | \begin{equation*} 122 | \begin{split} 123 | -2 \sum_m \sum_k n_{mk} \log \hat{p}_{mk}, 124 | \end{split} 125 | \end{equation*} 126 | where $n_{mk}$ is the number of observations in the $m$th terminal 127 | node that belong to the $k$th class. 128 | 129 | ```{python} 130 | resid_dev = np.sum(log_loss(High, clf.predict_proba(X))) 131 | resid_dev 132 | 133 | ``` 134 | 135 | This is closely related to the *entropy*, defined in (8.7). 136 | A small deviance indicates a 137 | tree that provides a good fit to the (training) data. 138 | 139 | One of the most attractive properties of trees is that they can 140 | be graphically displayed. Here we use the `plot()` function 141 | to display the tree structure. 142 | 143 | ```{python} 144 | ax = subplots(figsize=(12,12))[1] 145 | plot_tree(clf, 146 | feature_names=feature_names, 147 | ax=ax); 148 | 149 | ``` 150 | The most important indicator of `Sales` appears to be `ShelveLoc`. 151 | 152 | We can see a text representation of the tree using 153 | `export_text()`, which displays the split 154 | criterion (e.g. `Price <= 92.5`) for each branch. 155 | For leaf nodes it shows the overall prediction 156 | (`Yes` or `No`). 157 | We can also see the number of observations in that 158 | leaf that take on values of `Yes` and `No` by specifying `show_weights=True`. 159 | 160 | ```{python} 161 | print(export_text(clf, 162 | feature_names=feature_names, 163 | show_weights=True)) 164 | 165 | ``` 166 | 167 | In order to properly evaluate the performance of a classification tree 168 | on these data, we must estimate the test error rather than simply 169 | computing the training error. We split the observations into a 170 | training set and a test set, build the tree using the training set, 171 | and evaluate its performance on the test data. This pattern is 172 | similar to that in Chapter 6, with the linear models 173 | replaced here by decision trees --- the code for validation 174 | is almost identical. This approach leads to correct predictions 175 | for 68.5% of the locations in the test data set. 176 | 177 | ```{python} 178 | validation = skm.ShuffleSplit(n_splits=1, 179 | test_size=200, 180 | random_state=0) 181 | results = skm.cross_validate(clf, 182 | D, 183 | High, 184 | cv=validation) 185 | results['test_score'] 186 | 187 | ``` 188 | 189 | 190 | 191 | Next, we consider whether pruning the tree might lead to improved 192 | classification performance. We first split the data into a training and 193 | test set. We will use cross-validation to prune the tree on the training 194 | set, and then evaluate the performance of the pruned tree on the test 195 | set. 196 | 197 | ```{python} 198 | (X_train, 199 | X_test, 200 | High_train, 201 | High_test) = skm.train_test_split(X, 202 | High, 203 | test_size=0.5, 204 | random_state=0) 205 | 206 | ``` 207 | We first refit the full tree on the training set; here we do not set a `max_depth` parameter, since we will learn that through cross-validation. 208 | 209 | 210 | ```{python} 211 | clf = DTC(criterion='entropy', random_state=0) 212 | clf.fit(X_train, High_train) 213 | accuracy_score(High_test, clf.predict(X_test)) 214 | 215 | ``` 216 | Next we use the `cost_complexity_pruning_path()` method of 217 | `clf` to extract cost-complexity values. 
218 | 219 | ```{python} 220 | ccp_path = clf.cost_complexity_pruning_path(X_train, High_train) 221 | kfold = skm.KFold(10, 222 | random_state=1, 223 | shuffle=True) 224 | 225 | ``` 226 | This yields a set of impurities and $\alpha$ values 227 | from which we can extract an optimal one by cross-validation. 228 | 229 | ```{python} 230 | grid = skm.GridSearchCV(clf, 231 | {'ccp_alpha': ccp_path.ccp_alphas}, 232 | refit=True, 233 | cv=kfold, 234 | scoring='accuracy') 235 | grid.fit(X_train, High_train) 236 | grid.best_score_ 237 | 238 | ``` 239 | Let’s take a look at the pruned tree. 240 | 241 | ```{python} 242 | ax = subplots(figsize=(12, 12))[1] 243 | best_ = grid.best_estimator_ 244 | plot_tree(best_, 245 | feature_names=feature_names, 246 | ax=ax); 247 | 248 | ``` 249 | This is quite a bushy tree. We could count the leaves, or query 250 | `best_` instead. 251 | 252 | ```{python} 253 | best_.tree_.n_leaves 254 | 255 | ``` 256 | The tree with 30 terminal 257 | nodes results in the lowest cross-validation error rate, with an accuracy of 258 | 68.5%. How well does this pruned tree perform on the test data set? Once 259 | again, we apply the `predict()` function. 260 | 261 | ```{python} 262 | print(accuracy_score(High_test, 263 | best_.predict(X_test))) 264 | confusion = confusion_table(best_.predict(X_test), 265 | High_test) 266 | confusion 267 | 268 | ``` 269 | 270 | 271 | Now 72.0% of the test observations are correctly classified, which is slightly worse than the error for the full tree (with 35 leaves). So cross-validation has not helped us much here; it only pruned off 5 leaves, at a cost of a slightly worse error. These results would change if we were to change the random number seeds above; even though cross-validation gives an unbiased approach to model selection, it does have variance. 272 | 273 | 274 | 275 | 276 | ## Fitting Regression Trees 277 | Here we fit a regression tree to the `Boston` data set. The 278 | steps are similar to those for classification trees. 279 | 280 | ```{python} 281 | Boston = load_data("Boston") 282 | model = MS(Boston.columns.drop('medv'), intercept=False) 283 | D = model.fit_transform(Boston) 284 | feature_names = list(D.columns) 285 | X = np.asarray(D) 286 | 287 | ``` 288 | 289 | First, we split the data into training and test sets, and fit the tree 290 | to the training data. Here we use 30% of the data for the test set. 291 | 292 | 293 | ```{python} 294 | (X_train, 295 | X_test, 296 | y_train, 297 | y_test) = skm.train_test_split(X, 298 | Boston['medv'], 299 | test_size=0.3, 300 | random_state=0) 301 | 302 | ``` 303 | 304 | Having formed our training and test data sets, we fit the regression tree. 305 | 306 | ```{python} 307 | reg = DTR(max_depth=3) 308 | reg.fit(X_train, y_train) 309 | ax = subplots(figsize=(12,12))[1] 310 | plot_tree(reg, 311 | feature_names=feature_names, 312 | ax=ax); 313 | 314 | ``` 315 | 316 | The variable `lstat` measures the percentage of individuals with 317 | lower socioeconomic status. The tree indicates that lower 318 | values of `lstat` correspond to more expensive houses. 319 | The tree predicts a median house price of $12,042 for small-sized homes (`rm < 6.8`), in 320 | suburbs in which residents have low socioeconomic status (`lstat > 14.4`) and the crime-rate is moderate (`crim > 5.8`). 321 | 322 | 323 | Now we use the cross-validation function to see whether pruning 324 | the tree will improve performance. 
325 | 326 | ```{python} 327 | ccp_path = reg.cost_complexity_pruning_path(X_train, y_train) 328 | kfold = skm.KFold(5, 329 | shuffle=True, 330 | random_state=10) 331 | grid = skm.GridSearchCV(reg, 332 | {'ccp_alpha': ccp_path.ccp_alphas}, 333 | refit=True, 334 | cv=kfold, 335 | scoring='neg_mean_squared_error') 336 | G = grid.fit(X_train, y_train) 337 | 338 | ``` 339 | 340 | In keeping with the cross-validation results, we use the pruned tree 341 | to make predictions on the test set. 342 | 343 | ```{python} 344 | best_ = grid.best_estimator_ 345 | np.mean((y_test - best_.predict(X_test))**2) 346 | 347 | ``` 348 | 349 | 350 | In other words, the test set MSE associated with the regression tree 351 | is 28.07. The square root of 352 | the MSE is therefore around 353 | 5.30, 354 | indicating that this model leads to test predictions that are within around 355 | $5300 356 | of the true median home value for the suburb. 357 | 358 | Let’s plot the best tree to see how interpretable it is. 359 | 360 | ```{python} 361 | ax = subplots(figsize=(12,12))[1] 362 | plot_tree(G.best_estimator_, 363 | feature_names=feature_names, 364 | ax=ax); 365 | 366 | ``` 367 | 368 | 369 | 370 | 371 | ## Bagging and Random Forests 372 | 373 | 374 | Here we apply bagging and random forests to the `Boston` data, using 375 | the `RandomForestRegressor()` from the `sklearn.ensemble` package. Recall 376 | that bagging is simply a special case of a random forest with 377 | $m=p$. Therefore, the `RandomForestRegressor()` function can be used to 378 | perform both bagging and random forests. We start with bagging. 379 | 380 | ```{python} 381 | bag_boston = RF(max_features=X_train.shape[1], random_state=0) 382 | bag_boston.fit(X_train, y_train) 383 | 384 | ``` 385 | 386 | 387 | The argument `max_features` indicates that all 12 predictors should 388 | be considered for each split of the tree --- in other words, that 389 | bagging should be done. How well does this bagged model perform on 390 | the test set? 391 | 392 | ```{python} 393 | ax = subplots(figsize=(8,8))[1] 394 | y_hat_bag = bag_boston.predict(X_test) 395 | ax.scatter(y_hat_bag, y_test) 396 | np.mean((y_test - y_hat_bag)**2) 397 | 398 | ``` 399 | 400 | The test set MSE associated with the bagged regression tree is 401 | 14.63, about half that obtained using an optimally-pruned single 402 | tree. We could change the number of trees grown from the default of 403 | 100 by 404 | using the `n_estimators` argument: 405 | 406 | ```{python} 407 | bag_boston = RF(max_features=X_train.shape[1], 408 | n_estimators=500, 409 | random_state=0).fit(X_train, y_train) 410 | y_hat_bag = bag_boston.predict(X_test) 411 | np.mean((y_test - y_hat_bag)**2) 412 | ``` 413 | There is not much change. Bagging and random forests cannot overfit by 414 | increasing the number of trees, but can underfit if the number is too small. 415 | 416 | Growing a random forest proceeds in exactly the same way, except that 417 | we use a smaller value of the `max_features` argument. By default, 418 | `RandomForestRegressor()` uses $p$ variables when building a random 419 | forest of regression trees (i.e. it defaults to bagging), and `RandomForestClassifier()` uses 420 | $\sqrt{p}$ variables when building a 421 | random forest of classification trees. Here we use `max_features=6`. 
422 | 423 | ```{python} 424 | RF_boston = RF(max_features=6, 425 | random_state=0).fit(X_train, y_train) 426 | y_hat_RF = RF_boston.predict(X_test) 427 | np.mean((y_test - y_hat_RF)**2) 428 | 429 | ``` 430 | 431 | 432 | The test set MSE is 20.04; 433 | this indicates that random forests did somewhat worse than bagging 434 | in this case. Extracting the `feature_importances_` values from the fitted model, we can view the 435 | importance of each variable. 436 | 437 | ```{python} 438 | feature_imp = pd.DataFrame( 439 | {'importance':RF_boston.feature_importances_}, 440 | index=feature_names) 441 | feature_imp.sort_values(by='importance', ascending=False) 442 | ``` 443 | This 444 | is a relative measure of the total decrease in node impurity that results from 445 | splits over that variable, averaged over all trees (this was plotted in Figure 8.9 for a model fit to the `Heart` data). 446 | 447 | The results indicate that across all of the trees considered in the 448 | random forest, the wealth level of the community (`lstat`) and the 449 | house size (`rm`) are by far the two most important variables. 450 | 451 | 452 | 453 | 454 | ## Boosting 455 | 456 | 457 | Here we use `GradientBoostingRegressor()` from `sklearn.ensemble` 458 | to fit boosted regression trees to the `Boston` data 459 | set. For classification we would use `GradientBoostingClassifier()`. 460 | The argument `n_estimators=5000` 461 | indicates that we want 5000 trees, and the option 462 | `max_depth=3` limits the depth of each tree. The 463 | argument `learning_rate` is the $\lambda$ 464 | mentioned earlier in the description of boosting. 465 | 466 | ```{python} 467 | boost_boston = GBR(n_estimators=5000, 468 | learning_rate=0.001, 469 | max_depth=3, 470 | random_state=0) 471 | boost_boston.fit(X_train, y_train) 472 | 473 | ``` 474 | 475 | We can see how the training error decreases with the `train_score_` attribute. 476 | To get an idea of how the test error decreases we can use the 477 | `staged_predict()` method to get the predicted values along the path. 478 | 479 | ```{python} 480 | test_error = np.zeros_like(boost_boston.train_score_) 481 | for idx, y_ in enumerate(boost_boston.staged_predict(X_test)): 482 | test_error[idx] = np.mean((y_test - y_)**2) 483 | 484 | plot_idx = np.arange(boost_boston.train_score_.shape[0]) 485 | ax = subplots(figsize=(8,8))[1] 486 | ax.plot(plot_idx, 487 | boost_boston.train_score_, 488 | 'b', 489 | label='Training') 490 | ax.plot(plot_idx, 491 | test_error, 492 | 'r', 493 | label='Test') 494 | ax.legend(); 495 | 496 | ``` 497 | 498 | We now use the boosted model to predict `medv` on the test set: 499 | 500 | ```{python} 501 | y_hat_boost = boost_boston.predict(X_test); 502 | np.mean((y_test - y_hat_boost)**2) 503 | 504 | ``` 505 | 506 | The test MSE obtained is 14.48, 507 | similar to the test MSE for bagging. If we want to, we can 508 | perform boosting with a different value of the shrinkage parameter 509 | $\lambda$ in (8.10). The default value is 0.001, but 510 | this is easily modified. Here we take $\lambda=0.2$. 511 | 512 | ```{python} 513 | boost_boston = GBR(n_estimators=5000, 514 | learning_rate=0.2, 515 | max_depth=3, 516 | random_state=0) 517 | boost_boston.fit(X_train, 518 | y_train) 519 | y_hat_boost = boost_boston.predict(X_test); 520 | np.mean((y_test - y_hat_boost)**2) 521 | 522 | ``` 523 | 524 | 525 | In this case, using $\lambda=0.2$ leads to almost the same test MSE 526 | as when using $\lambda=0.001$. 
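Before turning to BART, it can be informative to see which variables the boosted model relies upon most heavily. As a brief aside, and assuming the fitted `GradientBoostingRegressor` exposes a `feature_importances_` attribute in the same way as the random forest above, we can tabulate relative importances:

```{python}
# Aside: relative variable importances for the boosted model, arranged
# like the random-forest table above.
boost_imp = pd.DataFrame(
    {'importance': boost_boston.feature_importances_},
    index=feature_names)
boost_imp.sort_values(by='importance', ascending=False)
```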
527 | 528 | 529 | 530 | 531 | ## Bayesian Additive Regression Trees 532 | 533 | 534 | In this section we demonstrate a `Python` implementation of BART found in the 535 | `ISLP.bart` package. We fit a model 536 | to the `Boston` housing data set. This `BART()` estimator is 537 | designed for quantitative outcome variables, though other implementations are available for 538 | fitting logistic and probit models to categorical outcomes. 539 | 540 | ```{python} 541 | bart_boston = BART(random_state=0, burnin=5, ndraw=15) 542 | bart_boston.fit(X_train, y_train) 543 | 544 | ``` 545 | 546 | 547 | On this data set, with this split into test and training, we see that the test error of BART is similar to that of random forest. 548 | 549 | ```{python} 550 | yhat_test = bart_boston.predict(X_test.astype(np.float32)) 551 | np.mean((y_test - yhat_test)**2) 552 | 553 | ``` 554 | 555 | 556 | We can check how many times each variable appeared in the collection of trees. 557 | This gives a summary similar to the variable importance plot for boosting and random forests. 558 | 559 | ```{python} 560 | var_inclusion = pd.Series(bart_boston.variable_inclusion_.mean(0), 561 | index=D.columns) 562 | var_inclusion 563 | 564 | ``` 565 | 566 | 567 | 568 | 569 | 570 | -------------------------------------------------------------------------------- /Ch09-svm-lab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: -all 5 | formats: ipynb,Rmd 6 | main_language: python 7 | text_representation: 8 | extension: .Rmd 9 | format_name: rmarkdown 10 | format_version: '1.2' 11 | jupytext_version: 1.16.7 12 | --- 13 | 14 | # Support Vector Machines 15 | 16 | 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch09-svm-lab.ipynb) 21 | 22 | 23 | In this lab, we use the `sklearn.svm` library to demonstrate the support 24 | vector classifier and the support vector machine. 25 | 26 | We import some of our usual libraries. 27 | 28 | ```{python} 29 | import numpy as np 30 | from matplotlib.pyplot import subplots, cm 31 | import sklearn.model_selection as skm 32 | from ISLP import load_data, confusion_table 33 | 34 | ``` 35 | We also collect the new imports 36 | needed for this lab. 37 | 38 | ```{python} 39 | from sklearn.svm import SVC 40 | from ISLP.svm import plot as plot_svm 41 | from sklearn.metrics import RocCurveDisplay 42 | 43 | ``` 44 | 45 | We will use the function `RocCurveDisplay.from_estimator()` to 46 | produce several ROC plots, using a shorthand `roc_curve`. 47 | 48 | ```{python} 49 | roc_curve = RocCurveDisplay.from_estimator # shorthand 50 | 51 | ``` 52 | 53 | ## Support Vector Classifier 54 | 55 | We now use the `SupportVectorClassifier()` function (abbreviated `SVC()`) from `sklearn` to fit the support vector 56 | classifier for a given value of the parameter `C`. The 57 | `C` argument allows us to specify the cost of a violation to 58 | the margin. When the `C` argument is small, then the margins 59 | will be wide and many support vectors will be on the margin or will 60 | violate the margin. When the `C` argument is large, then the 61 | margins will be narrow and there will be few support vectors on the 62 | margin or violating the margin. 63 | 64 | Here we demonstrate 65 | the use of `SVC()` on a two-dimensional example, so that we can 66 | plot the resulting decision boundary. 
We begin by generating the 67 | observations, which belong to two classes, and checking whether the 68 | classes are linearly separable. 69 | 70 | ```{python} 71 | rng = np.random.default_rng(1) 72 | X = rng.standard_normal((50, 2)) 73 | y = np.array([-1]*25+[1]*25) 74 | X[y==1] += 1 75 | fig, ax = subplots(figsize=(8,8)) 76 | ax.scatter(X[:,0], 77 | X[:,1], 78 | c=y, 79 | cmap=cm.coolwarm); 80 | 81 | ``` 82 | They are not. We now fit the classifier. 83 | 84 | ```{python} 85 | svm_linear = SVC(C=10, kernel='linear') 86 | svm_linear.fit(X, y) 87 | 88 | ``` 89 | 90 | 91 | The support vector classifier with two features can 92 | be visualized by plotting values of its *decision function*. 93 | We have included a function for this in the `ISLP` package (inspired by a similar 94 | example in the `sklearn` docs). 95 | 96 | ```{python} 97 | fig, ax = subplots(figsize=(8,8)) 98 | plot_svm(X, 99 | y, 100 | svm_linear, 101 | ax=ax) 102 | 103 | ``` 104 | 105 | The decision 106 | boundary between the two classes is linear (because we used the 107 | argument `kernel='linear'`). The support vectors are marked with `+` 108 | and the remaining observations are plotted as circles. 109 | 110 | What if we instead used a smaller value of the cost parameter? 111 | 112 | ```{python} 113 | svm_linear_small = SVC(C=0.1, kernel='linear') 114 | svm_linear_small.fit(X, y) 115 | fig, ax = subplots(figsize=(8,8)) 116 | plot_svm(X, 117 | y, 118 | svm_linear_small, 119 | ax=ax) 120 | 121 | ``` 122 | With a smaller value of the cost parameter, we 123 | obtain a larger number of support vectors, because the margin is now 124 | wider. For linear kernels, we can extract the 125 | coefficients of the linear decision boundary as follows: 126 | 127 | ```{python} 128 | svm_linear.coef_ 129 | 130 | ``` 131 | 132 | 133 | Since the support vector machine is an estimator in `sklearn`, we 134 | can use the usual machinery to tune it. 135 | 136 | ```{python} 137 | kfold = skm.KFold(5, 138 | random_state=0, 139 | shuffle=True) 140 | grid = skm.GridSearchCV(svm_linear, 141 | {'C':[0.001,0.01,0.1,1,5,10,100]}, 142 | refit=True, 143 | cv=kfold, 144 | scoring='accuracy') 145 | grid.fit(X, y) 146 | grid.best_params_ 147 | 148 | ``` 149 | 150 | 151 | We can easily access the cross-validation errors for each of these models 152 | in `grid.cv_results_`. This prints out a lot of detail, so we 153 | extract the accuracy results only. 154 | 155 | ```{python} 156 | grid.cv_results_[('mean_test_score')] 157 | 158 | ``` 159 | We see that `C=1` results in the highest cross-validation 160 | accuracy of 0.74, though 161 | the accuracy is the same for several values of `C`. 162 | The classifier `grid.best_estimator_` can be used to predict the class 163 | label on a set of test observations. Let’s generate a test data set. 164 | 165 | ```{python} 166 | X_test = rng.standard_normal((20, 2)) 167 | y_test = np.array([-1]*10+[1]*10) 168 | X_test[y_test==1] += 1 169 | 170 | ``` 171 | 172 | Now we predict the class labels of these test observations. Here we 173 | use the best model selected by cross-validation in order to make the 174 | predictions. 175 | 176 | ```{python} 177 | best_ = grid.best_estimator_ 178 | y_test_hat = best_.predict(X_test) 179 | confusion_table(y_test_hat, y_test) 180 | 181 | ``` 182 | 183 | Thus, with this value of `C`, 184 | 70% of the test 185 | observations are correctly classified. What if we had instead used 186 | `C=0.001`? 
187 | 188 | ```{python} 189 | svm_ = SVC(C=0.001, 190 | kernel='linear').fit(X, y) 191 | y_test_hat = svm_.predict(X_test) 192 | confusion_table(y_test_hat, y_test) 193 | 194 | ``` 195 | 196 | In this case 60% of test observations are correctly classified. 197 | 198 | We now consider a situation in which the two classes are linearly 199 | separable. Then we can find an optimal separating hyperplane using the 200 | `SVC()` estimator. We first 201 | further separate the two classes in our simulated data so that they 202 | are linearly separable: 203 | 204 | ```{python} 205 | X[y==1] += 1.9; 206 | fig, ax = subplots(figsize=(8,8)) 207 | ax.scatter(X[:,0], X[:,1], c=y, cmap=cm.coolwarm); 208 | 209 | ``` 210 | 211 | Now the observations are just barely linearly separable. 212 | 213 | ```{python} 214 | svm_ = SVC(C=1e5, kernel='linear').fit(X, y) 215 | y_hat = svm_.predict(X) 216 | confusion_table(y_hat, y) 217 | 218 | ``` 219 | 220 | We fit the 221 | support vector classifier and plot the resulting hyperplane, using a 222 | very large value of `C` so that no observations are 223 | misclassified. 224 | 225 | ```{python} 226 | fig, ax = subplots(figsize=(8,8)) 227 | plot_svm(X, 228 | y, 229 | svm_, 230 | ax=ax) 231 | 232 | ``` 233 | Indeed no training errors were made and only three support vectors were used. 234 | In fact, the large value of `C` also means that these three support points are *on the margin*, and define it. 235 | One may wonder how good the classifier could be on test data that depends on only three data points! 236 | We now try a smaller 237 | value of `C`. 238 | 239 | ```{python} 240 | svm_ = SVC(C=0.1, kernel='linear').fit(X, y) 241 | y_hat = svm_.predict(X) 242 | confusion_table(y_hat, y) 243 | 244 | ``` 245 | 246 | Using `C=0.1`, we again do not misclassify any training observations, but we 247 | also obtain a much wider margin and make use of twelve support 248 | vectors. These jointly define the orientation of the decision boundary, and since there are more of them, it is more stable. It seems possible that this model will perform better on test 249 | data than the model with `C=1e5` (and indeed, a simple experiment with a large test set would bear this out). 250 | 251 | ```{python} 252 | fig, ax = subplots(figsize=(8,8)) 253 | plot_svm(X, 254 | y, 255 | svm_, 256 | ax=ax) 257 | 258 | ``` 259 | 260 | 261 | ## Support Vector Machine 262 | In order to fit an SVM using a non-linear kernel, we once again use 263 | the `SVC()` estimator. However, now we use a different value 264 | of the parameter `kernel`. To fit an SVM with a polynomial 265 | kernel we use `kernel="poly"`, and to fit an SVM with a 266 | radial kernel we use 267 | `kernel="rbf"`. In the former case we also use the 268 | `degree` argument to specify a degree for the polynomial kernel 269 | (this is $d$ in (9.22)), and in the latter case we use 270 | `gamma` to specify a value of $\gamma$ for the radial basis 271 | kernel (9.24). 272 | 273 | We first generate some data with a non-linear class boundary, as follows: 274 | 275 | ```{python} 276 | X = rng.standard_normal((200, 2)) 277 | X[:100] += 2 278 | X[100:150] -= 2 279 | y = np.array([1]*150+[2]*50) 280 | 281 | ``` 282 | 283 | Plotting the data makes it clear that the class boundary is indeed non-linear. 284 | 285 | ```{python} 286 | fig, ax = subplots(figsize=(8,8)) 287 | ax.scatter(X[:,0], 288 | X[:,1], 289 | c=y, 290 | cmap=cm.coolwarm); 291 | 292 | ``` 293 | 294 | 295 | The data is randomly split into training and testing groups. 
We then 296 | fit the training data using the `SVC()` estimator with a 297 | radial kernel and $\gamma=1$: 298 | 299 | ```{python} 300 | (X_train, 301 | X_test, 302 | y_train, 303 | y_test) = skm.train_test_split(X, 304 | y, 305 | test_size=0.5, 306 | random_state=0) 307 | svm_rbf = SVC(kernel="rbf", gamma=1, C=1) 308 | svm_rbf.fit(X_train, y_train) 309 | 310 | ``` 311 | 312 | The plot shows that the resulting SVM has a decidedly non-linear 313 | boundary. 314 | 315 | ```{python} 316 | fig, ax = subplots(figsize=(8,8)) 317 | plot_svm(X_train, 318 | y_train, 319 | svm_rbf, 320 | ax=ax) 321 | 322 | ``` 323 | 324 | We can see from the figure that there are a fair number of training 325 | errors in this SVM fit. If we increase the value of `C`, we 326 | can reduce the number of training errors. However, this comes at the 327 | price of a more irregular decision boundary that seems to be at risk 328 | of overfitting the data. 329 | 330 | ```{python} 331 | svm_rbf = SVC(kernel="rbf", gamma=1, C=1e5) 332 | svm_rbf.fit(X_train, y_train) 333 | fig, ax = subplots(figsize=(8,8)) 334 | plot_svm(X_train, 335 | y_train, 336 | svm_rbf, 337 | ax=ax) 338 | 339 | ``` 340 | 341 | We can perform cross-validation using `skm.GridSearchCV()` to select the 342 | best choice of $\gamma$ and `C` for an SVM with a radial 343 | kernel: 344 | 345 | ```{python} 346 | kfold = skm.KFold(5, 347 | random_state=0, 348 | shuffle=True) 349 | grid = skm.GridSearchCV(svm_rbf, 350 | {'C':[0.1,1,10,100,1000], 351 | 'gamma':[0.5,1,2,3,4]}, 352 | refit=True, 353 | cv=kfold, 354 | scoring='accuracy'); 355 | grid.fit(X_train, y_train) 356 | grid.best_params_ 357 | 358 | ``` 359 | 360 | The best choice of parameters under five-fold CV is achieved at `C=1` 361 | and `gamma=0.5`, though several other values also achieve the same 362 | value. 363 | 364 | ```{python} 365 | best_svm = grid.best_estimator_ 366 | fig, ax = subplots(figsize=(8,8)) 367 | plot_svm(X_train, 368 | y_train, 369 | best_svm, 370 | ax=ax) 371 | 372 | y_hat_test = best_svm.predict(X_test) 373 | confusion_table(y_hat_test, y_test) 374 | 375 | ``` 376 | 377 | With these parameters, 12% of test 378 | observations are misclassified by this SVM. 379 | 380 | 381 | ## ROC Curves 382 | 383 | SVMs and support vector classifiers output class labels for each 384 | observation. However, it is also possible to obtain *fitted values* 385 | for each observation, which are the numerical scores used to 386 | obtain the class labels. For instance, in the case of a support vector 387 | classifier, the fitted value for an observation $X= (X_1, X_2, \ldots, 388 | X_p)^T$ takes the form $\hat{\beta}_0 + \hat{\beta}_1 X_1 + 389 | \hat{\beta}_2 X_2 + \ldots + \hat{\beta}_p X_p$. For an SVM with a 390 | non-linear kernel, the equation that yields the fitted value is given 391 | in (9.23). The sign of the fitted value 392 | determines on which side of the decision boundary the observation 393 | lies. Therefore, the relationship between the fitted value and the 394 | class prediction for a given observation is simple: if the fitted 395 | value exceeds zero then the observation is assigned to one class, and 396 | if it is less than zero then it is assigned to the other. 397 | By changing this threshold from zero to some positive value, 398 | we skew the classifications in favor of one class versus the other. 399 | By considering a range of these thresholds, positive and negative, we produce the ingredients for a ROC plot. 
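To make the thresholding idea concrete, here is a small hand-rolled sketch (an aside; the built-in display introduced next does this for us). It assumes the usual `sklearn` convention that positive decision values correspond to `best_svm.classes_[1]`.

```{python}
# Aside: sweep a grid of thresholds over the fitted values and record the
# resulting true- and false-positive rates.
scores = best_svm.decision_function(X_train)
pos = best_svm.classes_[1]
thresholds = np.linspace(scores.min(), scores.max(), 50)
tpr = [np.mean(scores[y_train == pos] > t) for t in thresholds]
fpr = [np.mean(scores[y_train != pos] > t) for t in thresholds]
ax = subplots(figsize=(8,8))[1]
ax.plot(fpr, tpr)
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate');
```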
400 | We can access these values by calling the `decision_function()` 401 | method of a fitted SVM estimator. 402 | 403 | The function `ROCCurveDisplay.from_estimator()` (which we have abbreviated to `roc_curve()`) will produce a plot of a ROC curve. It takes a fitted estimator as its first argument, followed 404 | by a model matrix $X$ and labels $y$. The argument `name` is used in the legend, 405 | while `color` is used for the color of the line. Results are plotted 406 | on our axis object `ax`. 407 | 408 | ```{python} 409 | fig, ax = subplots(figsize=(8,8)) 410 | roc_curve(best_svm, 411 | X_train, 412 | y_train, 413 | name='Training', 414 | color='r', 415 | ax=ax); 416 | 417 | ``` 418 | In this example, the SVM appears to provide accurate predictions. By increasing 419 | $\gamma$ we can produce a more flexible fit and generate further 420 | improvements in accuracy. 421 | 422 | ```{python} 423 | svm_flex = SVC(kernel="rbf", 424 | gamma=50, 425 | C=1) 426 | svm_flex.fit(X_train, y_train) 427 | fig, ax = subplots(figsize=(8,8)) 428 | roc_curve(svm_flex, 429 | X_train, 430 | y_train, 431 | name='Training $\gamma=50$', 432 | color='r', 433 | ax=ax); 434 | 435 | ``` 436 | 437 | However, these ROC curves are all on the training data. We are really 438 | more interested in the level of prediction accuracy on the test 439 | data. When we compute the ROC curves on the test data, the model with 440 | $\gamma=0.5$ appears to provide the most accurate results. 441 | 442 | ```{python} 443 | roc_curve(svm_flex, 444 | X_test, 445 | y_test, 446 | name='Test $\gamma=50$', 447 | color='b', 448 | ax=ax) 449 | fig; 450 | 451 | ``` 452 | 453 | Let’s look at our tuned SVM. 454 | 455 | ```{python} 456 | fig, ax = subplots(figsize=(8,8)) 457 | for (X_, y_, c, name) in zip( 458 | (X_train, X_test), 459 | (y_train, y_test), 460 | ('r', 'b'), 461 | ('CV tuned on training', 462 | 'CV tuned on test')): 463 | roc_curve(best_svm, 464 | X_, 465 | y_, 466 | name=name, 467 | ax=ax, 468 | color=c) 469 | 470 | ``` 471 | 472 | ## SVM with Multiple Classes 473 | 474 | If the response is a factor containing more than two levels, then the 475 | `SVC()` function will perform multi-class classification using 476 | either the one-versus-one approach (when `decision_function_shape=='ovo'`) 477 | or one-versus-rest {One-versus-rest is also known as one-versus-all.} (when `decision_function_shape=='ovr'`). 478 | We explore that setting briefly here by 479 | generating a third class of observations. 480 | 481 | ```{python} 482 | rng = np.random.default_rng(123) 483 | X = np.vstack([X, rng.standard_normal((50, 2))]) 484 | y = np.hstack([y, [0]*50]) 485 | X[y==0,1] += 2 486 | fig, ax = subplots(figsize=(8,8)) 487 | ax.scatter(X[:,0], X[:,1], c=y, cmap=cm.coolwarm); 488 | 489 | ``` 490 | 491 | We now fit an SVM to the data: 492 | 493 | ```{python} 494 | svm_rbf_3 = SVC(kernel="rbf", 495 | C=10, 496 | gamma=1, 497 | decision_function_shape='ovo'); 498 | svm_rbf_3.fit(X, y) 499 | fig, ax = subplots(figsize=(8,8)) 500 | plot_svm(X, 501 | y, 502 | svm_rbf_3, 503 | scatter_cmap=cm.tab10, 504 | ax=ax) 505 | 506 | ``` 507 | The `sklearn.svm` library can also be used to perform support vector 508 | regression with a numerical response using the estimator `SupportVectorRegression()`. 509 | 510 | 511 | ## Application to Gene Expression Data 512 | 513 | We now examine the `Khan` data set, which consists of a number of 514 | tissue samples corresponding to four distinct types of small round 515 | blue cell tumors. 
For each tissue sample, gene expression measurements 516 | are available. The data set consists of training data, `xtrain` 517 | and `ytrain`, and testing data, `xtest` and `ytest`. 518 | 519 | We examine the dimension of the data: 520 | 521 | ```{python} 522 | Khan = load_data('Khan') 523 | Khan['xtrain'].shape, Khan['xtest'].shape 524 | 525 | ``` 526 | 527 | This data set consists of expression measurements for 2,308 528 | genes. The training and test sets consist of 63 and 20 529 | observations, respectively. 530 | 531 | We will use a support vector approach to predict cancer subtype using 532 | gene expression measurements. In this data set, there is a very 533 | large number of features relative to the number of observations. This 534 | suggests that we should use a linear kernel, because the additional 535 | flexibility that will result from using a polynomial or radial kernel 536 | is unnecessary. 537 | 538 | ```{python} 539 | khan_linear = SVC(kernel='linear', C=10) 540 | khan_linear.fit(Khan['xtrain'], Khan['ytrain']) 541 | confusion_table(khan_linear.predict(Khan['xtrain']), 542 | Khan['ytrain']) 543 | 544 | ``` 545 | 546 | We see that there are *no* training 547 | errors. In fact, this is not surprising, because the large number of 548 | variables relative to the number of observations implies that it is 549 | easy to find hyperplanes that fully separate the classes. We are more 550 | interested in the support vector classifier’s performance on the 551 | test observations. 552 | 553 | ```{python} 554 | confusion_table(khan_linear.predict(Khan['xtest']), 555 | Khan['ytest']) 556 | 557 | ``` 558 | 559 | We see that using `C=10` yields two test set errors on these data. 560 | 561 | 562 | -------------------------------------------------------------------------------- /Ch11-surv-lab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: -all 5 | formats: ipynb,Rmd 6 | main_language: python 7 | text_representation: 8 | extension: .Rmd 9 | format_name: rmarkdown 10 | format_version: '1.2' 11 | jupytext_version: 1.16.7 12 | --- 13 | 14 | # Survival Analysis 15 | 16 | 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch11-surv-lab.ipynb) 21 | 22 | 23 | In this lab, we perform survival analyses on three separate data 24 | sets. In Section 11.8.1 we analyze the `BrainCancer` 25 | data that was first described in Section 11.3. In Section 11.8.2, we examine the `Publication` 26 | data from Section 11.5.4. Finally, Section 11.8.3 explores 27 | a simulated call-center data set. 28 | 29 | We begin by importing some of our libraries at this top 30 | level. This makes the code more readable, as scanning the first few 31 | lines of the notebook tell us what libraries are used in this 32 | notebook. 33 | 34 | ```{python} 35 | from matplotlib.pyplot import subplots 36 | import numpy as np 37 | import pandas as pd 38 | from ISLP.models import ModelSpec as MS 39 | from ISLP import load_data 40 | 41 | ``` 42 | 43 | We also collect the new imports 44 | needed for this lab. 
45 | 46 | ```{python} 47 | from lifelines import \ 48 | (KaplanMeierFitter, 49 | CoxPHFitter) 50 | from lifelines.statistics import \ 51 | (logrank_test, 52 | multivariate_logrank_test) 53 | from ISLP.survival import sim_time 54 | 55 | ``` 56 | 57 | ## Brain Cancer Data 58 | 59 | We begin with the `BrainCancer` data set, contained in the `ISLP` package. 60 | 61 | ```{python} 62 | BrainCancer = load_data('BrainCancer') 63 | BrainCancer.columns 64 | 65 | ``` 66 | 67 | The rows index the 88 patients, while the 8 columns contain the predictors and outcome variables. 68 | We first briefly examine the data. 69 | 70 | ```{python} 71 | BrainCancer['sex'].value_counts() 72 | 73 | ``` 74 | 75 | 76 | ```{python} 77 | BrainCancer['diagnosis'].value_counts() 78 | 79 | ``` 80 | 81 | 82 | ```{python} 83 | BrainCancer['status'].value_counts() 84 | 85 | ``` 86 | 87 | 88 | Before beginning an analysis, it is important to know how the 89 | `status` variable has been coded. Most software 90 | uses the convention that a `status` of 1 indicates an 91 | uncensored observation (often death), and a `status` of 0 indicates a censored 92 | observation. But some scientists might use the opposite coding. For 93 | the `BrainCancer` data set 35 patients died before the end of 94 | the study, so we are using the conventional coding. 95 | 96 | To begin the analysis, we re-create the Kaplan-Meier survival curve shown in Figure 11.2. The main 97 | package we will use for survival analysis 98 | is `lifelines`. 99 | The variable `time` corresponds to $y_i$, the time to the $i$th event (either censoring or 100 | death). The first argument to `km.fit` is the event time, and the 101 | second argument is the censoring variable, with a 1 indicating an observed 102 | failure time. The `plot()` method produces a survival curve with pointwise confidence 103 | intervals. By default, these are 90% confidence intervals, but this can be changed 104 | by setting the `alpha` argument to one minus the desired 105 | confidence level. 106 | 107 | ```{python} 108 | fig, ax = subplots(figsize=(8,8)) 109 | km = KaplanMeierFitter() 110 | km_brain = km.fit(BrainCancer['time'], BrainCancer['status']) 111 | km_brain.plot(label='Kaplan Meier estimate', ax=ax) 112 | 113 | ``` 114 | 115 | Next we create Kaplan-Meier survival curves that are stratified by 116 | `sex`, in order to reproduce Figure 11.3. 117 | We do this using the `groupby()` method of a dataframe. 118 | This method returns a generator that can 119 | be iterated over in the `for` loop. In this case, 120 | the items in the `for` loop are 2-tuples representing 121 | the groups: the first entry is the value 122 | of the grouping column `sex` while the second value 123 | is the dataframe consisting of all rows in the 124 | dataframe matching that value of `sex`. 125 | We will want to use this data below 126 | in the log-rank test, hence we store this 127 | information in the dictionary `by_sex`. Finally, 128 | we have also used the notion of 129 | *string interpolation* to automatically 130 | label the different lines in the plot. String 131 | interpolation is a powerful technique to format strings --- 132 | `Python` has many ways to facilitate such operations. 
133 | 134 | ```{python} 135 | fig, ax = subplots(figsize=(8,8)) 136 | by_sex = {} 137 | for sex, df in BrainCancer.groupby('sex'): 138 | by_sex[sex] = df 139 | km_sex = km.fit(df['time'], df['status']) 140 | km_sex.plot(label='Sex=%s' % sex, ax=ax) 141 | 142 | ``` 143 | 144 | As discussed in Section 11.4, we can perform a 145 | log-rank test to compare the survival of males to females. We use 146 | the `logrank_test()` function from the `lifelines.statistics` module. 147 | The first two arguments are the event times, with the second 148 | denoting the corresponding (optional) censoring indicators. 149 | 150 | ```{python} 151 | logrank_test(by_sex['Male']['time'], 152 | by_sex['Female']['time'], 153 | by_sex['Male']['status'], 154 | by_sex['Female']['status']) 155 | 156 | ``` 157 | 158 | 159 | The resulting $p$-value is $0.23$, indicating no evidence of a 160 | difference in survival between the two sexes. 161 | 162 | Next, we use the `CoxPHFitter()` estimator 163 | from `lifelines` to fit Cox proportional hazards models. 164 | To begin, we consider a model that uses `sex` as the only predictor. 165 | 166 | ```{python} 167 | coxph = CoxPHFitter # shorthand 168 | sex_df = BrainCancer[['time', 'status', 'sex']] 169 | model_df = MS(['time', 'status', 'sex'], 170 | intercept=False).fit_transform(sex_df) 171 | cox_fit = coxph().fit(model_df, 172 | 'time', 173 | 'status') 174 | cox_fit.summary[['coef', 'se(coef)', 'p']] 175 | 176 | ``` 177 | 178 | The first argument to `fit` should be a data frame containing 179 | at least the event time (the second argument `time` in this case), 180 | as well as an optional censoring variable (the argument `status` in this case). 181 | Note also that the Cox model does not include an intercept, which is why 182 | we used the `intercept=False` argument to `ModelSpec` above. 183 | The `summary()` method delivers many columns; we chose to abbreviate its output here. 184 | It is possible to obtain the likelihood ratio test comparing this model to the one 185 | with no features as follows: 186 | 187 | ```{python} 188 | cox_fit.log_likelihood_ratio_test() 189 | 190 | ``` 191 | 192 | Regardless of which test we use, we see that there is no clear 193 | evidence for a difference in survival between males and females. As 194 | we learned in this chapter, the score test from the Cox model is 195 | exactly equal to the log rank test statistic! 196 | 197 | Now we fit a model that makes use of additional predictors. We first note 198 | that one of our `diagnosis` values is missing, hence 199 | we drop that observation before continuing. 200 | 201 | ```{python} 202 | cleaned = BrainCancer.dropna() 203 | all_MS = MS(cleaned.columns, intercept=False) 204 | all_df = all_MS.fit_transform(cleaned) 205 | fit_all = coxph().fit(all_df, 206 | 'time', 207 | 'status') 208 | fit_all.summary[['coef', 'se(coef)', 'p']] 209 | 210 | ``` 211 | 212 | The `diagnosis` variable has been coded so that the baseline 213 | corresponds to HG glioma. The results indicate that the risk associated with HG glioma 214 | is more than eight times (i.e. $e^{2.15}=8.62$) the risk associated 215 | with meningioma. In other words, after adjusting for the other 216 | predictors, patients with HG glioma have much worse survival compared 217 | to those with meningioma. In addition, larger values of the Karnofsky 218 | index, `ki`, are associated with lower risk, i.e. longer survival. 219 | 220 | Finally, we plot estimated survival curves for each diagnosis category, 221 | adjusting for the other predictors. 
To make these plots, we set the 222 | values of the other predictors equal to the mean for quantitative variables 223 | and equal to the mode for categorical. To do this, we use the 224 | `apply()` method across rows (i.e. `axis=0`) with a function 225 | `representative` that checks if a column is categorical 226 | or not. 227 | 228 | ```{python} 229 | levels = cleaned['diagnosis'].unique() 230 | def representative(series): 231 | if hasattr(series.dtype, 'categories'): 232 | return pd.Series.mode(series) 233 | else: 234 | return series.mean() 235 | modal_data = cleaned.apply(representative, axis=0) 236 | 237 | ``` 238 | 239 | We make four 240 | copies of the column means and assign the `diagnosis` column to be the four different 241 | diagnoses. 242 | 243 | ```{python} 244 | modal_df = pd.DataFrame( 245 | [modal_data.iloc[0] for _ in range(len(levels))]) 246 | modal_df['diagnosis'] = levels 247 | modal_df 248 | 249 | ``` 250 | 251 | We then construct the model matrix based on the model specification `all_MS` used to fit 252 | the model, and name the rows according to the levels of `diagnosis`. 253 | 254 | ```{python} 255 | modal_X = all_MS.transform(modal_df) 256 | modal_X.index = levels 257 | modal_X 258 | 259 | ``` 260 | 261 | We can use the `predict_survival_function()` method to obtain the estimated survival function. 262 | 263 | ```{python} 264 | predicted_survival = fit_all.predict_survival_function(modal_X) 265 | predicted_survival 266 | 267 | ``` 268 | This returns a data frame, 269 | whose plot methods yields the different survival curves. To avoid clutter in 270 | the plots, we do not display confidence intervals. 271 | 272 | ```{python} 273 | fig, ax = subplots(figsize=(8, 8)) 274 | predicted_survival.plot(ax=ax); 275 | 276 | ``` 277 | 278 | 279 | ## Publication Data 280 | The `Publication` data presented in Section 11.5.4 can be 281 | found in the `ISLP` package. 282 | We first reproduce Figure 11.5 by plotting the Kaplan-Meier curves 283 | stratified on the `posres` variable, which records whether the 284 | study had a positive or negative result. 285 | 286 | ```{python} 287 | fig, ax = subplots(figsize=(8,8)) 288 | Publication = load_data('Publication') 289 | by_result = {} 290 | for result, df in Publication.groupby('posres'): 291 | by_result[result] = df 292 | km_result = km.fit(df['time'], df['status']) 293 | km_result.plot(label='Result=%d' % result, ax=ax) 294 | 295 | ``` 296 | 297 | As discussed previously, the $p$-values from fitting Cox’s 298 | proportional hazards model to the `posres` variable are quite 299 | large, providing no evidence of a difference in time-to-publication 300 | between studies with positive versus negative results. 301 | 302 | ```{python} 303 | posres_df = MS(['posres', 304 | 'time', 305 | 'status'], 306 | intercept=False).fit_transform(Publication) 307 | posres_fit = coxph().fit(posres_df, 308 | 'time', 309 | 'status') 310 | posres_fit.summary[['coef', 'se(coef)', 'p']] 311 | 312 | ``` 313 | 314 | 315 | However, the results change dramatically when we include other 316 | predictors in the model. Here we exclude the funding mechanism 317 | variable. 
318 | 319 | ```{python} 320 | model = MS(Publication.columns.drop('mech'), 321 | intercept=False) 322 | coxph().fit(model.fit_transform(Publication), 323 | 'time', 324 | 'status').summary[['coef', 'se(coef)', 'p']] 325 | 326 | ``` 327 | 328 | We see that there are a number of statistically significant variables, 329 | including whether the trial focused on a clinical endpoint, the impact 330 | of the study, and whether the study had positive or negative results. 331 | 332 | 333 | ## Call Center Data 334 | 335 | In this section, we will simulate survival data using the relationship 336 | between cumulative hazard and 337 | the survival function explored in Exercise 8. 338 | Our simulated data will represent the observed 339 | wait times (in seconds) for 2,000 customers who have phoned a call 340 | center. In this context, censoring occurs if a customer hangs up 341 | before his or her call is answered. 342 | 343 | There are three covariates: `Operators` (the number of call 344 | center operators available at the time of the call, which can range 345 | from $5$ to $15$), `Center` (either A, B, or C), and 346 | `Time` of day (Morning, Afternoon, or Evening). We generate data 347 | for these covariates so that all possibilities are equally likely: for 348 | instance, morning, afternoon and evening calls are equally likely, and 349 | any number of operators from $5$ to $15$ is equally likely. 350 | 351 | ```{python} 352 | rng = np.random.default_rng(10) 353 | N = 2000 354 | Operators = rng.choice(np.arange(5, 16), 355 | N, 356 | replace=True) 357 | Center = rng.choice(['A', 'B', 'C'], 358 | N, 359 | replace=True) 360 | Time = rng.choice(['Morn.', 'After.', 'Even.'], 361 | N, 362 | replace=True) 363 | D = pd.DataFrame({'Operators': Operators, 364 | 'Center': pd.Categorical(Center), 365 | 'Time': pd.Categorical(Time)}) 366 | ``` 367 | 368 | We then build a model matrix (omitting the intercept) 369 | 370 | ```{python} 371 | model = MS(['Operators', 372 | 'Center', 373 | 'Time'], 374 | intercept=False) 375 | X = model.fit_transform(D) 376 | ``` 377 | 378 | It is worthwhile to take a peek at the model matrix `X`, so 379 | that we can be sure that we understand how the variables have been coded. By default, 380 | the levels of categorical variables are sorted and, as usual, the first column of the one-hot encoding 381 | of the variable is dropped. 382 | 383 | ```{python} 384 | X[:5] 385 | 386 | ``` 387 | 388 | Next, we specify the coefficients and the hazard function. 389 | 390 | ```{python} 391 | true_beta = np.array([0.04, -0.3, 0, 0.2, -0.2]) 392 | true_linpred = X.dot(true_beta) 393 | hazard = lambda t: 1e-5 * t 394 | 395 | ``` 396 | 397 | Here, we have set the coefficient associated with `Operators` to 398 | equal $0.04$; in other words, each additional operator leads to a 399 | $e^{0.04}=1.041$-fold increase in the “risk” that the call will be 400 | answered, given the `Center` and `Time` covariates. This 401 | makes sense: the greater the number of operators at hand, the shorter 402 | the wait time! The coefficient associated with `Center == B` is 403 | $-0.3$, and `Center == A` is treated as the baseline. This means 404 | that the risk of a call being answered at Center B is 0.74 times the 405 | risk that it will be answered at Center A; in other words, the wait 406 | times are a bit longer at Center B. 407 | 408 | Recall from Section 2.3.7 the use of `lambda` 409 | for creating short functions on the fly. 410 | We use the function 411 | `sim_time()` from the `ISLP.survival` package. 
This function 412 | uses the relationship between the survival function 413 | and cumulative hazard $S(t) = \exp(-H(t))$ and the specific 414 | form of the cumulative hazard function in the Cox model 415 | to simulate data based on values of the linear predictor 416 | `true_linpred` and the cumulative hazard. 417 | We need to provide the cumulative hazard function, which we do here. 418 | 419 | ```{python} 420 | cum_hazard = lambda t: 1e-5 * t**2 / 2 421 | 422 | ``` 423 | We are now ready to generate data under the Cox proportional hazards 424 | model. We truncate the maximum time to 1000 seconds to keep 425 | simulated wait times reasonable. The function 426 | `sim_time()` takes a linear predictor, 427 | a cumulative hazard function and a 428 | random number generator. 429 | 430 | ```{python} 431 | W = np.array([sim_time(l, cum_hazard, rng) 432 | for l in true_linpred]) 433 | D['Wait time'] = np.clip(W, 0, 1000) 434 | 435 | ``` 436 | 437 | We now simulate our censoring variable, for which we assume 438 | 90% of calls were answered (`Failed==1`) before the 439 | customer hung up (`Failed==0`). 440 | 441 | ```{python} 442 | D['Failed'] = rng.choice([1, 0], 443 | N, 444 | p=[0.9, 0.1]) 445 | D[:5] 446 | 447 | ``` 448 | 449 | 450 | ```{python} 451 | D['Failed'].mean() 452 | 453 | ``` 454 | 455 | We now plot Kaplan-Meier survival curves. First, we stratify by `Center`. 456 | 457 | ```{python} 458 | fig, ax = subplots(figsize=(8,8)) 459 | by_center = {} 460 | for center, df in D.groupby('Center'): 461 | by_center[center] = df 462 | km_center = km.fit(df['Wait time'], df['Failed']) 463 | km_center.plot(label='Center=%s' % center, ax=ax) 464 | ax.set_title("Probability of Still Being on Hold") 465 | 466 | ``` 467 | 468 | Next, we stratify by `Time`. 469 | 470 | ```{python} 471 | fig, ax = subplots(figsize=(8,8)) 472 | by_time = {} 473 | for time, df in D.groupby('Time'): 474 | by_time[time] = df 475 | km_time = km.fit(df['Wait time'], df['Failed']) 476 | km_time.plot(label='Time=%s' % time, ax=ax) 477 | ax.set_title("Probability of Still Being on Hold") 478 | 479 | ``` 480 | 481 | It seems that calls at Call Center B take longer to be answered than 482 | calls at Centers A and C. Similarly, it appears that wait times are 483 | longest in the morning and shortest in the evening hours. We can use a 484 | log-rank test to determine whether these differences are statistically 485 | significant using the function `multivariate_logrank_test()`. 486 | 487 | ```{python} 488 | multivariate_logrank_test(D['Wait time'], 489 | D['Center'], 490 | D['Failed']) 491 | 492 | ``` 493 | 494 | 495 | Next, we consider the effect of `Time`. 496 | 497 | ```{python} 498 | multivariate_logrank_test(D['Wait time'], 499 | D['Time'], 500 | D['Failed']) 501 | 502 | ``` 503 | 504 | 505 | As in the case of a categorical variable with 2 levels, these 506 | results are similar to the likelihood ratio test 507 | from the Cox proportional hazards model. First, we 508 | look at the results for `Center`. 509 | 510 | ```{python} 511 | X = MS(['Wait time', 512 | 'Failed', 513 | 'Center'], 514 | intercept=False).fit_transform(D) 515 | F = coxph().fit(X, 'Wait time', 'Failed') 516 | F.log_likelihood_ratio_test() 517 | 518 | ``` 519 | 520 | 521 | Next, we look at the results for `Time`. 
522 | 523 | ```{python} 524 | X = MS(['Wait time', 525 | 'Failed', 526 | 'Time'], 527 | intercept=False).fit_transform(D) 528 | F = coxph().fit(X, 'Wait time', 'Failed') 529 | F.log_likelihood_ratio_test() 530 | 531 | ``` 532 | 533 | 534 | We find that differences between centers are highly significant, as 535 | are differences between times of day. 536 | 537 | Finally, we fit Cox's proportional hazards model to the data. 538 | 539 | ```{python} 540 | X = MS(D.columns, 541 | intercept=False).fit_transform(D) 542 | fit_queuing = coxph().fit( 543 | X, 544 | 'Wait time', 545 | 'Failed') 546 | fit_queuing.summary[['coef', 'se(coef)', 'p']] 547 | 548 | ``` 549 | 550 | 551 | The $p$-values for Center B and evening time 552 | are very small. It is also clear that the 553 | hazard --- that is, the instantaneous risk that a call will be 554 | answered --- increases with the number of operators. Since we 555 | generated the data ourselves, we know that the true coefficients for 556 | `Operators`, `Center = B`, `Center = C`, 557 | `Time = Even.` and `Time = Morn.` are $0.04$, $-0.3$, 558 | $0$, $0.2$, and $-0.2$, respectively. The coefficient estimates 559 | from the fitted Cox model are fairly accurate. 560 | 561 | 562 | -------------------------------------------------------------------------------- /Ch13-multiple-lab.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: -all 5 | formats: ipynb,Rmd 6 | main_language: python 7 | text_representation: 8 | extension: .Rmd 9 | format_name: rmarkdown 10 | format_version: '1.2' 11 | jupytext_version: 1.16.7 12 | --- 13 | 14 | # Multiple Testing 15 | 16 | 17 | Open In Colab 18 | 19 | 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch13-multiple-lab.ipynb) 21 | 22 | 23 | 24 | 25 | 26 | We include our usual imports seen in earlier labs. 27 | 28 | ```{python} 29 | import numpy as np 30 | import pandas as pd 31 | import matplotlib.pyplot as plt 32 | import statsmodels.api as sm 33 | from ISLP import load_data 34 | 35 | ``` 36 | 37 | We also collect the new imports 38 | needed for this lab. 39 | 40 | ```{python} 41 | from scipy.stats import \ 42 | (ttest_1samp, 43 | ttest_rel, 44 | ttest_ind, 45 | t as t_dbn) 46 | from statsmodels.stats.multicomp import \ 47 | pairwise_tukeyhsd 48 | from statsmodels.stats.multitest import \ 49 | multipletests as mult_test 50 | 51 | ``` 52 | 53 | 54 | ## Review of Hypothesis Tests 55 | We begin by performing some one-sample $t$-tests. 56 | 57 | First we create 100 variables, each consisting of 10 observations. The 58 | first 50 variables have mean $0.5$ and variance $1$, while the others 59 | have mean $0$ and variance $1$. 60 | 61 | ```{python} 62 | rng = np.random.default_rng(12) 63 | X = rng.standard_normal((10, 100)) 64 | true_mean = np.array([0.5]*50 + [0]*50) 65 | X += true_mean[None,:] 66 | 67 | ``` 68 | 69 | To begin, we use `ttest_1samp()` from the 70 | `scipy.stats` module to test $H_{0}: \mu_1=0$, the null 71 | hypothesis that the first variable has mean zero. 72 | 73 | ```{python} 74 | result = ttest_1samp(X[:,0], 0) 75 | result.pvalue 76 | 77 | ``` 78 | 79 | The $p$-value comes out to 0.931, which is not low enough to 80 | reject the null hypothesis at level $\alpha=0.05$. In this case, 81 | $\mu_1=0.5$, so the null hypothesis is false. 
Therefore, we have made 82 | a Type II error by failing to reject the null hypothesis when the null 83 | hypothesis is false. 84 | 85 | We now test $H_{0,j}: \mu_j=0$ for $j=1,\ldots,100$. We compute the 86 | 100 $p$-values, and then construct a vector recording whether the 87 | $j$th $p$-value is less than or equal to 0.05, in which case we reject 88 | $H_{0j}$, or greater than 0.05, in which case we do not reject 89 | $H_{0j}$, for $j=1,\ldots,100$. 90 | 91 | ```{python} 92 | p_values = np.empty(100) 93 | for i in range(100): 94 | p_values[i] = ttest_1samp(X[:,i], 0).pvalue 95 | decision = pd.cut(p_values, 96 | [0, 0.05, 1], 97 | labels=['Reject H0', 98 | 'Do not reject H0']) 99 | truth = pd.Categorical(true_mean == 0, 100 | categories=[True, False], 101 | ordered=True) 102 | 103 | ``` 104 | Since this is a simulated data set, we can create a $2 \times 2$ table 105 | similar to Table 13.2. 106 | 107 | ```{python} 108 | pd.crosstab(decision, 109 | truth, 110 | rownames=['Decision'], 111 | colnames=['H0']) 112 | 113 | ``` 114 | Therefore, at level $\alpha=0.05$, we reject 15 of the 50 false 115 | null hypotheses, and we incorrectly reject 5 of the true null 116 | hypotheses. Using the notation from Section 13.3, we have 117 | $V=5$, $S=15$, $U=45$ and $W=35$. 118 | We have set $\alpha=0.05$, which means that we expect to reject around 119 | 5% of the true null hypotheses. This is in line with the $2 \times 2$ 120 | table above, which indicates that we rejected $V=5$ of the $50$ true 121 | null hypotheses. 122 | 123 | In the simulation above, for the false null hypotheses, the ratio of 124 | the mean to the standard deviation was only $0.5/1 = 0.5$. This 125 | amounts to quite a weak signal, and it resulted in a high number of 126 | Type II errors. Let’s instead simulate data with a stronger signal, 127 | so that the ratio of the mean to the standard deviation for the false 128 | null hypotheses equals $1$. We make only 10 Type II errors. 129 | 130 | 131 | ```{python} 132 | true_mean = np.array([1]*50 + [0]*50) 133 | X = rng.standard_normal((10, 100)) 134 | X += true_mean[None,:] 135 | for i in range(100): 136 | p_values[i] = ttest_1samp(X[:,i], 0).pvalue 137 | decision = pd.cut(p_values, 138 | [0, 0.05, 1], 139 | labels=['Reject H0', 140 | 'Do not reject H0']) 141 | truth = pd.Categorical(true_mean == 0, 142 | categories=[True, False], 143 | ordered=True) 144 | pd.crosstab(decision, 145 | truth, 146 | rownames=['Decision'], 147 | colnames=['H0']) 148 | 149 | ``` 150 | 151 | 152 | 153 | ## Family-Wise Error Rate 154 | Recall from (13.5) that if the null hypothesis is true 155 | for each of $m$ independent hypothesis tests, then the FWER is equal 156 | to $1-(1-\alpha)^m$. We can use this expression to compute the FWER 157 | for $m=1,\ldots, 500$ and $\alpha=0.05$, $0.01$, and $0.001$. 158 | We plot the FWER for these values of $\alpha$ in order to 159 | reproduce Figure 13.2. 160 | 161 | ```{python} 162 | m = np.linspace(1, 501) 163 | fig, ax = plt.subplots() 164 | [ax.plot(m, 165 | 1 - (1 - alpha)**m, 166 | label=r'$\alpha=%s$' % str(alpha)) 167 | for alpha in [0.05, 0.01, 0.001]] 168 | ax.set_xscale('log') 169 | ax.set_xlabel('Number of Hypotheses') 170 | ax.set_ylabel('Family-Wise Error Rate') 171 | ax.legend() 172 | ax.axhline(0.05, c='k', ls='--'); 173 | 174 | ``` 175 | 176 | As discussed previously, even for moderate values of $m$ such as $50$, 177 | the FWER exceeds $0.05$ unless $\alpha$ is set to a very low value, 178 | such as $0.001$. 
Of course, the problem with setting $\alpha$ to such 179 | a low value is that we are likely to make a number of Type II errors: 180 | in other words, our power is very low. 181 | 182 | We now conduct a one-sample $t$-test for each of the first five 183 | managers in the 184 | `Fund` dataset, in order to test the null 185 | hypothesis that the $j$th fund manager’s mean return equals zero, 186 | $H_{0,j}: \mu_j=0$. 187 | 188 | ```{python} 189 | Fund = load_data('Fund') 190 | fund_mini = Fund.iloc[:,:5] 191 | fund_mini_pvals = np.empty(5) 192 | for i in range(5): 193 | fund_mini_pvals[i] = ttest_1samp(fund_mini.iloc[:,i], 0).pvalue 194 | fund_mini_pvals 195 | 196 | ``` 197 | 198 | The $p$-values are low for Managers One and Three, and high for the 199 | other three managers. However, we cannot simply reject $H_{0,1}$ and 200 | $H_{0,3}$, since this would fail to account for the multiple testing 201 | that we have performed. Instead, we will conduct Bonferroni’s method 202 | and Holm’s method to control the FWER. 203 | 204 | To do this, we use the `multipletests()` function from the 205 | `statsmodels` module (abbreviated to `mult_test()`). Given the $p$-values, 206 | for methods like Holm and Bonferroni the function outputs 207 | adjusted $p$-values, which 208 | can be thought of as a new set of $p$-values that have been corrected 209 | for multiple testing. If the adjusted $p$-value for a given hypothesis 210 | is less than or equal to $\alpha$, then that hypothesis can be 211 | rejected while maintaining a FWER of no more than $\alpha$. In other 212 | words, for such methods, the adjusted $p$-values resulting from the `multipletests()` 213 | function can simply be compared to the desired FWER in order to 214 | determine whether or not to reject each hypothesis. We will later 215 | see that we can use the same function to control FDR as well. 216 | 217 | 218 | The `mult_test()` function takes $p$-values and a `method` argument, as well as an optional 219 | `alpha` argument. It returns the decisions (`reject` below) 220 | as well as the adjusted $p$-values (`bonf`). 221 | 222 | ```{python} 223 | reject, bonf = mult_test(fund_mini_pvals, method = "bonferroni")[:2] 224 | reject 225 | 226 | ``` 227 | 228 | 229 | The $p$-values `bonf` are simply the `fund_mini_pvalues` multiplied by 5 and truncated to be less than 230 | or equal to 1. 231 | 232 | ```{python} 233 | bonf, np.minimum(fund_mini_pvals * 5, 1) 234 | 235 | ``` 236 | 237 | Therefore, using Bonferroni’s method, we are able to reject the null hypothesis only for Manager 238 | One while controlling FWER at $0.05$. 239 | 240 | By contrast, using Holm’s method, the adjusted $p$-values indicate 241 | that we can reject the null 242 | hypotheses for Managers One and Three at a FWER of $0.05$. 243 | 244 | ```{python} 245 | mult_test(fund_mini_pvals, method = "holm", alpha=0.05)[:2] 246 | 247 | ``` 248 | 249 | 250 | As discussed previously, Manager One seems to perform particularly 251 | well, whereas Manager Two has poor performance. 252 | 253 | 254 | ```{python} 255 | fund_mini.mean() 256 | 257 | ``` 258 | 259 | 260 | Is there evidence of a meaningful difference in performance between 261 | these two managers? We can check this by performing a paired $t$-test using the `ttest_rel()` function 262 | from `scipy.stats`: 263 | 264 | ```{python} 265 | ttest_rel(fund_mini['Manager1'], 266 | fund_mini['Manager2']).pvalue 267 | 268 | ``` 269 | 270 | The test results in a $p$-value of 0.038, 271 | suggesting a statistically significant difference. 
272 | 273 | However, we decided to perform this test only after examining the data 274 | and noting that Managers One and Two had the highest and lowest mean 275 | performances. In a sense, this means that we have implicitly 276 | performed ${5 \choose 2} = 5(5-1)/2=10$ hypothesis tests, rather than 277 | just one, as discussed in Section 13.3.2. Hence, we use the 278 | `pairwise_tukeyhsd()` function from 279 | `statsmodels.stats.multicomp` to apply Tukey’s method 280 | in order to adjust for multiple testing. This function takes 281 | as input a fitted *ANOVA* regression model, which is 282 | essentially just a linear regression in which all of the predictors 283 | are qualitative. In this case, the response consists of the monthly 284 | excess returns achieved by each manager, and the predictor indicates 285 | the manager to which each return corresponds. 286 | 287 | ```{python} 288 | returns = np.hstack([fund_mini.iloc[:,i] for i in range(5)]) 289 | managers = np.hstack([[i+1]*50 for i in range(5)]) 290 | tukey = pairwise_tukeyhsd(returns, managers) 291 | print(tukey.summary()) 292 | 293 | ``` 294 | 295 | 296 | The `pairwise_tukeyhsd()` function provides confidence intervals 297 | for the difference between each pair of managers (`lower` and 298 | `upper`), as well as a $p$-value. All of these quantities have 299 | been adjusted for multiple testing. Notice that the $p$-value for the 300 | difference between Managers One and Two has increased from $0.038$ to 301 | $0.186$, so there is no longer clear evidence of a difference between 302 | the managers’ performances. We can plot the confidence intervals for 303 | the pairwise comparisons using the `plot_simultaneous()` method 304 | of `tukey`. Any pair of intervals that don’t overlap indicates a significant difference at the nominal level of 0.05. In this case, 305 | no differences are considered significant as reported in the table above. 306 | 307 | ```{python} 308 | fig, ax = plt.subplots(figsize=(8,8)) 309 | tukey.plot_simultaneous(ax=ax); 310 | 311 | ``` 312 | 313 | ## False Discovery Rate 314 | Now we perform hypothesis tests for all 2,000 fund managers in the 315 | `Fund` dataset. We perform a one-sample $t$-test 316 | of $H_{0,j}: \mu_j=0$, which states that the 317 | $j$th fund manager’s mean return is zero. 318 | 319 | ```{python} 320 | fund_pvalues = np.empty(2000) 321 | for i, manager in enumerate(Fund.columns): 322 | fund_pvalues[i] = ttest_1samp(Fund[manager], 0).pvalue 323 | 324 | ``` 325 | 326 | There are far too many managers to consider trying to control the FWER. 327 | Instead, we focus on controlling the FDR: that is, the expected fraction of rejected null hypotheses that are actually false positives. 328 | The `multipletests()` function (abbreviated `mult_test()`) can be used to carry out the Benjamini--Hochberg procedure. 329 | 330 | ```{python} 331 | fund_qvalues = mult_test(fund_pvalues, method = "fdr_bh")[1] 332 | fund_qvalues[:10] 333 | 334 | ``` 335 | 336 | The *q-values* output by the 337 | Benjamini--Hochberg procedure can be interpreted as the smallest FDR 338 | threshold at which we would reject a particular null hypothesis. For 339 | instance, a $q$-value of $0.1$ indicates that we can reject the 340 | corresponding null hypothesis at an FDR of 10% or greater, but that 341 | we cannot reject the null hypothesis at an FDR below 10%. 342 | 343 | If we control the FDR at 10%, then for how many of the fund managers can we reject $H_{0,j}: \mu_j=0$? 
344 | 345 | ```{python} 346 | (fund_qvalues <= 0.1).sum() 347 | 348 | ``` 349 | We find that 146 of the 2,000 fund managers have a $q$-value below 350 | 0.1; therefore, we are able to conclude that 146 of the fund managers 351 | beat the market at an FDR of 10%. Only about 15 (10% of 146) of 352 | these fund managers are likely to be false discoveries. 353 | 354 | By contrast, if we had instead used Bonferroni’s method to control the 355 | FWER at level $\alpha=0.1$, then we would have failed to reject any 356 | null hypotheses! 357 | 358 | ```{python} 359 | (fund_pvalues <= 0.1 / 2000).sum() 360 | 361 | ``` 362 | 363 | 364 | Figure 13.6 displays the ordered 365 | $p$-values, $p_{(1)} \leq p_{(2)} \leq \cdots \leq p_{(2000)}$, for 366 | the `Fund` dataset, as well as the threshold for rejection by the 367 | Benjamini--Hochberg procedure. Recall that the Benjamini--Hochberg 368 | procedure identifies the largest $p$-value such that $p_{(j)} 0: 385 | selected_ = fund_pvalues < sorted_[sorted_set_].max() 386 | sorted_set_ = np.arange(sorted_set_.max()) 387 | else: 388 | selected_ = [] 389 | sorted_set_ = [] 390 | 391 | ``` 392 | 393 | We now reproduce the middle panel of Figure 13.6. 394 | 395 | ```{python} 396 | fig, ax = plt.subplots() 397 | ax.scatter(np.arange(0, sorted_.shape[0]) + 1, 398 | sorted_, s=10) 399 | ax.set_yscale('log') 400 | ax.set_xscale('log') 401 | ax.set_ylabel('P-Value') 402 | ax.set_xlabel('Index') 403 | ax.scatter(sorted_set_+1, sorted_[sorted_set_], c='r', s=20) 404 | ax.axline((0, 0), (1,q/m), c='k', ls='--', linewidth=3); 405 | 406 | ``` 407 | 408 | 409 | ## A Re-Sampling Approach 410 | Here, we implement the re-sampling approach to hypothesis testing 411 | using the `Khan` dataset, which we investigated in 412 | Section 13.5. First, we merge the training and 413 | testing data, which results in observations on 83 patients for 414 | 2,308 genes. 415 | 416 | ```{python} 417 | Khan = load_data('Khan') 418 | D = pd.concat([Khan['xtrain'], Khan['xtest']]) 419 | D['Y'] = pd.concat([Khan['ytrain'], Khan['ytest']]) 420 | D['Y'].value_counts() 421 | 422 | ``` 423 | 424 | 425 | There are four classes of cancer. For each gene, we compare the mean 426 | expression in the second class (rhabdomyosarcoma) to the mean 427 | expression in the fourth class (Burkitt’s lymphoma). Performing a 428 | standard two-sample $t$-test 429 | using `ttest_ind()` from `scipy.stats` on the $11$th 430 | gene produces a test-statistic of -2.09 and an associated $p$-value 431 | of 0.0412, suggesting modest evidence of a difference in mean 432 | expression levels between the two cancer types. 433 | 434 | ```{python} 435 | D2 = D[lambda df:df['Y'] == 2] 436 | D4 = D[lambda df:df['Y'] == 4] 437 | gene_11 = 'G0011' 438 | observedT, pvalue = ttest_ind(D2[gene_11], 439 | D4[gene_11], 440 | equal_var=True) 441 | observedT, pvalue 442 | 443 | ``` 444 | 445 | 446 | However, this $p$-value relies on the assumption that under the null 447 | hypothesis of no difference between the two groups, the test statistic 448 | follows a $t$-distribution with $29+25-2=52$ degrees of freedom. 449 | Instead of using this theoretical null distribution, we can randomly 450 | split the 54 patients into two groups of 29 and 25, and compute a new 451 | test statistic. Under the null hypothesis of no difference between 452 | the groups, this new test statistic should have the same distribution 453 | as our original one. 
Repeating this process 10,000 times allows us to 454 | approximate the null distribution of the test statistic. We compute 455 | the fraction of the time that our observed test statistic exceeds the 456 | test statistics obtained via re-sampling. 457 | 458 | ```{python} 459 | B = 10000 460 | Tnull = np.empty(B) 461 | D_ = np.hstack([D2[gene_11], D4[gene_11]]) 462 | n_ = D2[gene_11].shape[0] 463 | D_null = D_.copy() 464 | for b in range(B): 465 | rng.shuffle(D_null) 466 | ttest_ = ttest_ind(D_null[:n_], 467 | D_null[n_:], 468 | equal_var=True) 469 | Tnull[b] = ttest_.statistic 470 | (np.abs(Tnull) < np.abs(observedT)).mean() 471 | 472 | ``` 473 | 474 | 475 | This fraction, 0.0398, 476 | is our re-sampling-based $p$-value. 477 | It is almost identical to the $p$-value of 0.0412 obtained using the theoretical null distribution. 478 | We can plot a histogram of the re-sampling-based test statistics in order to reproduce Figure 13.7. 479 | 480 | ```{python} 481 | fig, ax = plt.subplots(figsize=(8,8)) 482 | ax.hist(Tnull, 483 | bins=100, 484 | density=True, 485 | facecolor='y', 486 | label='Null') 487 | xval = np.linspace(-4.2, 4.2, 1001) 488 | ax.plot(xval, 489 | t_dbn.pdf(xval, D_.shape[0]-2), 490 | c='r') 491 | ax.axvline(observedT, 492 | c='b', 493 | label='Observed') 494 | ax.legend() 495 | ax.set_xlabel("Null Distribution of Test Statistic"); 496 | 497 | ``` 498 | The re-sampling-based null distribution is almost identical to the theoretical null distribution, which is displayed in red. 499 | 500 | Finally, we implement the plug-in re-sampling FDR approach outlined in 501 | Algorithm 13.4. Depending on the speed of your 502 | computer, calculating the FDR for all 2,308 genes in the `Khan` 503 | dataset may take a while. Hence, we will illustrate the approach on a 504 | random subset of 100 genes. For each gene, we first compute the 505 | observed test statistic, and then produce 10,000 re-sampled test 506 | statistics. This may take a few minutes to run. If you are in a rush, 507 | then you could set `B` equal to a smaller value (e.g. `B=500`). 508 | 509 | ```{python} 510 | m, B = 100, 10000 511 | idx = rng.choice(Khan['xtest'].columns, m, replace=False) 512 | T_vals = np.empty(m) 513 | Tnull_vals = np.empty((m, B)) 514 | 515 | for j in range(m): 516 | col = idx[j] 517 | T_vals[j] = ttest_ind(D2[col], 518 | D4[col], 519 | equal_var=True).statistic 520 | D_ = np.hstack([D2[col], D4[col]]) 521 | D_null = D_.copy() 522 | for b in range(B): 523 | rng.shuffle(D_null) 524 | ttest_ = ttest_ind(D_null[:n_], 525 | D_null[n_:], 526 | equal_var=True) 527 | Tnull_vals[j,b] = ttest_.statistic 528 | 529 | ``` 530 | 531 | Next, we compute the number of rejected null hypotheses $R$, the 532 | estimated number of false positives $\widehat{V}$, and the estimated 533 | FDR, for a range of threshold values $c$ in 534 | Algorithm 13.4. The threshold values are chosen 535 | using the absolute values of the test statistics from the 100 genes. 536 | 537 | ```{python} 538 | cutoffs = np.sort(np.abs(T_vals)) 539 | FDRs, Rs, Vs = np.empty((3, m)) 540 | for j in range(m): 541 | R = np.sum(np.abs(T_vals) >= cutoffs[j]) 542 | V = np.sum(np.abs(Tnull_vals) >= cutoffs[j]) / B 543 | Rs[j] = R 544 | Vs[j] = V 545 | FDRs[j] = V / R 546 | 547 | ``` 548 | 549 | Now, for any given FDR, we can find the genes that will be 550 | rejected. For example, with FDR controlled at 0.1, we reject 15 of the 551 | 100 null hypotheses. On average, we would expect about one or two of 552 | these genes (i.e. 
10% of 15) to be false discoveries. At an FDR of 553 | 0.2, we can reject the null hypothesis for 28 genes, of which we 554 | expect around six to be false discoveries. 555 | 556 | The variable `idx` stores which 557 | genes were included in our 100 randomly-selected genes. Let’s look at 558 | the genes whose estimated FDR is less than 0.1. 559 | 560 | ```{python} 561 | sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.1].min()]) 562 | 563 | ``` 564 | 565 | At an FDR threshold of 0.2, more genes are selected, at the cost of having a higher expected 566 | proportion of false discoveries. 567 | 568 | ```{python} 569 | sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.2].min()]) 570 | 571 | ``` 572 | 573 | The next line generates Figure 13.11, which is similar 574 | to Figure 13.9, 575 | except that it is based on only a subset of the genes. 576 | 577 | ```{python} 578 | fig, ax = plt.subplots() 579 | ax.plot(Rs, FDRs, 'b', linewidth=3) 580 | ax.set_xlabel("Number of Rejections") 581 | ax.set_ylabel("False Discovery Rate"); 582 | 583 | ``` 584 | 585 | 586 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023, intro-stat-learning 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := help 2 | 3 | venv: 4 | @curl -LsSf https://astral.sh/uv/install.sh | sh 5 | @uv venv --python 3.12 6 | 7 | 8 | .PHONY: install 9 | install: venv ## Install all dependencies (in the virtual environment) defined in requirements.txt 10 | @uv pip install --upgrade pip 11 | @uv pip install -r requirements.txt 12 | 13 | 14 | .PHONY: help 15 | help: ## Display this help screen 16 | @echo -e "\033[1mAvailable commands:\033[0m" 17 | @grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-18s\033[0m %s\n", $$1, $$2}' | sort 18 | 19 | 20 | .PHONY: jupyter 21 | jupyter: install ## Install and start jupyter Lab 22 | @uv run pip install jupyterlab 23 | @uv run jupyter lab 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ISLP_labs 2 | 3 | [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/intro-stat-learning/ISLP_Labs) 4 | 5 | 6 | [![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](#contributors-) 7 | 8 | 9 | ## Authors 10 | 11 | - Trevor Hastie 12 | 13 | - Gareth James 14 | 15 | - Jonathan Taylor 16 | 17 | - Robert Tibshirani 18 | 19 | - Daniela Witten 20 | 21 | ### ISLP 22 | 23 | Please ensure you have followed the installation instructions for 24 | [ISLP](https://github.com/intro-stat-learning/ISLP). This will address 25 | installation of [jupyterlab](https://github.com/jupyterlab/jupyterlab) 26 | if necessary, which is not included as a requirement of the labs. 27 | 28 | ### Up-to-date version of labs for ISLP. 29 | 30 | This repo will track labs for ISLP as their source code changes. The 31 | intent is that building a virtual environment with 32 | `requirements.txt` will reproduce the results in this repo. 33 | 34 | To install the current version of the requirements run 35 | 36 | ``` 37 | pip install -r https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/v2.2/requirements.txt; 38 | ``` 39 | 40 | The labs can now be run via: 41 | 42 | ``` 43 | jupyter lab Ch02-statlearn-lab.ipynb 44 | ``` 45 | 46 | ## Using make 47 | 48 | If `make` is available on your machine, the steps above can be replaced 49 | 50 | ``` 51 | make install 52 | make jupyter 53 | ``` 54 | 55 | # Zip / tarball 56 | 57 | You can download all the labs as a `.zip` or `.tar.gz` [here](https://github.com/intro-stat-learning/ISLP_labs/releases/tag/v2.2) 58 | 59 | 60 | ## Contributors ✨ 61 | 62 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 |
- tibshirani 💻 🖋
- trevorhastie 💻 🖋
- danielawitten 💻 🖋
- Jonathan Taylor 💻 🖋
- Thomas Schmelzer 💻
78 | 79 | 80 | 81 | 82 | 83 | 84 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 85 | -------------------------------------------------------------------------------- /book_images/Cape_Weaver.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Cape_Weaver.jpg -------------------------------------------------------------------------------- /book_images/Flamingo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Flamingo.jpg -------------------------------------------------------------------------------- /book_images/Hawk_Fountain.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Hawk_Fountain.jpg -------------------------------------------------------------------------------- /book_images/Hawk_cropped.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Hawk_cropped.jpg -------------------------------------------------------------------------------- /book_images/Lhasa_Apso.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Lhasa_Apso.jpg -------------------------------------------------------------------------------- /book_images/Sleeping_Cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Sleeping_Cat.jpg -------------------------------------------------------------------------------- /imagenet_class_index.json: -------------------------------------------------------------------------------- 1 | {"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", 
"mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": 
["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", 
"schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": 
["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], 
"425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": 
["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": 
["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], 
"731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", 
"stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", 
"mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | scipy==1.11.4 3 | pandas==2.2.2 4 | lxml==5.2.2 5 | scikit-learn==1.5.0 6 | joblib==1.4.2 7 | statsmodels==0.14.2 8 | lifelines==0.28.0 9 | pygam==0.9.1 10 | l0bnb==1.0.0 11 | torch==2.3.0 12 | torchvision==0.18.0 13 | pytorch-lightning==2.2.5 14 | torchinfo==1.8.0 15 | torchmetrics==1.4.0.post0 16 | ISLP==0.4.0 17 | --------------------------------------------------------------------------------