├── .all-contributorsrc
├── .devcontainer
    ├── devcontainer.json
    └── startup.sh
├── .github
    └── workflows
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── Auto.csv
├── Auto.data
├── Ch02-statlearn-lab.Rmd
├── Ch02-statlearn-lab.ipynb
├── Ch03-linreg-lab.Rmd
├── Ch03-linreg-lab.ipynb
├── Ch04-classification-lab.Rmd
├── Ch04-classification-lab.ipynb
├── Ch05-resample-lab.Rmd
├── Ch05-resample-lab.ipynb
├── Ch06-varselect-lab.Rmd
├── Ch06-varselect-lab.ipynb
├── Ch07-nonlin-lab.Rmd
├── Ch07-nonlin-lab.ipynb
├── Ch08-baggboost-lab.Rmd
├── Ch08-baggboost-lab.ipynb
├── Ch09-svm-lab.Rmd
├── Ch09-svm-lab.ipynb
├── Ch10-deeplearning-lab.Rmd
├── Ch10-deeplearning-lab.ipynb
├── Ch11-surv-lab.Rmd
├── Ch11-surv-lab.ipynb
├── Ch12-unsup-lab.Rmd
├── Ch12-unsup-lab.ipynb
├── Ch13-multiple-lab.Rmd
├── Ch13-multiple-lab.ipynb
├── LICENSE
├── Makefile
├── README.md
├── book_images
    ├── Cape_Weaver.jpg
    ├── Flamingo.jpg
    ├── Hawk_Fountain.jpg
    ├── Hawk_cropped.jpg
    ├── Lhasa_Apso.jpg
    └── Sleeping_Cat.jpg
├── imagenet_class_index.json
└── requirements.txt


/.all-contributorsrc:
--------------------------------------------------------------------------------
 1 | {
 2 |   "files": [
 3 |     "README.md"
 4 |   ],
 5 |   "imageSize": 100,
 6 |   "commit": false,
 7 |   "commitType": "docs",
 8 |   "commitConvention": "angular",
 9 |   "contributors": [
10 |     {
11 |       "login": "tibshirani",
12 |       "name": "tibshirani",
13 |       "avatar_url": "https://avatars.githubusercontent.com/u/2848609?v=4",
14 |       "profile": "https://github.com/tibshirani",
15 |       "contributions": [
16 |         "code",
17 |         "content"
18 |       ]
19 |     },
20 |     {
21 |       "login": "trevorhastie",
22 |       "name": "trevorhastie",
23 |       "avatar_url": "https://avatars.githubusercontent.com/u/13293253?v=4",
24 |       "profile": "https://web.stanford.edu/~hastie/",
25 |       "contributions": [
26 |         "code",
27 |         "content"
28 |       ]
29 |     },
30 |     {
31 |       "login": "danielawitten",
32 |       "name": "danielawitten",
33 |       "avatar_url": "https://avatars.githubusercontent.com/u/12654191?v=4",
34 |       "profile": "https://github.com/danielawitten",
35 |       "contributions": [
36 |         "code",
37 |         "content"
38 |       ]
39 |     },
40 |     {
41 |       "login": "jonathan-taylor",
42 |       "name": "Jonathan Taylor",
43 |       "avatar_url": "https://avatars.githubusercontent.com/u/341611?v=4",
44 |       "profile": "http://statweb.stanford.edu/~jtaylo",
45 |       "contributions": [
46 |         "code",
47 |         "content"
48 |       ]
49 |     },
50 |     {
51 |       "login": "tschm",
52 |       "name": "Thomas Schmelzer",
53 |       "avatar_url": "https://avatars.githubusercontent.com/u/2046079?v=4",
54 |       "profile": "https://github.com/tschm",
55 |       "contributions": [
56 |         "code"
57 |       ]
58 |     }
59 |   ],
60 |   "contributorsPerLine": 7,
61 |   "skipCi": true,
62 |   "repoType": "github",
63 |   "repoHost": "https://github.com",
64 |   "projectName": "ISLP_labs",
65 |   "projectOwner": "intro-stat-learning"
66 | }
67 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "Jupyter Environment",
 3 |     "image": "mcr.microsoft.com/devcontainers/python:3",
 4 |     "features": {
 5 |         "ghcr.io/devcontainers/features/python:1": {}
 6 |     },
 7 |     "customizations": {
 8 |         "vscode": {
 9 |             "extensions": [
10 |                 "ms-python.python",
11 |                 "ms-toolsai.jupyter",
12 |                 "ms-toolsai.jupyter-keymap",
13 |                 "ms-toolsai.jupyter-renderers",
14 |                 "ms-toolsai.vscode-jupyter-cell-tags",
15 |                 "ms-toolsai.vscode-jupyter-slideshow"
16 |             ]
17 |         }
18 |     },
19 |     "onCreateCommand": ".devcontainer/startup.sh",
20 |     "forwardPorts": [8888],
21 |     "postStartCommand": "uv run jupyter lab --no-browser --ip=0.0.0.0 --port=8888 --NotebookApp.token='' --NotebookApp.password=''"
22 | }
23 | 


--------------------------------------------------------------------------------
/.devcontainer/startup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | curl -LsSf https://astral.sh/uv/install.sh | sh
3 | uv venv --python 3.12
4 | uv pip install --no-cache-dir jupyterlab
5 | uv pip install --no-cache-dir -r requirements.txt
6 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Bump version and publish
 2 | 
 3 | on:
 4 |   #push:
 5 |   workflow_dispatch
 6 | 
 7 | permissions:
 8 |   contents: write
 9 | 
10 | jobs:
11 |   tagging:
12 |     runs-on: ubuntu-latest
13 |     outputs:
14 |       new_tag: ${{ steps.tag_step.outputs.new_tag }}
15 | 
16 |     steps:
17 |       - name: Generate Tag
18 |         id: tag_step
19 |         uses: tschm/cradle/actions/tag@v0.1.57
20 |         with:
21 |           github_token: ${{ secrets.GITHUB_TOKEN }}
22 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # testing all notebooks
 2 | name: TEST
 3 | 
 4 | on:
 5 |   - push
 6 | 
 7 | jobs:
 8 |   test:
 9 |     runs-on: ubuntu-latest
10 | 
11 |     strategy:
12 |       matrix:
13 |         python-version: ['3.11', '3.12']
14 |         notebook: [ Ch02-statlearn-lab.ipynb,
15 |                     Ch03-linreg-lab.ipynb,
16 |                     Ch04-classification-lab.ipynb,
17 |                     Ch05-resample-lab.ipynb,
18 |                     Ch06-varselect-lab.ipynb,
19 |                     Ch07-nonlin-lab.ipynb,
20 |                     Ch08-baggboost-lab.ipynb,
21 |                     Ch09-svm-lab.ipynb,
22 |                     Ch10-deeplearning-lab.ipynb,
23 |                     Ch11-surv-lab.ipynb,
24 |                     Ch12-unsup-lab.ipynb,
25 |                     Ch13-multiple-lab.ipynb]
26 |         exclude:
27 |           - python-version: '3.11'
28 |             notebook: Ch10-deeplearning-lab.ipynb
29 | 
30 |       fail-fast: false
31 | 
32 |     steps:
33 |     - uses: actions/checkout@v4
34 | 
35 |     - name: Install uv
36 |       uses: astral-sh/setup-uv@v5
37 |       with:
38 |         version: "0.5.15"
39 | 
40 |     - name: Set up Python
41 |       shell: bash
42 |       run: |
43 |         uv python install ${{ matrix.python-version }}
44 | 
45 |     - name: Create venv
46 |       shell: bash
47 |       run: uv venv --python ${{ matrix.python-version }}
48 | 
49 |     - name: Install requirements
50 |       shell: bash
51 |       run: |
52 |         uv pip install --upgrade pip
53 |         uv pip install -r requirements.txt
54 |         uv pip install pytest nbmake
55 | 
56 |     - name: Test
57 |       shell: bash
58 |       run: |
59 |         uv run pytest --nbmake --nbmake-timeout=3600 -vv ${{ matrix.notebook }}
60 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # Jupyter Notebook
 7 | .ipynb_checkpoints/
 8 | *.ipynb_checkpoints/
 9 | *.ipynb_meta
10 | 
11 | # Python environments
12 | env/
13 | venv/
14 | .venv/
15 | ENV/
16 | env.bak/
17 | venv.bak/
18 | .spyderproject
19 | .spyproject
20 | .ropeproject
21 | 
22 | # Distribution / packaging
23 | .Python
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | pip-wheel-metadata/
37 | share/python-wheels/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 | 
42 | # PyInstaller
43 | *.manifest
44 | *.spec
45 | 
46 | # Operating System
47 | .DS_Store
48 | 
49 | # IDEs
50 | .vscode/
51 | .idea/
52 | 
53 | 


--------------------------------------------------------------------------------
/Auto.csv:
--------------------------------------------------------------------------------
  1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
  2 | 18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
  3 | 15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
  4 | 18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
  5 | 16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
  6 | 17.0,8,302.0,140,3449,10.5,70,1,ford torino
  7 | 15.0,8,429.0,198,4341,10.0,70,1,ford galaxie 500
  8 | 14.0,8,454.0,220,4354,9.0,70,1,chevrolet impala
  9 | 14.0,8,440.0,215,4312,8.5,70,1,plymouth fury iii
 10 | 14.0,8,455.0,225,4425,10.0,70,1,pontiac catalina
 11 | 15.0,8,390.0,190,3850,8.5,70,1,amc ambassador dpl
 12 | 15.0,8,383.0,170,3563,10.0,70,1,dodge challenger se
 13 | 14.0,8,340.0,160,3609,8.0,70,1,plymouth 'cuda 340
 14 | 15.0,8,400.0,150,3761,9.5,70,1,chevrolet monte carlo
 15 | 14.0,8,455.0,225,3086,10.0,70,1,buick estate wagon (sw)
 16 | 24.0,4,113.0,95,2372,15.0,70,3,toyota corona mark ii
 17 | 22.0,6,198.0,95,2833,15.5,70,1,plymouth duster
 18 | 18.0,6,199.0,97,2774,15.5,70,1,amc hornet
 19 | 21.0,6,200.0,85,2587,16.0,70,1,ford maverick
 20 | 27.0,4,97.0,88,2130,14.5,70,3,datsun pl510
 21 | 26.0,4,97.0,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan
 22 | 25.0,4,110.0,87,2672,17.5,70,2,peugeot 504
 23 | 24.0,4,107.0,90,2430,14.5,70,2,audi 100 ls
 24 | 25.0,4,104.0,95,2375,17.5,70,2,saab 99e
 25 | 26.0,4,121.0,113,2234,12.5,70,2,bmw 2002
 26 | 21.0,6,199.0,90,2648,15.0,70,1,amc gremlin
 27 | 10.0,8,360.0,215,4615,14.0,70,1,ford f250
 28 | 10.0,8,307.0,200,4376,15.0,70,1,chevy c20
 29 | 11.0,8,318.0,210,4382,13.5,70,1,dodge d200
 30 | 9.0,8,304.0,193,4732,18.5,70,1,hi 1200d
 31 | 27.0,4,97.0,88,2130,14.5,71,3,datsun pl510
 32 | 28.0,4,140.0,90,2264,15.5,71,1,chevrolet vega 2300
 33 | 25.0,4,113.0,95,2228,14.0,71,3,toyota corona
 34 | 19.0,6,232.0,100,2634,13.0,71,1,amc gremlin
 35 | 16.0,6,225.0,105,3439,15.5,71,1,plymouth satellite custom
 36 | 17.0,6,250.0,100,3329,15.5,71,1,chevrolet chevelle malibu
 37 | 19.0,6,250.0,88,3302,15.5,71,1,ford torino 500
 38 | 18.0,6,232.0,100,3288,15.5,71,1,amc matador
 39 | 14.0,8,350.0,165,4209,12.0,71,1,chevrolet impala
 40 | 14.0,8,400.0,175,4464,11.5,71,1,pontiac catalina brougham
 41 | 14.0,8,351.0,153,4154,13.5,71,1,ford galaxie 500
 42 | 14.0,8,318.0,150,4096,13.0,71,1,plymouth fury iii
 43 | 12.0,8,383.0,180,4955,11.5,71,1,dodge monaco (sw)
 44 | 13.0,8,400.0,170,4746,12.0,71,1,ford country squire (sw)
 45 | 13.0,8,400.0,175,5140,12.0,71,1,pontiac safari (sw)
 46 | 18.0,6,258.0,110,2962,13.5,71,1,amc hornet sportabout (sw)
 47 | 22.0,4,140.0,72,2408,19.0,71,1,chevrolet vega (sw)
 48 | 19.0,6,250.0,100,3282,15.0,71,1,pontiac firebird
 49 | 18.0,6,250.0,88,3139,14.5,71,1,ford mustang
 50 | 23.0,4,122.0,86,2220,14.0,71,1,mercury capri 2000
 51 | 28.0,4,116.0,90,2123,14.0,71,2,opel 1900
 52 | 30.0,4,79.0,70,2074,19.5,71,2,peugeot 304
 53 | 30.0,4,88.0,76,2065,14.5,71,2,fiat 124b
 54 | 31.0,4,71.0,65,1773,19.0,71,3,toyota corolla 1200
 55 | 35.0,4,72.0,69,1613,18.0,71,3,datsun 1200
 56 | 27.0,4,97.0,60,1834,19.0,71,2,volkswagen model 111
 57 | 26.0,4,91.0,70,1955,20.5,71,1,plymouth cricket
 58 | 24.0,4,113.0,95,2278,15.5,72,3,toyota corona hardtop
 59 | 25.0,4,97.5,80,2126,17.0,72,1,dodge colt hardtop
 60 | 23.0,4,97.0,54,2254,23.5,72,2,volkswagen type 3
 61 | 20.0,4,140.0,90,2408,19.5,72,1,chevrolet vega
 62 | 21.0,4,122.0,86,2226,16.5,72,1,ford pinto runabout
 63 | 13.0,8,350.0,165,4274,12.0,72,1,chevrolet impala
 64 | 14.0,8,400.0,175,4385,12.0,72,1,pontiac catalina
 65 | 15.0,8,318.0,150,4135,13.5,72,1,plymouth fury iii
 66 | 14.0,8,351.0,153,4129,13.0,72,1,ford galaxie 500
 67 | 17.0,8,304.0,150,3672,11.5,72,1,amc ambassador sst
 68 | 11.0,8,429.0,208,4633,11.0,72,1,mercury marquis
 69 | 13.0,8,350.0,155,4502,13.5,72,1,buick lesabre custom
 70 | 12.0,8,350.0,160,4456,13.5,72,1,oldsmobile delta 88 royale
 71 | 13.0,8,400.0,190,4422,12.5,72,1,chrysler newport royal
 72 | 19.0,3,70.0,97,2330,13.5,72,3,mazda rx2 coupe
 73 | 15.0,8,304.0,150,3892,12.5,72,1,amc matador (sw)
 74 | 13.0,8,307.0,130,4098,14.0,72,1,chevrolet chevelle concours (sw)
 75 | 13.0,8,302.0,140,4294,16.0,72,1,ford gran torino (sw)
 76 | 14.0,8,318.0,150,4077,14.0,72,1,plymouth satellite custom (sw)
 77 | 18.0,4,121.0,112,2933,14.5,72,2,volvo 145e (sw)
 78 | 22.0,4,121.0,76,2511,18.0,72,2,volkswagen 411 (sw)
 79 | 21.0,4,120.0,87,2979,19.5,72,2,peugeot 504 (sw)
 80 | 26.0,4,96.0,69,2189,18.0,72,2,renault 12 (sw)
 81 | 22.0,4,122.0,86,2395,16.0,72,1,ford pinto (sw)
 82 | 28.0,4,97.0,92,2288,17.0,72,3,datsun 510 (sw)
 83 | 23.0,4,120.0,97,2506,14.5,72,3,toyouta corona mark ii (sw)
 84 | 28.0,4,98.0,80,2164,15.0,72,1,dodge colt (sw)
 85 | 27.0,4,97.0,88,2100,16.5,72,3,toyota corolla 1600 (sw)
 86 | 13.0,8,350.0,175,4100,13.0,73,1,buick century 350
 87 | 14.0,8,304.0,150,3672,11.5,73,1,amc matador
 88 | 13.0,8,350.0,145,3988,13.0,73,1,chevrolet malibu
 89 | 14.0,8,302.0,137,4042,14.5,73,1,ford gran torino
 90 | 15.0,8,318.0,150,3777,12.5,73,1,dodge coronet custom
 91 | 12.0,8,429.0,198,4952,11.5,73,1,mercury marquis brougham
 92 | 13.0,8,400.0,150,4464,12.0,73,1,chevrolet caprice classic
 93 | 13.0,8,351.0,158,4363,13.0,73,1,ford ltd
 94 | 14.0,8,318.0,150,4237,14.5,73,1,plymouth fury gran sedan
 95 | 13.0,8,440.0,215,4735,11.0,73,1,chrysler new yorker brougham
 96 | 12.0,8,455.0,225,4951,11.0,73,1,buick electra 225 custom
 97 | 13.0,8,360.0,175,3821,11.0,73,1,amc ambassador brougham
 98 | 18.0,6,225.0,105,3121,16.5,73,1,plymouth valiant
 99 | 16.0,6,250.0,100,3278,18.0,73,1,chevrolet nova custom
100 | 18.0,6,232.0,100,2945,16.0,73,1,amc hornet
101 | 18.0,6,250.0,88,3021,16.5,73,1,ford maverick
102 | 23.0,6,198.0,95,2904,16.0,73,1,plymouth duster
103 | 26.0,4,97.0,46,1950,21.0,73,2,volkswagen super beetle
104 | 11.0,8,400.0,150,4997,14.0,73,1,chevrolet impala
105 | 12.0,8,400.0,167,4906,12.5,73,1,ford country
106 | 13.0,8,360.0,170,4654,13.0,73,1,plymouth custom suburb
107 | 12.0,8,350.0,180,4499,12.5,73,1,oldsmobile vista cruiser
108 | 18.0,6,232.0,100,2789,15.0,73,1,amc gremlin
109 | 20.0,4,97.0,88,2279,19.0,73,3,toyota carina
110 | 21.0,4,140.0,72,2401,19.5,73,1,chevrolet vega
111 | 22.0,4,108.0,94,2379,16.5,73,3,datsun 610
112 | 18.0,3,70.0,90,2124,13.5,73,3,maxda rx3
113 | 19.0,4,122.0,85,2310,18.5,73,1,ford pinto
114 | 21.0,6,155.0,107,2472,14.0,73,1,mercury capri v6
115 | 26.0,4,98.0,90,2265,15.5,73,2,fiat 124 sport coupe
116 | 15.0,8,350.0,145,4082,13.0,73,1,chevrolet monte carlo s
117 | 16.0,8,400.0,230,4278,9.5,73,1,pontiac grand prix
118 | 29.0,4,68.0,49,1867,19.5,73,2,fiat 128
119 | 24.0,4,116.0,75,2158,15.5,73,2,opel manta
120 | 20.0,4,114.0,91,2582,14.0,73,2,audi 100ls
121 | 19.0,4,121.0,112,2868,15.5,73,2,volvo 144ea
122 | 15.0,8,318.0,150,3399,11.0,73,1,dodge dart custom
123 | 24.0,4,121.0,110,2660,14.0,73,2,saab 99le
124 | 20.0,6,156.0,122,2807,13.5,73,3,toyota mark ii
125 | 11.0,8,350.0,180,3664,11.0,73,1,oldsmobile omega
126 | 20.0,6,198.0,95,3102,16.5,74,1,plymouth duster
127 | 19.0,6,232.0,100,2901,16.0,74,1,amc hornet
128 | 15.0,6,250.0,100,3336,17.0,74,1,chevrolet nova
129 | 31.0,4,79.0,67,1950,19.0,74,3,datsun b210
130 | 26.0,4,122.0,80,2451,16.5,74,1,ford pinto
131 | 32.0,4,71.0,65,1836,21.0,74,3,toyota corolla 1200
132 | 25.0,4,140.0,75,2542,17.0,74,1,chevrolet vega
133 | 16.0,6,250.0,100,3781,17.0,74,1,chevrolet chevelle malibu classic
134 | 16.0,6,258.0,110,3632,18.0,74,1,amc matador
135 | 18.0,6,225.0,105,3613,16.5,74,1,plymouth satellite sebring
136 | 16.0,8,302.0,140,4141,14.0,74,1,ford gran torino
137 | 13.0,8,350.0,150,4699,14.5,74,1,buick century luxus (sw)
138 | 14.0,8,318.0,150,4457,13.5,74,1,dodge coronet custom (sw)
139 | 14.0,8,302.0,140,4638,16.0,74,1,ford gran torino (sw)
140 | 14.0,8,304.0,150,4257,15.5,74,1,amc matador (sw)
141 | 29.0,4,98.0,83,2219,16.5,74,2,audi fox
142 | 26.0,4,79.0,67,1963,15.5,74,2,volkswagen dasher
143 | 26.0,4,97.0,78,2300,14.5,74,2,opel manta
144 | 31.0,4,76.0,52,1649,16.5,74,3,toyota corona
145 | 32.0,4,83.0,61,2003,19.0,74,3,datsun 710
146 | 28.0,4,90.0,75,2125,14.5,74,1,dodge colt
147 | 24.0,4,90.0,75,2108,15.5,74,2,fiat 128
148 | 26.0,4,116.0,75,2246,14.0,74,2,fiat 124 tc
149 | 24.0,4,120.0,97,2489,15.0,74,3,honda civic
150 | 26.0,4,108.0,93,2391,15.5,74,3,subaru
151 | 31.0,4,79.0,67,2000,16.0,74,2,fiat x1.9
152 | 19.0,6,225.0,95,3264,16.0,75,1,plymouth valiant custom
153 | 18.0,6,250.0,105,3459,16.0,75,1,chevrolet nova
154 | 15.0,6,250.0,72,3432,21.0,75,1,mercury monarch
155 | 15.0,6,250.0,72,3158,19.5,75,1,ford maverick
156 | 16.0,8,400.0,170,4668,11.5,75,1,pontiac catalina
157 | 15.0,8,350.0,145,4440,14.0,75,1,chevrolet bel air
158 | 16.0,8,318.0,150,4498,14.5,75,1,plymouth grand fury
159 | 14.0,8,351.0,148,4657,13.5,75,1,ford ltd
160 | 17.0,6,231.0,110,3907,21.0,75,1,buick century
161 | 16.0,6,250.0,105,3897,18.5,75,1,chevroelt chevelle malibu
162 | 15.0,6,258.0,110,3730,19.0,75,1,amc matador
163 | 18.0,6,225.0,95,3785,19.0,75,1,plymouth fury
164 | 21.0,6,231.0,110,3039,15.0,75,1,buick skyhawk
165 | 20.0,8,262.0,110,3221,13.5,75,1,chevrolet monza 2+2
166 | 13.0,8,302.0,129,3169,12.0,75,1,ford mustang ii
167 | 29.0,4,97.0,75,2171,16.0,75,3,toyota corolla
168 | 23.0,4,140.0,83,2639,17.0,75,1,ford pinto
169 | 20.0,6,232.0,100,2914,16.0,75,1,amc gremlin
170 | 23.0,4,140.0,78,2592,18.5,75,1,pontiac astro
171 | 24.0,4,134.0,96,2702,13.5,75,3,toyota corona
172 | 25.0,4,90.0,71,2223,16.5,75,2,volkswagen dasher
173 | 24.0,4,119.0,97,2545,17.0,75,3,datsun 710
174 | 18.0,6,171.0,97,2984,14.5,75,1,ford pinto
175 | 29.0,4,90.0,70,1937,14.0,75,2,volkswagen rabbit
176 | 19.0,6,232.0,90,3211,17.0,75,1,amc pacer
177 | 23.0,4,115.0,95,2694,15.0,75,2,audi 100ls
178 | 23.0,4,120.0,88,2957,17.0,75,2,peugeot 504
179 | 22.0,4,121.0,98,2945,14.5,75,2,volvo 244dl
180 | 25.0,4,121.0,115,2671,13.5,75,2,saab 99le
181 | 33.0,4,91.0,53,1795,17.5,75,3,honda civic cvcc
182 | 28.0,4,107.0,86,2464,15.5,76,2,fiat 131
183 | 25.0,4,116.0,81,2220,16.9,76,2,opel 1900
184 | 25.0,4,140.0,92,2572,14.9,76,1,capri ii
185 | 26.0,4,98.0,79,2255,17.7,76,1,dodge colt
186 | 27.0,4,101.0,83,2202,15.3,76,2,renault 12tl
187 | 17.5,8,305.0,140,4215,13.0,76,1,chevrolet chevelle malibu classic
188 | 16.0,8,318.0,150,4190,13.0,76,1,dodge coronet brougham
189 | 15.5,8,304.0,120,3962,13.9,76,1,amc matador
190 | 14.5,8,351.0,152,4215,12.8,76,1,ford gran torino
191 | 22.0,6,225.0,100,3233,15.4,76,1,plymouth valiant
192 | 22.0,6,250.0,105,3353,14.5,76,1,chevrolet nova
193 | 24.0,6,200.0,81,3012,17.6,76,1,ford maverick
194 | 22.5,6,232.0,90,3085,17.6,76,1,amc hornet
195 | 29.0,4,85.0,52,2035,22.2,76,1,chevrolet chevette
196 | 24.5,4,98.0,60,2164,22.1,76,1,chevrolet woody
197 | 29.0,4,90.0,70,1937,14.2,76,2,vw rabbit
198 | 33.0,4,91.0,53,1795,17.4,76,3,honda civic
199 | 20.0,6,225.0,100,3651,17.7,76,1,dodge aspen se
200 | 18.0,6,250.0,78,3574,21.0,76,1,ford granada ghia
201 | 18.5,6,250.0,110,3645,16.2,76,1,pontiac ventura sj
202 | 17.5,6,258.0,95,3193,17.8,76,1,amc pacer d/l
203 | 29.5,4,97.0,71,1825,12.2,76,2,volkswagen rabbit
204 | 32.0,4,85.0,70,1990,17.0,76,3,datsun b-210
205 | 28.0,4,97.0,75,2155,16.4,76,3,toyota corolla
206 | 26.5,4,140.0,72,2565,13.6,76,1,ford pinto
207 | 20.0,4,130.0,102,3150,15.7,76,2,volvo 245
208 | 13.0,8,318.0,150,3940,13.2,76,1,plymouth volare premier v8
209 | 19.0,4,120.0,88,3270,21.9,76,2,peugeot 504
210 | 19.0,6,156.0,108,2930,15.5,76,3,toyota mark ii
211 | 16.5,6,168.0,120,3820,16.7,76,2,mercedes-benz 280s
212 | 16.5,8,350.0,180,4380,12.1,76,1,cadillac seville
213 | 13.0,8,350.0,145,4055,12.0,76,1,chevy c10
214 | 13.0,8,302.0,130,3870,15.0,76,1,ford f108
215 | 13.0,8,318.0,150,3755,14.0,76,1,dodge d100
216 | 31.5,4,98.0,68,2045,18.5,77,3,honda accord cvcc
217 | 30.0,4,111.0,80,2155,14.8,77,1,buick opel isuzu deluxe
218 | 36.0,4,79.0,58,1825,18.6,77,2,renault 5 gtl
219 | 25.5,4,122.0,96,2300,15.5,77,1,plymouth arrow gs
220 | 33.5,4,85.0,70,1945,16.8,77,3,datsun f-10 hatchback
221 | 17.5,8,305.0,145,3880,12.5,77,1,chevrolet caprice classic
222 | 17.0,8,260.0,110,4060,19.0,77,1,oldsmobile cutlass supreme
223 | 15.5,8,318.0,145,4140,13.7,77,1,dodge monaco brougham
224 | 15.0,8,302.0,130,4295,14.9,77,1,mercury cougar brougham
225 | 17.5,6,250.0,110,3520,16.4,77,1,chevrolet concours
226 | 20.5,6,231.0,105,3425,16.9,77,1,buick skylark
227 | 19.0,6,225.0,100,3630,17.7,77,1,plymouth volare custom
228 | 18.5,6,250.0,98,3525,19.0,77,1,ford granada
229 | 16.0,8,400.0,180,4220,11.1,77,1,pontiac grand prix lj
230 | 15.5,8,350.0,170,4165,11.4,77,1,chevrolet monte carlo landau
231 | 15.5,8,400.0,190,4325,12.2,77,1,chrysler cordoba
232 | 16.0,8,351.0,149,4335,14.5,77,1,ford thunderbird
233 | 29.0,4,97.0,78,1940,14.5,77,2,volkswagen rabbit custom
234 | 24.5,4,151.0,88,2740,16.0,77,1,pontiac sunbird coupe
235 | 26.0,4,97.0,75,2265,18.2,77,3,toyota corolla liftback
236 | 25.5,4,140.0,89,2755,15.8,77,1,ford mustang ii 2+2
237 | 30.5,4,98.0,63,2051,17.0,77,1,chevrolet chevette
238 | 33.5,4,98.0,83,2075,15.9,77,1,dodge colt m/m
239 | 30.0,4,97.0,67,1985,16.4,77,3,subaru dl
240 | 30.5,4,97.0,78,2190,14.1,77,2,volkswagen dasher
241 | 22.0,6,146.0,97,2815,14.5,77,3,datsun 810
242 | 21.5,4,121.0,110,2600,12.8,77,2,bmw 320i
243 | 21.5,3,80.0,110,2720,13.5,77,3,mazda rx-4
244 | 43.1,4,90.0,48,1985,21.5,78,2,volkswagen rabbit custom diesel
245 | 36.1,4,98.0,66,1800,14.4,78,1,ford fiesta
246 | 32.8,4,78.0,52,1985,19.4,78,3,mazda glc deluxe
247 | 39.4,4,85.0,70,2070,18.6,78,3,datsun b210 gx
248 | 36.1,4,91.0,60,1800,16.4,78,3,honda civic cvcc
249 | 19.9,8,260.0,110,3365,15.5,78,1,oldsmobile cutlass salon brougham
250 | 19.4,8,318.0,140,3735,13.2,78,1,dodge diplomat
251 | 20.2,8,302.0,139,3570,12.8,78,1,mercury monarch ghia
252 | 19.2,6,231.0,105,3535,19.2,78,1,pontiac phoenix lj
253 | 20.5,6,200.0,95,3155,18.2,78,1,chevrolet malibu
254 | 20.2,6,200.0,85,2965,15.8,78,1,ford fairmont (auto)
255 | 25.1,4,140.0,88,2720,15.4,78,1,ford fairmont (man)
256 | 20.5,6,225.0,100,3430,17.2,78,1,plymouth volare
257 | 19.4,6,232.0,90,3210,17.2,78,1,amc concord
258 | 20.6,6,231.0,105,3380,15.8,78,1,buick century special
259 | 20.8,6,200.0,85,3070,16.7,78,1,mercury zephyr
260 | 18.6,6,225.0,110,3620,18.7,78,1,dodge aspen
261 | 18.1,6,258.0,120,3410,15.1,78,1,amc concord d/l
262 | 19.2,8,305.0,145,3425,13.2,78,1,chevrolet monte carlo landau
263 | 17.7,6,231.0,165,3445,13.4,78,1,buick regal sport coupe (turbo)
264 | 18.1,8,302.0,139,3205,11.2,78,1,ford futura
265 | 17.5,8,318.0,140,4080,13.7,78,1,dodge magnum xe
266 | 30.0,4,98.0,68,2155,16.5,78,1,chevrolet chevette
267 | 27.5,4,134.0,95,2560,14.2,78,3,toyota corona
268 | 27.2,4,119.0,97,2300,14.7,78,3,datsun 510
269 | 30.9,4,105.0,75,2230,14.5,78,1,dodge omni
270 | 21.1,4,134.0,95,2515,14.8,78,3,toyota celica gt liftback
271 | 23.2,4,156.0,105,2745,16.7,78,1,plymouth sapporo
272 | 23.8,4,151.0,85,2855,17.6,78,1,oldsmobile starfire sx
273 | 23.9,4,119.0,97,2405,14.9,78,3,datsun 200-sx
274 | 20.3,5,131.0,103,2830,15.9,78,2,audi 5000
275 | 17.0,6,163.0,125,3140,13.6,78,2,volvo 264gl
276 | 21.6,4,121.0,115,2795,15.7,78,2,saab 99gle
277 | 16.2,6,163.0,133,3410,15.8,78,2,peugeot 604sl
278 | 31.5,4,89.0,71,1990,14.9,78,2,volkswagen scirocco
279 | 29.5,4,98.0,68,2135,16.6,78,3,honda accord lx
280 | 21.5,6,231.0,115,3245,15.4,79,1,pontiac lemans v6
281 | 19.8,6,200.0,85,2990,18.2,79,1,mercury zephyr 6
282 | 22.3,4,140.0,88,2890,17.3,79,1,ford fairmont 4
283 | 20.2,6,232.0,90,3265,18.2,79,1,amc concord dl 6
284 | 20.6,6,225.0,110,3360,16.6,79,1,dodge aspen 6
285 | 17.0,8,305.0,130,3840,15.4,79,1,chevrolet caprice classic
286 | 17.6,8,302.0,129,3725,13.4,79,1,ford ltd landau
287 | 16.5,8,351.0,138,3955,13.2,79,1,mercury grand marquis
288 | 18.2,8,318.0,135,3830,15.2,79,1,dodge st. regis
289 | 16.9,8,350.0,155,4360,14.9,79,1,buick estate wagon (sw)
290 | 15.5,8,351.0,142,4054,14.3,79,1,ford country squire (sw)
291 | 19.2,8,267.0,125,3605,15.0,79,1,chevrolet malibu classic (sw)
292 | 18.5,8,360.0,150,3940,13.0,79,1,chrysler lebaron town @ country (sw)
293 | 31.9,4,89.0,71,1925,14.0,79,2,vw rabbit custom
294 | 34.1,4,86.0,65,1975,15.2,79,3,maxda glc deluxe
295 | 35.7,4,98.0,80,1915,14.4,79,1,dodge colt hatchback custom
296 | 27.4,4,121.0,80,2670,15.0,79,1,amc spirit dl
297 | 25.4,5,183.0,77,3530,20.1,79,2,mercedes benz 300d
298 | 23.0,8,350.0,125,3900,17.4,79,1,cadillac eldorado
299 | 27.2,4,141.0,71,3190,24.8,79,2,peugeot 504
300 | 23.9,8,260.0,90,3420,22.2,79,1,oldsmobile cutlass salon brougham
301 | 34.2,4,105.0,70,2200,13.2,79,1,plymouth horizon
302 | 34.5,4,105.0,70,2150,14.9,79,1,plymouth horizon tc3
303 | 31.8,4,85.0,65,2020,19.2,79,3,datsun 210
304 | 37.3,4,91.0,69,2130,14.7,79,2,fiat strada custom
305 | 28.4,4,151.0,90,2670,16.0,79,1,buick skylark limited
306 | 28.8,6,173.0,115,2595,11.3,79,1,chevrolet citation
307 | 26.8,6,173.0,115,2700,12.9,79,1,oldsmobile omega brougham
308 | 33.5,4,151.0,90,2556,13.2,79,1,pontiac phoenix
309 | 41.5,4,98.0,76,2144,14.7,80,2,vw rabbit
310 | 38.1,4,89.0,60,1968,18.8,80,3,toyota corolla tercel
311 | 32.1,4,98.0,70,2120,15.5,80,1,chevrolet chevette
312 | 37.2,4,86.0,65,2019,16.4,80,3,datsun 310
313 | 28.0,4,151.0,90,2678,16.5,80,1,chevrolet citation
314 | 26.4,4,140.0,88,2870,18.1,80,1,ford fairmont
315 | 24.3,4,151.0,90,3003,20.1,80,1,amc concord
316 | 19.1,6,225.0,90,3381,18.7,80,1,dodge aspen
317 | 34.3,4,97.0,78,2188,15.8,80,2,audi 4000
318 | 29.8,4,134.0,90,2711,15.5,80,3,toyota corona liftback
319 | 31.3,4,120.0,75,2542,17.5,80,3,mazda 626
320 | 37.0,4,119.0,92,2434,15.0,80,3,datsun 510 hatchback
321 | 32.2,4,108.0,75,2265,15.2,80,3,toyota corolla
322 | 46.6,4,86.0,65,2110,17.9,80,3,mazda glc
323 | 27.9,4,156.0,105,2800,14.4,80,1,dodge colt
324 | 40.8,4,85.0,65,2110,19.2,80,3,datsun 210
325 | 44.3,4,90.0,48,2085,21.7,80,2,vw rabbit c (diesel)
326 | 43.4,4,90.0,48,2335,23.7,80,2,vw dasher (diesel)
327 | 36.4,5,121.0,67,2950,19.9,80,2,audi 5000s (diesel)
328 | 30.0,4,146.0,67,3250,21.8,80,2,mercedes-benz 240d
329 | 44.6,4,91.0,67,1850,13.8,80,3,honda civic 1500 gl
330 | 33.8,4,97.0,67,2145,18.0,80,3,subaru dl
331 | 29.8,4,89.0,62,1845,15.3,80,2,vokswagen rabbit
332 | 32.7,6,168.0,132,2910,11.4,80,3,datsun 280-zx
333 | 23.7,3,70.0,100,2420,12.5,80,3,mazda rx-7 gs
334 | 35.0,4,122.0,88,2500,15.1,80,2,triumph tr7 coupe
335 | 32.4,4,107.0,72,2290,17.0,80,3,honda accord
336 | 27.2,4,135.0,84,2490,15.7,81,1,plymouth reliant
337 | 26.6,4,151.0,84,2635,16.4,81,1,buick skylark
338 | 25.8,4,156.0,92,2620,14.4,81,1,dodge aries wagon (sw)
339 | 23.5,6,173.0,110,2725,12.6,81,1,chevrolet citation
340 | 30.0,4,135.0,84,2385,12.9,81,1,plymouth reliant
341 | 39.1,4,79.0,58,1755,16.9,81,3,toyota starlet
342 | 39.0,4,86.0,64,1875,16.4,81,1,plymouth champ
343 | 35.1,4,81.0,60,1760,16.1,81,3,honda civic 1300
344 | 32.3,4,97.0,67,2065,17.8,81,3,subaru
345 | 37.0,4,85.0,65,1975,19.4,81,3,datsun 210 mpg
346 | 37.7,4,89.0,62,2050,17.3,81,3,toyota tercel
347 | 34.1,4,91.0,68,1985,16.0,81,3,mazda glc 4
348 | 34.7,4,105.0,63,2215,14.9,81,1,plymouth horizon 4
349 | 34.4,4,98.0,65,2045,16.2,81,1,ford escort 4w
350 | 29.9,4,98.0,65,2380,20.7,81,1,ford escort 2h
351 | 33.0,4,105.0,74,2190,14.2,81,2,volkswagen jetta
352 | 33.7,4,107.0,75,2210,14.4,81,3,honda prelude
353 | 32.4,4,108.0,75,2350,16.8,81,3,toyota corolla
354 | 32.9,4,119.0,100,2615,14.8,81,3,datsun 200sx
355 | 31.6,4,120.0,74,2635,18.3,81,3,mazda 626
356 | 28.1,4,141.0,80,3230,20.4,81,2,peugeot 505s turbo diesel
357 | 30.7,6,145.0,76,3160,19.6,81,2,volvo diesel
358 | 25.4,6,168.0,116,2900,12.6,81,3,toyota cressida
359 | 24.2,6,146.0,120,2930,13.8,81,3,datsun 810 maxima
360 | 22.4,6,231.0,110,3415,15.8,81,1,buick century
361 | 26.6,8,350.0,105,3725,19.0,81,1,oldsmobile cutlass ls
362 | 20.2,6,200.0,88,3060,17.1,81,1,ford granada gl
363 | 17.6,6,225.0,85,3465,16.6,81,1,chrysler lebaron salon
364 | 28.0,4,112.0,88,2605,19.6,82,1,chevrolet cavalier
365 | 27.0,4,112.0,88,2640,18.6,82,1,chevrolet cavalier wagon
366 | 34.0,4,112.0,88,2395,18.0,82,1,chevrolet cavalier 2-door
367 | 31.0,4,112.0,85,2575,16.2,82,1,pontiac j2000 se hatchback
368 | 29.0,4,135.0,84,2525,16.0,82,1,dodge aries se
369 | 27.0,4,151.0,90,2735,18.0,82,1,pontiac phoenix
370 | 24.0,4,140.0,92,2865,16.4,82,1,ford fairmont futura
371 | 36.0,4,105.0,74,1980,15.3,82,2,volkswagen rabbit l
372 | 37.0,4,91.0,68,2025,18.2,82,3,mazda glc custom l
373 | 31.0,4,91.0,68,1970,17.6,82,3,mazda glc custom
374 | 38.0,4,105.0,63,2125,14.7,82,1,plymouth horizon miser
375 | 36.0,4,98.0,70,2125,17.3,82,1,mercury lynx l
376 | 36.0,4,120.0,88,2160,14.5,82,3,nissan stanza xe
377 | 36.0,4,107.0,75,2205,14.5,82,3,honda accord
378 | 34.0,4,108.0,70,2245,16.9,82,3,toyota corolla
379 | 38.0,4,91.0,67,1965,15.0,82,3,honda civic
380 | 32.0,4,91.0,67,1965,15.7,82,3,honda civic (auto)
381 | 38.0,4,91.0,67,1995,16.2,82,3,datsun 310 gx
382 | 25.0,6,181.0,110,2945,16.4,82,1,buick century limited
383 | 38.0,6,262.0,85,3015,17.0,82,1,oldsmobile cutlass ciera (diesel)
384 | 26.0,4,156.0,92,2585,14.5,82,1,chrysler lebaron medallion
385 | 22.0,6,232.0,112,2835,14.7,82,1,ford granada l
386 | 32.0,4,144.0,96,2665,13.9,82,3,toyota celica gt
387 | 36.0,4,135.0,84,2370,13.0,82,1,dodge charger 2.2
388 | 27.0,4,151.0,90,2950,17.3,82,1,chevrolet camaro
389 | 27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
390 | 44.0,4,97.0,52,2130,24.6,82,2,vw pickup
391 | 32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
392 | 28.0,4,120.0,79,2625,18.6,82,1,ford ranger
393 | 31.0,4,119.0,82,2720,19.4,82,1,chevy s-10
394 | 


--------------------------------------------------------------------------------
/Auto.data:
--------------------------------------------------------------------------------
  1 | mpg	cylinders	displacement	horsepower weight	acceleration	year	origin	name
  2 | 18.0   8   307.0      130.0      3504.      12.0   70  1	"chevrolet chevelle malibu"
  3 | 15.0   8   350.0      165.0      3693.      11.5   70  1	"buick skylark 320"
  4 | 18.0   8   318.0      150.0      3436.      11.0   70  1	"plymouth satellite"
  5 | 16.0   8   304.0      150.0      3433.      12.0   70  1	"amc rebel sst"
  6 | 17.0   8   302.0      140.0      3449.      10.5   70  1	"ford torino"
  7 | 15.0   8   429.0      198.0      4341.      10.0   70  1	"ford galaxie 500"
  8 | 14.0   8   454.0      220.0      4354.       9.0   70  1	"chevrolet impala"
  9 | 14.0   8   440.0      215.0      4312.       8.5   70  1	"plymouth fury iii"
 10 | 14.0   8   455.0      225.0      4425.      10.0   70  1	"pontiac catalina"
 11 | 15.0   8   390.0      190.0      3850.       8.5   70  1	"amc ambassador dpl"
 12 | 15.0   8   383.0      170.0      3563.      10.0   70  1	"dodge challenger se"
 13 | 14.0   8   340.0      160.0      3609.       8.0   70  1	"plymouth 'cuda 340"
 14 | 15.0   8   400.0      150.0      3761.       9.5   70  1	"chevrolet monte carlo"
 15 | 14.0   8   455.0      225.0      3086.      10.0   70  1	"buick estate wagon (sw)"
 16 | 24.0   4   113.0      95.00      2372.      15.0   70  3	"toyota corona mark ii"
 17 | 22.0   6   198.0      95.00      2833.      15.5   70  1	"plymouth duster"
 18 | 18.0   6   199.0      97.00      2774.      15.5   70  1	"amc hornet"
 19 | 21.0   6   200.0      85.00      2587.      16.0   70  1	"ford maverick"
 20 | 27.0   4   97.00      88.00      2130.      14.5   70  3	"datsun pl510"
 21 | 26.0   4   97.00      46.00      1835.      20.5   70  2	"volkswagen 1131 deluxe sedan"
 22 | 25.0   4   110.0      87.00      2672.      17.5   70  2	"peugeot 504"
 23 | 24.0   4   107.0      90.00      2430.      14.5   70  2	"audi 100 ls"
 24 | 25.0   4   104.0      95.00      2375.      17.5   70  2	"saab 99e"
 25 | 26.0   4   121.0      113.0      2234.      12.5   70  2	"bmw 2002"
 26 | 21.0   6   199.0      90.00      2648.      15.0   70  1	"amc gremlin"
 27 | 10.0   8   360.0      215.0      4615.      14.0   70  1	"ford f250"
 28 | 10.0   8   307.0      200.0      4376.      15.0   70  1	"chevy c20"
 29 | 11.0   8   318.0      210.0      4382.      13.5   70  1	"dodge d200"
 30 | 9.0    8   304.0      193.0      4732.      18.5   70  1	"hi 1200d"
 31 | 27.0   4   97.00      88.00      2130.      14.5   71  3	"datsun pl510"
 32 | 28.0   4   140.0      90.00      2264.      15.5   71  1	"chevrolet vega 2300"
 33 | 25.0   4   113.0      95.00      2228.      14.0   71  3	"toyota corona"
 34 | 25.0   4   98.00      ?          2046.      19.0   71  1	"ford pinto"
 35 | 19.0   6   232.0      100.0      2634.      13.0   71  1	"amc gremlin"
 36 | 16.0   6   225.0      105.0      3439.      15.5   71  1	"plymouth satellite custom"
 37 | 17.0   6   250.0      100.0      3329.      15.5   71  1	"chevrolet chevelle malibu"
 38 | 19.0   6   250.0      88.00      3302.      15.5   71  1	"ford torino 500"
 39 | 18.0   6   232.0      100.0      3288.      15.5   71  1	"amc matador"
 40 | 14.0   8   350.0      165.0      4209.      12.0   71  1	"chevrolet impala"
 41 | 14.0   8   400.0      175.0      4464.      11.5   71  1	"pontiac catalina brougham"
 42 | 14.0   8   351.0      153.0      4154.      13.5   71  1	"ford galaxie 500"
 43 | 14.0   8   318.0      150.0      4096.      13.0   71  1	"plymouth fury iii"
 44 | 12.0   8   383.0      180.0      4955.      11.5   71  1	"dodge monaco (sw)"
 45 | 13.0   8   400.0      170.0      4746.      12.0   71  1	"ford country squire (sw)"
 46 | 13.0   8   400.0      175.0      5140.      12.0   71  1	"pontiac safari (sw)"
 47 | 18.0   6   258.0      110.0      2962.      13.5   71  1	"amc hornet sportabout (sw)"
 48 | 22.0   4   140.0      72.00      2408.      19.0   71  1	"chevrolet vega (sw)"
 49 | 19.0   6   250.0      100.0      3282.      15.0   71  1	"pontiac firebird"
 50 | 18.0   6   250.0      88.00      3139.      14.5   71  1	"ford mustang"
 51 | 23.0   4   122.0      86.00      2220.      14.0   71  1	"mercury capri 2000"
 52 | 28.0   4   116.0      90.00      2123.      14.0   71  2	"opel 1900"
 53 | 30.0   4   79.00      70.00      2074.      19.5   71  2	"peugeot 304"
 54 | 30.0   4   88.00      76.00      2065.      14.5   71  2	"fiat 124b"
 55 | 31.0   4   71.00      65.00      1773.      19.0   71  3	"toyota corolla 1200"
 56 | 35.0   4   72.00      69.00      1613.      18.0   71  3	"datsun 1200"
 57 | 27.0   4   97.00      60.00      1834.      19.0   71  2	"volkswagen model 111"
 58 | 26.0   4   91.00      70.00      1955.      20.5   71  1	"plymouth cricket"
 59 | 24.0   4   113.0      95.00      2278.      15.5   72  3	"toyota corona hardtop"
 60 | 25.0   4   97.50      80.00      2126.      17.0   72  1	"dodge colt hardtop"
 61 | 23.0   4   97.00      54.00      2254.      23.5   72  2	"volkswagen type 3"
 62 | 20.0   4   140.0      90.00      2408.      19.5   72  1	"chevrolet vega"
 63 | 21.0   4   122.0      86.00      2226.      16.5   72  1	"ford pinto runabout"
 64 | 13.0   8   350.0      165.0      4274.      12.0   72  1	"chevrolet impala"
 65 | 14.0   8   400.0      175.0      4385.      12.0   72  1	"pontiac catalina"
 66 | 15.0   8   318.0      150.0      4135.      13.5   72  1	"plymouth fury iii"
 67 | 14.0   8   351.0      153.0      4129.      13.0   72  1	"ford galaxie 500"
 68 | 17.0   8   304.0      150.0      3672.      11.5   72  1	"amc ambassador sst"
 69 | 11.0   8   429.0      208.0      4633.      11.0   72  1	"mercury marquis"
 70 | 13.0   8   350.0      155.0      4502.      13.5   72  1	"buick lesabre custom"
 71 | 12.0   8   350.0      160.0      4456.      13.5   72  1	"oldsmobile delta 88 royale"
 72 | 13.0   8   400.0      190.0      4422.      12.5   72  1	"chrysler newport royal"
 73 | 19.0   3   70.00      97.00      2330.      13.5   72  3	"mazda rx2 coupe"
 74 | 15.0   8   304.0      150.0      3892.      12.5   72  1	"amc matador (sw)"
 75 | 13.0   8   307.0      130.0      4098.      14.0   72  1	"chevrolet chevelle concours (sw)"
 76 | 13.0   8   302.0      140.0      4294.      16.0   72  1	"ford gran torino (sw)"
 77 | 14.0   8   318.0      150.0      4077.      14.0   72  1	"plymouth satellite custom (sw)"
 78 | 18.0   4   121.0      112.0      2933.      14.5   72  2	"volvo 145e (sw)"
 79 | 22.0   4   121.0      76.00      2511.      18.0   72  2	"volkswagen 411 (sw)"
 80 | 21.0   4   120.0      87.00      2979.      19.5   72  2	"peugeot 504 (sw)"
 81 | 26.0   4   96.00      69.00      2189.      18.0   72  2	"renault 12 (sw)"
 82 | 22.0   4   122.0      86.00      2395.      16.0   72  1	"ford pinto (sw)"
 83 | 28.0   4   97.00      92.00      2288.      17.0   72  3	"datsun 510 (sw)"
 84 | 23.0   4   120.0      97.00      2506.      14.5   72  3	"toyouta corona mark ii (sw)"
 85 | 28.0   4   98.00      80.00      2164.      15.0   72  1	"dodge colt (sw)"
 86 | 27.0   4   97.00      88.00      2100.      16.5   72  3	"toyota corolla 1600 (sw)"
 87 | 13.0   8   350.0      175.0      4100.      13.0   73  1	"buick century 350"
 88 | 14.0   8   304.0      150.0      3672.      11.5   73  1	"amc matador"
 89 | 13.0   8   350.0      145.0      3988.      13.0   73  1	"chevrolet malibu"
 90 | 14.0   8   302.0      137.0      4042.      14.5   73  1	"ford gran torino"
 91 | 15.0   8   318.0      150.0      3777.      12.5   73  1	"dodge coronet custom"
 92 | 12.0   8   429.0      198.0      4952.      11.5   73  1	"mercury marquis brougham"
 93 | 13.0   8   400.0      150.0      4464.      12.0   73  1	"chevrolet caprice classic"
 94 | 13.0   8   351.0      158.0      4363.      13.0   73  1	"ford ltd"
 95 | 14.0   8   318.0      150.0      4237.      14.5   73  1	"plymouth fury gran sedan"
 96 | 13.0   8   440.0      215.0      4735.      11.0   73  1	"chrysler new yorker brougham"
 97 | 12.0   8   455.0      225.0      4951.      11.0   73  1	"buick electra 225 custom"
 98 | 13.0   8   360.0      175.0      3821.      11.0   73  1	"amc ambassador brougham"
 99 | 18.0   6   225.0      105.0      3121.      16.5   73  1	"plymouth valiant"
100 | 16.0   6   250.0      100.0      3278.      18.0   73  1	"chevrolet nova custom"
101 | 18.0   6   232.0      100.0      2945.      16.0   73  1	"amc hornet"
102 | 18.0   6   250.0      88.00      3021.      16.5   73  1	"ford maverick"
103 | 23.0   6   198.0      95.00      2904.      16.0   73  1	"plymouth duster"
104 | 26.0   4   97.00      46.00      1950.      21.0   73  2	"volkswagen super beetle"
105 | 11.0   8   400.0      150.0      4997.      14.0   73  1	"chevrolet impala"
106 | 12.0   8   400.0      167.0      4906.      12.5   73  1	"ford country"
107 | 13.0   8   360.0      170.0      4654.      13.0   73  1	"plymouth custom suburb"
108 | 12.0   8   350.0      180.0      4499.      12.5   73  1	"oldsmobile vista cruiser"
109 | 18.0   6   232.0      100.0      2789.      15.0   73  1	"amc gremlin"
110 | 20.0   4   97.00      88.00      2279.      19.0   73  3	"toyota carina"
111 | 21.0   4   140.0      72.00      2401.      19.5   73  1	"chevrolet vega"
112 | 22.0   4   108.0      94.00      2379.      16.5   73  3	"datsun 610"
113 | 18.0   3   70.00      90.00      2124.      13.5   73  3	"maxda rx3"
114 | 19.0   4   122.0      85.00      2310.      18.5   73  1	"ford pinto"
115 | 21.0   6   155.0      107.0      2472.      14.0   73  1	"mercury capri v6"
116 | 26.0   4   98.00      90.00      2265.      15.5   73  2	"fiat 124 sport coupe"
117 | 15.0   8   350.0      145.0      4082.      13.0   73  1	"chevrolet monte carlo s"
118 | 16.0   8   400.0      230.0      4278.      9.50   73  1	"pontiac grand prix"
119 | 29.0   4   68.00      49.00      1867.      19.5   73  2	"fiat 128"
120 | 24.0   4   116.0      75.00      2158.      15.5   73  2	"opel manta"
121 | 20.0   4   114.0      91.00      2582.      14.0   73  2	"audi 100ls"
122 | 19.0   4   121.0      112.0      2868.      15.5   73  2	"volvo 144ea"
123 | 15.0   8   318.0      150.0      3399.      11.0   73  1	"dodge dart custom"
124 | 24.0   4   121.0      110.0      2660.      14.0   73  2	"saab 99le"
125 | 20.0   6   156.0      122.0      2807.      13.5   73  3	"toyota mark ii"
126 | 11.0   8   350.0      180.0      3664.      11.0   73  1	"oldsmobile omega"
127 | 20.0   6   198.0      95.00      3102.      16.5   74  1	"plymouth duster"
128 | 21.0   6   200.0      ?          2875.      17.0   74  1	"ford maverick"
129 | 19.0   6   232.0      100.0      2901.      16.0   74  1	"amc hornet"
130 | 15.0   6   250.0      100.0      3336.      17.0   74  1	"chevrolet nova"
131 | 31.0   4   79.00      67.00      1950.      19.0   74  3	"datsun b210"
132 | 26.0   4   122.0      80.00      2451.      16.5   74  1	"ford pinto"
133 | 32.0   4   71.00      65.00      1836.      21.0   74  3	"toyota corolla 1200"
134 | 25.0   4   140.0      75.00      2542.      17.0   74  1	"chevrolet vega"
135 | 16.0   6   250.0      100.0      3781.      17.0   74  1	"chevrolet chevelle malibu classic"
136 | 16.0   6   258.0      110.0      3632.      18.0   74  1	"amc matador"
137 | 18.0   6   225.0      105.0      3613.      16.5   74  1	"plymouth satellite sebring"
138 | 16.0   8   302.0      140.0      4141.      14.0   74  1	"ford gran torino"
139 | 13.0   8   350.0      150.0      4699.      14.5   74  1	"buick century luxus (sw)"
140 | 14.0   8   318.0      150.0      4457.      13.5   74  1	"dodge coronet custom (sw)"
141 | 14.0   8   302.0      140.0      4638.      16.0   74  1	"ford gran torino (sw)"
142 | 14.0   8   304.0      150.0      4257.      15.5   74  1	"amc matador (sw)"
143 | 29.0   4   98.00      83.00      2219.      16.5   74  2	"audi fox"
144 | 26.0   4   79.00      67.00      1963.      15.5   74  2	"volkswagen dasher"
145 | 26.0   4   97.00      78.00      2300.      14.5   74  2	"opel manta"
146 | 31.0   4   76.00      52.00      1649.      16.5   74  3	"toyota corona"
147 | 32.0   4   83.00      61.00      2003.      19.0   74  3	"datsun 710"
148 | 28.0   4   90.00      75.00      2125.      14.5   74  1	"dodge colt"
149 | 24.0   4   90.00      75.00      2108.      15.5   74  2	"fiat 128"
150 | 26.0   4   116.0      75.00      2246.      14.0   74  2	"fiat 124 tc"
151 | 24.0   4   120.0      97.00      2489.      15.0   74  3	"honda civic"
152 | 26.0   4   108.0      93.00      2391.      15.5   74  3	"subaru"
153 | 31.0   4   79.00      67.00      2000.      16.0   74  2	"fiat x1.9"
154 | 19.0   6   225.0      95.00      3264.      16.0   75  1	"plymouth valiant custom"
155 | 18.0   6   250.0      105.0      3459.      16.0   75  1	"chevrolet nova"
156 | 15.0   6   250.0      72.00      3432.      21.0   75  1	"mercury monarch"
157 | 15.0   6   250.0      72.00      3158.      19.5   75  1	"ford maverick"
158 | 16.0   8   400.0      170.0      4668.      11.5   75  1	"pontiac catalina"
159 | 15.0   8   350.0      145.0      4440.      14.0   75  1	"chevrolet bel air"
160 | 16.0   8   318.0      150.0      4498.      14.5   75  1	"plymouth grand fury"
161 | 14.0   8   351.0      148.0      4657.      13.5   75  1	"ford ltd"
162 | 17.0   6   231.0      110.0      3907.      21.0   75  1	"buick century"
163 | 16.0   6   250.0      105.0      3897.      18.5   75  1	"chevroelt chevelle malibu"
164 | 15.0   6   258.0      110.0      3730.      19.0   75  1	"amc matador"
165 | 18.0   6   225.0      95.00      3785.      19.0   75  1	"plymouth fury"
166 | 21.0   6   231.0      110.0      3039.      15.0   75  1	"buick skyhawk"
167 | 20.0   8   262.0      110.0      3221.      13.5   75  1	"chevrolet monza 2+2"
168 | 13.0   8   302.0      129.0      3169.      12.0   75  1	"ford mustang ii"
169 | 29.0   4   97.00      75.00      2171.      16.0   75  3	"toyota corolla"
170 | 23.0   4   140.0      83.00      2639.      17.0   75  1	"ford pinto"
171 | 20.0   6   232.0      100.0      2914.      16.0   75  1	"amc gremlin"
172 | 23.0   4   140.0      78.00      2592.      18.5   75  1	"pontiac astro"
173 | 24.0   4   134.0      96.00      2702.      13.5   75  3	"toyota corona"
174 | 25.0   4   90.00      71.00      2223.      16.5   75  2	"volkswagen dasher"
175 | 24.0   4   119.0      97.00      2545.      17.0   75  3	"datsun 710"
176 | 18.0   6   171.0      97.00      2984.      14.5   75  1	"ford pinto"
177 | 29.0   4   90.00      70.00      1937.      14.0   75  2	"volkswagen rabbit"
178 | 19.0   6   232.0      90.00      3211.      17.0   75  1	"amc pacer"
179 | 23.0   4   115.0      95.00      2694.      15.0   75  2	"audi 100ls"
180 | 23.0   4   120.0      88.00      2957.      17.0   75  2	"peugeot 504"
181 | 22.0   4   121.0      98.00      2945.      14.5   75  2	"volvo 244dl"
182 | 25.0   4   121.0      115.0      2671.      13.5   75  2	"saab 99le"
183 | 33.0   4   91.00      53.00      1795.      17.5   75  3	"honda civic cvcc"
184 | 28.0   4   107.0      86.00      2464.      15.5   76  2	"fiat 131"
185 | 25.0   4   116.0      81.00      2220.      16.9   76  2	"opel 1900"
186 | 25.0   4   140.0      92.00      2572.      14.9   76  1	"capri ii"
187 | 26.0   4   98.00      79.00      2255.      17.7   76  1	"dodge colt"
188 | 27.0   4   101.0      83.00      2202.      15.3   76  2	"renault 12tl"
189 | 17.5   8   305.0      140.0      4215.      13.0   76  1	"chevrolet chevelle malibu classic"
190 | 16.0   8   318.0      150.0      4190.      13.0   76  1	"dodge coronet brougham"
191 | 15.5   8   304.0      120.0      3962.      13.9   76  1	"amc matador"
192 | 14.5   8   351.0      152.0      4215.      12.8   76  1	"ford gran torino"
193 | 22.0   6   225.0      100.0      3233.      15.4   76  1	"plymouth valiant"
194 | 22.0   6   250.0      105.0      3353.      14.5   76  1	"chevrolet nova"
195 | 24.0   6   200.0      81.00      3012.      17.6   76  1	"ford maverick"
196 | 22.5   6   232.0      90.00      3085.      17.6   76  1	"amc hornet"
197 | 29.0   4   85.00      52.00      2035.      22.2   76  1	"chevrolet chevette"
198 | 24.5   4   98.00      60.00      2164.      22.1   76  1	"chevrolet woody"
199 | 29.0   4   90.00      70.00      1937.      14.2   76  2	"vw rabbit"
200 | 33.0   4   91.00      53.00      1795.      17.4   76  3	"honda civic"
201 | 20.0   6   225.0      100.0      3651.      17.7   76  1	"dodge aspen se"
202 | 18.0   6   250.0      78.00      3574.      21.0   76  1	"ford granada ghia"
203 | 18.5   6   250.0      110.0      3645.      16.2   76  1	"pontiac ventura sj"
204 | 17.5   6   258.0      95.00      3193.      17.8   76  1	"amc pacer d/l"
205 | 29.5   4   97.00      71.00      1825.      12.2   76  2	"volkswagen rabbit"
206 | 32.0   4   85.00      70.00      1990.      17.0   76  3	"datsun b-210"
207 | 28.0   4   97.00      75.00      2155.      16.4   76  3	"toyota corolla"
208 | 26.5   4   140.0      72.00      2565.      13.6   76  1	"ford pinto"
209 | 20.0   4   130.0      102.0      3150.      15.7   76  2	"volvo 245"
210 | 13.0   8   318.0      150.0      3940.      13.2   76  1	"plymouth volare premier v8"
211 | 19.0   4   120.0      88.00      3270.      21.9   76  2	"peugeot 504"
212 | 19.0   6   156.0      108.0      2930.      15.5   76  3	"toyota mark ii"
213 | 16.5   6   168.0      120.0      3820.      16.7   76  2	"mercedes-benz 280s"
214 | 16.5   8   350.0      180.0      4380.      12.1   76  1	"cadillac seville"
215 | 13.0   8   350.0      145.0      4055.      12.0   76  1	"chevy c10"
216 | 13.0   8   302.0      130.0      3870.      15.0   76  1	"ford f108"
217 | 13.0   8   318.0      150.0      3755.      14.0   76  1	"dodge d100"
218 | 31.5   4   98.00      68.00      2045.      18.5   77  3	"honda accord cvcc"
219 | 30.0   4   111.0      80.00      2155.      14.8   77  1	"buick opel isuzu deluxe"
220 | 36.0   4   79.00      58.00      1825.      18.6   77  2	"renault 5 gtl"
221 | 25.5   4   122.0      96.00      2300.      15.5   77  1	"plymouth arrow gs"
222 | 33.5   4   85.00      70.00      1945.      16.8   77  3	"datsun f-10 hatchback"
223 | 17.5   8   305.0      145.0      3880.      12.5   77  1	"chevrolet caprice classic"
224 | 17.0   8   260.0      110.0      4060.      19.0   77  1	"oldsmobile cutlass supreme"
225 | 15.5   8   318.0      145.0      4140.      13.7   77  1	"dodge monaco brougham"
226 | 15.0   8   302.0      130.0      4295.      14.9   77  1	"mercury cougar brougham"
227 | 17.5   6   250.0      110.0      3520.      16.4   77  1	"chevrolet concours"
228 | 20.5   6   231.0      105.0      3425.      16.9   77  1	"buick skylark"
229 | 19.0   6   225.0      100.0      3630.      17.7   77  1	"plymouth volare custom"
230 | 18.5   6   250.0      98.00      3525.      19.0   77  1	"ford granada"
231 | 16.0   8   400.0      180.0      4220.      11.1   77  1	"pontiac grand prix lj"
232 | 15.5   8   350.0      170.0      4165.      11.4   77  1	"chevrolet monte carlo landau"
233 | 15.5   8   400.0      190.0      4325.      12.2   77  1	"chrysler cordoba"
234 | 16.0   8   351.0      149.0      4335.      14.5   77  1	"ford thunderbird"
235 | 29.0   4   97.00      78.00      1940.      14.5   77  2	"volkswagen rabbit custom"
236 | 24.5   4   151.0      88.00      2740.      16.0   77  1	"pontiac sunbird coupe"
237 | 26.0   4   97.00      75.00      2265.      18.2   77  3	"toyota corolla liftback"
238 | 25.5   4   140.0      89.00      2755.      15.8   77  1	"ford mustang ii 2+2"
239 | 30.5   4   98.00      63.00      2051.      17.0   77  1	"chevrolet chevette"
240 | 33.5   4   98.00      83.00      2075.      15.9   77  1	"dodge colt m/m"
241 | 30.0   4   97.00      67.00      1985.      16.4   77  3	"subaru dl"
242 | 30.5   4   97.00      78.00      2190.      14.1   77  2	"volkswagen dasher"
243 | 22.0   6   146.0      97.00      2815.      14.5   77  3	"datsun 810"
244 | 21.5   4   121.0      110.0      2600.      12.8   77  2	"bmw 320i"
245 | 21.5   3   80.00      110.0      2720.      13.5   77  3	"mazda rx-4"
246 | 43.1   4   90.00      48.00      1985.      21.5   78  2	"volkswagen rabbit custom diesel"
247 | 36.1   4   98.00      66.00      1800.      14.4   78  1	"ford fiesta"
248 | 32.8   4   78.00      52.00      1985.      19.4   78  3	"mazda glc deluxe"
249 | 39.4   4   85.00      70.00      2070.      18.6   78  3	"datsun b210 gx"
250 | 36.1   4   91.00      60.00      1800.      16.4   78  3	"honda civic cvcc"
251 | 19.9   8   260.0      110.0      3365.      15.5   78  1	"oldsmobile cutlass salon brougham"
252 | 19.4   8   318.0      140.0      3735.      13.2   78  1	"dodge diplomat"
253 | 20.2   8   302.0      139.0      3570.      12.8   78  1	"mercury monarch ghia"
254 | 19.2   6   231.0      105.0      3535.      19.2   78  1	"pontiac phoenix lj"
255 | 20.5   6   200.0      95.00      3155.      18.2   78  1	"chevrolet malibu"
256 | 20.2   6   200.0      85.00      2965.      15.8   78  1	"ford fairmont (auto)"
257 | 25.1   4   140.0      88.00      2720.      15.4   78  1	"ford fairmont (man)"
258 | 20.5   6   225.0      100.0      3430.      17.2   78  1	"plymouth volare"
259 | 19.4   6   232.0      90.00      3210.      17.2   78  1	"amc concord"
260 | 20.6   6   231.0      105.0      3380.      15.8   78  1	"buick century special"
261 | 20.8   6   200.0      85.00      3070.      16.7   78  1	"mercury zephyr"
262 | 18.6   6   225.0      110.0      3620.      18.7   78  1	"dodge aspen"
263 | 18.1   6   258.0      120.0      3410.      15.1   78  1	"amc concord d/l"
264 | 19.2   8   305.0      145.0      3425.      13.2   78  1	"chevrolet monte carlo landau"
265 | 17.7   6   231.0      165.0      3445.      13.4   78  1	"buick regal sport coupe (turbo)"
266 | 18.1   8   302.0      139.0      3205.      11.2   78  1	"ford futura"
267 | 17.5   8   318.0      140.0      4080.      13.7   78  1	"dodge magnum xe"
268 | 30.0   4   98.00      68.00      2155.      16.5   78  1	"chevrolet chevette"
269 | 27.5   4   134.0      95.00      2560.      14.2   78  3	"toyota corona"
270 | 27.2   4   119.0      97.00      2300.      14.7   78  3	"datsun 510"
271 | 30.9   4   105.0      75.00      2230.      14.5   78  1	"dodge omni"
272 | 21.1   4   134.0      95.00      2515.      14.8   78  3	"toyota celica gt liftback"
273 | 23.2   4   156.0      105.0      2745.      16.7   78  1	"plymouth sapporo"
274 | 23.8   4   151.0      85.00      2855.      17.6   78  1	"oldsmobile starfire sx"
275 | 23.9   4   119.0      97.00      2405.      14.9   78  3	"datsun 200-sx"
276 | 20.3   5   131.0      103.0      2830.      15.9   78  2	"audi 5000"
277 | 17.0   6   163.0      125.0      3140.      13.6   78  2	"volvo 264gl"
278 | 21.6   4   121.0      115.0      2795.      15.7   78  2	"saab 99gle"
279 | 16.2   6   163.0      133.0      3410.      15.8   78  2	"peugeot 604sl"
280 | 31.5   4   89.00      71.00      1990.      14.9   78  2	"volkswagen scirocco"
281 | 29.5   4   98.00      68.00      2135.      16.6   78  3	"honda accord lx"
282 | 21.5   6   231.0      115.0      3245.      15.4   79  1	"pontiac lemans v6"
283 | 19.8   6   200.0      85.00      2990.      18.2   79  1	"mercury zephyr 6"
284 | 22.3   4   140.0      88.00      2890.      17.3   79  1	"ford fairmont 4"
285 | 20.2   6   232.0      90.00      3265.      18.2   79  1	"amc concord dl 6"
286 | 20.6   6   225.0      110.0      3360.      16.6   79  1	"dodge aspen 6"
287 | 17.0   8   305.0      130.0      3840.      15.4   79  1	"chevrolet caprice classic"
288 | 17.6   8   302.0      129.0      3725.      13.4   79  1	"ford ltd landau"
289 | 16.5   8   351.0      138.0      3955.      13.2   79  1	"mercury grand marquis"
290 | 18.2   8   318.0      135.0      3830.      15.2   79  1	"dodge st. regis"
291 | 16.9   8   350.0      155.0      4360.      14.9   79  1	"buick estate wagon (sw)"
292 | 15.5   8   351.0      142.0      4054.      14.3   79  1	"ford country squire (sw)"
293 | 19.2   8   267.0      125.0      3605.      15.0   79  1	"chevrolet malibu classic (sw)"
294 | 18.5   8   360.0      150.0      3940.      13.0   79  1	"chrysler lebaron town @ country (sw)"
295 | 31.9   4   89.00      71.00      1925.      14.0   79  2	"vw rabbit custom"
296 | 34.1   4   86.00      65.00      1975.      15.2   79  3	"maxda glc deluxe"
297 | 35.7   4   98.00      80.00      1915.      14.4   79  1	"dodge colt hatchback custom"
298 | 27.4   4   121.0      80.00      2670.      15.0   79  1	"amc spirit dl"
299 | 25.4   5   183.0      77.00      3530.      20.1   79  2	"mercedes benz 300d"
300 | 23.0   8   350.0      125.0      3900.      17.4   79  1	"cadillac eldorado"
301 | 27.2   4   141.0      71.00      3190.      24.8   79  2	"peugeot 504"
302 | 23.9   8   260.0      90.00      3420.      22.2   79  1	"oldsmobile cutlass salon brougham"
303 | 34.2   4   105.0      70.00      2200.      13.2   79  1	"plymouth horizon"
304 | 34.5   4   105.0      70.00      2150.      14.9   79  1	"plymouth horizon tc3"
305 | 31.8   4   85.00      65.00      2020.      19.2   79  3	"datsun 210"
306 | 37.3   4   91.00      69.00      2130.      14.7   79  2	"fiat strada custom"
307 | 28.4   4   151.0      90.00      2670.      16.0   79  1	"buick skylark limited"
308 | 28.8   6   173.0      115.0      2595.      11.3   79  1	"chevrolet citation"
309 | 26.8   6   173.0      115.0      2700.      12.9   79  1	"oldsmobile omega brougham"
310 | 33.5   4   151.0      90.00      2556.      13.2   79  1	"pontiac phoenix"
311 | 41.5   4   98.00      76.00      2144.      14.7   80  2	"vw rabbit"
312 | 38.1   4   89.00      60.00      1968.      18.8   80  3	"toyota corolla tercel"
313 | 32.1   4   98.00      70.00      2120.      15.5   80  1	"chevrolet chevette"
314 | 37.2   4   86.00      65.00      2019.      16.4   80  3	"datsun 310"
315 | 28.0   4   151.0      90.00      2678.      16.5   80  1	"chevrolet citation"
316 | 26.4   4   140.0      88.00      2870.      18.1   80  1	"ford fairmont"
317 | 24.3   4   151.0      90.00      3003.      20.1   80  1	"amc concord"
318 | 19.1   6   225.0      90.00      3381.      18.7   80  1	"dodge aspen"
319 | 34.3   4   97.00      78.00      2188.      15.8   80  2	"audi 4000"
320 | 29.8   4   134.0      90.00      2711.      15.5   80  3	"toyota corona liftback"
321 | 31.3   4   120.0      75.00      2542.      17.5   80  3	"mazda 626"
322 | 37.0   4   119.0      92.00      2434.      15.0   80  3	"datsun 510 hatchback"
323 | 32.2   4   108.0      75.00      2265.      15.2   80  3	"toyota corolla"
324 | 46.6   4   86.00      65.00      2110.      17.9   80  3	"mazda glc"
325 | 27.9   4   156.0      105.0      2800.      14.4   80  1	"dodge colt"
326 | 40.8   4   85.00      65.00      2110.      19.2   80  3	"datsun 210"
327 | 44.3   4   90.00      48.00      2085.      21.7   80  2	"vw rabbit c (diesel)"
328 | 43.4   4   90.00      48.00      2335.      23.7   80  2	"vw dasher (diesel)"
329 | 36.4   5   121.0      67.00      2950.      19.9   80  2	"audi 5000s (diesel)"
330 | 30.0   4   146.0      67.00      3250.      21.8   80  2	"mercedes-benz 240d"
331 | 44.6   4   91.00      67.00      1850.      13.8   80  3	"honda civic 1500 gl"
332 | 40.9   4   85.00      ?          1835.      17.3   80  2	"renault lecar deluxe"
333 | 33.8   4   97.00      67.00      2145.      18.0   80  3	"subaru dl"
334 | 29.8   4   89.00      62.00      1845.      15.3   80  2	"vokswagen rabbit"
335 | 32.7   6   168.0      132.0      2910.      11.4   80  3	"datsun 280-zx"
336 | 23.7   3   70.00      100.0      2420.      12.5   80  3	"mazda rx-7 gs"
337 | 35.0   4   122.0      88.00      2500.      15.1   80  2	"triumph tr7 coupe"
338 | 23.6   4   140.0      ?          2905.      14.3   80  1	"ford mustang cobra"
339 | 32.4   4   107.0      72.00      2290.      17.0   80  3	"honda accord"
340 | 27.2   4   135.0      84.00      2490.      15.7   81  1	"plymouth reliant"
341 | 26.6   4   151.0      84.00      2635.      16.4   81  1	"buick skylark"
342 | 25.8   4   156.0      92.00      2620.      14.4   81  1	"dodge aries wagon (sw)"
343 | 23.5   6   173.0      110.0      2725.      12.6   81  1	"chevrolet citation"
344 | 30.0   4   135.0      84.00      2385.      12.9   81  1	"plymouth reliant"
345 | 39.1   4   79.00      58.00      1755.      16.9   81  3	"toyota starlet"
346 | 39.0   4   86.00      64.00      1875.      16.4   81  1	"plymouth champ"
347 | 35.1   4   81.00      60.00      1760.      16.1   81  3	"honda civic 1300"
348 | 32.3   4   97.00      67.00      2065.      17.8   81  3	"subaru"
349 | 37.0   4   85.00      65.00      1975.      19.4   81  3	"datsun 210 mpg"
350 | 37.7   4   89.00      62.00      2050.      17.3   81  3	"toyota tercel"
351 | 34.1   4   91.00      68.00      1985.      16.0   81  3	"mazda glc 4"
352 | 34.7   4   105.0      63.00      2215.      14.9   81  1	"plymouth horizon 4"
353 | 34.4   4   98.00      65.00      2045.      16.2   81  1	"ford escort 4w"
354 | 29.9   4   98.00      65.00      2380.      20.7   81  1	"ford escort 2h"
355 | 33.0   4   105.0      74.00      2190.      14.2   81  2	"volkswagen jetta"
356 | 34.5   4   100.0      ?          2320.      15.8   81  2	"renault 18i"
357 | 33.7   4   107.0      75.00      2210.      14.4   81  3	"honda prelude"
358 | 32.4   4   108.0      75.00      2350.      16.8   81  3	"toyota corolla"
359 | 32.9   4   119.0      100.0      2615.      14.8   81  3	"datsun 200sx"
360 | 31.6   4   120.0      74.00      2635.      18.3   81  3	"mazda 626"
361 | 28.1   4   141.0      80.00      3230.      20.4   81  2	"peugeot 505s turbo diesel"
362 | 30.7   6   145.0      76.00      3160.      19.6   81  2	"volvo diesel"
363 | 25.4   6   168.0      116.0      2900.      12.6   81  3	"toyota cressida"
364 | 24.2   6   146.0      120.0      2930.      13.8   81  3	"datsun 810 maxima"
365 | 22.4   6   231.0      110.0      3415.      15.8   81  1	"buick century"
366 | 26.6   8   350.0      105.0      3725.      19.0   81  1	"oldsmobile cutlass ls"
367 | 20.2   6   200.0      88.00      3060.      17.1   81  1	"ford granada gl"
368 | 17.6   6   225.0      85.00      3465.      16.6   81  1	"chrysler lebaron salon"
369 | 28.0   4   112.0      88.00      2605.      19.6   82  1	"chevrolet cavalier"
370 | 27.0   4   112.0      88.00      2640.      18.6   82  1	"chevrolet cavalier wagon"
371 | 34.0   4   112.0      88.00      2395.      18.0   82  1	"chevrolet cavalier 2-door"
372 | 31.0   4   112.0      85.00      2575.      16.2   82  1	"pontiac j2000 se hatchback"
373 | 29.0   4   135.0      84.00      2525.      16.0   82  1	"dodge aries se"
374 | 27.0   4   151.0      90.00      2735.      18.0   82  1	"pontiac phoenix"
375 | 24.0   4   140.0      92.00      2865.      16.4   82  1	"ford fairmont futura"
376 | 36.0   4   105.0      74.00      1980.      15.3   82  2	"volkswagen rabbit l"
377 | 37.0   4   91.00      68.00      2025.      18.2   82  3	"mazda glc custom l"
378 | 31.0   4   91.00      68.00      1970.      17.6   82  3	"mazda glc custom"
379 | 38.0   4   105.0      63.00      2125.      14.7   82  1	"plymouth horizon miser"
380 | 36.0   4   98.00      70.00      2125.      17.3   82  1	"mercury lynx l"
381 | 36.0   4   120.0      88.00      2160.      14.5   82  3	"nissan stanza xe"
382 | 36.0   4   107.0      75.00      2205.      14.5   82  3	"honda accord"
383 | 34.0   4   108.0      70.00      2245       16.9   82  3	"toyota corolla"
384 | 38.0   4   91.00      67.00      1965.      15.0   82  3	"honda civic"
385 | 32.0   4   91.00      67.00      1965.      15.7   82  3	"honda civic (auto)"
386 | 38.0   4   91.00      67.00      1995.      16.2   82  3	"datsun 310 gx"
387 | 25.0   6   181.0      110.0      2945.      16.4   82  1	"buick century limited"
388 | 38.0   6   262.0      85.00      3015.      17.0   82  1	"oldsmobile cutlass ciera (diesel)"
389 | 26.0   4   156.0      92.00      2585.      14.5   82  1	"chrysler lebaron medallion"
390 | 22.0   6   232.0      112.0      2835       14.7   82  1	"ford granada l"
391 | 32.0   4   144.0      96.00      2665.      13.9   82  3	"toyota celica gt"
392 | 36.0   4   135.0      84.00      2370.      13.0   82  1	"dodge charger 2.2"
393 | 27.0   4   151.0      90.00      2950.      17.3   82  1	"chevrolet camaro"
394 | 27.0   4   140.0      86.00      2790.      15.6   82  1	"ford mustang gl"
395 | 44.0   4   97.00      52.00      2130.      24.6   82  2	"vw pickup"
396 | 32.0   4   135.0      84.00      2295.      11.6   82  1	"dodge rampage"
397 | 28.0   4   120.0      79.00      2625.      18.6   82  1	"ford ranger"
398 | 31.0   4   119.0      82.00      2720.      19.4   82  1	"chevy s-10"
399 | 


--------------------------------------------------------------------------------
/Ch03-linreg-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Linear Regression
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch03-linreg-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch03-linreg-lab.ipynb)
 21 | 
 22 | 
 23 | 
 24 | ## Importing packages
 25 | We import our standard libraries at this top
 26 | level.
 27 | 
 28 | ```{python}
 29 | import numpy as np
 30 | import pandas as pd
 31 | from matplotlib.pyplot import subplots
 32 | 
 33 | ```
 34 | 
 35 | 
 36 | ### New imports
 37 | Throughout this lab we will introduce new functions and libraries. However,
 38 | we will import them here to emphasize these are the new
 39 | code objects in this lab. Keeping imports near the top
 40 | of a notebook makes the code more readable, since scanning the first few
 41 | lines tells us what libraries are used.
 42 | 
 43 | ```{python}
 44 | import statsmodels.api as sm
 45 | 
 46 | ```
 47 |  We will provide relevant details about the
 48 | functions below as they are needed.
 49 | 
 50 | Besides importing whole modules, it is also possible
 51 | to import only a few items from a given module. This
 52 | will help keep the  *namespace* clean.
 53 | We will use a few specific objects from the `statsmodels` package
 54 | which we import here.
 55 | 
 56 | ```{python}
 57 | from statsmodels.stats.outliers_influence \
 58 |      import variance_inflation_factor as VIF
 59 | from statsmodels.stats.anova import anova_lm
 60 | 
 61 | ```
 62 | 
 63 | As one of the import statements above is quite a long line, we inserted a line break `\` to
 64 | ease readability.
 65 | 
 66 | We will also use some functions written for the labs in this book in the `ISLP`
 67 | package.
 68 | 
 69 | ```{python}
 70 | from ISLP import load_data
 71 | from ISLP.models import (ModelSpec as MS,
 72 |                          summarize,
 73 |                          poly)
 74 | 
 75 | ```
 76 | 
 77 | ### Inspecting Objects and Namespaces
 78 | The
 79 | function  `dir()`
 80 | provides a list of
 81 | objects in a namespace.
 82 | 
 83 | ```{python}
 84 | dir()
 85 | 
 86 | ```
 87 |  This shows you everything that `Python` can find at the top level.
 88 | There are certain objects like `__builtins__` that contain references to built-in
 89 | functions like `print()`.
 90 | 
 91 | Every python object has its own notion of
 92 | namespace, also accessible with `dir()`. This will include
 93 | both the attributes of the object
 94 | as well as any methods associated with it. For instance, we see `'sum'` in the listing for an
 95 | array.
 96 | 
 97 | ```{python}
 98 | A = np.array([3,5,11])
 99 | dir(A)
100 | 
101 | ```
102 |  This indicates that the object `A.sum` exists. In this case it is a method
103 | that can be used to compute the sum of the array `A` as can be seen by typing `A.sum?`.
104 | 
105 | ```{python}
106 | A.sum()
107 | 
108 | ```
109 |     
110 | 
111 | 
112 | ## Simple Linear Regression
113 | In this section we will  construct model 
114 | matrices (also called design matrices) using the `ModelSpec()`  transform from `ISLP.models`.
115 | 
116 | We  will use the `Boston` housing data set, which is contained in the `ISLP` package.  The `Boston` dataset records  `medv`  (median house value) for $506$ neighborhoods
117 | around Boston.  We will build a regression model to predict  `medv`  using $13$
118 | predictors such as  `rm`  (average number of rooms per house),
119 |  `age`  (proportion of owner-occupied units built prior to 1940), and  `lstat`  (percent of
120 | households with low socioeconomic status).  We will use `statsmodels` for this
121 | task, a `Python` package that implements several commonly used
122 | regression methods.
123 | 
124 | We have included a simple loading function `load_data()` in the
125 | `ISLP` package:
126 | 
127 | ```{python}
128 | Boston = load_data("Boston")
129 | Boston.columns
130 | 
131 | ```
132 | 
133 | Type `Boston?` to find out more about these data.
134 | 
135 | We start by using the `sm.OLS()`  function to fit a
136 | simple linear regression model.  Our response will be
137 |  `medv`  and  `lstat`  will be the single predictor.
138 | For this model, we can create the model matrix by hand.
139 | 
140 | 
141 | ```{python}
142 | X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]),
143 |                   'lstat': Boston['lstat']})
144 | X[:4]
145 | 
146 | ```
147 | 
148 | We extract the response, and fit the model.
149 | 
150 | ```{python}
151 | y = Boston['medv']
152 | model = sm.OLS(y, X)
153 | results = model.fit()
154 | 
155 | ```
156 | Note that `sm.OLS()` does
157 | not fit the model; it specifies the model, and then `model.fit()` does the actual fitting.  
158 | 
159 | Our `ISLP` function `summarize()` produces a simple table of the parameter estimates,
160 | their standard errors, t-statistics and p-values.
161 | The function takes a single argument, such as the object `results` 
162 | returned here by the `fit`
163 | method, and returns such a summary.
164 | 
165 | ```{python}
166 | summarize(results)
167 | 
168 | ```
169 | 
170 | 
171 | Before we describe other methods for working with fitted models, we outline a more useful and general framework for constructing a model matrix~`X`.
172 | ### Using Transformations: Fit and Transform
173 | Our model above has a single predictor, and constructing `X` was straightforward. 
174 | In practice  we often fit models with more than one predictor, typically selected from an array or data frame.
175 | We may wish to introduce transformations to the variables before fitting the model, specify interactions between variables, and expand some particular variables into sets of variables (e.g. polynomials).
176 | The `sklearn`  package has a particular notion
177 | for this type of task: a *transform*. A transform is an object
178 | that is created with some parameters as arguments. The
179 | object has two main methods: `fit()` and `transform()`.
180 | 
181 | We provide a general approach for specifying models and constructing
182 | the model matrix through the transform `ModelSpec()` in the `ISLP` library.
183 | `ModelSpec()`
184 | (renamed `MS()` in the preamble) creates a
185 | transform object, and then a pair of methods
186 | `transform()` and `fit()` are used to construct a
187 | corresponding model matrix.
188 | 
189 | We first describe this process for our simple regression model  using a single predictor `lstat` in
190 | the `Boston` data frame, but will use it repeatedly in more
191 | complex tasks in this and other labs in this book.
192 | In our case the transform is created by the expression
193 | `design = MS(['lstat'])`.
194 | 
195 | The `fit()`  method takes the original array and may do some
196 | initial computations on it, as specified in the transform object.
197 | For example, it may compute means and standard deviations for centering and scaling.
198 | The `transform()` 
199 | method applies the fitted transformation to the array of data, and produces the model matrix.
200 | 
201 | 
202 | ```{python}
203 | design = MS(['lstat'])
204 | design = design.fit(Boston)
205 | X = design.transform(Boston)
206 | X[:4]
207 | ```
208 | In this simple case, the `fit()`  method does very little; it simply checks that the variable `'lstat'` specified in `design` exists in `Boston`. Then `transform()` constructs the model matrix with two columns: an `intercept` and the variable `lstat`.
209 | 
210 | These two operations can be  combined with the
211 | `fit_transform()`  method.
212 | 
213 | ```{python}
214 | design = MS(['lstat'])
215 | X = design.fit_transform(Boston)
216 | X[:4]
217 | ```
218 | Note that, as in the previous code chunk when the two steps were done separately, the `design` object is changed as a result of the `fit()` operation. The power of this pipeline will become clearer when we fit more complex models that involve interactions and transformations.
219 | 
220 | 
221 | Let's return to our fitted regression model.
222 | The object
223 | `results` has several methods that can be used for inference.
224 | We already presented a function `summarize()` for showing the essentials of the fit.
225 | For a full and somewhat exhaustive summary of the fit, we can use the `summary()` 
226 | method.
227 | 
228 | ```{python}
229 | results.summary()
230 | 
231 | ```
232 | 
233 | The fitted coefficients can also be retrieved as the
234 | `params` attribute of `results`.
235 | 
236 | ```{python}
237 | results.params
238 | 
239 | ```
240 | 
241 | 
242 | The `get_prediction()`  method can be used to obtain predictions, and produce confidence intervals and
243 | prediction intervals for the prediction of  `medv`  for  given values of  `lstat`.
244 | 
245 | We first create a new data frame, in this case containing only the variable `lstat`, with the values for this variable at which we wish to make predictions.
246 | We then use the `transform()` method of `design` to create the corresponding model matrix.
247 | 
248 | ```{python}
249 | new_df = pd.DataFrame({'lstat':[5, 10, 15]})
250 | newX = design.transform(new_df)
251 | newX
252 | 
253 | ```
254 | 
255 | Next we compute the predictions at `newX`, and view them by extracting the `predicted_mean` attribute.
256 | 
257 | ```{python}
258 | new_predictions = results.get_prediction(newX);
259 | new_predictions.predicted_mean
260 | 
261 | ```
262 | We can produce confidence intervals for the predicted values.
263 | 
264 | ```{python}
265 | new_predictions.conf_int(alpha=0.05)
266 | 
267 | ```
268 | Prediction intervals are computed by setting `obs=True`:
269 | 
270 | ```{python}
271 | new_predictions.conf_int(obs=True, alpha=0.05)
272 | 
273 | ```
274 |  For instance, the 95% confidence interval associated with an
275 |  `lstat`  value of 10 is (24.47, 25.63), and the 95% prediction
276 | interval is (12.82, 37.28).  As expected, the confidence and
277 | prediction intervals are centered around the same point (a predicted
278 | value of 25.05 for  `medv`  when  `lstat`  equals
279 | 10), but the latter are substantially wider.
280 | 
281 | Next we will plot  `medv`  and  `lstat` 
282 | using `DataFrame.plot.scatter()`, \definelongblankMR{plot.scatter()}{plot.slashslashscatter()}
283 | and wish to
284 | add the regression line to the resulting plot.
285 | 
286 | 
287 | ### Defining Functions
288 | While there is a function
289 | within the `ISLP` package that adds a line to an existing plot, we take this opportunity
290 | to define our first function to do so.
291 | 
292 | ```{python}
293 | def abline(ax, b, m):
294 |     "Add a line with slope m and intercept b to ax"
295 |     xlim = ax.get_xlim()
296 |     ylim = [m * xlim[0] + b, m * xlim[1] + b]
297 |     ax.plot(xlim, ylim)
298 | 
299 | ```
300 |  A few things are illustrated above. First we see the syntax for defining a function:
301 | `def funcname(...)`. The function has arguments `ax, b, m`
302 | where `ax` is an axis object for an existing plot, `b` is the intercept and
303 | `m` is the slope of the desired line. Other plotting  options can be passed on to
304 | `ax.plot` by including additional optional arguments as follows:
305 | 
306 | ```{python}
307 | def abline(ax, b, m, *args, **kwargs):
308 |     "Add a line with slope m and intercept b to ax"
309 |     xlim = ax.get_xlim()
310 |     ylim = [m * xlim[0] + b, m * xlim[1] + b]
311 |     ax.plot(xlim, ylim, *args, **kwargs)
312 | 
313 | ```
314 | The addition of `*args` allows any number of
315 | non-named arguments to `abline`, while `**kwargs` allows any
316 | number of named arguments (such as `linewidth=3`) to `abline`.
317 | In our function, we pass
318 | these arguments verbatim to `ax.plot` above. Readers
319 | interested in learning more about
320 | functions are referred to the section on
321 | defining functions in [docs.python.org/tutorial](https://docs.python.org/3/tutorial/controlflow.html#defining-functions).
322 | 
323 | Let’s use our new function to add this regression line to a plot of
324 | `medv` vs. `lstat`.
325 | 
326 | ```{python}
327 | ax = Boston.plot.scatter('lstat', 'medv')
328 | abline(ax,
329 |        results.params[0],
330 |        results.params[1],
331 |        'r--',
332 |        linewidth=3)
333 | 
334 | ```
335 | Thus, the final call to `ax.plot()` is `ax.plot(xlim, ylim, 'r--', linewidth=3)`.
336 | We have used the argument `'r--'` to produce a red dashed line, and added
337 | an argument to make it of width 3.
338 | There is some evidence for non-linearity in the relationship between  `lstat`  and  `medv`. We will explore this issue later in this lab.
339 | 
340 | As mentioned above, there is an existing function to add a line to a plot --- `ax.axline()` --- but knowing how to write such functions empowers us to create more expressive displays.
341 | 
342 | 
343 | 
344 | 
345 | Next we examine some diagnostic plots, several of which were discussed
346 | in Section 3.3.3.
347 | We can find the fitted values and residuals
348 | of the fit as attributes of the `results` object.
349 | Various influence measures describing the regression model
350 | are computed with the `get_influence()` method.
351 | As we will not use the `fig` component returned
352 | as the first value from `subplots()`, we simply
353 | capture the second returned value in `ax` below.
354 | 
355 | ```{python}
356 | ax = subplots(figsize=(8,8))[1]
357 | ax.scatter(results.fittedvalues, results.resid)
358 | ax.set_xlabel('Fitted value')
359 | ax.set_ylabel('Residual')
360 | ax.axhline(0, c='k', ls='--');
361 | 
362 | ```
363 |  We add a horizontal line at 0 for reference using the
364 |  `ax.axhline()`   method, indicating
365 | it should be black (`c='k'`) and have a dashed linestyle (`ls='--'`).
366 | 
367 | On the basis of the residual plot, there is some evidence of non-linearity.
368 | Leverage statistics can be computed for any number of predictors using the
369 | `hat_matrix_diag` attribute of the value returned by the
370 | `get_influence()`  method.
371 | 
372 | ```{python}
373 | infl = results.get_influence()
374 | ax = subplots(figsize=(8,8))[1]
375 | ax.scatter(np.arange(X.shape[0]), infl.hat_matrix_diag)
376 | ax.set_xlabel('Index')
377 | ax.set_ylabel('Leverage')
378 | np.argmax(infl.hat_matrix_diag)
379 | 
380 | ```
381 |  The `np.argmax()`  function identifies the index of the largest element of an array, optionally computed over an axis of the array.
382 | In this case, we maximized over the entire array
383 | to determine which observation has the largest leverage statistic.
384 | 
385 | 
386 | ## Multiple Linear Regression
387 | In order to fit a multiple linear regression model using least squares, we again use
388 | the `ModelSpec()`  transform to construct the required
389 | model matrix and response. The arguments
390 | to `ModelSpec()` can be quite general, but in this case
391 | a list of column names suffice. We consider a fit here with
392 | the two variables `lstat` and `age`.
393 | 
394 | ```{python}
395 | X = MS(['lstat', 'age']).fit_transform(Boston)
396 | model1 = sm.OLS(y, X)
397 | results1 = model1.fit()
398 | summarize(results1)
399 | ```
400 | Notice how we have compacted the first line into a succinct expression describing the construction of `X`.
401 | 
402 | The  `Boston`   data set contains 12 variables, and so it would be cumbersome
403 | to have to type all of these in order to perform a regression using all of the predictors.
404 | Instead, we can use the following short-hand:\definelongblankMR{columns.drop()}{columns.slashslashdrop()}
405 | 
406 | ```{python}
407 | terms = Boston.columns.drop('medv')
408 | terms
409 | 
410 | ```
411 | 
412 | We can now fit the model with all the variables in `terms` using
413 | the same model matrix builder.
414 | 
415 | ```{python}
416 | X = MS(terms).fit_transform(Boston)
417 | model = sm.OLS(y, X)
418 | results = model.fit()
419 | summarize(results)
420 | 
421 | ```
422 | 
423 | What if we would like to perform a regression using all of the variables but one?  For
424 | example, in the above regression output,   `age`  has a high $p$-value.
425 | So we may wish to run a regression excluding this predictor.
426 | The following syntax results in a regression using all predictors except  `age`.
427 | 
428 | ```{python}
429 | minus_age = Boston.columns.drop(['medv', 'age']) 
430 | Xma = MS(minus_age).fit_transform(Boston)
431 | model1 = sm.OLS(y, Xma)
432 | summarize(model1.fit())
433 | 
434 | ```
435 | 
436 | ## Multivariate Goodness of Fit
437 | We can access the individual components of `results` by name
438 | (`dir(results)` shows us what is available). Hence
439 | `results.rsquared` gives us the $R^2$,
440 | and
441 | `np.sqrt(results.scale)` gives us the RSE.
442 | 
443 | Variance inflation factors (section 3.3.3) are sometimes useful
444 | to assess the effect of collinearity in the model matrix of a regression model.
445 | We will compute the VIFs in our multiple regression fit, and use the opportunity to introduce the idea of *list comprehension*.
446 | 
447 | ### List Comprehension
448 | Often we encounter a sequence of objects which we would like to transform
449 | for some other task. Below, we compute the VIF for each
450 | feature in our `X` matrix and produce a data frame
451 | whose index agrees with the columns of `X`.
452 | The notion of list comprehension can often make such
453 | a task easier.
454 | 
455 | List comprehensions are simple and powerful ways to form
456 | lists of `Python` objects. The language also supports
457 | dictionary and *generator* comprehension, though these are
458 | beyond our scope here. Let's look at an example. We compute the VIF for each of the variables
459 | in the model matrix `X`, using the function `variance_inflation_factor()`.
460 | 
461 | 
462 | ```{python}
463 | vals = [VIF(X, i)
464 |         for i in range(1, X.shape[1])]
465 | vif = pd.DataFrame({'vif':vals},
466 |                    index=X.columns[1:])
467 | vif
468 | 
469 | ```
470 | The function `VIF()` takes two arguments: a dataframe or array,
471 | and a variable column index. In the code above we call `VIF()` on the fly for all columns in `X`.  
472 | We have excluded column 0 above (the intercept), which is not of interest. In this case the VIFs are not that exciting.
473 | 
474 | The object `vals` above could have been constructed with the following for loop:
475 | 
476 | ```{python}
477 | vals = []
478 | for i in range(1, X.values.shape[1]):
479 |     vals.append(VIF(X.values, i))
480 | 
481 | ```
482 | List comprehension allows us to perform such repetitive operations in a more straightforward way.
483 | ## Interaction Terms
484 | It is easy to include interaction terms in a linear model using `ModelSpec()`.
485 | Including a tuple `("lstat","age")` tells the model
486 | matrix builder to include an interaction term between
487 |  `lstat`  and  `age`.
488 | 
489 | ```{python}
490 | X = MS(['lstat',
491 |         'age',
492 |         ('lstat', 'age')]).fit_transform(Boston)
493 | model2 = sm.OLS(y, X)
494 | summarize(model2.fit())
495 | 
496 | ```
497 | 
498 | 
499 | ## Non-linear Transformations of the Predictors
500 | The model matrix builder can include terms beyond
501 | just column names and interactions. For instance,
502 | the `poly()` function supplied in `ISLP` specifies that
503 | columns representing polynomial functions
504 | of its first argument are added to the model matrix.
505 | 
506 | ```{python}
507 | X = MS([poly('lstat', degree=2), 'age']).fit_transform(Boston)
508 | model3 = sm.OLS(y, X)
509 | results3 = model3.fit()
510 | summarize(results3)
511 | 
512 | ```
513 | The effectively zero *p*-value associated with the quadratic term
514 | (i.e. the third row above) suggests that it leads to an improved model.
515 | 
516 | By default, `poly()` creates a basis matrix for inclusion in the
517 | model matrix whose
518 | columns are *orthogonal polynomials*, which are designed for stable
519 | least squares computations. {Actually, `poly()` is a  wrapper for the workhorse and standalone  function `Poly()` that does the  work in building the model matrix.}
520 | Alternatively, had we included an argument
521 | `raw=True` in the above call to `poly()`, the basis matrix would consist simply of
522 | `lstat` and `lstat**2`. Since either of these bases
523 | represent quadratic polynomials, the fitted values  would not
524 | change in this case, just the polynomial coefficients.  Also by default, the columns
525 | created by `poly()` do not include an intercept column as
526 | that is automatically added by `MS()`.
527 | 
528 | We use the `anova_lm()` function to further quantify the extent to which the quadratic fit is
529 | superior to the linear fit.
530 | 
531 | ```{python}
532 | anova_lm(results1, results3)
533 | 
534 | ```
535 | Here `results1` represents the linear submodel containing
536 | predictors `lstat` and `age`,
537 | while `results3` corresponds to the larger model above  with a quadratic
538 | term in `lstat`.
539 | The `anova_lm()` function performs a hypothesis test
540 | comparing the two models. The null hypothesis is that the quadratic
541 | term in the bigger model is not needed, and the alternative hypothesis is that the
542 | bigger model is superior. Here the *F*-statistic is 177.28 and
543 | the associated *p*-value is zero.
544 | In this case the *F*-statistic is the square of the
545 | *t*-statistic for the quadratic term in the linear model summary
546 | for `results3` --- a consequence of the fact that these nested
547 | models differ by one degree of freedom.
548 | This provides very clear evidence that the quadratic polynomial in
549 | `lstat` improves the linear model.
550 | This is not surprising, since earlier we saw evidence for non-linearity in the relationship between `medv`
551 | and  `lstat`.
552 | 
553 | The function `anova_lm()` can take more than two nested models
554 | as input, in which case it compares every successive pair of models.
555 | That also explains why there are `NaN`s in the first row above, since
556 | there is no previous model with which to compare the first.
557 | 
558 | 
559 | ```{python}
560 | ax = subplots(figsize=(8,8))[1]
561 | ax.scatter(results3.fittedvalues, results3.resid)
562 | ax.set_xlabel('Fitted value')
563 | ax.set_ylabel('Residual')
564 | ax.axhline(0, c='k', ls='--');
565 | 
566 | ```
567 | We see that when the quadratic term is included in the model,
568 | there is little discernible pattern in the residuals.
569 | In order to create a cubic or higher-degree polynomial fit, we can simply change the degree argument
570 | to `poly()`.
571 | 
572 | 
573 | 
574 | ## Qualitative Predictors
575 | Here we use the  `Carseats`  data, which is included in the
576 | `ISLP` package. We will  attempt to predict `Sales`
577 | (child car seat sales) in 400 locations based on a number of
578 | predictors.
579 | 
580 | ```{python}
581 | Carseats = load_data('Carseats')
582 | Carseats.columns
583 | 
584 | ```
585 | The `Carseats`  
586 |  data includes qualitative predictors such as
587 |  `ShelveLoc`, an indicator of the quality of the shelving
588 |  location --- that is,
589 | the  space within a store in which the car seat is displayed. The predictor
590 |  `ShelveLoc`  takes on three possible values, `Bad`, `Medium`, and `Good`.
591 | Given a qualitative variable such as  `ShelveLoc`, `ModelSpec()` generates dummy
592 | variables automatically.
593 | These variables are often referred to as a *one-hot encoding* of the categorical
594 | feature. Their columns sum to one, so to avoid collinearity with an intercept, the first column is dropped. Below we see
595 | the column `ShelveLoc[Bad]` has been dropped, since `Bad` is the first level of `ShelveLoc`.
596 | Below we fit a multiple regression model that includes some interaction terms.
597 | 
598 | ```{python}
599 | allvars = list(Carseats.columns.drop('Sales'))
600 | y = Carseats['Sales']
601 | final = allvars + [('Income', 'Advertising'),
602 |                    ('Price', 'Age')]
603 | X = MS(final).fit_transform(Carseats)
604 | model = sm.OLS(y, X)
605 | summarize(model.fit())
606 | 
607 | ```
608 | In the first line above, we made `allvars` a list, so that we
609 | could add the interaction terms two lines down. 
610 | Our model-matrix builder has created a `ShelveLoc[Good]`
611 | dummy variable that takes on a value of 1 if the
612 | shelving location is good, and 0 otherwise. It has also created a `ShelveLoc[Medium]`
613 | dummy variable that equals 1 if the shelving location is medium, and 0 otherwise.
614 | A bad shelving location corresponds to a zero for each of the two dummy variables.
615 | The fact that the coefficient for `ShelveLoc[Good]` in the regression output is
616 | positive indicates that a good shelving location is associated with high sales (relative to a bad location).
617 | And `ShelveLoc[Medium]` has a smaller positive coefficient,
618 | indicating that a medium shelving location leads to higher sales than a bad
619 | shelving location, but lower sales than a good shelving location.
620 | 
621 | 
622 | 


--------------------------------------------------------------------------------
/Ch05-resample-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Cross-Validation and the Bootstrap
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch05-resample-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch05-resample-lab.ipynb)
 21 | 
 22 | 
 23 | In this lab, we explore the resampling techniques covered in this
 24 | chapter. Some of the commands in this lab may take a while to run on
 25 | your computer.
 26 | 
 27 | We again begin by placing most of our imports at this top level.
 28 | 
 29 | ```{python}
 30 | import numpy as np
 31 | import statsmodels.api as sm
 32 | from ISLP import load_data
 33 | from ISLP.models import (ModelSpec as MS,
 34 |                          summarize,
 35 |                          poly)
 36 | from sklearn.model_selection import train_test_split
 37 | 
 38 | ```
 39 | 
 40 | 
 41 | There are several new imports needed for this lab.
 42 | 
 43 | ```{python}
 44 | from functools import partial
 45 | from sklearn.model_selection import \
 46 |      (cross_validate,
 47 |       KFold,
 48 |       ShuffleSplit)
 49 | from sklearn.base import clone
 50 | from ISLP.models import sklearn_sm
 51 | 
 52 | ```
 53 | 
 54 | 
 55 | ## The Validation Set Approach
 56 | We explore the use of the validation set approach in order to estimate
 57 | the test error rates that result from fitting various linear models on
 58 | the  `Auto`  data set.
 59 | 
 60 | We use the function `train_test_split()` to split
 61 | the data into training and validation sets. As there are 392 observations,
 62 | we split into two equal sets of size 196 using the
 63 | argument `test_size=196`. It is generally a good idea to set a random seed
 64 | when performing operations like this that contain an
 65 | element of randomness, so that the results obtained can be reproduced
 66 | precisely at a later time. We set the random seed of the splitter
 67 | with the argument `random_state=0`. 
 68 | 
 69 | ```{python}
 70 | Auto = load_data('Auto')
 71 | Auto_train, Auto_valid = train_test_split(Auto,
 72 |                                          test_size=196,
 73 |                                          random_state=0)
 74 | 
 75 | ```
 76 | 
 77 | Now we can fit a linear regression using only the observations corresponding to the training set `Auto_train`.
 78 | 
 79 | ```{python}
 80 | hp_mm = MS(['horsepower'])
 81 | X_train = hp_mm.fit_transform(Auto_train)
 82 | y_train = Auto_train['mpg']
 83 | model = sm.OLS(y_train, X_train)
 84 | results = model.fit()
 85 | 
 86 | ```
 87 | 
 88 | We now use the `predict()` method of `results` evaluated on the model matrix for this model
 89 | created using the validation data set. We also calculate the validation MSE of our model.
 90 | 
 91 | ```{python}
 92 | X_valid = hp_mm.transform(Auto_valid)
 93 | y_valid = Auto_valid['mpg']
 94 | valid_pred = results.predict(X_valid)
 95 | np.mean((y_valid - valid_pred)**2)
 96 | 
 97 | ```
 98 | 
 99 | Hence our estimate for the validation MSE of  the linear regression
100 | fit is $23.62$.
101 | 
102 | We can also estimate the validation error for
103 | higher-degree polynomial regressions. We first provide a function `evalMSE()` that takes a model string as well
104 | as training and test sets and returns the MSE on the test set.
105 | 
106 | ```{python}
107 | def evalMSE(terms,
108 |             response,
109 |             train,
110 |             test):
111 | 
112 |    mm = MS(terms)
113 |    X_train = mm.fit_transform(train)
114 |    y_train = train[response]
115 | 
116 |    X_test = mm.transform(test)
117 |    y_test = test[response]
118 | 
119 |    results = sm.OLS(y_train, X_train).fit()
120 |    test_pred = results.predict(X_test)
121 | 
122 |    return np.mean((y_test - test_pred)**2)
123 | 
124 | ```
125 | 
126 | Let’s use this function to estimate the validation MSE
127 | using linear, quadratic and cubic fits. We use the `enumerate()`  function
128 | here, which gives both the values and indices of objects as one iterates
129 | over a for loop.
130 | 
131 | ```{python}
132 | MSE = np.zeros(3)
133 | for idx, degree in enumerate(range(1, 4)):
134 |     MSE[idx] = evalMSE([poly('horsepower', degree)],
135 |                        'mpg',
136 |                        Auto_train,
137 |                        Auto_valid)
138 | MSE
139 | 
140 | ```
141 | 
142 | These error rates are $23.62, 18.76$, and $18.80$, respectively. If we
143 | choose a different training/validation split instead, then we
144 | can expect somewhat different errors on the validation set.
145 | 
146 | ```{python}
147 | Auto_train, Auto_valid = train_test_split(Auto,
148 |                                           test_size=196,
149 |                                           random_state=3)
150 | MSE = np.zeros(3)
151 | for idx, degree in enumerate(range(1, 4)):
152 |     MSE[idx] = evalMSE([poly('horsepower', degree)],
153 |                        'mpg',
154 |                        Auto_train,
155 |                        Auto_valid)
156 | MSE
157 | ```
158 | 
159 | Using this split of the observations into a training set and a validation set,
160 | we find that the validation set error rates for the models with linear, quadratic, and cubic terms are $20.76$, $16.95$, and $16.97$, respectively.
161 | 
162 | These results are consistent with our previous findings: a model that
163 | predicts `mpg` using a quadratic function of `horsepower`
164 | performs better than a model that involves only a linear function of
165 | `horsepower`, and there is no evidence of an improvement in using a cubic function of `horsepower`.
166 | 
167 | 
168 | ## Cross-Validation
169 | In theory, the cross-validation estimate can be computed for any generalized
170 | linear model.  {}
171 | In practice, however, the simplest way to cross-validate in
172 | Python is to use `sklearn`, which has a different interface or API
173 | than `statsmodels`, the code we have been using to fit GLMs.
174 | 
175 | This is a problem which often confronts data scientists: "I have a function to do task $A$, and need to feed it into something that performs task $B$, so that I can compute $B(A(D))$, where $D$ is my data." When $A$ and $B$ don’t naturally speak to each other, this
176 | requires the use of a *wrapper*.
177 | In the `ISLP` package,
178 | we provide 
179 | a wrapper, `sklearn_sm()`, that enables us to easily use the cross-validation tools of `sklearn` with
180 | models fit by `statsmodels`.
181 | 
182 | The class `sklearn_sm()` 
183 | has  as its first argument
184 | a model from `statsmodels`. It can take two additional
185 | optional arguments: `model_str` which can be
186 | used to specify a formula, and `model_args` which should
187 | be a dictionary of additional arguments used when fitting
188 | the model. For example, to fit a logistic regression model
189 | we have to specify a `family` argument. This
190 | is passed as `model_args={'family':sm.families.Binomial()}`.
191 | 
192 | Here is our wrapper in action:
193 | 
194 | ```{python}
195 | hp_model = sklearn_sm(sm.OLS,
196 |                       MS(['horsepower']))
197 | X, Y = Auto.drop(columns=['mpg']), Auto['mpg']
198 | cv_results = cross_validate(hp_model,
199 |                             X,
200 |                             Y,
201 |                             cv=Auto.shape[0])
202 | cv_err = np.mean(cv_results['test_score'])
203 | cv_err
204 | 
205 | ```
206 | The arguments to `cross_validate()` are as follows: an
207 | object with the appropriate `fit()`, `predict()`,
208 | and `score()` methods,  an
209 | array of features `X` and a response `Y`. 
210 | We also included an additional argument `cv` to `cross_validate()`; specifying an integer
211 | $k$ results in $k$-fold cross-validation. We have provided a value 
212 | corresponding to the total number of observations, which results in
213 | leave-one-out cross-validation (LOOCV). The `cross_validate()`  function produces a dictionary with several components;
214 | we simply want the cross-validated test score here (MSE), which is estimated to be 24.23.
215 | 
216 | 
217 | We can repeat this procedure for increasingly complex polynomial fits.
218 | To automate the process, we again
219 | use a for loop which iteratively fits polynomial
220 | regressions of degree 1 to 5, computes the
221 | associated cross-validation error, and stores it in the $i$th element
222 | of the vector `cv_error`. The variable `d` in the for loop
223 | corresponds to the degree of the polynomial. We begin by initializing the
224 | vector. This command may take a couple of seconds to run.
225 | 
226 | ```{python}
227 | cv_error = np.zeros(5)
228 | H = np.array(Auto['horsepower'])
229 | M = sklearn_sm(sm.OLS)
230 | for i, d in enumerate(range(1,6)):
231 |     X = np.power.outer(H, np.arange(d+1))
232 |     M_CV = cross_validate(M,
233 |                           X,
234 |                           Y,
235 |                           cv=Auto.shape[0])
236 |     cv_error[i] = np.mean(M_CV['test_score'])
237 | cv_error
238 | 
239 | ```
240 | As in Figure 5.4, we see a sharp drop in the estimated test MSE between the linear and
241 | quadratic fits, but then no clear improvement from using higher-degree polynomials.
242 | 
243 | Above we introduced the `outer()`  method of the `np.power()`
244 | function.  The `outer()` method is applied to an operation
245 | that has two arguments, such as `add()`, `min()`, or
246 | `power()`.
247 | It has two arrays as
248 | arguments, and then forms a larger
249 | array where the operation is applied to each pair of elements of the
250 | two arrays. 
251 | 
252 | ```{python}
253 | A = np.array([3, 5, 9])
254 | B = np.array([2, 4])
255 | np.add.outer(A, B)
256 | 
257 | ```
258 | 
259 | In the CV example above, we used $k=n$, but of course we can also use $k<n$. The code is very similar
260 | to the above (and is significantly faster). Here we use `KFold()` to partition the data into $k=10$ random groups. We use `random_state` to set a random seed and initialize a vector `cv_error` in which we will store the CV errors corresponding to the
261 | polynomial fits of degrees one to five.
262 | 
263 | ```{python}
264 | cv_error = np.zeros(5)
265 | cv = KFold(n_splits=10,
266 |            shuffle=True,
267 |            random_state=0) # use same splits for each degree
268 | for i, d in enumerate(range(1,6)):
269 |     X = np.power.outer(H, np.arange(d+1))
270 |     M_CV = cross_validate(M,
271 |                           X,
272 |                           Y,
273 |                           cv=cv)
274 |     cv_error[i] = np.mean(M_CV['test_score'])
275 | cv_error
276 | 
277 | ```
278 | Notice that the computation time is much shorter than that of LOOCV.
279 | (In principle, the computation time for LOOCV for a least squares
280 | linear model should be faster than for $k$-fold CV, due to the
281 | availability of the formula~(5.2)  for LOOCV;
282 | however, the generic `cross_validate()`  function does not make
283 | use of this formula.)  We still see little evidence that using cubic
284 | or higher-degree polynomial terms leads to a lower test error than simply
285 | using a quadratic fit.
286 | 
287 | 
288 | The `cross_validate()` function is flexible and can take
289 | different splitting mechanisms as an argument. For instance, one can use the `ShuffleSplit()`
290 | function to implement
291 | the validation set approach just as easily as $k$-fold cross-validation.
292 | 
293 | ```{python}
294 | validation = ShuffleSplit(n_splits=1,
295 |                           test_size=196,
296 |                           random_state=0)
297 | results = cross_validate(hp_model,
298 |                          Auto.drop(['mpg'], axis=1),
299 |                          Auto['mpg'],
300 |                          cv=validation);
301 | results['test_score']
302 | 
303 | ```
304 | 
305 | 
306 | One can estimate the variability in the test error by running the following:
307 | 
308 | ```{python}
309 | validation = ShuffleSplit(n_splits=10,
310 |                           test_size=196,
311 |                           random_state=0)
312 | results = cross_validate(hp_model,
313 |                          Auto.drop(['mpg'], axis=1),
314 |                          Auto['mpg'],
315 |                          cv=validation)
316 | results['test_score'].mean(), results['test_score'].std()
317 | 
318 | ```
319 | 
320 | Note that this standard deviation is not a valid estimate of the
321 | sampling variability of the mean test score or the individual scores, since the randomly-selected training
322 | samples overlap and hence introduce correlations. But it does give an
323 | idea of the Monte Carlo variation
324 | incurred by picking different random folds.
325 | 
326 | ## The Bootstrap
327 | We illustrate the use of the bootstrap in the simple example
328 |  {of Section 5.2,}  as well as on an example involving
329 | estimating the accuracy of the linear regression model on the  `Auto`
330 | data set.
331 | ### Estimating the Accuracy of a Statistic of Interest
332 | One of the great advantages of the bootstrap approach is that it can
333 | be applied in almost all situations. No complicated mathematical
334 | calculations are required. While there are several implementations
335 | of the bootstrap in Python, its use for estimating
336 | standard error is simple enough that we write our own function
337 | below for the case when our data is stored
338 | in a dataframe.
339 | 
340 | To illustrate the bootstrap, we
341 | start with a simple example.
342 | The  `Portfolio`  data set in the `ISLP` package is described
343 | in Section 5.2. The goal is to estimate the
344 | sampling variance of the parameter $\alpha$ given in formula~(5.7).  We will
345 | create a function
346 | `alpha_func()`, which takes as input a dataframe `D` assumed
347 | to have columns `X` and `Y`, as well as a
348 | vector `idx` indicating which observations should be used to
349 | estimate 
350 | $\alpha$. The function then outputs the estimate for $\alpha$ based on
351 | the selected observations.
352 | 
353 | ```{python}
354 | Portfolio = load_data('Portfolio')
355 | def alpha_func(D, idx):
356 |    cov_ = np.cov(D[['X','Y']].loc[idx], rowvar=False)
357 |    return ((cov_[1,1] - cov_[0,1]) /
358 |            (cov_[0,0]+cov_[1,1]-2*cov_[0,1]))
359 | 
360 | ```
361 | This function returns an estimate for $\alpha$
362 | based on applying the minimum
363 |     variance formula (5.7) to the observations indexed by
364 | the argument `idx`.  For instance, the following command
365 | estimates $\alpha$ using all 100 observations.
366 | 
367 | ```{python}
368 | alpha_func(Portfolio, range(100))
369 | ```
370 | 
371 | Next we randomly select
372 | 100 observations from `range(100)`, with replacement. This is equivalent
373 | to constructing a new bootstrap data set and recomputing $\hat{\alpha}$
374 | based on the new data set.
375 | 
376 | ```{python}
377 | rng = np.random.default_rng(0)
378 | alpha_func(Portfolio,
379 |            rng.choice(100,
380 |                       100,
381 |                       replace=True))
382 | ```
383 | 
384 | 
385 | This process can be generalized to create a simple function `boot_SE()` for
386 | computing the bootstrap standard error for arbitrary
387 | functions that take only a data frame as an argument.
388 | 
389 | ```{python}
390 | def boot_SE(func,
391 |             D,
392 |             n=None,
393 |             B=1000,
394 |             seed=0):
395 |     rng = np.random.default_rng(seed)
396 |     first_, second_ = 0, 0
397 |     n = n or D.shape[0]
398 |     for _ in range(B):
399 |         idx = rng.choice(D.index,
400 |                          n,
401 |                          replace=True)
402 |         value = func(D, idx)
403 |         first_ += value
404 |         second_ += value**2
405 |     return np.sqrt(second_ / B - (first_ / B)**2)
406 | ```
407 | Notice the use of `_` as a loop variable in `for _ in range(B)`. This is often used if the value of the counter is
408 | unimportant and simply makes sure  the loop is executed `B` times.
409 | 
410 | Let’s use our function to evaluate the accuracy of our
411 | estimate of $\alpha$ using $B=1{,}000$ bootstrap replications. 
412 | 
413 | ```{python}
414 | alpha_SE = boot_SE(alpha_func,
415 |                    Portfolio,
416 |                    B=1000,
417 |                    seed=0)
418 | alpha_SE
419 | 
420 | ```
421 | 
422 | The final output shows that the bootstrap estimate for ${\rm SE}(\hat{\alpha})$ is $0.0912$.
423 | 
424 | ### Estimating the Accuracy of a Linear Regression Model
425 | The bootstrap approach can be used to assess the variability of the
426 | coefficient estimates and predictions from a statistical learning
427 | method. Here we use the bootstrap approach in order to assess the
428 | variability of the estimates for $\beta_0$ and $\beta_1$, the
429 | intercept and slope terms for the linear regression model that uses
430 | `horsepower` to predict `mpg` in the  `Auto`  data set. We
431 | will compare the estimates obtained using the bootstrap to those
432 | obtained using the formulas for ${\rm SE}(\hat{\beta}_0)$ and
433 | ${\rm SE}(\hat{\beta}_1)$ described in Section 3.1.2.
434 | 
435 | To use our `boot_SE()` function, we must write a function (its
436 | first argument)
437 | that takes a data frame `D` and indices `idx`
438 | as its only arguments. But here we want to bootstrap a specific
439 | regression model, specified by a model formula and data. We show how
440 | to do this in a few simple steps.
441 | 
442 | We start by writing a generic
443 | function `boot_OLS()` for bootstrapping a regression model that takes a formula to
444 | define the corresponding regression. We use the `clone()` function to
445 | make a copy of the formula that can be refit to the new dataframe. This means
446 | that any derived features such as those defined by `poly()`
447 | (which we will see shortly),
448 | will be re-fit on the resampled data frame.
449 | 
450 | ```{python}
451 | def boot_OLS(model_matrix, response, D, idx):
452 |     D_ = D.loc[idx]
453 |     Y_ = D_[response]
454 |     X_ = clone(model_matrix).fit_transform(D_)
455 |     return sm.OLS(Y_, X_).fit().params
456 | ```
457 | This is not quite what is needed as the first argument to
458 | `boot_SE()`. The first two arguments which specify the model will not change in the
459 | bootstrap process, and we would like to *freeze* them.   The
460 | function `partial()` from the `functools` module  does precisely this: it takes a function
461 | as an argument, and freezes some of its arguments, starting from the
462 | left. We use it to freeze the first two model-formula arguments of `boot_OLS()`.
463 | 
464 | ```{python}
465 | hp_func = partial(boot_OLS, MS(['horsepower']), 'mpg')
466 | 
467 | ```
468 | Typing `hp_func?` will show that it has two arguments `D`
469 | and `idx` --- it is a version of `boot_OLS()` with the first
470 | two arguments frozen --- and hence is ideal as the first argument for `boot_SE()`.
471 | 
472 | The `hp_func()` function can now be used in order to create
473 | bootstrap estimates for the intercept and slope terms by randomly
474 | sampling from among the observations with replacement. We first
475 | demonstrate its utility on 10 bootstrap samples.
476 | 
477 | ```{python}
478 | rng = np.random.default_rng(0)
479 | np.array([hp_func(Auto,
480 |           rng.choice(Auto.index,
481 |                      392,
482 |                      replace=True)) for _ in range(10)])
483 | 
484 | ```
485 | Next, we use the `boot_SE()` {}  function to compute the standard
486 | errors of 1,000 bootstrap estimates for the intercept and slope terms.
487 | 
488 | ```{python}
489 | hp_se = boot_SE(hp_func,
490 |                 Auto,
491 |                 B=1000,
492 |                 seed=10)
493 | hp_se
494 | 
495 | ```
496 | 
497 | 
498 | This indicates that the bootstrap estimate for ${\rm SE}(\hat{\beta}_0)$ is
499 | 0.85, and that the bootstrap
500 | estimate for ${\rm SE}(\hat{\beta}_1)$ is
501 | 0.0074.  As discussed in
502 | Section 3.1.2, standard formulas can be used to compute
503 | the standard errors for the regression coefficients in a linear
504 | model. These can be obtained using the `summarize()`  function
505 | from `ISLP.sm`.
506 | 
507 | ```{python}
508 | hp_model.fit(Auto, Auto['mpg'])
509 | model_se = summarize(hp_model.results_)['std err']
510 | model_se
511 | 
512 | ```
513 | 
514 | 
515 | The standard error estimates for $\hat{\beta}_0$ and $\hat{\beta}_1$
516 | obtained using the formulas  from Section 3.1.2  are
517 | 0.717 for the
518 | intercept and
519 | 0.006 for the
520 | slope. Interestingly, these are somewhat different from the estimates
521 | obtained using the bootstrap.  Does this indicate a problem with the
522 | bootstrap? In fact, it suggests the opposite.  Recall that the
523 | standard formulas given in
524 |  {Equation 3.8 on page 75}
525 | rely on certain assumptions. For example,
526 | they depend on the unknown parameter $\sigma^2$, the noise
527 | variance. We then estimate $\sigma^2$ using the RSS. Now although the
528 | formulas for the standard errors do not rely on the linear model being
529 | correct, the estimate for $\sigma^2$ does.  We see
530 |  {in Figure 3.8 on page 99}  that there is
531 | a non-linear relationship in the data, and so the residuals from a
532 | linear fit will be inflated, and so will $\hat{\sigma}^2$.  Secondly,
533 | the standard formulas assume (somewhat unrealistically) that the $x_i$
534 | are fixed, and all the variability comes from the variation in the
535 | errors $\epsilon_i$.  The bootstrap approach does not rely on any of
536 | these assumptions, and so it is likely giving a more accurate estimate
537 | of the standard errors of $\hat{\beta}_0$ and $\hat{\beta}_1$ than
538 | the results from `sm.OLS`.
539 | 
540 | Below we compute the bootstrap standard error estimates and the
541 | standard linear regression estimates that result from fitting the
542 | quadratic model to the data. Since this model provides a good fit to
543 | the data (Figure 3.8), there is now a better
544 | correspondence between the bootstrap estimates and the standard
545 | estimates of ${\rm SE}(\hat{\beta}_0)$, ${\rm SE}(\hat{\beta}_1)$ and
546 | ${\rm SE}(\hat{\beta}_2)$.
547 | 
548 | ```{python}
549 | quad_model = MS([poly('horsepower', 2, raw=True)])
550 | quad_func = partial(boot_OLS,
551 |                     quad_model,
552 |                     'mpg')
553 | boot_SE(quad_func, Auto, B=1000)
554 | 
555 | ```
556 | 
557 | We  compare the results to the standard errors computed using `sm.OLS()`.
558 | 
559 | ```{python}
560 | M = sm.OLS(Auto['mpg'],
561 |            quad_model.fit_transform(Auto))
562 | summarize(M.fit())['std err']
563 | 
564 | ```
565 | 
566 | 
567 | 
568 | 


--------------------------------------------------------------------------------
/Ch08-baggboost-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Tree-Based Methods
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch08-baggboost-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch08-baggboost-lab.ipynb)
 21 | 
 22 | 
 23 | We import some of our usual libraries at this top
 24 | level.
 25 | 
 26 | ```{python}
 27 | import numpy as np
 28 | import pandas as pd
 29 | from matplotlib.pyplot import subplots
 30 | import sklearn.model_selection as skm
 31 | from ISLP import load_data, confusion_table
 32 | from ISLP.models import ModelSpec as MS
 33 | 
 34 | ```
 35 | We also  collect the new imports
 36 | needed for this lab.
 37 | 
 38 | ```{python}
 39 | from sklearn.tree import (DecisionTreeClassifier as DTC,
 40 |                           DecisionTreeRegressor as DTR,
 41 |                           plot_tree,
 42 |                           export_text)
 43 | from sklearn.metrics import (accuracy_score,
 44 |                              log_loss)
 45 | from sklearn.ensemble import \
 46 |      (RandomForestRegressor as RF,
 47 |       GradientBoostingRegressor as GBR)
 48 | from ISLP.bart import BART
 49 | 
 50 | ```
 51 | 
 52 | 
 53 | ## Fitting Classification Trees
 54 | 
 55 | 
 56 | We first use classification trees to analyze the  `Carseats`  data set.
 57 | In these data, `Sales` is a continuous variable, and so we begin
 58 | by recoding it as a binary variable. We use the `where()` 
 59 | function to create a variable, called `High`, which takes on a
 60 | value of `Yes` if the `Sales` variable exceeds 8, and takes
 61 | on a value of `No` otherwise.
 62 | 
 63 | ```{python}
 64 | Carseats = load_data('Carseats')
 65 | High = np.where(Carseats.Sales > 8,
 66 |                 "Yes",
 67 |                 "No")
 68 | 
 69 | ```
 70 | 
 71 | We now use `DecisionTreeClassifier()`  to fit a classification tree in
 72 | order to predict `High` using all variables but `Sales`.
 73 | To do so, we must form a model matrix as we did when fitting regression
 74 | models.  
 75 | 
 76 | ```{python}
 77 | model = MS(Carseats.columns.drop('Sales'), intercept=False)
 78 | D = model.fit_transform(Carseats)
 79 | feature_names = list(D.columns)
 80 | X = np.asarray(D)
 81 | 
 82 | ```
 83 | We have converted `D` from a data frame to an array `X`, which is needed in some of the analysis below. We also need the `feature_names` for annotating our plots later.
 84 | 
 85 | There are several options needed to specify the  classifier,
 86 | such as `max_depth` (how deep to grow the tree), `min_samples_split`
 87 | (minimum number of observations in a node to be eligible for splitting)
 88 | and `criterion` (whether to use Gini or cross-entropy as the split criterion).
 89 | We also set `random_state` for reproducibility; ties in the split criterion are broken at random.
 90 | 
 91 | ```{python}
 92 | clf = DTC(criterion='entropy',
 93 |           max_depth=3,
 94 |           random_state=0)        
 95 | clf.fit(X, High)
 96 | 
 97 | ```
 98 | 
 99 | 
100 | In our discussion of qualitative features in Section 3.3,
101 | we noted that for a linear regression model such a feature could be
102 | represented by including a matrix of dummy variables (one-hot-encoding) in the model
103 | matrix, using the formula notation of `statsmodels`.
104 | As mentioned in Section 8.1, there is a more
105 | natural way to handle qualitative features when building a decision
106 | tree, that does not require such dummy variables; each split amounts to partitioning the levels into two groups.
107 | However, 
108 | the `sklearn` implementation of decision trees does not take
109 | advantage of this approach; instead it simply treats the one-hot-encoded levels as separate variables.
110 | 
111 | ```{python}
112 | accuracy_score(High, clf.predict(X))
113 | 
114 | ```
115 | 
116 | 
117 | With only the default arguments, the training error rate is
118 | 21%.
119 | For classification trees, we can
120 | access the value of the deviance using `log_loss()`,
121 | \begin{equation*}
122 | \begin{split}
123 | -2 \sum_m \sum_k n_{mk} \log \hat{p}_{mk},
124 | \end{split}
125 | \end{equation*}
126 | where $n_{mk}$ is the number of observations in the $m$th terminal
127 | node that belong to the $k$th class.
128 | 
129 | ```{python}
130 | resid_dev = np.sum(log_loss(High, clf.predict_proba(X)))
131 | resid_dev
132 | 
133 | ```
134 | 
135 | This is closely related to the *entropy*, defined in (8.7).
136 | A small deviance indicates a
137 | tree that provides a good fit to the (training) data.
138 |  
139 | One of the most attractive properties of trees is that they can
140 | be graphically displayed. Here we use the `plot()`  function
141 | to display the tree structure.
142 | 
143 | ```{python}
144 | ax = subplots(figsize=(12,12))[1]
145 | plot_tree(clf,
146 |           feature_names=feature_names,
147 |           ax=ax);
148 | 
149 | ```
150 | The most important indicator of `Sales` appears to be `ShelveLoc`.
151 | 
152 | We can see a text representation of the tree using
153 | `export_text()`, which displays the split
154 | criterion (e.g. `Price <= 92.5`) for each branch.
155 | For leaf nodes it shows the overall prediction  
156 | (`Yes` or `No`). 
157 |  We can also see the number of observations in that
158 | leaf that take on values of `Yes` and `No` by specifying  `show_weights=True`.
159 | 
160 | ```{python}
161 | print(export_text(clf,
162 |                   feature_names=feature_names,
163 |                   show_weights=True))
164 | 
165 | ```
166 | 
167 | In order to properly evaluate the performance of a classification tree
168 | on these data, we must estimate the test error rather than simply
169 | computing the training error. We split the observations into a
170 | training set and a test set, build the tree using the training set,
171 | and evaluate its performance on the test data. This pattern is
172 | similar to that in Chapter 6, with the linear models
173 | replaced here by decision trees --- the code for validation
174 | is almost identical. This approach leads to correct predictions
175 | for 68.5% of the locations in the test data set.
176 | 
177 | ```{python}
178 | validation = skm.ShuffleSplit(n_splits=1,
179 |                               test_size=200,
180 |                               random_state=0)
181 | results = skm.cross_validate(clf,
182 |                              D,
183 |                              High,
184 |                              cv=validation)
185 | results['test_score']
186 | 
187 | ```
188 |     
189 | 
190 | 
191 | Next, we consider whether pruning the tree might lead to improved
192 | classification performance. We first split the data into a training and
193 | test set. We will use cross-validation to prune the tree on the training
194 | set, and then evaluate the performance of the pruned tree on the test
195 | set.
196 | 
197 | ```{python}
198 | (X_train,
199 |  X_test,
200 |  High_train,
201 |  High_test) = skm.train_test_split(X,
202 |                                    High,
203 |                                    test_size=0.5,
204 |                                    random_state=0)
205 |                                    
206 | ```
207 | We first refit the full tree on the training set; here we do not set a `max_depth` parameter, since we will learn that through cross-validation.
208 | 
209 | 
210 | ```{python}
211 | clf = DTC(criterion='entropy', random_state=0)
212 | clf.fit(X_train, High_train)
213 | accuracy_score(High_test, clf.predict(X_test))
214 | 
215 | ```
216 | Next we use the `cost_complexity_pruning_path()` method of
217 | `clf` to extract cost-complexity values. 
218 | 
219 | ```{python}
220 | ccp_path = clf.cost_complexity_pruning_path(X_train, High_train)
221 | kfold = skm.KFold(10,
222 |                   random_state=1,
223 |                   shuffle=True)
224 | 
225 | ```
226 | This yields a set of impurities and $\alpha$ values
227 | from which we can extract an optimal one by cross-validation.
228 | 
229 | ```{python}
230 | grid = skm.GridSearchCV(clf,
231 |                         {'ccp_alpha': ccp_path.ccp_alphas},
232 |                         refit=True,
233 |                         cv=kfold,
234 |                         scoring='accuracy')
235 | grid.fit(X_train, High_train)
236 | grid.best_score_
237 | 
238 | ```
239 | Let’s take a look at the pruned tree.
240 | 
241 | ```{python}
242 | ax = subplots(figsize=(12, 12))[1]
243 | best_ = grid.best_estimator_
244 | plot_tree(best_,
245 |           feature_names=feature_names,
246 |           ax=ax);
247 | 
248 | ```
249 | This is quite a bushy tree. We could count the leaves, or query
250 | `best_` instead.
251 | 
252 | ```{python}
253 | best_.tree_.n_leaves
254 | 
255 | ```
256 | The tree with 30 terminal
257 | nodes results in the lowest cross-validation error rate, with an accuracy of
258 | 68.5%. How well does this pruned tree perform on the test data set? Once
259 | again, we apply the `predict()`  function.
260 | 
261 | ```{python}
262 | print(accuracy_score(High_test,
263 |                      best_.predict(X_test)))
264 | confusion = confusion_table(best_.predict(X_test),
265 |                             High_test)
266 | confusion
267 | 
268 | ```
269 | 
270 | 
271 | Now 72.0% of the test observations are correctly classified, which is slightly worse than the error for the full tree (with 35 leaves). So cross-validation has not helped us much here; it only pruned off 5 leaves, at a cost of a slightly worse error. These results would change if we were to change the random number seeds above; even though cross-validation gives an unbiased approach to model selection, it does have variance.
272 | 
273 |   
274 | 
275 | 
276 | ## Fitting Regression Trees
277 | Here we fit a regression tree to the  `Boston`  data set. The
278 | steps are similar to those for classification trees.
279 | 
280 | ```{python}
281 | Boston = load_data("Boston")
282 | model = MS(Boston.columns.drop('medv'), intercept=False)
283 | D = model.fit_transform(Boston)
284 | feature_names = list(D.columns)
285 | X = np.asarray(D)
286 | 
287 | ```
288 | 
289 | First, we split the data into training and test sets, and fit the tree
290 | to the training data. Here we use 30% of the data for the test set.
291 | 
292 | 
293 | ```{python}
294 | (X_train,
295 |  X_test,
296 |  y_train,
297 |  y_test) = skm.train_test_split(X,
298 |                                 Boston['medv'],
299 |                                 test_size=0.3,
300 |                                 random_state=0)
301 | 
302 | ```
303 | 
304 | Having formed  our training  and test data sets, we fit the regression tree.
305 | 
306 | ```{python}
307 | reg = DTR(max_depth=3)
308 | reg.fit(X_train, y_train)
309 | ax = subplots(figsize=(12,12))[1]
310 | plot_tree(reg,
311 |           feature_names=feature_names,
312 |           ax=ax);
313 | 
314 | ```
315 | 
316 | The variable `lstat` measures the percentage of individuals with
317 | lower socioeconomic status. The tree indicates that lower
318 | values of `lstat` correspond to more expensive houses.
319 | The tree predicts a median house price of $12,042 for small-sized homes (`rm < 6.8`), in
320 | suburbs in which residents have low socioeconomic status (`lstat  > 14.4`) and the crime-rate is moderate (`crim > 5.8`).
321 | 
322 | 
323 | Now we use the cross-validation function to see whether pruning
324 | the tree will improve performance.
325 | 
326 | ```{python}
327 | ccp_path = reg.cost_complexity_pruning_path(X_train, y_train)
328 | kfold = skm.KFold(5,
329 |                   shuffle=True,
330 |                   random_state=10)
331 | grid = skm.GridSearchCV(reg,
332 |                         {'ccp_alpha': ccp_path.ccp_alphas},
333 |                         refit=True,
334 |                         cv=kfold,
335 |                         scoring='neg_mean_squared_error')
336 | G = grid.fit(X_train, y_train)
337 | 
338 | ```
339 | 
340 | In keeping with the cross-validation results, we use the pruned tree
341 | to make predictions on the test set.
342 | 
343 | ```{python}
344 | best_ = grid.best_estimator_
345 | np.mean((y_test - best_.predict(X_test))**2)
346 | 
347 | ```
348 | 
349 | 
350 | In other words, the test set MSE associated with the regression tree
351 | is 28.07.  The square root of
352 | the MSE is therefore around
353 | 5.30,
354 | indicating that this model leads to test predictions that are within around
355 | $5300
356 | of the true median home value for the suburb.
357 | 
358 | Let’s plot the best tree to see how interpretable it is.
359 | 
360 | ```{python}
361 | ax = subplots(figsize=(12,12))[1]
362 | plot_tree(G.best_estimator_,
363 |           feature_names=feature_names,
364 |           ax=ax);
365 | 
366 | ```
367 |  
368 |  
369 | 
370 | 
371 | ## Bagging and Random Forests
372 | 
373 | 
374 | Here we apply bagging and random forests to the `Boston` data, using
375 | the `RandomForestRegressor()` from the `sklearn.ensemble` package. Recall
376 | that bagging is simply a special case of a random forest with
377 | $m=p$. Therefore, the `RandomForestRegressor()`  function can be used to
378 | perform both bagging and random forests. We start with bagging.
379 | 
380 | ```{python}
381 | bag_boston = RF(max_features=X_train.shape[1], random_state=0)
382 | bag_boston.fit(X_train, y_train)
383 | 
384 | ```
385 | 
386 | 
387 | The argument `max_features` indicates that all 12 predictors should
388 | be considered for each split of the tree --- in other words, that
389 | bagging should be done.  How well does this bagged model perform on
390 | the test set?
391 | 
392 | ```{python}
393 | ax = subplots(figsize=(8,8))[1]
394 | y_hat_bag = bag_boston.predict(X_test)
395 | ax.scatter(y_hat_bag, y_test)
396 | np.mean((y_test - y_hat_bag)**2)
397 | 
398 | ```
399 | 
400 | The test set MSE associated with the bagged regression tree is
401 | 14.63, about half that obtained using an optimally-pruned single
402 | tree.  We could change the number of trees grown from the default of
403 | 100 by
404 | using the `n_estimators` argument:
405 | 
406 | ```{python}
407 | bag_boston = RF(max_features=X_train.shape[1],
408 |                 n_estimators=500,
409 |                 random_state=0).fit(X_train, y_train)
410 | y_hat_bag = bag_boston.predict(X_test)
411 | np.mean((y_test - y_hat_bag)**2)
412 | ```
413 | There is not much change. Bagging and random forests cannot overfit by
414 | increasing the number of trees, but can underfit if the number is too small.
415 | 
416 | Growing a random forest proceeds in exactly the same way, except that
417 | we use a smaller value of the `max_features` argument. By default,
418 | `RandomForestRegressor()`  uses $p$ variables when building a random
419 | forest of regression trees (i.e. it defaults to bagging), and `RandomForestClassifier()` uses
420 | $\sqrt{p}$ variables when building a
421 | random forest of classification trees. Here we use `max_features=6`.
422 | 
423 | ```{python}
424 | RF_boston = RF(max_features=6,
425 |                random_state=0).fit(X_train, y_train)
426 | y_hat_RF = RF_boston.predict(X_test)
427 | np.mean((y_test - y_hat_RF)**2)
428 | 
429 | ```
430 | 
431 | 
432 | The test set MSE is 20.04;
433 | this indicates that random forests did somewhat worse than bagging
434 | in this case. Extracting the `feature_importances_` values from the fitted model, we can view the
435 | importance of each variable.
436 | 
437 | ```{python}
438 | feature_imp = pd.DataFrame(
439 |     {'importance':RF_boston.feature_importances_},
440 |     index=feature_names)
441 | feature_imp.sort_values(by='importance', ascending=False)
442 | ```
443 |  This
444 | is a relative measure of the total decrease in node impurity that results from
445 | splits over that variable, averaged over all trees (this was plotted in Figure 8.9 for a model fit to the `Heart` data). 
446 | 
447 | The results indicate that across all of the trees considered in the
448 | random forest, the wealth level of the community (`lstat`) and the
449 | house size (`rm`) are by far the two most important variables.
450 | 
451 |   
452 | 
453 | 
454 | ## Boosting
455 | 
456 | 
457 | Here we use `GradientBoostingRegressor()` from `sklearn.ensemble`
458 | to fit boosted regression trees to the `Boston` data
459 | set. For classification we would  use `GradientBoostingClassifier()`.
460 | The argument `n_estimators=5000`
461 | indicates that we want 5000 trees, and the option
462 | `max_depth=3` limits the depth of each tree. The
463 | argument `learning_rate` is the $\lambda$
464 | mentioned earlier in the description of boosting.
465 | 
466 | ```{python}
467 | boost_boston = GBR(n_estimators=5000,
468 |                    learning_rate=0.001,
469 |                    max_depth=3,
470 |                    random_state=0)
471 | boost_boston.fit(X_train, y_train)
472 | 
473 | ```
474 | 
475 | We can see how the training error decreases with the `train_score_` attribute.
476 | To get an idea of how the test error decreases we can use the
477 | `staged_predict()` method to get the predicted values along the path.
478 | 
479 | ```{python}
480 | test_error = np.zeros_like(boost_boston.train_score_)
481 | for idx, y_ in enumerate(boost_boston.staged_predict(X_test)):
482 |    test_error[idx] = np.mean((y_test - y_)**2)
483 | 
484 | plot_idx = np.arange(boost_boston.train_score_.shape[0])
485 | ax = subplots(figsize=(8,8))[1]
486 | ax.plot(plot_idx,
487 |         boost_boston.train_score_,
488 |         'b',
489 |         label='Training')
490 | ax.plot(plot_idx,
491 |         test_error,
492 |         'r',
493 |         label='Test')
494 | ax.legend();
495 | 
496 | ```
497 | 
498 | We now use the boosted model to predict `medv` on the test set:
499 | 
500 | ```{python}
501 | y_hat_boost = boost_boston.predict(X_test);
502 | np.mean((y_test - y_hat_boost)**2)
503 | 
504 | ```
505 | 
506 |  The test MSE obtained is 14.48,
507 | similar to the test MSE for bagging. If we want to, we can
508 | perform boosting with a different value of the shrinkage parameter
509 | $\lambda$ in  (8.10). The default value is 0.001, but
510 | this is easily modified.  Here we take $\lambda=0.2$.
511 | 
512 | ```{python}
513 | boost_boston = GBR(n_estimators=5000,
514 |                    learning_rate=0.2,
515 |                    max_depth=3,
516 |                    random_state=0)
517 | boost_boston.fit(X_train,
518 |                  y_train)
519 | y_hat_boost = boost_boston.predict(X_test);
520 | np.mean((y_test - y_hat_boost)**2)
521 | 
522 | ```
523 | 
524 | 
525 | In this case, using $\lambda=0.2$ leads to almost the same test MSE
526 | as when using $\lambda=0.001$.
527 | 
528 |  
529 | 
530 | 
531 | ## Bayesian Additive Regression Trees
532 | 
533 | 
534 | In this section we demonstrate a  `Python` implementation of BART found in the
535 | `ISLP.bart` package. We fit a  model
536 | to the `Boston` housing data set. This `BART()` estimator is
537 | designed for quantitative outcome variables, though other implementations are available for
538 | fitting logistic and probit models to categorical outcomes.
539 | 
540 | ```{python}
541 | bart_boston = BART(random_state=0, burnin=5, ndraw=15)
542 | bart_boston.fit(X_train, y_train)
543 | 
544 | ```
545 | 
546 | 
547 | On this data set, with this split into test and training, we see that the test error of BART is similar to that of  random forest.
548 | 
549 | ```{python}
550 | yhat_test = bart_boston.predict(X_test.astype(np.float32))
551 | np.mean((y_test - yhat_test)**2)
552 | 
553 | ```
554 | 
555 | 
556 | We can check how many times each variable appeared in the collection of trees.
557 | This gives a summary similar to the variable importance plot for boosting and random forests.
558 | 
559 | ```{python}
560 | var_inclusion = pd.Series(bart_boston.variable_inclusion_.mean(0),
561 |                                index=D.columns)
562 | var_inclusion
563 | 
564 | ```
565 |     
566 |   
567 |  
568 | 
569 | 
570 | 


--------------------------------------------------------------------------------
/Ch09-svm-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Support Vector Machines
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch09-svm-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch09-svm-lab.ipynb)
 21 | 
 22 | 
 23 | In this lab, we use the `sklearn.svm` library to demonstrate the support
 24 | vector classifier and the support vector machine.
 25 | 
 26 | We  import some of our usual libraries.
 27 | 
 28 | ```{python}
 29 | import numpy as np
 30 | from matplotlib.pyplot import subplots, cm
 31 | import sklearn.model_selection as skm
 32 | from ISLP import load_data, confusion_table
 33 | 
 34 | ```
 35 | We also collect the new imports
 36 | needed for this lab.
 37 | 
 38 | ```{python}
 39 | from sklearn.svm import SVC
 40 | from ISLP.svm import plot as plot_svm
 41 | from sklearn.metrics import RocCurveDisplay
 42 | 
 43 | ```
 44 | 
 45 | We will use the function `RocCurveDisplay.from_estimator()` to
 46 | produce several ROC plots, using a shorthand `roc_curve`.
 47 | 
 48 | ```{python}
 49 | roc_curve = RocCurveDisplay.from_estimator # shorthand
 50 | 
 51 | ```
 52 | 
 53 | ## Support Vector Classifier
 54 | 
 55 | We now use the `SupportVectorClassifier()` function (abbreviated `SVC()`) from `sklearn` to fit the support vector
 56 | classifier for a given value of the parameter `C`.  The
 57 | `C` argument allows us to specify the cost of a violation to
 58 | the margin.  When the `C` argument is small, then the margins
 59 | will be wide and many support vectors will be on the margin or will
 60 | violate the margin.  When the `C` argument is large, then the
 61 | margins will be narrow and there will be few support vectors on the
 62 | margin or violating the margin.
 63 | 
 64 | Here we demonstrate
 65 | the use of `SVC()` on a two-dimensional example, so that we can
 66 | plot the resulting decision boundary. We begin by generating the
 67 | observations, which belong to two classes, and checking whether the
 68 | classes are linearly separable.
 69 | 
 70 | ```{python}
 71 | rng = np.random.default_rng(1)
 72 | X = rng.standard_normal((50, 2))
 73 | y = np.array([-1]*25+[1]*25)
 74 | X[y==1] += 1
 75 | fig, ax = subplots(figsize=(8,8))
 76 | ax.scatter(X[:,0],
 77 |            X[:,1],
 78 |            c=y,
 79 |            cmap=cm.coolwarm);
 80 | 
 81 | ```
 82 | They are not.  We now fit the classifier.
 83 | 
 84 | ```{python}
 85 | svm_linear = SVC(C=10, kernel='linear')
 86 | svm_linear.fit(X, y)
 87 | 
 88 | ```
 89 | 
 90 | 
 91 | The support vector classifier with two features can
 92 | be visualized by plotting values of its *decision function*.
 93 | We have included a function for this in the `ISLP` package (inspired by a similar
 94 | example in the `sklearn` docs).
 95 | 
 96 | ```{python}
 97 | fig, ax = subplots(figsize=(8,8))
 98 | plot_svm(X,
 99 |          y,
100 |          svm_linear,
101 |          ax=ax)
102 | 
103 | ```
104 | 
105 | The decision
106 | boundary between the two classes is linear (because we used the
107 | argument `kernel='linear'`). The support vectors are marked with `+`
108 | and the remaining observations are plotted as circles.
109 | 
110 | What if we instead used a smaller value of the cost parameter?
111 | 
112 | ```{python}
113 | svm_linear_small = SVC(C=0.1, kernel='linear')
114 | svm_linear_small.fit(X, y)
115 | fig, ax = subplots(figsize=(8,8))
116 | plot_svm(X,
117 |          y,
118 |          svm_linear_small,
119 |          ax=ax)
120 | 
121 | ```
122 | With  a smaller value of the cost parameter, we
123 | obtain a larger number of support vectors, because the margin is now
124 | wider. For linear kernels, we can extract the
125 | coefficients of the linear decision boundary as follows:
126 | 
127 | ```{python}
128 | svm_linear.coef_
129 | 
130 | ```
131 | 
132 | 
133 | Since the support vector machine is an estimator in `sklearn`, we
134 | can use the usual machinery to tune it.
135 | 
136 | ```{python}
137 | kfold = skm.KFold(5, 
138 |                   random_state=0,
139 |                   shuffle=True)
140 | grid = skm.GridSearchCV(svm_linear,
141 |                         {'C':[0.001,0.01,0.1,1,5,10,100]},
142 |                         refit=True,
143 |                         cv=kfold,
144 |                         scoring='accuracy')
145 | grid.fit(X, y)
146 | grid.best_params_
147 | 
148 | ```
149 | 
150 | 
151 | We can easily access the cross-validation errors for each of these models
152 | in  `grid.cv_results_`. This prints out a lot of detail, so we
153 | extract the accuracy results only.
154 | 
155 | ```{python}
156 | grid.cv_results_[('mean_test_score')]
157 | 
158 | ```
159 | We see that  `C=1` results in the highest cross-validation
160 | accuracy of 0.74, though
161 | the accuracy is the same for several values of `C`.
162 | The classifier `grid.best_estimator_` can be used to predict the class
163 | label on a set of test observations. Let’s generate a test data set.
164 | 
165 | ```{python}
166 | X_test = rng.standard_normal((20, 2))
167 | y_test = np.array([-1]*10+[1]*10)
168 | X_test[y_test==1] += 1
169 | 
170 | ```
171 | 
172 | Now we predict the class labels of these test observations. Here we
173 | use the best model selected by cross-validation in order to make the
174 | predictions.
175 | 
176 | ```{python}
177 | best_ = grid.best_estimator_
178 | y_test_hat = best_.predict(X_test)
179 | confusion_table(y_test_hat, y_test)
180 | 
181 | ```
182 | 
183 | Thus, with this value of `C`,
184 | 70% of the test
185 | observations are correctly classified.  What if we had instead used
186 | `C=0.001`?
187 | 
188 | ```{python}
189 | svm_ = SVC(C=0.001,
190 |            kernel='linear').fit(X, y)
191 | y_test_hat = svm_.predict(X_test)
192 | confusion_table(y_test_hat, y_test)
193 | 
194 | ```
195 | 
196 | In this case 60% of test observations are correctly classified.
197 | 
198 | We now consider a situation in which the two classes are linearly
199 | separable. Then we can find an optimal separating hyperplane using the
200 | `SVC()` estimator.  We first
201 | further separate the two classes in our simulated data so that they
202 | are linearly separable:
203 | 
204 | ```{python}
205 | X[y==1] += 1.9;
206 | fig, ax = subplots(figsize=(8,8))
207 | ax.scatter(X[:,0], X[:,1], c=y, cmap=cm.coolwarm);
208 | 
209 | ```
210 | 
211 | Now the observations are just barely linearly separable.
212 | 
213 | ```{python}
214 | svm_ = SVC(C=1e5, kernel='linear').fit(X, y)
215 | y_hat = svm_.predict(X)
216 | confusion_table(y_hat, y)
217 | 
218 | ```
219 | 
220 | We fit the
221 | support vector classifier and plot the resulting hyperplane, using a
222 | very large value of `C` so that no observations are
223 | misclassified. 
224 | 
225 | ```{python}
226 | fig, ax = subplots(figsize=(8,8))
227 | plot_svm(X,
228 |          y,
229 |          svm_,
230 |          ax=ax)
231 | 
232 | ```
233 | Indeed no training errors were made and only three support vectors were used.
234 | In fact, the large value of `C` also means that these three support points are *on the margin*, and define it.
235 | One may wonder how good the classifier could be on test data that depends on only three data points!
236 |  We now try a smaller
237 | value of `C`.
238 | 
239 | ```{python}
240 | svm_ = SVC(C=0.1, kernel='linear').fit(X, y)
241 | y_hat = svm_.predict(X)
242 | confusion_table(y_hat, y)
243 | 
244 | ```
245 | 
246 | Using `C=0.1`, we again do not misclassify any training observations, but we
247 | also obtain a much wider margin and make use of twelve support
248 | vectors. These jointly define the orientation of the decision boundary, and since there are more of them, it is more stable. It seems possible that this model will perform better on test
249 | data than the model with `C=1e5` (and indeed, a simple experiment with a large test set would bear this out).
250 | 
251 | ```{python}
252 | fig, ax = subplots(figsize=(8,8))
253 | plot_svm(X,
254 |          y,
255 |          svm_,
256 |          ax=ax)
257 | 
258 | ```
259 | 
260 | 
261 | ## Support Vector Machine
262 | In order to fit an SVM using a non-linear kernel, we once again use
263 | the `SVC()`  estimator. However, now we use a different value
264 | of the parameter `kernel`. To fit an SVM with a polynomial
265 | kernel we use `kernel="poly"`, and to fit an SVM with a
266 | radial kernel  we use
267 | `kernel="rbf"`.  In the former case we also use the
268 | `degree` argument to specify a degree for the polynomial kernel
269 | (this is $d$ in (9.22)), and in the latter case we use
270 | `gamma` to specify a value of $\gamma$ for the radial basis
271 | kernel  (9.24).
272 | 
273 | We first generate some data with a non-linear class boundary, as follows:
274 | 
275 | ```{python}
276 | X = rng.standard_normal((200, 2))
277 | X[:100] += 2
278 | X[100:150] -= 2
279 | y = np.array([1]*150+[2]*50)
280 | 
281 | ```
282 | 
283 | Plotting the data makes it clear that the class boundary is indeed non-linear.
284 | 
285 | ```{python}
286 | fig, ax = subplots(figsize=(8,8))
287 | ax.scatter(X[:,0],
288 |            X[:,1],
289 |            c=y,
290 |            cmap=cm.coolwarm);
291 | 
292 | ```
293 | 
294 | 
295 | The data is randomly split into training and testing groups. We then
296 | fit the training data using the `SVC()`  estimator with a
297 | radial kernel and $\gamma=1$:
298 | 
299 | ```{python}
300 | (X_train, 
301 |  X_test,
302 |  y_train,
303 |  y_test) = skm.train_test_split(X,
304 |                                 y,
305 |                                 test_size=0.5,
306 |                                 random_state=0)
307 | svm_rbf = SVC(kernel="rbf", gamma=1, C=1)
308 | svm_rbf.fit(X_train, y_train)
309 | 
310 | ```
311 | 
312 | The plot shows that the resulting SVM has a decidedly non-linear
313 | boundary. 
314 | 
315 | ```{python}
316 | fig, ax = subplots(figsize=(8,8))
317 | plot_svm(X_train,
318 |          y_train,
319 |          svm_rbf,
320 |          ax=ax)
321 | 
322 | ```
323 | 
324 | We can see from the figure that there are a fair number of training
325 | errors in this SVM fit.  If we increase the value of `C`, we
326 | can reduce the number of training errors. However, this comes at the
327 | price of a more irregular decision boundary that seems to be at risk
328 | of overfitting the data.
329 | 
330 | ```{python}
331 | svm_rbf = SVC(kernel="rbf", gamma=1, C=1e5)
332 | svm_rbf.fit(X_train, y_train)
333 | fig, ax = subplots(figsize=(8,8))
334 | plot_svm(X_train,
335 |          y_train,
336 |          svm_rbf,
337 |          ax=ax)
338 | 
339 | ```
340 | 
341 | We can perform cross-validation using `skm.GridSearchCV()`  to select the
342 | best choice of $\gamma$ and `C` for an SVM with a radial
343 | kernel:
344 | 
345 | ```{python}
346 | kfold = skm.KFold(5, 
347 |                   random_state=0,
348 |                   shuffle=True)
349 | grid = skm.GridSearchCV(svm_rbf,
350 |                         {'C':[0.1,1,10,100,1000],
351 |                          'gamma':[0.5,1,2,3,4]},
352 |                         refit=True,
353 |                         cv=kfold,
354 |                         scoring='accuracy');
355 | grid.fit(X_train, y_train)
356 | grid.best_params_
357 | 
358 | ```
359 | 
360 | The best choice of parameters under five-fold CV is achieved at `C=1`
361 | and `gamma=0.5`, though several other values also achieve the same
362 | value.
363 | 
364 | ```{python}
365 | best_svm = grid.best_estimator_
366 | fig, ax = subplots(figsize=(8,8))
367 | plot_svm(X_train,
368 |          y_train,
369 |          best_svm,
370 |          ax=ax)
371 | 
372 | y_hat_test = best_svm.predict(X_test)
373 | confusion_table(y_hat_test, y_test)
374 | 
375 | ```
376 | 
377 | With these parameters, 12% of test
378 | observations are misclassified by this SVM.
379 | 
380 | 
381 | ## ROC Curves
382 | 
383 | SVMs and support vector classifiers output class labels for each
384 | observation. However, it is also possible to obtain *fitted values*
385 | for each observation, which are the numerical scores used to
386 | obtain the class labels. For instance, in the case of a support vector
387 | classifier, the fitted value for an observation $X= (X_1, X_2, \ldots,
388 | X_p)^T$ takes the form $\hat{\beta}_0 + \hat{\beta}_1 X_1 +
389 | \hat{\beta}_2 X_2 + \ldots + \hat{\beta}_p X_p$. For an SVM with a
390 | non-linear kernel, the equation that yields the fitted value is given
391 | in  (9.23). The sign of the fitted value
392 | determines on which side of the decision boundary the observation
393 | lies. Therefore, the relationship between the fitted value and the
394 | class prediction for a given observation is simple: if the fitted
395 | value exceeds zero then the observation is assigned to one class, and
396 | if it is less than zero then it is assigned to the other.
397 | By changing this threshold from zero to some positive value,
398 | we skew the classifications in favor of one class versus the other.
399 | By considering a range of these thresholds, positive and negative, we produce the ingredients for a ROC plot.
400 | We can access these values by calling the `decision_function()`
401 | method of a fitted SVM estimator.
402 | 
403 | The function `ROCCurveDisplay.from_estimator()` (which we have abbreviated to `roc_curve()`) will produce a plot of a ROC curve. It takes a fitted estimator as its first argument, followed
404 | by a model matrix $X$ and labels $y$. The argument `name` is used in the legend,
405 | while `color` is used for the color of the line. Results are plotted
406 | on our axis object `ax`.
407 | 
408 | ```{python}
409 | fig, ax = subplots(figsize=(8,8))
410 | roc_curve(best_svm,
411 |           X_train,
412 |           y_train,
413 |           name='Training',
414 |           color='r',
415 |           ax=ax);
416 | 
417 | ```
418 |  In this example, the SVM appears to provide accurate predictions. By increasing
419 | $\gamma$ we can produce a more flexible fit and generate further
420 | improvements in accuracy.
421 | 
422 | ```{python}
423 | svm_flex = SVC(kernel="rbf", 
424 |               gamma=50,
425 |               C=1)
426 | svm_flex.fit(X_train, y_train)
427 | fig, ax = subplots(figsize=(8,8))
428 | roc_curve(svm_flex,
429 |           X_train,
430 |           y_train,
431 |           name='Training $\gamma=50$',
432 |           color='r',
433 |           ax=ax);
434 | 
435 | ```
436 | 
437 | However, these ROC curves are all on the training data. We are really
438 | more interested in the level of prediction accuracy on the test
439 | data. When we compute the ROC curves on the test data, the model with
440 | $\gamma=0.5$ appears to provide the most accurate results.
441 | 
442 | ```{python}
443 | roc_curve(svm_flex,
444 |           X_test,
445 |           y_test,
446 |           name='Test $\gamma=50$',
447 |           color='b',
448 |           ax=ax)
449 | fig;
450 | 
451 | ```
452 | 
453 | Let’s look at our tuned SVM.
454 | 
455 | ```{python}
456 | fig, ax = subplots(figsize=(8,8))
457 | for (X_, y_, c, name) in zip(
458 |      (X_train, X_test),
459 |      (y_train, y_test),
460 |      ('r', 'b'),
461 |      ('CV tuned on training',
462 |       'CV tuned on test')):
463 |     roc_curve(best_svm,
464 |               X_,
465 |               y_,
466 |               name=name,
467 |               ax=ax,
468 |               color=c)
469 | 
470 | ```
471 | 
472 | ## SVM with Multiple Classes
473 | 
474 | If the response is a factor containing more than two levels, then the
475 | `SVC()`  function will perform multi-class classification using
476 | either the one-versus-one approach (when `decision_function_shape=='ovo'`)
477 | or one-versus-rest {One-versus-rest is also known as one-versus-all.} (when `decision_function_shape=='ovr'`).
478 | We explore that setting briefly here by
479 | generating a third class of observations.
480 | 
481 | ```{python}
482 | rng = np.random.default_rng(123)
483 | X = np.vstack([X, rng.standard_normal((50, 2))])
484 | y = np.hstack([y, [0]*50])
485 | X[y==0,1] += 2
486 | fig, ax = subplots(figsize=(8,8))
487 | ax.scatter(X[:,0], X[:,1], c=y, cmap=cm.coolwarm);
488 | 
489 | ```
490 | 
491 | We now fit an SVM to the data:
492 | 
493 | ```{python}
494 | svm_rbf_3 = SVC(kernel="rbf",
495 |                 C=10,
496 |                 gamma=1,
497 |                 decision_function_shape='ovo');
498 | svm_rbf_3.fit(X, y)
499 | fig, ax = subplots(figsize=(8,8))
500 | plot_svm(X,
501 |          y,
502 |          svm_rbf_3,
503 |          scatter_cmap=cm.tab10,
504 |          ax=ax)
505 | 
506 | ```
507 | The `sklearn.svm` library can also be used to perform support vector
508 | regression with a numerical response using the  estimator `SupportVectorRegression()`.
509 | 
510 | 
511 | ## Application to Gene Expression Data
512 | 
513 | We now examine the `Khan` data set, which consists of a number of
514 | tissue samples corresponding to four distinct types of small round
515 | blue cell tumors. For each tissue sample, gene expression measurements
516 | are available.  The data set consists of training data, `xtrain`
517 | and `ytrain`, and testing data, `xtest` and `ytest`.
518 | 
519 | We examine the dimension of the data:
520 | 
521 | ```{python}
522 | Khan = load_data('Khan')
523 | Khan['xtrain'].shape, Khan['xtest'].shape
524 | 
525 | ```
526 | 
527 | This data set consists of expression measurements for 2,308
528 | genes. The training and test sets consist of 63 and 20
529 | observations, respectively.
530 | 
531 | We will use a support vector approach to predict cancer subtype using
532 | gene expression measurements.  In this data set, there is a very
533 | large number of features relative to the number of observations. This
534 | suggests that we should use a linear kernel, because the additional
535 | flexibility that will result from using a polynomial or radial kernel 
536 | is unnecessary. 
537 | 
538 | ```{python}
539 | khan_linear = SVC(kernel='linear', C=10)
540 | khan_linear.fit(Khan['xtrain'], Khan['ytrain'])
541 | confusion_table(khan_linear.predict(Khan['xtrain']),
542 |                 Khan['ytrain'])
543 | 
544 | ```
545 | 
546 | We  see that there are *no* training
547 | errors. In fact, this is not surprising, because the large number of
548 | variables relative to the number of observations implies that it is
549 | easy to find hyperplanes that fully separate the classes. We are more
550 | interested in the support vector classifier’s performance on the
551 | test observations.
552 | 
553 | ```{python}
554 | confusion_table(khan_linear.predict(Khan['xtest']),
555 |                 Khan['ytest'])
556 | 
557 | ```
558 | 
559 | We see that using `C=10` yields two test set errors on these data.
560 | 
561 | 
562 | 


--------------------------------------------------------------------------------
/Ch11-surv-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Survival Analysis
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch11-surv-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch11-surv-lab.ipynb)
 21 | 
 22 | 
 23 |  In this lab, we perform survival analyses on three separate data
 24 | sets. In  Section 11.8.1 we analyze the  `BrainCancer` 
 25 | data  that was first described in Section 11.3. In Section 11.8.2, we examine the  `Publication` 
 26 | data  from Section 11.5.4. Finally,  Section 11.8.3  explores
 27 | a simulated call-center data set.
 28 | 
 29 | We begin by importing some of our libraries at this top
 30 | level. This makes the code more readable, as scanning the first few
 31 | lines of the notebook tell us what libraries are used in this
 32 | notebook.
 33 | 
 34 | ```{python}
 35 | from matplotlib.pyplot import subplots
 36 | import numpy as np
 37 | import pandas as pd
 38 | from ISLP.models import ModelSpec as MS
 39 | from ISLP import load_data
 40 | 
 41 | ```
 42 | 
 43 | We  also collect the new imports
 44 | needed for this lab.
 45 | 
 46 | ```{python}
 47 | from lifelines import \
 48 |      (KaplanMeierFitter,
 49 |       CoxPHFitter)
 50 | from lifelines.statistics import \
 51 |      (logrank_test,
 52 |       multivariate_logrank_test)
 53 | from ISLP.survival import sim_time
 54 | 
 55 | ```
 56 | 
 57 | ## Brain Cancer Data
 58 | 
 59 | We begin with the `BrainCancer` data set, contained in the `ISLP` package.
 60 | 
 61 | ```{python}
 62 | BrainCancer = load_data('BrainCancer')
 63 | BrainCancer.columns
 64 | 
 65 | ```
 66 | 
 67 | The rows index the 88 patients, while the 8 columns contain the predictors and outcome variables.
 68 | We first briefly examine the data.
 69 | 
 70 | ```{python}
 71 | BrainCancer['sex'].value_counts()
 72 | 
 73 | ```
 74 | 
 75 | 
 76 | ```{python}
 77 | BrainCancer['diagnosis'].value_counts()
 78 | 
 79 | ```
 80 | 
 81 | 
 82 | ```{python}
 83 | BrainCancer['status'].value_counts()
 84 | 
 85 | ```
 86 | 
 87 | 
 88 | Before beginning an analysis, it is important to know how the
 89 | `status` variable has been coded.  Most software
 90 | uses the convention that a `status` of 1 indicates an
 91 | uncensored observation (often death), and a `status` of 0 indicates a censored
 92 | observation. But some scientists might use the opposite coding. For
 93 | the  `BrainCancer`  data set 35 patients died before the end of
 94 | the study, so we are using the conventional coding.
 95 | 
 96 | To begin the analysis, we re-create  the Kaplan-Meier survival curve shown in Figure 11.2. The main
 97 | package we will use for survival analysis
 98 | is `lifelines`.
 99 | The variable  `time`  corresponds to $y_i$, the time to the $i$th event (either censoring or
100 | death). The first argument to `km.fit` is the event time, and the
101 | second argument is the censoring variable, with a 1 indicating an observed
102 | failure time. The `plot()` method produces a survival curve with pointwise confidence
103 | intervals. By default, these are  90% confidence intervals, but this can be changed
104 | by setting the `alpha` argument to one minus the desired
105 | confidence level.
106 | 
107 | ```{python}
108 | fig, ax = subplots(figsize=(8,8))
109 | km = KaplanMeierFitter()
110 | km_brain = km.fit(BrainCancer['time'], BrainCancer['status'])
111 | km_brain.plot(label='Kaplan Meier estimate', ax=ax)
112 | 
113 | ```
114 | 
115 | Next we create Kaplan-Meier survival curves that are stratified by
116 | `sex`, in order to reproduce  Figure 11.3.
117 | We  do this using the `groupby()` method of  a  dataframe.
118 | This  method returns a generator that can
119 | be  iterated over in the `for` loop. In this case,
120 | the  items  in the `for` loop are  2-tuples  representing
121 | the groups: the first entry is the  value
122 | of the grouping column  `sex` while the  second  value
123 | is the dataframe consisting of all  rows in the
124 | dataframe matching that value of `sex`.
125 | We will want to use this data below
126 | in the log-rank test,  hence we store this
127 | information in the dictionary `by_sex`. Finally,
128 | we have also used the notion of
129 |  *string interpolation* to automatically
130 | label the  different lines in the  plot. String
131 | interpolation is a powerful technique to format strings ---
132 | `Python` has many ways to facilitate such operations.
133 | 
134 | ```{python}
135 | fig, ax = subplots(figsize=(8,8))
136 | by_sex = {}
137 | for sex, df in BrainCancer.groupby('sex'):
138 |     by_sex[sex] = df
139 |     km_sex = km.fit(df['time'], df['status'])
140 |     km_sex.plot(label='Sex=%s' % sex, ax=ax)
141 | 
142 | ```
143 | 
144 | As discussed in Section 11.4, we can perform a
145 | log-rank test to compare the survival of males to females. We use
146 | the `logrank_test()` function from the `lifelines.statistics` module.
147 | The first two arguments are the event times, with the second
148 | denoting the corresponding (optional) censoring indicators.
149 | 
150 | ```{python}
151 | logrank_test(by_sex['Male']['time'],
152 |              by_sex['Female']['time'],
153 |              by_sex['Male']['status'],
154 |              by_sex['Female']['status'])
155 | 
156 | ```
157 | 
158 | 
159 | The resulting $p$-value is $0.23$, indicating no evidence of a
160 | difference in survival between the two sexes.
161 | 
162 | Next, we  use the `CoxPHFitter()`  estimator
163 | from `lifelines` to fit Cox proportional hazards models.
164 | To begin, we consider a model that uses  `sex`  as the only predictor.
165 | 
166 | ```{python}
167 | coxph = CoxPHFitter # shorthand
168 | sex_df = BrainCancer[['time', 'status', 'sex']]
169 | model_df = MS(['time', 'status', 'sex'],
170 |               intercept=False).fit_transform(sex_df)
171 | cox_fit = coxph().fit(model_df,
172 |                       'time',
173 |                       'status')
174 | cox_fit.summary[['coef', 'se(coef)', 'p']]
175 | 
176 | ```
177 | 
178 | The first argument to `fit` should be a data frame containing
179 | at least the event time (the second argument `time` in this case),
180 | as well as an optional censoring variable (the argument `status` in this case).
181 | Note also that the Cox model does not include an intercept, which is why
182 | we used the `intercept=False` argument to `ModelSpec` above.
183 | The `summary()` method delivers many columns; we chose to abbreviate its output here.
184 | It is possible to obtain the likelihood ratio test comparing this model to the one
185 | with no features as follows:
186 | 
187 | ```{python}
188 | cox_fit.log_likelihood_ratio_test()
189 | 
190 | ```
191 | 
192 | Regardless of which test we use, we see that there is no clear
193 | evidence for a difference in survival between males and females.  As
194 | we learned in this chapter, the score test from the Cox model is
195 | exactly equal to the log rank test statistic!
196 | 
197 | Now we fit a  model that makes use of additional predictors. We first note
198 | that one of our `diagnosis` values is missing, hence
199 | we drop that observation before continuing.
200 | 
201 | ```{python}
202 | cleaned = BrainCancer.dropna()
203 | all_MS = MS(cleaned.columns, intercept=False)
204 | all_df = all_MS.fit_transform(cleaned)
205 | fit_all = coxph().fit(all_df,
206 |                       'time',
207 |                       'status')
208 | fit_all.summary[['coef', 'se(coef)', 'p']]
209 | 
210 | ```
211 | 
212 |  The `diagnosis` variable has been coded so that the baseline
213 | corresponds to HG glioma. The results indicate that the risk associated with HG glioma
214 | is more than eight times (i.e. $e^{2.15}=8.62$) the risk associated
215 | with meningioma. In other words, after adjusting for the other
216 | predictors, patients with HG glioma have much worse survival compared
217 | to those with meningioma.  In addition, larger values of the Karnofsky
218 | index, `ki`, are associated with lower risk, i.e. longer survival.
219 | 
220 | Finally, we plot estimated survival curves for each diagnosis category,
221 | adjusting for the other predictors.  To make these plots, we set the
222 | values of the other predictors equal to the mean for quantitative variables
223 | and equal to the mode for categorical. To do this, we use the
224 | `apply()` method across rows (i.e. `axis=0`) with a function
225 | `representative` that checks if a column is categorical
226 | or not.
227 | 
228 | ```{python}
229 | levels = cleaned['diagnosis'].unique()
230 | def representative(series):
231 |     if hasattr(series.dtype, 'categories'):
232 |         return pd.Series.mode(series)
233 |     else:
234 |         return series.mean()
235 | modal_data = cleaned.apply(representative, axis=0)
236 | 
237 | ```
238 | 
239 | We make four
240 | copies of the column means and assign the `diagnosis` column to be the four different
241 | diagnoses.
242 | 
243 | ```{python}
244 | modal_df = pd.DataFrame(
245 |               [modal_data.iloc[0] for _ in range(len(levels))])
246 | modal_df['diagnosis'] = levels
247 | modal_df
248 | 
249 | ```
250 | 
251 | We then construct the model matrix based on the model specification `all_MS` used to fit
252 | the model, and name the rows according to the levels of `diagnosis`.
253 | 
254 | ```{python}
255 | modal_X = all_MS.transform(modal_df)
256 | modal_X.index = levels
257 | modal_X
258 | 
259 | ```
260 | 
261 | We can use the `predict_survival_function()` method to obtain the estimated survival function.
262 | 
263 | ```{python}
264 | predicted_survival = fit_all.predict_survival_function(modal_X)
265 | predicted_survival
266 | 
267 | ```
268 | This returns a data frame,
269 | whose plot methods yields the different survival curves. To avoid clutter in
270 | the plots, we do not display confidence intervals.
271 | 
272 | ```{python}
273 | fig, ax = subplots(figsize=(8, 8))
274 | predicted_survival.plot(ax=ax);
275 | 
276 | ```
277 | 
278 | 
279 | ## Publication Data
280 | The  `Publication`  data   presented in Section 11.5.4  can be
281 | found in the `ISLP` package.
282 | We first reproduce Figure 11.5  by plotting the Kaplan-Meier curves
283 | stratified on the  `posres`  variable, which records whether the
284 | study had a positive or negative result.
285 | 
286 | ```{python}
287 | fig, ax = subplots(figsize=(8,8))
288 | Publication = load_data('Publication')
289 | by_result = {}
290 | for result, df in Publication.groupby('posres'):
291 |     by_result[result] = df
292 |     km_result = km.fit(df['time'], df['status'])
293 |     km_result.plot(label='Result=%d' % result, ax=ax)
294 | 
295 | ```
296 | 
297 | As discussed previously, the $p$-values from fitting Cox’s
298 | proportional hazards model to the `posres` variable are quite
299 | large, providing no evidence of a difference in time-to-publication
300 | between studies with positive versus negative results.
301 | 
302 | ```{python}
303 | posres_df = MS(['posres',
304 |                 'time',
305 |                 'status'],
306 |                 intercept=False).fit_transform(Publication)
307 | posres_fit = coxph().fit(posres_df,
308 |                          'time',
309 |                          'status')
310 | posres_fit.summary[['coef', 'se(coef)', 'p']]
311 | 
312 | ```
313 | 
314 | 
315 | However, the results change dramatically when we include other
316 | predictors in the model. Here we exclude the funding mechanism
317 | variable.
318 | 
319 | ```{python}
320 | model = MS(Publication.columns.drop('mech'),
321 |            intercept=False)
322 | coxph().fit(model.fit_transform(Publication),
323 |             'time',
324 |             'status').summary[['coef', 'se(coef)', 'p']]
325 | 
326 | ```
327 | 
328 | We see that there are a number of statistically significant variables,
329 | including whether the trial focused on a clinical endpoint, the impact
330 | of the study, and whether the study had positive or negative results.
331 | 
332 | 
333 | ## Call Center Data
334 | 
335 | In this section, we will simulate survival data using the relationship
336 | between cumulative hazard and
337 | the survival function explored in Exercise 8.
338 | Our simulated data will represent the observed
339 | wait times (in seconds) for 2,000 customers who have phoned a call
340 | center.  In this context, censoring occurs if a customer hangs up
341 | before his or her call is answered.
342 | 
343 | There are three covariates: `Operators` (the number of call
344 | center operators available at the time of the call, which can range
345 | from $5$ to $15$), `Center` (either A, B, or C), and
346 | `Time` of day (Morning, Afternoon, or Evening). We generate data
347 | for these covariates so that all possibilities are equally likely: for
348 | instance, morning, afternoon and evening calls are equally likely, and
349 | any number of operators from $5$ to $15$ is equally likely. 
350 | 
351 | ```{python}
352 | rng = np.random.default_rng(10)
353 | N = 2000
354 | Operators = rng.choice(np.arange(5, 16),
355 |                        N,
356 |                        replace=True)
357 | Center = rng.choice(['A', 'B', 'C'],
358 |                     N,
359 |                     replace=True)
360 | Time = rng.choice(['Morn.', 'After.', 'Even.'],
361 |                    N,
362 |                    replace=True)
363 | D = pd.DataFrame({'Operators': Operators,
364 |                   'Center': pd.Categorical(Center),
365 |                   'Time': pd.Categorical(Time)})
366 | ```
367 | 
368 | We then build a model matrix (omitting the intercept)
369 | 
370 | ```{python}
371 | model = MS(['Operators',
372 |             'Center',
373 |             'Time'],
374 |            intercept=False)
375 | X = model.fit_transform(D)
376 | ```
377 | 
378 | It is worthwhile to take a peek at the model matrix `X`, so
379 | that we can be sure that we understand how the variables have been coded. By default,
380 | the levels of categorical variables are sorted and, as usual, the first column of the one-hot encoding
381 | of the variable is dropped.
382 | 
383 | ```{python}
384 | X[:5]
385 | 
386 | ```
387 | 
388 | Next,  we specify the coefficients and the hazard function.
389 | 
390 | ```{python}
391 | true_beta = np.array([0.04, -0.3, 0, 0.2, -0.2])
392 | true_linpred = X.dot(true_beta)
393 | hazard = lambda t: 1e-5 * t
394 | 
395 | ```
396 | 
397 | Here, we have set the coefficient associated with `Operators` to
398 | equal $0.04$; in other words, each additional operator leads to a
399 | $e^{0.04}=1.041$-fold increase in the “risk” that the call will be
400 | answered, given the `Center` and `Time` covariates. This
401 | makes sense: the greater the number of operators at hand, the shorter
402 | the wait time! The coefficient associated with `Center == B` is
403 | $-0.3$, and `Center == A` is treated as the baseline. This means
404 | that the risk of a call being answered at Center B is 0.74 times the
405 | risk that it will be answered at Center A; in other words, the wait
406 | times are a bit longer at Center B.
407 | 
408 | Recall from Section 2.3.7 the use of `lambda`
409 | for creating short functions on the fly.
410 | We use the function
411 | `sim_time()` from the `ISLP.survival` package. This function
412 | uses the relationship between the survival function
413 | and cumulative hazard $S(t) = \exp(-H(t))$ and the specific
414 | form of the cumulative hazard function in the Cox model
415 | to simulate data based on values of the linear predictor
416 | `true_linpred` and the cumulative hazard. 
417 |  We need to provide the cumulative hazard function, which we do here.
418 | 
419 | ```{python}
420 | cum_hazard = lambda t: 1e-5 * t**2 / 2
421 | 
422 | ```
423 | We are now ready to generate data under the Cox proportional hazards
424 | model. We truncate the maximum time to 1000 seconds to keep
425 | simulated wait times reasonable. The function
426 | `sim_time()` takes a linear predictor,
427 | a cumulative hazard function and a
428 | random number generator.
429 | 
430 | ```{python}
431 | W = np.array([sim_time(l, cum_hazard, rng)
432 |               for l in true_linpred])
433 | D['Wait time'] = np.clip(W, 0, 1000)
434 | 
435 | ```
436 | 
437 | We now simulate our censoring variable, for which we assume
438 | 90% of calls were answered (`Failed==1`) before the
439 | customer hung up (`Failed==0`).
440 | 
441 | ```{python}
442 | D['Failed'] = rng.choice([1, 0],
443 |                          N,
444 |                          p=[0.9, 0.1])
445 | D[:5]
446 | 
447 | ```
448 | 
449 | 
450 | ```{python}
451 | D['Failed'].mean()
452 | 
453 | ```
454 | 
455 | We now plot  Kaplan-Meier survival curves. First, we stratify by `Center`.
456 | 
457 | ```{python}
458 | fig, ax = subplots(figsize=(8,8))
459 | by_center = {}
460 | for center, df in D.groupby('Center'):
461 |     by_center[center] = df
462 |     km_center = km.fit(df['Wait time'], df['Failed'])
463 |     km_center.plot(label='Center=%s' % center, ax=ax)
464 | ax.set_title("Probability of Still Being on Hold")
465 | 
466 | ```
467 | 
468 | Next, we stratify by `Time`.
469 | 
470 | ```{python}
471 | fig, ax = subplots(figsize=(8,8))
472 | by_time = {}
473 | for time, df in D.groupby('Time'):
474 |     by_time[time] = df
475 |     km_time = km.fit(df['Wait time'], df['Failed'])
476 |     km_time.plot(label='Time=%s' % time, ax=ax)
477 | ax.set_title("Probability of Still Being on Hold")
478 | 
479 | ```
480 | 
481 | It seems that calls at Call Center B take longer to be answered than
482 | calls at Centers A and C. Similarly, it appears that wait times are
483 | longest in the morning and shortest in the evening hours. We can use a
484 | log-rank test to determine whether these differences are statistically
485 | significant using the function `multivariate_logrank_test()`.
486 | 
487 | ```{python}
488 | multivariate_logrank_test(D['Wait time'],
489 |                           D['Center'],
490 |                           D['Failed'])
491 | 
492 | ```
493 | 
494 | 
495 | Next, we consider the  effect of `Time`.
496 | 
497 | ```{python}
498 | multivariate_logrank_test(D['Wait time'],
499 |                           D['Time'],
500 |                           D['Failed'])
501 | 
502 | ```
503 | 
504 | 
505 | As in the case of a categorical variable with 2 levels, these
506 | results are similar to the likelihood ratio test
507 | from the Cox proportional hazards model. First, we
508 | look at the results for  `Center`.
509 | 
510 | ```{python}
511 | X = MS(['Wait time',
512 |         'Failed',
513 |         'Center'],
514 |         intercept=False).fit_transform(D)
515 | F = coxph().fit(X, 'Wait time', 'Failed')
516 | F.log_likelihood_ratio_test()
517 | 
518 | ```
519 | 
520 | 
521 | Next, we look at the results for `Time`.
522 | 
523 | ```{python}
524 | X = MS(['Wait time',
525 |         'Failed',
526 |         'Time'],
527 |        intercept=False).fit_transform(D)
528 | F = coxph().fit(X, 'Wait time', 'Failed')
529 | F.log_likelihood_ratio_test()
530 | 
531 | ```
532 | 
533 | 
534 | We find that differences between centers are highly significant, as
535 | are differences between times of day.
536 | 
537 | Finally, we fit Cox's proportional hazards model to the data.
538 | 
539 | ```{python}
540 | X = MS(D.columns,
541 |        intercept=False).fit_transform(D)
542 | fit_queuing = coxph().fit(
543 |                   X,
544 |                  'Wait time',
545 |                  'Failed')
546 | fit_queuing.summary[['coef', 'se(coef)', 'p']]
547 | 
548 | ```
549 | 
550 | 
551 | The $p$-values for Center B and evening time
552 | are very small. It is also clear that the
553 | hazard --- that is, the instantaneous risk that a call will be
554 | answered --- increases with the number of operators. Since we
555 | generated the data ourselves, we know that the true coefficients for
556 |  `Operators`, `Center = B`, `Center = C`, 
557 | `Time = Even.` and `Time = Morn.`   are $0.04$, $-0.3$,
558 | $0$,   $0.2$, and $-0.2$, respectively. The coefficient estimates
559 | from the fitted Cox model are fairly accurate.
560 | 
561 | 
562 | 


--------------------------------------------------------------------------------
/Ch13-multiple-lab.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | jupyter:
  3 |   jupytext:
  4 |     cell_metadata_filter: -all
  5 |     formats: ipynb,Rmd
  6 |     main_language: python
  7 |     text_representation:
  8 |       extension: .Rmd
  9 |       format_name: rmarkdown
 10 |       format_version: '1.2'
 11 |       jupytext_version: 1.16.7
 12 | ---
 13 | 
 14 | # Multiple Testing
 15 | 
 16 | <a target="_blank" href="https://colab.research.google.com/github/intro-stat-learning/ISLP_labs/blob/v2.2/Ch13-multiple-lab.ipynb">
 17 | <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 18 | </a>
 19 | 
 20 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/intro-stat-learning/ISLP_labs/v2.2?labpath=Ch13-multiple-lab.ipynb)
 21 | 
 22 | 
 23 |  
 24 | 
 25 | 
 26 | We include our usual imports seen in earlier labs.
 27 | 
 28 | ```{python}
 29 | import numpy as np
 30 | import pandas as pd
 31 | import matplotlib.pyplot as plt
 32 | import statsmodels.api as sm
 33 | from ISLP import load_data
 34 | 
 35 | ```
 36 | 
 37 | We also collect the new imports
 38 | needed for this lab.
 39 | 
 40 | ```{python}
 41 | from scipy.stats import \
 42 |     (ttest_1samp,
 43 |      ttest_rel,
 44 |      ttest_ind,
 45 |      t as t_dbn)
 46 | from statsmodels.stats.multicomp import \
 47 |      pairwise_tukeyhsd
 48 | from statsmodels.stats.multitest import \
 49 |      multipletests as mult_test
 50 | 
 51 | ```
 52 | 
 53 | 
 54 | ## Review of Hypothesis Tests
 55 | We begin by performing some one-sample $t$-tests.
 56 | 
 57 | First we create 100 variables, each consisting of 10 observations. The
 58 | first 50 variables have mean $0.5$ and variance $1$, while the others
 59 | have mean $0$ and variance $1$.
 60 | 
 61 | ```{python}
 62 | rng = np.random.default_rng(12)
 63 | X = rng.standard_normal((10, 100))
 64 | true_mean = np.array([0.5]*50 + [0]*50)
 65 | X += true_mean[None,:]
 66 | 
 67 | ```
 68 | 
 69 | To begin, we use `ttest_1samp()`  from the
 70 | `scipy.stats` module to test $H_{0}: \mu_1=0$, the null
 71 | hypothesis that the first variable has mean zero.
 72 | 
 73 | ```{python}
 74 | result = ttest_1samp(X[:,0], 0)
 75 | result.pvalue
 76 | 
 77 | ```
 78 | 
 79 | The $p$-value comes out to 0.931, which is not low enough to
 80 | reject the null hypothesis at level $\alpha=0.05$.  In this case,
 81 | $\mu_1=0.5$, so the null hypothesis is false. Therefore, we have made
 82 | a Type II error by failing to reject the null hypothesis when the null
 83 | hypothesis is false. 
 84 | 
 85 | We now test $H_{0,j}: \mu_j=0$ for $j=1,\ldots,100$. We compute the
 86 | 100 $p$-values, and then construct a vector recording whether the
 87 | $j$th $p$-value is less than or equal to 0.05, in which case we reject
 88 | $H_{0j}$, or greater than 0.05, in which case we do not reject
 89 | $H_{0j}$, for $j=1,\ldots,100$.
 90 | 
 91 | ```{python}
 92 | p_values = np.empty(100)
 93 | for i in range(100):
 94 |    p_values[i] = ttest_1samp(X[:,i], 0).pvalue
 95 | decision = pd.cut(p_values,
 96 |                   [0, 0.05, 1],
 97 |                   labels=['Reject H0',
 98 |                           'Do not reject H0'])
 99 | truth = pd.Categorical(true_mean == 0,
100 |                        categories=[True, False],
101 |                        ordered=True)
102 | 
103 | ```
104 | Since this is a simulated data set, we can create a $2 \times 2$ table
105 | similar to  Table 13.2.
106 | 
107 | ```{python}
108 | pd.crosstab(decision,
109 |             truth,
110 |      rownames=['Decision'],
111 |      colnames=['H0'])
112 | 
113 | ```
114 | Therefore, at level $\alpha=0.05$, we reject 15 of the 50 false
115 | null hypotheses, and we incorrectly reject 5 of the true null
116 | hypotheses. Using the notation from Section 13.3, we have
117 | $V=5$, $S=15$, $U=45$ and $W=35$.
118 | We have set $\alpha=0.05$, which means that we expect to reject around
119 | 5% of the true null hypotheses. This is in line with the $2 \times 2$
120 | table above, which indicates that we rejected $V=5$ of the $50$ true
121 | null hypotheses.
122 | 
123 | In the simulation above, for the false null hypotheses, the ratio of
124 | the mean to the standard deviation was only $0.5/1 = 0.5$. This
125 | amounts to quite a weak signal, and it resulted in a high number of
126 | Type II errors.  Let’s instead simulate data with a stronger signal,
127 | so that the ratio of the mean to the standard deviation for the false
128 | null hypotheses equals $1$. We make only 10 Type II errors.
129 |  
130 | 
131 | ```{python}
132 | true_mean = np.array([1]*50 + [0]*50)
133 | X = rng.standard_normal((10, 100))
134 | X += true_mean[None,:]
135 | for i in range(100):
136 |    p_values[i] = ttest_1samp(X[:,i], 0).pvalue
137 | decision = pd.cut(p_values,
138 |                   [0, 0.05, 1],
139 |                   labels=['Reject H0',
140 |                           'Do not reject H0'])
141 | truth = pd.Categorical(true_mean == 0,
142 |                        categories=[True, False],
143 |                        ordered=True)
144 | pd.crosstab(decision,
145 |             truth,
146 |             rownames=['Decision'],
147 |             colnames=['H0'])
148 | 
149 | ```
150 |     
151 | 
152 | 
153 | ## Family-Wise Error Rate
154 | Recall from  (13.5)  that if the null hypothesis is true
155 | for each of $m$ independent hypothesis tests, then the FWER is equal
156 | to $1-(1-\alpha)^m$. We can use this expression to compute the FWER
157 | for $m=1,\ldots, 500$ and $\alpha=0.05$, $0.01$, and $0.001$.
158 | We plot the FWER for these values of $\alpha$ in order to
159 | reproduce  Figure 13.2.
160 | 
161 | ```{python}
162 | m = np.linspace(1, 501)
163 | fig, ax = plt.subplots()
164 | [ax.plot(m,
165 |          1 - (1 - alpha)**m,
166 |          label=r'$\alpha=%s$' % str(alpha))
167 |          for alpha in [0.05, 0.01, 0.001]]
168 | ax.set_xscale('log')
169 | ax.set_xlabel('Number of Hypotheses')
170 | ax.set_ylabel('Family-Wise Error Rate')
171 | ax.legend()
172 | ax.axhline(0.05, c='k', ls='--');
173 | 
174 | ```
175 | 
176 | As discussed previously, even for moderate values of $m$ such as $50$,
177 | the FWER exceeds $0.05$ unless $\alpha$ is set to a very low value,
178 | such as $0.001$.  Of course, the problem with setting $\alpha$ to such
179 | a low value is that we are likely to make a number of Type II errors:
180 | in other words, our power is very low.
181 | 
182 | We now conduct a one-sample $t$-test for each of the first five
183 | managers in the  
184 | `Fund`   dataset, in order to test the null
185 | hypothesis that the $j$th fund manager’s mean return equals zero,
186 | $H_{0,j}: \mu_j=0$.
187 | 
188 | ```{python}
189 | Fund = load_data('Fund')
190 | fund_mini = Fund.iloc[:,:5]
191 | fund_mini_pvals = np.empty(5)
192 | for i in range(5):
193 |     fund_mini_pvals[i] = ttest_1samp(fund_mini.iloc[:,i], 0).pvalue
194 | fund_mini_pvals
195 | 
196 | ```
197 | 
198 | The $p$-values are low for Managers One and Three, and high for the
199 | other three managers.  However, we cannot simply reject $H_{0,1}$ and
200 | $H_{0,3}$, since this would fail to account for the multiple testing
201 | that we have performed. Instead, we will conduct Bonferroni’s method
202 | and Holm’s method to control the FWER.
203 | 
204 | To do this, we use the `multipletests()`  function from the
205 | `statsmodels` module (abbreviated to `mult_test()`). Given the $p$-values,
206 | for methods like Holm and Bonferroni the function outputs
207 | adjusted $p$-values,  which
208 | can be thought of as a new set of $p$-values that have been corrected
209 | for multiple testing. If the adjusted $p$-value for a given hypothesis
210 | is less than or equal to $\alpha$, then that hypothesis can be
211 | rejected while maintaining a FWER of no more than $\alpha$. In other
212 | words, for such methods, the adjusted $p$-values resulting from the `multipletests()`
213 | function can simply be compared to the desired FWER in order to
214 | determine whether or not to reject each hypothesis. We will later
215 | see that we can use the same function to control FDR as well.
216 | 
217 | 
218 | The `mult_test()` function takes $p$-values and a `method` argument, as well as an optional
219 | `alpha` argument. It returns the  decisions (`reject` below)
220 | as well as the adjusted $p$-values (`bonf`).
221 | 
222 | ```{python}
223 | reject, bonf = mult_test(fund_mini_pvals, method = "bonferroni")[:2]
224 | reject
225 | 
226 | ```
227 | 
228 | 
229 | The $p$-values `bonf` are simply the `fund_mini_pvalues` multiplied by 5 and truncated to be less than
230 | or equal to 1.
231 | 
232 | ```{python}
233 | bonf, np.minimum(fund_mini_pvals * 5, 1)
234 | 
235 | ```
236 | 
237 | Therefore, using Bonferroni’s method, we are able to reject the null hypothesis only for Manager
238 | One while controlling FWER at $0.05$.
239 | 
240 | By contrast, using Holm’s method, the adjusted $p$-values indicate
241 | that we can  reject the null
242 | hypotheses for Managers One and Three at a FWER of $0.05$.
243 | 
244 | ```{python}
245 | mult_test(fund_mini_pvals, method = "holm", alpha=0.05)[:2]
246 | 
247 | ```
248 | 
249 | 
250 | As discussed previously, Manager One seems to perform particularly
251 | well, whereas Manager Two has poor performance.
252 |  
253 | 
254 | ```{python}
255 | fund_mini.mean()
256 | 
257 | ```
258 | 
259 | 
260 | Is there evidence of a meaningful difference in performance between
261 | these two managers?  We can check this by performing a  paired $t$-test  using the `ttest_rel()` function
262 | from `scipy.stats`:
263 | 
264 | ```{python}
265 | ttest_rel(fund_mini['Manager1'],
266 |           fund_mini['Manager2']).pvalue
267 | 
268 | ```
269 | 
270 | The test results in a $p$-value of 0.038,
271 | suggesting a statistically significant difference.
272 | 
273 | However, we decided to perform this test only after examining the data
274 | and noting that Managers One and Two had the highest and lowest mean
275 | performances.  In a sense, this means that we have implicitly
276 | performed ${5 \choose 2} = 5(5-1)/2=10$ hypothesis tests, rather than
277 | just one, as discussed in  Section 13.3.2.  Hence, we use the
278 | `pairwise_tukeyhsd()`  function from
279 | `statsmodels.stats.multicomp` to apply Tukey’s method
280 |   in order to adjust for multiple testing.  This function takes
281 | as input a fitted *ANOVA*  regression model, which is
282 | essentially just a linear regression in which all of the predictors
283 | are qualitative.  In this case, the response consists of the monthly
284 | excess returns achieved by each manager, and the predictor indicates
285 | the manager to which each return corresponds.
286 | 
287 | ```{python}
288 | returns = np.hstack([fund_mini.iloc[:,i] for i in range(5)])
289 | managers = np.hstack([[i+1]*50 for i in range(5)])
290 | tukey = pairwise_tukeyhsd(returns, managers)
291 | print(tukey.summary())
292 | 
293 | ```
294 | 
295 | 
296 | The `pairwise_tukeyhsd()` function provides confidence intervals
297 | for the difference between each pair of managers (`lower` and
298 | `upper`), as well as a $p$-value. All of these quantities have
299 | been adjusted for multiple testing. Notice that the $p$-value for the
300 | difference between Managers One and Two has increased from $0.038$ to
301 | $0.186$, so there is no longer clear evidence of a difference between
302 | the managers’ performances.  We can plot the confidence intervals for
303 | the pairwise comparisons using the `plot_simultaneous()` method
304 | of `tukey`. Any pair of intervals that don’t overlap indicates a significant difference at the nominal level of 0.05. In this case,
305 | no differences are considered significant as reported in the table above.
306 | 
307 | ```{python}
308 | fig, ax = plt.subplots(figsize=(8,8))
309 | tukey.plot_simultaneous(ax=ax);
310 | 
311 | ```
312 | 
313 | ## False Discovery Rate
314 | Now we perform hypothesis tests for all 2,000 fund managers in the
315 | `Fund`  dataset. We perform a one-sample $t$-test
316 | of   $H_{0,j}: \mu_j=0$, which states that the
317 | $j$th fund manager’s mean return is zero.
318 | 
319 | ```{python}
320 | fund_pvalues = np.empty(2000)
321 | for i, manager in enumerate(Fund.columns):
322 |     fund_pvalues[i] = ttest_1samp(Fund[manager], 0).pvalue
323 | 
324 | ```
325 | 
326 | There are far too many managers to consider trying to control the FWER.
327 | Instead, we focus on controlling the FDR: that is, the expected fraction of rejected null hypotheses that are actually false positives.
328 | The `multipletests()` function (abbreviated `mult_test()`) can be used to carry out the Benjamini--Hochberg procedure.
329 | 
330 | ```{python}
331 | fund_qvalues = mult_test(fund_pvalues, method = "fdr_bh")[1]
332 | fund_qvalues[:10]
333 | 
334 | ```
335 | 
336 | The  *q-values* output by the
337 | Benjamini--Hochberg procedure can be interpreted as the smallest FDR
338 | threshold at which we would reject a particular null hypothesis. For
339 | instance, a $q$-value of $0.1$ indicates that we can reject the
340 | corresponding null hypothesis at an FDR of 10% or greater, but that
341 | we cannot reject the null hypothesis at an FDR below 10%.
342 | 
343 | If we control the FDR at 10%, then for how many of the fund managers can we reject $H_{0,j}: \mu_j=0$?
344 | 
345 | ```{python}
346 | (fund_qvalues <= 0.1).sum()
347 | 
348 | ```
349 | We find that 146 of the 2,000 fund managers have a $q$-value below
350 | 0.1; therefore, we are able to conclude that 146 of the fund managers
351 | beat the market at an FDR of 10%.  Only about 15 (10% of 146) of
352 | these fund managers are likely to be false discoveries.
353 | 
354 | By contrast, if we had instead used Bonferroni’s method to control the
355 | FWER at level $\alpha=0.1$, then we would have failed to reject any
356 | null hypotheses!
357 | 
358 | ```{python}
359 | (fund_pvalues <= 0.1 / 2000).sum()
360 | 
361 | ```
362 | 
363 | 
364 | Figure 13.6 displays the ordered
365 | $p$-values, $p_{(1)} \leq p_{(2)} \leq \cdots \leq p_{(2000)}$, for
366 | the  `Fund`  dataset, as well as the threshold for rejection by the
367 | Benjamini--Hochberg procedure.  Recall that the Benjamini--Hochberg
368 | procedure identifies the largest $p$-value such that $p_{(j)}<qj/m$,
369 | and rejects all hypotheses for which the $p$-value is less than or
370 | equal to $p_{(j)}$. In the code below, we implement the
371 | Benjamini--Hochberg procedure ourselves, in order to illustrate how it
372 | works. We first order the $p$-values. We then identify all $p$-values
373 | that satisfy $p_{(j)}<qj/m$ (`sorted_set_`).  Finally, `selected_`
374 | is a boolean array indicating which $p$-values
375 |  are less than or equal to the largest
376 | $p$-value in `sorted_[sorted_set_]`. Therefore, `selected_` indexes the
377 | $p$-values rejected by the Benjamini--Hochberg procedure.
378 | 
379 | ```{python}
380 | sorted_ = np.sort(fund_pvalues)
381 | m = fund_pvalues.shape[0]
382 | q = 0.1
383 | sorted_set_ = np.where(sorted_ < q * np.linspace(1, m, m) / m)[0]
384 | if sorted_set_.shape[0] > 0:
385 |     selected_ = fund_pvalues < sorted_[sorted_set_].max()
386 |     sorted_set_ = np.arange(sorted_set_.max())
387 | else:
388 |     selected_ = []
389 |     sorted_set_ = []
390 | 
391 | ```
392 | 
393 | We now reproduce  the middle panel of Figure 13.6.
394 | 
395 | ```{python}
396 | fig, ax = plt.subplots()
397 | ax.scatter(np.arange(0, sorted_.shape[0]) + 1,
398 |            sorted_, s=10)
399 | ax.set_yscale('log')
400 | ax.set_xscale('log')
401 | ax.set_ylabel('P-Value')
402 | ax.set_xlabel('Index')
403 | ax.scatter(sorted_set_+1, sorted_[sorted_set_], c='r', s=20)
404 | ax.axline((0, 0), (1,q/m), c='k', ls='--', linewidth=3);
405 | 
406 | ```
407 | 
408 | 
409 | ## A Re-Sampling Approach
410 | Here, we implement the re-sampling approach to hypothesis testing
411 | using the  `Khan`  dataset, which we investigated in
412 | Section 13.5.  First, we merge the training and
413 | testing data, which results in observations on 83 patients for
414 | 2,308 genes.
415 | 
416 | ```{python}
417 | Khan = load_data('Khan')      
418 | D = pd.concat([Khan['xtrain'], Khan['xtest']])
419 | D['Y'] = pd.concat([Khan['ytrain'], Khan['ytest']])
420 | D['Y'].value_counts()
421 | 
422 | ```
423 | 
424 | 
425 | There are four classes of cancer. For each gene, we compare the mean
426 | expression in the second class (rhabdomyosarcoma) to the mean
427 | expression in the fourth class (Burkitt’s lymphoma).  Performing a
428 | standard two-sample $t$-test  
429 | using `ttest_ind()`  from `scipy.stats` on the $11$th
430 | gene produces a test-statistic of -2.09 and an associated $p$-value
431 | of 0.0412, suggesting modest evidence of a difference in mean
432 | expression levels between the two cancer types.
433 | 
434 | ```{python}
435 | D2 = D[lambda df:df['Y'] == 2]
436 | D4 = D[lambda df:df['Y'] == 4]
437 | gene_11 = 'G0011'
438 | observedT, pvalue = ttest_ind(D2[gene_11],
439 |                               D4[gene_11],
440 |                               equal_var=True)
441 | observedT, pvalue
442 | 
443 | ```
444 | 
445 | 
446 | However, this $p$-value relies on the assumption that under the null
447 | hypothesis of no difference between the two groups, the test statistic
448 | follows a $t$-distribution with $29+25-2=52$ degrees of freedom.
449 | Instead of using this theoretical null distribution, we can randomly
450 | split the 54 patients into two groups of 29 and 25, and compute a new
451 | test statistic.  Under the null hypothesis of no difference between
452 | the groups, this new test statistic should have the same distribution
453 | as our original one.  Repeating this process 10,000 times allows us to
454 | approximate the null distribution of the test statistic.  We compute
455 | the fraction of the time that our observed test statistic exceeds the
456 | test statistics obtained via re-sampling.
457 | 
458 | ```{python}
459 | B = 10000
460 | Tnull = np.empty(B)
461 | D_ = np.hstack([D2[gene_11], D4[gene_11]])
462 | n_ = D2[gene_11].shape[0]
463 | D_null = D_.copy()
464 | for b in range(B):
465 |     rng.shuffle(D_null)
466 |     ttest_ = ttest_ind(D_null[:n_],
467 |                        D_null[n_:],
468 |                        equal_var=True)
469 |     Tnull[b] = ttest_.statistic
470 | (np.abs(Tnull) < np.abs(observedT)).mean()
471 | 
472 | ```
473 | 
474 | 
475 | This fraction, 0.0398,
476 | is our re-sampling-based $p$-value.
477 | It is almost identical to the $p$-value of  0.0412 obtained using the theoretical null distribution.
478 | We can plot  a histogram of the re-sampling-based test statistics in order to reproduce  Figure 13.7.
479 | 
480 | ```{python}
481 | fig, ax = plt.subplots(figsize=(8,8))
482 | ax.hist(Tnull,
483 |         bins=100,
484 |         density=True,
485 |         facecolor='y',
486 |         label='Null')
487 | xval = np.linspace(-4.2, 4.2, 1001)
488 | ax.plot(xval,
489 |         t_dbn.pdf(xval, D_.shape[0]-2),
490 |         c='r')
491 | ax.axvline(observedT,
492 |            c='b',
493 |            label='Observed')
494 | ax.legend()
495 | ax.set_xlabel("Null Distribution of Test Statistic");
496 | 
497 | ```
498 | The re-sampling-based null distribution is almost identical to the theoretical null distribution, which is displayed in red.
499 | 
500 | Finally, we implement the plug-in re-sampling FDR approach outlined in
501 | Algorithm 13.4. Depending on the speed of your
502 | computer, calculating the FDR for all 2,308 genes in the `Khan`
503 | dataset may take a while.  Hence, we will illustrate the approach on a
504 | random subset of 100 genes.  For each gene, we first compute the
505 | observed test statistic, and then produce 10,000 re-sampled test
506 | statistics. This may take a few minutes to run.  If you are in a rush,
507 | then you could set `B` equal to a smaller value (e.g. `B=500`).
508 | 
509 | ```{python}
510 | m, B = 100, 10000
511 | idx = rng.choice(Khan['xtest'].columns, m, replace=False)
512 | T_vals = np.empty(m)
513 | Tnull_vals = np.empty((m, B))
514 | 
515 | for j in range(m):
516 |     col = idx[j]
517 |     T_vals[j] = ttest_ind(D2[col],
518 |                           D4[col],
519 |                           equal_var=True).statistic
520 |     D_ = np.hstack([D2[col], D4[col]])
521 |     D_null = D_.copy()
522 |     for b in range(B):
523 |         rng.shuffle(D_null)
524 |         ttest_ = ttest_ind(D_null[:n_],
525 |                            D_null[n_:],
526 |                            equal_var=True)
527 |         Tnull_vals[j,b] = ttest_.statistic
528 | 
529 | ```
530 | 
531 | Next, we compute the number of rejected null hypotheses $R$, the
532 | estimated number of false positives $\widehat{V}$, and the estimated
533 | FDR, for a range of threshold values $c$ in
534 | Algorithm 13.4. The threshold values are chosen
535 | using the absolute values of the test statistics from the 100 genes.
536 | 
537 | ```{python}
538 | cutoffs = np.sort(np.abs(T_vals))
539 | FDRs, Rs, Vs = np.empty((3, m))
540 | for j in range(m):
541 |    R = np.sum(np.abs(T_vals) >= cutoffs[j])
542 |    V = np.sum(np.abs(Tnull_vals) >= cutoffs[j]) / B
543 |    Rs[j] = R
544 |    Vs[j] = V
545 |    FDRs[j] = V / R
546 | 
547 | ```
548 | 
549 | Now, for any given FDR, we can find the genes that will be
550 | rejected. For example, with FDR controlled at 0.1, we reject 15 of the
551 | 100 null hypotheses. On average, we would expect about one or two of
552 | these genes (i.e. 10% of 15) to be false discoveries.  At an FDR of
553 | 0.2, we can reject the null hypothesis for 28 genes, of which we
554 | expect around six to be false discoveries.
555 | 
556 | The variable `idx` stores which
557 | genes were included in our 100 randomly-selected genes. Let’s look at
558 | the genes whose estimated FDR is less than 0.1.
559 | 
560 | ```{python}
561 | sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.1].min()])
562 | 
563 | ```
564 | 
565 | At an FDR threshold of 0.2, more genes are selected, at the cost of having a higher expected
566 | proportion of false discoveries.
567 | 
568 | ```{python}
569 | sorted(idx[np.abs(T_vals) >= cutoffs[FDRs < 0.2].min()])
570 | 
571 | ```
572 | 
573 | The next line  generates  Figure 13.11, which is similar
574 | to  Figure 13.9,
575 | except that it is based on only  a subset of the genes.
576 | 
577 | ```{python}
578 | fig, ax = plt.subplots()
579 | ax.plot(Rs, FDRs, 'b', linewidth=3)
580 | ax.set_xlabel("Number of Rejections")
581 | ax.set_ylabel("False Discovery Rate");
582 | 
583 | ```
584 |  
585 | 
586 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2023, intro-stat-learning
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .DEFAULT_GOAL := help
 2 | 
 3 | venv:
 4 | 	@curl -LsSf https://astral.sh/uv/install.sh | sh
 5 | 	@uv venv --python 3.12
 6 | 
 7 | 
 8 | .PHONY: install
 9 | install: venv ## Install all dependencies (in the virtual environment) defined in requirements.txt
10 | 	@uv pip install --upgrade pip
11 | 	@uv pip install -r requirements.txt
12 | 
13 | 
14 | .PHONY: help
15 | help:  ## Display this help screen
16 | 	@echo -e "\033[1mAvailable commands:\033[0m"
17 | 	@grep -E '^[a-z.A-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "  \033[36m%-18s\033[0m %s\n", $$1, $$2}' | sort
18 | 
19 | 
20 | .PHONY: jupyter
21 | jupyter: install  ## Install and start jupyter Lab
22 | 	@uv run pip install jupyterlab
23 | 	@uv run jupyter lab
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ISLP_labs
 2 | 
 3 | [![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/intro-stat-learning/ISLP_Labs)
 4 | 
 5 | <!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
 6 | [![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](#contributors-)
 7 | <!-- ALL-CONTRIBUTORS-BADGE:END -->
 8 | 
 9 | ## Authors
10 | 
11 | - Trevor Hastie
12 | 
13 | - Gareth James
14 | 
15 | - Jonathan Taylor
16 | 
17 | - Robert Tibshirani
18 | 
19 | - Daniela Witten
20 | 
21 | ### ISLP
22 | 
23 | Please ensure you have followed the installation instructions for
24 | [ISLP](https://github.com/intro-stat-learning/ISLP). This will address
25 | installation of [jupyterlab](https://github.com/jupyterlab/jupyterlab)
26 | if necessary, which is not included as a requirement of the labs.
27 | 
28 | ### Up-to-date version of labs for ISLP. 
29 | 
30 | This repo will track labs for ISLP as their source code changes.  The
31 | intent is that building a virtual environment with
32 | `requirements.txt` will reproduce the results in this repo.
33 | 
34 | To install the current version of the requirements run
35 | 
36 | ```
37 | pip install -r https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/v2.2/requirements.txt;
38 | ```
39 | 
40 | The labs can now be run via:
41 | 
42 | ```
43 | jupyter lab Ch02-statlearn-lab.ipynb
44 | ```
45 | 
46 | ## Using make
47 | 
48 | If `make` is available on your machine, the steps above can be replaced
49 | 
50 | ```
51 | make install
52 | make jupyter
53 | ```
54 | 
55 | # Zip / tarball
56 | 
57 | You can download all the labs as a `.zip` or `.tar.gz` [here](https://github.com/intro-stat-learning/ISLP_labs/releases/tag/v2.2)
58 | 
59 | 
60 | ## Contributors ✨
61 | 
62 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
63 | 
64 | <!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
65 | <!-- prettier-ignore-start -->
66 | <!-- markdownlint-disable -->
67 | <table>
68 |   <tbody>
69 |     <tr>
70 |       <td align="center" valign="top" width="14.28%"><a href="https://github.com/tibshirani"><img src="https://avatars.githubusercontent.com/u/2848609?v=4?s=100" width="100px;" alt="tibshirani"/><br /><sub><b>tibshirani</b></sub></a><br /><a href="https://github.com/intro-stat-learning/ISLP_labs/commits?author=tibshirani" title="Code">💻</a> <a href="#content-tibshirani" title="Content">🖋</a></td>
71 |       <td align="center" valign="top" width="14.28%"><a href="https://web.stanford.edu/~hastie/"><img src="https://avatars.githubusercontent.com/u/13293253?v=4?s=100" width="100px;" alt="trevorhastie"/><br /><sub><b>trevorhastie</b></sub></a><br /><a href="https://github.com/intro-stat-learning/ISLP_labs/commits?author=trevorhastie" title="Code">💻</a> <a href="#content-trevorhastie" title="Content">🖋</a></td>
72 |       <td align="center" valign="top" width="14.28%"><a href="https://github.com/danielawitten"><img src="https://avatars.githubusercontent.com/u/12654191?v=4?s=100" width="100px;" alt="danielawitten"/><br /><sub><b>danielawitten</b></sub></a><br /><a href="https://github.com/intro-stat-learning/ISLP_labs/commits?author=danielawitten" title="Code">💻</a> <a href="#content-danielawitten" title="Content">🖋</a></td>
73 |       <td align="center" valign="top" width="14.28%"><a href="http://statweb.stanford.edu/~jtaylo"><img src="https://avatars.githubusercontent.com/u/341611?v=4?s=100" width="100px;" alt="Jonathan Taylor"/><br /><sub><b>Jonathan Taylor</b></sub></a><br /><a href="https://github.com/intro-stat-learning/ISLP_labs/commits?author=jonathan-taylor" title="Code">💻</a> <a href="#content-jonathan-taylor" title="Content">🖋</a></td>
74 |       <td align="center" valign="top" width="14.28%"><a href="https://github.com/tschm"><img src="https://avatars.githubusercontent.com/u/2046079?v=4?s=100" width="100px;" alt="Thomas Schmelzer"/><br /><sub><b>Thomas Schmelzer</b></sub></a><br /><a href="https://github.com/intro-stat-learning/ISLP_labs/commits?author=tschm" title="Code">💻</a></td>
75 |     </tr>
76 |   </tbody>
77 | </table>
78 | 
79 | <!-- markdownlint-restore -->
80 | <!-- prettier-ignore-end -->
81 | 
82 | <!-- ALL-CONTRIBUTORS-LIST:END -->
83 | 
84 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
85 | 


--------------------------------------------------------------------------------
/book_images/Cape_Weaver.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Cape_Weaver.jpg


--------------------------------------------------------------------------------
/book_images/Flamingo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Flamingo.jpg


--------------------------------------------------------------------------------
/book_images/Hawk_Fountain.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Hawk_Fountain.jpg


--------------------------------------------------------------------------------
/book_images/Hawk_cropped.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Hawk_cropped.jpg


--------------------------------------------------------------------------------
/book_images/Lhasa_Apso.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Lhasa_Apso.jpg


--------------------------------------------------------------------------------
/book_images/Sleeping_Cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intro-stat-learning/ISLP_labs/f7d7153e137b9d4885670b44a022f6a6113d56b0/book_images/Sleeping_Cat.jpg


--------------------------------------------------------------------------------
/imagenet_class_index.json:
--------------------------------------------------------------------------------
1 | {"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]}


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==1.26.4
 2 | scipy==1.11.4
 3 | pandas==2.2.2
 4 | lxml==5.2.2
 5 | scikit-learn==1.5.0
 6 | joblib==1.4.2
 7 | statsmodels==0.14.2
 8 | lifelines==0.28.0
 9 | pygam==0.9.1
10 | l0bnb==1.0.0
11 | torch==2.3.0
12 | torchvision==0.18.0
13 | pytorch-lightning==2.2.5
14 | torchinfo==1.8.0
15 | torchmetrics==1.4.0.post0
16 | ISLP==0.4.0
17 | 


--------------------------------------------------------------------------------