├── .dvc
    ├── config
    ├── .gitignore
    └── plots
    │   ├── scatter.json
    │   ├── default.json
    │   ├── confusion.json
    │   └── smooth.json
├── .gitignore
├── requirements.txt
├── metrics.json
├── README.md
├── dvc.yaml
├── get_data.py
├── .github
    └── workflows
    │   └── train.yaml
├── dvc.lock
├── process_data.py
└── train.py


/.dvc/config:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.dvc/.gitignore:
--------------------------------------------------------------------------------
1 | /config.local
2 | /tmp
3 | /cache
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data_raw.csv
2 | /data_processed.csv
3 | /by_region.png
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | scikit-learn
3 | pandas
4 | seaborn
5 | matplotlib
6 | 


--------------------------------------------------------------------------------
/metrics.json:
--------------------------------------------------------------------------------
1 | {"accuracy": 0.8666666666666667, "specificity": 0.375, "sensitivity": 0.9550561797752809}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Modeling Swiss farmers' attitudes about climate change
2 | 
3 | Modeling data from [Kreft et al. 2020](https://www.sciencedirect.com/science/article/pii/S2352340920303048).
4 | 
5 | 
6 | 


--------------------------------------------------------------------------------
/dvc.yaml:
--------------------------------------------------------------------------------
 1 | stages:
 2 |   get_data:
 3 |     cmd: python get_data.py
 4 |     deps:
 5 |     - get_data.py
 6 |     outs:
 7 |     - data_raw.csv  
 8 |   process:
 9 |     cmd: python process_data.py
10 |     deps:
11 |     - process_data.py
12 |     - data_raw.csv
13 |     outs:
14 |     - data_processed.csv
15 |   train:
16 |     cmd: python train.py
17 |     deps:
18 |     - train.py
19 |     - data_processed.csv
20 |     outs:
21 |     - by_region.png
22 |     metrics:
23 |     - metrics.json:
24 |         cache: false


--------------------------------------------------------------------------------
/get_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import wget
 3 | 
 4 | # data from https://www.sciencedirect.com/science/article/pii/S2352340920303048
 5 | 
 6 | # Download the zipped dataset
 7 | url = 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/yshdbyj6zy-1.zip'
 8 | zip_name = "data.zip"
 9 | wget.download(url, zip_name)
10 | 
11 | # Unzip it and standardize the .csv filename
12 | import zipfile
13 | with zipfile.ZipFile(zip_name,"r") as zip_ref:
14 |     zip_ref.filelist[0].filename = 'data_raw.csv'
15 |     zip_ref.extract(zip_ref.filelist[0])
16 | 
17 | os.remove(zip_name)
18 | 
19 | 


--------------------------------------------------------------------------------
/.github/workflows/train.yaml:
--------------------------------------------------------------------------------
 1 | # CI workflow: on every push, reproduce the DVC pipeline and send a report
 2 | # containing the metrics diff against master and the by-region figure.
 3 | name: farmers
 4 | on: [push]
 5 | jobs:
 6 |   run:
 7 |     runs-on: [ubuntu-latest]
 8 |     container: docker://dvcorg/cml-py3:latest
 9 |     steps:
10 |       - uses: actions/checkout@v2
11 |       - name: cml_run
12 |         env:
13 |           repo_token: ${{ secrets.GITHUB_TOKEN }}
14 |         run: |
15 |           pip install -r requirements.txt
16 |           dvc repro
17 | 
18 |           # Start the report with the metrics diff against master
19 |           git fetch --prune
20 |           dvc metrics diff --show-md master > report.md
21 | 
22 |           # Add figure to the report. The heading has to be appended to
23 |           # report.md; a bare echo would only print it to the job log.
24 |           echo "## Validating results by region" >> report.md
25 |           cml-publish by_region.png --md >> report.md
26 |           cml-send-comment report.md


--------------------------------------------------------------------------------
/.dvc/plots/scatter.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "point",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "quantitative",
12 |             "title": "<DVC_METRIC_X_LABEL>"
13 |         },
14 |         "y": {
15 |             "field": "<DVC_METRIC_Y>",
16 |             "type": "quantitative",
17 |             "title": "<DVC_METRIC_Y_LABEL>",
18 |             "scale": {
19 |                 "zero": false
20 |             }
21 |         },
22 |         "color": {
23 |             "field": "rev",
24 |             "type": "nominal"
25 |         }
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/.dvc/plots/default.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/dvc.lock:
--------------------------------------------------------------------------------
 1 | get_data:
 2 |   cmd: python get_data.py
 3 |   deps:
 4 |   - path: get_data.py
 5 |     md5: 1db5f442403042e0403c75132fe59af4
 6 |   outs:
 7 |   - path: data_raw.csv
 8 |     md5: a6aec8da63a5fa2619af025a76746f29
 9 | process:
10 |   cmd: python process_data.py
11 |   deps:
12 |   - path: data_raw.csv
13 |     md5: a6aec8da63a5fa2619af025a76746f29
14 |   - path: process_data.py
15 |     md5: 79b357c12f171f3d07c76780815b651c
16 |   outs:
17 |   - path: data_processed.csv
18 |     md5: 3b20a3a6ac0570f3de28b77d1e88f932
19 | train:
20 |   cmd: python train.py
21 |   deps:
22 |   - path: data_processed.csv
23 |     md5: 3b20a3a6ac0570f3de28b77d1e88f932
24 |   - path: train.py
25 |     md5: 80ad33d8caf823fc1d5cdefcb5b9490a
26 |   outs:
27 |   - path: by_region.png
28 |     md5: e7f3818fac35589b0c46dd65f8293e74
29 |   - path: metrics.json
30 |     md5: f4844c28505568f336c5f91db3f1beb3
31 | 


--------------------------------------------------------------------------------
/.dvc/plots/confusion.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": "rect",
 8 |     "encoding": {
 9 |         "x": {
10 |             "field": "<DVC_METRIC_X>",
11 |             "type": "nominal",
12 |             "sort": "ascending",
13 |             "title": "<DVC_METRIC_X_LABEL>"
14 |         },
15 |         "y": {
16 |             "field": "<DVC_METRIC_Y>",
17 |             "type": "nominal",
18 |             "sort": "ascending",
19 |             "title": "<DVC_METRIC_Y_LABEL>"
20 |         },
21 |         "color": {
22 |             "aggregate": "count",
23 |             "type": "quantitative"
24 |         },
25 |         "facet": {
26 |             "field": "rev",
27 |             "type": "nominal"
28 |         }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/process_data.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | df = pd.read_csv("data_raw.csv")
 4 | 
 5 | all_features = df.columns
 6 | 
 7 | # Let's drop some features
 8 | names = [feat for feat in all_features if "net_name" in feat] # excluded for privacy reasons
 9 | useless = ["info_gew","info_resul","interviewtime","id","date"] # features that we expect are uninformative
10 | drop_list = names + useless 
11 | 
12 | # Remove the questionnaire about agricultural practices until I can better understand it
13 | practice_list = ["legum","conc","add","lact","breed","covman","comp","drag","cov","plow","solar","biog","ecodr"]
14 | for feat in all_features:
15 |     if any(x in feat for x in practice_list):
16 |         drop_list.append(feat)
17 | 
18 | 
19 | df = df.drop(columns=drop_list)
20 | 
21 | # Convert non-numeric features to numeric
22 | non_numeric = list(df.select_dtypes(include=['O']).columns)
23 | for col in non_numeric:
24 |     codes,uniques=pd.factorize(df[col])
25 |     df[col] = codes
26 | 
27 | df.to_csv("data_processed.csv")
28 | 
29 | 


--------------------------------------------------------------------------------
/.dvc/plots/smooth.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
 3 |     "data": {
 4 |         "values": "<DVC_METRIC_DATA>"
 5 |     },
 6 |     "title": "<DVC_METRIC_TITLE>",
 7 |     "mark": {
 8 |         "type": "line"
 9 |     },
10 |     "encoding": {
11 |         "x": {
12 |             "field": "<DVC_METRIC_X>",
13 |             "type": "quantitative",
14 |             "title": "<DVC_METRIC_X_LABEL>"
15 |         },
16 |         "y": {
17 |             "field": "<DVC_METRIC_Y>",
18 |             "type": "quantitative",
19 |             "title": "<DVC_METRIC_Y_LABEL>",
20 |             "scale": {
21 |                 "zero": false
22 |             }
23 |         },
24 |         "color": {
25 |             "field": "rev",
26 |             "type": "nominal"
27 |         }
28 |     },
29 |     "transform": [
30 |         {
31 |             "loess": "<DVC_METRIC_Y>",
32 |             "on": "<DVC_METRIC_X>",
33 |             "groupby": [
34 |                 "rev"
35 |             ],
36 |             "bandwidth": 0.3
37 |         }
38 |     ]
39 | }
40 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd 
 2 | import numpy as np
 3 | from sklearn.linear_model import LogisticRegression
 4 | from sklearn import preprocessing
 5 | from sklearn.model_selection import cross_val_predict
 6 | from sklearn.metrics import confusion_matrix
 7 | from sklearn.metrics import roc_curve
 8 | from sklearn.model_selection import train_test_split
 9 | import json
10 | import seaborn as sns
11 | import matplotlib.pyplot as plt
12 | from sklearn.impute import SimpleImputer
13 | 
14 | df = pd.read_csv("data_processed.csv")
15 | 
16 | #### Get features ready to model! 
17 | y = df.pop("cons_general").to_numpy()
18 | y[y< 4] = 0
19 | y[y>= 4] = 1
20 | 
21 | X = df.to_numpy()
22 | X = preprocessing.scale(X) # Is standard
23 | # Impute NaNs
24 | 
25 | imp = SimpleImputer(missing_values=np.nan, strategy='mean')
26 | imp.fit(X)
27 | X = imp.transform(X)
28 | 
29 | 
30 | # Linear model
31 | clf = LogisticRegression()
32 | yhat = cross_val_predict(clf, X, y, cv=5)
33 | 
34 | acc = np.mean(yhat==y)
35 | tn, fp, fn, tp = confusion_matrix(y, yhat).ravel()
36 | specificity = tn / (tn+fp)
37 | sensitivity = tp / (tp + fn)
38 | 
39 | # Now print to file
40 | with open("metrics.json", 'w') as outfile:
41 |         json.dump({ "accuracy": acc, "specificity": specificity, "sensitivity":sensitivity}, outfile)
42 | 
43 | # Let's visualize within several slices of the dataset
44 | score = yhat == y
45 | score_int = [int(s) for s in score]
46 | df['pred_accuracy'] = score_int
47 | 
48 | # Bar plot by region
49 | 
50 | sns.set_color_codes("dark")
51 | ax = sns.barplot(x="region", y="pred_accuracy", data=df, palette = "Greens_d")
52 | ax.set(xlabel="Region", ylabel = "Model accuracy")
53 | plt.savefig("by_region.png",dpi=80)
54 | 


--------------------------------------------------------------------------------