├── .dvc ├── config ├── .gitignore └── plots │ ├── scatter.json │ ├── default.json │ ├── confusion.json │ └── smooth.json ├── .gitignore ├── requirements.txt ├── metrics.json ├── README.md ├── dvc.yaml ├── get_data.py ├── .github └── workflows │ └── train.yaml ├── dvc.lock ├── process_data.py └── train.py /.dvc/config: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data_raw.csv 2 | /data_processed.csv 3 | /by_region.png 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wget 2 | sklearn 3 | pandas 4 | seaborn 5 | matplotlib 6 | -------------------------------------------------------------------------------- /metrics.json: -------------------------------------------------------------------------------- 1 | {"accuracy": 0.8666666666666667, "specificity": 0.375, "sensitivity": 0.9550561797752809} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modeling Swiss farmer's attitudes about climate change 2 | 3 | Modeling data from [Kreft et al. 2020](https://www.sciencedirect.com/science/article/pii/S2352340920303048). 4 | 5 | 6 | -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | get_data: 3 | cmd: python get_data.py 4 | deps: 5 | - get_data.py 6 | outs: 7 | - data_raw.csv 8 | process: 9 | cmd: python process_data.py 10 | deps: 11 | - process_data.py 12 | - data_raw.csv 13 | outs: 14 | - data_processed.csv 15 | train: 16 | cmd: python train.py 17 | deps: 18 | - train.py 19 | - data_processed.csv 20 | outs: 21 | - by_region.png 22 | metrics: 23 | - metrics.json: 24 | cache: false -------------------------------------------------------------------------------- /get_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import wget 3 | 4 | # data from https://www.sciencedirect.com/science/article/pii/S2352340920303048 5 | 6 | # Download the zipped dataset 7 | url = 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/yshdbyj6zy-1.zip' 8 | zip_name = "data.zip" 9 | wget.download(url, zip_name) 10 | 11 | # Unzip it and standardize the .csv filename 12 | import zipfile 13 | with zipfile.ZipFile(zip_name,"r") as zip_ref: 14 | zip_ref.filelist[0].filename = 'data_raw.csv' 15 | zip_ref.extract(zip_ref.filelist[0]) 16 | 17 | os.remove(zip_name) 18 | 19 | -------------------------------------------------------------------------------- /.github/workflows/train.yaml: -------------------------------------------------------------------------------- 1 | name: farmers 2 | on: [push] 3 | jobs: 4 | run: 5 | runs-on: [ubuntu-latest] 6 | container: docker://dvcorg/cml-py3:latest 7 | steps: 8 | - uses: actions/checkout@v2 9 | - name: cml_run 10 | env: 11 | repo_token: ${{ secrets.GITHUB_TOKEN }} 12 | run: | 13 | pip install -r requirements.txt 14 | dvc repro 15 | 16 | git fetch --prune 17 | dvc metrics diff --show-md master > report.md 18 | 19 | # Add figure to the report 20 | echo "## Validating results by region" 21 | cml-publish by_region.png --md >> report.md 22 | cml-send-comment report.md -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "point", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "quantitative", 12 | "title": "" 13 | }, 14 | "y": { 15 | "field": "", 16 | "type": "quantitative", 17 | "title": "", 18 | "scale": { 19 | "zero": false 20 | } 21 | }, 22 | "color": { 23 | "field": "rev", 24 | "type": "nominal" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | get_data: 2 | cmd: python get_data.py 3 | deps: 4 | - path: get_data.py 5 | md5: 1db5f442403042e0403c75132fe59af4 6 | outs: 7 | - path: data_raw.csv 8 | md5: a6aec8da63a5fa2619af025a76746f29 9 | process: 10 | cmd: python process_data.py 11 | deps: 12 | - path: data_raw.csv 13 | md5: a6aec8da63a5fa2619af025a76746f29 14 | - path: process_data.py 15 | md5: 79b357c12f171f3d07c76780815b651c 16 | outs: 17 | - path: data_processed.csv 18 | md5: 3b20a3a6ac0570f3de28b77d1e88f932 19 | train: 20 | cmd: python train.py 21 | deps: 22 | - path: data_processed.csv 23 | md5: 3b20a3a6ac0570f3de28b77d1e88f932 24 | - path: train.py 25 | md5: 80ad33d8caf823fc1d5cdefcb5b9490a 26 | outs: 27 | - path: by_region.png 28 | md5: e7f3818fac35589b0c46dd65f8293e74 29 | - path: metrics.json 30 | md5: f4844c28505568f336c5f91db3f1beb3 31 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": "rect", 8 | "encoding": { 9 | "x": { 10 | "field": "", 11 | "type": "nominal", 12 | "sort": "ascending", 13 | "title": "" 14 | }, 15 | "y": { 16 | "field": "", 17 | "type": "nominal", 18 | "sort": "ascending", 19 | "title": "" 20 | }, 21 | "color": { 22 | "aggregate": "count", 23 | "type": "quantitative" 24 | }, 25 | "facet": { 26 | "field": "rev", 27 | "type": "nominal" 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /process_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df = pd.read_csv("data_raw.csv") 4 | 5 | all_features = df.columns 6 | 7 | # Let's drop some features 8 | names = [feat for feat in all_features if "net_name" in feat] # excluded for privacy reasons 9 | useless = ["info_gew","info_resul","interviewtime","id","date"] # features that we expect are uninformative 10 | drop_list = names + useless 11 | 12 | # Remove the questionnaire about agricultural practices until I can better understand it 13 | practice_list = ["legum","conc","add","lact","breed","covman","comp","drag","cov","plow","solar","biog","ecodr"] 14 | for feat in all_features: 15 | if any(x in feat for x in practice_list): 16 | drop_list.append(feat) 17 | 18 | 19 | df = df.drop(columns=drop_list) 20 | 21 | # Convert non-numeric features to numeric 22 | non_numeric = list(df.select_dtypes(include=['O']).columns) 23 | for col in non_numeric: 24 | codes,uniques=pd.factorize(df[col]) 25 | df[col] = codes 26 | 27 | df.to_csv("data_processed.csv") 28 | 29 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "title": "", 7 | "mark": { 8 | "type": "line" 9 | }, 10 | "encoding": { 11 | "x": { 12 | "field": "", 13 | "type": "quantitative", 14 | "title": "" 15 | }, 16 | "y": { 17 | "field": "", 18 | "type": "quantitative", 19 | "title": "", 20 | "scale": { 21 | "zero": false 22 | } 23 | }, 24 | "color": { 25 | "field": "rev", 26 | "type": "nominal" 27 | } 28 | }, 29 | "transform": [ 30 | { 31 | "loess": "", 32 | "on": "", 33 | "groupby": [ 34 | "rev" 35 | ], 36 | "bandwidth": 0.3 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn import preprocessing 5 | from sklearn.model_selection import cross_val_predict 6 | from sklearn.metrics import confusion_matrix 7 | from sklearn.metrics import roc_curve 8 | from sklearn.model_selection import train_test_split 9 | import json 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | from sklearn.impute import SimpleImputer 13 | 14 | df = pd.read_csv("data_processed.csv") 15 | 16 | #### Get features ready to model! 17 | y = df.pop("cons_general").to_numpy() 18 | y[y< 4] = 0 19 | y[y>= 4] = 1 20 | 21 | X = df.to_numpy() 22 | X = preprocessing.scale(X) # Is standard 23 | # Impute NaNs 24 | 25 | imp = SimpleImputer(missing_values=np.nan, strategy='mean') 26 | imp.fit(X) 27 | X = imp.transform(X) 28 | 29 | 30 | # Linear model 31 | clf = LogisticRegression() 32 | yhat = cross_val_predict(clf, X, y, cv=5) 33 | 34 | acc = np.mean(yhat==y) 35 | tn, fp, fn, tp = confusion_matrix(y, yhat).ravel() 36 | specificity = tn / (tn+fp) 37 | sensitivity = tp / (tp + fn) 38 | 39 | # Now print to file 40 | with open("metrics.json", 'w') as outfile: 41 | json.dump({ "accuracy": acc, "specificity": specificity, "sensitivity":sensitivity}, outfile) 42 | 43 | # Let's visualize within several slices of the dataset 44 | score = yhat == y 45 | score_int = [int(s) for s in score] 46 | df['pred_accuracy'] = score_int 47 | 48 | # Bar plot by region 49 | 50 | sns.set_color_codes("dark") 51 | ax = sns.barplot(x="region", y="pred_accuracy", data=df, palette = "Greens_d") 52 | ax.set(xlabel="Region", ylabel = "Model accuracy") 53 | plt.savefig("by_region.png",dpi=80) 54 | --------------------------------------------------------------------------------