├── data
│   └── messages
│       ├── Lenna.png
│       ├── Lenna_16x16.png
│       ├── Lenna_32x32.png
│       ├── Lenna_64x64.png
│       ├── Lenna_8x8.png
│       ├── Lenna_128x128.png
│       ├── Lenna_256x256.png
│       └── himmw.txt
├── output
│   └── plots
│       ├── clf_embedding_acc.png
│       └── clf_embedding_ll.png
├── stegasawus
│   ├── __init__.py
│   ├── parameter_tuning.yaml
│   ├── seq.py
│   ├── mlp.py
│   ├── dataset.py
│   ├── eda.py
│   ├── lsb.py
│   ├── models.py
│   ├── features.py
│   └── tuning.py
├── setup.py
├── .gitignore
├── examples
│   ├── dataset_creation.py
│   ├── image_plots.py
│   └── feature_set_creation.py
└── README.md
/data/messages/Lenna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna.png
--------------------------------------------------------------------------------
/data/messages/Lenna_16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_16x16.png
--------------------------------------------------------------------------------
/data/messages/Lenna_32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_32x32.png
--------------------------------------------------------------------------------
/data/messages/Lenna_64x64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_64x64.png
--------------------------------------------------------------------------------
/data/messages/Lenna_8x8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_8x8.png
--------------------------------------------------------------------------------
/data/messages/Lenna_128x128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_128x128.png
--------------------------------------------------------------------------------
/data/messages/Lenna_256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_256x256.png
--------------------------------------------------------------------------------
/output/plots/clf_embedding_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/output/plots/clf_embedding_acc.png
--------------------------------------------------------------------------------
/output/plots/clf_embedding_ll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/output/plots/clf_embedding_ll.png
--------------------------------------------------------------------------------
/stegasawus/__init__.py:
--------------------------------------------------------------------------------
1 | import dataset
2 | import eda
3 | import features
4 | import model
5 | import lsb
6 | import seq
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from distutils.core import setup
4 | 
5 | setup(
6 |     name='stegasawus',
7 |     version='0.4.2',
8 |     description='Machine learning detection of steganographic images',
9 |     author='Lachlan Taylor',
10 |     packages=['stegasawus']  # 'stegasawus.lsb']
11 |     # data_files=[('./images/', ['Lenna.png', 'image.png'])]
12 | )
13 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/archive/
2 | data/features/
3 | output/distplots
4 | output/histograms
5 | output/log1p_test
6 | output/k*
7 | images/
8 | jpeg-v4/
9 | # archive/
10 | 
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 | 
16 | # C extensions
17 | *.so
18 | 
19 | # Distribution / packaging
20 | .Python
21 | env/
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | 
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 | 
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 | 
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *,cover
56 | 
57 | # Translations
58 | *.mo
59 | *.pot
60 | 
61 | # Django stuff:
62 | *.log
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
--------------------------------------------------------------------------------
/examples/dataset_creation.py:
--------------------------------------------------------------------------------
1 | from stegasawus.dataset import (
2 |     get_secret_message,
3 |     batch_jpg_to_png,
4 |     crop_images,
5 |     batch_hide_message,
6 |     create_benchmark_image_message)
7 | 
8 | 
9 | path = '/home/rokkuran/workspace/stegasawus/'
10 | 
11 | # Crop all images in a directory and output to another directory
12 | crop_images(
13 |     path_images='{}images/jpg/cats_and_dogs/original/'.format(path),
14 |     path_output='{}images/jpg/cats_and_dogs/cropped_256/'.format(path),
15 |     dimensions=(256, 256),
16 |     centre=True
17 | )
18 | 
19 | 
20 | # Batch convert jpg to png
21 | batch_jpg_to_png(
22 |     path_input='{}images/jpg/cats_and_dogs/cropped_256/'.format(path),
23 |     path_output='{}images/png/cover/'.format(path)
24 | )
25 | 
26 | 
27 | # Create steganographic image set embedding the secret message
28 | generator = 'identity'
29 | dim = 64
30 | 
31 | path_images = '{}images/png/cover/'.format(path)
32 | path_output = '{}images/png/lenna{}_{}/'.format(path, dim, generator)
33 | path_msg = '{}data/messages/'.format(path)
34 | 
35 | secret_message = get_secret_message(
36 |     '{}Lenna_{}x{}.txt'.format(path_msg, dim, dim)
37 | )
38 | 
39 | batch_hide_message(
40 |     secret_message=secret_message,
41 |     path_images=path_images,
42 |     path_output=path_output,
43 |     file_type='png',
44 |     generator=generator
45 | )
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Stegasawus
2 | Detecting whether steganography is present in an image using machine learning.
3 | - Least significant bit (LSB) embedding functions using various embedding location sequences.
4 | - Generates a dataset from a set of cover images.
5 | - Creates feature vectors from statistical moments of autocorrelation and discrete wavelet decomposition measures. 6 | - Exploratory plots of images and training datasets. 7 | - Preliminary model comparisons. 8 | 9 | ### Preliminary Results 10 | Model 5-fold cross validation results on 2000 (256x256) images of cats and dogs with various message sizes embedded. Image messages are converted to strings and hidden in a cover image using the LSB algorithm. 11 | 12 | Image type counts below. Cover is the original image and 64x64 is the image size that is hidden in a cover image. 13 | 14 | `Counter({'16x16': 350, '32x32': 333, '64x64': 317, 'cover': 1000})` 15 | 16 | ![Classifier Accuracy](https://github.com/rokkuran/stegasawus/blob/master/output/plots/clf_embedding_acc.png) 17 | 18 | ![Classifier Log Loss](https://github.com/rokkuran/stegasawus/blob/master/output/plots/clf_embedding_ll.png) 19 | 20 | 21 | ### Future Work 22 | - Model benchmarks for different LSB embedding generator types. 23 | - Look at model performance for different image types (only cats and dogs at the moment). 24 | - Model persistence for well performing trained models. 25 | - Improve LSB embedding location sequences. 26 | - Extend/improve features. 27 | - Look at jpg images and embedding in discrete cosine coefficients. 28 | -------------------------------------------------------------------------------- /examples/image_plots.py: -------------------------------------------------------------------------------- 1 | from stegasawus import eda 2 | 3 | from stegano.lsbset import generators 4 | 5 | 6 | path = '/home/rokkuran/workspace/stegasawus/' 7 | path_cover = '{}images/png/cover/'.format(path) 8 | path_stego = '{}images/png/lenna64_identity/'.format(path) 9 | # path_stego = '{}images/png/lenna64_eratosthenes/'.format(path) 10 | path_output = '{}output'.format(path) 11 | 12 | fname = 'cat.2.png' 13 | z = eda.JointImageAnalyser(path_cover + fname, path_stego + fname) 14 | 15 | # plot cover and stego images side by side. 16 | z.plot_images() 17 | 18 | # plot difference between cover and stego images. 19 | z.plot_difference() 20 | 21 | # plot colour channels of cover and stego images. 
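# (a 2x3 grid: top row cover, bottom row stego; one column per RGB channel)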
22 | z.plot_rgb_components() 23 | 24 | # Reveal and show hidden image 25 | z.reveal_image(generators.identity(), show=True) 26 | 27 | # Plot wavelet decomposition for a colour channel 28 | eda.plot_wavelet_decomposition(z.I[:, :, 0]) 29 | 30 | # generate set of histogram/kde plots 31 | eda.generate_feature_distplots( 32 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 33 | path_output=path_output, 34 | normalise=False 35 | ) 36 | 37 | # generate set of histograms 38 | eda.generate_feature_histograms( 39 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 40 | path_output=path_output, 41 | bins=50, 42 | normalise=False 43 | ) 44 | 45 | # generate kernel density estimation plots 46 | eda.generate_feature_kde( 47 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 48 | path_output=path_output, 49 | normalise=False 50 | ) 51 | -------------------------------------------------------------------------------- /examples/feature_set_creation.py: -------------------------------------------------------------------------------- 1 | from stegasawus.features import ( 2 | create_feature_dataset, 3 | concatenate_feature_sets, 4 | concat_multiple_feature_sets) 5 | 6 | path = '/home/rokkuran/workspace/stegasawus/' 7 | 8 | dim = 64 9 | generator = 'identity' 10 | # generator = 'eratosthenes' 11 | 12 | path_cover = '{}images/png/cover/'.format(path) 13 | path_stego = '{}images/png/lenna{}_{}/'.format(path, dim, generator) 14 | 15 | # use both feature types to create training set 16 | f_types = ['autocorrelation', 'wavelet'] 17 | 18 | # create cover image training dataset 19 | create_feature_dataset( 20 | path_images=path_cover, 21 | class_label='cover', 22 | path_output='{}data/train_cover.csv'.format(path), 23 | f_types=f_types 24 | ) 25 | 26 | # create steganographic image training dataset 27 | create_feature_dataset( 28 | path_images=path_stego, 29 | class_label='stego', 30 | path_output='%sdata/train_stego_lenna%s_%s.csv' % (path, dim, generator), 31 | f_types=f_types 32 | ) 33 | 34 | # merge cover and stego images to create complete training set 35 | concatenate_feature_sets( 36 | '{}data/train_cover.csv'.format(path), 37 | '{}data/train_stego_lenna{}_{}.csv'.format(path, dim, generator), 38 | '{}data/train_lenna_{}.csv'.format(path, generator) 39 | ) 40 | 41 | # combine multiple training training sets together 42 | concat_multiple_feature_sets( 43 | [ 44 | '{}data/train_cover.csv'.format(path), 45 | '{}data/train_stego_lenna16_identity.csv'.format(path), 46 | '{}data/train_stego_lenna32_identity.csv'.format(path), 47 | '{}data/train_stego_lenna64_identity.csv'.format(path), 48 | ], 49 | '{}data/train_lenna_identity.csv'.format(path) 50 | ) 51 | -------------------------------------------------------------------------------- /data/messages/himmw.txt: -------------------------------------------------------------------------------- 1 | SHOUTS AND MURMURS about man who describes meeting his wife at a party. In his description, he drops many prefixes. It had been a rough day, so when I walked into the party I was very chalant, despite my efforts to appear gruntled and consolate. I was furling my wieldy umbrella for the coat check when I saw her standing alone in a corner. She was a descript person, a woman in a state of total array. Her hair was kempt, her clothing shevelled, and she moved in a gainly way. I wanted desperately to meet her, but I knew I'd have to make bones about it, since I was travelling cognito. 
Beknownst to me, the hostess, whom I could see both hide and hair of, was very proper, so it would be skin off my nose if anything bad happened. And even though I had only swerving loyalty to her, my manners couldn't be peccable. Only toward and heard-of behavior would do. Fortunately, the embarrassment that my maculate appearance might cause was evitable. There were two ways about it, but the chances that someone as flappable as I would be ept enough to become persona grata or sung hero were slim. I was, after all, something to sneeze at, someone you could easily hold a candle to, someone who usually aroused bridled passion. So I decided not to rush it. But then, all at once, for some apparent reason, she looked in my direction and smiled in a way that I could make heads or tails of. So, after a terminable delay, I acted with mitigated gall and made my way through the ruly crowd with strong givings. Nevertheless, since this was all new hat to me and I had no time to prepare a promptu speech, I was petuous. She responded well, and I was mayed that she considered me a savory char- acter who was up to some good. She told me who she was. "What a perfect nomer," I said, advertently. The conversation became more and more choate, and we spoke at length to much avail. But I was defatigable, so I had to leave at a godly hour. I asked if she wanted to come with me. To my delight, she was committal. We left the party together and have been together ever since. I have given her my love, and she has requited it. 2 | -------------------------------------------------------------------------------- /stegasawus/parameter_tuning.yaml: -------------------------------------------------------------------------------- 1 | pso: 2 | # TODO: use and anchor or reference or whatever it is called for repeats 3 | lr_lbfgs: # C, tol 4 | lb: [1e-2, 1e-4] 5 | ub: [1e3, 1e-3] 6 | 7 | svc_linear: # C, tol 8 | lb: [1e-2, 1e-4] 9 | ub: [1e3, 1e-3] 10 | 11 | rf: # max_depth, min_samples_leaf, min_samples_split 12 | lb: [2, 1, 2] 13 | ub: [20, 15, 10] 14 | 15 | xgb: # max_depth, learning_rate, gamma 16 | lb: [3, 0.001, 0] 17 | ub: [10, 50, 50] 18 | 19 | 20 | grid_search: 21 | svc_rbf: 22 | svc_rbf__C: [1, 50, 100, 250, 600, 650, 750, 800, 900, 1000] 23 | svc_rbf__tol: [1e-3, 1e-4] 24 | svc_rbf__gamma: [0.01, 0.1, 0.25, 0.5, 0.75] 25 | 26 | svc_linear: 27 | svc_linear__C: [0.01, 0.1, 1, 10, 100, 1000] 28 | svc_linear__tol: [1e-3, 1e-4] 29 | 30 | nusvc: 31 | nusvc__nu: [0.01, 0.1, 0.25, 0.5, 0.75, 0.90] 32 | nusvc__kernel: [rbf, poly, sigmoid] 33 | nusvc__tol: [1e-3, 1e-4] 34 | nusvc__gamma: [0.01, 0.1, 0.25, 0.5, 0.75] 35 | 36 | knn: 37 | knn__n_neighbors: [3, 6, 9] 38 | knn__weights: [uniform, distance] 39 | knn__algorithm: [ball_tree, kd_tree, brute] 40 | knn__metric: [minkowski, euclidean, chebyshev, manhattan] 41 | 42 | bag_knn: 43 | bag_knn__base_estimator__n_neighbors: [3, 6, 9] 44 | bag_knn__max_samples: [0.1, 0.25, 0.5, 0.75, 0.90] 45 | bag_knn__max_features: [0.1, 0.25, 0.5, 0.75, 0.90] 46 | 47 | lr_lbfgs: 48 | lr_lbfgs__C: [0.01, 0.1, 1, 10, 100, 1000] 49 | lr_lbfgs__tol: [1e-3, 1e-4] 50 | 51 | rf: 52 | rf__criterion: [gini, entropy] 53 | rf__max_depth: [10, 12, 15, 20] 54 | rf__min_samples_leaf: [1, 5, 8, 12] 55 | rf__min_samples_split: [2, 3, 4, 5] 56 | 57 | et: 58 | et__criterion: [gini, entropy] 59 | et__max_depth: [6, 10, 18, 20, 25] 60 | et__min_samples_leaf: [1, 5, 8, 12] 61 | et__min_samples_split: [2, 3, 5, 8, 10] 62 | 63 | pa: 64 | pa__C: [0.01, 0.1, 1, 10, 100, 1000] 65 | pa__fit_intercept: [False, True] 
66 | pa__loss: [hinge, squared_hinge] 67 | 68 | xgb: 69 | xgb__learning_rate: [0.01, 0.1, 1, 10, 100] 70 | xgb__max_depth: [3, 6, 9, 12] 71 | xgb__gamma: [0.1, 1, 10] 72 | -------------------------------------------------------------------------------- /stegasawus/seq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from functools import partial 3 | 4 | 5 | def all_the_kings_men(n, **kwargs): 6 | return xrange(n) 7 | 8 | 9 | def _gen_skipy(y, n, **kwargs): 10 | i = 0 11 | x = 0 12 | while i <= n: 13 | x += y 14 | yield x 15 | i += 1 16 | 17 | 18 | def skipy(y): 19 | return partial(_gen_skipy, y=y) 20 | 21 | 22 | def _gen_rand_jump(seed, max_jump, n, **kwargs): 23 | i = 0 24 | x = 0 25 | np.random.seed(seed) 26 | while i <= n: 27 | x += np.random.randint(1, max_jump) 28 | yield x 29 | i += 1 30 | 31 | 32 | def rand_jump(seed, max_jump): 33 | return partial(_gen_rand_jump, seed=seed, max_jump=max_jump) 34 | 35 | 36 | def _gen_rand_jump_circle(seed, max_jump, n, verbose=True, **kwargs): 37 | np.random.seed(seed) 38 | i = 0 39 | x = 0 40 | modified = [] 41 | n_resets = 0 42 | while i <= n: 43 | if x >= n: 44 | # return to beginning of array once end reached 45 | x = -1 + np.random.randint(1, max_jump) 46 | n_resets += 1 47 | if verbose: 48 | print 'x reset; n_resets = %d' % n_resets 49 | else: 50 | x += np.random.randint(1, max_jump) 51 | if x not in modified and x < n: 52 | yield x 53 | modified.append(x) 54 | i += 1 55 | 56 | 57 | def rand_jump_circle(seed, max_jump): 58 | return partial(_gen_rand_jump_circle, seed=seed, max_jump=max_jump) 59 | 60 | 61 | def _gen_rand_darts(seed, n): 62 | np.random.seed(seed) 63 | i = 0 64 | remaining = range(n) 65 | while i <= n: 66 | x = np.random.randint(0, len(remaining)) 67 | yield remaining[x] 68 | remaining.pop(x) 69 | i += 1 70 | 71 | 72 | def rand_darts(seed): 73 | return partial(_gen_rand_darts, seed=seed) 74 | 75 | 76 | def _gen_shuffle_iter(seed, n): 77 | np.random.seed(seed) 78 | i = 0 79 | x = range(n) 80 | np.random.shuffle(x) 81 | while i < n: 82 | yield x[i] 83 | i += 1 84 | 85 | 86 | def shuffle_iter(seed): 87 | return partial(_gen_shuffle_iter, seed=seed) 88 | 89 | 90 | if __name__ == '__main__': 91 | pass 92 | -------------------------------------------------------------------------------- /stegasawus/mlp.py: -------------------------------------------------------------------------------- 1 | from stegasawus.model import get_equal_sets, cv_split_generator 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from sklearn import metrics 8 | from sklearn.preprocessing import LabelEncoder, StandardScaler 9 | from sklearn.pipeline import FeatureUnion, Pipeline 10 | from sklearn.model_selection import StratifiedKFold, ShuffleSplit 11 | from sklearn.decomposition import PCA 12 | 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, ActivityRegularization 15 | from keras.regularizers import WeightRegularizer 16 | from keras.wrappers.scikit_learn import KerasClassifier 17 | 18 | 19 | input_dim = 125 20 | 21 | 22 | def create_mlp(): 23 | model = Sequential() 24 | model.add(Dense(64, 'uniform', 'sigmoid', input_dim=input_dim)) 25 | # model.add(ActivityRegularization(l1=0, l2=0.001)) 26 | model.add(Dropout(0.2)) 27 | model.add(Dense(output_dim=64, activation='tanh')) 28 | model.add(Dropout(0.1)) 29 | model.add(Dense(1, activation='sigmoid')) 30 | 31 | model.compile( 32 | loss='binary_crossentropy', 33 | 
optimizer='adam', 34 | metrics=['accuracy'] 35 | ) 36 | return model 37 | 38 | 39 | def plot_training_history(hist): 40 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 8)) 41 | ax1.plot(hist['acc'], '-', color='k', alpha=0.6, lw=1, label='acc') 42 | ax1.plot(hist['val_acc'], '-', color='r', alpha=0.6, lw=1, label='val_acc') 43 | ax1.set_xlabel('n_iterations') 44 | ax1.set_ylabel('accuracy', color='k') 45 | ax1.legend(loc='lower right') 46 | 47 | ax2.plot(hist['loss'], '-', color='purple', alpha=0.6, lw=1, label='loss') 48 | ax2.plot(hist['val_loss'], '-', color='b', alpha=0.6, lw=1, label='val_loss') 49 | ax2.set_xlabel('n_iterations') 50 | ax2.set_ylabel('loss', color='k') 51 | ax2.legend(loc='upper right') 52 | 53 | plt.savefig('{}/output/keras_mlp_training.png'.format(path)) 54 | plt.show() 55 | 56 | 57 | if __name__ == '__main__': 58 | path = '/home/rokkuran/workspace/stegasawus' 59 | path_train = '{}/data/features/train_lenna_identity.csv'.format(path) 60 | 61 | train = pd.read_csv(path_train) 62 | train = get_equal_sets(train) 63 | 64 | target = 'label' 65 | le_target = LabelEncoder().fit(train[target]) 66 | y = le_target.transform(train[target]) 67 | 68 | train = train.drop([target, 'image', 'filename'], axis=1) 69 | 70 | combined_features = Pipeline([ 71 | ('pca', Pipeline([ 72 | ('scaler', StandardScaler()), 73 | ('pca', PCA(n_components=input_dim)), 74 | ])), 75 | ]) 76 | 77 | X = combined_features.fit_transform(train.as_matrix()) 78 | 79 | model = KerasClassifier(build_fn=create_mlp) 80 | 81 | splitter = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0) 82 | cv_splits = cv_split_generator(X=X, y=y, splitter=splitter) 83 | 84 | scores = [] 85 | hist = {} 86 | for i, X_train, X_val, y_train, y_val in cv_splits: 87 | X = combined_features.fit_transform(train.as_matrix()) 88 | results = model.fit( 89 | X_train, 90 | y_train, 91 | nb_epoch=250, 92 | batch_size=128, 93 | validation_split=0.1, 94 | verbose=1 95 | ) 96 | 97 | y_pred = model.predict(X_val) 98 | acc = metrics.accuracy_score(y_val, y_pred.flatten()) 99 | scores.append(acc) 100 | 101 | hist[i] = results.history 102 | 103 | scores = np.array(scores) 104 | 105 | plot_training_history(hist[0]) 106 | -------------------------------------------------------------------------------- /stegasawus/dataset.py: -------------------------------------------------------------------------------- 1 | from stegasawus import lsb, seq 2 | 3 | import os 4 | import base64 5 | import cStringIO 6 | from PIL import Image 7 | 8 | from os import path, listdir 9 | from skimage import io 10 | 11 | 12 | def get_secret_message(filepath): 13 | """ 14 | Read text file, return message. 15 | """ 16 | with open(filepath, 'rb') as f: 17 | message = f.read() 18 | return message 19 | 20 | 21 | def image_to_string(path_image): 22 | with open(path_image, 'rb') as f: 23 | return base64.b64encode(f.read()) 24 | 25 | 26 | def string_to_image(image_string): 27 | s = base64.b64decode(image_string) 28 | s = cStringIO.StringIO(s) 29 | return np.array(Image.open(s)) 30 | 31 | 32 | def crop_image(image, dim, centre=True): 33 | m, n = dim 34 | if centre: 35 | x, y, _ = image.shape 36 | x0 = int((x - m) / 2) - 1 37 | y0 = int((y - n) / 2) - 1 38 | xm = int(m + x0) 39 | yn = int(n + y0) 40 | return image[x0:xm, y0:yn] 41 | else: 42 | return image[0:m, 0:n] 43 | 44 | 45 | def crop_images(path_images, path_output, dimensions, centre=True): 46 | """ 47 | Batch crop images from top left hand corner to dimensions specified. 
Skips 48 | images where dimensions are incompatible. 49 | """ 50 | print 'cropping images...' 51 | for i, filename in enumerate(os.listdir(path_images)): 52 | try: 53 | image = io.imread('{}{}'.format(path_images, filename)) 54 | cropped = crop_image(image, dimensions, centre=centre) 55 | io.imsave( 56 | fname='{}{}'.format(path_output, filename), 57 | arr=cropped 58 | ) 59 | print '{}: {}'.format(i, filename) 60 | except IndexError: 61 | print '{}: {} failed - dimensions incompatible'.format(i, filename) 62 | 63 | print 'all images cropped and saved.' 64 | 65 | 66 | def batch_jpg_to_png(path_input, path_output): 67 | """ 68 | Convert jpg images to png. 69 | """ 70 | print 'coverting images...' 71 | for i, filename in enumerate(os.listdir(path_input)): 72 | input_jpg = '{}{}'.format(path_input, filename) 73 | 74 | fname = filename.replace('.jpg', '.png') 75 | output_png = '{}{}'.format(path_output, fname) 76 | 77 | I = io.imread(input_jpg) 78 | io.imsave(output_png, I) 79 | print '{}: {}'.format(i, filename) 80 | print 'image conversion complete.' 81 | 82 | 83 | class DatasetGenerator(object): 84 | """ 85 | Generates dataset from 86 | """ 87 | def __init__(self, path_images, path_output, seq_method): 88 | super(DatasetGenerator, self).__init__() 89 | self._path_images = path_images 90 | self._path_output = path_output 91 | self._seq_method = seq_method 92 | 93 | def _read_embed_save(self, filename, message): 94 | try: 95 | path_cover = '{}{}'.format(self._path_images, filename) 96 | path_stego = '{}{}'.format(self._path_output, filename) 97 | I = io.imread(path_cover) 98 | S = lsb.embed(I, message, self._seq_method) 99 | io.imsave(arr=S, fname=path_stego) 100 | except KeyError as e: 101 | print '%s | message size greater than image capacity.' % filename 102 | 103 | def batch_hide_message(self, message): 104 | # TODO: cleanup error handling 105 | for i, filename in enumerate(listdir(self._path_images), start=1): 106 | file_type = filename.split('.')[-1] 107 | if file_type == 'png': 108 | self._read_embed_save(filename, message) 109 | print '{}: {}'.format(i, filename) 110 | else: 111 | error = 'Image type not supported. Supported types: {png}' 112 | raise Exception(error) 113 | 114 | print 'image encoding complete.' 115 | 116 | 117 | def create_cropped_set(path_image, path_output, dims): 118 | I = io.imread(path_image) 119 | for m, n in dims.items(): 120 | pass 121 | 122 | 123 | if __name__ == '__main__': 124 | cdir = '/home/rokkuran/workspace/stegasawus/' 125 | fp = path.join(cdir, 'data/messages/Lenna_64x64.png') 126 | 127 | msg = image_to_string(fp) 128 | 129 | path_images = '{}images/png/cover_test/'.format(cdir) 130 | path_output = '{}images/png/lsb_test/'.format(cdir) 131 | 132 | seq_method = seq.rand_darts(seed=77) 133 | g = DatasetGenerator(path_images, path_output, seq_method) 134 | g.batch_hide_message(msg) 135 | -------------------------------------------------------------------------------- /stegasawus/eda.py: -------------------------------------------------------------------------------- 1 | from stegasawus import lsb, seq, dataset 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import abc 6 | import pywt 7 | import base64 8 | import cStringIO 9 | from PIL import Image 10 | 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | import skimage.io as io 14 | 15 | from os import path 16 | 17 | sns.set_style('whitegrid', {'axes.grid': False}) 18 | 19 | 20 | def rgb_to_grey(image): 21 | """ 22 | Converts RGB image array (m, n, 3) to greyscale (m, n). 
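    Uses the ITU-R BT.601 luma weights (0.2989, 0.5870, 0.1140).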
23 | """ 24 | return np.dot(image, [0.2989, 0.5870, 0.1140]) 25 | 26 | 27 | class ImagePlots(object): 28 | __metaclass__ = abc.ABCMeta 29 | 30 | @abc.abstractproperty 31 | def I(self): 32 | raise NotImplementedError() 33 | 34 | @abc.abstractproperty 35 | def S(self): 36 | raise NotImplementedError() 37 | 38 | def plot_images(self): 39 | """ 40 | Plot cover and steganographic RGB images side by side. 41 | """ 42 | io.imshow(np.concatenate((self.I, self.S), axis=1)) 43 | plt.title('Left: original cover image. Right: steganographic image.') 44 | plt.grid(False) 45 | plt.show() 46 | 47 | def plot_rgb_components(self): 48 | """ 49 | Plot RGB colour channels for both cover and steganographic images. 50 | """ 51 | f, axarr = plt.subplots(nrows=2, ncols=3) 52 | for i, image_type in enumerate(['Cover', 'Stego']): 53 | for j, colour in enumerate(['Red', 'Green', 'Blue']): 54 | axarr[i, j].imshow(self.I[:, :, j], cmap='{}s'.format(colour)) 55 | axarr[i, j].set_title('{} {}'.format(image_type, colour)) 56 | axarr[i, j].set_xticklabels([]) 57 | axarr[i, j].set_yticklabels([]) 58 | plt.show() 59 | 60 | def plot_rgb_difference(self): 61 | """ 62 | Plots difference between cover and steganographic images for each RGB 63 | colour channel. 64 | """ 65 | f, axarr = plt.subplots(1, 3, figsize=(12, 4)) 66 | for j, colour in enumerate(['Red', 'Green', 'Blue']): 67 | diff = self.I[:, :, j] - self.S[:, :, j] 68 | axarr[j].imshow(diff, cmap='{}s_r'.format(colour)) 69 | axarr[j].set_title('{}'.format(colour)) 70 | axarr[j].set_xticklabels([]) 71 | axarr[j].set_yticklabels([]) 72 | plt.show() 73 | 74 | def plot_difference(self): 75 | """ 76 | Plot difference between cover and steganographic image. 77 | """ 78 | io.imshow(self.I - self.S) 79 | plt.grid(False) 80 | plt.show() 81 | 82 | 83 | def plot_wavelet_decomposition(image, level=3): 84 | """ 85 | Plot of 2D wavelet decompositions for given number of levels. 86 | 87 | image needs to be either a colour channel or greyscale image: 88 | rgb: self.I[:, :, n], where n = {0, 1, 2} 89 | greyscale: use rgb_to_grey(self.I) 90 | 91 | """ 92 | coeffs = pywt.wavedec2(image, wavelet='haar', level=level) 93 | for i, (cH, cV, cD) in enumerate(coeffs[1:]): 94 | if i == 0: 95 | cAcH = np.concatenate((coeffs[0], cH), axis=1) 96 | cVcD = np.concatenate((cV, cD), axis=1) 97 | plot_image = np.concatenate((cAcH, cVcD), axis=0) 98 | else: 99 | plot_image = np.concatenate((plot_image, cH), axis=1) 100 | cVcD = np.concatenate((cV, cD), axis=1) 101 | plot_image = np.concatenate((plot_image, cVcD), axis=0) 102 | 103 | plt.grid(False) 104 | io.imshow(abs(plot_image), cmap='gray_r') 105 | plt.show() 106 | 107 | 108 | class JointImageAnalyser(ImagePlots): 109 | """""" 110 | def __init__(self, cover, stego): 111 | super(JointImageAnalyser, self).__init__() 112 | self._I = self._set_image(cover) 113 | self._S = self._set_image(stego) 114 | 115 | def _check_type(self, v, types): 116 | return any([isinstance(v, t) for t in types]) 117 | 118 | def _set_image(self, image): 119 | if self._check_type(image, [np.ndarray, list]): 120 | return image 121 | elif self._check_type(image, [str]): 122 | return io.imread(image) 123 | else: 124 | raise Exception('Input image type not array like or filepath.') 125 | 126 | @property 127 | def I(self): 128 | return self._I 129 | 130 | @property 131 | def S(self): 132 | return self._S 133 | 134 | @property 135 | def diff(self): 136 | return self.I - self.S 137 | 138 | def print_details(self): 139 | # TODO: add some more details... 
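        # total absolute pixel difference; zero means cover and stego are identical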
140 | a = np.sum(abs(self.diff)) 141 | print 'sum of absolute image difference = %s' % a 142 | 143 | def reveal(self, seq_method): 144 | return lsb.reveal(self.S, seq_method) 145 | 146 | def reveal_image(self, seq_method, show=False): 147 | s = base64.b64decode(self.reveal(seq_method)) 148 | s = cStringIO.StringIO(s) 149 | I = np.array(Image.open(s)) 150 | 151 | if show: 152 | io.imshow(I) 153 | plt.show() 154 | 155 | return I 156 | 157 | 158 | if __name__ == '__main__': 159 | cdir = '/home/rokkuran/workspace/stegasawus/' 160 | 161 | fp = path.join(cdir, 'data/messages/Lenna_64x64.png') 162 | msg = dataset.image_to_string(fp) 163 | 164 | path_images = '{}images/png/cover_test/'.format(cdir) 165 | path_output = '{}images/png/lsb_test/'.format(cdir) 166 | 167 | seq_method = seq.rand_darts(seed=77) 168 | g = dataset.DatasetGenerator(path_images, path_output, seq_method) 169 | g.batch_hide_message(msg) 170 | 171 | filename = 'cat.117.png' 172 | a = JointImageAnalyser(path_images + filename, path_output + filename) 173 | H = a.reveal_image(seq_method) 174 | io.imshow(H) 175 | plt.show() 176 | -------------------------------------------------------------------------------- /stegasawus/lsb.py: -------------------------------------------------------------------------------- 1 | from stegasawus import seq 2 | 3 | import numpy as np 4 | 5 | from os import path 6 | from skimage import io 7 | 8 | 9 | def bit_generator(s, verbose=False): 10 | """ 11 | Yields individual bits for characters in string. 12 | """ 13 | for x in s: 14 | a = ord(x) 15 | if verbose: 16 | print x, ord(x), bin(ord(x)) 17 | 18 | i = 0 19 | while i < 7: 20 | if verbose: 21 | print a, bin(a), a & 1 22 | yield a & 1 23 | a = a >> 1 # bit shifting embeds character backwards 24 | i += 1 25 | 26 | # signify end with 14 zeros (double ascii null) 27 | for x in xrange(14): 28 | yield 0 29 | 30 | 31 | def set_lsb(byte, bit): 32 | """ 33 | Replaces least significant bit of of byte with bit. 34 | if bit == 1: 35 | - 0110101 | 0000001 = 0110101 36 | - 0110100 | 0000001 = 0110101 37 | if bit == 0: 38 | - 0110100 & 1111110 = 0110100 39 | - 0110101 & 1111110 = 0110100 40 | """ 41 | if bit: 42 | return byte | bit 43 | else: 44 | return byte & 0b11111110 45 | 46 | 47 | def _old_embed(I, message, seq_method, verbose=False): 48 | """ 49 | Embeds message in LSB of image at locations specified by seq_method. 50 | """ 51 | dimensions = I.shape 52 | S = I.flatten().copy() 53 | bits = bit_generator(message) 54 | 55 | pixel_count = 0 56 | for i in seq_method(n=len(S)): 57 | bit = next(bits, None) 58 | if bit is not None: 59 | S[i] = set_lsb(S[i], bit) 60 | pixel_count += 1 61 | if verbose: 62 | print '%d pixel modified' % i 63 | else: 64 | break 65 | 66 | if verbose: 67 | print 'Pixels modified: %.2f' % (pixel_count / 3.) 68 | return S.reshape(dimensions) 69 | 70 | 71 | def reveal(S, seq_method): 72 | """ 73 | Reveals embedded LSB message at locations specified by seq_method. 
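    The end of the message is detected by the run of 14 zero bits
    (two 7-bit ASCII NULs) that bit_generator appends after the payload.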
74 | """ 75 | char = '' 76 | S = S.flatten() 77 | 78 | end = list(np.repeat(1, 14)) # message end signified by 14 zeros 79 | for i in seq_method(n=len(S)): # img_length=len(S)) 80 | x = S[i] 81 | bit = x & 1 82 | char += str(bit) 83 | end = end[1:] + [bit] 84 | if not sum(end): 85 | break 86 | 87 | text = '' 88 | while len(char) > 0: 89 | b = char[:7][::-1] # reversed binary 90 | text += chr(int(b, 2)) 91 | char = char[7:] 92 | 93 | # remove 2 ascii nulls at message end 94 | text = text.replace('\x00', '') 95 | return text 96 | 97 | 98 | def binary_size(s): 99 | # add 14 due to double null to signify message end 100 | # multiply by 7 for each binary element per character 101 | return (len(s) + 14) * 7 102 | 103 | 104 | def best_max_jump(I, msg, verbose=False): 105 | msg_binary_size = binary_size(msg) 106 | max_jump = int(len(I.flatten()) / float(msg_binary_size)) 107 | if verbose: 108 | args = len(I.flatten()), msg_binary_size, max_jump 109 | print 'img_size=%d; msg_binary_size=%d; best_max_jump=%d' % args 110 | return max_jump 111 | 112 | 113 | def _has_capacity(I, msg): 114 | return False if binary_size(msg) > len(I.flatten()) else True 115 | 116 | 117 | def _check_capacity(I, msg, verbose=False): 118 | args = (len(I.flatten()), binary_size(msg)) 119 | ps = 'img_size=%d; msg_binary_size=%d' % args 120 | if verbose: 121 | print ps 122 | 123 | if not _has_capacity(I, msg): 124 | error = 'Message length too long to embed: ' + ps 125 | raise Exception(error) 126 | 127 | 128 | def embed(I, message, seq_method, verbose=False): 129 | """ 130 | Embeds message in LSB of image at locations specified by seq_method. 131 | """ 132 | dimensions = I.shape 133 | S = I.flatten().copy() 134 | bits = bit_generator(message) 135 | 136 | if not _has_capacity(I, message): 137 | args = (len(I.flatten()), binary_size(message)) 138 | ps = 'img_size=%d; msg_binary_size=%d' % args 139 | error = 'message length too long to embed: ' + ps 140 | print error 141 | else: 142 | pixel_count = 0 143 | for i in seq_method(n=len(S)): 144 | bit = next(bits, None) 145 | if bit is not None: 146 | S[i] = set_lsb(S[i], bit) 147 | pixel_count += 1 148 | if verbose: 149 | print '%d pixel modified' % i 150 | else: 151 | break 152 | 153 | if verbose: 154 | print 'Pixels modified: %.2f' % (pixel_count / 3.) 
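    # reshape the flat pixel array back to the original image dimensions before returning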
155 | return S.reshape(dimensions) 156 | 157 | 158 | if __name__ == '__main__': 159 | cdir = path.dirname(__file__) 160 | I = io.imread(path.join(cdir, '../data/messages/Lenna.png')) 161 | 162 | def check_embed_reveal(I, msg, seq_method): 163 | S = embed(I, msg, seq_method) 164 | return reveal(S, seq_method) == msg 165 | 166 | def test_characters(I, seq_method): 167 | msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 168 | flag = check_embed_reveal(I, msg, seq_method) 169 | assert flag, 'test_characters: %s' % seq_method.__name__ 170 | 171 | # TODO: probably doesn't work for extended ascii codes 172 | test_characters(I, seq.all_the_kings_men) 173 | test_characters(I, seq.skipy(y=3)) 174 | test_characters(I, seq.skipy(y=5)) 175 | test_characters(I, seq.rand_jump(seed=77, max_jump=25)) 176 | test_characters(I, seq.rand_jump_circle(seed=77, max_jump=25)) 177 | test_characters(I, seq.rand_darts(seed=0)) 178 | 179 | msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 180 | max_jump = best_max_jump(I, msg) 181 | test_characters(I, seq.rand_jump(seed=77, max_jump=max_jump)) 182 | 183 | # msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 184 | # seq_method = seq.all_the_kings_men 185 | # S = _zembed(I, msg, seq_method) 186 | # hmsg = _zreveal(S, seq_method) 187 | # print hmsg 188 | -------------------------------------------------------------------------------- /stegasawus/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import yaml 4 | import re 5 | 6 | import matplotlib.pyplot as plt 7 | from scipy import stats 8 | from collections import Counter 9 | 10 | from sklearn.metrics import ( 11 | accuracy_score, log_loss, precision_score, recall_score, f1_score, 12 | roc_auc_score) 13 | from sklearn.preprocessing import ( 14 | LabelEncoder, StandardScaler, LabelBinarizer, PolynomialFeatures) 15 | from sklearn.model_selection import ( 16 | GridSearchCV, learning_curve, ShuffleSplit, StratifiedShuffleSplit) 17 | from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline 18 | from sklearn.decomposition import PCA, KernelPCA 19 | from sklearn.feature_selection import SelectKBest, RFE 20 | from sklearn.naive_bayes import GaussianNB 21 | from sklearn.linear_model import ( 22 | LogisticRegression, PassiveAggressiveClassifier) 23 | from sklearn.svm import SVC, LinearSVC 24 | from sklearn.ensemble import RandomForestClassifier 25 | from sklearn.naive_bayes import GaussianNB 26 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 27 | 28 | from keras.models import Sequential 29 | from keras.layers import Dense, Dropout, Activation, ActivityRegularization 30 | from keras.regularizers import WeightRegularizer 31 | from keras.wrappers.scikit_learn import KerasClassifier 32 | 33 | from xgboost import XGBClassifier 34 | 35 | 36 | input_dim = 125 37 | 38 | 39 | class ModelComparer(object): 40 | """""" 41 | def __init__(self, X, y, pipeline, splitter, classifiers, metrics=None): 42 | super(ModelComparer, self).__init__() 43 | self.X = X 44 | self.y = y 45 | self.pipeline = pipeline 46 | self.splitter = splitter 47 | self.classifiers = classifiers 48 | 49 | if metrics is None: 50 | self.metrics = [accuracy_score, log_loss, precision_score, 51 | recall_score, f1_score, roc_auc_score] 52 | else: 53 | self.metrics = metrics 54 | 55 | self._scores = [] 56 | self.models = {} 57 | 58 | def _get_pipeline(self, name): 59 | return 
make_pipeline(self.pipeline, classifiers[name]) 60 | 61 | @property 62 | def cv_split_generator(self): 63 | """ 64 | Train and validation set split generator. 65 | """ 66 | g = enumerate(self.splitter.split(self.X, self.y)) 67 | for i, (train_idx, val_idx) in g: 68 | X_train, X_val = self.X[train_idx], self.X[val_idx] 69 | y_train, y_val = self.y[train_idx], self.y[val_idx] 70 | yield i, X_train, X_val, y_train, y_val 71 | 72 | def _metric_name(self, f): 73 | return f.__name__.replace('_score', '') 74 | 75 | def _get_metric_scores(self, y_val, y_pred): 76 | s = str() 77 | scores = [] 78 | for fn in self.metrics: 79 | score = fn(y_val, y_pred) 80 | scores.append(score) 81 | s += '%s = %.4f; ' % (self._metric_name(fn), score) 82 | return scores, s 83 | 84 | def model_comparison(self, cv_mean=True): 85 | self._scores = [] 86 | for i, X_train, X_val, y_train, y_val in self.cv_split_generator: 87 | for name, clf in self.classifiers.items(): 88 | pipeline = self._get_pipeline(name) 89 | model = pipeline.fit(X_train, y_train) 90 | self.models[name] = model 91 | 92 | y_pred = model.predict(X_val) 93 | metrics, ps = self._get_metric_scores(y_val, y_pred) 94 | ps += ' | %s_%d' % (name, i) 95 | print ps 96 | self._scores.append([name, i] + metrics) 97 | 98 | def scores(self, mean=True): 99 | cols = ['classifier', 'split'] 100 | cols += [self._metric_name(fn) for fn in self.metrics] 101 | 102 | df = pd.DataFrame(self._scores, columns=cols) 103 | df = df.sort_values( 104 | by=['accuracy', 'log_loss'], 105 | ascending=[False, True] 106 | ).reset_index(drop=True) 107 | 108 | df_mean = df.ix[:, df.columns != 'split'] \ 109 | .groupby(['classifier']) \ 110 | .mean() \ 111 | .sort_values( 112 | by=['accuracy', 'log_loss'], 113 | ascending=[False, True]) 114 | 115 | return df_mean if mean else df 116 | 117 | 118 | def create_mlp(): 119 | model = Sequential() 120 | model.add(Dense(64, 'uniform', 'sigmoid', input_dim=input_dim)) 121 | # model.add(ActivityRegularization(l1=0, l2=0.001)) 122 | model.add(Dropout(0.2)) 123 | model.add(Dense(output_dim=64, activation='tanh')) 124 | model.add(Dropout(0.1)) 125 | model.add(Dense(1, activation='sigmoid')) 126 | 127 | model.compile( 128 | loss='binary_crossentropy', 129 | optimizer='adam', 130 | metrics=['accuracy'] 131 | ) 132 | return model 133 | 134 | 135 | classifiers = { 136 | 'keras_mlp': KerasClassifier( 137 | build_fn=create_mlp, 138 | nb_epoch=150, 139 | batch_size=64 140 | ), 141 | 'svc_linear': LinearSVC(), 142 | 'lr_lbfgs': LogisticRegression( 143 | C=2.02739770e+04, # particle swarm optimised 144 | tol=6.65926091e-04, 145 | solver='lbfgs' 146 | ), 147 | 'lr_lbfgs_default': LogisticRegression(solver='lbfgs'), 148 | 'pa': PassiveAggressiveClassifier( 149 | C=0.01, 150 | fit_intercept=True, 151 | loss='hinge' 152 | ), 153 | 'pa_default': PassiveAggressiveClassifier(), 154 | 'gnb': GaussianNB(), 155 | 'lda': LinearDiscriminantAnalysis(), 156 | 'rf': RandomForestClassifier( 157 | n_estimators=200, 158 | criterion='gini', 159 | max_depth=4, 160 | min_samples_leaf=3, 161 | min_samples_split=3 162 | ), 163 | 'xgb': XGBClassifier( 164 | n_estimators=200, 165 | max_depth=6, 166 | learning_rate=0.1, 167 | gamma=1, 168 | objective='binary:logistic', 169 | nthread=-1 170 | ), 171 | } 172 | 173 | pipeline = Pipeline([ 174 | ('scaler', StandardScaler()), 175 | ('pca', PCA(n_components=input_dim)), 176 | ]) 177 | 178 | 179 | if __name__ == '__main__': 180 | path = '/home/rokkuran/workspace/stegasawus' 181 | path_train = '{}/data/features/train_lenna.csv'.format(path) 182 
| 183 | train = pd.read_csv(path_train) 184 | 185 | target = 'label' 186 | le_target = LabelEncoder().fit(train[target]) 187 | y = le_target.transform(train[target]) 188 | 189 | train = train.drop([target, 'image'], axis=1) 190 | X = train.as_matrix() 191 | 192 | splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0) 193 | mc = ModelComparer(X, y, pipeline, splitter, classifiers) 194 | mc.model_comparison() 195 | print '\n', mc.scores() 196 | -------------------------------------------------------------------------------- /stegasawus/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import pywt 5 | 6 | from scipy import stats 7 | import matplotlib.pyplot as plt 8 | from skimage import io 9 | 10 | 11 | def statistical_metrics(x): 12 | """ 13 | Calculates statistical metrics on input array (mean, std, skew, kurtosis). 14 | """ 15 | 16 | metrics = { 17 | 'mean': np.mean, 18 | 'stdev': np.std, 19 | 'skew': stats.skew, 20 | 'kurtosis': stats.kurtosis 21 | } 22 | return {k: fn(x.flatten()) for k, fn in metrics.items()} 23 | 24 | 25 | def prefix_dict_keys(d, prefix): 26 | """ 27 | Adds prefix to dict keys. 28 | """ 29 | return {'{}_{}'.format(prefix, k): v for k, v in d.items()} 30 | 31 | 32 | def autocorrelation_features(I, lags=[(1, 0), (0, 1), (1, 1)]): 33 | """ 34 | Calculate the autocorrelation statistical features from a 2D image array 35 | (greyscale image or an individual colour channel) for the specified pixel 36 | vertical and horizontal coordinate shift lags: 37 | e.g. [(1, 0), (0, 1), (1, 1), (1, 2), (2, 1), (2, 2)] 38 | """ 39 | m, n = I.shape 40 | 41 | features = {} 42 | for x, y in lags: 43 | ac = I[x:, y:] * I[:m-x, :n-y] 44 | aca = np.sum(ac / (I[x:, y:].std() * I[:m-x, :n-y].std())) 45 | 46 | features['aca_{}{}'.format(x, y)] = aca 47 | 48 | f_stat = statistical_metrics(ac) 49 | f_stat = prefix_dict_keys(f_stat, 'ac_{}{}'.format(x, y)) 50 | features.update(f_stat) 51 | 52 | return features 53 | 54 | 55 | def rgb_autocorrelation_features(I, lags=((1, 0), (0, 1), (1, 1))): 56 | """ 57 | Calculate the autocorrelation statistical features of an RGB image 58 | array (m, n, 3) for the specified lags. 59 | """ 60 | features = {} 61 | m, n, _ = I.shape 62 | 63 | for c, colour in enumerate('rgb'): 64 | f_ac = autocorrelation_features(I[:, :, c], lags) 65 | f_ac = prefix_dict_keys(f_ac, colour) 66 | features.update(f_ac) 67 | 68 | return features 69 | 70 | 71 | def concatenate_feature_sets(filepath_cover, filepath_stego, filepath_output): 72 | """ 73 | Concatenates two feature csv files. 74 | 75 | Parameters 76 | ---------- 77 | filepath_cover : string 78 | Filepath to cover image feature set. 79 | filepath_stego : string 80 | Filepath to steganographic image feature set. 81 | filepath_output : string 82 | Output filepath. 83 | 84 | Returns 85 | ------- 86 | Concatenated dataset. 
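    The concatenated frame is also written to filepath_output as csv.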
87 | 88 | """ 89 | train_cover = pd.read_csv(filepath_cover) 90 | train_stego = pd.read_csv(filepath_stego) 91 | train = pd.concat([train_cover, train_stego]) 92 | train.to_csv(filepath_output, index=False) 93 | return train 94 | 95 | 96 | def concat_multiple_feature_sets(filepaths, filepath_output): 97 | train = pd.DataFrame() 98 | for filepath in filepaths: 99 | df = pd.read_csv(filepath) 100 | df['filename'] = filepath.split('/')[-1] 101 | train = pd.concat([train, df]) 102 | train.to_csv(filepath_output, index=False) 103 | return train 104 | 105 | 106 | def apply_tolerance(x, tol): 107 | """ 108 | Applies absolute value filter for given tolerance. 109 | 110 | Parameters 111 | ---------- 112 | x : numpy.ndarray 113 | Input data. 114 | tol : int, float 115 | Tolerance. 116 | 117 | Returns 118 | ------- 119 | Filtered array where |x| >= tol. 120 | If no values are above the tolerance np.array([0]) is returned. 121 | 122 | """ 123 | x_tol = abs(x) >= tol 124 | if x_tol.any(): 125 | return x[x_tol] 126 | else: 127 | return np.zeros(1) 128 | 129 | 130 | def wavdec_features(coeffs, tol=1): 131 | """ 132 | Calculated the statistical features on the components of a mulitlevel 2D 133 | discrete wavelet decomposition. 134 | 135 | Parameters 136 | ---------- 137 | coeffs : list 138 | n level coefficients from pywt.wavedec2 139 | [cAn, (cHn, cVn, cDn), ... (cH1, cV1, cD1)] 140 | tol : int, float, default : 1 141 | Tolerance to apply to individual coefficient arrays. 142 | 143 | Returns 144 | ------- 145 | features : dict 146 | Feature vector of statistical components in dictionary form. 147 | 148 | """ 149 | n_layers = len(coeffs) - 1 150 | 151 | features = {} 152 | 153 | cA = coeffs[0] 154 | prefix = 'dwt_{}_cA'.format(n_layers) 155 | cA = apply_tolerance(cA, tol) # reduce sensitivity to noise 156 | f_stat = statistical_metrics(cA) 157 | f_stat = prefix_dict_keys(f_stat, prefix) 158 | features.update(f_stat) 159 | 160 | for i, (cH, cV, cD) in enumerate(coeffs[1:]): 161 | layer = n_layers - i 162 | for c, cX in zip(('cH', 'cV', 'cD'), (cH, cV, cD)): 163 | prefix = 'dwt_{}_{}'.format(layer, c) 164 | cX = apply_tolerance(cX, tol) 165 | f_stat = statistical_metrics(cX) 166 | f_stat = prefix_dict_keys(f_stat, prefix) 167 | features.update(f_stat) 168 | 169 | return features 170 | 171 | 172 | def rgb_wavelet_features(I, tol=1): 173 | """ 174 | For each RGB channel, calculates the statistical features the components of 175 | a mulitlevel 2D discrete wavelet decomposition. 176 | 177 | Parameters 178 | ---------- 179 | I : numpy.ndarray 180 | RGB image array. 181 | tol : int, float, default : 1 182 | Tolerance to apply to individual coefficient arrays. 183 | 184 | Returns 185 | ------- 186 | features : dict 187 | Feature vector of statistical components in dictionary form. 188 | 189 | """ 190 | features = {} 191 | m, n, _ = I.shape 192 | 193 | for c, colour in enumerate('rgb'): 194 | coeffs = pywt.wavedec2(I[:, :, c], wavelet='haar', level=3) 195 | f_wavelet = wavdec_features(coeffs) 196 | f_wavelet = prefix_dict_keys(f_wavelet, colour) 197 | features.update(f_wavelet) 198 | 199 | return features 200 | 201 | 202 | def create_feature_dataset(path_images, class_label, path_output, 203 | f_types=['autocorrelation', 'wavelet'], 204 | image_limit=None): 205 | 206 | """ 207 | Create feature vectors from images in directory and save as csv output. 208 | 209 | Parameters 210 | ---------- 211 | path_images : directory path string 212 | Directory with images for processing. 
213 | class_label : string 214 | Class label used in label column of output. 215 | path_output : directory path string 216 | Output directory for csv file. 217 | f_types : array_like, default : ['autocorrelation', 'wavelet'] 218 | Specify the feature types to include as list of strings: 219 | {'autocorrelation', 'wavelet'} 220 | Default: ['autocorrelation', 'wavelet'] 221 | image_limit : int, default : None 222 | Number of images in directory to process. 223 | 224 | Returns 225 | ------- 226 | csv output file as specified in path_output. 227 | 228 | """ 229 | 230 | print 'creating image feature dataset...' 231 | 232 | dataset = list() 233 | for i, filename in enumerate(os.listdir(path_images)): 234 | fname = '{}{}'.format(path_images, filename) 235 | image = io.imread(fname) 236 | 237 | features = {} 238 | if 'autocorrelation' in f_types: 239 | lags = ((1, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 2)) 240 | features.update(rgb_autocorrelation_features(image, lags)) 241 | 242 | if 'wavelet' in f_types: 243 | features.update(rgb_wavelet_features(image)) 244 | 245 | if i == 0: 246 | feature_names = features.keys() 247 | 248 | row = [filename, class_label] 249 | for feature in feature_names: 250 | row.append(features[feature]) 251 | dataset.append(row) 252 | 253 | if i % 250 == 0: 254 | print '{} images processed'.format(i) 255 | 256 | if image_limit: 257 | if i > image_limit: 258 | break 259 | 260 | df = pd.DataFrame(dataset, columns=['image', 'label'] + feature_names) 261 | df.to_csv(path_output, index=False) 262 | 263 | print 'image feature dataset created.' 264 | 265 | 266 | # ****************************************************************************** 267 | if __name__ == '__main__': 268 | pass 269 | -------------------------------------------------------------------------------- /stegasawus/tuning.py: -------------------------------------------------------------------------------- 1 | from stegasawus.model import ( 2 | cv_split_generator, 3 | get_pipeline, 4 | get_equal_sets) 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import yaml 9 | import re 10 | import collections 11 | import functools 12 | 13 | import matplotlib.pyplot as plt 14 | 15 | from pyswarm import pso 16 | 17 | from sklearn import metrics 18 | from sklearn.preprocessing import ( 19 | LabelEncoder, 20 | StandardScaler, 21 | PolynomialFeatures) 22 | from sklearn.model_selection import ( 23 | GridSearchCV, 24 | learning_curve, 25 | validation_curve, 26 | ShuffleSplit) 27 | from sklearn.pipeline import Pipeline, FeatureUnion 28 | from sklearn.decomposition import PCA, KernelPCA 29 | from sklearn.feature_selection import SelectKBest, RFE 30 | 31 | from sklearn.naive_bayes import GaussianNB 32 | from sklearn.neighbors import KNeighborsClassifier 33 | from sklearn.linear_model import ( 34 | LogisticRegression, 35 | PassiveAggressiveClassifier) 36 | from sklearn.svm import SVC, LinearSVC, NuSVC 37 | from sklearn.tree import DecisionTreeClassifier 38 | from sklearn.ensemble import ( 39 | RandomForestClassifier, 40 | ExtraTreesClassifier, 41 | AdaBoostClassifier, 42 | GradientBoostingClassifier, 43 | VotingClassifier) 44 | from sklearn.naive_bayes import GaussianNB 45 | from sklearn.discriminant_analysis import ( 46 | LinearDiscriminantAnalysis, 47 | QuadraticDiscriminantAnalysis) 48 | 49 | from xgboost import XGBClassifier 50 | 51 | 52 | # ****************************************************************************** 53 | def gs_parameter_tuning(clf, X_train, y_train, parameters, scoring, cv=5): 54 | gs_clf = 
GridSearchCV(clf, parameters, scoring=scoring, cv=cv, n_jobs=6) 55 | gs_clf = gs_clf.fit(X_train, y_train) 56 | 57 | best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1]) 58 | for param_name in sorted(parameters.keys()): 59 | print("%s: %r" % (param_name, best_parameters[param_name])) 60 | 61 | 62 | # TODO: improve, extend, refactor 63 | def pso_parameter_tuning(clf, X, y, lb, ub, swarmsize, maxiter, n_splits=3, 64 | integer=False, *args): 65 | """ 66 | Particle swarm optimisation based parameter tuning. 67 | 68 | Parameters 69 | ---------- 70 | clf : sklearn classifier 71 | Model to tune parameters. 72 | X : numpy.ndarray 73 | Training features. 74 | y : numpy.ndarray 75 | Training target values. 76 | lb : array_like 77 | Lower bound values for parameters to tune. 78 | ub : array_like 79 | Upper bound values for parameters to tune. 80 | swarmsize : int 81 | Number of particles in the swarm. 82 | maxiter : int 83 | Maximum number of iterations for swarm to search. 84 | n_splits : int, default = 3 85 | Number of cross validation splits. 86 | 87 | Returns 88 | ------- 89 | g : array 90 | The swarm's best known parameters settings. 91 | f : scalar 92 | The value of the minimisation function at g. 93 | 94 | """ 95 | def clf_check(clf, classifiers): 96 | return any([isinstance(clf, c) for c in classifiers]) 97 | 98 | def minimise(x, *args): 99 | """""" 100 | if clf_check(clf, [LinearSVC, LogisticRegression]): 101 | C, tol = x 102 | clf.set_params(C=C, tol=tol) 103 | 104 | elif clf_check(clf, [RandomForestClassifier]): 105 | # random forest: all values need to be integer 106 | x = [int(np.round(v, 0)) for v in x] 107 | max_depth, min_samples_leaf, min_samples_split = x 108 | clf.set_params( 109 | max_depth=max_depth, 110 | min_samples_leaf=min_samples_leaf, 111 | min_samples_split=min_samples_split) 112 | 113 | elif clf_check(clf, [XGBClassifier]): 114 | # xgb: max_depth should be integer 115 | max_depth, learning_rate, gamma = x 116 | max_depth = int(np.round(max_depth, 0)) 117 | clf.set_params( 118 | max_depth=max_depth, 119 | learning_rate=learning_rate, 120 | gamma=gamma) 121 | 122 | else: 123 | raise Exception('Classifier not supported.') 124 | 125 | pipeline = Pipeline([ 126 | ('pca', Pipeline([ 127 | ('scaler', StandardScaler()), 128 | ('pca', PCA(n_components=125)), 129 | ])), 130 | ('clf', clf) 131 | ]) 132 | 133 | ss = ShuffleSplit(n_splits=n_splits, test_size=0.2) 134 | cv_splits = cv_split_generator(X=X, y=y, splitter=ss) 135 | 136 | ll = [] 137 | for i, X_train, X_val, y_train, y_val in cv_splits: 138 | model = pipeline.fit(X_train, y_train) 139 | y_pred = model.predict(X_val) 140 | ll.append(metrics.log_loss(y_val, y_pred)) 141 | 142 | print x, np.mean(ll) 143 | return np.mean(ll) 144 | 145 | g, f = pso(minimise, lb, ub, swarmsize=swarmsize, maxiter=maxiter, 146 | debug=True, args=('clf', clf)) 147 | return g, f 148 | 149 | 150 | classifiers = { 151 | 'knn': KNeighborsClassifier( 152 | n_neighbors=6, 153 | algorithm='ball_tree', 154 | weights='distance', 155 | metric='chebyshev' 156 | ), 157 | 'knn_default': KNeighborsClassifier(), 158 | 'svc_rbf': SVC( 159 | kernel='rbf', 160 | C=50, 161 | gamma=0.01, 162 | tol=1e-3 163 | ), 164 | 'svc_rbf_default': SVC(kernel='rbf'), 165 | 'svc_linear': LinearSVC( 166 | C=1e3, 167 | loss='squared_hinge', 168 | penalty='l2', 169 | tol=1e-3 170 | ), 171 | 'svc_linear_default': LinearSVC(), 172 | 'nusvc': NuSVC(), 173 | 'rf': RandomForestClassifier( 174 | criterion='gini', 175 | n_estimators=200, 176 | max_depth=4, 177 | 
min_samples_leaf=3, 178 | min_samples_split=3 179 | ), 180 | 'rf_default': RandomForestClassifier(), 181 | 'adaboost': AdaBoostClassifier(), 182 | 'et': ExtraTreesClassifier( 183 | criterion='entropy', 184 | max_depth=25, 185 | min_samples_leaf=5, 186 | min_samples_split=5 187 | ), 188 | 'et_default': ExtraTreesClassifier(), 189 | 'gbc': GradientBoostingClassifier(), 190 | 'lr_lbfgs': LogisticRegression( 191 | C=2.02739770e+04, # particle swarm optimised 192 | tol=6.65926091e-04, 193 | solver='lbfgs' 194 | ), 195 | 'lr_lbfgs_default': LogisticRegression(solver='lbfgs'), 196 | 'pa': PassiveAggressiveClassifier( 197 | C=0.01, 198 | fit_intercept=True, 199 | loss='hinge' 200 | ), 201 | 'pa_default': PassiveAggressiveClassifier(), 202 | 'gnb': GaussianNB(), 203 | 'lda': LinearDiscriminantAnalysis(), 204 | 'qda': QuadraticDiscriminantAnalysis(), 205 | 'xgb_defualt': XGBClassifier(), 206 | 'xgb': XGBClassifier( 207 | max_depth=6, 208 | learning_rate=0.01, 209 | n_estimators=100, 210 | silent=True, 211 | objective='binary:logistic', 212 | nthread=-1, 213 | gamma=0, 214 | min_child_weight=1, 215 | max_delta_step=0, 216 | subsample=1, 217 | colsample_bytree=1, 218 | colsample_bylevel=1, 219 | reg_alpha=0, 220 | reg_lambda=1, 221 | scale_pos_weight=1, 222 | base_score=0.5, 223 | seed=0, 224 | missing=None 225 | ) 226 | } 227 | 228 | 229 | if __name__ == '__main__': 230 | path = '/home/rokkuran/workspace/stegasawus' 231 | path_train = '{}/data/features/train_lenna_identity.csv'.format(path) 232 | 233 | train = pd.read_csv(path_train) 234 | train = get_equal_sets(train) 235 | 236 | filenames = train.filename.copy() 237 | filenames = filenames.apply( 238 | lambda s: re.search(r'lenna\d+', s).group() 239 | if re.search(r'lenna\d+', s) is not None else 'cover' 240 | ) 241 | 242 | # target and index preprocessing 243 | target = 'label' 244 | le_target = LabelEncoder().fit(train[target]) 245 | y_train_binary = le_target.transform(train[target]) 246 | 247 | train = train.drop([target, 'image', 'filename'], axis=1) 248 | 249 | # ************************************************************************** 250 | parameters = yaml.safe_load( 251 | open('{}/stegasawus/parameter_tuning.yaml'.format(path), 'rb') 252 | ) 253 | 254 | # ************************************************************************** 255 | # Grid search parameter tuning. 256 | def run_gs_parameter_tuning(): 257 | name = 'knn' 258 | pipeline = get_pipeline(name) 259 | 260 | gs_parameter_tuning( 261 | clf=pipeline, 262 | X_train=train.as_matrix(), 263 | y_train=y_train_binary, 264 | cv=3, 265 | parameters=parameters['grid_search'][name], 266 | scoring='accuracy' 267 | ) 268 | 269 | # run_gs_parameter_tuning() 270 | 271 | # ************************************************************************** 272 | # Particle swarm optimisation parameter tuning. 
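    # pso_parameter_tuning() searches within the lb/ub bounds read from
    # parameter_tuning.yaml; each candidate parameter vector is scored by the
    # mean log loss over ShuffleSplit cross-validation folds, and pso()
    # returns the best vector g together with its loss f.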
273 | def run_pso_parameter_tuning(clf_name): 274 | 275 | # TODO: fix issue with string representations of '1e-3' in yaml read 276 | lb = [float(v) for v in parameters['pso'][clf_name]['lb']] 277 | ub = [float(v) for v in parameters['pso'][clf_name]['ub']] 278 | 279 | g, f = pso_parameter_tuning( 280 | clf=classifiers['lr_lbfgs'], X=train.as_matrix(), y=y_train_binary, 281 | lb=lb, ub=ub, swarmsize=100, maxiter=20, n_splits=3) 282 | print g, f 283 | 284 | run_pso_parameter_tuning('lr_lbfgs') 285 | # run_pso_parameter_tuning('rf') 286 | # run_pso_parameter_tuning('xgb') 287 | 288 | # ************************************************************************** 289 | def plot_validation_curve(): 290 | name = 'svc_linear' 291 | pipeline = get_pipeline(name) 292 | 293 | param_range = np.logspace(-2, 3, 6) 294 | # param_range = np.logspace(-5, -1, 5) 295 | train_scores, val_scores = validation_curve( 296 | estimator=pipeline, 297 | X=train.as_matrix(), 298 | y=y_train_binary, 299 | param_name='%s__C' % name, 300 | # param_name='lr_lbfgs__tol', 301 | param_range=param_range, 302 | cv=5, 303 | scoring='accuracy', 304 | n_jobs=6 305 | ) 306 | 307 | plt.semilogx( 308 | param_range, train_scores.mean(axis=1), 309 | ls='-', lw=1, color='b', alpha=1, label='train' 310 | ) 311 | plt.fill_between( 312 | param_range, 313 | train_scores.mean(axis=1) - train_scores.std(axis=1), 314 | train_scores.mean(axis=1) + train_scores.std(axis=1), 315 | color='b', alpha=0.1, lw=0.5 316 | ) 317 | plt.semilogx( 318 | param_range, val_scores.mean(axis=1), 319 | ls='-', lw=1, color='r', alpha=1, label='validation' 320 | ) 321 | plt.fill_between( 322 | param_range, 323 | val_scores.mean(axis=1) - val_scores.std(axis=1), 324 | val_scores.mean(axis=1) + val_scores.std(axis=1), 325 | color='r', alpha=0.1, lw=0.5 326 | ) 327 | 328 | plt.title('%s: validation curve' % name) 329 | plt.xlabel('C') 330 | plt.ylabel('Score') 331 | plt.ylim(0.0, 1.1) 332 | plt.legend(loc="best") 333 | plt.show() 334 | 335 | # ************************************************************************** 336 | def plot_roc_curve(name): 337 | pipeline = get_pipeline(name) 338 | 339 | ss = ShuffleSplit(n_splits=5, test_size=0.2) 340 | 341 | X = train.as_matrix() 342 | y = y_train_binary 343 | 344 | fpr, tpr = [], [] 345 | for i, (train_idx, val_idx) in enumerate(ss.split(X, y)): 346 | X_train, X_val = X[train_idx], X[val_idx] 347 | y_train, y_val = y[train_idx], y[val_idx] 348 | 349 | model = pipeline.fit(X_train, y_train) 350 | y_pred = model.predict(X_val) 351 | 352 | fpr_i, tpr_i, _ = metrics.roc_curve(y_val, y_pred) 353 | fpr.append(fpr_i) 354 | tpr.append(tpr_i) 355 | 356 | fpr, tpr = np.array(fpr), np.array(tpr) 357 | 358 | plt.figure() 359 | plt.plot( 360 | fpr.mean(axis=0), tpr.mean(axis=0), 361 | color='b', alpha=0.6, lw=1, label='ROC curve' 362 | ) 363 | plt.plot([0, 1], [0, 1], color='k', alpha=0.6, lw=1, linestyle='--') 364 | plt.xlim([0.0, 1.0]) 365 | plt.ylim([0.0, 1.05]) 366 | plt.xlabel('False Positive Rate') 367 | plt.ylabel('True Positive Rate') 368 | plt.title('%s: ROC Curve' % name) 369 | plt.legend(loc="lower right") 370 | plt.show() 371 | 372 | # plot_roc_curve('svc_linear') 373 | --------------------------------------------------------------------------------
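A minimal round-trip sketch of the LSB machinery above, mirroring the self-test in the __main__ block of stegasawus/lsb.py. It is illustrative only: the cover image path and output filename are hypothetical, and it assumes the stegasawus package is importable and the message is plain 7-bit ASCII (the only encoding bit_generator handles).

    from stegasawus import lsb, seq
    from skimage import io

    # hypothetical cover image path - substitute any RGB PNG
    cover = io.imread('images/png/cover/cat.2.png')

    message = 'attack at dawn'              # 7-bit ASCII payload
    sequence = seq.rand_darts(seed=77)      # pseudo-random embedding locations

    stego = lsb.embed(cover, message, sequence)    # sets LSBs along the sequence
    recovered = lsb.reveal(stego, sequence)        # replays the sequence to decode
    assert recovered == message

    io.imsave('stego_example.png', stego)          # hypothetical output filename

Because reveal simply replays the location sequence, the same seq generator and seed used for embedding are required to recover the message.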