├── data
│   └── messages
│       ├── Lenna.png
│       ├── Lenna_16x16.png
│       ├── Lenna_32x32.png
│       ├── Lenna_64x64.png
│       ├── Lenna_8x8.png
│       ├── Lenna_128x128.png
│       ├── Lenna_256x256.png
│       └── himmw.txt
├── output
│   └── plots
│       ├── clf_embedding_acc.png
│       └── clf_embedding_ll.png
├── stegasawus
│   ├── __init__.py
│   ├── parameter_tuning.yaml
│   ├── seq.py
│   ├── mlp.py
│   ├── dataset.py
│   ├── eda.py
│   ├── lsb.py
│   ├── models.py
│   ├── features.py
│   └── tuning.py
├── setup.py
├── .gitignore
├── examples
│   ├── dataset_creation.py
│   ├── image_plots.py
│   └── feature_set_creation.py
└── README.md
/data/messages/Lenna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna.png
--------------------------------------------------------------------------------
/data/messages/Lenna_16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_16x16.png
--------------------------------------------------------------------------------
/data/messages/Lenna_32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_32x32.png
--------------------------------------------------------------------------------
/data/messages/Lenna_64x64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_64x64.png
--------------------------------------------------------------------------------
/data/messages/Lenna_8x8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_8x8.png
--------------------------------------------------------------------------------
/data/messages/Lenna_128x128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_128x128.png
--------------------------------------------------------------------------------
/data/messages/Lenna_256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/data/messages/Lenna_256x256.png
--------------------------------------------------------------------------------
/output/plots/clf_embedding_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/output/plots/clf_embedding_acc.png
--------------------------------------------------------------------------------
/output/plots/clf_embedding_ll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rokkuran/stegasawus/HEAD/output/plots/clf_embedding_ll.png
--------------------------------------------------------------------------------
/stegasawus/__init__.py:
--------------------------------------------------------------------------------
1 | import dataset
2 | import eda
3 | import features
4 | import model
5 | import lsb
6 | import seq
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | from distutils.core import setup
4 | 
5 | setup(
6 |     name='stegasawus',
7 |     version='0.4.2',
8 |     description='Machine learning detection of steganographic images',
9 |     author='Lachlan Taylor',
10 |     packages=['stegasawus']  # 'stegasawus.lsb']
11 |     # data_files=[('./images/', ['Lenna.png', 'image.png'])]
12 | )
13 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/archive/
2 | data/features/
3 | output/distplots
4 | output/histograms
5 | output/log1p_test
6 | output/k*
7 | images/
8 | jpeg-v4/
9 | # archive/
10 | 
11 | # Byte-compiled / optimized / DLL files
12 | __pycache__/
13 | *.py[cod]
14 | *$py.class
15 | 
16 | # C extensions
17 | *.so
18 | 
19 | # Distribution / packaging
20 | .Python
21 | env/
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | 
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 | 
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 | 
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *,cover
56 | 
57 | # Translations
58 | *.mo
59 | *.pot
60 | 
61 | # Django stuff:
62 | *.log
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
--------------------------------------------------------------------------------
/examples/dataset_creation.py:
--------------------------------------------------------------------------------
1 | from stegasawus.dataset import (
2 |     get_secret_message,
3 |     batch_jpg_to_png,
4 |     crop_images,
5 |     batch_hide_message,
6 |     create_benchmark_image_message)
7 | 
8 | 
9 | path = '/home/rokkuran/workspace/stegasawus/'
10 | 
11 | # Crop all images in a directory and output to another directory
12 | crop_images(
13 |     path_images='{}images/jpg/cats_and_dogs/original/'.format(path),
14 |     path_output='{}images/jpg/cats_and_dogs/cropped_256/'.format(path),
15 |     dimensions=(256, 256),
16 |     centre=True
17 | )
18 | 
19 | 
20 | # Batch convert jpg to png
21 | batch_jpg_to_png(
22 |     path_input='{}images/jpg/cats_and_dogs/cropped_256/'.format(path),
23 |     path_output='{}images/png/cover/'.format(path)
24 | )
25 | 
26 | 
27 | # Create steganographic image set embedding the secret message
28 | generator = 'identity'
29 | dim = 64
30 | 
31 | path_images = '{}images/png/cover/'.format(path)
32 | path_output = '{}images/png/lenna{}_{}/'.format(path, dim, generator)
33 | path_msg = '{}data/messages/'.format(path)
34 | 
35 | secret_message = get_secret_message(
36 |     '{}Lenna_{}x{}.txt'.format(path_msg, dim, dim)
37 | )
38 | 
39 | batch_hide_message(
40 |     secret_message=secret_message,
41 |     path_images=path_images,
42 |     path_output=path_output,
43 |     file_type='png',
44 |     generator=generator
45 | )
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Stegasawus
2 | Detecting whether steganography is present in an image using machine learning.
3 | - Least significant bit (LSB) embedding functions using various embedding location sequences.
4 | - Generates a dataset from a set of cover images.
5 | - Creates feature vectors from statistical moments of autocorrelation and discrete wavelet decomposition measures. 6 | - Exploratory plots of images and training datasets. 7 | - Preliminary model comparisons. 8 | 9 | ### Preliminary Results 10 | Model 5-fold cross validation results on 2000 (256x256) images of cats and dogs with various message sizes embedded. Image messages are converted to strings and hidden in a cover image using the LSB algorithm. 11 | 12 | Image type counts below. Cover is the original image and 64x64 is the image size that is hidden in a cover image. 13 | 14 | `Counter({'16x16': 350, '32x32': 333, '64x64': 317, 'cover': 1000})` 15 | 16 | ![Classifier Accuracy](https://github.com/rokkuran/stegasawus/blob/master/output/plots/clf_embedding_acc.png) 17 | 18 | ![Classifier Log Loss](https://github.com/rokkuran/stegasawus/blob/master/output/plots/clf_embedding_ll.png) 19 | 20 | 21 | ### Future Work 22 | - Model benchmarks for different LSB embedding generator types. 23 | - Look at model performance for different image types (only cats and dogs at the moment). 24 | - Model persistence for well performing trained models. 25 | - Improve LSB embedding location sequences. 26 | - Extend/improve features. 27 | - Look at jpg images and embedding in discrete cosine coefficients. 28 | -------------------------------------------------------------------------------- /examples/image_plots.py: -------------------------------------------------------------------------------- 1 | from stegasawus import eda 2 | 3 | from stegano.lsbset import generators 4 | 5 | 6 | path = '/home/rokkuran/workspace/stegasawus/' 7 | path_cover = '{}images/png/cover/'.format(path) 8 | path_stego = '{}images/png/lenna64_identity/'.format(path) 9 | # path_stego = '{}images/png/lenna64_eratosthenes/'.format(path) 10 | path_output = '{}output'.format(path) 11 | 12 | fname = 'cat.2.png' 13 | z = eda.JointImageAnalyser(path_cover + fname, path_stego + fname) 14 | 15 | # plot cover and stego images side by side. 16 | z.plot_images() 17 | 18 | # plot difference between cover and stego images. 19 | z.plot_difference() 20 | 21 | # plot colour channels of cover and stego images. 
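# (a 2x3 grid: top row cover, bottom row stego; one column per RGB channel)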
22 | z.plot_rgb_components() 23 | 24 | # Reveal and show hidden image 25 | z.reveal_image(generators.identity(), show=True) 26 | 27 | # Plot wavelet decomposition for a colour channel 28 | eda.plot_wavelet_decomposition(z.I[:, :, 0]) 29 | 30 | # generate set of histogram/kde plots 31 | eda.generate_feature_distplots( 32 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 33 | path_output=path_output, 34 | normalise=False 35 | ) 36 | 37 | # generate set of histograms 38 | eda.generate_feature_histograms( 39 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 40 | path_output=path_output, 41 | bins=50, 42 | normalise=False 43 | ) 44 | 45 | # generate kernel density estimation plots 46 | eda.generate_feature_kde( 47 | filepath_train='{}data/features/train_lenna_identity.csv'.format(path), 48 | path_output=path_output, 49 | normalise=False 50 | ) 51 | -------------------------------------------------------------------------------- /examples/feature_set_creation.py: -------------------------------------------------------------------------------- 1 | from stegasawus.features import ( 2 | create_feature_dataset, 3 | concatenate_feature_sets, 4 | concat_multiple_feature_sets) 5 | 6 | path = '/home/rokkuran/workspace/stegasawus/' 7 | 8 | dim = 64 9 | generator = 'identity' 10 | # generator = 'eratosthenes' 11 | 12 | path_cover = '{}images/png/cover/'.format(path) 13 | path_stego = '{}images/png/lenna{}_{}/'.format(path, dim, generator) 14 | 15 | # use both feature types to create training set 16 | f_types = ['autocorrelation', 'wavelet'] 17 | 18 | # create cover image training dataset 19 | create_feature_dataset( 20 | path_images=path_cover, 21 | class_label='cover', 22 | path_output='{}data/train_cover.csv'.format(path), 23 | f_types=f_types 24 | ) 25 | 26 | # create steganographic image training dataset 27 | create_feature_dataset( 28 | path_images=path_stego, 29 | class_label='stego', 30 | path_output='%sdata/train_stego_lenna%s_%s.csv' % (path, dim, generator), 31 | f_types=f_types 32 | ) 33 | 34 | # merge cover and stego images to create complete training set 35 | concatenate_feature_sets( 36 | '{}data/train_cover.csv'.format(path), 37 | '{}data/train_stego_lenna{}_{}.csv'.format(path, dim, generator), 38 | '{}data/train_lenna_{}.csv'.format(path, generator) 39 | ) 40 | 41 | # combine multiple training training sets together 42 | concat_multiple_feature_sets( 43 | [ 44 | '{}data/train_cover.csv'.format(path), 45 | '{}data/train_stego_lenna16_identity.csv'.format(path), 46 | '{}data/train_stego_lenna32_identity.csv'.format(path), 47 | '{}data/train_stego_lenna64_identity.csv'.format(path), 48 | ], 49 | '{}data/train_lenna_identity.csv'.format(path) 50 | ) 51 | -------------------------------------------------------------------------------- /data/messages/himmw.txt: -------------------------------------------------------------------------------- 1 | SHOUTS AND MURMURS about man who describes meeting his wife at a party. In his description, he drops many prefixes. It had been a rough day, so when I walked into the party I was very chalant, despite my efforts to appear gruntled and consolate. I was furling my wieldy umbrella for the coat check when I saw her standing alone in a corner. She was a descript person, a woman in a state of total array. Her hair was kempt, her clothing shevelled, and she moved in a gainly way. I wanted desperately to meet her, but I knew I'd have to make bones about it, since I was travelling cognito. 
Beknownst to me, the hostess, whom I could see both hide and hair of, was very proper, so it would be skin off my nose if anything bad happened. And even though I had only swerving loyalty to her, my manners couldn't be peccable. Only toward and heard-of behavior would do. Fortunately, the embarrassment that my maculate appearance might cause was evitable. There were two ways about it, but the chances that someone as flappable as I would be ept enough to become persona grata or sung hero were slim. I was, after all, something to sneeze at, someone you could easily hold a candle to, someone who usually aroused bridled passion. So I decided not to rush it. But then, all at once, for some apparent reason, she looked in my direction and smiled in a way that I could make heads or tails of. So, after a terminable delay, I acted with mitigated gall and made my way through the ruly crowd with strong givings. Nevertheless, since this was all new hat to me and I had no time to prepare a promptu speech, I was petuous. She responded well, and I was mayed that she considered me a savory char- acter who was up to some good. She told me who she was. "What a perfect nomer," I said, advertently. The conversation became more and more choate, and we spoke at length to much avail. But I was defatigable, so I had to leave at a godly hour. I asked if she wanted to come with me. To my delight, she was committal. We left the party together and have been together ever since. I have given her my love, and she has requited it. 2 | -------------------------------------------------------------------------------- /stegasawus/parameter_tuning.yaml: -------------------------------------------------------------------------------- 1 | pso: 2 | # TODO: use and anchor or reference or whatever it is called for repeats 3 | lr_lbfgs: # C, tol 4 | lb: [1e-2, 1e-4] 5 | ub: [1e3, 1e-3] 6 | 7 | svc_linear: # C, tol 8 | lb: [1e-2, 1e-4] 9 | ub: [1e3, 1e-3] 10 | 11 | rf: # max_depth, min_samples_leaf, min_samples_split 12 | lb: [2, 1, 2] 13 | ub: [20, 15, 10] 14 | 15 | xgb: # max_depth, learning_rate, gamma 16 | lb: [3, 0.001, 0] 17 | ub: [10, 50, 50] 18 | 19 | 20 | grid_search: 21 | svc_rbf: 22 | svc_rbf__C: [1, 50, 100, 250, 600, 650, 750, 800, 900, 1000] 23 | svc_rbf__tol: [1e-3, 1e-4] 24 | svc_rbf__gamma: [0.01, 0.1, 0.25, 0.5, 0.75] 25 | 26 | svc_linear: 27 | svc_linear__C: [0.01, 0.1, 1, 10, 100, 1000] 28 | svc_linear__tol: [1e-3, 1e-4] 29 | 30 | nusvc: 31 | nusvc__nu: [0.01, 0.1, 0.25, 0.5, 0.75, 0.90] 32 | nusvc__kernel: [rbf, poly, sigmoid] 33 | nusvc__tol: [1e-3, 1e-4] 34 | nusvc__gamma: [0.01, 0.1, 0.25, 0.5, 0.75] 35 | 36 | knn: 37 | knn__n_neighbors: [3, 6, 9] 38 | knn__weights: [uniform, distance] 39 | knn__algorithm: [ball_tree, kd_tree, brute] 40 | knn__metric: [minkowski, euclidean, chebyshev, manhattan] 41 | 42 | bag_knn: 43 | bag_knn__base_estimator__n_neighbors: [3, 6, 9] 44 | bag_knn__max_samples: [0.1, 0.25, 0.5, 0.75, 0.90] 45 | bag_knn__max_features: [0.1, 0.25, 0.5, 0.75, 0.90] 46 | 47 | lr_lbfgs: 48 | lr_lbfgs__C: [0.01, 0.1, 1, 10, 100, 1000] 49 | lr_lbfgs__tol: [1e-3, 1e-4] 50 | 51 | rf: 52 | rf__criterion: [gini, entropy] 53 | rf__max_depth: [10, 12, 15, 20] 54 | rf__min_samples_leaf: [1, 5, 8, 12] 55 | rf__min_samples_split: [2, 3, 4, 5] 56 | 57 | et: 58 | et__criterion: [gini, entropy] 59 | et__max_depth: [6, 10, 18, 20, 25] 60 | et__min_samples_leaf: [1, 5, 8, 12] 61 | et__min_samples_split: [2, 3, 5, 8, 10] 62 | 63 | pa: 64 | pa__C: [0.01, 0.1, 1, 10, 100, 1000] 65 | pa__fit_intercept: [False, True] 
66 | pa__loss: [hinge, squared_hinge] 67 | 68 | xgb: 69 | xgb__learning_rate: [0.01, 0.1, 1, 10, 100] 70 | xgb__max_depth: [3, 6, 9, 12] 71 | xgb__gamma: [0.1, 1, 10] 72 | -------------------------------------------------------------------------------- /stegasawus/seq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from functools import partial 3 | 4 | 5 | def all_the_kings_men(n, **kwargs): 6 | return xrange(n) 7 | 8 | 9 | def _gen_skipy(y, n, **kwargs): 10 | i = 0 11 | x = 0 12 | while i <= n: 13 | x += y 14 | yield x 15 | i += 1 16 | 17 | 18 | def skipy(y): 19 | return partial(_gen_skipy, y=y) 20 | 21 | 22 | def _gen_rand_jump(seed, max_jump, n, **kwargs): 23 | i = 0 24 | x = 0 25 | np.random.seed(seed) 26 | while i <= n: 27 | x += np.random.randint(1, max_jump) 28 | yield x 29 | i += 1 30 | 31 | 32 | def rand_jump(seed, max_jump): 33 | return partial(_gen_rand_jump, seed=seed, max_jump=max_jump) 34 | 35 | 36 | def _gen_rand_jump_circle(seed, max_jump, n, verbose=True, **kwargs): 37 | np.random.seed(seed) 38 | i = 0 39 | x = 0 40 | modified = [] 41 | n_resets = 0 42 | while i <= n: 43 | if x >= n: 44 | # return to beginning of array once end reached 45 | x = -1 + np.random.randint(1, max_jump) 46 | n_resets += 1 47 | if verbose: 48 | print 'x reset; n_resets = %d' % n_resets 49 | else: 50 | x += np.random.randint(1, max_jump) 51 | if x not in modified and x < n: 52 | yield x 53 | modified.append(x) 54 | i += 1 55 | 56 | 57 | def rand_jump_circle(seed, max_jump): 58 | return partial(_gen_rand_jump_circle, seed=seed, max_jump=max_jump) 59 | 60 | 61 | def _gen_rand_darts(seed, n): 62 | np.random.seed(seed) 63 | i = 0 64 | remaining = range(n) 65 | while i <= n: 66 | x = np.random.randint(0, len(remaining)) 67 | yield remaining[x] 68 | remaining.pop(x) 69 | i += 1 70 | 71 | 72 | def rand_darts(seed): 73 | return partial(_gen_rand_darts, seed=seed) 74 | 75 | 76 | def _gen_shuffle_iter(seed, n): 77 | np.random.seed(seed) 78 | i = 0 79 | x = range(n) 80 | np.random.shuffle(x) 81 | while i < n: 82 | yield x[i] 83 | i += 1 84 | 85 | 86 | def shuffle_iter(seed): 87 | return partial(_gen_shuffle_iter, seed=seed) 88 | 89 | 90 | if __name__ == '__main__': 91 | pass 92 | -------------------------------------------------------------------------------- /stegasawus/mlp.py: -------------------------------------------------------------------------------- 1 | from stegasawus.model import get_equal_sets, cv_split_generator 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | 7 | from sklearn import metrics 8 | from sklearn.preprocessing import LabelEncoder, StandardScaler 9 | from sklearn.pipeline import FeatureUnion, Pipeline 10 | from sklearn.model_selection import StratifiedKFold, ShuffleSplit 11 | from sklearn.decomposition import PCA 12 | 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, ActivityRegularization 15 | from keras.regularizers import WeightRegularizer 16 | from keras.wrappers.scikit_learn import KerasClassifier 17 | 18 | 19 | input_dim = 125 20 | 21 | 22 | def create_mlp(): 23 | model = Sequential() 24 | model.add(Dense(64, 'uniform', 'sigmoid', input_dim=input_dim)) 25 | # model.add(ActivityRegularization(l1=0, l2=0.001)) 26 | model.add(Dropout(0.2)) 27 | model.add(Dense(output_dim=64, activation='tanh')) 28 | model.add(Dropout(0.1)) 29 | model.add(Dense(1, activation='sigmoid')) 30 | 31 | model.compile( 32 | loss='binary_crossentropy', 33 | 
optimizer='adam', 34 | metrics=['accuracy'] 35 | ) 36 | return model 37 | 38 | 39 | def plot_training_history(hist): 40 | fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(7, 8)) 41 | ax1.plot(hist['acc'], '-', color='k', alpha=0.6, lw=1, label='acc') 42 | ax1.plot(hist['val_acc'], '-', color='r', alpha=0.6, lw=1, label='val_acc') 43 | ax1.set_xlabel('n_iterations') 44 | ax1.set_ylabel('accuracy', color='k') 45 | ax1.legend(loc='lower right') 46 | 47 | ax2.plot(hist['loss'], '-', color='purple', alpha=0.6, lw=1, label='loss') 48 | ax2.plot(hist['val_loss'], '-', color='b', alpha=0.6, lw=1, label='val_loss') 49 | ax2.set_xlabel('n_iterations') 50 | ax2.set_ylabel('loss', color='k') 51 | ax2.legend(loc='upper right') 52 | 53 | plt.savefig('{}/output/keras_mlp_training.png'.format(path)) 54 | plt.show() 55 | 56 | 57 | if __name__ == '__main__': 58 | path = '/home/rokkuran/workspace/stegasawus' 59 | path_train = '{}/data/features/train_lenna_identity.csv'.format(path) 60 | 61 | train = pd.read_csv(path_train) 62 | train = get_equal_sets(train) 63 | 64 | target = 'label' 65 | le_target = LabelEncoder().fit(train[target]) 66 | y = le_target.transform(train[target]) 67 | 68 | train = train.drop([target, 'image', 'filename'], axis=1) 69 | 70 | combined_features = Pipeline([ 71 | ('pca', Pipeline([ 72 | ('scaler', StandardScaler()), 73 | ('pca', PCA(n_components=input_dim)), 74 | ])), 75 | ]) 76 | 77 | X = combined_features.fit_transform(train.as_matrix()) 78 | 79 | model = KerasClassifier(build_fn=create_mlp) 80 | 81 | splitter = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0) 82 | cv_splits = cv_split_generator(X=X, y=y, splitter=splitter) 83 | 84 | scores = [] 85 | hist = {} 86 | for i, X_train, X_val, y_train, y_val in cv_splits: 87 | X = combined_features.fit_transform(train.as_matrix()) 88 | results = model.fit( 89 | X_train, 90 | y_train, 91 | nb_epoch=250, 92 | batch_size=128, 93 | validation_split=0.1, 94 | verbose=1 95 | ) 96 | 97 | y_pred = model.predict(X_val) 98 | acc = metrics.accuracy_score(y_val, y_pred.flatten()) 99 | scores.append(acc) 100 | 101 | hist[i] = results.history 102 | 103 | scores = np.array(scores) 104 | 105 | plot_training_history(hist[0]) 106 | -------------------------------------------------------------------------------- /stegasawus/dataset.py: -------------------------------------------------------------------------------- 1 | from stegasawus import lsb, seq 2 | 3 | import os 4 | import base64 5 | import cStringIO 6 | from PIL import Image 7 | 8 | from os import path, listdir 9 | from skimage import io 10 | 11 | 12 | def get_secret_message(filepath): 13 | """ 14 | Read text file, return message. 15 | """ 16 | with open(filepath, 'rb') as f: 17 | message = f.read() 18 | return message 19 | 20 | 21 | def image_to_string(path_image): 22 | with open(path_image, 'rb') as f: 23 | return base64.b64encode(f.read()) 24 | 25 | 26 | def string_to_image(image_string): 27 | s = base64.b64decode(image_string) 28 | s = cStringIO.StringIO(s) 29 | return np.array(Image.open(s)) 30 | 31 | 32 | def crop_image(image, dim, centre=True): 33 | m, n = dim 34 | if centre: 35 | x, y, _ = image.shape 36 | x0 = int((x - m) / 2) - 1 37 | y0 = int((y - n) / 2) - 1 38 | xm = int(m + x0) 39 | yn = int(n + y0) 40 | return image[x0:xm, y0:yn] 41 | else: 42 | return image[0:m, 0:n] 43 | 44 | 45 | def crop_images(path_images, path_output, dimensions, centre=True): 46 | """ 47 | Batch crop images from top left hand corner to dimensions specified. 
Skips 48 | images where dimensions are incompatible. 49 | """ 50 | print 'cropping images...' 51 | for i, filename in enumerate(os.listdir(path_images)): 52 | try: 53 | image = io.imread('{}{}'.format(path_images, filename)) 54 | cropped = crop_image(image, dimensions, centre=centre) 55 | io.imsave( 56 | fname='{}{}'.format(path_output, filename), 57 | arr=cropped 58 | ) 59 | print '{}: {}'.format(i, filename) 60 | except IndexError: 61 | print '{}: {} failed - dimensions incompatible'.format(i, filename) 62 | 63 | print 'all images cropped and saved.' 64 | 65 | 66 | def batch_jpg_to_png(path_input, path_output): 67 | """ 68 | Convert jpg images to png. 69 | """ 70 | print 'coverting images...' 71 | for i, filename in enumerate(os.listdir(path_input)): 72 | input_jpg = '{}{}'.format(path_input, filename) 73 | 74 | fname = filename.replace('.jpg', '.png') 75 | output_png = '{}{}'.format(path_output, fname) 76 | 77 | I = io.imread(input_jpg) 78 | io.imsave(output_png, I) 79 | print '{}: {}'.format(i, filename) 80 | print 'image conversion complete.' 81 | 82 | 83 | class DatasetGenerator(object): 84 | """ 85 | Generates dataset from 86 | """ 87 | def __init__(self, path_images, path_output, seq_method): 88 | super(DatasetGenerator, self).__init__() 89 | self._path_images = path_images 90 | self._path_output = path_output 91 | self._seq_method = seq_method 92 | 93 | def _read_embed_save(self, filename, message): 94 | try: 95 | path_cover = '{}{}'.format(self._path_images, filename) 96 | path_stego = '{}{}'.format(self._path_output, filename) 97 | I = io.imread(path_cover) 98 | S = lsb.embed(I, message, self._seq_method) 99 | io.imsave(arr=S, fname=path_stego) 100 | except KeyError as e: 101 | print '%s | message size greater than image capacity.' % filename 102 | 103 | def batch_hide_message(self, message): 104 | # TODO: cleanup error handling 105 | for i, filename in enumerate(listdir(self._path_images), start=1): 106 | file_type = filename.split('.')[-1] 107 | if file_type == 'png': 108 | self._read_embed_save(filename, message) 109 | print '{}: {}'.format(i, filename) 110 | else: 111 | error = 'Image type not supported. Supported types: {png}' 112 | raise Exception(error) 113 | 114 | print 'image encoding complete.' 115 | 116 | 117 | def create_cropped_set(path_image, path_output, dims): 118 | I = io.imread(path_image) 119 | for m, n in dims.items(): 120 | pass 121 | 122 | 123 | if __name__ == '__main__': 124 | cdir = '/home/rokkuran/workspace/stegasawus/' 125 | fp = path.join(cdir, 'data/messages/Lenna_64x64.png') 126 | 127 | msg = image_to_string(fp) 128 | 129 | path_images = '{}images/png/cover_test/'.format(cdir) 130 | path_output = '{}images/png/lsb_test/'.format(cdir) 131 | 132 | seq_method = seq.rand_darts(seed=77) 133 | g = DatasetGenerator(path_images, path_output, seq_method) 134 | g.batch_hide_message(msg) 135 | -------------------------------------------------------------------------------- /stegasawus/eda.py: -------------------------------------------------------------------------------- 1 | from stegasawus import lsb, seq, dataset 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import abc 6 | import pywt 7 | import base64 8 | import cStringIO 9 | from PIL import Image 10 | 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | import skimage.io as io 14 | 15 | from os import path 16 | 17 | sns.set_style('whitegrid', {'axes.grid': False}) 18 | 19 | 20 | def rgb_to_grey(image): 21 | """ 22 | Converts RGB image array (m, n, 3) to greyscale (m, n). 
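    Uses the ITU-R BT.601 luma weights (0.2989, 0.5870, 0.1140).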
23 | """ 24 | return np.dot(image, [0.2989, 0.5870, 0.1140]) 25 | 26 | 27 | class ImagePlots(object): 28 | __metaclass__ = abc.ABCMeta 29 | 30 | @abc.abstractproperty 31 | def I(self): 32 | raise NotImplementedError() 33 | 34 | @abc.abstractproperty 35 | def S(self): 36 | raise NotImplementedError() 37 | 38 | def plot_images(self): 39 | """ 40 | Plot cover and steganographic RGB images side by side. 41 | """ 42 | io.imshow(np.concatenate((self.I, self.S), axis=1)) 43 | plt.title('Left: original cover image. Right: steganographic image.') 44 | plt.grid(False) 45 | plt.show() 46 | 47 | def plot_rgb_components(self): 48 | """ 49 | Plot RGB colour channels for both cover and steganographic images. 50 | """ 51 | f, axarr = plt.subplots(nrows=2, ncols=3) 52 | for i, image_type in enumerate(['Cover', 'Stego']): 53 | for j, colour in enumerate(['Red', 'Green', 'Blue']): 54 | axarr[i, j].imshow(self.I[:, :, j], cmap='{}s'.format(colour)) 55 | axarr[i, j].set_title('{} {}'.format(image_type, colour)) 56 | axarr[i, j].set_xticklabels([]) 57 | axarr[i, j].set_yticklabels([]) 58 | plt.show() 59 | 60 | def plot_rgb_difference(self): 61 | """ 62 | Plots difference between cover and steganographic images for each RGB 63 | colour channel. 64 | """ 65 | f, axarr = plt.subplots(1, 3, figsize=(12, 4)) 66 | for j, colour in enumerate(['Red', 'Green', 'Blue']): 67 | diff = self.I[:, :, j] - self.S[:, :, j] 68 | axarr[j].imshow(diff, cmap='{}s_r'.format(colour)) 69 | axarr[j].set_title('{}'.format(colour)) 70 | axarr[j].set_xticklabels([]) 71 | axarr[j].set_yticklabels([]) 72 | plt.show() 73 | 74 | def plot_difference(self): 75 | """ 76 | Plot difference between cover and steganographic image. 77 | """ 78 | io.imshow(self.I - self.S) 79 | plt.grid(False) 80 | plt.show() 81 | 82 | 83 | def plot_wavelet_decomposition(image, level=3): 84 | """ 85 | Plot of 2D wavelet decompositions for given number of levels. 86 | 87 | image needs to be either a colour channel or greyscale image: 88 | rgb: self.I[:, :, n], where n = {0, 1, 2} 89 | greyscale: use rgb_to_grey(self.I) 90 | 91 | """ 92 | coeffs = pywt.wavedec2(image, wavelet='haar', level=level) 93 | for i, (cH, cV, cD) in enumerate(coeffs[1:]): 94 | if i == 0: 95 | cAcH = np.concatenate((coeffs[0], cH), axis=1) 96 | cVcD = np.concatenate((cV, cD), axis=1) 97 | plot_image = np.concatenate((cAcH, cVcD), axis=0) 98 | else: 99 | plot_image = np.concatenate((plot_image, cH), axis=1) 100 | cVcD = np.concatenate((cV, cD), axis=1) 101 | plot_image = np.concatenate((plot_image, cVcD), axis=0) 102 | 103 | plt.grid(False) 104 | io.imshow(abs(plot_image), cmap='gray_r') 105 | plt.show() 106 | 107 | 108 | class JointImageAnalyser(ImagePlots): 109 | """""" 110 | def __init__(self, cover, stego): 111 | super(JointImageAnalyser, self).__init__() 112 | self._I = self._set_image(cover) 113 | self._S = self._set_image(stego) 114 | 115 | def _check_type(self, v, types): 116 | return any([isinstance(v, t) for t in types]) 117 | 118 | def _set_image(self, image): 119 | if self._check_type(image, [np.ndarray, list]): 120 | return image 121 | elif self._check_type(image, [str]): 122 | return io.imread(image) 123 | else: 124 | raise Exception('Input image type not array like or filepath.') 125 | 126 | @property 127 | def I(self): 128 | return self._I 129 | 130 | @property 131 | def S(self): 132 | return self._S 133 | 134 | @property 135 | def diff(self): 136 | return self.I - self.S 137 | 138 | def print_details(self): 139 | # TODO: add some more details... 
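        # total absolute pixel difference; zero means cover and stego are identical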
140 | a = np.sum(abs(self.diff)) 141 | print 'sum of absolute image difference = %s' % a 142 | 143 | def reveal(self, seq_method): 144 | return lsb.reveal(self.S, seq_method) 145 | 146 | def reveal_image(self, seq_method, show=False): 147 | s = base64.b64decode(self.reveal(seq_method)) 148 | s = cStringIO.StringIO(s) 149 | I = np.array(Image.open(s)) 150 | 151 | if show: 152 | io.imshow(I) 153 | plt.show() 154 | 155 | return I 156 | 157 | 158 | if __name__ == '__main__': 159 | cdir = '/home/rokkuran/workspace/stegasawus/' 160 | 161 | fp = path.join(cdir, 'data/messages/Lenna_64x64.png') 162 | msg = dataset.image_to_string(fp) 163 | 164 | path_images = '{}images/png/cover_test/'.format(cdir) 165 | path_output = '{}images/png/lsb_test/'.format(cdir) 166 | 167 | seq_method = seq.rand_darts(seed=77) 168 | g = dataset.DatasetGenerator(path_images, path_output, seq_method) 169 | g.batch_hide_message(msg) 170 | 171 | filename = 'cat.117.png' 172 | a = JointImageAnalyser(path_images + filename, path_output + filename) 173 | H = a.reveal_image(seq_method) 174 | io.imshow(H) 175 | plt.show() 176 | -------------------------------------------------------------------------------- /stegasawus/lsb.py: -------------------------------------------------------------------------------- 1 | from stegasawus import seq 2 | 3 | import numpy as np 4 | 5 | from os import path 6 | from skimage import io 7 | 8 | 9 | def bit_generator(s, verbose=False): 10 | """ 11 | Yields individual bits for characters in string. 12 | """ 13 | for x in s: 14 | a = ord(x) 15 | if verbose: 16 | print x, ord(x), bin(ord(x)) 17 | 18 | i = 0 19 | while i < 7: 20 | if verbose: 21 | print a, bin(a), a & 1 22 | yield a & 1 23 | a = a >> 1 # bit shifting embeds character backwards 24 | i += 1 25 | 26 | # signify end with 14 zeros (double ascii null) 27 | for x in xrange(14): 28 | yield 0 29 | 30 | 31 | def set_lsb(byte, bit): 32 | """ 33 | Replaces least significant bit of of byte with bit. 34 | if bit == 1: 35 | - 0110101 | 0000001 = 0110101 36 | - 0110100 | 0000001 = 0110101 37 | if bit == 0: 38 | - 0110100 & 1111110 = 0110100 39 | - 0110101 & 1111110 = 0110100 40 | """ 41 | if bit: 42 | return byte | bit 43 | else: 44 | return byte & 0b11111110 45 | 46 | 47 | def _old_embed(I, message, seq_method, verbose=False): 48 | """ 49 | Embeds message in LSB of image at locations specified by seq_method. 50 | """ 51 | dimensions = I.shape 52 | S = I.flatten().copy() 53 | bits = bit_generator(message) 54 | 55 | pixel_count = 0 56 | for i in seq_method(n=len(S)): 57 | bit = next(bits, None) 58 | if bit is not None: 59 | S[i] = set_lsb(S[i], bit) 60 | pixel_count += 1 61 | if verbose: 62 | print '%d pixel modified' % i 63 | else: 64 | break 65 | 66 | if verbose: 67 | print 'Pixels modified: %.2f' % (pixel_count / 3.) 68 | return S.reshape(dimensions) 69 | 70 | 71 | def reveal(S, seq_method): 72 | """ 73 | Reveals embedded LSB message at locations specified by seq_method. 
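    The end of the message is detected by the run of 14 zero bits
    (two 7-bit ASCII NULs) that bit_generator appends after the payload.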
74 | """ 75 | char = '' 76 | S = S.flatten() 77 | 78 | end = list(np.repeat(1, 14)) # message end signified by 14 zeros 79 | for i in seq_method(n=len(S)): # img_length=len(S)) 80 | x = S[i] 81 | bit = x & 1 82 | char += str(bit) 83 | end = end[1:] + [bit] 84 | if not sum(end): 85 | break 86 | 87 | text = '' 88 | while len(char) > 0: 89 | b = char[:7][::-1] # reversed binary 90 | text += chr(int(b, 2)) 91 | char = char[7:] 92 | 93 | # remove 2 ascii nulls at message end 94 | text = text.replace('\x00', '') 95 | return text 96 | 97 | 98 | def binary_size(s): 99 | # add 14 due to double null to signify message end 100 | # multiply by 7 for each binary element per character 101 | return (len(s) + 14) * 7 102 | 103 | 104 | def best_max_jump(I, msg, verbose=False): 105 | msg_binary_size = binary_size(msg) 106 | max_jump = int(len(I.flatten()) / float(msg_binary_size)) 107 | if verbose: 108 | args = len(I.flatten()), msg_binary_size, max_jump 109 | print 'img_size=%d; msg_binary_size=%d; best_max_jump=%d' % args 110 | return max_jump 111 | 112 | 113 | def _has_capacity(I, msg): 114 | return False if binary_size(msg) > len(I.flatten()) else True 115 | 116 | 117 | def _check_capacity(I, msg, verbose=False): 118 | args = (len(I.flatten()), binary_size(msg)) 119 | ps = 'img_size=%d; msg_binary_size=%d' % args 120 | if verbose: 121 | print ps 122 | 123 | if not _has_capacity(I, msg): 124 | error = 'Message length too long to embed: ' + ps 125 | raise Exception(error) 126 | 127 | 128 | def embed(I, message, seq_method, verbose=False): 129 | """ 130 | Embeds message in LSB of image at locations specified by seq_method. 131 | """ 132 | dimensions = I.shape 133 | S = I.flatten().copy() 134 | bits = bit_generator(message) 135 | 136 | if not _has_capacity(I, message): 137 | args = (len(I.flatten()), binary_size(message)) 138 | ps = 'img_size=%d; msg_binary_size=%d' % args 139 | error = 'message length too long to embed: ' + ps 140 | print error 141 | else: 142 | pixel_count = 0 143 | for i in seq_method(n=len(S)): 144 | bit = next(bits, None) 145 | if bit is not None: 146 | S[i] = set_lsb(S[i], bit) 147 | pixel_count += 1 148 | if verbose: 149 | print '%d pixel modified' % i 150 | else: 151 | break 152 | 153 | if verbose: 154 | print 'Pixels modified: %.2f' % (pixel_count / 3.) 
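    # reshape the flat pixel array back to the original image dimensions before returning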
155 | return S.reshape(dimensions) 156 | 157 | 158 | if __name__ == '__main__': 159 | cdir = path.dirname(__file__) 160 | I = io.imread(path.join(cdir, '../data/messages/Lenna.png')) 161 | 162 | def check_embed_reveal(I, msg, seq_method): 163 | S = embed(I, msg, seq_method) 164 | return reveal(S, seq_method) == msg 165 | 166 | def test_characters(I, seq_method): 167 | msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 168 | flag = check_embed_reveal(I, msg, seq_method) 169 | assert flag, 'test_characters: %s' % seq_method.__name__ 170 | 171 | # TODO: probably doesn't work for extended ascii codes 172 | test_characters(I, seq.all_the_kings_men) 173 | test_characters(I, seq.skipy(y=3)) 174 | test_characters(I, seq.skipy(y=5)) 175 | test_characters(I, seq.rand_jump(seed=77, max_jump=25)) 176 | test_characters(I, seq.rand_jump_circle(seed=77, max_jump=25)) 177 | test_characters(I, seq.rand_darts(seed=0)) 178 | 179 | msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 180 | max_jump = best_max_jump(I, msg) 181 | test_characters(I, seq.rand_jump(seed=77, max_jump=max_jump)) 182 | 183 | # msg = 'abcdefghijklmnopqrstuvwxys 1234567890~`!@#$%^&*()_+-=:<>,.?/| ' 184 | # seq_method = seq.all_the_kings_men 185 | # S = _zembed(I, msg, seq_method) 186 | # hmsg = _zreveal(S, seq_method) 187 | # print hmsg 188 | -------------------------------------------------------------------------------- /stegasawus/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import yaml 4 | import re 5 | 6 | import matplotlib.pyplot as plt 7 | from scipy import stats 8 | from collections import Counter 9 | 10 | from sklearn.metrics import ( 11 | accuracy_score, log_loss, precision_score, recall_score, f1_score, 12 | roc_auc_score) 13 | from sklearn.preprocessing import ( 14 | LabelEncoder, StandardScaler, LabelBinarizer, PolynomialFeatures) 15 | from sklearn.model_selection import ( 16 | GridSearchCV, learning_curve, ShuffleSplit, StratifiedShuffleSplit) 17 | from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline 18 | from sklearn.decomposition import PCA, KernelPCA 19 | from sklearn.feature_selection import SelectKBest, RFE 20 | from sklearn.naive_bayes import GaussianNB 21 | from sklearn.linear_model import ( 22 | LogisticRegression, PassiveAggressiveClassifier) 23 | from sklearn.svm import SVC, LinearSVC 24 | from sklearn.ensemble import RandomForestClassifier 25 | from sklearn.naive_bayes import GaussianNB 26 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 27 | 28 | from keras.models import Sequential 29 | from keras.layers import Dense, Dropout, Activation, ActivityRegularization 30 | from keras.regularizers import WeightRegularizer 31 | from keras.wrappers.scikit_learn import KerasClassifier 32 | 33 | from xgboost import XGBClassifier 34 | 35 | 36 | input_dim = 125 37 | 38 | 39 | class ModelComparer(object): 40 | """""" 41 | def __init__(self, X, y, pipeline, splitter, classifiers, metrics=None): 42 | super(ModelComparer, self).__init__() 43 | self.X = X 44 | self.y = y 45 | self.pipeline = pipeline 46 | self.splitter = splitter 47 | self.classifiers = classifiers 48 | 49 | if metrics is None: 50 | self.metrics = [accuracy_score, log_loss, precision_score, 51 | recall_score, f1_score, roc_auc_score] 52 | else: 53 | self.metrics = metrics 54 | 55 | self._scores = [] 56 | self.models = {} 57 | 58 | def _get_pipeline(self, name): 59 | return 
make_pipeline(self.pipeline, classifiers[name]) 60 | 61 | @property 62 | def cv_split_generator(self): 63 | """ 64 | Train and validation set split generator. 65 | """ 66 | g = enumerate(self.splitter.split(self.X, self.y)) 67 | for i, (train_idx, val_idx) in g: 68 | X_train, X_val = self.X[train_idx], self.X[val_idx] 69 | y_train, y_val = self.y[train_idx], self.y[val_idx] 70 | yield i, X_train, X_val, y_train, y_val 71 | 72 | def _metric_name(self, f): 73 | return f.__name__.replace('_score', '') 74 | 75 | def _get_metric_scores(self, y_val, y_pred): 76 | s = str() 77 | scores = [] 78 | for fn in self.metrics: 79 | score = fn(y_val, y_pred) 80 | scores.append(score) 81 | s += '%s = %.4f; ' % (self._metric_name(fn), score) 82 | return scores, s 83 | 84 | def model_comparison(self, cv_mean=True): 85 | self._scores = [] 86 | for i, X_train, X_val, y_train, y_val in self.cv_split_generator: 87 | for name, clf in self.classifiers.items(): 88 | pipeline = self._get_pipeline(name) 89 | model = pipeline.fit(X_train, y_train) 90 | self.models[name] = model 91 | 92 | y_pred = model.predict(X_val) 93 | metrics, ps = self._get_metric_scores(y_val, y_pred) 94 | ps += ' | %s_%d' % (name, i) 95 | print ps 96 | self._scores.append([name, i] + metrics) 97 | 98 | def scores(self, mean=True): 99 | cols = ['classifier', 'split'] 100 | cols += [self._metric_name(fn) for fn in self.metrics] 101 | 102 | df = pd.DataFrame(self._scores, columns=cols) 103 | df = df.sort_values( 104 | by=['accuracy', 'log_loss'], 105 | ascending=[False, True] 106 | ).reset_index(drop=True) 107 | 108 | df_mean = df.ix[:, df.columns != 'split'] \ 109 | .groupby(['classifier']) \ 110 | .mean() \ 111 | .sort_values( 112 | by=['accuracy', 'log_loss'], 113 | ascending=[False, True]) 114 | 115 | return df_mean if mean else df 116 | 117 | 118 | def create_mlp(): 119 | model = Sequential() 120 | model.add(Dense(64, 'uniform', 'sigmoid', input_dim=input_dim)) 121 | # model.add(ActivityRegularization(l1=0, l2=0.001)) 122 | model.add(Dropout(0.2)) 123 | model.add(Dense(output_dim=64, activation='tanh')) 124 | model.add(Dropout(0.1)) 125 | model.add(Dense(1, activation='sigmoid')) 126 | 127 | model.compile( 128 | loss='binary_crossentropy', 129 | optimizer='adam', 130 | metrics=['accuracy'] 131 | ) 132 | return model 133 | 134 | 135 | classifiers = { 136 | 'keras_mlp': KerasClassifier( 137 | build_fn=create_mlp, 138 | nb_epoch=150, 139 | batch_size=64 140 | ), 141 | 'svc_linear': LinearSVC(), 142 | 'lr_lbfgs': LogisticRegression( 143 | C=2.02739770e+04, # particle swarm optimised 144 | tol=6.65926091e-04, 145 | solver='lbfgs' 146 | ), 147 | 'lr_lbfgs_default': LogisticRegression(solver='lbfgs'), 148 | 'pa': PassiveAggressiveClassifier( 149 | C=0.01, 150 | fit_intercept=True, 151 | loss='hinge' 152 | ), 153 | 'pa_default': PassiveAggressiveClassifier(), 154 | 'gnb': GaussianNB(), 155 | 'lda': LinearDiscriminantAnalysis(), 156 | 'rf': RandomForestClassifier( 157 | n_estimators=200, 158 | criterion='gini', 159 | max_depth=4, 160 | min_samples_leaf=3, 161 | min_samples_split=3 162 | ), 163 | 'xgb': XGBClassifier( 164 | n_estimators=200, 165 | max_depth=6, 166 | learning_rate=0.1, 167 | gamma=1, 168 | objective='binary:logistic', 169 | nthread=-1 170 | ), 171 | } 172 | 173 | pipeline = Pipeline([ 174 | ('scaler', StandardScaler()), 175 | ('pca', PCA(n_components=input_dim)), 176 | ]) 177 | 178 | 179 | if __name__ == '__main__': 180 | path = '/home/rokkuran/workspace/stegasawus' 181 | path_train = '{}/data/features/train_lenna.csv'.format(path) 182 
| 183 | train = pd.read_csv(path_train) 184 | 185 | target = 'label' 186 | le_target = LabelEncoder().fit(train[target]) 187 | y = le_target.transform(train[target]) 188 | 189 | train = train.drop([target, 'image'], axis=1) 190 | X = train.as_matrix() 191 | 192 | splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0) 193 | mc = ModelComparer(X, y, pipeline, splitter, classifiers) 194 | mc.model_comparison() 195 | print '\n', mc.scores() 196 | -------------------------------------------------------------------------------- /stegasawus/features.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import pywt 5 | 6 | from scipy import stats 7 | import matplotlib.pyplot as plt 8 | from skimage import io 9 | 10 | 11 | def statistical_metrics(x): 12 | """ 13 | Calculates statistical metrics on input array (mean, std, skew, kurtosis). 14 | """ 15 | 16 | metrics = { 17 | 'mean': np.mean, 18 | 'stdev': np.std, 19 | 'skew': stats.skew, 20 | 'kurtosis': stats.kurtosis 21 | } 22 | return {k: fn(x.flatten()) for k, fn in metrics.items()} 23 | 24 | 25 | def prefix_dict_keys(d, prefix): 26 | """ 27 | Adds prefix to dict keys. 28 | """ 29 | return {'{}_{}'.format(prefix, k): v for k, v in d.items()} 30 | 31 | 32 | def autocorrelation_features(I, lags=[(1, 0), (0, 1), (1, 1)]): 33 | """ 34 | Calculate the autocorrelation statistical features from a 2D image array 35 | (greyscale image or an individual colour channel) for the specified pixel 36 | vertical and horizontal coordinate shift lags: 37 | e.g. [(1, 0), (0, 1), (1, 1), (1, 2), (2, 1), (2, 2)] 38 | """ 39 | m, n = I.shape 40 | 41 | features = {} 42 | for x, y in lags: 43 | ac = I[x:, y:] * I[:m-x, :n-y] 44 | aca = np.sum(ac / (I[x:, y:].std() * I[:m-x, :n-y].std())) 45 | 46 | features['aca_{}{}'.format(x, y)] = aca 47 | 48 | f_stat = statistical_metrics(ac) 49 | f_stat = prefix_dict_keys(f_stat, 'ac_{}{}'.format(x, y)) 50 | features.update(f_stat) 51 | 52 | return features 53 | 54 | 55 | def rgb_autocorrelation_features(I, lags=((1, 0), (0, 1), (1, 1))): 56 | """ 57 | Calculate the autocorrelation statistical features of an RGB image 58 | array (m, n, 3) for the specified lags. 59 | """ 60 | features = {} 61 | m, n, _ = I.shape 62 | 63 | for c, colour in enumerate('rgb'): 64 | f_ac = autocorrelation_features(I[:, :, c], lags) 65 | f_ac = prefix_dict_keys(f_ac, colour) 66 | features.update(f_ac) 67 | 68 | return features 69 | 70 | 71 | def concatenate_feature_sets(filepath_cover, filepath_stego, filepath_output): 72 | """ 73 | Concatenates two feature csv files. 74 | 75 | Parameters 76 | ---------- 77 | filepath_cover : string 78 | Filepath to cover image feature set. 79 | filepath_stego : string 80 | Filepath to steganographic image feature set. 81 | filepath_output : string 82 | Output filepath. 83 | 84 | Returns 85 | ------- 86 | Concatenated dataset. 
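    The concatenated frame is also written to filepath_output as csv.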
87 | 88 | """ 89 | train_cover = pd.read_csv(filepath_cover) 90 | train_stego = pd.read_csv(filepath_stego) 91 | train = pd.concat([train_cover, train_stego]) 92 | train.to_csv(filepath_output, index=False) 93 | return train 94 | 95 | 96 | def concat_multiple_feature_sets(filepaths, filepath_output): 97 | train = pd.DataFrame() 98 | for filepath in filepaths: 99 | df = pd.read_csv(filepath) 100 | df['filename'] = filepath.split('/')[-1] 101 | train = pd.concat([train, df]) 102 | train.to_csv(filepath_output, index=False) 103 | return train 104 | 105 | 106 | def apply_tolerance(x, tol): 107 | """ 108 | Applies absolute value filter for given tolerance. 109 | 110 | Parameters 111 | ---------- 112 | x : numpy.ndarray 113 | Input data. 114 | tol : int, float 115 | Tolerance. 116 | 117 | Returns 118 | ------- 119 | Filtered array where |x| >= tol. 120 | If no values are above the tolerance np.array([0]) is returned. 121 | 122 | """ 123 | x_tol = abs(x) >= tol 124 | if x_tol.any(): 125 | return x[x_tol] 126 | else: 127 | return np.zeros(1) 128 | 129 | 130 | def wavdec_features(coeffs, tol=1): 131 | """ 132 | Calculated the statistical features on the components of a mulitlevel 2D 133 | discrete wavelet decomposition. 134 | 135 | Parameters 136 | ---------- 137 | coeffs : list 138 | n level coefficients from pywt.wavedec2 139 | [cAn, (cHn, cVn, cDn), ... (cH1, cV1, cD1)] 140 | tol : int, float, default : 1 141 | Tolerance to apply to individual coefficient arrays. 142 | 143 | Returns 144 | ------- 145 | features : dict 146 | Feature vector of statistical components in dictionary form. 147 | 148 | """ 149 | n_layers = len(coeffs) - 1 150 | 151 | features = {} 152 | 153 | cA = coeffs[0] 154 | prefix = 'dwt_{}_cA'.format(n_layers) 155 | cA = apply_tolerance(cA, tol) # reduce sensitivity to noise 156 | f_stat = statistical_metrics(cA) 157 | f_stat = prefix_dict_keys(f_stat, prefix) 158 | features.update(f_stat) 159 | 160 | for i, (cH, cV, cD) in enumerate(coeffs[1:]): 161 | layer = n_layers - i 162 | for c, cX in zip(('cH', 'cV', 'cD'), (cH, cV, cD)): 163 | prefix = 'dwt_{}_{}'.format(layer, c) 164 | cX = apply_tolerance(cX, tol) 165 | f_stat = statistical_metrics(cX) 166 | f_stat = prefix_dict_keys(f_stat, prefix) 167 | features.update(f_stat) 168 | 169 | return features 170 | 171 | 172 | def rgb_wavelet_features(I, tol=1): 173 | """ 174 | For each RGB channel, calculates the statistical features the components of 175 | a mulitlevel 2D discrete wavelet decomposition. 176 | 177 | Parameters 178 | ---------- 179 | I : numpy.ndarray 180 | RGB image array. 181 | tol : int, float, default : 1 182 | Tolerance to apply to individual coefficient arrays. 183 | 184 | Returns 185 | ------- 186 | features : dict 187 | Feature vector of statistical components in dictionary form. 188 | 189 | """ 190 | features = {} 191 | m, n, _ = I.shape 192 | 193 | for c, colour in enumerate('rgb'): 194 | coeffs = pywt.wavedec2(I[:, :, c], wavelet='haar', level=3) 195 | f_wavelet = wavdec_features(coeffs) 196 | f_wavelet = prefix_dict_keys(f_wavelet, colour) 197 | features.update(f_wavelet) 198 | 199 | return features 200 | 201 | 202 | def create_feature_dataset(path_images, class_label, path_output, 203 | f_types=['autocorrelation', 'wavelet'], 204 | image_limit=None): 205 | 206 | """ 207 | Create feature vectors from images in directory and save as csv output. 208 | 209 | Parameters 210 | ---------- 211 | path_images : directory path string 212 | Directory with images for processing. 
213 | class_label : string 214 | Class label used in label column of output. 215 | path_output : directory path string 216 | Output directory for csv file. 217 | f_types : array_like, default : ['autocorrelation', 'wavelet'] 218 | Specify the feature types to include as list of strings: 219 | {'autocorrelation', 'wavelet'} 220 | Default: ['autocorrelation', 'wavelet'] 221 | image_limit : int, default : None 222 | Number of images in directory to process. 223 | 224 | Returns 225 | ------- 226 | csv output file as specified in path_output. 227 | 228 | """ 229 | 230 | print 'creating image feature dataset...' 231 | 232 | dataset = list() 233 | for i, filename in enumerate(os.listdir(path_images)): 234 | fname = '{}{}'.format(path_images, filename) 235 | image = io.imread(fname) 236 | 237 | features = {} 238 | if 'autocorrelation' in f_types: 239 | lags = ((1, 0), (0, 1), (1, 1), (1, 2), (2, 2), (2, 2)) 240 | features.update(rgb_autocorrelation_features(image, lags)) 241 | 242 | if 'wavelet' in f_types: 243 | features.update(rgb_wavelet_features(image)) 244 | 245 | if i == 0: 246 | feature_names = features.keys() 247 | 248 | row = [filename, class_label] 249 | for feature in feature_names: 250 | row.append(features[feature]) 251 | dataset.append(row) 252 | 253 | if i % 250 == 0: 254 | print '{} images processed'.format(i) 255 | 256 | if image_limit: 257 | if i > image_limit: 258 | break 259 | 260 | df = pd.DataFrame(dataset, columns=['image', 'label'] + feature_names) 261 | df.to_csv(path_output, index=False) 262 | 263 | print 'image feature dataset created.' 264 | 265 | 266 | # ****************************************************************************** 267 | if __name__ == '__main__': 268 | pass 269 | -------------------------------------------------------------------------------- /stegasawus/tuning.py: -------------------------------------------------------------------------------- 1 | from stegasawus.model import ( 2 | cv_split_generator, 3 | get_pipeline, 4 | get_equal_sets) 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import yaml 9 | import re 10 | import collections 11 | import functools 12 | 13 | import matplotlib.pyplot as plt 14 | 15 | from pyswarm import pso 16 | 17 | from sklearn import metrics 18 | from sklearn.preprocessing import ( 19 | LabelEncoder, 20 | StandardScaler, 21 | PolynomialFeatures) 22 | from sklearn.model_selection import ( 23 | GridSearchCV, 24 | learning_curve, 25 | validation_curve, 26 | ShuffleSplit) 27 | from sklearn.pipeline import Pipeline, FeatureUnion 28 | from sklearn.decomposition import PCA, KernelPCA 29 | from sklearn.feature_selection import SelectKBest, RFE 30 | 31 | from sklearn.naive_bayes import GaussianNB 32 | from sklearn.neighbors import KNeighborsClassifier 33 | from sklearn.linear_model import ( 34 | LogisticRegression, 35 | PassiveAggressiveClassifier) 36 | from sklearn.svm import SVC, LinearSVC, NuSVC 37 | from sklearn.tree import DecisionTreeClassifier 38 | from sklearn.ensemble import ( 39 | RandomForestClassifier, 40 | ExtraTreesClassifier, 41 | AdaBoostClassifier, 42 | GradientBoostingClassifier, 43 | VotingClassifier) 44 | from sklearn.naive_bayes import GaussianNB 45 | from sklearn.discriminant_analysis import ( 46 | LinearDiscriminantAnalysis, 47 | QuadraticDiscriminantAnalysis) 48 | 49 | from xgboost import XGBClassifier 50 | 51 | 52 | # ****************************************************************************** 53 | def gs_parameter_tuning(clf, X_train, y_train, parameters, scoring, cv=5): 54 | gs_clf = 
GridSearchCV(clf, parameters, scoring=scoring, cv=cv, n_jobs=6) 55 | gs_clf = gs_clf.fit(X_train, y_train) 56 | 57 | best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1]) 58 | for param_name in sorted(parameters.keys()): 59 | print("%s: %r" % (param_name, best_parameters[param_name])) 60 | 61 | 62 | # TODO: improve, extend, refactor 63 | def pso_parameter_tuning(clf, X, y, lb, ub, swarmsize, maxiter, n_splits=3, 64 | integer=False, *args): 65 | """ 66 | Particle swarm optimisation based parameter tuning. 67 | 68 | Parameters 69 | ---------- 70 | clf : sklearn classifier 71 | Model to tune parameters. 72 | X : numpy.ndarray 73 | Training features. 74 | y : numpy.ndarray 75 | Training target values. 76 | lb : array_like 77 | Lower bound values for parameters to tune. 78 | ub : array_like 79 | Upper bound values for parameters to tune. 80 | swarmsize : int 81 | Number of particles in the swarm. 82 | maxiter : int 83 | Maximum number of iterations for swarm to search. 84 | n_splits : int, default = 3 85 | Number of cross validation splits. 86 | 87 | Returns 88 | ------- 89 | g : array 90 | The swarm's best known parameters settings. 91 | f : scalar 92 | The value of the minimisation function at g. 93 | 94 | """ 95 | def clf_check(clf, classifiers): 96 | return any([isinstance(clf, c) for c in classifiers]) 97 | 98 | def minimise(x, *args): 99 | """""" 100 | if clf_check(clf, [LinearSVC, LogisticRegression]): 101 | C, tol = x 102 | clf.set_params(C=C, tol=tol) 103 | 104 | elif clf_check(clf, [RandomForestClassifier]): 105 | # random forest: all values need to be integer 106 | x = [int(np.round(v, 0)) for v in x] 107 | max_depth, min_samples_leaf, min_samples_split = x 108 | clf.set_params( 109 | max_depth=max_depth, 110 | min_samples_leaf=min_samples_leaf, 111 | min_samples_split=min_samples_split) 112 | 113 | elif clf_check(clf, [XGBClassifier]): 114 | # xgb: max_depth should be integer 115 | max_depth, learning_rate, gamma = x 116 | max_depth = int(np.round(max_depth, 0)) 117 | clf.set_params( 118 | max_depth=max_depth, 119 | learning_rate=learning_rate, 120 | gamma=gamma) 121 | 122 | else: 123 | raise Exception('Classifier not supported.') 124 | 125 | pipeline = Pipeline([ 126 | ('pca', Pipeline([ 127 | ('scaler', StandardScaler()), 128 | ('pca', PCA(n_components=125)), 129 | ])), 130 | ('clf', clf) 131 | ]) 132 | 133 | ss = ShuffleSplit(n_splits=n_splits, test_size=0.2) 134 | cv_splits = cv_split_generator(X=X, y=y, splitter=ss) 135 | 136 | ll = [] 137 | for i, X_train, X_val, y_train, y_val in cv_splits: 138 | model = pipeline.fit(X_train, y_train) 139 | y_pred = model.predict(X_val) 140 | ll.append(metrics.log_loss(y_val, y_pred)) 141 | 142 | print x, np.mean(ll) 143 | return np.mean(ll) 144 | 145 | g, f = pso(minimise, lb, ub, swarmsize=swarmsize, maxiter=maxiter, 146 | debug=True, args=('clf', clf)) 147 | return g, f 148 | 149 | 150 | classifiers = { 151 | 'knn': KNeighborsClassifier( 152 | n_neighbors=6, 153 | algorithm='ball_tree', 154 | weights='distance', 155 | metric='chebyshev' 156 | ), 157 | 'knn_default': KNeighborsClassifier(), 158 | 'svc_rbf': SVC( 159 | kernel='rbf', 160 | C=50, 161 | gamma=0.01, 162 | tol=1e-3 163 | ), 164 | 'svc_rbf_default': SVC(kernel='rbf'), 165 | 'svc_linear': LinearSVC( 166 | C=1e3, 167 | loss='squared_hinge', 168 | penalty='l2', 169 | tol=1e-3 170 | ), 171 | 'svc_linear_default': LinearSVC(), 172 | 'nusvc': NuSVC(), 173 | 'rf': RandomForestClassifier( 174 | criterion='gini', 175 | n_estimators=200, 176 | max_depth=4, 177 | 
min_samples_leaf=3, 178 | min_samples_split=3 179 | ), 180 | 'rf_default': RandomForestClassifier(), 181 | 'adaboost': AdaBoostClassifier(), 182 | 'et': ExtraTreesClassifier( 183 | criterion='entropy', 184 | max_depth=25, 185 | min_samples_leaf=5, 186 | min_samples_split=5 187 | ), 188 | 'et_default': ExtraTreesClassifier(), 189 | 'gbc': GradientBoostingClassifier(), 190 | 'lr_lbfgs': LogisticRegression( 191 | C=2.02739770e+04, # particle swarm optimised 192 | tol=6.65926091e-04, 193 | solver='lbfgs' 194 | ), 195 | 'lr_lbfgs_default': LogisticRegression(solver='lbfgs'), 196 | 'pa': PassiveAggressiveClassifier( 197 | C=0.01, 198 | fit_intercept=True, 199 | loss='hinge' 200 | ), 201 | 'pa_default': PassiveAggressiveClassifier(), 202 | 'gnb': GaussianNB(), 203 | 'lda': LinearDiscriminantAnalysis(), 204 | 'qda': QuadraticDiscriminantAnalysis(), 205 | 'xgb_defualt': XGBClassifier(), 206 | 'xgb': XGBClassifier( 207 | max_depth=6, 208 | learning_rate=0.01, 209 | n_estimators=100, 210 | silent=True, 211 | objective='binary:logistic', 212 | nthread=-1, 213 | gamma=0, 214 | min_child_weight=1, 215 | max_delta_step=0, 216 | subsample=1, 217 | colsample_bytree=1, 218 | colsample_bylevel=1, 219 | reg_alpha=0, 220 | reg_lambda=1, 221 | scale_pos_weight=1, 222 | base_score=0.5, 223 | seed=0, 224 | missing=None 225 | ) 226 | } 227 | 228 | 229 | if __name__ == '__main__': 230 | path = '/home/rokkuran/workspace/stegasawus' 231 | path_train = '{}/data/features/train_lenna_identity.csv'.format(path) 232 | 233 | train = pd.read_csv(path_train) 234 | train = get_equal_sets(train) 235 | 236 | filenames = train.filename.copy() 237 | filenames = filenames.apply( 238 | lambda s: re.search(r'lenna\d+', s).group() 239 | if re.search(r'lenna\d+', s) is not None else 'cover' 240 | ) 241 | 242 | # target and index preprocessing 243 | target = 'label' 244 | le_target = LabelEncoder().fit(train[target]) 245 | y_train_binary = le_target.transform(train[target]) 246 | 247 | train = train.drop([target, 'image', 'filename'], axis=1) 248 | 249 | # ************************************************************************** 250 | parameters = yaml.safe_load( 251 | open('{}/stegasawus/parameter_tuning.yaml'.format(path), 'rb') 252 | ) 253 | 254 | # ************************************************************************** 255 | # Grid search parameter tuning. 256 | def run_gs_parameter_tuning(): 257 | name = 'knn' 258 | pipeline = get_pipeline(name) 259 | 260 | gs_parameter_tuning( 261 | clf=pipeline, 262 | X_train=train.as_matrix(), 263 | y_train=y_train_binary, 264 | cv=3, 265 | parameters=parameters['grid_search'][name], 266 | scoring='accuracy' 267 | ) 268 | 269 | # run_gs_parameter_tuning() 270 | 271 | # ************************************************************************** 272 | # Particle swarm optimisation parameter tuning. 
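    # pso_parameter_tuning() searches within the lb/ub bounds read from
    # parameter_tuning.yaml; each candidate parameter vector is scored by the
    # mean log loss over ShuffleSplit cross-validation folds, and pso()
    # returns the best vector g together with its loss f.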
273 | def run_pso_parameter_tuning(clf_name): 274 | 275 | # TODO: fix issue with string representations of '1e-3' in yaml read 276 | lb = [float(v) for v in parameters['pso'][clf_name]['lb']] 277 | ub = [float(v) for v in parameters['pso'][clf_name]['ub']] 278 | 279 | g, f = pso_parameter_tuning( 280 | clf=classifiers['lr_lbfgs'], X=train.as_matrix(), y=y_train_binary, 281 | lb=lb, ub=ub, swarmsize=100, maxiter=20, n_splits=3) 282 | print g, f 283 | 284 | run_pso_parameter_tuning('lr_lbfgs') 285 | # run_pso_parameter_tuning('rf') 286 | # run_pso_parameter_tuning('xgb') 287 | 288 | # ************************************************************************** 289 | def plot_validation_curve(): 290 | name = 'svc_linear' 291 | pipeline = get_pipeline(name) 292 | 293 | param_range = np.logspace(-2, 3, 6) 294 | # param_range = np.logspace(-5, -1, 5) 295 | train_scores, val_scores = validation_curve( 296 | estimator=pipeline, 297 | X=train.as_matrix(), 298 | y=y_train_binary, 299 | param_name='%s__C' % name, 300 | # param_name='lr_lbfgs__tol', 301 | param_range=param_range, 302 | cv=5, 303 | scoring='accuracy', 304 | n_jobs=6 305 | ) 306 | 307 | plt.semilogx( 308 | param_range, train_scores.mean(axis=1), 309 | ls='-', lw=1, color='b', alpha=1, label='train' 310 | ) 311 | plt.fill_between( 312 | param_range, 313 | train_scores.mean(axis=1) - train_scores.std(axis=1), 314 | train_scores.mean(axis=1) + train_scores.std(axis=1), 315 | color='b', alpha=0.1, lw=0.5 316 | ) 317 | plt.semilogx( 318 | param_range, val_scores.mean(axis=1), 319 | ls='-', lw=1, color='r', alpha=1, label='validation' 320 | ) 321 | plt.fill_between( 322 | param_range, 323 | val_scores.mean(axis=1) - val_scores.std(axis=1), 324 | val_scores.mean(axis=1) + val_scores.std(axis=1), 325 | color='r', alpha=0.1, lw=0.5 326 | ) 327 | 328 | plt.title('%s: validation curve' % name) 329 | plt.xlabel('C') 330 | plt.ylabel('Score') 331 | plt.ylim(0.0, 1.1) 332 | plt.legend(loc="best") 333 | plt.show() 334 | 335 | # ************************************************************************** 336 | def plot_roc_curve(name): 337 | pipeline = get_pipeline(name) 338 | 339 | ss = ShuffleSplit(n_splits=5, test_size=0.2) 340 | 341 | X = train.as_matrix() 342 | y = y_train_binary 343 | 344 | fpr, tpr = [], [] 345 | for i, (train_idx, val_idx) in enumerate(ss.split(X, y)): 346 | X_train, X_val = X[train_idx], X[val_idx] 347 | y_train, y_val = y[train_idx], y[val_idx] 348 | 349 | model = pipeline.fit(X_train, y_train) 350 | y_pred = model.predict(X_val) 351 | 352 | fpr_i, tpr_i, _ = metrics.roc_curve(y_val, y_pred) 353 | fpr.append(fpr_i) 354 | tpr.append(tpr_i) 355 | 356 | fpr, tpr = np.array(fpr), np.array(tpr) 357 | 358 | plt.figure() 359 | plt.plot( 360 | fpr.mean(axis=0), tpr.mean(axis=0), 361 | color='b', alpha=0.6, lw=1, label='ROC curve' 362 | ) 363 | plt.plot([0, 1], [0, 1], color='k', alpha=0.6, lw=1, linestyle='--') 364 | plt.xlim([0.0, 1.0]) 365 | plt.ylim([0.0, 1.05]) 366 | plt.xlabel('False Positive Rate') 367 | plt.ylabel('True Positive Rate') 368 | plt.title('%s: ROC Curve' % name) 369 | plt.legend(loc="lower right") 370 | plt.show() 371 | 372 | # plot_roc_curve('svc_linear') 373 | --------------------------------------------------------------------------------
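A minimal round-trip sketch of the LSB machinery above, mirroring the self-test in the __main__ block of stegasawus/lsb.py. It is illustrative only: the cover image path and output filename are hypothetical, and it assumes the stegasawus package is importable and the message is plain 7-bit ASCII (the only encoding bit_generator handles).

    from stegasawus import lsb, seq
    from skimage import io

    # hypothetical cover image path - substitute any RGB PNG
    cover = io.imread('images/png/cover/cat.2.png')

    message = 'attack at dawn'              # 7-bit ASCII payload
    sequence = seq.rand_darts(seed=77)      # pseudo-random embedding locations

    stego = lsb.embed(cover, message, sequence)    # sets LSBs along the sequence
    recovered = lsb.reveal(stego, sequence)        # replays the sequence to decode
    assert recovered == message

    io.imsave('stego_example.png', stego)          # hypothetical output filename

Because reveal simply replays the location sequence, the same seq generator and seed used for embedding are required to recover the message.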