├── .gitignore
├── LICENSE
├── README.md
├── misc
│   ├── front.jpg
│   ├── num_examples_per_class.png
│   ├── tsne_0.png
│   ├── tsne_1.png
│   ├── tsne_10.png
│   ├── tsne_2.png
│   ├── tsne_3.png
│   ├── tsne_4.png
│   ├── tsne_5.png
│   ├── tsne_6.png
│   ├── tsne_7.png
│   ├── tsne_8.png
│   ├── tsne_9.png
│   ├── vermeer_vs_van_meegeren.png
│   └── vermeer_vs_van_meegeren_hist.png
├── painters
│   ├── __init__.py
│   ├── cnn_embedding.py
│   ├── data_dirs_organizer.py
│   ├── data_provider.py
│   ├── submission.py
│   ├── train_cnn.py
│   ├── utils.py
│   └── validation.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/pycharm,python
2 |
3 | ### PyCharm ###
4 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
5 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
6 |
7 | .idea/
8 |
9 | # User-specific stuff:
10 | # .idea/workspace.xml
11 | # .idea/tasks.xml
12 | # .idea/dictionaries
13 | # .idea/vcs.xml
14 | # .idea/jsLibraryMappings.xml
15 |
16 | # Sensitive or high-churn files:
17 | # .idea/dataSources.ids
18 | # .idea/dataSources.xml
19 | # .idea/dataSources.local.xml
20 | # .idea/sqlDataSources.xml
21 | # .idea/dynamic.xml
22 | # .idea/uiDesigner.xml
23 |
24 | # Gradle:
25 | # .idea/gradle.xml
26 | # .idea/libraries
27 |
28 | # Mongo Explorer plugin:
29 | # .idea/mongoSettings.xml
30 |
31 | ## File-based project format:
32 | *.iws
33 |
34 | ## Plugin-specific files:
35 |
36 | # IntelliJ
37 | /out/
38 |
39 | # mpeltonen/sbt-idea plugin
40 | .idea_modules/
41 |
42 | # JIRA plugin
43 | atlassian-ide-plugin.xml
44 |
45 | # Crashlytics plugin (for Android Studio and IntelliJ)
46 | com_crashlytics_export_strings.xml
47 | crashlytics.properties
48 | crashlytics-build.properties
49 | fabric.properties
50 |
51 |
52 | ### Python ###
53 | # Byte-compiled / optimized / DLL files
54 | __pycache__/
55 | *.py[cod]
56 | *$py.class
57 |
58 | # C extensions
59 | *.so
60 |
61 | # Distribution / packaging
62 | .Python
63 | env/
64 | build/
65 | develop-eggs/
66 | dist/
67 | downloads/
68 | eggs/
69 | .eggs/
70 | lib/
71 | lib64/
72 | parts/
73 | sdist/
74 | var/
75 | *.egg-info/
76 | .installed.cfg
77 | *.egg
78 |
79 | # PyInstaller
80 | # Usually these files are written by a python script from a template
81 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
82 | *.manifest
83 | *.spec
84 |
85 | # Installer logs
86 | pip-log.txt
87 | pip-delete-this-directory.txt
88 |
89 | # Unit test / coverage reports
90 | htmlcov/
91 | .tox/
92 | .coverage
93 | .coverage.*
94 | .cache
95 | nosetests.xml
96 | coverage.xml
97 | *.cover
98 | .hypothesis/
99 |
100 | # Translations
101 | *.mo
102 | *.pot
103 |
104 | # Django stuff:
105 | *.log
106 | local_settings.py
107 |
108 | # Flask instance folder
109 | instance/
110 |
111 | # Scrapy stuff:
112 | .scrapy
113 |
114 | # Sphinx documentation
115 | docs/_build/
116 |
117 | # PyBuilder
118 | target/
119 |
120 | # IPython Notebook
121 | .ipynb_checkpoints
122 |
123 | # pyenv
124 | .python-version
125 |
126 | # celery beat schedule file
127 | celerybeat-schedule
128 |
129 | # dotenv
130 | .env
131 |
132 | # virtualenv
133 | venv/
134 | ENV/
135 |
136 | # Spyder project settings
137 | .spyderproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | /data
143 | .DS_Store
144 | models/
145 | training_plots/
146 | q.txt
147 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Nejc Ilenič
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Painter by Numbers competition on Kaggle
2 |
3 |
4 |
5 |
6 | This repository contains the 1st place solution for the [Painter by Numbers competition on Kaggle](https://www.kaggle.com/c/painter-by-numbers). Below is a brief description of the dataset and the approaches I used to build and validate a predictive model.
7 |
8 | The challenge of the competition was to examine pairs of paintings and determine whether they were painted by the same artist. The training set consists of artwork images and their corresponding class labels (painters). Examples in the test set were split into 13 groups, and all possible pairs within each group needed to be examined for the submission. The evaluation metric for the leaderboard was AUC (area under the ROC curve).
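
Concretely, scoring a submission means enumerating every unordered pair within each group; a minimal sketch of that enumeration (the file names here are hypothetical, the real pairs are listed in `submission_info.csv`):

```python
from itertools import combinations

# Hypothetical groups of test images; the real pairs to score are
# listed per group in submission_info.csv.
groups = [['a.jpg', 'b.jpg', 'c.jpg'],  # group 0
          ['d.jpg', 'e.jpg']]           # group 1

for group in groups:
    for img_a, img_b in combinations(group, 2):
        print(img_a, img_b)  # predict P(same artist) for this pair
```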
9 |
10 | Weights of the best-performing model along with preprocessing statistics are available [here](http://bit.ly/painters-pretrained).
11 |
12 | ### 1. Dataset and preprocessing
13 | The training set is unbalanced, some painters are present only in the training set and some only in the test set, and the input images come in various dimensions. There are 79433 instances and 1584 unique painters in the training set, and the test set is composed of 23817 instances. Predictions for approximately 22M pairs needed to be made for the submission.
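
As a back-of-the-envelope check on that figure, assuming the 23817 test images were split into 13 roughly equal groups:

```python
from math import comb  # Python 3.8+

num_test, num_groups = 23817, 13
per_group = num_test // num_groups      # ~1832 images per group
print(num_groups * comb(per_group, 2))  # 21803548, i.e. roughly 22M pairs
```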
14 |
15 | The plot below shows the number of paintings for each of the 1584 painters in the training set.
16 |
17 |
18 | Number of examples per class in the training set
19 |
20 |
21 | Labeled images were split into training (0.9) and validation (0.1) sets in a stratified manner, resulting in 71423 training examples and 8010 validation examples belonging to 1584 classes.
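
A minimal sketch of such a stratified split using scikit-learn's current `StratifiedShuffleSplit` API (the repository itself pins scikit-learn 0.17 and uses the older `sklearn.cross_validation` interface, see [data_dirs_organizer.py](painters/data_dirs_organizer.py)):

```python
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

labels = np.repeat([0, 1, 2], 20)  # toy painter ids, 20 paintings each
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
idx_tr, idx_val = next(split.split(np.zeros(len(labels)), labels))
print(len(idx_tr), len(idx_val))  # 54 6, with class proportions preserved
```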
22 |
23 | The model I've built assumes fixed-size inputs, so the first preprocessing step was to resize each image's smallest dimension to 256 pixels (retaining the aspect ratio) and then crop it at the center of the larger dimension, obtaining 256x256 images. Some information gets lost in this process; an alternative approach where multiple crops are taken from the same image was considered, but not used for the final solution due to much longer training times (a bigger, but more correlated training set). Furthermore, the mean value was subtracted from each feature and the result was normalized by dividing each dimension by its standard deviation; these preprocessing statistics were computed from a subset of the training instances. During the training phase random transformations (rotations, zooms, shifts, shears and flips) were applied to the data in order to reduce overfitting, which ensures that the model only rarely sees exactly the same example twice. For exact transformation parameters see [data_provider.py](painters/data_provider.py#L41).
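
The resize-and-center-crop step can be done in a single call with Pillow's `ImageOps.fit`, which is what [data_dirs_organizer.py](painters/data_dirs_organizer.py) uses (the file names here are hypothetical):

```python
from PIL import Image
from PIL.ImageOps import fit

# Scales the smaller side to 256 pixels (keeping the aspect ratio) and
# center-crops the larger side, yielding a 256x256 image.
image = Image.open('painting.jpg')
image = fit(image, (256, 256), method=Image.LANCZOS)
image.save('painting_256.jpg')
```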
24 |
25 | ### 2. Building a predictive model
26 | There were two main approaches considered for verifying whether two instances belong to the same class. The unsupervised method involves training a model that can predict one of the 1584 classes and then taking the dot product of the two class distribution vectors (softmax outputs). The supervised method is an end-to-end metric learning approach called a siamese network. The main idea is to replicate the model once for each input image and merge their outputs into a single vector that can then be used to directly predict whether the two images were painted by the same artist. An important aspect of this architecture is that the weights of both models are shared, and during backpropagation the total gradient is the sum of the gradients contributed by the two models. Since the model trained for the unsupervised technique can also be used in the siamese architecture, most of the effort went into the multi-class painter recognition task.
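
For the unsupervised method the pair score is literally the inner product of the two softmax outputs; a toy sketch with 4 classes standing in for the 1584 painters (cf. `_calculate_batch_prediction_dot` in [submission.py](painters/submission.py)):

```python
import numpy as np

def same_artist_score(softmax_a, softmax_b):
    # High when both distributions put their mass on the same painters.
    return np.dot(softmax_a, softmax_b)

p_a = np.array([0.7, 0.1, 0.1, 0.1])
p_b = np.array([0.6, 0.2, 0.1, 0.1])
print(same_artist_score(p_a, p_b))  # 0.46
```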
27 |
28 | The depiction below illustrates the architecture of the final convolutional neural network, with non-linearities, dropout and batch normalization layers omitted. 3x3 convolutional filters with stride 1 produce feature maps that are two neurons smaller along each of the two spatial dimensions than their input volumes; zero padding is used to retain the original shape, and 2x2 max pooling with stride 2 halves the number of neurons along each of the two dimensions. Non-linearities are applied to convolutional and fully connected outputs using the PReLU function (a Leaky ReLU with a trainable slope parameter in the negative part). The dense layers at the end of the architecture are the reason why fixed-size inputs need to be fed to the network. The model is regularized using dropout, batch normalization layers and L2 weight penalties. A more detailed architecture and exact hyperparameter values can be found in [train_cnn.py](painters/train_cnn.py).
29 | ```
30 | LAYER DATA DIMENSIONS
31 |
32 | Input ##### (3, 256, 256)
33 | Convolution \|/
34 | ##### (16, 256, 256)
35 | Convolution \|/
36 | ##### (16, 256, 256)
37 | MaxPooling YYYYY
38 | ##### (16, 128, 128)
39 | Convolution \|/
40 | ##### (32, 128, 128)
41 | Convolution \|/
42 | ##### (32, 128, 128)
43 | Convolution \|/
44 | ##### (32, 128, 128)
45 | MaxPooling YYYYY
46 | ##### (32, 64, 64)
47 | Convolution \|/
48 | ##### (64, 64, 64)
49 | Convolution \|/
50 | ##### (64, 64, 64)
51 | Convolution \|/
52 | ##### (64, 64, 64)
53 | MaxPooling YYYYY
54 | ##### (64, 32, 32)
55 | Convolution \|/
56 | ##### (128, 32, 32)
57 | Convolution \|/
58 | ##### (128, 32, 32)
59 | Convolution \|/
60 | ##### (128, 32, 32)
61 | MaxPooling YYYYY
62 | ##### (128, 16, 16)
63 | Convolution \|/
64 | ##### (256, 16, 16)
65 | Convolution \|/
66 | ##### (256, 16, 16)
67 | Convolution \|/
68 | ##### (256, 16, 16)
69 | MaxPooling YYYYY
70 | ##### (256, 8, 8)
71 | Flatten |||||
72 | ##### (16384,)
73 | Dense XXXXX
74 | ##### (2048,)
75 | Dense XXXXX
76 | ##### (1584,)
77 | Softmax ##### (1584,)
78 | ```
79 |
80 | Final ConvNet architecture
81 |
82 |
83 | 300 epochs were needed for the model to converge to a local minimum using the Adam optimizer with a learning rate of 7.4e-05 and a batch size of 96 examples. During training the categorical cross-entropy loss was minimized.
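
A minimal stand-in showing those optimizer settings in the Keras 1.x API pinned in [requirements.txt](requirements.txt) (a single dense layer replaces the real architecture, which lives in [train_cnn.py](painters/train_cnn.py)):

```python
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

model = Sequential([Dense(1584, input_dim=2048, activation='softmax')])
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(lr=0.000074),  # 7.4e-05, as in compile_model()
    metrics=['accuracy'])
```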
84 |
85 | Neural networks can be used as descriptor generators that produce lower-dimensional representations of input instances; one can think of them as automatic feature extractors. Such embeddings are obtained by simply taking the 2048-dimensional output vectors of the penultimate layer. To check whether there is any internal structure in the features produced by the ConvNet I used the t-SNE dimensionality reduction technique. t-SNE is a convenient algorithm for visualizing high-dimensional data and allows us to compare how similar input instances are. Below are two scatter plots of some of the artwork images of randomly selected artists from the validation set. Bearing in mind that the network hasn't seen those examples during training and that the t-SNE algorithm doesn't receive class labels as input, the visual results are quite exciting. For more t-SNE plots see the [misc](misc) directory.
86 |
87 |
88 |
89 | t-SNE embeddings of the features generated by the ConvNet (click on the image for full resolution)
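
A sketch of how such a plot is produced, with random vectors standing in for the real 2048-dimensional penultimate-layer embeddings from [cnn_embedding.py](painters/cnn_embedding.py):

```python
import numpy as np
from sklearn.manifold import TSNE

X_val = np.random.rand(500, 2048)  # stand-in for the real embeddings
X_2d = TSNE(n_components=2, random_state=42).fit_transform(X_val)
print(X_2d.shape)  # (500, 2), ready for a scatter plot colored by painter
```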
90 |
91 |
92 | ### 3. Competition results
93 | The public leaderboard score was calculated on 70% of the submission pairs and the private leaderboard score on the remaining 30%. The final submission was generated using the unsupervised approach for verifying same-class identity. The best single ConvNet scored `0.90717 AUC` on the private leaderboard, and an ensemble of the 18 best ConvNets trained during the hyperparameter search scored `0.92890 AUC`; adding more (worse) models to the ensemble started to hurt the overall performance. A single hypothesis was obtained from multiple models as a weighted average of their predictions for the painter recognition task, and only then was the inner product of the two averaged class distribution vectors calculated.
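
A sketch of that order of operations, average first and inner product second (the weights correspond to `CNNS_WEIGHTS` in [validation.py](painters/validation.py); the toy numbers below are made up):

```python
import numpy as np

def ensemble_pair_score(softmaxes_a, softmaxes_b, weights):
    # softmaxes_*: (num_models, num_classes) predictions for one image.
    p_a = np.average(softmaxes_a, axis=0, weights=weights)
    p_b = np.average(softmaxes_b, axis=0, weights=weights)
    return np.dot(p_a, p_b)

a = np.array([[0.8, 0.2], [0.6, 0.4]])
b = np.array([[0.7, 0.3], [0.9, 0.1]])
print(ensemble_pair_score(a, b, weights=[2, 1]))  # ~0.624
```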
94 |
95 | The administrator of the competition, [Kiri Nichol](https://www.kaggle.com/smallyellowduck), posted some very useful insights into the performance of the algorithm on the private test dataset. As stated on the competition [forum](https://www.kaggle.com/c/painter-by-numbers/forums/t/24970/wrapping-up), works by the ingenious Dutch forger Han van Meegeren were slipped into the test set in order to better understand how good the model is at extracting painters' unique styles. The forger replicated the work of some of the world's most famous artists, including the paintings of Johannes Vermeer. Below are a pairwise comparison table and a histogram of my best submission's predictions for the van Meegeren and Vermeer examples from the test set. Based on the model's predictions it can be seen that Vermeer's paintings are indeed more similar to each other than van Meegeren's paintings are to Vermeer's. Vermeer's paintings are also more similar to each other than van Meegeren's paintings are to each other, since van Meegeren forged paintings in the styles of several different artists.
96 |
97 |
98 |
99 | Pairwise comparison for van Meegeren and Vermeer paintings from the test set
100 |
101 |
102 | Another valuable insight concerns how the model extrapolates to artists that were not seen during training. The results are given as the AUC of my final submission for two different groups of instances from the test set: pairs of images whose painters were present in the training set scored `0.94218 AUC`, while pairs whose artists the model had never seen before scored `0.82509 AUC`. The results indicate that the model is not as good at generalizing to unknown classes.
103 |
104 | ### 4. Conclusion and further work
105 | Based on the results of the competition it can be concluded that convolutional neural networks are able to decompose the visual space of artwork images based on their painters' unique styles. The bad news is that the described algorithm is not good at extrapolating to unfamiliar artists, largely because same-identity verification is calculated directly from the two class distribution vectors.
106 |
107 | This was my first Kaggle competition and an excellent learning experience, and since I'm planning to continue this work as my upcoming master's degree thesis, it was also a great opportunity to learn about possible pitfalls and challenges in the domain. From this point forward my main focus will be on achieving better generalization by training the end-to-end metric learning approach, the siamese network, that was only briefly mentioned above.
108 |
109 | I would like to thank [Niko Colnerič](https://github.com/nikicc), [Tomaž Hočevar](https://github.com/thocevar), [Blaž Zupan](https://github.com/BlazZupan), [Jure Žbontar](https://github.com/jzbontar) and other members of the [Bioinformatics Laboratory from University of Ljubljana](http://www.biolab.si/en/) for their help and provision of the infrastructure.
110 |
111 | ### 5. Resources
112 | - [Bioinformatics Laboratory, University of Ljubljana](http://www.biolab.si/en/)
113 | - [Stanford CS231n notes](http://cs231n.github.io)
114 | - Very Deep Convolutional Networks for Large-Scale Image Recognition: Karen Simonyan, Andrew Zisserman
115 | - DeepFace: Closing the Gap to Human-Level Performance in Face Verification: Yaniv Taigman, Ming Yang, Marc'Aurelio Ranzato, Lior Wolf
116 | - Dimensionality Reduction by Learning an Invariant Mapping: Raia Hadsell, Sumit Chopra, Yann LeCun
117 | - Learning a similarity metric discriminatively, with application to face verification: Sumit Chopra, Raia Hadsell, Yann LeCun
118 | - [Keras: Deep Learning library](https://github.com/fchollet/keras)
119 | - [Competition](https://www.kaggle.com/c/painter-by-numbers) and its datasets that were prepared by [Kiri Nichol](https://www.kaggle.com/smallyellowduck)
120 |
--------------------------------------------------------------------------------
/misc/front.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/front.jpg
--------------------------------------------------------------------------------
/misc/num_examples_per_class.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/num_examples_per_class.png
--------------------------------------------------------------------------------
/misc/tsne_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_0.png
--------------------------------------------------------------------------------
/misc/tsne_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_1.png
--------------------------------------------------------------------------------
/misc/tsne_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_10.png
--------------------------------------------------------------------------------
/misc/tsne_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_2.png
--------------------------------------------------------------------------------
/misc/tsne_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_3.png
--------------------------------------------------------------------------------
/misc/tsne_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_4.png
--------------------------------------------------------------------------------
/misc/tsne_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_5.png
--------------------------------------------------------------------------------
/misc/tsne_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_6.png
--------------------------------------------------------------------------------
/misc/tsne_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_7.png
--------------------------------------------------------------------------------
/misc/tsne_8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_8.png
--------------------------------------------------------------------------------
/misc/tsne_9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/tsne_9.png
--------------------------------------------------------------------------------
/misc/vermeer_vs_van_meegeren.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/vermeer_vs_van_meegeren.png
--------------------------------------------------------------------------------
/misc/vermeer_vs_van_meegeren_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/misc/vermeer_vs_van_meegeren_hist.png
--------------------------------------------------------------------------------
/painters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inejc/painters/b8d2b3518c16a4d3e6fb3cdd825566f6eeaf5fda/painters/__init__.py
--------------------------------------------------------------------------------
/painters/cnn_embedding.py:
--------------------------------------------------------------------------------
1 | from os.path import join, isfile, basename, splitext, dirname
2 |
3 | import numpy as np
4 |
5 | from data_provider import DATA_DIR, load_organized_data_info
6 | from data_provider import MODELS_DIR, testing_generator
7 | from data_provider import init_directory_generator
8 | from train_cnn import load_trained_cnn_feature_maps_layer, PENULTIMATE_SIZE
9 | from train_cnn import load_trained_cnn_penultimate_layer, LAST_FEATURE_MAPS_SIZE
10 | from train_cnn import load_trained_cnn_softmax_layer, SOFTMAX_SIZE
11 | from utils import mkdirs_if_not_exist
12 |
13 | IMGS_DIM_1D = 256
14 | MODEL_NAME = 'cnn_2_9069_vl.h5'
15 | BATCH_SIZE = 512
16 |
17 | LAYER_RESULT_FUNCS = {
18 | 'feature_maps': load_trained_cnn_feature_maps_layer,
19 | 'penultimate': load_trained_cnn_penultimate_layer,
20 | 'softmax': load_trained_cnn_softmax_layer
21 | }
22 |
23 | LAYER_SIZES = {
24 | 'feature_maps': LAST_FEATURE_MAPS_SIZE,
25 | 'penultimate': PENULTIMATE_SIZE,
26 | 'softmax': SOFTMAX_SIZE
27 | }
28 |
29 |
30 | def get_embedded_train_val_split(layer, model_name=MODEL_NAME):
31 | assert layer in LAYER_RESULT_FUNCS.keys()
32 |
33 | model_path = join(MODELS_DIR, model_name)
34 | model_name_no_ext, _ = splitext(model_name)
35 | embedded_data_dir = join(
36 | DATA_DIR, 'embedding_{:s}'.format(model_name_no_ext))
37 | train_val_split_file = join(
38 | embedded_data_dir, 'train_val_split_{:s}.npz'.format(layer))
39 |
40 | if isfile(train_val_split_file):
41 | split = np.load(train_val_split_file)
42 | return split['arr_0'], split['arr_1'],\
43 | split['arr_2'], split['arr_3'],\
44 | split['arr_4'], split['arr_5']
45 | else:
46 | return _create_embedded_train_val_split(
47 | layer, model_path, train_val_split_file)
48 |
49 |
50 | def _create_embedded_train_val_split(layer, model_path, train_val_split_file):
51 | data_info = load_organized_data_info(IMGS_DIM_1D)
52 | dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
53 | dir_val, num_val = data_info['dir_val'], data_info['num_val']
54 |
55 | model = LAYER_RESULT_FUNCS[layer](model_path)
56 | gen = testing_generator(dir_tr=dir_tr)
57 |
58 | X_tr, y_tr, names_tr = _create_embedded_data_from_dir(
59 | model, gen, dir_tr, num_tr, LAYER_SIZES[layer])
60 |
61 | X_val, y_val, names_val = _create_embedded_data_from_dir(
62 | model, gen, dir_val, num_val, LAYER_SIZES[layer])
63 |
64 | _save_np_compressed_data(
65 | train_val_split_file, X_tr, y_tr, names_tr, X_val, y_val, names_val)
66 |
67 | return X_tr, y_tr, names_tr, X_val, y_val, names_val
68 |
69 |
70 | def get_embedded_test_set(layer, model_name=MODEL_NAME):
71 | assert layer in LAYER_RESULT_FUNCS.keys()
72 |
73 | model_path = join(MODELS_DIR, model_name)
74 | model_name_no_ext, _ = splitext(model_name)
75 | embedded_data_dir = join(
76 | DATA_DIR, 'embedding_{:s}'.format(model_name_no_ext))
77 | test_set_file = join(embedded_data_dir, 'test_set_{:s}.npz'.format(layer))
78 |
79 | if isfile(test_set_file):
80 | test_set = np.load(test_set_file)
81 | return test_set['arr_0'], test_set['arr_1']
82 | else:
83 | return _create_embedded_test_set(layer, model_path, test_set_file)
84 |
85 |
86 | def _create_embedded_test_set(layer, model_path, test_set_file):
87 | data_info = load_organized_data_info(IMGS_DIM_1D)
88 | dir_te, num_te = data_info['dir_te'], data_info['num_te']
89 | dir_tr = data_info['dir_tr']
90 |
91 | model = LAYER_RESULT_FUNCS[layer](model_path)
92 | gen = testing_generator(dir_tr=dir_tr)
93 |
94 | X_te, names = _create_embedded_data_from_dir(
95 | model, gen, dir_te, num_te, LAYER_SIZES[layer], is_test_set=True)
96 |
97 | _save_np_compressed_data(test_set_file, X_te, names)
98 | return X_te, names
99 |
100 |
101 | def _create_embedded_data_from_dir(
102 | model, gen, dir_, num_samples, layer_size, is_test_set=False):
103 | gen = init_directory_generator(
104 | gen, dir_, BATCH_SIZE, class_mode='sparse', shuffle_=False)
105 |
106 | X, y = _create_embedded_data_from_gen(model, gen, num_samples, layer_size)
107 | names = [basename(p) for p in gen.filenames]
108 |
109 | if is_test_set:
110 | return X, names
111 | return X, y, names
112 |
113 |
114 | def _create_embedded_data_from_gen(model, data_gen, num_samples, layer_size):
115 |     num_full_batches = num_samples // BATCH_SIZE
116 |     last_batch_size = num_samples - (num_full_batches * BATCH_SIZE)
117 |
118 |     if isinstance(layer_size, int):
119 |         X = np.empty((0, layer_size))
120 |     else:
121 |         X = np.empty((0, layer_size[0], layer_size[1], layer_size[2]))
122 |     y = np.empty((0,)).astype(int)
123 |
124 |     for i in range(num_full_batches + 1):  # extra pass for the partial batch
125 |         X_batch, y_batch = next(data_gen)
126 |
127 |         if i == num_full_batches:
128 |             X_batch = X_batch[:last_batch_size]
129 |             y_batch = y_batch[:last_batch_size]
130 |
131 |         X = np.vstack((X, model(X_batch)))
132 |         y = np.hstack((y, y_batch.astype(int)))
133 |
134 |     return X, y
135 |
136 |
137 | def _save_np_compressed_data(file_name, *args):
138 | mkdirs_if_not_exist(dirname(file_name))
139 | np.savez_compressed(file_name, *args)
140 |
141 |
142 | if __name__ == '__main__':
143 | get_embedded_train_val_split('penultimate')
144 | get_embedded_test_set('penultimate')
145 | get_embedded_train_val_split('softmax')
146 | get_embedded_test_set('softmax')
147 | get_embedded_train_val_split('feature_maps')
148 | get_embedded_test_set('feature_maps')
149 |
--------------------------------------------------------------------------------
/painters/data_dirs_organizer.py:
--------------------------------------------------------------------------------
1 | from os import mkdir, listdir, makedirs
2 | from os.path import join, abspath, basename, splitext, dirname
3 |
4 | import numpy as np
5 | from PIL.Image import LANCZOS
6 | from PIL.ImageOps import fit
7 | from keras.preprocessing.image import load_img
8 | from sklearn.cross_validation import StratifiedShuffleSplit
9 | from sklearn.preprocessing import LabelEncoder
10 |
11 | from data_provider import TRAIN_DIR, TEST_DIR, DATA_DIR
12 | from data_provider import load_train_info
13 | from data_provider import save_organized_data_info, load_organized_data_info
14 |
15 | IMGS_DIM_2D = (256, 256)
16 | NEW_TRAIN_DIR = join(DATA_DIR, 'train_{:d}'.format(IMGS_DIM_2D[0]))
17 | NEW_VAL_DIR = join(DATA_DIR, 'val_{:d}'.format(IMGS_DIM_2D[0]))
18 | NEW_TEST_DIR = join(DATA_DIR, 'test_{:d}'.format(IMGS_DIM_2D[0]))
19 | NEW_TEST_DIR = join(NEW_TEST_DIR, 'all')
20 | MULTI_CROP = '_multi_crop'
21 | VAL_SIZE = 0.1
22 |
23 |
24 | def _organize_train_test_dir():
25 | _organize_train_dir()
26 | _organize_test_dir()
27 |
28 |
29 | def _organize_train_dir():
30 | paths, labels = _load_paths_labels_from_train_dir()
31 | ind_tr, ind_val, classes = _train_val_split_indices(labels)
32 | _save_images_to_dir(NEW_TRAIN_DIR, paths[ind_tr], labels[ind_tr], classes)
33 | _save_images_to_dir(
34 | NEW_VAL_DIR, paths[ind_val], labels[ind_val], classes, multi_crop=False)
35 |
36 |
37 | def _load_paths_labels_from_train_dir():
38 | labels_lookup = load_train_info()
39 | paths, labels = [], []
40 | for name in listdir(TRAIN_DIR):
41 | abspath_ = abspath(join(TRAIN_DIR, name))
42 | paths.append(abspath_)
43 | labels.append(labels_lookup[name])
44 |
45 | return np.array(paths), LabelEncoder().fit_transform(labels)
46 |
47 |
48 | def _train_val_split_indices(labels):
49 | split = StratifiedShuffleSplit(
50 | labels, n_iter=1, test_size=VAL_SIZE, random_state=42)
51 | indices_tr, indices_val = next(iter(split))
52 |
53 | _save_organized_data_info(
54 | split.classes, indices_tr, indices_val, multi_crop=False)
55 | _save_organized_data_info(
56 | split.classes, indices_tr, indices_val, multi_crop=True)
57 | return indices_tr, indices_val, split.classes
58 |
59 |
60 | def _save_organized_data_info(classes, indices_tr, indices_val, multi_crop):
61 | dir_tr = NEW_TRAIN_DIR + MULTI_CROP if multi_crop else NEW_TRAIN_DIR
62 | num_tr = 5 * len(indices_tr) if multi_crop else len(indices_tr)
63 | info = {
64 | 'dir_tr': dir_tr,
65 | 'num_tr': num_tr,
66 | 'dir_val': NEW_VAL_DIR,
67 | 'num_val': len(indices_val),
68 | 'num_distinct_cls': len(classes),
69 | 'dir_te': dirname(NEW_TEST_DIR)
70 | }
71 | save_organized_data_info(info, IMGS_DIM_2D[0], multi_crop)
72 |
73 |
74 | def _save_images_to_dir(
75 | dest_dir, src_paths, labels, distinct_classes, multi_crop=True):
76 |
77 | _make_dir_tree(dest_dir, distinct_classes)
78 | if multi_crop:
79 | _make_dir_tree(dest_dir + MULTI_CROP, distinct_classes)
80 |
81 | for src_path, label in zip(src_paths, labels):
82 | dest_path = join(join(dest_dir, str(label)), basename(src_path))
83 | scaled_cropped_image = _save_scaled_cropped_img(src_path, dest_path)
84 |
85 | if multi_crop:
86 | _save_multi_cropped_to_dir(
87 | src_path, dest_dir, label, scaled_cropped_image)
88 |
89 |
90 | def _make_dir_tree(dir_, classes):
91 | mkdir(dir_)
92 | for class_ in classes:
93 | class_dir = join(dir_, str(class_))
94 | mkdir(class_dir)
95 |
96 |
97 | def _save_multi_cropped_to_dir(src_path, dest_dir, label, scaled_cropped_image):
98 | multi_crop_dir = join(dest_dir + MULTI_CROP, str(label))
99 | dest_path_multi_crop = join(multi_crop_dir, basename(src_path))
100 | scaled_cropped_image.save(dest_path_multi_crop)
101 | _save_multi_cropped_imgs(src_path, dest_path_multi_crop)
102 |
103 |
104 | def _save_multi_cropped_imgs(src, dest):
105 | image = load_img(src)
106 | image, crop_coordinates = _prepare_image_for_cropping(image)
107 |
108 | dest_no_ext, ext = splitext(dest)
109 | for i, crop_position in enumerate(crop_coordinates):
110 | dest_i = "{:s}_{:d}{:s}".format(dest_no_ext, i, ext)
111 | cropped_img = image.crop(box=crop_position)
112 |
113 |         assert cropped_img.size == IMGS_DIM_2D, \
114 |             'Cropped image dimension is {}, instead of {}' \
115 |             .format(cropped_img.size, IMGS_DIM_2D)
116 |
117 | cropped_img.save(dest_i)
118 |
119 |
120 | def _prepare_image_for_cropping(image):
121 | width, height = image.size
122 |
123 | fixed_width = IMGS_DIM_2D[0] if width < IMGS_DIM_2D[0] else width
124 | fixed_height = IMGS_DIM_2D[1] if height < IMGS_DIM_2D[1] else height
125 | if (fixed_width, fixed_height) != image.size:
126 | image = image.resize((fixed_width, fixed_height), resample=LANCZOS)
127 |
128 | crop_coordinates = [
129 | (0, 0, IMGS_DIM_2D[0], IMGS_DIM_2D[1]),
130 | (fixed_width - IMGS_DIM_2D[0], 0, fixed_width, IMGS_DIM_2D[1]),
131 | (0, fixed_height - IMGS_DIM_2D[1], IMGS_DIM_2D[0], fixed_height),
132 | (fixed_width - IMGS_DIM_2D[0], fixed_height - IMGS_DIM_2D[1],
133 | fixed_width, fixed_height),
134 | ]
135 |
136 | return image, crop_coordinates
137 |
138 |
139 | def _organize_test_dir():
140 | makedirs(NEW_TEST_DIR)
141 |
142 | num_test_samples = 0
143 | for name in listdir(TEST_DIR):
144 | src_path = abspath(join(TEST_DIR, name))
145 | dest_path = join(NEW_TEST_DIR, name)
146 | _save_scaled_cropped_img(src_path, dest_path)
147 | num_test_samples += 1
148 |
149 | _append_num_te_to_organized_data_info(num_test_samples, multi_crop=False)
150 | _append_num_te_to_organized_data_info(num_test_samples, multi_crop=True)
151 |
152 |
153 | def _save_scaled_cropped_img(src, dest):
154 | image = load_img(src)
155 | image = fit(image, IMGS_DIM_2D, method=LANCZOS)
156 | image.save(dest)
157 | return image
158 |
159 |
160 | def _append_num_te_to_organized_data_info(num_test_samples, multi_crop):
161 | data_info = load_organized_data_info(IMGS_DIM_2D[0], multi_crop=multi_crop)
162 | data_info['num_te'] = num_test_samples
163 | save_organized_data_info(data_info, IMGS_DIM_2D[0], multi_crop=multi_crop)
164 |
165 |
166 | if __name__ == '__main__':
167 | _organize_train_test_dir()
168 |
--------------------------------------------------------------------------------
/painters/data_provider.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from itertools import combinations, chain
3 | from json import load, dump
4 | from math import ceil
5 | from os import listdir
6 | from os.path import join, dirname, isfile, abspath, isdir, basename
7 | from random import shuffle
8 |
9 | import numpy as np
10 | from keras.preprocessing.image import ImageDataGenerator
11 |
12 | from utils import read_lines, load_img_arr
13 |
14 | DATA_DIR = join(dirname(dirname(__file__)), 'data')
15 | TEST_DIR = join(DATA_DIR, 'test')
16 | TRAIN_DIR = join(DATA_DIR, 'train')
17 | TRAIN_INFO_FILE = join(DATA_DIR, 'train_info.csv')
18 | SUBMISSION_INFO_FILE = join(DATA_DIR, 'submission_info.csv')
19 | ORGANIZED_DATA_INFO_FILE = 'organized_data_info_.json'
20 | MODELS_DIR = join(dirname(dirname(__file__)), 'models')
21 | MISC_DIR = join(dirname(dirname(__file__)), 'misc')
22 |
23 |
24 | def train_val_dirs_generators(
25 | batch_size, dir_tr, dir_val, target_size=(256, 256)):
26 | gen_tr = _train_generator()
27 | gen_val = _val_generator()
28 |
29 | sample = apply_to_images_in_subdirs(dir_tr, load_img_arr, num_per_cls=10)
30 | sample = np.array(sample)
31 | gen_tr.fit(sample)
32 | gen_val.fit(sample)
33 |
34 | gen_tr = init_directory_generator(
35 | gen_tr, dir_tr, batch_size, target_size=target_size)
36 | gen_val = init_directory_generator(
37 | gen_val, dir_val, batch_size, target_size=target_size)
38 | return gen_tr, gen_val
39 |
40 |
41 | def _train_generator():
42 | return ImageDataGenerator(
43 | featurewise_center=True,
44 | featurewise_std_normalization=True,
45 | rotation_range=180,
46 | zoom_range=0.2,
47 | width_shift_range=0.2,
48 | height_shift_range=0.2,
49 | shear_range=0.3,
50 | horizontal_flip=True,
51 | vertical_flip=True,
52 | fill_mode='reflect')
53 |
54 |
55 | def testing_generator(dir_tr):
56 | gen = _val_generator()
57 | sample = apply_to_images_in_subdirs(dir_tr, load_img_arr, num_per_cls=10)
58 | sample = np.array(sample)
59 | gen.fit(sample)
60 | return gen
61 |
62 |
63 | def _val_generator():
64 | return ImageDataGenerator(
65 | featurewise_center=True,
66 | featurewise_std_normalization=True)
67 |
68 |
69 | def apply_to_images_in_subdirs(parent_dir, func, num_per_cls=None, **kwargs):
70 | results = []
71 | for cls_dir_name in listdir(parent_dir):
72 | cls_dir = abspath(join(parent_dir, cls_dir_name))
73 | r = _apply_to_first_n_in_dir(func, cls_dir, num_per_cls, **kwargs)
74 | results += r
75 | return results
76 |
77 |
78 | def _apply_to_first_n_in_dir(func, dir_, num_per_cls, **kwargs):
79 | if not isdir(dir_):
80 | return []
81 | results = []
82 | for path in listdir(dir_)[:num_per_cls]:
83 | abspath_ = abspath(join(dir_, path))
84 | result = func(abspath_, **kwargs)
85 | results.append(result)
86 | return results
87 |
88 |
89 | def init_directory_generator(
90 | gen, dir_, batch_size, target_size=(256, 256),
91 | class_mode='categorical', shuffle_=True):
92 |
93 | return gen.flow_from_directory(
94 | dir_,
95 | class_mode=class_mode,
96 | batch_size=batch_size,
97 | target_size=target_size,
98 | shuffle=shuffle_)
99 |
100 |
101 | def train_val_pairs_dirs_generators(
102 | batch_size, dir_tr, dir_val, num_groups_tr,
103 | num_groups_val, num_samples_per_cls_val=None):
104 |
105 | gen_tr = PairsImageDataGenerator(
106 | featurewise_center=True,
107 | featurewise_std_normalization=True,
108 | rotation_range=180,
109 | zoom_range=0.2,
110 | width_shift_range=0.2,
111 | height_shift_range=0.2,
112 | shear_range=0.3,
113 | horizontal_flip=True,
114 | vertical_flip=True,
115 | fill_mode='reflect')
116 | gen_val = PairsImageDataGenerator(
117 | featurewise_center=True,
118 | featurewise_std_normalization=True)
119 |
120 | sample = apply_to_images_in_subdirs(dir_tr, load_img_arr, num_per_cls=10)
121 | sample = np.array(sample)
122 | gen_tr.fit(sample)
123 | gen_val.fit(sample)
124 |
125 | gen_tr = gen_tr.flow_from_directory(
126 | dir_tr, batch_size=batch_size, num_groups=num_groups_tr)
127 | gen_val = gen_val.flow_from_directory(
128 | dir_val, batch_size=batch_size, num_groups=num_groups_val,
129 | num_samples_per_cls=num_samples_per_cls_val)
130 | return gen_tr, gen_val
131 |
132 |
133 | class PairsImageDataGenerator(ImageDataGenerator):
134 |
135 | def __init__(self, *args, **kwargs):
136 | super().__init__(*args, **kwargs)
137 |
138 | def flow(self, X, y=None, batch_size=32, shuffle=True, seed=None,
139 | save_to_dir=None, save_prefix='', save_format='jpeg',
140 | num_groups=43, num_samples_per_cls=None):
141 |
142 | raise NotImplementedError
143 |
144 | def flow_from_directory(
145 | self, dir_, target_size=(256, 256), color_mode='rgb',
146 | classes=None, class_mode='categorical', batch_size=32,
147 | shuffle=True, seed=None, save_to_dir=None, save_prefix='',
148 | save_format='jpeg', num_groups=43, num_samples_per_cls=None):
149 |
150 | return PairsDirectoryIterator(
151 | dir_, num_groups, self, batch_size, num_samples_per_cls)
152 |
153 |
154 | def inf_pairs_generator(batch_size, X, y, num_groups, num_samples_per_cls=None):
155 | return PairsNumpyArrayIterator(
156 | X, y, num_groups, batch_size, num_samples_per_cls)
157 |
158 |
159 | class PairsNumpyArrayIterator(object):
160 |
161 | def __init__(self, X, y, num_groups, batch_size=32, num_per_cls=None):
162 | if num_per_cls:
163 | self.X, self.y = self._select_num_per_cls_samples(X, y, num_per_cls)
164 | else:
165 | self.X, self.y = X, y
166 | self.num_groups = num_groups
167 | self.batch_size = batch_size
168 | self._init_pairs_generator()
169 | self.lock = threading.Lock()
170 |
171 | def _init_pairs_generator(self):
172 | self.pairs_generator = pairs_generator(
173 | self.X, self.y, self.batch_size,
174 | lambda a, b: [a, b], self.num_groups)
175 |
176 | @staticmethod
177 | def _select_num_per_cls_samples(X, y, num_per_cls):
178 | X_sub, y_sub = np.empty((0,) + X.shape[1:]), np.empty((0,))
179 | for cls in set(y):
180 | X_sub = np.vstack((X_sub, X[y == cls][:num_per_cls]))
181 | y_sub = np.hstack((y_sub, y[y == cls][:num_per_cls]))
182 | return X_sub, y_sub
183 |
184 | def __iter__(self):
185 | return self
186 |
187 | def __next__(self):
188 | with self.lock:
189 | try:
190 | X_batch, y_batch = next(self.pairs_generator)
191 | except StopIteration:
192 | # todo: implement this properly :)
193 | self._init_pairs_generator()
194 | X_batch, y_batch = next(self.pairs_generator)
195 | return [X_batch[:, 0], X_batch[:, 1]], y_batch
196 |
197 |
198 | class PairsDirectoryIterator(object):
199 |
200 | def __init__(self, dir_, num_groups, image_data_generator,
201 | batch_size=32, num_samples_per_cls=None):
202 |
203 | paths, y = self._get_paths_labels_from_dir(dir_, num_samples_per_cls)
204 | self.paths = paths
205 | self.y = y
206 | self.num_groups = num_groups
207 | self.batch_size = batch_size
208 | self._init_pairs_generator()
209 | self.image_data_generator = image_data_generator
210 | self.lock = threading.Lock()
211 |
212 | @staticmethod
213 | def _get_paths_labels_from_dir(dir_, num_per_cls):
214 | def path_label(p): return [p, basename(dirname(p))]
215 | paths_labels = apply_to_images_in_subdirs(dir_, path_label, num_per_cls)
216 | paths_labels = np.array(paths_labels)
217 | return paths_labels[:, 0], paths_labels[:, 1].astype(int)
218 |
219 | def _init_pairs_generator(self):
220 | self.pairs_generator = pairs_generator(
221 | self.paths, self.y, self.batch_size,
222 | lambda a, b: [a, b], self.num_groups)
223 |
224 | def __iter__(self):
225 | return self
226 |
227 | def __next__(self):
228 | with self.lock:
229 | try:
230 | paths_batch, y_batch = next(self.pairs_generator)
231 | except StopIteration:
232 | # todo: implement this properly :)
233 | self._init_pairs_generator()
234 | paths_batch, y_batch = next(self.pairs_generator)
235 |
236 | X_batch = []
237 | for path_a, path_b in paths_batch:
238 | image_a, image_b = load_img_arr(path_a), load_img_arr(path_b)
239 | image_a = self._std_random_transform_img(image_a)
240 | image_b = self._std_random_transform_img(image_b)
241 | X_batch.append([image_a, image_b])
242 | X_batch = np.array(X_batch)
243 |
244 | return [X_batch[:, 0], X_batch[:, 1]], y_batch
245 |
246 | def _std_random_transform_img(self, img):
247 | img = self.image_data_generator.random_transform(img)
248 | return self.image_data_generator.standardize(img)
249 |
250 |
251 | def pairs_generator(X, y, batch_size, pair_func, num_groups):
252 | grouped_indices = _split_into_groups(y, num_groups)
253 | merged_combinations = _merge_within_groups_combinations(grouped_indices)
254 |
255 |     while True:
256 |         X_batch, y_batch = [], []
257 |         for _ in range(batch_size):
258 |             try:
259 |                 pair_indices = next(merged_combinations)
260 |             except StopIteration:
261 |                 if X_batch:  # flush the final partial batch
262 |                     yield np.array(X_batch), np.array(y_batch)
263 |                 return
264 |             index_a, index_b = int(pair_indices[0]), int(pair_indices[1])
265 |             X_batch.append(pair_func(X[index_a], X[index_b]))
266 |             y_batch.append(int(y[index_a] == y[index_b]))
267 |
268 |         yield np.array(X_batch), np.array(y_batch)
269 |
270 |
271 | def _split_into_groups(y, num_groups):
272 | groups = [[] for _ in range(num_groups)]
273 | group_index = 0
274 |
275 | for cls in set(y):
276 | this_cls_indices = np.where(y == cls)[0]
277 | num_cls_samples = len(this_cls_indices)
278 |
279 | num_cls_split_groups = ceil(num_cls_samples / 500)
280 | split = np.array_split(this_cls_indices, num_cls_split_groups)
281 |
282 | for cls_group in split:
283 | groups[group_index] = np.hstack((groups[group_index], cls_group))
284 | group_index = (group_index + 1) % num_groups
285 |
286 | return groups
287 |
288 |
289 | def _merge_within_groups_combinations(grouped_indices):
290 | for gi in grouped_indices:
291 | shuffle(gi)
292 | group_combinations = [combinations(gi, 2) for gi in grouped_indices]
293 | shuffle(group_combinations)
294 | return chain.from_iterable(group_combinations)
295 |
296 |
297 | def load_train_info():
298 | train_info = read_lines(TRAIN_INFO_FILE)[1:]
299 | parsed_train_info = {}
300 | # filename,artist,title,style,genre,date
301 | for l in train_info:
302 | split = l.split(',')
303 | parsed_train_info[split[0]] = split[1]
304 | return parsed_train_info
305 |
306 |
307 | def load_organized_data_info(imgs_dim_1d, multi_crop=False):
308 | if not isfile(_organized_data_info_file_dim(imgs_dim_1d, multi_crop)):
309 | raise FileNotFoundError('Run data_dirs_organizer first')
310 | with open(_organized_data_info_file_dim(imgs_dim_1d, multi_crop), 'r') as f:
311 | return load(f)
312 |
313 |
314 | def save_organized_data_info(info, imgs_dim_1d, multi_crop=False):
315 | with open(_organized_data_info_file_dim(imgs_dim_1d, multi_crop), 'w') as f:
316 | dump(info, f)
317 |
318 |
319 | def _organized_data_info_file_dim(imgs_dim_1d, multi_crop=False):
320 | split = ORGANIZED_DATA_INFO_FILE.split('.')
321 | split[0] += str(imgs_dim_1d)
322 | if multi_crop:
323 | split[0] += '_multi_crop'
324 | return join(DATA_DIR, '.'.join(split))
325 |
--------------------------------------------------------------------------------
/painters/submission.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import numpy as np
4 |
5 | from cnn_embedding import get_embedded_test_set
6 | from data_provider import SUBMISSION_INFO_FILE, DATA_DIR
7 | from data_provider import load_organized_data_info
8 | from utils import append_to_file
9 | from utils import read_lines_in_batches
10 | from validation import CNNS_WEIGHTS
11 |
12 | IMGS_DIM_1D = 256
13 | SUBMISSION_FILE = join(DATA_DIR, 'submission.csv')
14 | BATCH_SIZE = 1000000
15 | FILES_TO_AVG = {}
16 |
17 |
18 | def _create_submission_file_avg_cnns():
19 | data_info = load_organized_data_info(IMGS_DIM_1D)
20 | X_avg, names = _average_embedded_test_data(data_info)
21 | features_lookup = {n: f for n, f in zip(names, X_avg)}
22 | _create_submission_file(
23 | BATCH_SIZE, features_lookup, _calculate_batch_prediction_dot)
24 |
25 |
26 | def _average_embedded_test_data(data_info):
27 | X_avg, names =\
28 | np.zeros((data_info['num_te'], data_info['num_distinct_cls'])), None
29 | for model, weight in CNNS_WEIGHTS.items():
30 | X, names = get_embedded_test_set('softmax', model_name=model)
31 | X_avg += weight * X
32 |
33 | X_avg /= sum(CNNS_WEIGHTS.values())
34 | return X_avg, names
35 |
36 |
37 | def _calculate_batch_prediction_dot(lines, features_lookup):
38 | y_pred, submission_indices = [], []
39 |
40 | for line in lines:
41 | submission_indices.append(line[0])
42 | image_feature_a = features_lookup[line[1]]
43 | image_feature_b = features_lookup[line[2]]
44 | y_pred.append(np.dot(image_feature_a, image_feature_b))
45 |
46 | return y_pred, submission_indices
47 |
48 |
49 | def _create_submission_file(batch_size, features_lookup, batch_predict_func):
50 | append_to_file(["index,sameArtist\n"], SUBMISSION_FILE)
51 |
52 | for batch in read_lines_in_batches(SUBMISSION_INFO_FILE, batch_size):
53 | y_pred, indices = batch_predict_func(batch, features_lookup)
54 | lines = ["{:s},{:f}\n".format(i, p) for i, p in zip(indices, y_pred)]
55 | append_to_file(lines, SUBMISSION_FILE)
56 |
57 |
58 | def _average_submission_files():
59 | lines_gens, weights = [], []
60 |
61 | for file_name, weight in FILES_TO_AVG.items():
62 | file_path = join(DATA_DIR, file_name)
63 | lines_gen = read_lines_in_batches(file_path, batch_size=BATCH_SIZE)
64 | lines_gens.append(lines_gen)
65 | weights.append(weight)
66 |
67 | append_to_file(["index,sameArtist\n"], SUBMISSION_FILE)
68 |
69 | while True:
70 | try:
71 | _average_write_next_batch(lines_gens, weights)
72 | except StopIteration:
73 | return
74 |
75 |
76 | def _average_write_next_batch(lines_gens, weights):
77 | separated_lines = [next(lg) for lg in lines_gens]
78 | merged_lines = zip(*separated_lines)
79 |
80 | result_lines = []
81 |
82 | for same_example_lines in merged_lines:
83 | example_index = same_example_lines[0][0]
84 | preds = [float(l[1]) for l in same_example_lines]
85 | pred_avg = sum(w * p for w, p in zip(weights, preds)) / sum(weights)
86 | result_lines.append("{:s},{:f}\n".format(example_index, pred_avg))
87 |
88 | append_to_file(result_lines, SUBMISSION_FILE)
89 |
90 |
91 | if __name__ == '__main__':
92 | _create_submission_file_avg_cnns()
93 | # _average_submission_files()
94 |
--------------------------------------------------------------------------------
/painters/train_cnn.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import keras.backend as K
4 | from keras.callbacks import ModelCheckpoint
5 | from keras.layers import Conv2D, Dense, Activation, MaxPooling2D
6 | from keras.layers import Flatten, BatchNormalization, Dropout
7 | from keras.layers.advanced_activations import PReLU
8 | from keras.models import Sequential, load_model
9 | from keras.optimizers import Adam
10 | from keras.regularizers import l2
11 |
12 | from data_provider import MODELS_DIR
13 | from data_provider import load_organized_data_info
14 | from data_provider import train_val_dirs_generators
15 |
16 | IMGS_DIM_3D = (3, 256, 256)
17 | CNN_MODEL_FILE = join(MODELS_DIR, 'cnn.h5')
18 | MAX_EPOCHS = 500
19 | BATCH_SIZE = 96
20 | L2_REG = 0.003
21 | W_INIT = 'he_normal'
22 | LAST_FEATURE_MAPS_LAYER = 46
23 | LAST_FEATURE_MAPS_SIZE = (128, 8, 8)
24 | PENULTIMATE_LAYER = 51
25 | PENULTIMATE_SIZE = 2048
26 | SOFTMAX_LAYER = 55
27 | SOFTMAX_SIZE = 1584
28 |
29 |
30 | def _train_model():
31 | data_info = load_organized_data_info(IMGS_DIM_3D[1])
32 | dir_tr = data_info['dir_tr']
33 | dir_val = data_info['dir_val']
34 |
35 | gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)
36 | model = _cnn(IMGS_DIM_3D)
37 |
38 | model.fit_generator(
39 | generator=gen_tr,
40 | nb_epoch=MAX_EPOCHS,
41 | samples_per_epoch=data_info['num_tr'],
42 | validation_data=gen_val,
43 | nb_val_samples=data_info['num_val'],
44 | callbacks=[ModelCheckpoint(CNN_MODEL_FILE, save_best_only=True)],
45 | verbose=2)
46 |
47 |
48 | def _cnn(imgs_dim, compile_=True):
49 | model = Sequential()
50 |
51 | model.add(_convolutional_layer(nb_filter=16, input_shape=imgs_dim))
52 | model.add(BatchNormalization(axis=1, mode=2))
53 | model.add(PReLU(init=W_INIT))
54 | model.add(_convolutional_layer(nb_filter=16))
55 | model.add(BatchNormalization(axis=1, mode=2))
56 | model.add(PReLU(init=W_INIT))
57 | model.add(MaxPooling2D(pool_size=(2, 2)))
58 |
59 | model.add(_convolutional_layer(nb_filter=32))
60 | model.add(BatchNormalization(axis=1, mode=2))
61 | model.add(PReLU(init=W_INIT))
62 | model.add(_convolutional_layer(nb_filter=32))
63 | model.add(BatchNormalization(axis=1, mode=2))
64 | model.add(PReLU(init=W_INIT))
65 | model.add(_convolutional_layer(nb_filter=32))
66 | model.add(BatchNormalization(axis=1, mode=2))
67 | model.add(PReLU(init=W_INIT))
68 | model.add(MaxPooling2D(pool_size=(2, 2)))
69 |
70 | model.add(_convolutional_layer(nb_filter=64))
71 | model.add(BatchNormalization(axis=1, mode=2))
72 | model.add(PReLU(init=W_INIT))
73 | model.add(_convolutional_layer(nb_filter=64))
74 | model.add(BatchNormalization(axis=1, mode=2))
75 | model.add(PReLU(init=W_INIT))
76 | model.add(_convolutional_layer(nb_filter=64))
77 | model.add(BatchNormalization(axis=1, mode=2))
78 | model.add(PReLU(init=W_INIT))
79 | model.add(MaxPooling2D(pool_size=(2, 2)))
80 |
81 | model.add(_convolutional_layer(nb_filter=128))
82 | model.add(BatchNormalization(axis=1, mode=2))
83 | model.add(PReLU(init=W_INIT))
84 | model.add(_convolutional_layer(nb_filter=128))
85 | model.add(BatchNormalization(axis=1, mode=2))
86 | model.add(PReLU(init=W_INIT))
87 | model.add(_convolutional_layer(nb_filter=128))
88 | model.add(BatchNormalization(axis=1, mode=2))
89 | model.add(PReLU(init=W_INIT))
90 | model.add(MaxPooling2D(pool_size=(2, 2)))
91 |
92 | model.add(_convolutional_layer(nb_filter=256))
93 | model.add(BatchNormalization(axis=1, mode=2))
94 | model.add(PReLU(init=W_INIT))
95 | model.add(_convolutional_layer(nb_filter=256))
96 | model.add(BatchNormalization(axis=1, mode=2))
97 | model.add(PReLU(init=W_INIT))
98 | model.add(_convolutional_layer(nb_filter=256))
99 | model.add(BatchNormalization(axis=1, mode=2))
100 | model.add(PReLU(init=W_INIT))
101 | model.add(MaxPooling2D(pool_size=(2, 2)))
102 | model.add(Dropout(p=0.5))
103 |
104 | model.add(Flatten())
105 | model.add(_dense_layer(output_dim=PENULTIMATE_SIZE))
106 | model.add(BatchNormalization(mode=2))
107 | model.add(PReLU(init=W_INIT))
108 |
109 | if compile_:
110 | model.add(Dropout(p=0.5))
111 | model.add(_dense_layer(output_dim=SOFTMAX_SIZE))
112 | model.add(BatchNormalization(mode=2))
113 | model.add(Activation(activation='softmax'))
114 | return compile_model(model)
115 |
116 | return model
117 |
118 |
119 | def _convolutional_layer(nb_filter, input_shape=None):
120 | if input_shape:
121 | return _first_convolutional_layer(nb_filter, input_shape)
122 | else:
123 | return _intermediate_convolutional_layer(nb_filter)
124 |
125 |
126 | def _first_convolutional_layer(nb_filter, input_shape):
127 | return Conv2D(
128 | nb_filter=nb_filter, nb_row=3, nb_col=3, input_shape=input_shape,
129 | border_mode='same', init=W_INIT, W_regularizer=l2(l=L2_REG))
130 |
131 |
132 | def _intermediate_convolutional_layer(nb_filter):
133 | return Conv2D(
134 | nb_filter=nb_filter, nb_row=3, nb_col=3, border_mode='same',
135 | init=W_INIT, W_regularizer=l2(l=L2_REG))
136 |
137 |
138 | def _dense_layer(output_dim):
139 | return Dense(output_dim=output_dim, W_regularizer=l2(l=L2_REG), init=W_INIT)
140 |
141 |
142 | def compile_model(model):
143 | adam = Adam(lr=0.000074)
144 | model.compile(
145 | loss='categorical_crossentropy',
146 | optimizer=adam,
147 | metrics=['accuracy'])
148 | return model
149 |
150 |
151 | def load_trained_cnn_feature_maps_layer(model_path):
152 | return _load_trained_cnn_layer(model_path, LAST_FEATURE_MAPS_LAYER)
153 |
154 |
155 | def load_trained_cnn_penultimate_layer(model_path):
156 | return _load_trained_cnn_layer(model_path, PENULTIMATE_LAYER)
157 |
158 |
159 | def load_trained_cnn_softmax_layer(model_path):
160 | return _load_trained_cnn_layer(model_path, SOFTMAX_LAYER)
161 |
162 |
163 | def _load_trained_cnn_layer(model_path, layer_index):
164 |     model = load_model(model_path)
165 |     dense_output = K.function(
166 |         [model.layers[0].input, K.learning_phase()],
167 |         [model.layers[layer_index].output])
168 |     # learning_phase 0 = test mode (dropout off, batch norm in inference mode)
169 |     return lambda X: dense_output([X, 0])[0]
170 |
171 |
172 | if __name__ == '__main__':
173 | _train_model()
174 |
--------------------------------------------------------------------------------
/painters/utils.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from csv import reader
3 | from itertools import islice
4 | from os import makedirs
5 | from os.path import isdir
6 |
7 | import numpy as np
8 | from keras.preprocessing.image import load_img, img_to_array
9 |
10 |
11 | def read_lines(file_name):
12 | with open(file_name, 'r') as f:
13 | return list(f)
14 |
15 |
16 | def read_lines_in_batches(file_name, batch_size, num_skip=1):
17 |     with open(file_name, 'r') as f:
18 |         list(islice(f, num_skip))  # skip the header line(s)
19 |         reader_ = reader(f)
20 |         while True:
21 |             batch = list(islice(reader_, batch_size))
22 |             if not batch:  # file length was an exact multiple of batch_size
23 |                 return
24 |             yield batch
25 |             if len(batch) < batch_size:
26 |                 return
27 |
28 |
29 | def save_pickle(obj, file_name):
30 | with open(file_name, 'wb') as f:
31 | pickle.dump(obj, f)
32 |
33 |
34 | def load_pickle(file_name):
35 | with open(file_name, 'rb') as f:
36 | return pickle.load(f)
37 |
38 |
39 | def append_to_file(lines, file_name):
40 | with open(file_name, 'a') as f:
41 | f.writelines(lines)
42 |
43 |
44 | def mkdirs_if_not_exist(path):
45 | if not isdir(path):
46 | makedirs(path)
47 |
48 |
49 | def load_img_arr(p):
50 | return img_to_array(load_img(p))
51 |
52 |
53 | def pairs_dot(X):
54 | return np.sum(X[:, 0] * X[:, 1], axis=1)
55 |
--------------------------------------------------------------------------------
/painters/validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.metrics import roc_auc_score
3 |
4 | from cnn_embedding import get_embedded_train_val_split
5 | from data_provider import pairs_generator, load_organized_data_info
6 | from utils import pairs_dot
7 |
8 | IMGS_DIM_1D = 256
9 | CNNS_WEIGHTS = {
10 | 'cnn_2_9069_vl.h5': 11,
11 | 'cnn_2_9273_vl.h5': 11,
12 | 'cnn_2_9330_vl.h5': 11,
13 | 'cnn_2_9549_vl.h5': 10,
14 | 'cnn_2_9675_vl.h5': 10,
15 | 'cnn_2_9678_vl.h5': 9,
16 | 'cnn_2_9755_vl.h5': 9,
17 | 'cnn_2_9806_vl.h5': 9,
18 | 'cnn_2_9924_vl.h5': 9,
19 | 'cnn_2_9979_vl.h5': 9,
20 | 'cnn_3_0069_vl.h5': 9,
21 | 'cnn_3_0236_vl.h5': 8,
22 | 'cnn_3_0256_vl.h5': 8,
23 | 'cnn_3_0416_vl.h5': 7,
24 | 'cnn_3_0453_vl.h5': 7,
25 | 'cnn_3_0456_vl.h5': 6,
26 | 'cnn_3_0743_vl.h5': 4,
27 | 'cnn_3_0752_vl.h5': 4,
28 | }
29 |
30 |
31 | def _softmax_dot():
32 | data_info = load_organized_data_info(IMGS_DIM_1D)
33 | X_avg, y_val = _average_embedded_val_data(data_info)
34 |
35 | batches_val = _create_pairs_generator(
36 | X_avg, y_val, lambda u, v: [u, v],
37 | num_groups=32,
38 | batch_size=1000000)
39 |
40 | y_pred, y_true = np.array([]), np.array([])
41 | for X, y in batches_val:
42 | y_pred = np.hstack((y_pred, pairs_dot(X)))
43 | y_true = np.hstack((y_true, y))
44 |
45 | print("Validation AUC: {:.4f}".format(roc_auc_score(y_true, y_pred)))
46 |
47 |
48 | def _average_embedded_val_data(data_info):
49 | X_avg, y_val =\
50 | np.zeros((data_info['num_val'], data_info['num_distinct_cls'])), None
51 |
52 | for model, weight in CNNS_WEIGHTS.items():
53 | print("Model: {:s}".format(model))
54 | split = get_embedded_train_val_split('softmax', model_name=model)
55 | _, _, _, X_val, y_val, _ = split
56 | X_avg += weight * X_val
57 |
58 | X_avg /= sum(CNNS_WEIGHTS.values())
59 | return X_avg, y_val
60 |
61 |
62 | def _create_pairs_generator(X, y, pairs_func, num_groups, batch_size):
63 | return pairs_generator(
64 | X, y,
65 | batch_size=batch_size,
66 | pair_func=pairs_func,
67 | num_groups=num_groups)
68 |
69 |
70 | if __name__ == '__main__':
71 | _softmax_dot()
72 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cycler==0.10.0
2 | h5py==2.6.0
3 | -e git+https://github.com/fchollet/keras.git@2c510530b1b7463df20d44b656d1f884eb674ad6#egg=Keras
4 | matplotlib==1.5.1
5 | numpy==1.11.1
6 | Pillow==3.3.0
7 | protobuf==3.0.0b2
8 | pyparsing==2.1.5
9 | python-dateutil==2.5.3
10 | pytz==2016.6.1
11 | PyYAML==3.11
12 | scikit-learn==0.17.1
13 | scipy==0.18.0
14 | six==1.10.0
15 | Theano==0.8.2
--------------------------------------------------------------------------------