├── .dvc ├── .gitignore └── config ├── .gitignore ├── README.md ├── requirements.txt └── train.py /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | state 2 | lock 3 | config.local 4 | updater 5 | cache 6 | /lock 7 | /config.local 8 | /updater 9 | /updater.lock 10 | /state-journal 11 | /state-wal 12 | /state 13 | /cache 14 | /tmp 15 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iterative/example-versioning/3f6ec3ad3af6141e5b171322ff04748945337fe2/.dvc/config -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Example: Versioning 2 | 3 | Datasets and ML model getting started 4 | [versioning tutorial](https://dvc.org/doc/tutorials/versioning). 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pillow 2 | scipy 3 | tensorflow>=2,<3 4 | tqdm>=4.41.0,<5 5 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | '''This script goes along the blog post 2 | "Building powerful image classification models using very little data" 3 | from blog.keras.io. 
4 | 5 | In our example we will be using data that can be downloaded at: 6 | https://www.kaggle.com/tongpython/cat-and-dog 7 | 8 | In our setup, it expects: 9 | - a data/ folder 10 | - train/ and validation/ subfolders inside data/ 11 | - cats/ and dogs/ subfolders inside train/ and validation/ 12 | - put the cat pictures index 0-X in data/train/cats 13 | - put the cat pictures index 1000-1400 in data/validation/cats 14 | - put the dogs pictures index 0-X in data/train/dogs 15 | - put the dog pictures index 1000-1400 in data/validation/dogs 16 | 17 | We have X training examples for each class, and 400 validation examples 18 | for each class. In summary, this is our directory structure: 19 | ``` 20 | data/ 21 | train/ 22 | dogs/ 23 | dog001.jpg 24 | dog002.jpg 25 | ... 26 | cats/ 27 | cat001.jpg 28 | cat002.jpg 29 | ... 30 | validation/ 31 | dogs/ 32 | dog001.jpg 33 | dog002.jpg 34 | ... 35 | cats/ 36 | cat001.jpg 37 | cat002.jpg 38 | ... 39 | ``` 40 | ''' 41 | import numpy as np 42 | import sys 43 | import os 44 | 45 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 46 | from tensorflow.keras.models import Sequential 47 | from tensorflow.keras.layers import Dropout, Flatten, Dense 48 | from tensorflow.keras import applications 49 | from tensorflow.keras.callbacks import CSVLogger 50 | from tqdm.keras import TqdmCallback 51 | 52 | pathname = os.path.dirname(sys.argv[0]) 53 | path = os.path.abspath(pathname) 54 | 55 | # dimensions of our images. 
# Dimensions to which every input image is resized before entering VGG16.
img_width, img_height = 150, 150

top_model_weights_path = 'model.weights.h5'
train_data_dir = os.path.join('data', 'train')
validation_data_dir = os.path.join('data', 'validation')

# The dataset is balanced (same number of cat and dog images), so counting
# the cat training files and doubling gives the total training-set size.
cats_train_path = os.path.join(path, train_data_dir, 'cats')
nb_train_samples = 2 * len([name for name in os.listdir(cats_train_path)
                            if os.path.isfile(
                                os.path.join(cats_train_path, name))])
nb_validation_samples = 800
epochs = 10
batch_size = 10


def save_bottlebeck_features():
    """Extract and save VGG16 bottleneck features for both data splits.

    Runs every training and validation image once through a frozen VGG16
    convolutional base (ImageNet weights, no classifier head) and writes
    the resulting feature arrays to ``bottleneck_features_train.npy`` and
    ``bottleneck_features_validation.npy``.

    NOTE(review): the historical typo in the name ("bottlebeck") is kept
    so any external caller of this module keeps working.
    """
    datagen = ImageDataGenerator(rescale=1. / 255)

    # Convolutional base only; the dense classifier is trained separately
    # in train_top_model().
    model = applications.VGG16(include_top=False, weights='imagenet')

    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,   # no labels needed: we only extract features
        shuffle=False)     # preserve order so labels can be rebuilt later
    # BUG FIX: Model.predict's second positional argument is `batch_size`,
    # not `steps`; the step count must be passed by keyword.
    bottleneck_features_train = model.predict(
        generator, steps=nb_train_samples // batch_size)
    # Pass the filename so NumPy opens and closes the file itself
    # (the previous explicit open(...) handle was never closed).
    np.save('bottleneck_features_train.npy', bottleneck_features_train)

    generator = datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    bottleneck_features_validation = model.predict(
        generator, steps=nb_validation_samples // batch_size)
    np.save('bottleneck_features_validation.npy',
            bottleneck_features_validation)


def train_top_model():
    """Train the small dense classifier on the saved bottleneck features.

    Loads the feature arrays written by save_bottlebeck_features(), fits a
    Flatten/Dense(256)/Dropout/Dense(1, sigmoid) head with binary
    cross-entropy, logs metrics to ``metrics.csv``, and saves the weights
    to ``top_model_weights_path``.
    """
    train_data = np.load('bottleneck_features_train.npy')
    # flow_from_directory walks class folders alphabetically with
    # shuffle=False, so the first half of each array is cats (label 0)
    # and the second half is dogs (label 1).
    train_labels = np.array(
        [0] * (nb_train_samples // 2) + [1] * (nb_train_samples // 2))

    validation_data = np.load('bottleneck_features_validation.npy')
    validation_labels = np.array(
        [0] * (nb_validation_samples // 2) +
        [1] * (nb_validation_samples // 2))

    model = Sequential()
    model.add(Flatten(input_shape=train_data.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(validation_data, validation_labels),
              verbose=0,   # progress is shown by TqdmCallback instead
              callbacks=[TqdmCallback(), CSVLogger("metrics.csv")])
    model.save_weights(top_model_weights_path)


save_bottlebeck_features()
train_top_model()