├── README.md
├── create_dirs.sh
├── description.pdf
├── final-submission-builder.ipynb
├── how-to-run.pdf
├── learning-4gr-only.ipynb
├── learning-main-model.ipynb
├── semi-supervised-trick.ipynb
└── src
    ├── 4gr-collect-stats.py
    ├── 4gr-freq-reducer.py
    ├── 4gr-rf.py
    ├── 4gr-svc.py
    ├── base-features-packer.py
    ├── func_grepper.py
    ├── main.sh
    ├── ngramms_extractor.py
    ├── sections_hist.py
    ├── set_up.py
    ├── spectral_asm.py
    └── stdcall_grepper.py

/README.md:
--------------------------------------------------------------------------------
1 | Kaggle ['Microsoft Malware Classification Challenge'](https://www.kaggle.com/c/malware-classification) 3rd place solution
2 | =======
3 | ### Mikhail Trofimov, Dmitry Ulyanov, Stanislav Semenov.
4 | 
5 | Scores 0.0040 on the private leaderboard.
6 | 
7 | How to reproduce the submission
8 | =======
9 | Don't forget to check the paths in ./src/set_up.py!
10 | ```
11 | ./create_dirs.sh
12 | cd ./src
13 | ./main.sh
14 | cd ../
15 | ```
16 | then run all the code in
17 | `learning-main-model.ipynb`,
18 | `learning-4gr-only.ipynb`,
19 | `semi-supervised-trick.ipynb` and
20 | `final-submission-builder.ipynb`, in that order.
21 | 
22 | Dependencies
23 | =======
24 | * python 2.7.9
25 | * ipython 3.1.0
26 | * sklearn 0.16.1
27 | * numpy 1.9.2
28 | * pandas 0.16.0
29 | * hickle 1.1.1
30 | * pypy 2.5.1 (with joblib 0.8.4 installed)
31 | * scipy 0.15.1
32 | * xgboost 0.3
33 | 
34 | Hardware
35 | =======
36 | We ran this code on a machine with 16 cores and 120 GB of RAM.
37 | The most memory-consuming part is processing the 4-grams; all other steps require no more than 32 GB of RAM.
--------------------------------------------------------------------------------
/create_dirs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p tmp
3 | mkdir -p data
4 | mkdir -p data/feats
5 | mkdir -p data/raw
6 | mkdir -p submissions
--------------------------------------------------------------------------------
/description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geffy/kaggle-malware/8275e226827107505a559b321356adc0d666849e/description.pdf
--------------------------------------------------------------------------------
/final-submission-builder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# final submission builder"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 11,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "\n",
20 | "import set_up"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 12,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "dsol1 = pd.read_csv(set_up.submissions_dir_path + 'release1.csv')\n",
32 | "dsol2 = pd.read_csv(set_up.submissions_dir_path + 'release0.5.csv')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 13,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "dfinal = pd.read_csv(set_up.test_sample_path)\n",
44 | "\n",
45 | "dfinal.ix[:, 2:] = 0.95 * dsol1.ix[:, 2:] + 0.05 * dsol2.ix[:, 2:]\n",
46 | "dfinal.ix[:, 1] = dsol1.ix[:, 1]\n",
47 | "dfinal.to_csv(set_up.submissions_dir_path + 'release2.csv', index = False)"
48 | ]
49 | }
50 | ],
51 | "metadata": {
52 | "kernelspec": {
53 | 
"display_name": "Python 2", 54 | "language": "python", 55 | "name": "python2" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 2 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython2", 67 | "version": "2.7.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 0 72 | } 73 | -------------------------------------------------------------------------------- /how-to-run.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geffy/kaggle-malware/8275e226827107505a559b321356adc0d666849e/how-to-run.pdf -------------------------------------------------------------------------------- /learning-4gr-only.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# script for learning 4gr-only model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n" 22 | ] 23 | }, 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "WARNING: pylab import has clobbered these variables: ['random']\n", 29 | "`%matplotlib` prevents importing * from pylab and numpy\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "%pylab inline\n", 35 | "\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "\n", 39 | "import time\n", 40 | "import random\n", 41 | "import cPickle\n", 42 | "\n", 43 | "from sklearn.externals import joblib\n", 44 | "\n", 45 | "import set_up\n", 46 | "\n", 47 | "import sys\n", 48 | "sys.path.append('../git/xgboost/wrapper/')\n", 49 | "import xgboost as xgb" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "(10868, 16905) (10868,)\n", 64 | "(10873, 16905)\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "X_4gr_dim10k_tr = joblib.load(set_up.feats_4gr_folder_path + '4gr_train_dim10k.joblib')\n", 70 | "X_4gr_dim10k_te = joblib.load(set_up.feats_4gr_folder_path + '4gr_test_dim10k.joblib')\n", 71 | "\n", 72 | "Xtrain = (X_4gr_dim10k_tr).toarray()\n", 73 | "Xtest = (X_4gr_dim10k_te).toarray()\n", 74 | "\n", 75 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n", 76 | "\n", 77 | "print shape(Xtrain), shape(ytrain)\n", 78 | "print shape(Xtest)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "param = {}\n", 90 | "param['booster'] = 'gbtree'\n", 91 | "param['objective'] = 'multi:softprob'\n", 92 | "param['num_class'] = 9\n", 93 | "param['eval_metric'] = 'logloss'\n", 94 | "param['scale_pos_weight'] = 1.0\n", 95 | "param['bst:eta'] = 0.3\n", 96 | "param['bst:max_depth'] = 10\n", 97 | "param['bst:colsample_bytree'] = 0.5\n", 98 | "param['silent'] = 1\n", 99 | "param['nthread'] = 16\n", 100 | "\n", 101 | "num_round = 150\n", 102 | "\n", 103 | "plst = list(param.items())\n", 104 | "\n", 105 | "watchlist = []" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 
111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "0 5.30212034782\n", 120 | "1 10.6938072642\n", 121 | "2 15.7326108813\n", 122 | "3 19.9750008146\n", 123 | "4 23.7675551971\n", 124 | "5 27.5684586326\n", 125 | "6 31.3690359672\n", 126 | "7 35.2041736325\n", 127 | "8 39.0149246653\n", 128 | "9 42.8346661488\n", 129 | "10 46.6522561669\n", 130 | "11 50.4519218008\n", 131 | "12 54.244935147\n", 132 | "13 58.0708679001\n", 133 | "14 61.8752340992\n", 134 | "15 65.6579940637\n", 135 | "16 69.4180411657\n", 136 | "17 73.2266673485\n", 137 | "18 77.0510769486\n", 138 | "19 80.859686017\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "time0 = time.time()\n", 144 | "\n", 145 | "indtrain = arange(len(ytrain))\n", 146 | "yfinalxg = np.zeros((len(Xtest), 9))\n", 147 | "\n", 148 | "bgs = 20\n", 149 | "for bg in range(bgs):\n", 150 | " param['seed'] = bg + 1\n", 151 | " plst = list(param.items())\n", 152 | "\n", 153 | " newindtrain = random.sample(indtrain, int(len(indtrain) * 1.0))\n", 154 | " \n", 155 | " Xdatatrain = xgb.DMatrix(data = Xtrain[newindtrain], label = ytrain[newindtrain])\n", 156 | " Xdatatest = xgb.DMatrix(data = Xtest)\n", 157 | "\n", 158 | " bst = xgb.train(plst, Xdatatrain, num_round, watchlist)\n", 159 | "\n", 160 | " curpred = bst.predict(Xdatatest).reshape((len(Xtest), 9)) \n", 161 | " yfinalxg += curpred\n", 162 | "\n", 163 | " print bg, (time.time() - time0) / 60.\n", 164 | "\n", 165 | "yfinalxg /= bgs" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 9, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 177 | "\n", 178 | "dfinal.ix[:, 1:] = yfinalxg\n", 179 | "dfinal.to_csv(set_up.submissions_dir_path + 'release0.5.csv', index = False)" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 2", 186 | "language": "python", 187 | "name": "python2" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 2 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython2", 199 | "version": "2.7.6" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } 205 | -------------------------------------------------------------------------------- /learning-main-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# script for learning main model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%pylab inline\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "\n", 31 | "import time\n", 32 | "import random\n", 33 | "import cPickle\n", 34 | "\n", 35 | "from sklearn.externals import joblib\n", 36 | "\n", 37 | "import set_up\n", 38 | "\n", 39 | "import sys\n", 40 | "sys.path.append('../git/xgboost/wrapper/')\n", 41 | "import xgboost as xgb" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 
2, 47 | "metadata": { 48 | "collapsed": false, 49 | "scrolled": true 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(10868, 203) (10868,)\n", 57 | "(10873, 203)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "X_base_tr, X_base_te = joblib.load(set_up.feats_folder_path + 'X_basepack')\n", 63 | "X_tr, X_te = cPickle.load(open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'rb'))\n", 64 | "\n", 65 | "Xtrain = np.hstack((X_base_tr, X_tr))\n", 66 | "Xtest = np.hstack((X_base_te, X_te))\n", 67 | "\n", 68 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n", 69 | "\n", 70 | "print shape(Xtrain), shape(ytrain)\n", 71 | "print shape(Xtest)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "param = {}\n", 83 | "param['booster'] = 'gbtree'\n", 84 | "param['objective'] = 'multi:softprob'\n", 85 | "param['num_class'] = 9\n", 86 | "param['eval_metric'] = 'logloss'\n", 87 | "param['scale_pos_weight'] = 1.0\n", 88 | "param['bst:eta'] = 0.3\n", 89 | "param['bst:max_depth'] = 6\n", 90 | "param['bst:colsample_bytree'] = 0.5\n", 91 | "param['silent'] = 1\n", 92 | "param['nthread'] = 16\n", 93 | "\n", 94 | "num_round = 100\n", 95 | "\n", 96 | "plst = list(param.items())\n", 97 | "\n", 98 | "watchlist = []" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "0 0.418001099428\n", 113 | "1 0.829770731926\n", 114 | "2 1.24144924879\n", 115 | "3 1.64242663383\n", 116 | "4 2.06019373337\n", 117 | "5 2.47356063128\n", 118 | "6 2.90849781434\n", 119 | "7 3.31573658387\n", 120 | "8 3.72551828225\n", 121 | "9 4.14109171629\n", 122 | "10 4.55687183142\n", 123 | "11 5.6331213673\n", 124 | "12 7.3000319163\n", 125 | "13 8.98525646528\n", 126 | "14 10.6799249013\n", 127 | "15 12.3658061345\n", 128 | "16 14.0249494155\n", 129 | "17 15.7358465989\n", 130 | "18 17.4313094497\n", 131 | "19 19.1317503333\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "time0 = time.time()\n", 137 | "\n", 138 | "indtrain = arange(len(ytrain))\n", 139 | "yfinalxg = np.zeros((len(Xtest), 9))\n", 140 | "\n", 141 | "bgs = 20\n", 142 | "for bg in range(bgs):\n", 143 | " param['seed'] = bg + 1\n", 144 | " plst = list(param.items())\n", 145 | "\n", 146 | " newindtrain = random.sample(indtrain, int(len(indtrain) * 1.0))\n", 147 | "\n", 148 | " for i in range(int(len(indtrain) * 7.0)):\n", 149 | " newindtrain.append(random.choice(indtrain))\n", 150 | " \n", 151 | " Xdatatrain = xgb.DMatrix(data = Xtrain[newindtrain], label = ytrain[newindtrain])\n", 152 | " Xdatatest = xgb.DMatrix(data = Xtest)\n", 153 | "\n", 154 | " bst = xgb.train(plst, Xdatatrain, num_round, watchlist)\n", 155 | "\n", 156 | " curpred = bst.predict(Xdatatest).reshape((len(Xtest), 9)) \n", 157 | " yfinalxg += curpred\n", 158 | "\n", 159 | " print bg, (time.time() - time0) / 60.\n", 160 | "\n", 161 | "yfinalxg /= bgs" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 173 | "\n", 174 | "dfinal.ix[:, 1:] = yfinalxg\n", 175 | "dfinal.to_csv(set_up.submissions_dir_path + 'release0.csv', index = False)" 176 | ] 177 | } 178 | 
],
179 | "metadata": {
180 | "kernelspec": {
181 | "display_name": "Python 2",
182 | "language": "python",
183 | "name": "python2"
184 | },
185 | "language_info": {
186 | "codemirror_mode": {
187 | "name": "ipython",
188 | "version": 2
189 | },
190 | "file_extension": ".py",
191 | "mimetype": "text/x-python",
192 | "name": "python",
193 | "nbconvert_exporter": "python",
194 | "pygments_lexer": "ipython2",
195 | "version": "2.7.6"
196 | }
197 | },
198 | "nbformat": 4,
199 | "nbformat_minor": 0
200 | }
201 | 
--------------------------------------------------------------------------------
/semi-supervised-trick.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# semi-supervised trick"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Populating the interactive namespace from numpy and matplotlib\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "%pylab inline\n",
27 | "\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "\n",
31 | "import scipy\n",
32 | "import time\n",
33 | "import random\n",
34 | "import cPickle\n",
35 | "\n",
36 | "from sklearn.externals import joblib\n",
37 | "from sklearn.cross_validation import KFold\n",
38 | "\n",
39 | "import set_up\n",
40 | "\n",
41 | "import sys\n",
42 | "sys.path.append('../git/xgboost/wrapper/')\n",
43 | "import xgboost as xgb"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "(10868, 203) (10868,)\n",
58 | "(10873, 203)\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "X_base_tr, X_base_te = joblib.load(set_up.feats_folder_path + 'X_basepack')\n",
64 | "X_tr, X_te = cPickle.load(open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'rb'))\n",
65 | "\n",
66 | "Xtrain = np.hstack((X_base_tr, X_tr))\n",
67 | "Xtest = np.hstack((X_base_te, X_te))\n",
68 | "\n",
69 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n",
70 | "\n",
71 | "print shape(Xtrain), shape(ytrain)\n",
72 | "print shape(Xtest)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "dbest = pd.read_csv(set_up.submissions_dir_path + 'release0.csv')"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "Y = np.array(ytrain)\n",
95 | "test_preds = np.array(dbest.ix[:, 1:])\n",
96 | "\n",
97 | "x_train = np.array(Xtrain)\n",
98 | "x_test = np.array(Xtest[:, :])"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "rvs = [scipy.stats.rv_discrete(values = (range(9),test_preds[j,:])) for j in range(test_preds.shape[0])]\n",
110 | " \n",
111 | "skf = KFold(test_preds.shape[0], n_folds=10,random_state = 11)\n",
112 | " \n",
113 | "loss = 0\n",
114 | "test_pr1 = test_preds.copy()\n",
115 | " \n",
116 | "t0 = time.time()\n",
117 | " \n",
118 | "kok = 0\n",
119 | "for train_index, test_index in skf:\n",
120 | " \n",
121 | "
datas_train = np.concatenate([x_train,x_test[train_index,:]])\n", 122 | " datas_test = x_test[test_index,:]\n", 123 | " \n", 124 | " pr_vals = []\n", 125 | " for i in range(20):\n", 126 | " Y_test_s = [r.rvs() for r in rvs]\n", 127 | " \n", 128 | " y_train = np.array(Y.tolist() + np.array(Y_test_s)[train_index].tolist())\n", 129 | " \n", 130 | " indbigtrain = np.arange(len(datas_train))\n", 131 | " indtrain = np.arange(len(train_index))\n", 132 | " newindtrain = random.sample(indbigtrain, int(len(indbigtrain) * 1.0))\n", 133 | "\n", 134 | " for j in range(int(len(indtrain) * 7.0)):\n", 135 | " newindtrain.append(random.choice(indtrain))\n", 136 | "\n", 137 | " dtrain = xgb.DMatrix(datas_train[newindtrain], label=y_train[newindtrain])\n", 138 | " dval = xgb.DMatrix(datas_test)\n", 139 | "\n", 140 | " param1 = {'num_round': 200,\n", 141 | " 'seed' : i*13141 + 123,\n", 142 | " 'max_depth':3,\n", 143 | " 'gamma': 0.0,\n", 144 | " 'eta':0.22,\n", 145 | " 'silent':1, \n", 146 | " 'objective':'multi:softprob',\n", 147 | " 'num_class' : 9,\n", 148 | " 'subsample' : 1,\n", 149 | " 'colsample_bytree' : 0.3,\n", 150 | " 'nthread' : 16} \n", 151 | " \n", 152 | " watchlist = []\n", 153 | " \n", 154 | " bst = xgb.train(\n", 155 | " param1, dtrain, param1['num_round'], watchlist)\n", 156 | " \n", 157 | " pr_val = bst.predict(dval)\n", 158 | " \n", 159 | " pr_vals.append(pr_val)\n", 160 | " print \" - %d\" % i, (time.time() - t0) / 60\n", 161 | " \n", 162 | " pr_val = np.mean(pr_vals,axis = 0)\n", 163 | " \n", 164 | " test_pr1[test_index,:] = pr_val\n", 165 | " \n", 166 | " print ' f', kok\n", 167 | " \n", 168 | " kok +=1" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 180 | "\n", 181 | "dfinal.ix[:, 1:] = test_pr1\n", 182 | "dfinal.to_csv(set_up.submissions_dir_path + 'release1.csv', index = False)" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Python 2", 189 | "language": "python", 190 | "name": "python2" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 2 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython2", 202 | "version": "2.7.6" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 0 207 | } 208 | -------------------------------------------------------------------------------- /src/4gr-collect-stats.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import hickle 4 | import glob 5 | import sys 6 | import gc 7 | import set_up 8 | print '[4gr freq stat collector]' 9 | 10 | in_dir = set_up.feats_folder_path + '4gr/' 11 | all_tokens = np.zeros(257**4) 12 | files = glob.glob(in_dir + '*.bytes')[::5] 13 | for i, fname in enumerate(files): 14 | print '[{}] {}'.format(i, fname) 15 | ptr, vals = pickle.load(open(fname)) 16 | #for key in ptr: 17 | # all_tokens[key] += 1 18 | all_tokens[ptr] = all_tokens[ptr] + 1 19 | del ptr, vals 20 | if i%100==0: 21 | gc.collect() 22 | if (i%4000==0) and (i>0): 23 | print 'pickled_state: {}'.format(i) 24 | hickle.dump(all_tokens, open(set_up.tmp_path + '4gr_stats_2', 'w')) 25 | break -------------------------------------------------------------------------------- 
/src/4gr-freq-reducer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | import gc 4 | import numpy as np 5 | import pickle 6 | from joblib import Parallel, delayed 7 | import set_up 8 | print '[4gr freq reducer]' 9 | 10 | 11 | repl = pickle.load(open(set_up.tmp_path + '4gr_replacer', 'rb')) 12 | goodset = set(repl.keys()) 13 | 14 | in_dir = set_up.feats_folder_path + '4gr/' 15 | 16 | 17 | 18 | def worker(i,fname): 19 | print '[{}] {}'.format(i, fname) 20 | (ptr, vals) = pickle.load(open(fname)) 21 | new_ptr = [] 22 | new_vals = [] 23 | for ind, val in zip(ptr, vals): 24 | if ind in goodset: 25 | new_ptr.append(repl[ind]) 26 | new_vals.append(val) 27 | del ptr, vals 28 | pickle.dump((new_ptr, new_vals), open(set_up.feats_folder_path + '4gr/' + fname.split('/')[-1] + '.freq', 'wb'), protocol=2) 29 | gc.collect() 30 | 31 | 32 | files = glob.glob(in_dir + '*.bytes') 33 | 34 | #print files 35 | Parallel(n_jobs=15)(delayed(worker)(i,f) for i,f in enumerate(files)) 36 | 37 | print 'Done!' 38 | -------------------------------------------------------------------------------- /src/4gr-rf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import hickle, cPickle 4 | import set_up 5 | import scipy.sparse as sp 6 | from sklearn.externals import joblib 7 | 8 | import logging 9 | reload(logging) 10 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 11 | logging.info('[Script for 2nd stage feature selection by RF]') 12 | 13 | logging.info('Load data...') 14 | trainLabels = pd.read_csv(set_up.train_labels_path) 15 | X_tr = joblib.load(set_up.feats_folder_path + '4gr/4gr_train_dim10k.joblib').todense() 16 | X_te = joblib.load(set_up.feats_folder_path + '4gr/4gr_test_dim10k.joblib').todense() 17 | 18 | logging.info('Fitting RF...') 19 | from sklearn.cross_validation import train_test_split 20 | from sklearn.ensemble import RandomForestClassifier 21 | X_train, X_test, y_train, y_test = train_test_split(X_tr, trainLabels.Class.values, random_state=42, test_size=0.3) 22 | rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000) 23 | rf.fit(X_train, y_train) 24 | logging.info('RF acc: {}'.format(np.mean(rf.predict(X_test) == y_test))) 25 | 26 | fmask = (rf.feature_importances_> 0.0014) 27 | logging.info('Total feats: {}, selected: {}'.format(len(fmask), sum(fmask))) 28 | 29 | logging.info('Dumping...') 30 | data = (X_tr[:, fmask], X_te[:, fmask]) 31 | cPickle.dump(data, open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'wb'), protocol=2) 32 | 33 | logging.info('Done!') -------------------------------------------------------------------------------- /src/4gr-svc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import hickle, cPickle 4 | import set_up 5 | import scipy.sparse as sp 6 | from sklearn.externals import joblib 7 | import os 8 | 9 | import logging 10 | reload(logging) 11 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 12 | logging.info('[Script for 1st stage feature selection by SVC]') 13 | 14 | # assembling train 15 | logging.info('[train] Load data...') 16 | trainLabels = pd.read_csv(set_up.train_labels_path) 17 | 18 | data = [] 19 | indices = [] 20 | ptrs = [0] 21 | cur_bound = 0 22 | for i, row in trainLabels.iterrows(): 23 | fname = set_up.feats_folder_path + '4gr/' + 
'{}.bytes.freq'.format(row['Id']) 24 | (inds, vals) = cPickle.load(open(fname, 'rb')) 25 | assert len(vals) == len(inds) 26 | data.extend(vals) 27 | indices.extend(inds) 28 | cur_bound += len(vals) 29 | ptrs.append(cur_bound) 30 | if i%1000==0: 31 | print '[{}] {}'.format(i, row['Id']) 32 | hickle.dump((np.array(data), np.array(indices), np.array(ptrs)), set_up.tmp_path + '4gr_train_raw.hi') 33 | 34 | logging.info('[train] Build csr...') 35 | X = sp.csr_matrix((np.array(data), np.array(indices), np.array(ptrs)), dtype=int) 36 | 37 | logging.info('[train] dump to {}...'.format(set_up.feats_folder_path + '4gr/4gr_train_csr.joblib')) 38 | joblib.dump(X, set_up.feats_folder_path + '4gr/4gr_train_csr.joblib') 39 | del X, data, indices, ptrs 40 | 41 | 42 | # assembling test 43 | logging.info('[test] Load data...') 44 | sampleSubmission = pd.read_csv(set_up.test_sample_path) #ToDo 45 | 46 | data = [] 47 | indices = [] 48 | ptrs = [0] 49 | cur_bound = 0 50 | for i, row in sampleSubmission.iterrows(): 51 | fname = set_up.feats_folder_path + '4gr/' + '{}.bytes.freq'.format(row['Id']) 52 | (inds, vals) = cPickle.load(open(fname, 'rb')) 53 | assert len(vals) == len(inds) 54 | data.extend(vals) 55 | indices.extend(inds) 56 | cur_bound += len(vals) 57 | ptrs.append(cur_bound) 58 | if i%1000==0: 59 | print '[{}] {}'.format(i, row['Id']) 60 | 61 | hickle.dump((np.array(data), np.array(indices), np.array(ptrs)), set_up.tmp_path + '4gr_test_raw.hi') 62 | 63 | logging.info('[test] Build csr...') 64 | X = sp.csr_matrix((np.array(data), np.array(indices), np.array(ptrs)), dtype=int) 65 | 66 | logging.info('[test] dump to {}...'.format(set_up.feats_folder_path + '4gr/4gr_test_csr.joblib')) 67 | joblib.dump(X, set_up.feats_folder_path + '4gr/4gr_test_csr.joblib') 68 | 69 | logging.info('Cleaning memory...') 70 | del X, data, indices, ptrs 71 | 72 | 73 | # Feature selection 74 | logging.info('Load train for feature selection...') 75 | 76 | import scipy.sparse as sp 77 | from sklearn.externals import joblib 78 | from sklearn.svm import LinearSVC 79 | 80 | trainLabels = pd.read_csv(set_up.train_labels_path) 81 | X_csr_tr = joblib.load(set_up.feats_folder_path + '4gr/4gr_train_csr.joblib') 82 | 83 | logging.info('Fit lsvc...') 84 | model = LinearSVC(penalty='l1',max_iter=20, dual=False, verbose=1) 85 | model.fit(X_csr_tr, trainLabels.Class.values) 86 | 87 | logging.info('Dump fitted model..') 88 | joblib.dump(model, set_up.tmp_path + '4gr_csr_model.joblib') 89 | 90 | 91 | # reduce train 92 | logging.info('Reduce train...') 93 | X_mini = model.transform(X_csr_tr) 94 | 95 | logging.info('Dump train...') 96 | joblib.dump(X_mini, set_up.feats_folder_path + '4gr/4gr_train_dim10k.joblib') 97 | del X_csr_tr 98 | 99 | 100 | # reduce test 101 | logging.info('Read test...') 102 | X_csr_te = joblib.load(set_up.feats_folder_path + '4gr/4gr_test_csr.joblib') 103 | 104 | logging.info('Reduce test...') 105 | X_mini = model.transform(X_csr_te) 106 | 107 | logging.info('Dump test...') 108 | joblib.dump(X_mini, set_up.feats_folder_path + '4gr/4gr_test_dim10k.joblib') 109 | 110 | logging.info('Done!') -------------------------------------------------------------------------------- /src/base-features-packer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import cPickle 6 | from sklearn.preprocessing import LabelBinarizer 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.metrics import log_loss 9 | from 
sklearn.cross_validation import train_test_split 10 | from sklearn.externals import joblib 11 | import set_up 12 | feats_path = '../data/feats/' 13 | 14 | 15 | import logging 16 | reload(logging) 17 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 18 | logging.info('[Base feature packer]') 19 | 20 | 21 | 22 | def calc_entropy(X): 23 | ent = np.zeros(len(X)) 24 | for i in range(len(X)): 25 | x = X[i]*1.0 / (sum(X[i]) + 0.00001) 26 | ent[i] = -np.sum(x*np.log(x+0.00000001)) 27 | return ent 28 | 29 | 30 | 31 | 32 | trainLabels = pd.read_csv(set_up.train_labels_path) 33 | sampleSubmission = pd.read_csv(set_up.test_sample_path) 34 | 35 | 36 | 37 | # sections features 38 | logging.info('Collect sections...') 39 | from sklearn.feature_extraction import DictVectorizer 40 | section_whitelist = set(['.bss', '.data', '.edata', '.idata', '.rdata', '.reloc', 41 | '.rsrc', '.text', '.tls', 'bss', 'code', 'data', 'header']) 42 | 43 | y_tr = np.zeros(len(trainLabels)) 44 | lines_tr = [] 45 | 46 | #for train 47 | for i,row in trainLabels.iterrows(): 48 | y_tr[i] = row['Class'] 49 | 50 | # lines 51 | feat = cPickle.load(open('{}sections_hist/{}'.format(feats_path, row['Id']), 'r')) 52 | del feat['sum'] 53 | whitened = {} 54 | for x in feat: 55 | if x in section_whitelist: 56 | whitened[x] = feat[x] 57 | lines_tr.append(whitened) 58 | 59 | 60 | #for test 61 | lines_te = [] 62 | for i,row in sampleSubmission.iterrows(): 63 | # lines 64 | feat = cPickle.load(open('{}sections_hist/{}'.format(feats_path, row['Id']), 'r')) 65 | del feat['sum'] 66 | whitened = {} 67 | for x in feat: 68 | if x in section_whitelist: 69 | whitened[x] = feat[x] 70 | lines_te.append(whitened) 71 | 72 | # convert to matrix 73 | dv = DictVectorizer(sparse=False) 74 | dv.fit(lines_tr) 75 | X_lines_tr = dv.transform(lines_tr) 76 | X_lines_te = dv.transform(lines_te) 77 | 78 | 79 | E_lines_tr = calc_entropy(X_lines_tr) 80 | E_lines_te = calc_entropy(X_lines_te) 81 | 82 | 83 | 84 | 85 | # filesize 86 | logging.info('Collect file sizes') 87 | import os 88 | # train 89 | X_sizes_tr = np.zeros([len(trainLabels), 2]) 90 | for i,row in trainLabels.iterrows(): 91 | fname = row['Id'] 92 | X_sizes_tr[i, 0] = os.path.getsize('{}{}.bytes'.format(set_up.train_folder_path, fname)) 93 | X_sizes_tr[i, 1] = os.path.getsize('{}{}.asm'.format(set_up.train_folder_path, fname)) 94 | size_ratio_tr = (X_sizes_tr[:, 0] *1.0 / X_sizes_tr[:, 1])[:, np.newaxis] 95 | 96 | #test 97 | X_sizes_te = np.zeros([len(sampleSubmission), 2]) 98 | for i,row in sampleSubmission.iterrows(): 99 | fname = row['Id'] 100 | X_sizes_te[i, 0] = os.path.getsize('{}{}.bytes'.format(set_up.test_folder_path, fname)) 101 | X_sizes_te[i, 1] = os.path.getsize('{}{}.asm'.format(set_up.test_folder_path, fname)) 102 | size_ratio_te = (X_sizes_te[:, 0] *1.0 / X_sizes_te[:, 1])[:, np.newaxis] 103 | 104 | 105 | 106 | # spectral asm 107 | logging.info('Collect spectral asm...') 108 | def read_file(filename): 109 | fin = open(filename, 'r') 110 | data = [] 111 | for line in fin: 112 | data.append(line.strip()) 113 | return data 114 | 115 | fnames = read_file('{}spectral_asm/fnames'.format(feats_path)) 116 | asm_dict = {} 117 | specter = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add', 118 | 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb'] 119 | for op in specter: 120 | values = read_file('{}spectral_asm/{}'.format(feats_path, op)) 121 | for fname, val in zip(fnames, values): 122 | asm_dict[fname] = 
asm_dict.get(fname, {})
123 |         asm_dict[fname][op] = val
124 | # train
125 | X_asm_tr = np.zeros((len(trainLabels), 22))
126 | for i, fname in enumerate(trainLabels.Id.values):
127 |     for j,op in enumerate(specter):
128 |         X_asm_tr[i,j] = asm_dict[fname][op]
129 | 
130 | E_asm_tr = calc_entropy(X_asm_tr)
131 | 
132 | # test
133 | X_asm_te = np.zeros((len(sampleSubmission), 22))
134 | for i, fname in enumerate(sampleSubmission.Id.values):
135 |     for j,op in enumerate(specter):
136 |         X_asm_te[i,j] = asm_dict[fname][op]
137 | 
138 | E_asm_te = calc_entropy(X_asm_te)
139 | 
140 | 
141 | 
142 | # line counts
143 | logging.info('Collect line counts...')
144 | fnames = read_file('{}spectral_asm/fnames'.format(feats_path))
145 | line_counts = read_file('{}spectral_asm/line_count'.format(feats_path))
146 | 
147 | line_dict = {}
148 | for i, fname in enumerate(fnames):
149 |     line_dict[fname] = line_counts[i]
150 | 
151 | # train
152 | X_lcounts_tr = np.zeros((len(trainLabels), 1))
153 | for i, fname in enumerate(trainLabels.Id.values):
154 |     X_lcounts_tr[i, 0] = line_dict[fname]
155 | 
156 | E_lcounts_tr = calc_entropy(X_lcounts_tr)
157 | 
158 | # test
159 | X_lcounts_te = np.zeros((len(sampleSubmission), 1))
160 | for i, fname in enumerate(sampleSubmission.Id.values):
161 |     X_lcounts_te[i, 0] = line_dict[fname]
162 | 
163 | E_lcounts_te = calc_entropy(X_lcounts_te)
164 | 
165 | 
166 | 
167 | # import calls
168 | logging.info('Collect calls...')
169 | def get_call_list(fname):
170 |     calls = []
171 |     lines = read_file(fname)
172 |     for line in lines:
173 |         calls.append(line.split('__stdcall')[1].split('(')[0].split('_')[0].strip())
174 |     return calls
175 | 
176 | # train
177 | call_txt_tr = []
178 | for i,row in trainLabels.iterrows():
179 |     call_txt_tr.append(' '.join(get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))
180 | 
181 | # test
182 | call_txt_te = []
183 | for i,row in sampleSubmission.iterrows():
184 |     call_txt_te.append(' '.join(get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))
185 | 
186 | 
187 | logging.info('-> vectorizing...')
188 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
189 | vect = TfidfVectorizer(max_features=10000)
190 | vect.fit(call_txt_tr + call_txt_te)
191 | X_call_tr = vect.transform(call_txt_tr)
192 | X_call_te = vect.transform(call_txt_te)
193 | 
194 | 
195 | logging.info('-> apply NMF...')
196 | from sklearn.decomposition import TruncatedSVD, NMF
197 | from scipy import sparse
198 | 
199 | nmf = NMF(n_components=10, sparseness='data')
200 | nmf.fit(sparse.vstack([X_call_tr, X_call_te]))
201 | X_calls_nmf_tr = nmf.transform(X_call_tr)
202 | X_calls_nmf_te = nmf.transform(X_call_te)
203 | 
204 | 
205 | 
206 | # funcs
207 | 
208 | 
209 | logging.info('Collect FUNCs...')
210 | 
211 | def get_func_list(fname):
212 |     procs = []
213 |     lines = read_file(fname)
214 |     for line in lines:
215 |         line2 = line.split('FUNCTION')[1]
216 |         if 'PRESS' in line2:
217 |             procs.append(line2.split('PRESS')[0].strip().replace('.', ''))
218 |     return procs
219 | 
220 | func_txt_tr = []
221 | func_txt_te = []
222 | 
223 | for i,row in trainLabels.iterrows():
224 |     func_txt_tr.append(' '.join(get_func_list('{}func_grepper/'.format(feats_path) + row['Id'])))
225 | 
226 | for i,row in sampleSubmission.iterrows():
227 |     func_txt_te.append(' '.join(get_func_list('{}func_grepper/'.format(feats_path) + row['Id'])))
228 | 
229 | logging.info('-> vectorizing...')
230 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
231 | vect_func = 
TfidfVectorizer(max_features=10000) 232 | vect_func.fit(func_txt_tr + func_txt_te) 233 | 234 | X_func_tr = vect_func.transform(func_txt_tr) 235 | X_func_te = vect_func.transform(func_txt_te) 236 | 237 | logging.info('-> apply NMF...') 238 | nmf_func = NMF(n_components=10, sparseness='data') 239 | nmf_func.fit(sparse.vstack([X_func_tr, X_func_te])) 240 | X_func_nmf_tr = nmf_func.transform(X_func_tr) 241 | X_func_nmf_te = nmf_func.transform(X_func_te) 242 | 243 | 244 | 245 | # building 246 | logging.info('Build all together...') 247 | X_train_tr = np.hstack(( 248 | X_lines_tr, 249 | size_ratio_tr, 250 | X_asm_tr, 251 | X_sizes_tr, #! 252 | E_lines_tr[:, np.newaxis], #! 253 | X_calls_nmf_tr, 254 | X_func_nmf_tr 255 | )) 256 | 257 | X_train_te = np.hstack(( 258 | X_lines_te, 259 | size_ratio_te, 260 | X_asm_te, 261 | X_sizes_te, #! 262 | E_lines_te[:, np.newaxis], #! 263 | X_calls_nmf_te, 264 | X_func_nmf_te 265 | )) 266 | 267 | # dump 268 | joblib.dump((X_train_tr, X_train_te), '{}X_basepack'.format(feats_path)) 269 | logging.info('Done!') 270 | -------------------------------------------------------------------------------- /src/func_grepper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cPickle 3 | import glob 4 | import subprocess 5 | import os 6 | import set_up 7 | print '[Grepper for FUNCTION]' 8 | 9 | if sys.argv[1]=='train': 10 | in_dir = set_up.train_folder_path 11 | elif sys.argv[1]=='test': 12 | in_dir = set_up.test_folder_path 13 | else: 14 | print 'Unknown option' 15 | sys.exit() 16 | 17 | out_dir = set_up.feats_folder_path + 'func_grepper/' 18 | if not os.path.exists(out_dir): 19 | os.makedirs(out_dir) 20 | 21 | 22 | def worker(fname): 23 | # preparation 24 | subprocess.call('grep "FUNCTION" {}{}.asm > {}{}'.format(in_dir, fname, out_dir, fname), shell=True) 25 | 26 | 27 | raw_filenames = glob.glob(in_dir + '*.asm') 28 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames) 29 | 30 | from multiprocessing import Process 31 | def wrapper(fname_list): 32 | for fname in fname_list: 33 | worker(fname) 34 | 35 | nJobs = 15 36 | workers = [] 37 | for workerId in range(nJobs): 38 | p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]]) 39 | workers.append(p) 40 | p.start() 41 | for p in workers: 42 | p.join() 43 | print 'Done!' 
--------------------------------------------------------------------------------
/src/main.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # .asm-based features
3 | pypy sections_hist.py train
4 | pypy sections_hist.py test
5 | python spectral_asm.py train
6 | python spectral_asm.py test
7 | python stdcall_grepper.py train
8 | python stdcall_grepper.py test
9 | python func_grepper.py train
10 | python func_grepper.py test
11 | python base-features-packer.py
12 | # .bytes-based 4-gram features
13 | pypy ngramms_extractor.py train 4
14 | pypy ngramms_extractor.py test 4
15 | python 4gr-collect-stats.py
16 | pypy 4gr-freq-reducer.py
17 | python 4gr-svc.py
18 | python 4gr-rf.py
--------------------------------------------------------------------------------
/src/ngramms_extractor.py:
--------------------------------------------------------------------------------
1 | import glob
2 | from joblib import Parallel, delayed
3 | import set_up
4 | import pickle
5 | import os
6 | import sys
7 | print '[n-gram extractor]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path + '*.bytes'
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path + '*.bytes'
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | ng_order = int(sys.argv[2])
18 | out_dir = set_up.feats_folder_path + '{}gr/'.format(ng_order)
19 | if not os.path.exists(out_dir):
20 |     os.makedirs(out_dir)
21 | 
22 | files = glob.glob(in_dir)[::-1]
23 | 
24 | def get_dict():
25 |     d = {format(key, '02X'): key for key in range(256)}
26 |     d['??'] = 256
27 |     return d
28 | 
29 | def indexer4gr(tokens):
30 |     return tokens[0]*16974593 + tokens[1]*66049 + tokens[2]*257 + tokens[3]
31 | 
32 | 
33 | def count_4f(all_elem_codes,order):
34 |     counts_4g = {}
35 |     if order == 4:
36 |         indexer = indexer4gr
37 |     elif order == 10:
38 |         print "Order 10 not implemented"
39 |     else:
40 |         print 'Unsupported order'
41 |     # collect counts
42 |     for i in range(len(all_elem_codes)-order+1):
43 |         index = indexer(all_elem_codes[i:i+order])
44 |         counts_4g[index] = counts_4g.get(index,0)+1
45 |     # dump it!
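    # indexer4gr packs each 4-gram of byte tokens into one base-257 integer
    # (tokens run 0..256, with '??' mapped to 256):
    #   index = t0*257**3 + t1*257**2 + t2*257 + t3,
    # where 16974593 = 257**3 and 66049 = 257**2. Each file's histogram is
    # dumped as sparse parallel (ptr, vals) lists; after frequency-based
    # reduction (4gr-freq-reducer.py) these rows are stacked into one large
    # CSR matrix in 4gr-svc.py.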
46 | ptr = [] 47 | vals = [] 48 | for key in counts_4g: 49 | ptr.append(key) 50 | vals.append(counts_4g[key]) 51 | return (ptr, vals) 52 | 53 | def extract_4g (filename,order): 54 | convert_dict = get_dict() 55 | with open(filename,'r') as f: 56 | text = f.read() 57 | lines = text.split('\r\n') 58 | all_elems_codes = [] 59 | for l in lines: 60 | elems = l.split(' ') 61 | all_elems_codes.extend([convert_dict[x] for x in elems[1:]]) 62 | 63 | with open(out_dir + filename.split('/')[-1],'w') as f_dump: 64 | pickle.dump(count_4f(all_elems_codes,order),f_dump) 65 | 66 | #print files 67 | Parallel(n_jobs=-1)(delayed(extract_4g)(fi, ng_order) for fi in files) 68 | 69 | #pickle.dump(four_gr,open('../data/feats/%s/four_gr/4g' % what,'w')) 70 | #pickle.dump([x.split('/')[-1] for x in files],open('../data/feats/%s/four_gr/names' % what,'w')) -------------------------------------------------------------------------------- /src/sections_hist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cPickle 3 | import glob 4 | import set_up 5 | import os 6 | 7 | nJobs = set_up.nJobs 8 | print '[Section extraction script]' 9 | 10 | if sys.argv[1]=='train': 11 | in_dir = set_up.train_folder_path 12 | elif sys.argv[1]=='test': 13 | in_dir = set_up.test_folder_path 14 | else: 15 | print 'Unknown option' 16 | sys.exit() 17 | 18 | out_dir = set_up.feats_folder_path + 'sections_hist/' 19 | if not os.path.exists(out_dir): 20 | os.makedirs(out_dir) 21 | 22 | 23 | def worker(fname): 24 | # preparation 25 | stat = {} 26 | fin = open(in_dir + fname + '.asm', 'r') 27 | for line in fin: 28 | line_type = line.split(':')[0].lower() 29 | stat[line_type] = stat.get(line_type, 0) + 1 30 | stat['sum'] = stat.get('sum', 0) + 1 31 | cPickle.dump(stat, open(out_dir + fname, 'w')) 32 | 33 | 34 | raw_filenames = glob.glob(in_dir + '*.asm') 35 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames) 36 | 37 | 38 | from multiprocessing import Process 39 | def wrapper(fname_list): 40 | for fname in fname_list: 41 | worker(fname) 42 | 43 | 44 | workers = [] 45 | for workerId in range(nJobs): 46 | p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]]) 47 | workers.append(p) 48 | p.start() 49 | for p in workers: 50 | p.join() 51 | print 'Done!' 
52 | 
--------------------------------------------------------------------------------
/src/set_up.py:
--------------------------------------------------------------------------------
1 | feats_folder_path = '../data/feats/'
2 | train_folder_path = '../data/raw/train/'
3 | test_folder_path = '../data/raw/test/'
4 | 
5 | train_labels_path = '../data/raw/trainLabels.csv'
6 | test_sample_path = '../data/raw/sampleSubmission.csv'
7 | 
8 | submissions_dir_path = '../submissions/'
9 | tmp_path = '../tmp/'
10 | nJobs = 15
--------------------------------------------------------------------------------
/src/spectral_asm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import cPickle
3 | import glob
4 | import subprocess
5 | import os
6 | import set_up
7 | print '[ASM specter extractor]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | out_dir = set_up.feats_folder_path + 'spectral_asm/'
18 | if not os.path.exists(out_dir):
19 |     os.makedirs(out_dir)
20 | 
21 | 
22 | def worker(fname):
23 |     # preparation
24 |     subprocess.call('echo "{}" >> {}{}'.format(fname, out_dir, 'fnames'), shell=True)
25 |     subprocess.call('cat {}{}.asm | wc -l >> {}{}'.format(in_dir, fname, out_dir, 'line_count'), shell=True)
26 | 
27 |     # 22 opcodes; must match the 'specter' list in base-features-packer.py
28 |     specter = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add',
29 |                'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb']
30 |     for op in specter:
31 |         subprocess.call('grep "\s{}\s" {}{}.asm | wc -l >> {}{}'.format(op, in_dir, fname, out_dir, op), shell=True)
32 | 
33 | 
34 | 
35 | raw_filenames = glob.glob(in_dir + '*.asm')
36 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames)
37 | for i, fname in enumerate(fnames):
38 |     worker(fname)
39 |     if i%200==0:
40 |         print i
41 | print 'Done!'
--------------------------------------------------------------------------------
/src/stdcall_grepper.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import cPickle
3 | import glob
4 | import subprocess
5 | import os
6 | import set_up
7 | print '[Grepper for __stdcall]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | out_dir = set_up.feats_folder_path + 'stdcall_grepper/'
18 | if not os.path.exists(out_dir):
19 |     os.makedirs(out_dir)
20 | 
21 | 
22 | def worker(fname):
23 |     # preparation
24 |     subprocess.call('grep "__stdcall" {}{}.asm > {}{}'.format(in_dir, fname, out_dir, fname), shell=True)
25 | 
26 | 
27 | raw_filenames = glob.glob(in_dir + '*.asm')
28 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames)
29 | 
30 | 
31 | from multiprocessing import Process
32 | def wrapper(fname_list):
33 |     for fname in fname_list:
34 |         worker(fname)
35 | 
36 | nJobs = set_up.nJobs
37 | workers = []
38 | for workerId in range(nJobs):
39 |     p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]])
40 |     workers.append(p)
41 |     p.start()
42 | for p in workers:
43 |     p.join()
44 | print 'Done!'
--------------------------------------------------------------------------------