├── README.md
├── create_dirs.sh
├── description.pdf
├── final-submission-builder.ipynb
├── how-to-run.pdf
├── learning-4gr-only.ipynb
├── learning-main-model.ipynb
├── semi-supervised-trick.ipynb
└── src
    ├── 4gr-collect-stats.py
    ├── 4gr-freq-reducer.py
    ├── 4gr-rf.py
    ├── 4gr-svc.py
    ├── base-features-packer.py
    ├── func_grepper.py
    ├── main.sh
    ├── ngramms_extractor.py
    ├── sections_hist.py
    ├── set_up.py
    ├── spectral_asm.py
    └── stdcall_grepper.py

/README.md:
--------------------------------------------------------------------------------
1 | Kaggle ['Microsoft Malware Classification Challenge'](https://www.kaggle.com/c/malware-classification) 3rd place solution
2 | =======
3 | ### Mikhail Trofimov, Dmitry Ulyanov, Stanislav Semenov.
4 | 
5 | Scores 0.0040 on the private leaderboard.
6 | 
7 | How to reproduce the submission
8 | =======
9 | Don't forget to check the paths in ./src/set_up.py!
10 | ```
11 | ./create_dirs.sh
12 | cd ./src
13 | ./main.sh
14 | cd ../
15 | ```
16 | then run all the code in
17 | `learning-main-model.ipynb`,
18 | `learning-4gr-only.ipynb`,
19 | `semi-supervised-trick.ipynb` and
20 | `final-submission-builder.ipynb`, in that order.
21 | 
22 | Dependencies
23 | =======
24 | * python 2.7.9
25 | * ipython 3.1.0
26 | * sklearn 0.16.1
27 | * numpy 1.9.2
28 | * pandas 0.16.0
29 | * hickle 1.1.1
30 | * pypy 2.5.1 (with joblib 0.8.4 installed)
31 | * scipy 0.15.1
32 | * xgboost 0.3
33 | 
34 | Hardware
35 | =======
36 | We ran this code on a machine with 16 cores and 120 GB of RAM.
37 | The most memory-consuming part is processing the 4-grams; all other steps require no more than 32 GB of RAM.
--------------------------------------------------------------------------------
/create_dirs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | mkdir -p tmp
3 | mkdir -p data
4 | mkdir -p data/feats
5 | mkdir -p data/raw
6 | mkdir -p submissions
--------------------------------------------------------------------------------
/description.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geffy/kaggle-malware/8275e226827107505a559b321356adc0d666849e/description.pdf
--------------------------------------------------------------------------------
/final-submission-builder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# final submission builder"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 11,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import pandas as pd\n",
19 | "\n",
20 | "import set_up"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 12,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "dsol1 = pd.read_csv(set_up.submissions_dir_path + 'release1.csv')\n",
32 | "dsol2 = pd.read_csv(set_up.submissions_dir_path + 'release0.5.csv')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 13,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "dfinal = pd.read_csv(set_up.test_sample_path)\n",
44 | "\n",
45 | "dfinal.ix[:, 2:] = 0.95 * dsol1.ix[:, 2:] + 0.05 * dsol2.ix[:, 2:]\n",
46 | "dfinal.ix[:, 1] = dsol1.ix[:, 1]\n",
47 | "dfinal.to_csv(set_up.submissions_dir_path + 'release2.csv', index = False)"
48 | ]
49 | }
50 | ],
51 | "metadata": {
52 | "kernelspec": {
53 | 
"display_name": "Python 2", 54 | "language": "python", 55 | "name": "python2" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 2 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython2", 67 | "version": "2.7.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 0 72 | } 73 | -------------------------------------------------------------------------------- /how-to-run.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geffy/kaggle-malware/8275e226827107505a559b321356adc0d666849e/how-to-run.pdf -------------------------------------------------------------------------------- /learning-4gr-only.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# script for learning 4gr-only model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n" 22 | ] 23 | }, 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "WARNING: pylab import has clobbered these variables: ['random']\n", 29 | "`%matplotlib` prevents importing * from pylab and numpy\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "%pylab inline\n", 35 | "\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "\n", 39 | "import time\n", 40 | "import random\n", 41 | "import cPickle\n", 42 | "\n", 43 | "from sklearn.externals import joblib\n", 44 | "\n", 45 | "import set_up\n", 46 | "\n", 47 | "import sys\n", 48 | "sys.path.append('../git/xgboost/wrapper/')\n", 49 | "import xgboost as xgb" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 5, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "(10868, 16905) (10868,)\n", 64 | "(10873, 16905)\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "X_4gr_dim10k_tr = joblib.load(set_up.feats_4gr_folder_path + '4gr_train_dim10k.joblib')\n", 70 | "X_4gr_dim10k_te = joblib.load(set_up.feats_4gr_folder_path + '4gr_test_dim10k.joblib')\n", 71 | "\n", 72 | "Xtrain = (X_4gr_dim10k_tr).toarray()\n", 73 | "Xtest = (X_4gr_dim10k_te).toarray()\n", 74 | "\n", 75 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n", 76 | "\n", 77 | "print shape(Xtrain), shape(ytrain)\n", 78 | "print shape(Xtest)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 7, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "param = {}\n", 90 | "param['booster'] = 'gbtree'\n", 91 | "param['objective'] = 'multi:softprob'\n", 92 | "param['num_class'] = 9\n", 93 | "param['eval_metric'] = 'logloss'\n", 94 | "param['scale_pos_weight'] = 1.0\n", 95 | "param['bst:eta'] = 0.3\n", 96 | "param['bst:max_depth'] = 10\n", 97 | "param['bst:colsample_bytree'] = 0.5\n", 98 | "param['silent'] = 1\n", 99 | "param['nthread'] = 16\n", 100 | "\n", 101 | "num_round = 150\n", 102 | "\n", 103 | "plst = list(param.items())\n", 104 | "\n", 105 | "watchlist = []" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 
111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "0 5.30212034782\n", 120 | "1 10.6938072642\n", 121 | "2 15.7326108813\n", 122 | "3 19.9750008146\n", 123 | "4 23.7675551971\n", 124 | "5 27.5684586326\n", 125 | "6 31.3690359672\n", 126 | "7 35.2041736325\n", 127 | "8 39.0149246653\n", 128 | "9 42.8346661488\n", 129 | "10 46.6522561669\n", 130 | "11 50.4519218008\n", 131 | "12 54.244935147\n", 132 | "13 58.0708679001\n", 133 | "14 61.8752340992\n", 134 | "15 65.6579940637\n", 135 | "16 69.4180411657\n", 136 | "17 73.2266673485\n", 137 | "18 77.0510769486\n", 138 | "19 80.859686017\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "time0 = time.time()\n", 144 | "\n", 145 | "indtrain = arange(len(ytrain))\n", 146 | "yfinalxg = np.zeros((len(Xtest), 9))\n", 147 | "\n", 148 | "bgs = 20\n", 149 | "for bg in range(bgs):\n", 150 | " param['seed'] = bg + 1\n", 151 | " plst = list(param.items())\n", 152 | "\n", 153 | " newindtrain = random.sample(indtrain, int(len(indtrain) * 1.0))\n", 154 | " \n", 155 | " Xdatatrain = xgb.DMatrix(data = Xtrain[newindtrain], label = ytrain[newindtrain])\n", 156 | " Xdatatest = xgb.DMatrix(data = Xtest)\n", 157 | "\n", 158 | " bst = xgb.train(plst, Xdatatrain, num_round, watchlist)\n", 159 | "\n", 160 | " curpred = bst.predict(Xdatatest).reshape((len(Xtest), 9)) \n", 161 | " yfinalxg += curpred\n", 162 | "\n", 163 | " print bg, (time.time() - time0) / 60.\n", 164 | "\n", 165 | "yfinalxg /= bgs" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 9, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 177 | "\n", 178 | "dfinal.ix[:, 1:] = yfinalxg\n", 179 | "dfinal.to_csv(set_up.submissions_dir_path + 'release0.5.csv', index = False)" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 2", 186 | "language": "python", 187 | "name": "python2" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 2 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython2", 199 | "version": "2.7.6" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } 205 | -------------------------------------------------------------------------------- /learning-main-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# script for learning main model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%pylab inline\n", 27 | "\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "\n", 31 | "import time\n", 32 | "import random\n", 33 | "import cPickle\n", 34 | "\n", 35 | "from sklearn.externals import joblib\n", 36 | "\n", 37 | "import set_up\n", 38 | "\n", 39 | "import sys\n", 40 | "sys.path.append('../git/xgboost/wrapper/')\n", 41 | "import xgboost as xgb" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 
2, 47 | "metadata": { 48 | "collapsed": false, 49 | "scrolled": true 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(10868, 203) (10868,)\n", 57 | "(10873, 203)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "X_base_tr, X_base_te = joblib.load(set_up.feats_folder_path + 'X_basepack')\n", 63 | "X_tr, X_te = cPickle.load(open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'rb'))\n", 64 | "\n", 65 | "Xtrain = np.hstack((X_base_tr, X_tr))\n", 66 | "Xtest = np.hstack((X_base_te, X_te))\n", 67 | "\n", 68 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n", 69 | "\n", 70 | "print shape(Xtrain), shape(ytrain)\n", 71 | "print shape(Xtest)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "param = {}\n", 83 | "param['booster'] = 'gbtree'\n", 84 | "param['objective'] = 'multi:softprob'\n", 85 | "param['num_class'] = 9\n", 86 | "param['eval_metric'] = 'logloss'\n", 87 | "param['scale_pos_weight'] = 1.0\n", 88 | "param['bst:eta'] = 0.3\n", 89 | "param['bst:max_depth'] = 6\n", 90 | "param['bst:colsample_bytree'] = 0.5\n", 91 | "param['silent'] = 1\n", 92 | "param['nthread'] = 16\n", 93 | "\n", 94 | "num_round = 100\n", 95 | "\n", 96 | "plst = list(param.items())\n", 97 | "\n", 98 | "watchlist = []" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "0 0.418001099428\n", 113 | "1 0.829770731926\n", 114 | "2 1.24144924879\n", 115 | "3 1.64242663383\n", 116 | "4 2.06019373337\n", 117 | "5 2.47356063128\n", 118 | "6 2.90849781434\n", 119 | "7 3.31573658387\n", 120 | "8 3.72551828225\n", 121 | "9 4.14109171629\n", 122 | "10 4.55687183142\n", 123 | "11 5.6331213673\n", 124 | "12 7.3000319163\n", 125 | "13 8.98525646528\n", 126 | "14 10.6799249013\n", 127 | "15 12.3658061345\n", 128 | "16 14.0249494155\n", 129 | "17 15.7358465989\n", 130 | "18 17.4313094497\n", 131 | "19 19.1317503333\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "time0 = time.time()\n", 137 | "\n", 138 | "indtrain = arange(len(ytrain))\n", 139 | "yfinalxg = np.zeros((len(Xtest), 9))\n", 140 | "\n", 141 | "bgs = 20\n", 142 | "for bg in range(bgs):\n", 143 | " param['seed'] = bg + 1\n", 144 | " plst = list(param.items())\n", 145 | "\n", 146 | " newindtrain = random.sample(indtrain, int(len(indtrain) * 1.0))\n", 147 | "\n", 148 | " for i in range(int(len(indtrain) * 7.0)):\n", 149 | " newindtrain.append(random.choice(indtrain))\n", 150 | " \n", 151 | " Xdatatrain = xgb.DMatrix(data = Xtrain[newindtrain], label = ytrain[newindtrain])\n", 152 | " Xdatatest = xgb.DMatrix(data = Xtest)\n", 153 | "\n", 154 | " bst = xgb.train(plst, Xdatatrain, num_round, watchlist)\n", 155 | "\n", 156 | " curpred = bst.predict(Xdatatest).reshape((len(Xtest), 9)) \n", 157 | " yfinalxg += curpred\n", 158 | "\n", 159 | " print bg, (time.time() - time0) / 60.\n", 160 | "\n", 161 | "yfinalxg /= bgs" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 5, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 173 | "\n", 174 | "dfinal.ix[:, 1:] = yfinalxg\n", 175 | "dfinal.to_csv(set_up.submissions_dir_path + 'release0.csv', index = False)" 176 | ] 177 | } 178 | 
],
179 | "metadata": {
180 | "kernelspec": {
181 | "display_name": "Python 2",
182 | "language": "python",
183 | "name": "python2"
184 | },
185 | "language_info": {
186 | "codemirror_mode": {
187 | "name": "ipython",
188 | "version": 2
189 | },
190 | "file_extension": ".py",
191 | "mimetype": "text/x-python",
192 | "name": "python",
193 | "nbconvert_exporter": "python",
194 | "pygments_lexer": "ipython2",
195 | "version": "2.7.6"
196 | }
197 | },
198 | "nbformat": 4,
199 | "nbformat_minor": 0
200 | }
201 | 
--------------------------------------------------------------------------------
/semi-supervised-trick.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# semi-supervised trick"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Populating the interactive namespace from numpy and matplotlib\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "%pylab inline\n",
27 | "\n",
28 | "import numpy as np\n",
29 | "import pandas as pd\n",
30 | "\n",
31 | "import scipy\n",
32 | "import time\n",
33 | "import random\n",
34 | "import cPickle\n",
35 | "\n",
36 | "from sklearn.externals import joblib\n",
37 | "from sklearn.cross_validation import KFold\n",
38 | "\n",
39 | "import set_up\n",
40 | "\n",
41 | "import sys\n",
42 | "sys.path.append('../git/xgboost/wrapper/')\n",
43 | "import xgboost as xgb"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "(10868, 203) (10868,)\n",
58 | "(10873, 203)\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "X_base_tr, X_base_te = joblib.load(set_up.feats_folder_path + 'X_basepack')\n",
64 | "X_tr, X_te = cPickle.load(open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'rb'))\n",
65 | "\n",
66 | "Xtrain = np.hstack((X_base_tr, X_tr))\n",
67 | "Xtest = np.hstack((X_base_te, X_te))\n",
68 | "\n",
69 | "ytrain = np.array(pd.read_csv(set_up.train_labels_path)['Class']) - 1\n",
70 | "\n",
71 | "print shape(Xtrain), shape(ytrain)\n",
72 | "print shape(Xtest)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "dbest = pd.read_csv(set_up.submissions_dir_path + 'release0.csv')"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "Y = np.array(ytrain)\n",
95 | "test_preds = np.array(dbest.ix[:, 1:])\n",
96 | "\n",
97 | "x_train = np.array(Xtrain)\n",
98 | "x_test = np.array(Xtest[:, :])"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [],
108 | "source": [
109 | "rvs = [scipy.stats.rv_discrete(values = (range(9),test_preds[j,:])) for j in range(test_preds.shape[0])]\n",
110 | " \n",
111 | "skf = KFold(test_preds.shape[0], n_folds=10,random_state = 11)\n",
112 | " \n",
113 | "loss = 0\n",
114 | "test_pr1 = test_preds.copy()\n",
115 | " \n",
116 | "t0 = time.time()\n",
117 | " \n",
118 | "kok = 0\n",
119 | "for train_index, test_index in skf:\n",
120 | " \n",
121 | "
datas_train = np.concatenate([x_train,x_test[train_index,:]])\n", 122 | " datas_test = x_test[test_index,:]\n", 123 | " \n", 124 | " pr_vals = []\n", 125 | " for i in range(20):\n", 126 | " Y_test_s = [r.rvs() for r in rvs]\n", 127 | " \n", 128 | " y_train = np.array(Y.tolist() + np.array(Y_test_s)[train_index].tolist())\n", 129 | " \n", 130 | " indbigtrain = np.arange(len(datas_train))\n", 131 | " indtrain = np.arange(len(train_index))\n", 132 | " newindtrain = random.sample(indbigtrain, int(len(indbigtrain) * 1.0))\n", 133 | "\n", 134 | " for j in range(int(len(indtrain) * 7.0)):\n", 135 | " newindtrain.append(random.choice(indtrain))\n", 136 | "\n", 137 | " dtrain = xgb.DMatrix(datas_train[newindtrain], label=y_train[newindtrain])\n", 138 | " dval = xgb.DMatrix(datas_test)\n", 139 | "\n", 140 | " param1 = {'num_round': 200,\n", 141 | " 'seed' : i*13141 + 123,\n", 142 | " 'max_depth':3,\n", 143 | " 'gamma': 0.0,\n", 144 | " 'eta':0.22,\n", 145 | " 'silent':1, \n", 146 | " 'objective':'multi:softprob',\n", 147 | " 'num_class' : 9,\n", 148 | " 'subsample' : 1,\n", 149 | " 'colsample_bytree' : 0.3,\n", 150 | " 'nthread' : 16} \n", 151 | " \n", 152 | " watchlist = []\n", 153 | " \n", 154 | " bst = xgb.train(\n", 155 | " param1, dtrain, param1['num_round'], watchlist)\n", 156 | " \n", 157 | " pr_val = bst.predict(dval)\n", 158 | " \n", 159 | " pr_vals.append(pr_val)\n", 160 | " print \" - %d\" % i, (time.time() - t0) / 60\n", 161 | " \n", 162 | " pr_val = np.mean(pr_vals,axis = 0)\n", 163 | " \n", 164 | " test_pr1[test_index,:] = pr_val\n", 165 | " \n", 166 | " print ' f', kok\n", 167 | " \n", 168 | " kok +=1" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "dfinal = pd.read_csv(set_up.test_sample_path)\n", 180 | "\n", 181 | "dfinal.ix[:, 1:] = test_pr1\n", 182 | "dfinal.to_csv(set_up.submissions_dir_path + 'release1.csv', index = False)" 183 | ] 184 | } 185 | ], 186 | "metadata": { 187 | "kernelspec": { 188 | "display_name": "Python 2", 189 | "language": "python", 190 | "name": "python2" 191 | }, 192 | "language_info": { 193 | "codemirror_mode": { 194 | "name": "ipython", 195 | "version": 2 196 | }, 197 | "file_extension": ".py", 198 | "mimetype": "text/x-python", 199 | "name": "python", 200 | "nbconvert_exporter": "python", 201 | "pygments_lexer": "ipython2", 202 | "version": "2.7.6" 203 | } 204 | }, 205 | "nbformat": 4, 206 | "nbformat_minor": 0 207 | } 208 | -------------------------------------------------------------------------------- /src/4gr-collect-stats.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import numpy as np 3 | import hickle 4 | import glob 5 | import sys 6 | import gc 7 | import set_up 8 | print '[4gr freq stat collector]' 9 | 10 | in_dir = set_up.feats_folder_path + '4gr/' 11 | all_tokens = np.zeros(257**4) 12 | files = glob.glob(in_dir + '*.bytes')[::5] 13 | for i, fname in enumerate(files): 14 | print '[{}] {}'.format(i, fname) 15 | ptr, vals = pickle.load(open(fname)) 16 | #for key in ptr: 17 | # all_tokens[key] += 1 18 | all_tokens[ptr] = all_tokens[ptr] + 1 19 | del ptr, vals 20 | if i%100==0: 21 | gc.collect() 22 | if (i%4000==0) and (i>0): 23 | print 'pickled_state: {}'.format(i) 24 | hickle.dump(all_tokens, open(set_up.tmp_path + '4gr_stats_2', 'w')) 25 | break -------------------------------------------------------------------------------- 
/src/4gr-freq-reducer.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | import gc 4 | import numpy as np 5 | import pickle 6 | from joblib import Parallel, delayed 7 | import set_up 8 | print '[4gr freq reducer]' 9 | 10 | 11 | repl = pickle.load(open(set_up.tmp_path + '4gr_replacer', 'rb')) 12 | goodset = set(repl.keys()) 13 | 14 | in_dir = set_up.feats_folder_path + '4gr/' 15 | 16 | 17 | 18 | def worker(i,fname): 19 | print '[{}] {}'.format(i, fname) 20 | (ptr, vals) = pickle.load(open(fname)) 21 | new_ptr = [] 22 | new_vals = [] 23 | for ind, val in zip(ptr, vals): 24 | if ind in goodset: 25 | new_ptr.append(repl[ind]) 26 | new_vals.append(val) 27 | del ptr, vals 28 | pickle.dump((new_ptr, new_vals), open(set_up.feats_folder_path + '4gr/' + fname.split('/')[-1] + '.freq', 'wb'), protocol=2) 29 | gc.collect() 30 | 31 | 32 | files = glob.glob(in_dir + '*.bytes') 33 | 34 | #print files 35 | Parallel(n_jobs=15)(delayed(worker)(i,f) for i,f in enumerate(files)) 36 | 37 | print 'Done!' 38 | -------------------------------------------------------------------------------- /src/4gr-rf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import hickle, cPickle 4 | import set_up 5 | import scipy.sparse as sp 6 | from sklearn.externals import joblib 7 | 8 | import logging 9 | reload(logging) 10 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 11 | logging.info('[Script for 2nd stage feature selection by RF]') 12 | 13 | logging.info('Load data...') 14 | trainLabels = pd.read_csv(set_up.train_labels_path) 15 | X_tr = joblib.load(set_up.feats_folder_path + '4gr/4gr_train_dim10k.joblib').todense() 16 | X_te = joblib.load(set_up.feats_folder_path + '4gr/4gr_test_dim10k.joblib').todense() 17 | 18 | logging.info('Fitting RF...') 19 | from sklearn.cross_validation import train_test_split 20 | from sklearn.ensemble import RandomForestClassifier 21 | X_train, X_test, y_train, y_test = train_test_split(X_tr, trainLabels.Class.values, random_state=42, test_size=0.3) 22 | rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000) 23 | rf.fit(X_train, y_train) 24 | logging.info('RF acc: {}'.format(np.mean(rf.predict(X_test) == y_test))) 25 | 26 | fmask = (rf.feature_importances_> 0.0014) 27 | logging.info('Total feats: {}, selected: {}'.format(len(fmask), sum(fmask))) 28 | 29 | logging.info('Dumping...') 30 | data = (X_tr[:, fmask], X_te[:, fmask]) 31 | cPickle.dump(data, open(set_up.feats_folder_path + '4gr_pack_dim100.pickled', 'wb'), protocol=2) 32 | 33 | logging.info('Done!') -------------------------------------------------------------------------------- /src/4gr-svc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import hickle, cPickle 4 | import set_up 5 | import scipy.sparse as sp 6 | from sklearn.externals import joblib 7 | import os 8 | 9 | import logging 10 | reload(logging) 11 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 12 | logging.info('[Script for 1st stage feature selection by SVC]') 13 | 14 | # assembling train 15 | logging.info('[train] Load data...') 16 | trainLabels = pd.read_csv(set_up.train_labels_path) 17 | 18 | data = [] 19 | indices = [] 20 | ptrs = [0] 21 | cur_bound = 0 22 | for i, row in trainLabels.iterrows(): 23 | fname = set_up.feats_folder_path + '4gr/' + 
'{}.bytes.freq'.format(row['Id']) 24 | (inds, vals) = cPickle.load(open(fname, 'rb')) 25 | assert len(vals) == len(inds) 26 | data.extend(vals) 27 | indices.extend(inds) 28 | cur_bound += len(vals) 29 | ptrs.append(cur_bound) 30 | if i%1000==0: 31 | print '[{}] {}'.format(i, row['Id']) 32 | hickle.dump((np.array(data), np.array(indices), np.array(ptrs)), set_up.tmp_path + '4gr_train_raw.hi') 33 | 34 | logging.info('[train] Build csr...') 35 | X = sp.csr_matrix((np.array(data), np.array(indices), np.array(ptrs)), dtype=int) 36 | 37 | logging.info('[train] dump to {}...'.format(set_up.feats_folder_path + '4gr/4gr_train_csr.joblib')) 38 | joblib.dump(X, set_up.feats_folder_path + '4gr/4gr_train_csr.joblib') 39 | del X, data, indices, ptrs 40 | 41 | 42 | # assembling test 43 | logging.info('[test] Load data...') 44 | sampleSubmission = pd.read_csv(set_up.test_sample_path) #ToDo 45 | 46 | data = [] 47 | indices = [] 48 | ptrs = [0] 49 | cur_bound = 0 50 | for i, row in sampleSubmission.iterrows(): 51 | fname = set_up.feats_folder_path + '4gr/' + '{}.bytes.freq'.format(row['Id']) 52 | (inds, vals) = cPickle.load(open(fname, 'rb')) 53 | assert len(vals) == len(inds) 54 | data.extend(vals) 55 | indices.extend(inds) 56 | cur_bound += len(vals) 57 | ptrs.append(cur_bound) 58 | if i%1000==0: 59 | print '[{}] {}'.format(i, row['Id']) 60 | 61 | hickle.dump((np.array(data), np.array(indices), np.array(ptrs)), set_up.tmp_path + '4gr_test_raw.hi') 62 | 63 | logging.info('[test] Build csr...') 64 | X = sp.csr_matrix((np.array(data), np.array(indices), np.array(ptrs)), dtype=int) 65 | 66 | logging.info('[test] dump to {}...'.format(set_up.feats_folder_path + '4gr/4gr_test_csr.joblib')) 67 | joblib.dump(X, set_up.feats_folder_path + '4gr/4gr_test_csr.joblib') 68 | 69 | logging.info('Cleaning memory...') 70 | del X, data, indices, ptrs 71 | 72 | 73 | # Feature selection 74 | logging.info('Load train for feature selection...') 75 | 76 | import scipy.sparse as sp 77 | from sklearn.externals import joblib 78 | from sklearn.svm import LinearSVC 79 | 80 | trainLabels = pd.read_csv(set_up.train_labels_path) 81 | X_csr_tr = joblib.load(set_up.feats_folder_path + '4gr/4gr_train_csr.joblib') 82 | 83 | logging.info('Fit lsvc...') 84 | model = LinearSVC(penalty='l1',max_iter=20, dual=False, verbose=1) 85 | model.fit(X_csr_tr, trainLabels.Class.values) 86 | 87 | logging.info('Dump fitted model..') 88 | joblib.dump(model, set_up.tmp_path + '4gr_csr_model.joblib') 89 | 90 | 91 | # reduce train 92 | logging.info('Reduce train...') 93 | X_mini = model.transform(X_csr_tr) 94 | 95 | logging.info('Dump train...') 96 | joblib.dump(X_mini, set_up.feats_folder_path + '4gr/4gr_train_dim10k.joblib') 97 | del X_csr_tr 98 | 99 | 100 | # reduce test 101 | logging.info('Read test...') 102 | X_csr_te = joblib.load(set_up.feats_folder_path + '4gr/4gr_test_csr.joblib') 103 | 104 | logging.info('Reduce test...') 105 | X_mini = model.transform(X_csr_te) 106 | 107 | logging.info('Dump test...') 108 | joblib.dump(X_mini, set_up.feats_folder_path + '4gr/4gr_test_dim10k.joblib') 109 | 110 | logging.info('Done!') -------------------------------------------------------------------------------- /src/base-features-packer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import cPickle 6 | from sklearn.preprocessing import LabelBinarizer 7 | from sklearn.ensemble import RandomForestClassifier 8 | from sklearn.metrics import log_loss 9 | from 
sklearn.cross_validation import train_test_split 10 | from sklearn.externals import joblib 11 | import set_up 12 | feats_path = '../data/feats/' 13 | 14 | 15 | import logging 16 | reload(logging) 17 | logging.basicConfig(format = u'[%(asctime)s] %(message)s', level = logging.INFO) 18 | logging.info('[Base feature packer]') 19 | 20 | 21 | 22 | def calc_entropy(X): 23 | ent = np.zeros(len(X)) 24 | for i in range(len(X)): 25 | x = X[i]*1.0 / (sum(X[i]) + 0.00001) 26 | ent[i] = -np.sum(x*np.log(x+0.00000001)) 27 | return ent 28 | 29 | 30 | 31 | 32 | trainLabels = pd.read_csv(set_up.train_labels_path) 33 | sampleSubmission = pd.read_csv(set_up.test_sample_path) 34 | 35 | 36 | 37 | # sections features 38 | logging.info('Collect sections...') 39 | from sklearn.feature_extraction import DictVectorizer 40 | section_whitelist = set(['.bss', '.data', '.edata', '.idata', '.rdata', '.reloc', 41 | '.rsrc', '.text', '.tls', 'bss', 'code', 'data', 'header']) 42 | 43 | y_tr = np.zeros(len(trainLabels)) 44 | lines_tr = [] 45 | 46 | #for train 47 | for i,row in trainLabels.iterrows(): 48 | y_tr[i] = row['Class'] 49 | 50 | # lines 51 | feat = cPickle.load(open('{}sections_hist/{}'.format(feats_path, row['Id']), 'r')) 52 | del feat['sum'] 53 | whitened = {} 54 | for x in feat: 55 | if x in section_whitelist: 56 | whitened[x] = feat[x] 57 | lines_tr.append(whitened) 58 | 59 | 60 | #for test 61 | lines_te = [] 62 | for i,row in sampleSubmission.iterrows(): 63 | # lines 64 | feat = cPickle.load(open('{}sections_hist/{}'.format(feats_path, row['Id']), 'r')) 65 | del feat['sum'] 66 | whitened = {} 67 | for x in feat: 68 | if x in section_whitelist: 69 | whitened[x] = feat[x] 70 | lines_te.append(whitened) 71 | 72 | # convert to matrix 73 | dv = DictVectorizer(sparse=False) 74 | dv.fit(lines_tr) 75 | X_lines_tr = dv.transform(lines_tr) 76 | X_lines_te = dv.transform(lines_te) 77 | 78 | 79 | E_lines_tr = calc_entropy(X_lines_tr) 80 | E_lines_te = calc_entropy(X_lines_te) 81 | 82 | 83 | 84 | 85 | # filesize 86 | logging.info('Collect file sizes') 87 | import os 88 | # train 89 | X_sizes_tr = np.zeros([len(trainLabels), 2]) 90 | for i,row in trainLabels.iterrows(): 91 | fname = row['Id'] 92 | X_sizes_tr[i, 0] = os.path.getsize('{}{}.bytes'.format(set_up.train_folder_path, fname)) 93 | X_sizes_tr[i, 1] = os.path.getsize('{}{}.asm'.format(set_up.train_folder_path, fname)) 94 | size_ratio_tr = (X_sizes_tr[:, 0] *1.0 / X_sizes_tr[:, 1])[:, np.newaxis] 95 | 96 | #test 97 | X_sizes_te = np.zeros([len(sampleSubmission), 2]) 98 | for i,row in sampleSubmission.iterrows(): 99 | fname = row['Id'] 100 | X_sizes_te[i, 0] = os.path.getsize('{}{}.bytes'.format(set_up.test_folder_path, fname)) 101 | X_sizes_te[i, 1] = os.path.getsize('{}{}.asm'.format(set_up.test_folder_path, fname)) 102 | size_ratio_te = (X_sizes_te[:, 0] *1.0 / X_sizes_te[:, 1])[:, np.newaxis] 103 | 104 | 105 | 106 | # spectral asm 107 | logging.info('Collect spectral asm...') 108 | def read_file(filename): 109 | fin = open(filename, 'r') 110 | data = [] 111 | for line in fin: 112 | data.append(line.strip()) 113 | return data 114 | 115 | fnames = read_file('{}spectral_asm/fnames'.format(feats_path)) 116 | asm_dict = {} 117 | specter = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add', 118 | 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb'] 119 | for op in specter: 120 | values = read_file('{}spectral_asm/{}'.format(feats_path, op)) 121 | for fname, val in zip(fnames, values): 122 | asm_dict[fname] = 
asm_dict.get(fname, {})
123 |         asm_dict[fname][op] = val
124 | # train
125 | X_asm_tr = np.zeros((len(trainLabels), 22))
126 | for i, fname in enumerate(trainLabels.Id.values):
127 |     for j,op in enumerate(specter):
128 |         X_asm_tr[i,j] = asm_dict[fname][op]
129 | 
130 | E_asm_tr = calc_entropy(X_asm_tr)
131 | 
132 | # test
133 | X_asm_te = np.zeros((len(sampleSubmission), 22))
134 | for i, fname in enumerate(sampleSubmission.Id.values):
135 |     for j,op in enumerate(specter):
136 |         X_asm_te[i,j] = asm_dict[fname][op]
137 | 
138 | E_asm_te = calc_entropy(X_asm_te)
139 | 
140 | 
141 | 
142 | # line counts
143 | logging.info('Collect line counts...')
144 | fnames = read_file('{}spectral_asm/fnames'.format(feats_path))
145 | line_counts = read_file('{}spectral_asm/line_count'.format(feats_path))
146 | 
147 | line_dict = {}
148 | for i, fname in enumerate(fnames):
149 |     line_dict[fname] = line_counts[i]
150 | 
151 | # train
152 | X_lcounts_tr = np.zeros((len(trainLabels), 1))
153 | for i, fname in enumerate(trainLabels.Id.values):
154 |     X_lcounts_tr[i, 0] = line_dict[fname]
155 | 
156 | E_lcounts_tr = calc_entropy(X_lcounts_tr)
157 | 
158 | # test
159 | X_lcounts_te = np.zeros((len(sampleSubmission), 1))
160 | for i, fname in enumerate(sampleSubmission.Id.values):
161 |     X_lcounts_te[i, 0] = line_dict[fname]
162 | 
163 | E_lcounts_te = calc_entropy(X_lcounts_te)
164 | 
165 | 
166 | 
167 | # import calls
168 | logging.info('Collect calls...')
169 | def get_call_list(fname):
170 |     calls = []
171 |     lines = read_file(fname)
172 |     for line in lines:
173 |         calls.append(line.split('__stdcall')[1].split('(')[0].split('_')[0].strip())
174 |     return calls
175 | 
176 | # train
177 | call_txt_tr = []
178 | for i,row in trainLabels.iterrows():
179 |     call_txt_tr.append(' '.join(get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))
180 | 
181 | # test
182 | call_txt_te = []
183 | for i,row in sampleSubmission.iterrows():
184 |     call_txt_te.append(' '.join(get_call_list('{}stdcall_grepper/'.format(feats_path) + row['Id'])))
185 | 
186 | 
187 | logging.info('-> vectorizing...')
188 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
189 | vect = TfidfVectorizer(max_features=10000)
190 | vect.fit(call_txt_tr + call_txt_te)
191 | X_call_tr = vect.transform(call_txt_tr)
192 | X_call_te = vect.transform(call_txt_te)
193 | 
194 | 
195 | logging.info('-> apply NMF...')
196 | from sklearn.decomposition import TruncatedSVD, NMF
197 | from scipy import sparse
198 | 
199 | nmf = NMF(n_components=10, sparseness='data')
200 | nmf.fit(sparse.vstack([X_call_tr, X_call_te]))
201 | X_calls_nmf_tr = nmf.transform(X_call_tr)
202 | X_calls_nmf_te = nmf.transform(X_call_te)
203 | 
204 | 
205 | 
206 | # funcs
207 | 
208 | 
209 | logging.info('Collect FUNCs...')
210 | 
211 | def get_func_list(fname):
212 |     procs = []
213 |     lines = read_file(fname)
214 |     for line in lines:
215 |         line2 = line.split('FUNCTION')[1]
216 |         if 'PRESS' in line2:
217 |             procs.append(line2.split('PRESS')[0].strip().replace('.', ''))
218 |     return procs
219 | 
220 | func_txt_tr = []
221 | func_txt_te = []
222 | 
223 | for i,row in trainLabels.iterrows():
224 |     func_txt_tr.append(' '.join(get_func_list('{}func_grepper/'.format(feats_path) + row['Id'])))
225 | 
226 | for i,row in sampleSubmission.iterrows():
227 |     func_txt_te.append(' '.join(get_func_list('{}func_grepper/'.format(feats_path) + row['Id'])))
228 | 
229 | logging.info('-> vectorizing...')
230 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
231 | vect_func = 
TfidfVectorizer(max_features=10000) 232 | vect_func.fit(func_txt_tr + func_txt_te) 233 | 234 | X_func_tr = vect_func.transform(func_txt_tr) 235 | X_func_te = vect_func.transform(func_txt_te) 236 | 237 | logging.info('-> apply NMF...') 238 | nmf_func = NMF(n_components=10, sparseness='data') 239 | nmf_func.fit(sparse.vstack([X_func_tr, X_func_te])) 240 | X_func_nmf_tr = nmf_func.transform(X_func_tr) 241 | X_func_nmf_te = nmf_func.transform(X_func_te) 242 | 243 | 244 | 245 | # building 246 | logging.info('Build all together...') 247 | X_train_tr = np.hstack(( 248 | X_lines_tr, 249 | size_ratio_tr, 250 | X_asm_tr, 251 | X_sizes_tr, #! 252 | E_lines_tr[:, np.newaxis], #! 253 | X_calls_nmf_tr, 254 | X_func_nmf_tr 255 | )) 256 | 257 | X_train_te = np.hstack(( 258 | X_lines_te, 259 | size_ratio_te, 260 | X_asm_te, 261 | X_sizes_te, #! 262 | E_lines_te[:, np.newaxis], #! 263 | X_calls_nmf_te, 264 | X_func_nmf_te 265 | )) 266 | 267 | # dump 268 | joblib.dump((X_train_tr, X_train_te), '{}X_basepack'.format(feats_path)) 269 | logging.info('Done!') 270 | -------------------------------------------------------------------------------- /src/func_grepper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cPickle 3 | import glob 4 | import subprocess 5 | import os 6 | import set_up 7 | print '[Grepper for FUNCTION]' 8 | 9 | if sys.argv[1]=='train': 10 | in_dir = set_up.train_folder_path 11 | elif sys.argv[1]=='test': 12 | in_dir = set_up.test_folder_path 13 | else: 14 | print 'Unknown option' 15 | sys.exit() 16 | 17 | out_dir = set_up.feats_folder_path + 'func_grepper/' 18 | if not os.path.exists(out_dir): 19 | os.makedirs(out_dir) 20 | 21 | 22 | def worker(fname): 23 | # preparation 24 | subprocess.call('grep "FUNCTION" {}{}.asm > {}{}'.format(in_dir, fname, out_dir, fname), shell=True) 25 | 26 | 27 | raw_filenames = glob.glob(in_dir + '*.asm') 28 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames) 29 | 30 | from multiprocessing import Process 31 | def wrapper(fname_list): 32 | for fname in fname_list: 33 | worker(fname) 34 | 35 | nJobs = 15 36 | workers = [] 37 | for workerId in range(nJobs): 38 | p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]]) 39 | workers.append(p) 40 | p.start() 41 | for p in workers: 42 | p.join() 43 | print 'Done!' 
--------------------------------------------------------------------------------
/src/main.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # .asm-based features
3 | pypy sections_hist.py train
4 | pypy sections_hist.py test
5 | python spectral_asm.py train
6 | python spectral_asm.py test
7 | python stdcall_grepper.py train
8 | python stdcall_grepper.py test
9 | python func_grepper.py train
10 | python func_grepper.py test
11 | python base-features-packer.py
12 | # .bytes-based 4-gram features
13 | pypy ngramms_extractor.py train 4
14 | pypy ngramms_extractor.py test 4
15 | python 4gr-collect-stats.py
16 | pypy 4gr-freq-reducer.py
17 | python 4gr-svc.py
18 | python 4gr-rf.py
--------------------------------------------------------------------------------
/src/ngramms_extractor.py:
--------------------------------------------------------------------------------
1 | import glob
2 | from joblib import Parallel, delayed
3 | import set_up
4 | import pickle
5 | import os
6 | import sys
7 | print '[n-gram extractor]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path + '*.bytes'
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path + '*.bytes'
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | ng_order = int(sys.argv[2])
18 | out_dir = set_up.feats_folder_path + '{}gr/'.format(ng_order)
19 | if not os.path.exists(out_dir):
20 |     os.makedirs(out_dir)
21 | 
22 | files = glob.glob(in_dir)[::-1]
23 | 
24 | def get_dict():
25 |     d = {format(key, '02X'): key for key in range(256)}
26 |     d['??'] = 256
27 |     return d
28 | 
29 | def indexer4gr(tokens):
30 |     return tokens[0]*16974593 + tokens[1]*66049 + tokens[2]*257 + tokens[3]
31 | 
32 | 
33 | def count_4f(all_elem_codes,order):
34 |     counts_4g = {}
35 |     if order == 4:
36 |         indexer = indexer4gr
37 |     elif order == 10:
38 |         print "Order 10 not implemented"
39 |     else:
40 |         print 'Unsupported order'
41 |     # collect counts
42 |     for i in range(len(all_elem_codes)-order+1):
43 |         index = indexer(all_elem_codes[i:i+order])
44 |         counts_4g[index] = counts_4g.get(index,0)+1
45 |     # dump it!
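    # indexer4gr packs each 4-gram of byte tokens into one base-257 integer
    # (tokens run 0..256, with '??' mapped to 256):
    #   index = t0*257**3 + t1*257**2 + t2*257 + t3,
    # where 16974593 = 257**3 and 66049 = 257**2. Each file's histogram is
    # dumped as sparse parallel (ptr, vals) lists; after frequency-based
    # reduction (4gr-freq-reducer.py) these rows are stacked into one large
    # CSR matrix in 4gr-svc.py.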
46 | ptr = [] 47 | vals = [] 48 | for key in counts_4g: 49 | ptr.append(key) 50 | vals.append(counts_4g[key]) 51 | return (ptr, vals) 52 | 53 | def extract_4g (filename,order): 54 | convert_dict = get_dict() 55 | with open(filename,'r') as f: 56 | text = f.read() 57 | lines = text.split('\r\n') 58 | all_elems_codes = [] 59 | for l in lines: 60 | elems = l.split(' ') 61 | all_elems_codes.extend([convert_dict[x] for x in elems[1:]]) 62 | 63 | with open(out_dir + filename.split('/')[-1],'w') as f_dump: 64 | pickle.dump(count_4f(all_elems_codes,order),f_dump) 65 | 66 | #print files 67 | Parallel(n_jobs=-1)(delayed(extract_4g)(fi, ng_order) for fi in files) 68 | 69 | #pickle.dump(four_gr,open('../data/feats/%s/four_gr/4g' % what,'w')) 70 | #pickle.dump([x.split('/')[-1] for x in files],open('../data/feats/%s/four_gr/names' % what,'w')) -------------------------------------------------------------------------------- /src/sections_hist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cPickle 3 | import glob 4 | import set_up 5 | import os 6 | 7 | nJobs = set_up.nJobs 8 | print '[Section extraction script]' 9 | 10 | if sys.argv[1]=='train': 11 | in_dir = set_up.train_folder_path 12 | elif sys.argv[1]=='test': 13 | in_dir = set_up.test_folder_path 14 | else: 15 | print 'Unknown option' 16 | sys.exit() 17 | 18 | out_dir = set_up.feats_folder_path + 'sections_hist/' 19 | if not os.path.exists(out_dir): 20 | os.makedirs(out_dir) 21 | 22 | 23 | def worker(fname): 24 | # preparation 25 | stat = {} 26 | fin = open(in_dir + fname + '.asm', 'r') 27 | for line in fin: 28 | line_type = line.split(':')[0].lower() 29 | stat[line_type] = stat.get(line_type, 0) + 1 30 | stat['sum'] = stat.get('sum', 0) + 1 31 | cPickle.dump(stat, open(out_dir + fname, 'w')) 32 | 33 | 34 | raw_filenames = glob.glob(in_dir + '*.asm') 35 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames) 36 | 37 | 38 | from multiprocessing import Process 39 | def wrapper(fname_list): 40 | for fname in fname_list: 41 | worker(fname) 42 | 43 | 44 | workers = [] 45 | for workerId in range(nJobs): 46 | p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]]) 47 | workers.append(p) 48 | p.start() 49 | for p in workers: 50 | p.join() 51 | print 'Done!' 
52 | 
--------------------------------------------------------------------------------
/src/set_up.py:
--------------------------------------------------------------------------------
1 | feats_folder_path = '../data/feats/'
2 | train_folder_path = '../data/raw/train/'
3 | test_folder_path = '../data/raw/test/'
4 | 
5 | train_labels_path = '../data/raw/trainLabels.csv'
6 | test_sample_path = '../data/raw/sampleSubmission.csv'
7 | 
8 | submissions_dir_path = '../submissions/'
9 | tmp_path = '../tmp/'
10 | nJobs = 15
--------------------------------------------------------------------------------
/src/spectral_asm.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import cPickle
3 | import glob
4 | import subprocess
5 | import os
6 | import set_up
7 | print '[ASM specter extractor]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | out_dir = set_up.feats_folder_path + 'spectral_asm/'
18 | if not os.path.exists(out_dir):
19 |     os.makedirs(out_dir)
20 | 
21 | 
22 | def worker(fname):
23 |     # preparation
24 |     subprocess.call('echo "{}" >> {}{}'.format(fname, out_dir, 'fnames'), shell=True)
25 |     subprocess.call('cat {}{}.asm | wc -l >> {}{}'.format(in_dir, fname, out_dir, 'line_count'), shell=True)
26 | 
27 |     # 22 opcodes; must match the 'specter' list in base-features-packer.py
28 |     specter = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add',
29 |                'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb']
30 |     for op in specter:
31 |         subprocess.call('grep "\s{}\s" {}{}.asm | wc -l >> {}{}'.format(op, in_dir, fname, out_dir, op), shell=True)
32 | 
33 | 
34 | 
35 | raw_filenames = glob.glob(in_dir + '*.asm')
36 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames)
37 | for i, fname in enumerate(fnames):
38 |     worker(fname)
39 |     if i%200==0:
40 |         print i
41 | print 'Done!'
--------------------------------------------------------------------------------
/src/stdcall_grepper.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import cPickle
3 | import glob
4 | import subprocess
5 | import os
6 | import set_up
7 | print '[Grepper for __stdcall]'
8 | 
9 | if sys.argv[1]=='train':
10 |     in_dir = set_up.train_folder_path
11 | elif sys.argv[1]=='test':
12 |     in_dir = set_up.test_folder_path
13 | else:
14 |     print 'Unknown option'
15 |     sys.exit()
16 | 
17 | out_dir = set_up.feats_folder_path + 'stdcall_grepper/'
18 | if not os.path.exists(out_dir):
19 |     os.makedirs(out_dir)
20 | 
21 | 
22 | def worker(fname):
23 |     # preparation
24 |     subprocess.call('grep "__stdcall" {}{}.asm > {}{}'.format(in_dir, fname, out_dir, fname), shell=True)
25 | 
26 | 
27 | raw_filenames = glob.glob(in_dir + '*.asm')
28 | fnames = map(lambda x: x.split('/')[-1].split('.')[0], raw_filenames)
29 | 
30 | 
31 | from multiprocessing import Process
32 | def wrapper(fname_list):
33 |     for fname in fname_list:
34 |         worker(fname)
35 | 
36 | nJobs = set_up.nJobs
37 | workers = []
38 | for workerId in range(nJobs):
39 |     p = Process(target=wrapper, args=[[param for i, param in enumerate(fnames) if i % nJobs == workerId]])
40 |     workers.append(p)
41 |     p.start()
42 | for p in workers:
43 |     p.join()
44 | print 'Done!'
--------------------------------------------------------------------------------