├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── clip.gif
├── server
│   ├── Testing.ipynb
│   ├── app.py
│   ├── generate.py
│   ├── model.py
│   └── requirements.txt
└── webclient
    ├── html
    │   └── index.html
    ├── package-lock.json
    ├── package.json
    ├── scripts
    │   ├── src
    │   │   ├── im-receiver.tsx
    │   │   ├── im-sender.tsx
    │   │   ├── index.ts
    │   │   ├── page.tsx
    │   │   └── tester.tsx
    │   └── tsconfig.json
    ├── styles
    │   └── src
    │       └── index.styl
    └── webpack.config.js
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 |
3 | corpus/*
4 | database/*
5 |
6 | webclient/*/bin
7 | webclient/node_modules
8 |
9 | server/*.pyc
10 |
11 | .ipynb_checkpoints
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2010-2017 Google, Inc. http://angularjs.org
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: server static install db download init
2 |
3 | server:
4 | cd server && python app.py
5 |
6 | static:
7 | npm run dev
8 |
9 | install:
10 | pip install -r server/requirements.txt
11 | cd webclient && npm install
12 |
13 | db:
14 | mkdir -p database
15 | echo '{}' > database/db.json
16 |
17 | download:
18 | mkdir -p corpus
19 | wget -P corpus https://raw.githubusercontent.com/IlyaSemenov/wikipedia-word-frequency/master/results/enwiki-20150602-words-frequency.txt
20 | wget -P corpus http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
21 | cd corpus && unzip glove.6B.zip
22 |
23 | init: db download install
24 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # word2vec-spam-filter
2 |
3 | This project was built during the 2017 [Kik](https://github.com/kikinteractive/) hackathon.
4 |
5 | In this project we demonstrate a way to classify spam messages on the client while protecting user privacy.
6 |
7 | A client generates a "hash" from the message and sends it to the server. The server then compares the "hash" against a bank of known reported messages.
8 |
9 | The bank of known reported messages is built from spam reports. The server compares each reported message to the existing bank; if it is similar to a previously reported message, that message's report count is incremented, otherwise it is added to the bank with a count of 1.
10 |
11 | A message in the bank of reported messages is considered spam once it has been reported more than 3 times.
12 |
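Concretely, the round trip looks roughly like this. Below is a minimal Python sketch against the endpoints exposed by `server/app.py` (`/words/list`, `/words/vector`, `/spam/detect`); the helper names are ours, and the real web client implements the same idea in TypeScript:

```python
import random
import requests

BASE_URL = 'http://localhost:5000'  # the Flask server from server/app.py


def fetch_word_indexes():
    """Download the word list once; a word's position in the list is its id."""
    words = requests.get(BASE_URL + '/words/list').text
    return {word: i for i, word in enumerate(words.split('\n'))}


def private_message_vector(message, word_indexes, n_decoys=20):
    """Sum a message's word vectors without revealing which words it contains."""
    real = [word_indexes[w] for w in message.lower().split() if w in word_indexes]
    # decoy ids mask which words actually belong to the message
    decoys = [random.randrange(len(word_indexes)) for _ in range(n_decoys)]
    mixed = real + decoys
    random.shuffle(mixed)  # the server only ever sees this shuffled mix of ids
    resp = requests.get(BASE_URL + '/words/vector',
                        params={'ids': ','.join(str(i) for i in mixed)}).json()
    vectors = {int(k): v['vector'] for k, v in resp['words'].items()}
    # the sum over the real words happens client side
    return [sum(xs) for xs in zip(*(vectors[i] for i in real))]


def check_spam(vector):
    """Ask the server whether the vector is close to a reported message."""
    data = requests.get(BASE_URL + '/spam/detect',
                        params={'vector': ','.join(str(x) for x in vector)}).json()
    return data['spam'], data['confidence']
```
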
13 | ## Preview
14 |
15 | ![preview](clip.gif)
16 |
17 | ## Corpus downloads
18 | We used two datasets for creating the sentence vectors:
19 | 1. word vectors taken from: https://github.com/stanfordnlp/GloVe
20 | 2. word frequencies from: https://github.com/IlyaSemenov/wikipedia-word-frequency/blob/master/results/enwiki-20150602-words-frequency.txt
21 |
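Each line of the GloVe file is simply a word followed by the components of its vector, which is what `server/app.py` parses at startup (the numbers below are illustrative):

```
the -0.038 0.171 0.463 ...
camera 0.251 -0.414 0.092 ...
```

One word plus 300 floats per line for the `glove.6B.300d.txt` file the server loads.
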
22 | ## Configurable parameters (Hyper-Parameters)
23 | We played around with a few configurations to get the best results for short user messages:
24 |
25 | * Confidence Threshold - a number between 0.0 and 1.0 that determines when two messages are considered the same
26 | * Distance Function - we used the vector dot product (equivalent to cosine similarity here, since the vectors are normalized)
27 | * Normalization - how to deal with words missing from our corpus, punctuation marks and non-English words
28 | * Vector Size - longer vectors give higher accuracy but use more memory
29 | * Weight Function - given a word's frequency, how to derive its vector weight (`the` should weigh less than `camera`); see the sketch after this list
30 | * Custom Corpus - creating the word vectors and frequencies from real user message data might yield better results
31 | * Random Indices - how many random indices should the client send to the server to mask the original message indices
32 |
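The weight function we settled on is the one from `server/app.py` (after the formula on the [word lists by frequency](https://www.wikiwand.com/en/Word_lists_by_frequency) page); words missing from the frequency list fall back to a fixed default weight:

```python
from math import log


def freq_to_weight(freq, max_freq):
    # max_freq is the count of the most common word, so freq/max_freq <= 1
    # and its log2 is <= 0: the rarer the word, the larger the weight
    return 0.5 - log(float(freq) / max_freq, 2)
```
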
33 | ## Running the code
34 | This project includes a single Makefile to help with initialization, dependency installation and corpus download.
35 | You can set everything up (database, corpus and dependencies) by running:
36 |
37 | ```
38 | make init
39 | ```
40 |
41 | Or you can manually run the server and client apps:
42 |
43 | ### server
44 | In the `server` directory, install the pip dependencies (ideally inside a `virtualenv`):
45 |
46 | ```
47 | pip install -r requirements.txt
48 | ```
49 |
50 | and run the server:
51 | ```
52 | python app.py
53 | ```
54 |
55 | ### web client
56 | To use the web client, go into the `webclient` directory and run:
57 | ```
58 | npm install
59 | npm run dev
60 | ```
61 |
62 | That should install all dependencies and start the dev server. If everything works you should see something like:
63 | > Project is running at http://localhost:3333/
64 | > webpack output is served from /
65 |
66 | Now load http://localhost:3333/ in your browser.
67 |
68 | There are three different "view modes" which can be switched using the select box at the top right corner of the page.
69 | The three views are:
70 | * Standalone Tester: A textarea in which one can input a message and then either report it as spam or check whether it is classified as spam.
71 | * IM Sender: A textarea in which the user can input a message (or select a message from a bunch of existing ones) and then "send" the message to another client.
72 | * IM Receiver: A view which displays the list of messages received (via the `IM Sender`) and allows reporting each message as spam.
73 |
--------------------------------------------------------------------------------
/clip.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doodyparizada/word2vec-spam-filter/695898b7e7a3091947626f12fc31c9be4bb03784/clip.gif
--------------------------------------------------------------------------------
/server/Testing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 142,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import requests\n",
10 | "BASE_URL = 'http://localhost:5000'\n",
11 | "\n",
12 | "def report(msg):\n",
13 | " requests.post(BASE_URL + '/spam/report',\n",
14 | " json={'message': msg}\n",
15 | " ).ok\n",
16 | "\n",
17 | "def is_spam(vector):\n",
18 | " data = requests.get(BASE_URL + '/spam/detect',\n",
19 | " params={'vector': ','.join(str(v) for v in vector)}\n",
20 | " ).json()\n",
21 | " return data['spam'], data['confidence']\n",
22 | "\n",
23 | "def get_word_vectors(indexes):\n",
24 | " data = requests.get(BASE_URL + '/words/vector',\n",
25 | " params={'ids': ','.join(str(i) for i in indexes)}).json()\n",
26 | " return {int(k): v['vector'] for k,v in data['words'].items()}\n",
27 | "\n",
28 | "def get_word_indexes():\n",
29 | " \"\"\"return a dict of word to index.\"\"\"\n",
30 | " content = requests.get(BASE_URL + '/words/list').content\n",
31 | " return {word: i for i, word in enumerate(content.split('\\n'))}"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 143,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "WORD_INDEXES = get_word_indexes()\n",
41 | "\n",
42 | "\n",
43 | "def calc_message(message):\n",
44 | " import random\n",
45 | " import numpy as np\n",
46 | " \n",
47 | " message = message.lower()\n",
48 | " \n",
49 | " indexes = [WORD_INDEXES[word] \n",
50 | " for word in message.split(' ')\n",
51 | " if word in WORD_INDEXES]\n",
52 | " fakes = [random.randint(0, len(WORD_INDEXES)) for _ in range(20)]\n",
53 | " \n",
54 | " shuffled = indexes + fakes\n",
55 | " random.shuffle(shuffled)\n",
56 | " vectors = get_word_vectors(shuffled)\n",
57 | " \n",
58 | " vec_sum = sum(np.array(vectors[i]) for i in indexes).tolist()\n",
59 | "\n",
60 | " return vec_sum"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 144,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "def is_spam_message(msg):\n",
70 | " return is_spam(calc_message(msg))"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 145,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "spams = [\n",
80 | " 'Hi are you looking for a sexy cool woman to spend time ,with well I,m lookin for a gentleman who still khowns how to treat a lady, I am here to u.http . G.lovendate.pw code: 605',\n",
81 | " 'I offer you an exchange, you free register on my site http:\\/\\/rachelmel.pro\\nConfirmation email and show me the screenshot. \\nAfter that I\\'ll send you my nude pics',\n",
82 | " '''Hi, do I know u? you just showed up in my kik hmm.. my friends warned me that there are many fake accounts and bots here, no offense, are u a real person? If you are a real person, you won't have any trouble liking my pic, will you;)? the one where I'm wearing a white swimming suit. This way I'll be convinced that you are real''',\n",
83 | " '''This is the first time I will be playing naked on a webcam. Come see my naked body. Reg here, plz:* http:\\/\\/u.to\\/VgdTEA It is completely free'''\n",
84 | " '''go here, click the create private account.. you might have to create a username before you can see me live''',\n",
85 | "]\n",
86 | "\n",
87 | "good = [\n",
88 | " 'Hi I am your mother and I love you',\n",
89 | " 'Did you watch the movie last night?',\n",
90 | " 'Whats up with all the weird clouds?',\n",
91 | "]"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 146,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "# register spam messages\n",
101 | "for msg in spams:\n",
102 | " for i in range(4):\n",
103 | " report(msg)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 115,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "(False, 0.8641516840020381)"
115 | ]
116 | },
117 | "execution_count": 115,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "is_spam_message(good[2])"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 117,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/plain": [
134 | "(True, 1.0)"
135 | ]
136 | },
137 | "execution_count": 117,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "is_spam_message(spams[1])"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 129,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "(False, 0.6815927618764492)"
155 | ]
156 | },
157 | "execution_count": 129,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "is_spam_message('Hey Doody What is up?')"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 141,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "['Hi', 'do', 'I', 'know', 'u', 'you', 'just', 'showed', 'up', 'in', 'my', 'kik', 'hmm', 'my', 'friends', 'warned', 'me', 'that', 'there', 'are', 'many', 'fake', 'accounts', 'and', 'bots', 'here', 'no', 'offense', 'are', 'u', 'a', 'real', 'person', 'If', 'you', 'are', 'a', 'real', 'person', 'you', \"won't\", 'have', 'any', 'trouble', 'liking', 'my', 'pic', 'will', 'you', 'the', 'one', 'where', \"I'm\", 'wearing', 'a', 'white', 'swimming', 'suit', 'This', 'way', \"I'll\", 'be', 'convinced', 'that', 'you', 'are', 'real']\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | ">>> import re\n",
181 | ">>> string = spams[2]\n",
182 | ">>> pattern = re.compile(\"^\\s+|\\s*[, \\.\\?;\\(\\)]\\s*|\\s+$\")\n",
183 | ">>> print([x for x in pattern.split(string) if x])"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": []
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 2",
197 | "language": "python",
198 | "name": "python2"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 2
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython2",
210 | "version": "2.7.14"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 2
215 | }
216 |
--------------------------------------------------------------------------------
/server/app.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | from math import log
4 |
5 | from flask import request, Flask, jsonify
6 | import numpy as np
7 |
8 | from model import DB
9 | from generate import generate_matrix, normalize_matrix, normalize_vector
10 | from flask_cors import CORS
11 |
12 |
13 | app = Flask(__name__)
14 | CORS(app)
15 |
16 | GLOVE = '../corpus/glove.6B.300d.txt'
17 | FREQ = '../corpus/enwiki-20150602-words-frequency.txt'
18 |
19 | iweights = {}
20 | vocab = {}
21 | ivocab = {}
22 | WORD_LIST = ''
23 | W_norm = None
24 | messages = []
25 |
26 | EPSILON = 0.95
27 | DEFAULT_WEIGHT = 15
28 |
29 |
30 | def init():
31 | """read glove file and generate a word matrix"""
32 | global W_norm, WORD_LIST, vocab, ivocab, iweights
33 | sys.stderr.write('initializing word vectors')
34 | word_vectors = []
35 |
36 | # open and parse word vector file
37 | with open(GLOVE, 'r') as f:
38 | for i, line in enumerate(f):
39 | vals = line.rstrip().split(' ')
40 | vector = [float(x) for x in vals[1:]]
41 | word = vals[0]
42 | word_vectors.append((word, vector))
43 | if i % 10000 == 0:
44 | sys.stderr.write('.')
45 |
46 | WORD_LIST += '\n'.join(w for w, _ in word_vectors)
47 | W, vocab, ivocab = generate_matrix(word_vectors)
48 | W_norm = normalize_matrix(W)
49 |
50 | sys.stderr.write('\ninitializing word weights')
51 | max_freq = None
52 | with open(FREQ, 'r') as f:
53 | for i, line in enumerate(f):
54 | vals = line.rstrip().split(' ')
55 | word = vals[0]
56 | freq = int(vals[1])
57 | max_freq = max_freq or freq # the first iteration will set max_freq. The first line is the highest freq
58 | if word in vocab:
59 | iweights[vocab[word]] = freq_to_weight(freq, max_freq)
60 | if i % 10000 == 0:
61 | sys.stderr.write('.')
62 |
63 | sys.stderr.write('\ndone!\n')
64 |
65 |
66 | def get_vector(idx):
67 | """return the weighted vector for an index."""
68 | return (W_norm[idx, :] * iweights.get(idx, DEFAULT_WEIGHT))
69 |
70 |
71 | def freq_to_weight(freq, max_freq):
72 | """calculate a vector weight for a frequency."""
73 | # taken from https://www.wikiwand.com/en/Word_lists_by_frequency
74 | return 0.5 - log(float(freq)/max_freq, 2)
75 |
76 |
77 | def generate_spam_matrix(report_threshold):
78 | """
79 | put all known spam vectors in a matrix
80 | """
81 | db = DB.load()
82 | word_vectors = [(word, rm.vector)
83 | for word, rm in db.reported_messages.items()
84 | if rm.reports >= report_threshold]
85 | return generate_matrix(word_vectors)
86 |
87 |
88 | def closest_spam(vector, report_threshold=3):
89 | """given a vector, return the closest spam messages and distance."""
90 | W, vocab, ivocab = generate_spam_matrix(report_threshold=report_threshold)
91 |
92 | if not vocab: # means empty db
93 | return '', 0
94 |
95 | vector = normalize_vector(vector)
96 |
97 | dist = np.dot(W, vector.T)
98 |
99 | a = np.argsort(-dist)[:1]  # index of the single closest spam message
100 | for x in a:
101 | return ivocab[x], float(dist[x])
102 |
103 | return '', 0
104 |
105 | def tokenize_message(message):
106 | """return a list of normalized words."""
107 | return (message
108 | .lower()
109 | .replace(".", " .")
110 | .replace(",", " ,")
111 | .replace("?", " ?")
112 | .replace("!", " !")
113 | .replace(":", " :")
114 | .replace("'s", " 's")
115 | .split())
116 |
117 |
118 | def message_to_vector(message):
119 | """sums up all known vectors of a given message."""
120 | vector = np.zeros(W_norm[0, :].shape)
121 | for term in tokenize_message(message):
122 | if term in vocab:
123 | vector += get_vector(vocab[term])
124 | return vector
125 |
126 |
127 | @app.route('/words/list')
128 | def word_list():
129 | """return word list. ordered by indexes."""
130 | return WORD_LIST
131 |
132 |
133 | @app.route('/words/vector')
134 | def word_vectors():
135 | """retrun vectors for the words by given ids."""
136 | ids = {int(i) for i in request.args['ids'].split(',')}
137 |
138 | return jsonify({'words':
139 | {i: {'vector': get_vector(i).tolist()}
140 | for i in ids}})
141 |
142 |
143 | @app.route('/spam/detect')
144 | def detect_spam():
145 | """the given vector should not be normalized. normalization happens on server."""
146 | vector = [float(i) for i in request.args['vector'].split(',')]
147 | msg, dist = closest_spam(vector)
148 | is_spam = dist > EPSILON
149 | return jsonify({'spam': is_spam,
150 | 'confidence': dist,
151 | 'meta': msg})
152 |
153 |
154 | @app.route('/spam/report', methods=['POST'])
155 | def report_spam():
156 | """if spam message already exists or is close to a known message add a report count. else add as new entry in db."""
157 | data = request.get_json()
158 | reported_message = data['message']
159 | vector = message_to_vector(reported_message)
160 |
161 | similar_msg, dist = closest_spam(vector, 0)
162 |
163 | db = DB.load()
164 | if dist > EPSILON:
165 | db.reported_messages[similar_msg].reports += 1
166 | else:
167 | db.add_new_message(reported_message, normalize_vector(vector).tolist())
168 |
169 | db.save()
170 | return jsonify({})
171 |
172 | @app.route('/messages', methods=['POST', 'GET'])
173 | def message_handler():
174 | global messages
175 | if request.method == 'POST':
176 | messages.append(request.get_json()['message'])
177 | return jsonify({})
178 | else:
179 | if messages:
180 | return jsonify({'message': messages.pop(0)})
181 | return jsonify({})
182 |
183 |
184 | if __name__ == '__main__':
185 | init()
186 | app.run()
187 |
--------------------------------------------------------------------------------
/server/generate.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def normalize_vector(vector):
5 | """scale a vector to unit length (L2 norm)."""
6 | vector = np.array(vector)
7 | d = np.sum(vector ** 2) ** 0.5
8 | vec_norm = (vector.T / d).T
9 | return vec_norm
10 |
11 |
12 | # TODO: see if we can unify vector and matrix normalization
13 | def normalize_matrix(W):
14 | """normalize each word vector (row) to unit length."""
15 | # divide each row by its L2 norm
16 | d = np.sum(W ** 2, 1) ** 0.5
17 | W_norm = (W.T / d).T
18 | return W_norm
19 |
20 |
21 | def generate_matrix(word_vectors):
22 | """given a list of word,vector pairs generate matrix and vocab dicts"""
23 | vectors = dict(word_vectors)
24 | words = [w for w, _ in word_vectors]
25 |
26 | vocab = {w: idx for idx, w in enumerate(words)}
27 | ivocab = {idx: w for idx, w in enumerate(words)}
28 |
29 | vocab_size = len(vectors)
30 | vector_dim = len(next(iter(vectors.values()))) if vectors else 0
31 | W = np.zeros((vocab_size, vector_dim))
32 | for word, v in vectors.items():
33 | W[vocab[word], :] = v
34 | return W, vocab, ivocab
35 |
--------------------------------------------------------------------------------
/server/model.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from schematics.models import Model
4 | from schematics.types import ListType, IntType, FloatType, ModelType, DictType
5 |
6 |
7 | FILENAME = '../database/db.json'
8 |
9 |
10 | class DB(Model):
11 | class ReportedMessage(Model):
12 | reports = IntType()
13 | vector = ListType(FloatType)
14 |
15 | reported_messages = DictType(ModelType(ReportedMessage), default=dict)
16 |
17 | def add_new_message(self, reported_message, vector):
18 | rm = self.ReportedMessage()
19 | rm.reports = 1
20 | rm.vector = vector
21 | self.reported_messages[reported_message] = rm
22 |
23 | @classmethod
24 | def load(cls):
25 | with open(FILENAME, 'r') as f:
26 | return cls(json.load(f))
27 |
28 | def save(self):
29 | string = json.dumps(self.to_primitive())
30 | with open(FILENAME, 'w') as f:
31 | f.write(string)
32 |
--------------------------------------------------------------------------------
/server/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==0.12.2
2 | Flask-Cors==3.0.3
3 | requests==2.18.4
4 | schematics==2.0.1
5 | numpy==1.13.3
6 |
--------------------------------------------------------------------------------
/webclient/html/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |