├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── clip.gif ├── server ├── Testing.ipynb ├── app.py ├── generate.py ├── model.py └── requirements.txt └── webclient ├── html └── index.html ├── package-lock.json ├── package.json ├── scripts ├── src │ ├── im-receiver.tsx │ ├── im-sender.tsx │ ├── index.ts │ ├── page.tsx │ └── tester.tsx └── tsconfig.json ├── styles └── src │ └── index.styl └── webpack.config.js /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 3 | corpus/* 4 | database/* 5 | 6 | webclient/*/bin 7 | webclient/node_modules 8 | 9 | server/*.pyc 10 | 11 | .ipynb_checkpoints 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2010-2017 Google, Inc. http://angularjs.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
.PHONY: server static install db download init

# Start the Flask API. app.py opens ../corpus/... and ../database/... so it
# has to be launched from inside server/.
server:
	cd server && python app.py

# Build and serve the web client. package.json (and its "dev" script) lives in
# webclient/, so npm must be run from there.
static:
	cd webclient && npm run dev

# Install the Python and npm dependencies.
install:
	pip install -r server/requirements.txt
	cd webclient && npm install

# Create an empty report database (overwrites an existing one).
db:
	mkdir -p database
	echo '{}' > database/db.json

# Fetch both corpora. Each file has its own rule, so a file that is already
# present is not downloaded again.
download: corpus/glove.6B.zip corpus/enwiki-20150602-words-frequency.txt

# Use the raw file URL: the github.com/.../blob/... address returns an HTML page.
corpus/enwiki-20150602-words-frequency.txt:
	mkdir -p corpus
	wget -P corpus https://raw.githubusercontent.com/IlyaSemenov/wikipedia-word-frequency/master/results/enwiki-20150602-words-frequency.txt

# -P (directory prefix), not -p (page requisites): the archive must land in
# corpus/ for the unzip step to find it.
corpus/glove.6B.zip:
	mkdir -p corpus
	wget -P corpus http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
	cd corpus && unzip glove.6B.zip

init: db download install
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # word2vec-spam-filter 2 | 3 | This is a project done during the [Kik](https://github.com/kikinteractive/) hackathon 2017. 4 | 5 | In this project we demonstrate a way to classify spam messages on the client while protecting user privacy. 6 | 7 | A client generates a "hash" from the message sending it to the server. The server then compares the "hash" to a bank of known reported messages. 8 | 9 | The bank of known reported messages is created from spam reports. The server compares a given reported message to the previous bank of reported messages. If the message is similar to a previously reported message, a report count is incremented. 
Otherwise the message is added to the bank with a count of 1. 10 | 11 | A message in the bank of reported messages is considered a spam message once it has been reported at least 3 times. 12 | 13 | ## Preview 14 | 15 | 16 | 17 | ## Corpus downloads 18 | We used 2 datasets for creating sentence vectors: 19 | 1. word vectors taken from: https://github.com/stanfordnlp/GloVe 20 | 2. word frequencies from: https://github.com/IlyaSemenov/wikipedia-word-frequency/blob/master/results/enwiki-20150602-words-frequency.txt 21 | 22 | ## Configurable parameters (Hyper-Parameters) 23 | We played around with a few configurations to get the best results for short user messages: 24 | 25 | * Confidence Threshold - a number between 0.0 and 1.0 above which 2 messages are considered the same 26 | * Distance Function - we used vector dot product 27 | * Normalization - how to deal with words we don't have in our corpus, punctuation marks, non-English words 28 | * Vector Size - the longer the vector the higher the accuracy but heavier in memory 29 | * Weight Function - given a word frequency, how to create the vector weights (`the` should weigh less than `camera`) 30 | * Custom Corpus - creating the word vectors and frequencies from real user message data might yield better results 31 | * Random Indices - how many random indices should the client send to the server to mask the original message indices 32 | 33 | ## Running the code 34 | This project includes a single makefile to help with the initialization, dependency installation and corpus download. 
35 | You can run the whole initialization (database file, corpus download and dependency installation) with: 36 | 37 | ``` 38 | make init 39 | ``` 40 | 41 | Or you can manually install and run the server and client apps: 42 | 43 | ### server 44 | In the `server` directory install the pip dependencies in a `virtualenv`: 45 | 46 | ``` 47 | pip install -r requirements.txt 48 | ``` 49 | 50 | and run the server: 51 | ``` 52 | python app.py 53 | ``` 54 | 55 | ### web client 56 | To use the web client go into the `webclient` directory in your terminal and then: 57 | ``` 58 | npm install 59 | npm run dev 60 | ``` 61 | 62 | That should install all dependencies and kick-start the project. If it all works you should see something like: 63 | > Project is running at http://localhost:3333/ 64 | > webpack output is served from / 65 | 66 | Now load http://localhost:3333/ in your browser 67 | 68 | There are 3 different "view modes" which can be switched using the select box at the top right corner of the page. 69 | The 3 views are: 70 | * Standalone Tester: A textarea in which one can input a message and then either report it as spam or check whether it is classified as spam. 71 | * IM Sender: A textarea in which the user can input a message (or select a message from a bunch of existing ones) and then "send" the message to another client. 72 | * IM Receiver: A view which displays a list of messages received from the `IM Sender` and the ability to report each message as spam. 
73 | -------------------------------------------------------------------------------- /clip.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doodyparizada/word2vec-spam-filter/695898b7e7a3091947626f12fc31c9be4bb03784/clip.gif -------------------------------------------------------------------------------- /server/Testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 142, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests\n", 10 | "BASE_URL = 'http://localhost:5000'\n", 11 | "\n", 12 | "def report(msg):\n", 13 | " requests.post(BASE_URL + '/spam/report',\n", 14 | " json={'message': msg}\n", 15 | " ).ok\n", 16 | "\n", 17 | "def is_spam(vector):\n", 18 | " data = requests.get(BASE_URL + '/spam/detect',\n", 19 | " params={'vector': ','.join(str(v) for v in vector)}\n", 20 | " ).json()\n", 21 | " return data['spam'], data['confidence']\n", 22 | "\n", 23 | "def get_word_vectors(indexes):\n", 24 | " data = requests.get(BASE_URL + '/words/vector',\n", 25 | " params={'ids': ','.join(str(i) for i in indexes)}).json()\n", 26 | " return {int(k): v['vector'] for k,v in data['words'].items()}\n", 27 | "\n", 28 | "def get_word_indexes():\n", 29 | " \"\"\"return a dict of word to index.\"\"\"\n", 30 | " content = requests.get(BASE_URL + '/words/list').content\n", 31 | " return {word: i for i, word in enumerate(content.split('\\n'))}" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 143, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "WORD_INDEXES = get_word_indexes()\n", 41 | "\n", 42 | "\n", 43 | "def calc_message(message):\n", 44 | " import random\n", 45 | " import numpy as np\n", 46 | " \n", 47 | " message = message.lower()\n", 48 | " \n", 49 | " indexes = [WORD_INDEXES[word] \n", 50 | " for word in message.split(' ')\n", 51 | " 
if word in WORD_INDEXES]\n", 52 | " fakes = [random.randint(0, len(WORD_INDEXES)) for _ in range(20)]\n", 53 | " \n", 54 | " shuffled = indexes + fakes\n", 55 | " random.shuffle(shuffled)\n", 56 | " vectors = get_word_vectors(shuffled)\n", 57 | " \n", 58 | " vec_sum = sum(np.array(vectors[i]) for i in indexes).tolist()\n", 59 | "\n", 60 | " return vec_sum" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 144, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def is_spam_message(msg):\n", 70 | " return is_spam(calc_message(msg))" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 145, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "spams = [\n", 80 | " 'Hi are you looking for a sexy cool woman to spend time ,with well I,m lookin for a gentleman who still khowns how to treat a lady, I am here to u.http . G.lovendate.pw code: 605',\n", 81 | " 'I offer you an exchange, you free register on my site http:\\/\\/rachelmel.pro\\nConfirmation email and show me the screenshot. \\nAfter that I\\'ll send you my nude pics',\n", 82 | " '''Hi, do I know u? you just showed up in my kik hmm.. my friends warned me that there are many fake accounts and bots here, no offense, are u a real person? If you are a real person, you won't have any trouble liking my pic, will you;)? the one where I'm wearing a white swimming suit. This way I'll be convinced that you are real''',\n", 83 | " '''This is the first time I will be playing naked on a webcam. Come see my naked body. Reg here, plz:* http:\\/\\/u.to\\/VgdTEA It is completely free'''\n", 84 | " '''go here, click the create private account.. 
you might have to create a username before you can see me live''',\n", 85 | "]\n", 86 | "\n", 87 | "good = [\n", 88 | " 'Hi I am your mother and I love you',\n", 89 | " 'Did you watch the movie last night?',\n", 90 | " 'Whats up with all the weird clouds?',\n", 91 | "]" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 146, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# register spam messages\n", 101 | "for msg in spams:\n", 102 | " for i in range(4):\n", 103 | " report(msg)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 115, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "(False, 0.8641516840020381)" 115 | ] 116 | }, 117 | "execution_count": 115, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "is_spam_message(good[2])" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 117, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "(True, 1.0)" 135 | ] 136 | }, 137 | "execution_count": 117, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "is_spam_message(spams[1])" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 129, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "(False, 0.6815927618764492)" 155 | ] 156 | }, 157 | "execution_count": 129, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "is_spam_message('Hey Doody What is up?')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 141, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "['Hi', 'do', 'I', 'know', 'u', 'you', 'just', 'showed', 'up', 'in', 'my', 'kik', 'hmm', 'my', 'friends', 'warned', 'me', 
'that', 'there', 'are', 'many', 'fake', 'accounts', 'and', 'bots', 'here', 'no', 'offense', 'are', 'u', 'a', 'real', 'person', 'If', 'you', 'are', 'a', 'real', 'person', 'you', \"won't\", 'have', 'any', 'trouble', 'liking', 'my', 'pic', 'will', 'you', 'the', 'one', 'where', \"I'm\", 'wearing', 'a', 'white', 'swimming', 'suit', 'This', 'way', \"I'll\", 'be', 'convinced', 'that', 'you', 'are', 'real']\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | ">>> import re\n", 181 | ">>> string = spams[2]\n", 182 | ">>> pattern = re.compile(\"^\\s+|\\s*[, \\.\\?;\\(\\)]\\s*|\\s+$\")\n", 183 | ">>> print([x for x in pattern.split(string) if x])" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 2", 197 | "language": "python", 198 | "name": "python2" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 2 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython2", 210 | "version": "2.7.14" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 2 215 | } 216 | -------------------------------------------------------------------------------- /server/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from math import log 4 | 5 | from flask import request, Flask, jsonify 6 | import numpy as np 7 | 8 | from model import DB 9 | from generate import generate_matrix, normalize_matrix, normalize_vector 10 | from flask_cors import CORS 11 | 12 | 13 | app = Flask(__name__) 14 | CORS(app) 15 | 16 | GLOVE = '../corpus/glove.6B.300d.txt' 17 | FREQ = '../corpus/enwiki-20150602-words-frequency.txt' 18 | 19 | iweights = {} 20 | vocab = {} 21 | ivocab = {} 22 | WORD_LIST = '' 
23 | W_norm = None 24 | messages = [] 25 | 26 | EPSILON = 0.95 27 | DEFAULT_WEIGHT = 15 28 | 29 | 30 | def init(): 31 | """read glove file and generate a word matrix""" 32 | global W_norm, WORD_LIST, vocab, ivocab, iweights 33 | sys.stderr.write('initializing word vectors') 34 | word_vectors = [] 35 | 36 | # open and parse word vector file 37 | with open(GLOVE, 'r') as f: 38 | for i, line in enumerate(f): 39 | vals = line.rstrip().split(' ') 40 | vector = [float(x) for x in vals[1:]] 41 | word = vals[0] 42 | word_vectors.append((word, vector)) 43 | if i % 10000 == 0: 44 | sys.stderr.write('.') 45 | 46 | WORD_LIST += '\n'.join(w for w, _ in word_vectors) 47 | W, vocab, ivocab = generate_matrix(word_vectors) 48 | W_norm = normalize_matrix(W) 49 | 50 | sys.stderr.write('\ninitializing word weights') 51 | max_freq = None 52 | with open(FREQ, 'r') as f: 53 | for i, line in enumerate(f): 54 | vals = line.rstrip().split(' ') 55 | word = vals[0] 56 | freq = int(vals[1]) 57 | max_freq = max_freq or freq # the first iteration will set max_freq. 
The first line is the highest freq 58 | if word in vocab: 59 | iweights[vocab[word]] = freq_to_weight(freq, max_freq) 60 | if i % 10000 == 0: 61 | sys.stderr.write('.') 62 | 63 | sys.stderr.write('\ndone!\n') 64 | 65 | 66 | def get_vector(idx): 67 | """return the weighted vector for an index.""" 68 | return (W_norm[idx, :] * iweights.get(idx, DEFAULT_WEIGHT)) 69 | 70 | 71 | def freq_to_weight(freq, max_freq): 72 | """calculate a vector weight for a frequency.""" 73 | # taken from https://www.wikiwand.com/en/Word_lists_by_frequency 74 | return 0.5 - log(float(freq)/max_freq, 2) 75 | 76 | 77 | def generate_spam_matrix(report_threashold): 78 | """ 79 | put all known spam vectors in a matrix 80 | """ 81 | db = DB.load() 82 | word_vectors = [(word, rm.vector) 83 | for word, rm in db.reported_messages.items() 84 | if rm.reports >= report_threashold] 85 | return generate_matrix(word_vectors) 86 | 87 | 88 | def closest_spam(vector, report_threashold=3): 89 | """given a vector, return the closest spam messages and distance.""" 90 | W, vocab, ivocab = generate_spam_matrix(report_threashold=report_threashold) 91 | 92 | if not vocab: # means empty db 93 | return '', 0 94 | 95 | vector = normalize_vector(vector) 96 | 97 | dist = np.dot(W, vector.T) 98 | 99 | a = np.argsort(-dist)[:1] # currently returns generator of 3 most closest 100 | for x in a: 101 | return ivocab[x], float(dist[x]) 102 | 103 | return '', 0 104 | 105 | def tokenize_message(message): 106 | """return a list of normalized words.""" 107 | return (message 108 | .lower() 109 | .replace(".", " .") 110 | .replace(",", " ,") 111 | .replace("?", " ?") 112 | .replace("!", " !") 113 | .replace(":", " :") 114 | .replace("'s", " 's") 115 | .split()) 116 | 117 | 118 | def message_to_vector(message): 119 | """sums up all known vectors of a given message.""" 120 | vector = np.zeros(W_norm[0, :].shape) 121 | for term in tokenize_message(message): 122 | if term in vocab: 123 | vector += get_vector(vocab[term]) 124 | return 
vector 125 | 126 | 127 | @app.route('/words/list') 128 | def word_list(): 129 | """return word list. ordered by indexes.""" 130 | return WORD_LIST 131 | 132 | 133 | @app.route('/words/vector') 134 | def word_vectors(): 135 | """retrun vectors for the words by given ids.""" 136 | ids = {int(i) for i in request.args['ids'].split(',')} 137 | 138 | return jsonify({'words': 139 | {i: {'vector': get_vector(i).tolist()} 140 | for i in ids}}) 141 | 142 | 143 | @app.route('/spam/detect') 144 | def detect_spam(): 145 | """the given vector should not be normalized. normalization happens on server.""" 146 | vector = [float(i) for i in request.args['vector'].split(',')] 147 | msg, dist = closest_spam(vector) 148 | is_spam = dist > EPSILON 149 | return jsonify({'spam': is_spam, 150 | 'confidence': dist, 151 | 'meta': msg}) 152 | 153 | 154 | @app.route('/spam/report', methods=['POST']) 155 | def report_spam(): 156 | """if spam message already exists or is close to a known message add a report count. else add as new entry in db.""" 157 | data = request.get_json() 158 | reported_message = data['message'] 159 | vector = message_to_vector(reported_message) 160 | 161 | similar_msg, dist = closest_spam(vector, 0) 162 | 163 | db = DB.load() 164 | if dist > EPSILON: 165 | db.reported_messages[similar_msg].reports += 1 166 | else: 167 | db.add_new_message(reported_message, normalize_vector(vector).tolist()) 168 | 169 | db.save() 170 | return jsonify({}) 171 | 172 | @app.route('/messages', methods=['POST', 'GET']) 173 | def message_handler(): 174 | global messages 175 | if request.method == 'POST': 176 | messages.append(request.get_json()['message']) 177 | return jsonify({}) 178 | else: 179 | if messages: 180 | return jsonify({'message': messages.pop(0)}) 181 | return jsonify({}) 182 | 183 | 184 | if __name__ == '__main__': 185 | init() 186 | app.run() 187 | -------------------------------------------------------------------------------- /server/generate.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def normalize_vector(vector): 5 | vector = np.array(vector) 6 | vec_norm = np.zeros(vector.shape) 7 | d = (np.sum(vector ** 2,) ** (0.5)) 8 | vec_norm = (vector.T / d).T 9 | return vec_norm 10 | 11 | 12 | # XXX todo see if we can unite vector and matrix normalization 13 | def normalize_matrix(W): 14 | # normalize each word vector to unit variance 15 | W_norm = np.zeros(W.shape) 16 | d = (np.sum(W ** 2, 1) ** (0.5)) 17 | W_norm = (W.T / d).T 18 | return W_norm 19 | 20 | 21 | def generate_matrix(word_vectors): 22 | """given a list of word,vector pairs generate matrix and vocab dicts""" 23 | vectors = dict(word_vectors) 24 | words = [w for w, _ in word_vectors] 25 | 26 | vocab = {w: idx for idx, w in enumerate(words)} 27 | ivocab = {idx: w for idx, w in enumerate(words)} 28 | 29 | vocab_size = len(vectors) 30 | vector_dim = len(vectors.values()[0]) if vectors else 0 31 | W = np.zeros((vocab_size, vector_dim)) 32 | for word, v in vectors.items(): 33 | W[vocab[word], :] = v 34 | return W, vocab, ivocab 35 | -------------------------------------------------------------------------------- /server/model.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from schematics.models import Model 4 | from schematics.types import StringType, ListType, IntType, FloatType, ModelType, DictType 5 | 6 | 7 | FILENAME = '../database/db.json' 8 | 9 | 10 | class DB(Model): 11 | class ReportedMessage(Model): 12 | reports = IntType() 13 | vector = ListType(FloatType) 14 | 15 | reported_messages = DictType(ModelType(ReportedMessage), default={}) 16 | 17 | def add_new_message(self, reported_message, vector): 18 | rm = self.ReportedMessage() 19 | rm.reports = 1 20 | rm.vector = vector 21 | self.reported_messages[reported_message] = rm 22 | 23 | @classmethod 24 | def load(cls): 25 | with open(FILENAME, 'r') as f: 26 | return 
DB(json.loads(f.read())) 27 | 28 | def save(self): 29 | string = json.dumps(self.to_primitive()) 30 | with open(FILENAME, 'w') as f: 31 | f.write(string) 32 | -------------------------------------------------------------------------------- /server/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==0.12.2 2 | Flask-Cors==3.0.3 3 | requests==2.18.4 4 | schematics==2.0.1 5 | numpy==1.13.3 6 | -------------------------------------------------------------------------------- /webclient/html/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | word2vec spam filter 6 | 7 | 8 | 9 |
10 | 11 | -------------------------------------------------------------------------------- /webclient/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "word2vec-webclient", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "scripts/bin/index.js", 6 | "scripts": { 7 | "removeold": "rimraf scripts/bin styles/bin/*", 8 | "createdir": "./node_modules/.bin/mkdirp styles/bin", 9 | "clean": "run-p removeold createdir", 10 | "build-scripts": "./node_modules/typescript/bin/tsc -p scripts", 11 | "build-scripts-watch": "npm run build-scripts -- --watch", 12 | "build-styles": "./node_modules/stylus/bin/stylus -c ./styles/src --out ./styles/bin", 13 | "build-styles-watch": "npm run build-styles -- --watch", 14 | "build": "npm-run-all clean build-scripts build-styles", 15 | "build-watch": "run-p build-scripts-watch build-styles-watch", 16 | "serve": "webpack-dev-server --no-info --colors --port 3333", 17 | "dev": "npm-run-all build -p build-watch serve" 18 | }, 19 | "repository": { 20 | "type": "git", 21 | "url": "git+ssh://git@github.com/doodyparizada/word2vec-spam-filter.git" 22 | }, 23 | "author": "nitzan tomer", 24 | "license": "ISC", 25 | "bugs": { 26 | "url": "https://github.com/doodyparizada/word2vec-spam-filter/issues" 27 | }, 28 | "homepage": "https://github.com/doodyparizada/word2vec-spam-filter#readme", 29 | "dependencies": { 30 | "@types/react": "^16.0.28", 31 | "@types/react-dom": "^16.0.3", 32 | "html-webpack-plugin": "^2.30.1", 33 | "mkdirp": "^0.5.1", 34 | "npm-run-all": "^4.1.2", 35 | "popsicle": "^9.2.0", 36 | "react": "^16.2.0", 37 | "react-dom": "^16.2.0", 38 | "rimraf": "^2.6.2", 39 | "shuffle-array": "^1.0.1", 40 | "stylus": "^0.54.5", 41 | "typescript": "^2.6.2", 42 | "webpack": "^3.10.0", 43 | "webpack-config-utils": "^2.3.0", 44 | "webpack-dev-server": "^2.9.7" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- 
/webclient/scripts/src/im-receiver.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | 3 | const popsicle = require("popsicle"); 4 | const baseUrl = "http://localhost:5000"; 5 | 6 | export type MessageMeta = { id: number; content: string; isSpam?: boolean; }; 7 | 8 | export type ReceiverPageProps = { 9 | report: (message: string) => void; 10 | check: (message: string) => Promise<{ spam: boolean; confidence: number; }>; 11 | } 12 | 13 | export type ReceiverPageState = { 14 | messages: MessageMeta[]; 15 | } 16 | 17 | export class ReceiverPage extends React.Component { 18 | private static readonly INTERVAL = 1000; 19 | 20 | private timer: number; 21 | 22 | constructor(props?: any) { 23 | super(props); 24 | 25 | this.state = { 26 | messages: [] 27 | }; 28 | } 29 | 30 | componentDidMount() { 31 | this.timer = setInterval(this.poll.bind(this), ReceiverPage.INTERVAL); 32 | } 33 | 34 | componentWillUnmount() { 35 | clearInterval(this.timer); 36 | } 37 | 38 | render() { 39 | const messages = this.state.messages.length === 0 ? 40 | no messages yet 41 | : this.state.messages.map(message => { 42 | const clsname = message.isSpam === undefined ? "unknown" : (message.isSpam ? "spam" : "notspam"); 43 | return ( 44 | 45 | { message.content } 46 | 47 | 48 | 49 | 50 | ); 51 | }); 52 | 53 | return ( 54 |
55 | 56 | 57 | { messages } 58 | 59 |
60 |
61 | ); 62 | } 63 | 64 | private poll() { 65 | popsicle 66 | .request(`${ baseUrl }/messages`) 67 | .use(popsicle.plugins.parse("json")) 68 | .then(res => { 69 | if (res.body.message) { 70 | const id = generateMessageId(); 71 | 72 | this.setState({ 73 | messages: [{ id, content: res.body.message }].concat(this.state.messages) 74 | }); 75 | 76 | setTimeout(() => { 77 | this.props.check(res.body.message).then(res => { 78 | this.updateMessageState(id, res.spam); 79 | }); 80 | }, 2500); 81 | } 82 | }); 83 | } 84 | 85 | private report(message: MessageMeta) { 86 | this.props.report(message.content); 87 | this.updateMessageState(message.id, true); 88 | } 89 | 90 | private updateMessageState(id: number, spam: boolean) { 91 | this.setState({ 92 | messages: this.state.messages.map(item => item.id !== id ? item : Object.assign({}, item, { isSpam: spam })) 93 | }); 94 | } 95 | } 96 | 97 | let counter = 0; 98 | function generateMessageId(): number { 99 | return ++counter; 100 | } 101 | -------------------------------------------------------------------------------- /webclient/scripts/src/im-sender.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | 3 | const popsicle = require("popsicle"); 4 | const baseUrl = "http://localhost:5000"; 5 | 6 | function createGetUrl(path: string): string { 7 | return `${ baseUrl }${ path }`; 8 | } 9 | 10 | export class SenderPage extends React.Component<{}, {}> { 11 | private textarea: HTMLTextAreaElement; 12 | 13 | render() { 14 | return ( 15 |
16 |
17 |
    18 |
  • Hi are you looking for a sexy cool woman to spend time ,with well I,m lookin for a gentleman who still khowns how to treat a lady, I am here to u.http . G.lovendate.pw code: 605
  • 19 |
  • I offer you an exchange, you free register on my site http://rachelmel.pro Confirmation email and show me the screenshot. After that I'll send you my nude pics
  • 20 |
  • Hi, do I know u? you just showed up in my kik hmm.. my friends warned me that there are many fake accounts and bots here, no offense, are u a real person? If you are a real person, you won't have any trouble liking my pic, will you;)? the one where I'm wearing a white swimming suit. This way I'll be convinced that you are real
  • 21 |
  • Whats going on Nitzan, Its me Doody from the party last night
  • 22 |
23 |
24 | 25 |
26 | 27 |
28 |
29 | ); 30 | } 31 | 32 | private send() { 33 | popsicle.request({ 34 | url: createGetUrl("/messages"), 35 | method: "POST", 36 | body: { 37 | message: this.textarea.value 38 | } 39 | }).then(res => console.log("message sent")); 40 | } 41 | 42 | private onClickExample(event: React.MouseEvent) { 43 | this.textarea.value = (event.target as HTMLLIElement).textContent; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /webclient/scripts/src/index.ts: --------------------------------------------------------------------------------
import * as view from "./page";

const popsicle = require("popsicle");
const shuffle = require("shuffle-array");
const baseUrl = "http://localhost:5000";

// Word -> index in the server's word list. Created without a prototype so that a
// word such as "constructor" can never resolve to an inherited Object member.
const dictionary = Object.create(null) as { [word: string]: number };
let dictionarySize = 0;

// Settles once the word list has been downloaded and indexed. `check` waits for it,
// otherwise a check issued during the download would run against an empty dictionary.
const dictionaryLoaded = popsicle.request(createGetUrl("/words/list")).then(res => {
    res.body.split("\n").forEach((word, index) => dictionary[word] = index);
    dictionarySize = Object.keys(dictionary).length;
    console.log(`words loaded (count of ${ dictionarySize })`);
});

const main = document.getElementById("main");
view.render(main, {
    // send the plain message to the server, which adds it to (or bumps it in) the report bank
    report: message => {
        return popsicle.request({
            url: createGetUrl("/spam/report"),
            method: "POST",
            body: {
                message
            }
        });
    },
    // classify a message without sending its text: only word indexes (hidden among
    // random ones) and the summed vector leave the client
    check: message => dictionaryLoaded.then(() => {
        // lower case and split into words, the same way the server tokenizes reported messages
        const words = normalize(message.toLowerCase()).split(/\s+/);

        // find indexes for the words the server knows
        const indexes = [] as number[];
        words.forEach(word => {
            if (dictionary[word] !== undefined) {
                indexes.push(dictionary[word]);
            }
        });

        // add random (word) indexes to mask the real ones
        const dummyCount = getRandomInt(words.length, words.length * 2);
        const dummys = [] as number[];
        for (let i = 0; i < dummyCount; i++) {
            dummys.push(getRandomInt(0, dictionarySize));
        }

        const ids = indexes.concat(dummys);
        // shuffle indexes (in place)
        shuffle(ids);
        return analyze(ids, indexes);
    })
});

/** Random integer in [ceil(min), floor(max)): the upper bound is never returned. */
function getRandomInt(min, max) {
    min = Math.ceil(min);
    max = Math.floor(max);
    return Math.floor(Math.random() * (max - min)) + min;
}

/** Put a space before `.`, `,`, `?`, `!`, `:` and `'s` so they become separate words. */
function normalize(message: string): string {
    return message
        .replace(/\./g, " .")
        .replace(/,/g, " ,")
        .replace(/\?/g, " ?")
        .replace(/\!/g, " !")
        .replace(/\:/g, " :")
        .replace(/'s/g, " 's");
}

/**
 * Build a server url for `path`. When `arr` is given and not empty it is appended
 * as a comma separated query parameter named `arrName`.
 */
function createGetUrl(path: string, arrName?: string, arr?: number[]): string {
    const values = (arr || []).join(",");
    const query = values === "" ? "" : `?${ arrName }=${ values }`;

    return `${ baseUrl }${ path }${ query }`;
}

/**
 * Fetch the vectors for `indexes` (real and dummy ids mixed), sum the vectors of the
 * `reals` only and ask the server whether that sum is close to a known spam message.
 */
function analyze(indexes: number[], reals: number[]): Promise<{ spam: boolean; confidence: number; }> {
    // none of the words is known: there is no vector to send, so nothing can match
    if (reals.length === 0) {
        return Promise.resolve({ spam: false, confidence: 0 });
    }

    return popsicle
        .request(createGetUrl("/words/vector", "ids", indexes))
        .use(popsicle.plugins.parse("json"))
        .then(response => {
            // Vector copies its input, so adding into `sum` never changes the response;
            // a word that appears several times in `reals` is added once per occurrence
            const sum = new Vector(response.body.words[reals[0]].vector);
            reals.slice(1).forEach(index => {
                sum.add(new Vector(response.body.words[index].vector));
            });

            return popsicle
                .request(createGetUrl("/spam/detect", "vector", sum.toArray()))
                .use(popsicle.plugins.parse("json"))
                .then(response2 => {
                    return response2.body;
                });
        });
}

/** A numeric vector that can accumulate other vectors of the same size. */
class Vector {
    private readonly values: number[];

    constructor(values: number[]) {
        // copy: `add` changes `values` in place and must not write into the caller's array
        this.values = values.slice();
    }

    size() {
        return this.values.length;
    }

    /** Add `other` to this vector, element by element, in place. */
    add(other: Vector) {
        if (this.size() !== other.size()) {
            throw new Error("error: cannot add vectors of different sizes");
        }

        other.values.forEach((num, index) => this.values[index] += num);
    }

    toArray(): number[] {
        return this.values;
    }
}

console.log("app started");
// hey, what's up? how are you?
-------------------------------------------------------------------------------- /webclient/scripts/src/page.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | import * as ReactDOM from "react-dom"; 3 | 4 | import { TesterPage } from "./tester"; 5 | import { SenderPage } from "./im-sender"; 6 | import { ReceiverPage } from "./im-receiver"; 7 | 8 | export type PageProps = { 9 | report: (message: string) => Promise; 10 | check: (message: string) => Promise<{ spam: boolean; confidence: number; }>; 11 | } 12 | 13 | export type PageState = { 14 | content: "tester" | "sender" | "receiver"; 15 | } 16 | 17 | export class Page extends React.Component { 18 | constructor(props: PageProps) { 19 | super(props); 20 | this.state = { content: "tester" }; 21 | } 22 | 23 | render() { 24 | let content: JSX.Element; 25 | 26 | switch (this.state.content) { 27 | case "tester": 28 | content = ; 29 | break; 30 | 31 | case "sender": 32 | content = ; 33 | break; 34 | 35 | case "receiver": 36 | content = ; 37 | break; 38 | } 39 | 40 | return ( 41 |
42 |
43 |

spam classification based on word2vec

44 |
45 | View Mode: 46 | 51 |
52 |
53 | { content } 54 |
55 | ); 56 | } 57 | 58 | private onPageChange(event: React.FormEvent) { 59 | const content = event.currentTarget.value as ("tester" | "sender" | "receiver"); 60 | 61 | this.setState({ 62 | content 63 | }); 64 | } 65 | } 66 | 67 | export function render(wrapper: HTMLElement, props: PageProps): void { 68 | ReactDOM.render(, wrapper); 69 | } -------------------------------------------------------------------------------- /webclient/scripts/src/tester.tsx: -------------------------------------------------------------------------------- 1 | import * as React from "react"; 2 | 3 | export type TesterPageProps = { 4 | report: (message: string) => Promise; 5 | check: (message: string) => Promise<{ spam: boolean; confidence: number; }>; 6 | } 7 | 8 | export class TesterPage extends React.Component { 9 | private textarea: HTMLTextAreaElement; 10 | 11 | render() { 12 | return ( 13 |
14 | 15 |
16 | 17 | 18 | 19 |
20 |
21 | ); 22 | } 23 | 24 | private clear() { 25 | this.textarea.value = ""; 26 | } 27 | 28 | private report() { 29 | this.props.report(this.textarea.value).then(() => alert("message reported")); 30 | } 31 | 32 | private check() { 33 | this.props.check(this.textarea.value).then(res => { 34 | alert(res.spam ? "message is spammy" : "message is ok"); 35 | }); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /webclient/scripts/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES6", 4 | "removeComments": true, 5 | "sourceMap": true, 6 | "jsx": "react", 7 | "outDir": "bin", 8 | "rootDir": "src", 9 | "declaration": true, 10 | "skipLibCheck": true, 11 | "skipDefaultLibCheck": true 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /webclient/styles/src/index.styl: -------------------------------------------------------------------------------- 1 | html, body, #main 2 | height: 100% 3 | width: 100% 4 | margin: 0px 5 | padding: 0px 6 | box-sizing: border-box 7 | 8 | html, body, p, h1, h2, h3, h4, h5, h6, textarea, input, button, span, div, main, select 9 | font-family: Hack, monospace 10 | 11 | .actions 12 | padding: 20px 0px 10px 0px 13 | text-align: center 14 | 15 | button 16 | padding: 3px 10px 17 | margin: 0px 10px 18 | border: 1px solid black 19 | 20 | #main 21 | padding: 20px 22 | 23 | #page 24 | width: 100% 25 | height: 100% 26 | display: flex 27 | flex-direction: column 28 | 29 | .header 30 | display: flex 31 | margin-bottom: 15px 32 | padding-bottom: 5px 33 | border-bottom: 2px solid black 34 | 35 | h1 36 | margin: 0 37 | padding: 0 38 | flex-grow: 1 39 | 40 | .viewmode 41 | display:flex; 42 | align-items: flex-end; 43 | 44 | select 45 | font-size: 14px 46 | 47 | .content 48 | flex-grow: 1 49 | display: flex 50 | flex-direction: column 51 | 52 | textarea 53 | resize: none 54 | 
flex-grow: 1 55 | padding: 10px 10px 25px 10px 56 | 57 | #examples 58 | ul 59 | font-size: 14px 60 | padding: 0 61 | 62 | li 63 | padding: 5px 0px 64 | list-style: none 65 | cursor: pointer 66 | 67 | li:hover 68 | background-color: #ffff44 69 | 70 | .incoming 71 | border-collapse: separate 72 | border-spacing: 0px 5px 73 | 74 | td 75 | padding: 3px 76 | 77 | tr.spam 78 | td 79 | background-color: rgba(255, 0, 0, 0.5) 80 | 81 | tr.notspam 82 | td 83 | background-color: rgba(0, 255, 0, 0.5) 84 | -------------------------------------------------------------------------------- /webclient/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const webpack = require("webpack"); 3 | const { getIfUtils, removeEmpty } = require("webpack-config-utils"); 4 | 5 | // variables 6 | const outPath = path.join(__dirname); 7 | const assetsPath = path.join(__dirname, "html"); 8 | 9 | // plugins 10 | const HtmlWebpackPlugin = require("html-webpack-plugin"); 11 | 12 | const htmlTemplate = path.join(assetsPath, "index.html"); 13 | 14 | module.exports = env => { 15 | const { ifNotProd } = getIfUtils(env || {}); 16 | // get boolean value to use directly in flag configuration; 17 | const isNotProd = ifNotProd(true, false); 18 | 19 | return { 20 | devtool: "eval", 21 | entry: removeEmpty({ 22 | main: path.join(__dirname, "scripts", "bin", "index.js"), 23 | }), 24 | output: { 25 | path: outPath, 26 | filename: "[name].bundle.js", 27 | pathinfo: isNotProd, 28 | publicPath: "/" 29 | }, 30 | module: { 31 | noParse: [/\.min\.js$/, /\.bundle\.js$/], 32 | rules: [] 33 | }, 34 | resolve: { 35 | extensions: [".ts", ".tsx", ".js"], 36 | // Fix webpack's default behavior to not load packages with jsnext:main module 37 | // (jsnext:main directs not usually distributable es6 format, but es6 sources) 38 | mainFields: ["module", "browser", "main"] 39 | }, 40 | target: "web", 41 | plugins: [new HtmlWebpackPlugin({ 42 | 
template: htmlTemplate, 43 | })] 44 | }; 45 | }; 46 | --------------------------------------------------------------------------------