├── .gitignore ├── .jslintrc ├── .travis.yml ├── LICENCE ├── README.md ├── bin └── simhash ├── lib ├── cli.js └── simhash.js ├── package.json └── test ├── sample1.txt ├── sample2.txt ├── test_cli.js └── test_simhash.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | -------------------------------------------------------------------------------- /.jslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "es6": true, 3 | "ass": false, 4 | "bitwise": false, 5 | "browser": false, 6 | "closure": false, 7 | "continue": false, 8 | "couch": false, 9 | "debug": false, 10 | "devel": false, 11 | "eqeq": false, 12 | "evil": false, 13 | "forin": false, 14 | "indent": 4, 15 | "maxerr": 50, 16 | "maxlen": false, 17 | "newcap": false, 18 | "node": true, 19 | "nomen": false, 20 | "passfail": false, 21 | "plusplus": false, 22 | "predef": [ 23 | "angular", 24 | "$", 25 | "" 26 | ], 27 | "regexp": false, 28 | "rhino": false, 29 | "sloppy": false, 30 | "stupid": false, 31 | "sub": false, 32 | "todo": false, 33 | "unparam": false, 34 | "vars": false, 35 | "white": false 36 | } 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "6.2.0" 4 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # node-simhash 2 | 3 | A simple command line tool for comparing text files using the simhash algorithm and contrasting with the jaccard index. 
4 | 5 | [![Build Status](https://travis-ci.org/sjhorn/node-simhash.svg?branch=master)](https://travis-ci.org/sjhorn/node-simhash) 6 | 7 | 8 | ## References 9 | 10 | [Near duplicate detection (moz.com)](https://moz.com/devblog/near-duplicate-detection/) 11 | 12 | ## Installation 13 | 14 | ### If you have just cloned this repository, run the following 15 | ```` 16 | npm install 17 | npm link 18 | ```` 19 | 20 | Or if you would like to install globally 21 | ```` 22 | npm install https://github.com/sjhorn/node-simhash -g 23 | ```` 24 | 25 | ## Command line tool usage 26 | 27 | Using node 28 | ```` 29 | simhash file1.txt file2.txt 30 | 31 | simhash https://file.com/page1.html https://file.com/page2.html 32 | 33 | ```` 34 | 35 | ### Using lib 36 | ````js 37 | var simhash = require('node-simhash'); 38 | 39 | simhash.compare(string1, string2); 40 | 41 | ```` 42 | 43 | ### Methods 44 | #### .summary(file1, file2) 45 | Compare two text strings using both simhash and the jaccard index, and print a summary. 46 | 47 | #### .compare(file1, file2) 48 | Compare two text strings using simhash and return the similarity of the two hashes. 49 | 50 | 51 | #### .hammingWeight(number) 52 | 53 | Count the binary ones in a number. 54 | 55 | #### .shingles(string, words_per_shingle=2) 56 | 57 | Convert a string to a set of shingles, using the default of 2 words per shingle, and tokenize using the natural library's default tokenizer. 58 | 59 | #### .jaccardIndex(set1, set2) 60 | 61 | Compare two sets of shingles (as returned by `.shingles`) by dividing the size of their intersection by the size of their union. 62 | 63 | #### .createBinaryString(number) 64 | 65 | Return a 32-bit number as a binary string of 32 characters. 66 | 67 | #### .shingleHashList(string) 68 | 69 | Convert a string to a list of crc-32 hashes, one for each of its 2-word shingles. 
70 | -------------------------------------------------------------------------------- /bin/simhash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | require(require('path').resolve(__dirname, '../lib/cli')); 3 | -------------------------------------------------------------------------------- /lib/cli.js: -------------------------------------------------------------------------------- 1 | void function() { 2 | 'use strict'; 3 | var simhash = require('./simhash.js'); 4 | var fetch = require('node-fetch'); 5 | var fs = require('fs'); 6 | var argv = require('yargs') 7 | .usage('Usage: $0 [options]') 8 | .demand(2) 9 | .example('$0 file1.htm http://t.com/file2.htm') 10 | .help('h') 11 | .alias('h', 'help') 12 | .argv; 13 | 14 | var f1 = argv._.shift(); 15 | var f2 = argv._.shift(); 16 | 17 | if(f1.startsWith("http") && f2.startsWith("http")) { 18 | console.log("Downloading files to compare..."); 19 | fetch(f1).then(r1 => r1.text()).then(t1 => { 20 | fetch(f2).then(r2 => r2.text()).then(t2 => { 21 | simhash.summary(t1, t2); 22 | }).catch(e => console.error("Failed to download "+f2+" "+e)) 23 | }).catch(e => console.error("Failed to download "+f1+" "+e)); 24 | } else if(f1.startsWith("http") && !f2.startsWith("http")) { 25 | console.log("Downloading file to compare..."); 26 | fetch(f1).then(r1 => r1.text()).then(t1 => { 27 | simhash.summary(t1, fs.readFileSync(f2, "utf8")); 28 | }).catch(e => console.error("Failed to download "+f1+" "+e)); 29 | } else if(!f1.startsWith("http") && f2.startsWith("http")) { 30 | console.log("Downloading file to compare..."); 31 | fetch(f2).then(r2 => r2.text()).then(t2 => { 32 | simhash.summary(fs.readFileSync(f1, "utf8"),t2); 33 | }).catch(e => console.error("Failed to download "+f2+" "+e)); 34 | } else { 35 | simhash.summary(fs.readFileSync(f1, "utf8"), fs.readFileSync(f2, "utf8")); 36 | } 37 | }.call(this); 38 | 
-------------------------------------------------------------------------------- /lib/simhash.js: -------------------------------------------------------------------------------- 1 | void function() { 2 | 'use strict'; 3 | var crc32 = require('crc-32'); 4 | var natural = require('natural'); 5 | var NGrams = natural.NGrams; 6 | 7 | module.exports = { 8 | compare: compare, 9 | summary: summary, 10 | hammingWeight: hammingWeight, 11 | shingles: shingles, 12 | jaccardIndex: jaccardIndex, 13 | createBinaryString: createBinaryString, 14 | shingleHashList: shingleHashList 15 | } 16 | 17 | function compare(file1, file2) { 18 | return similarity(simhash(file1), simhash(file2)); 19 | } 20 | 21 | function summary(file1, file2) { 22 | var hash1 = simhash(file1); 23 | var hash2 = simhash(file2); 24 | var simhashval = similarity(hash1, hash2); 25 | var jaccard = jaccardIndex(shingles(file1), shingles(file2)); 26 | console.log("File1 simhash:", createBinaryString(hash1)); 27 | console.log("File2 simhash:", createBinaryString(hash2)); 28 | console.log( "Simhash similarity is "+simhashval+" (%d%% similar)", Math.round(simhashval * 100) ); 29 | console.log( "Jaccard index is "+jaccard+" (%d%% similar)", Math.round(jaccard * 100) ); 30 | } 31 | 32 | function hammingWeight(l) { 33 | var c; 34 | for(c = 0; l; c++) { 35 | l &= l-1; 36 | } 37 | return c; 38 | } 39 | 40 | function similarity(simhash1, simhash2) { 41 | return hammingWeight((simhash1 & simhash2)) / hammingWeight((simhash1 | simhash2)); 42 | } 43 | 44 | function shingleHashList(str) { 45 | var list = []; 46 | for (var word of shingles(str, 2)) { 47 | list.push(crc32.str(word) & 0xffffffff); 48 | } 49 | return list; 50 | } 51 | 52 | function shingles(original, kshingles=2) { 53 | var shingles = new Set(); 54 | for(var wordlist of NGrams.ngrams(original, kshingles, null, '[end]')) { 55 | shingles.add(wordlist.join(" ")); 56 | } 57 | return shingles; 58 | } 59 | 60 | function simhash(str) { 61 | var shingles = 
shingleHashList(str); 62 | var mask = 0x1; 63 | var simhash = 0x0; 64 | for(var i = 0; i < 64; i++) { 65 | var sim = 0; 66 | for(var s of shingles) { 67 | sim += ((s & mask) == mask) ? 1 : -1; 68 | } 69 | simhash |= (sim > 0 ? mask : 0x0); 70 | mask <<= 1; 71 | } 72 | return simhash; 73 | } 74 | 75 | function jaccardIndex(set1, set2) { 76 | var total = set1.size + set2.size; 77 | var intersection = 0; 78 | for(var shingle of set1 ) { 79 | if(set2.has(shingle)) { 80 | intersection++; 81 | } 82 | } 83 | var union = total - intersection; 84 | return intersection / union; 85 | } 86 | 87 | function createBinaryString (nMask) { 88 | for (var nFlag = 0, nShifted = nMask, sMask = ""; nFlag < 32; 89 | nFlag++, sMask += String(nShifted >>> 31), nShifted <<= 1); 90 | return sMask; 91 | } 92 | 93 | }.call(this); 94 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-simhash", 3 | "version": "0.1.0", 4 | "description": "Command Line tool that compares two text files using simhash", 5 | "homepage": "https://github.com/sjhorn/node-simhash", 6 | "keywords": [ 7 | "simhash", 8 | "jaccard index", 9 | "compare text" 10 | ], 11 | "repository": { 12 | "type": "git", 13 | "url": "git://github.com/sjhorn/node-simhash.git" 14 | }, 15 | "bugs": "https://github.com/sjhorn/node-simhash/issues", 16 | "engines": { 17 | "node": "6.2.x" 18 | }, 19 | "main": "lib/simhash.js", 20 | "directories": { 21 | "bin": "bin", 22 | "lib": "lib", 23 | "test": "test" 24 | }, 25 | "dependencies": { 26 | "crc-32": "~0.3.0", 27 | "natural": "0.4.0", 28 | "node-fetch": "^1.5.3", 29 | "yargs": "^4.7.1" 30 | }, 31 | "devDependencies": { 32 | "nodeunit": "^0.9.1" 33 | }, 34 | "bin": { 35 | "simhash": "bin/simhash" 36 | }, 37 | "scripts": { 38 | "test": "nodeunit" 39 | }, 40 | "licenses": [ 41 | { 42 | "type": "Apache", 43 | "url": 
"https://github.com/sjhorn/node-simhash/blob/master/LICENSE" 44 | } 45 | ], 46 | "author": "Scott Horn" 47 | } 48 | -------------------------------------------------------------------------------- /test/sample1.txt: -------------------------------------------------------------------------------- 1 | A simple string that, 2 | 3 | spans somelines 4 | 5 | with some ! punctuations 6 | 7 | ***hello. 8 | -------------------------------------------------------------------------------- /test/sample2.txt: -------------------------------------------------------------------------------- 1 | A simple string that, 2 | 3 | spans 4 | 5 | with some ! punctuations 6 | 7 | ***hello. 8 | -------------------------------------------------------------------------------- /test/test_cli.js: -------------------------------------------------------------------------------- 1 | var exec = require('child_process').exec; 2 | 3 | exports['Cli shows Help'] = function(test) { 4 | exec('node lib/cli.js -h', function(error, stdout, stderr) { 5 | if (error != null) { 6 | test.ok(false, "cli does not successfully show help"); 7 | } 8 | test.done(); 9 | }) 10 | } 11 | 12 | exports['Cli Compare Files'] = function(test) { 13 | exec('node lib/cli.js test/sample1.txt test/sample2.txt', function(error, stdout, stderr) { 14 | if (error != null) { 15 | test.ok(false, "cli does not successfully compare files"); 16 | } else { 17 | test.ok(stdout.indexOf("Simhash similarity is") != -1, "Failed to display summary"); 18 | } 19 | test.done(); 20 | }) 21 | } 22 | 23 | exports['Cli Compare sites'] = function(test) { 24 | exec('node lib/cli.js https://raw.githubusercontent.com/sjhorn/node-simhash/master/LICENCE https://raw.githubusercontent.com/sjhorn/node-simhash/master/LICENCE', function(error, stdout, stderr) { 25 | if (error != null) { 26 | test.ok(false, "cli does not successfully compare pages"); 27 | } else { 28 | test.ok(stdout.indexOf("Simhash similarity is") != -1, "Failed to display summary"); 29 | } 30 | 
test.done(); 31 | }) 32 | } 33 | -------------------------------------------------------------------------------- /test/test_simhash.js: -------------------------------------------------------------------------------- 1 | var simhash = require('../lib/simhash.js'); 2 | var fs = require('fs'); 3 | 4 | exports.compareSmallString = function(test) { 5 | test.equal(simhash.compare('I am working', 'I am working'), 1); 6 | test.done(); 7 | } 8 | 9 | exports.compareSameFile = function(test) { 10 | var file1 = fs.readFileSync("test/sample1.txt", "utf8"); 11 | test.equal(simhash.compare(file1, file1), 1); 12 | test.done(); 13 | } 14 | 15 | exports.compareSimilarFile = function(test) { 16 | var file1 = fs.readFileSync("test/sample1.txt", "utf8"); 17 | var file2 = fs.readFileSync("test/sample2.txt", "utf8"); 18 | var estimate = Math.round(simhash.compare(file1, file2) * 100); 19 | test.equal(estimate, 67); 20 | test.done(); 21 | } 22 | --------------------------------------------------------------------------------