├── LICENSE ├── README.md └── ssdeep_elastic ├── ssdeep-mapping.json └── ssdeep_querying.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Intezer Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ssdeep-elastic 2 | Example implementation of [ssdeep](http://www.forensicswiki.org/wiki/Ssdeep) similarity search optimized with [Elasticsearch](https://www.elastic.co/products/elasticsearch). 3 | 4 | This example shows how to perform ssdeep search and comparison at scale. 5 | You can read about it in Intezer's blog post - [Optimize ssdeep Comparisons with ElasticSearch](http://www.intezer.com/intezer-community-tip-ssdeep-comparisons-with-elasticsearch/). 
6 | 7 | Related Articles: 8 | - https://www.virusbulletin.com/virusbulletin/2015/11/optimizing-ssdeep-use-scale/ 9 | - https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-ngram-tokenizer.html 10 | -------------------------------------------------------------------------------- /ssdeep_elastic/ssdeep-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "analysis": { 4 | "analyzer": { 5 | "ssdeep_analyzer": { 6 | "tokenizer": "ssdeep_tokenizer" 7 | } 8 | }, 9 | "tokenizer": { 10 | "ssdeep_tokenizer": { 11 | "type": "ngram", 12 | "min_gram": 7, 13 | "max_gram": 7 14 | } 15 | } 16 | } 17 | }, 18 | "mappings": { 19 | "_default_": { 20 | "_all": { 21 | "enabled": false 22 | }, 23 | "dynamic": "strict", 24 | "properties": { 25 | "chunksize": { 26 | "type": "integer" 27 | }, 28 | "chunk": { 29 | "analyzer": "ssdeep_analyzer", 30 | "type": "text" 31 | }, 32 | "double_chunk": { 33 | "analyzer": "ssdeep_analyzer", 34 | "type": "text" 35 | }, 36 | "ssdeep": { 37 | "type": "keyword" 38 | }, 39 | "sha256": { 40 | "type": "keyword" 41 | } 42 | } 43 | }, 44 | "record": {} 45 | } 46 | } -------------------------------------------------------------------------------- /ssdeep_elastic/ssdeep_querying.py: -------------------------------------------------------------------------------- 1 | import elasticsearch 2 | import ssdeep 3 | 4 | 5 | def insert_record_to_ssdeep_index(ssdeep_value, sha256): 6 | """ 7 | Adds a record to the ssdeep index in elasticsearch 8 | :param ssdeep_value: The ssdeep hash value of the item 9 | :param sha256: The sha256 hash value of the item 10 | """ 11 | chunksize, chunk, double_chunk = ssdeep_value.split(':') 12 | chunksize = int(chunksize) 13 | 14 | es = elasticsearch.Elasticsearch(['localhost:9200']) 15 | 16 | document = {'chunksize': chunksize, 'chunk': chunk, 'double_chunk': double_chunk, 'ssdeep': ssdeep_value, 17 | 'sha256': sha256} 18 | 19 | 
es.index('ssdeep-index', 'record', document) 20 | es.indices.refresh('ssdeep-index') 21 | 22 | 23 | def get_matching_items_by_ssdeep(ssdeep_value, threshold_grade): 24 | """ 25 | A function that finds matching items by ssdeep comparison with optimizations using ElasticSearch 26 | :param ssdeep_value: The ssdeep hash value of the item 27 | :param threshold_grade: The grade being used as a threshold, only items that pass this grade will be returned 28 | :return: A List of matching items (in this case, a list of sha256 hash values) 29 | """ 30 | chunksize, chunk, double_chunk = ssdeep_value.split(':') 31 | chunksize = int(chunksize) 32 | 33 | es = elasticsearch.Elasticsearch(['localhost:9200']) 34 | 35 | query = { 36 | 'query': { 37 | 'bool': { 38 | 'must': [ 39 | { 40 | 'terms': { 41 | 'chunksize': [chunksize, chunksize * 2, int(chunksize / 2)] 42 | } 43 | }, 44 | { 45 | 'bool': { 46 | 'should': [ 47 | { 48 | 'match': { 49 | 'chunk': { 50 | 'query': chunk 51 | } 52 | } 53 | }, 54 | { 55 | 'match': { 56 | 'double_chunk': { 57 | 'query': double_chunk 58 | } 59 | } 60 | } 61 | ], 62 | 'minimum_should_match': 1 63 | } 64 | } 65 | ] 66 | } 67 | } 68 | } 69 | 70 | results = es.search('ssdeep-index', body=query) 71 | 72 | sha256_list_to_return = [] 73 | 74 | for record in results['hits']['hits']: 75 | record_ssdeep = record['_source']['ssdeep'] 76 | ssdeep_grade = ssdeep.compare(record_ssdeep, ssdeep_value) 77 | 78 | if ssdeep_grade >= threshold_grade: 79 | sha256_list_to_return.append(record['_source']['sha256']) 80 | 81 | return sha256_list_to_return 82 | 83 | 84 | if __name__ == '__main__': 85 | item_1 = {'ssdeep': '768:v7XINhXznVJ8CC1rBXdo0zekXUd3CdPJxB7mNmDZkUKMKZQbFTiKKAZTy:ShT8C+fuioHq1KEFoAU', 86 | 'sha256': 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'} 87 | item_2 = {'ssdeep': '768:C7XINhXznVJ8CC1rBXdo0zekXUd3CdPJxB7mNmDZkUKMKZQbFTiKKAZTV6:ThT8C+fuioHq1KEFoAj6', 88 | 'sha256': 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'} 
89 | item_3 = { 90 | 'ssdeep': '768:t2m3D9SlK1TVYatO/tkqzWQDG/ssC7XkZDzYYFTdqiP1msdT1OhN7UmSaED7Etnc:w7atyfzWgGEXszYYF4iosdTE1zz2+Ze', 91 | 'sha256': 'cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc'} 92 | 93 | insert_record_to_ssdeep_index(item_1['ssdeep'], item_1['sha256']) 94 | insert_record_to_ssdeep_index(item_3['ssdeep'], item_3['sha256']) 95 | 96 | matching_items = get_matching_items_by_ssdeep(item_2['ssdeep'], 90) 97 | 98 | print(matching_items) # This will only print The first item 99 | --------------------------------------------------------------------------------