├── .gitmodules ├── popular-ransomware.png ├── filter1combine.py ├── ransomware-family-distribution.png ├── .gitignore ├── vtmetadownload.sh ├── dates.py ├── filedates.r ├── familydates.r ├── sampledates.py ├── unzoo.py ├── samplefiles.py ├── sampleinfo.py ├── filter2.py ├── familydates.py ├── Makefile ├── README.md ├── filedates.py ├── statsampler.py ├── barplot.py ├── genericide.py ├── samplepicker.py ├── filter1.py ├── genericide.md └── families.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "avclass"] 2 | path = avclass 3 | url = https://github.com/malicialab/avclass 4 | -------------------------------------------------------------------------------- /popular-ransomware.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undo-ransomware/ransomware-dataset/HEAD/popular-ransomware.png -------------------------------------------------------------------------------- /filter1combine.py: -------------------------------------------------------------------------------- 1 | import os 2 | DIR = 'ransomware/' 3 | os.execvp('zcat', ['zcat'] + [DIR + file for file in sorted(os.listdir(DIR))]) 4 | -------------------------------------------------------------------------------- /ransomware-family-distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/undo-ransomware/ransomware-dataset/HEAD/ransomware-family-distribution.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ransomware.jsons 2 | ransomware.labels 3 | samples.json 4 | sampledates.json 5 | ransomware-families.pdf 6 | filedates.pdf 7 | filedates.json 8 | dates.json 9 | familydates.pdf 10 | todo.md5 11 | suggested.md5 12 | download.md5 13 | index.md5 14 | Raw 15 | ransomware 16 | MetaInfo 17 | vxshare-filetypes 18 | *.tmp 19 | -------------------------------------------------------------------------------- /vtmetadownload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | if [ -z "$1" ]; then 3 | echo "usage: $0 file.md5..." 4 | echo "usually: while sleep 1; do make todo.md5 && $0 todo.md5; done" 5 | exit 1 6 | fi 7 | METAINFO=/home/matthias/ransomware/MetaInfo 8 | cat "$@" | while read hash _; do 9 | if ! [ -s $METAINFO/$hash.json ]; then 10 | echo $hash 11 | python /home/matthias/ransomware/VirusTotalApi/vt/vt.py -s -j --allinfo $hash >$METAINFO/$hash.json 12 | sleep 10 13 | fi 14 | mv VTDL_*.json vtdl 15 | done 16 | -------------------------------------------------------------------------------- /dates.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import io 4 | import json 5 | import sys 6 | from datetime import date 7 | from numpy import mean 8 | 9 | with io.open('sampledates.json', 'rb') as exact: 10 | dates = json.load(exact) 11 | with io.open('filedates.json', 'rb') as stats: 12 | filedates = json.load(stats) 13 | with io.open('samples.json', 'rb') as infile: 14 | ransomware = json.load(infile) 15 | 16 | total_stdev = 0 17 | for hash, data in ransomware.items(): 18 | if hash not in dates or dates[hash] is None: 19 | file = data['file'] 20 | dates[hash] = filedates[file]['mean'] 21 | total_stdev += filedates[file]['stdev'] 22 | 23 | with io.open('dates.json', 'wb') as outfile: 24 | json.dump(dates, outfile) 25 | print 'total stdev', total_stdev / 86400 / 365, 'years' 26 | -------------------------------------------------------------------------------- /filedates.r: -------------------------------------------------------------------------------- 1 | data = read.csv('filedates.tmp') 2 | data$date = as.Date(data$date) 3 | pdf('filedates.pdf', 12, 8) 4 | plot(c(0, max(data$filename)), c(as.Date('2006-12-31'), as.Date('2019-02-01')), 5 | pch='', xaxt='n', yaxt='n', 6 | xlab='VirusShare torrent number', ylab='first submission date on VirusTotal', 7 | main='ransomware: estimated submission time by VirusShare download file') 8 | axis(1) 9 | years = 2006:2019 10 | axis(2, as.Date(paste(years,'-06-30', sep='')), labels=years, 11 | tick=F, las=1, line=F) 12 | axis(2, as.Date(paste(years,'-01-01', sep='')), labels=F) 13 | abline(h=as.Date(paste(years,'-01-01', sep='')), col='#e0e0e0', lwd=.5) 14 | boxplot(date ~ filename, data, lwd=.5, outcol='#7777ff', outcex=.3, 15 | whiskcol='#777777', staplecol='#777777', boxfill='#cccccc', 16 | boxlty=0, add=T, xaxt='n', yaxt='n') 17 | -------------------------------------------------------------------------------- /familydates.r: -------------------------------------------------------------------------------- 1 | labels = read.csv('familydates1.tmp') 2 | data = read.csv('familydates2.tmp') 3 | data$date = as.Date(data$date) 4 | 5 | pdf('familydates.pdf', 12, 8) 6 | plot(c(3, max(labels$rank) - 2), c(as.Date('2007-07-31'), as.Date('2019-02-01')), 7 | pch='', xaxt='n', yaxt='n', xlab=NA, 8 | ylab='first submission date on VirusTotal', 9 | main='known ransomware: estimated submission time by family') 10 | 11 | years = 2006:2019 12 | axis(2, as.Date(paste(years,'-06-30', sep='')), labels=years, 13 | tick=F, las=1, line=F) 14 | axis(2, as.Date(paste(years,'-01-01', sep='')), labels=F) 15 | abline(h=as.Date(paste(years,'-01-01', sep='')), col='#cccccc', lwd=.5) 16 | axis(1, at=labels$rank, labels=labels$family, las=2, cex.axis=.6) 17 | abline(v=labels$rank, col='#cccccc', lwd=.5, lty=2) 18 | 19 | boxplot(date ~ rank, data, lwd=.5, outcol='#7777ff', outcex=.3, outpch=4, 20 | whiskcol='#7777ff', staplecol='#7777ff', boxfill='#777777', 21 | boxlty=0, add=T, xaxt='n', yaxt='n') 22 | -------------------------------------------------------------------------------- /sampledates.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import io 4 | import json 5 | import sys 6 | 7 | if os.path.isfile('sampledates.json'): 8 | with io.open('sampledates.json', 'rb') as cache: 9 | dates = json.load(cache) 10 | else: 11 | dates = dict() 12 | 13 | with io.open('samples.json', 'rb') as infile: 14 | ransomware = json.load(infile) 15 | index = 0 16 | for hash, data in ransomware.items(): 17 | if hash not in dates: 18 | metafile ='MetaInfo/' + hash + '.json' 19 | if os.path.isfile(metafile): 20 | with open(metafile, 'r') as meta: 21 | try: 22 | data = ast.literal_eval(meta.readline()) 23 | except Exception: 24 | print 'crash parsing', hash 25 | raise 26 | attrs = data['data']['attributes'] 27 | if 'first_submission_date' in attrs: 28 | dates[hash] = int(attrs['first_submission_date']) 29 | else: 30 | dates[hash] = None 31 | 32 | index += 1 33 | if index % 1000 == 0: 34 | sys.stderr.write('\r%dk ' % (index / 1000)) 35 | sys.stderr.flush() 36 | sys.stderr.write('\r%d / %d \n' % (len(dates), index)) 37 | 38 | with io.open('sampledates.json', 'wb') as outfile: 39 | json.dump(dates, outfile) 40 | -------------------------------------------------------------------------------- /unzoo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import subprocess 4 | 5 | BIN = 'theZoo/malwares/Binaries/' 6 | 7 | def find(dir, ext): 8 | file = [z for z in os.listdir(dir) if z.endswith(ext)] 9 | if len(file) != 1: 10 | raise Exception('more than one ' + ext + ' in ' + dir) 11 | return dir + '/' + file[0] 12 | 13 | sources = dict() 14 | for file in os.listdir(BIN): 15 | zip = find(BIN + file, '.zip') 16 | with io.open(find(BIN + file, '.pass'), 'rb') as infile: 17 | pw = infile.readline().rstrip() 18 | os.mkdir('temp') 19 | subprocess.call(['unzip', '-P' + pw, zip, '-d', 'temp']) 20 | os.system('rm -f temp/.DS_Store') 21 | for line in subprocess.Popen('find temp -type f -print0 | xargs -0 md5sum', 22 | shell=True, stdout=subprocess.PIPE).stdout: 23 | hash = line[0:32] 24 | sample = line[34:].rstrip() 25 | name = 'samples/' + hash 26 | if not os.path.isfile(name): 27 | os.link(sample, name) 28 | os.chmod(name, 0644) 29 | if hash not in sources: 30 | sources[hash] = [] 31 | sources[hash].append(file) 32 | os.system('rm -rf temp') 33 | 34 | with io.open('theZoo.md5', 'wb') as index: 35 | for hash in sources.keys(): 36 | for file in sources[hash]: 37 | index.write(hash + ' ' + file + '\n') 38 | -------------------------------------------------------------------------------- /samplefiles.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import ast 3 | import os 4 | import io 5 | import json 6 | import sys 7 | import re 8 | from datetime import date 9 | 10 | if not os.path.isfile('index.md5'): 11 | sys.stderr.write('index missing! generate index.md5 by doing:\n') 12 | sys.stderr.write(' find /path/to/samples -type f -print0 | xargs -0 md5sum >index.md5\n') 13 | sys.exit(1) 14 | 15 | available = dict() 16 | with io.open('index.md5', 'rb') as infile: 17 | for line in infile: 18 | hash = line[0:32] 19 | path = line[34:].rstrip() 20 | available[hash] = path 21 | with io.open('sha256.json', 'rb') as cache: 22 | sha256 = json.load(cache) 23 | 24 | def scan(infile, selected): 25 | for line in infile: 26 | selected.append(line[0:32]) 27 | 28 | selected = [] 29 | if len(sys.argv) > 1: 30 | for file in sys.argv[1:]: 31 | if os.path.isfile(file): 32 | with io.open(file, 'rb') as infile: 33 | scan(infile, selected) 34 | elif re.match('[0-9a-f]{32}$', file): 35 | selected.append(file) 36 | else: 37 | raise Exception(file + ' is neither a hash nor a file') 38 | else: 39 | scan(sys.stdin, selected) 40 | 41 | for hash in selected: 42 | if hash in available: 43 | print hash, available[hash] 44 | else: 45 | print hash, 'https://virusshare.com/download.4n6?sample=%s' % (sha256[hash]) 46 | -------------------------------------------------------------------------------- /sampleinfo.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import ast 3 | import os 4 | import io 5 | import json 6 | import sys 7 | import re 8 | from datetime import date 9 | 10 | with io.open('sampledates.json', 'rb') as cache: 11 | dates = json.load(cache) 12 | with io.open('filedates.json', 'rb') as cache: 13 | filedates = json.load(cache) 14 | with io.open('samples.json', 'rb') as cache: 15 | ransomware = json.load(cache) 16 | 17 | def family(hash): 18 | return ' '.join(ransomware[hash]['families']) 19 | 20 | def label(hash): 21 | lab = ransomware[hash]['label'] 22 | return lab if lab is not None else '???' 23 | 24 | def dateinfo(hash): 25 | if hash in dates: 26 | return str(date.fromtimestamp(dates[hash])) 27 | stats = filedates[ransomware[hash]['file']] 28 | stdev = stats['stdev'] / 86400 29 | return str(date.fromtimestamp(stats['mean'])) + (' ±%.1f' % stdev) + ' days' 30 | 31 | def query(hash): 32 | if hash in ransomware: 33 | selected.append(hash) 34 | 35 | def scan(infile, selected): 36 | for line in infile: 37 | hash = line[0:32] 38 | query(hash) 39 | 40 | selected = [] 41 | if len(sys.argv) > 1: 42 | for file in sys.argv[1:]: 43 | if os.path.isfile(file): 44 | with io.open(file, 'rb') as infile: 45 | scan(infile, selected) 46 | elif re.match('[0-9a-f]{32}$', file): 47 | query(file) 48 | else: 49 | raise Exception(file + ' is neither a hash nor a file') 50 | else: 51 | scan(sys.stdin, selected) 52 | for hash in sorted(selected, key=family): 53 | print hash, family(hash), label(hash), dateinfo(hash) 54 | -------------------------------------------------------------------------------- /filter2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import sys 5 | 6 | # ignore all SINGLETON samples. these are the ones for which not a single AV 7 | # engine has a non-generic name. rationale is that if it isn't important 8 | # enough to get a name, it never had any significant spread. 9 | labels = dict() 10 | with io.open('ransomware.labels', 'rb') as infile: 11 | for line in infile: 12 | hash = line[0:32] 13 | label = line[33:].rstrip() 14 | if not label.startswith('SINGLETON:'): 15 | labels[hash] = label 16 | sys.stderr.write(str(len(labels)) + ' non-singleton samples\n') 17 | 18 | # ransomware.md5 just lists the hashes, and sample source files. because it is 19 | # much smaller, processing it is orders of magnitude faster. 20 | meta = dict() 21 | sha256 = dict() 22 | with io.open('ransomware.jsons', 'rb') as ransom: 23 | index = 0 24 | selected = 0 25 | for line in ransom: 26 | data = json.loads(line) 27 | hash = data['md5'] 28 | families = data['families'] 29 | if hash in labels or len(families) > 0: 30 | obj = { key: data[key] for key in ['families', 'file'] } 31 | obj['label'] = labels[hash] if hash in labels else None 32 | meta[hash] = obj 33 | selected += 1 34 | sha256[hash] = data['sha256'] 35 | 36 | index += 1 37 | if index % 1000 == 0: 38 | sys.stderr.write('\r' + str(index / 1000) + 'k ') 39 | sys.stderr.flush() 40 | sys.stderr.write('\r' + str(selected) + ' / ' + str(index) + '\n') 41 | 42 | with io.open('samples.json', 'wb') as out: 43 | json.dump(meta, out) 44 | with io.open('sha256.json', 'wb') as out: 45 | json.dump(sha256, out) 46 | -------------------------------------------------------------------------------- /familydates.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import os 3 | import io 4 | import json 5 | import sys 6 | from datetime import date 7 | from numpy import median 8 | 9 | with io.open('dates.json', 'rb') as exact: 10 | dates = json.load(exact) 11 | with io.open('samples.json', 'rb') as infile: 12 | ransomware = json.load(infile) 13 | 14 | families = dict() 15 | labels = dict() 16 | known = unknown = 0 17 | for hash, data in ransomware.items(): 18 | for family in data['families']: 19 | if family not in families: 20 | families[family] = [] 21 | families[family].append(hash) 22 | 23 | if len(data['families']) == 0: 24 | label = data['label'] 25 | if label not in labels: 26 | labels[label] = [] 27 | labels[label].append(hash) 28 | unknown += 1 29 | else: 30 | known += 1 31 | 32 | print len(families), 'families containing', known, 'samples' 33 | print len(labels), 'labels for', unknown, 'samples' 34 | 35 | med_date = { family: median([dates[hash] for hash in families[family]]) 36 | for family in families.keys() } 37 | fam = sorted(families.keys(), key=lambda family: med_date[family]) 38 | with io.open('familydates1.tmp', 'wb') as dataset: 39 | dataset.write('rank,family\n') 40 | rank = 1 41 | for family in fam: 42 | dataset.write(str(rank) + ',' + family + '\n') 43 | rank += 1 44 | with io.open('familydates2.tmp', 'wb') as dataset: 45 | dataset.write('rank,date\n') 46 | rank = 1 47 | for family in fam: 48 | for hash in families[family]: 49 | dataset.write(str(rank) + ',' + str(date.fromtimestamp(dates[hash])) + '\n') 50 | rank += 1 51 | os.system('R --vanilla --slave -f familydates.r') 52 | os.remove('familydates1.tmp') 53 | os.remove('familydates2.tmp') 54 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: dates.json ransomware-families.pdf familydates.pdf 2 | 3 | ransomware/%.jsons.gz: filter1.py families.md Raw/%.ldjson.tar.gz \ 4 | vxshare-filetypes/%.zip.file.json.gz 5 | python filter1.py $* 6 | 7 | ransomware.jsons: $(foreach id,$(wildcard Raw/*.ldjson.tar.gz), \ 8 | $(subst .ldjson.tar.gz,.jsons.gz,$(subst Raw/,ransomware/,$(id)))) 9 | python filter1combine.py >ransomware.jsons 10 | 11 | ransomware.labels: ransomware.jsons avclass/avclass_labeler.py 12 | python avclass/avclass_labeler.py -vt ransomware.jsons >$@ 13 | 14 | samples.json sha256.json: filter2.py ransomware.labels ransomware.jsons 15 | python filter2.py 16 | 17 | ransomware-families.pdf: samples.json barplot.py 18 | python barplot.py 19 | 20 | # deliberately using the MetaInfo directory itself as prerequisite. declaring 21 | # all files takes almost 1min just for make to list the files. directory mtime 22 | # changes whenever a file is added, and these files are never modified anyway. 23 | sampledates.json: samples.json sampledates.py MetaInfo/ 24 | python sampledates.py 25 | 26 | filedates.pdf filedates.json: sampledates.json samples.json filedates.py \ 27 | filedates.r 28 | python filedates.py 29 | 30 | dates.json: sampledates.json filedates.json samples.json dates.py 31 | python dates.py 32 | 33 | familydates.pdf: dates.json samples.json familydates.py familydates.r 34 | python familydates.py 35 | 36 | todo.md5: sampledates.json filedates.json samples.json statsampler.py 37 | python statsampler.py 1440 38 | 39 | # these files take about 1 hour to build. ransomware.jsons is around 1.1GB. 40 | # .PRECIOUS makes sure make doesn't accidentally delete any of them. 41 | .PRECIOUS: ransomware/*.jsons.gz ransomware.jsons ransomware.labels \ 42 | families.md5 ransomware.md5 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ransomware samples dataset 2 | 3 | Our ransomware dataset is based on [VirusShare](https://virusshare.com)'s collection of 33.9M samples. 4 | We used John Seymour's [dataset](https://twitter.com/_delta_zero/status/1113477389961416704) containing the VirusTotal [labels](https://www.zerofox.com/blog/labeling-virusshare-corpus/) of all 33.2M samples from June 2012 to February 2019. 5 | 6 | We downloaded the [Raw dataset](https://drive.google.com/drive/folders/1oKr5hP8Dlz1QABUOX-HKi2n8tyRkbaDN) and filtered it for all `ransom` detections. 7 | These 456856 samples are then further filtered for Windows executables using the [VirusShare filetypes dataset](https://a4lg.com/downloads/vxshare/). 8 | Filtering by filetype is mostly meant to remove a significant number of browser-based HTML ransom demands, which are scary but harmless (in an up-to-date browser). 9 | 10 | The resulting 339594 samples were then classified using the [AVClass malware labeling tool](https://github.com/malicialab/avclass) to group them by family. 11 | This yielded 23616 `SINGLETON`s (samples with generic names only), 1562 "families" containing only one sample, and 1671 ransomware families with 2 or more members. 12 | Filtering out the `SINGLETON`s leaves a base set of 315978 samples. 13 | 14 | ![almost but not quite a power law](ransomware-family-distribution.png) 15 | 16 | To the surprise of absolutely no one, it's the usual long-tailed distribution. 17 | What is surprising is that the 2-sample families do contain some ransomware that did make the news, eg. *GoldenEye*, *ZeroLocker* and *Bad Rabbit*. 18 | The 1-sample families contain many generic names like `940677ecdf` or `aawj`, but also known ransomware like *Alcatraz Locker*. 19 | 20 | The head end: 21 | 22 | ![Zeus, Winwebsec, Virlock, ZeroAccess, PornoBlocker, …](popular-ransomware.png) 23 | -------------------------------------------------------------------------------- /filedates.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | from datetime import date 5 | from math import sqrt 6 | 7 | class FileStats: 8 | def __init__(self): 9 | self.mean = 0 10 | self.var = 0 11 | self.stdev = float('nan') 12 | self.known = dict() 13 | 14 | def add(self, hash, ts): 15 | self.known[hash] = ts 16 | # sum-of-squares algorithm works fine with python's bigints 17 | self.mean += ts 18 | self.var += ts * ts 19 | 20 | def finalize(self): 21 | n = len(self.known) 22 | if n <= 1: 23 | # (for 1, mean is well-defined and doesn't need division by 1) 24 | self.var = self.stdev = float('inf') 25 | return 26 | self.var = (self.var - self.mean * self.mean // n) // (n - 1) 27 | self.mean //= n 28 | self.stdev = sqrt(self.var) 29 | 30 | with io.open('sampledates.json', 'rb') as cache: 31 | dates = json.load(cache) 32 | with io.open('samples.json', 'rb') as infile: 33 | ransomware = json.load(infile) 34 | 35 | samples = dict() 36 | for hash, data in ransomware.items(): 37 | file = data['file'] 38 | if file not in samples: 39 | samples[file] = FileStats() 40 | if hash in dates and dates[hash] is not None: 41 | samples[file].add(hash, dates[hash]) 42 | for file in samples.keys(): 43 | samples[file].finalize() 44 | 45 | dates = dict() 46 | for file in sorted(samples.keys()): 47 | dates[file] = { 'mean': samples[file].mean, 'stdev': samples[file].stdev } 48 | with io.open('filedates.json', 'wb') as outfile: 49 | json.dump(dates, outfile) 50 | 51 | with io.open('filedates.tmp', 'wb') as dataset: 52 | dataset.write('filename,date\n') 53 | for file in sorted(samples.keys()): 54 | for hash, d in samples[file].known.items(): 55 | dataset.write(file[11:16] + ',' + str(date.fromtimestamp(d)) + '\n') 56 | os.system('R --vanilla --slave -f filedates.r') 57 | os.remove('filedates.tmp') 58 | -------------------------------------------------------------------------------- /statsampler.py: -------------------------------------------------------------------------------- 1 | ## statistics-aware sampler for metadata download ## 2 | # the metadata samples are chosen preferentially from files with high 3 | # stdev. these give the lowest overall error when the submission date of 4 | # samples is estimated from their file's average submission time. 5 | import io 6 | import sys 7 | import json 8 | import random 9 | from math import ceil 10 | from datetime import date 11 | 12 | with io.open('sampledates.json', 'rb') as cache: 13 | dates = json.load(cache) 14 | with io.open('filedates.json', 'rb') as cache: 15 | stats = json.load(cache) 16 | with io.open('samples.json', 'rb') as infile: 17 | ransomware = json.load(infile) 18 | 19 | def append(groups, group, item, stdev): 20 | if group not in groups: 21 | groups[group] = { 'stdev': 0, 'items': [] } 22 | groups[group]['items'].append(item) 23 | groups[group]['stdev'] += stdev 24 | 25 | INFINITY = 1e9 26 | families = dict() 27 | labeled = dict() 28 | for hash, data in ransomware.items(): 29 | if hash in dates: 30 | continue 31 | file = data['file'] 32 | stdev = stats[file]['stdev'] 33 | if stdev > INFINITY: 34 | stdev = INFINITY 35 | if len(data['families']) > 0: 36 | for family in data['families']: 37 | append(families, family, hash, stdev) 38 | else: 39 | append(labeled, file, hash, stdev) 40 | 41 | def sample(grouped, count, todo): 42 | total_score = sum(data['stdev'] for data in grouped.values()) 43 | for group in sorted(grouped, key=lambda group: -grouped[group]['stdev']): 44 | # ceil() so we have a chance to spot files where the known samples 45 | # suggest a really low stdev, but the file itself actually has a high 46 | # stdev nevertheless. 47 | unknown = grouped[group]['items'] 48 | score = grouped[group]['stdev'] 49 | n = min(int(ceil(count * score / total_score)), len(unknown)) 50 | for i in range(n): 51 | index = random.randrange(len(unknown)) 52 | hash = unknown.pop(index) 53 | todo.write(hash + ' ' + group + '\n') 54 | if n > 0: 55 | print group, n, score 56 | 57 | with io.open('todo.md5', 'wb') as todo: 58 | count = int(sys.argv[1]) 59 | sample(families, count, todo) 60 | sample(labeled, count, todo) 61 | -------------------------------------------------------------------------------- /barplot.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import numpy as np 4 | from collections import Counter 5 | import matplotlib.pyplot as plt 6 | from matplotlib.backends.backend_pdf import PdfPages 7 | 8 | with io.open('samples.json', 'rb') as infile: 9 | ransomware = json.load(infile) 10 | 11 | labelcounts = Counter() 12 | familycounts = Counter() 13 | for hash, data in ransomware.items(): 14 | if len(data['families']) > 0: 15 | for family in data['families']: 16 | familycounts[family] += 1 17 | else: 18 | labelcounts[data['label']] += 1 19 | 20 | single = 0 21 | for family in labelcounts.keys(): 22 | if labelcounts[family] == 1: 23 | single += 1 24 | del labelcounts[family] 25 | print 'Ignoring', single, 'labels with only one sample' 26 | print 'Number of familis: ' + str(len(familycounts.keys())) 27 | print 'Number of labels: ' + str(len(labelcounts.keys())) 28 | 29 | def families(counts): 30 | return sorted(counts.keys(), key=lambda family: -counts[family]) 31 | 32 | def render(counts, pdf, cutoff, label): 33 | plt.figure(figsize=(12, 10)) 34 | plt.ylabel('Number of samples') 35 | plt.xlabel(label) 36 | #plt.title('Most popular ransomware families') 37 | plt.yscale('log') 38 | 39 | bars = [] 40 | height = [] 41 | for family in families(counts): 42 | if counts[family] > cutoff: 43 | bars.append(family) 44 | height.append(counts[family]) 45 | 46 | x_pos = np.arange(len(bars)) 47 | pbars = plt.bar(x_pos, height, 0.8) 48 | 49 | # create names on the x-axis 50 | plt.xticks(x_pos, bars, rotation=-90, size=7) 51 | 52 | for rect in pbars: 53 | height = rect.get_height() 54 | plt.text(rect.get_x() + rect.get_width()/2.0, height, '%d' % int(height), ha='center', va='bottom', rotation=-90, size=7) 55 | 56 | plt.subplots_adjust(bottom=0.15) 57 | pdf.savefig() 58 | 59 | with PdfPages('ransomware-families.pdf') as pdf: 60 | render(familycounts, pdf, 0, 'Ransomware families') 61 | render(labelcounts, pdf, 500, 'AVclass labels') 62 | 63 | plt.figure(figsize=(12, 10)) 64 | plt.ylabel('Number of samples') 65 | plt.xlabel('AVclass label (rank)') 66 | plt.title('Distribution of ransomware labels') 67 | plt.yscale('log') 68 | 69 | bars = [] 70 | height = [] 71 | for family in families(labelcounts): 72 | bars.append(family) 73 | height.append(labelcounts[family]) 74 | 75 | # this is too dense to actually distinguish individual bars anyway, so let's 76 | # just fill the entire area. that's way faster anyway. 77 | x_pos = np.arange(len(bars)) 78 | plt.fill_between(x_pos, 0, height) 79 | pdf.savefig() 80 | 81 | plt.close() 82 | -------------------------------------------------------------------------------- /genericide.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import json 4 | import sys 5 | 6 | # ignore all SINGLETON samples. these are the ones for which not a single AV 7 | # engine has a non-generic name. rationale is that if it isn't important 8 | # enough to get a name, it never had any significant spread. 9 | recognized = dict() 10 | with io.open('ransomware.labels', 'rb') as infile: 11 | for line in infile: 12 | hash = line[0:32] 13 | family = line[33:].rstrip() 14 | if not family.startswith('SINGLETON:'): 15 | recognized[hash] = family 16 | sys.stderr.write(str(len(recognized)) + ' non-singleton samples\n') 17 | 18 | stats = dict() 19 | with io.open('ransomware.jsons', 'rb') as ransom: 20 | total = 0 21 | for line in ransom: 22 | data = json.loads(line) 23 | hash = data['md5'] 24 | file = data['file'] 25 | for family in data['families']: 26 | if family not in stats: 27 | stats[family] = { 'samples': 0, 'labels': dict(), 28 | 'unicorns': 0, 'detect': 0, 'ransom': 0 } 29 | stats[family]['samples'] += 1 30 | if hash in recognized: 31 | label = recognized[hash] 32 | if label not in stats[family]['labels']: 33 | stats[family]['labels'][label] = 0 34 | stats[family]['labels'][label] += 1 35 | if len(data['families']) == 1: 36 | stats[family]['unicorns'] += 1 37 | stats[family]['detect'] += len(data['scans']) 38 | stats[family]['ransom'] += sum(1 for eng, res in data['scans'].items() 39 | if 'ransom' in res['result'].lower()) 40 | 41 | total += 1 42 | if total % 1000 == 0: 43 | sys.stderr.write('\r' + str(total / 1000) + 'k ') 44 | sys.stderr.flush() 45 | sys.stderr.write('\r' + str(total) + '\n') 46 | 47 | for family, stat in stats.items(): 48 | samples = stat['samples'] 49 | stat['fraction'] = samples / float(total) 50 | stat['unicorns'] /= float(samples) 51 | stat['labels'] = len(stat['labels']) 52 | stat['ransom'] /= float(stat['detect']) 53 | stat['score'] = stat['samples'] * stat['labels'] / (stat['ransom'] + 1e-5) 54 | families = sorted(stats.keys(), key=lambda family: stats[family]['score']) 55 | 56 | with io.open('genericide.md', 'wb') as select: 57 | select.write('| %-14s | %-7s | %-6s | %-8s | %-8s | %-8s |\n' % ('family', 'samples', 'labels', 'ransom', 'fraction', 'unicorns')) 58 | select.write('|-%-14s-|-%-7s-|-%-6s-|-%-8s-|-%-8s-|-%-8s-|\n' % ('-'*14, '-'*7, '-'*6, '-'*8, '-'*8, '-'*8)) 59 | for family in families: 60 | stat = stats[family] 61 | select.write('| %-14s | %-7d | %-6d | %8.5f | %8.5f | %8.5f |\n' % 62 | (family, stat['samples'], stat['labels'], stat['ransom'] * 100, stat['fraction'] * 100, stat['unicorns'])) 63 | -------------------------------------------------------------------------------- /samplepicker.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import ast 3 | import os 4 | import io 5 | import json 6 | import sys 7 | import re 8 | from numpy import median 9 | from datetime import date 10 | from collections import Counter 11 | 12 | with io.open('samples.json', 'rb') as cache: 13 | ransomware = json.load(cache) 14 | with io.open('sampledates.json', 'rb') as cache: 15 | for hash, ts in json.load(cache).items(): 16 | ransomware[hash]['exactdate'] = ts 17 | with io.open('dates.json', 'rb') as cache: 18 | for hash, ts in json.load(cache).items(): 19 | ransomware[hash]['date'] = ts 20 | with io.open('sha256.json', 'rb') as cache: 21 | for hash, sha256 in json.load(cache).items(): 22 | ransomware[hash]['sha256'] = sha256 23 | 24 | available = dict() 25 | if os.path.isfile('index.md5'): 26 | with io.open('index.md5', 'rb') as infile: 27 | for line in infile: 28 | hash = line[0:32] 29 | path = line[34:].rstrip() 30 | available[hash] = path 31 | else: 32 | sys.stderr.write('index missing! generate index.md5 by doing:\n') 33 | sys.stderr.write(' find /path/to/samples -type f -print0 | xargs -0 md5sum >index.md5\n') 34 | 35 | def score(family): 36 | med = median([ransomware[hash]['date'] for hash in families[family]]) 37 | return lambda hash: ( 38 | hash in available, # prefer samples that we already have 39 | -len(ransomware[hash]['families']), # prefer isotopically pure samples 40 | 'exactdate' in ransomware[hash], # prefer samples with known date 41 | -abs(med - ransomware[hash]['date']), # prefer samples near median date 42 | hash) # deterministically random tiebreaker 43 | 44 | if len(sys.argv) > 1: 45 | num = int(sys.argv[1]) 46 | else: 47 | num = 10 48 | processed = set() 49 | done = Counter() 50 | for file in sys.argv[2:]: 51 | family_filters = dict() 52 | with io.open(file, 'rb') as infile: 53 | for line in infile: # find the first table 54 | if line.startswith('|---'): 55 | break 56 | for line in infile: 57 | if not line.startswith('|'): 58 | break 59 | 60 | task, hash, family = line.split('|')[1:4] 61 | hash = re.search('`([0-9a-f]{32})`', hash).group(1) 62 | family = family.strip() 63 | if '#' in task and '~~' not in task: 64 | done[family] += 1 65 | processed.add(hash) 66 | 67 | families = dict() 68 | for hash, data in ransomware.items(): 69 | for family in data['families']: 70 | if hash not in processed: 71 | if family not in families: 72 | families[family] = [] 73 | families[family].append(hash) 74 | 75 | def dateinfo(meta): 76 | ts = meta['date'] 77 | if 'exactdate' in meta and meta['exactdate'] is not None: 78 | ts = meta['exactdate'] 79 | dt = str(date.fromtimestamp(ts)) 80 | if 'exactdate' not in meta: 81 | return '~' + dt 82 | return dt 83 | 84 | with io.open('suggested.md5', 'wb') as todo, io.open('download.md5', 'wb') as dl: 85 | for family, samples in sorted(families.items()): 86 | hashes = sorted(samples, key=score(family), reverse=True) 87 | n = num - done[family] 88 | if n <= 0: 89 | continue 90 | for hash in hashes[0:n]: 91 | meta = ransomware[hash] 92 | fams = ' '.join(meta['families']) 93 | label = ('`' + meta['label'] + '`') if meta['label'] is not None else 'missing' 94 | print '| TODO | `%32s` | %-23s | %-17s | %-11s | `drop` | 600 | TBD |' % ( 95 | hash, family, label, dateinfo(meta)) 96 | if hash in available: 97 | todo.write('%s %s\n' % (hash, available[hash])) 98 | else: 99 | dl.write('%s https://virusshare.com/download.4n6?sample=%s\n' % (hash, meta['sha256'])) 100 | -------------------------------------------------------------------------------- /filter1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import re 4 | import sys 5 | import gzip 6 | import json 7 | 8 | # directory containing (compressed) downloaded files from 9 | # https://drive.google.com/drive/folders/1oKr5hP8Dlz1QABUOX-HKi2n8tyRkbaDN 10 | # that's 19GB compressed / 200GB uncompressed; this script runs about 30min. 11 | RAW = 'Raw/' 12 | # directory containing the VirusShare_*.zip.file.json files from 13 | # https://a4lg.com/downloads/vxshare/ 14 | # they're in the vxshare-filetypes-???-v5.tar.xz files. 15 | # these are gzip compressed to save storage. 16 | # processing time 3-4min, dominated by JSON parsing. 17 | TYPES = 'vxshare-filetypes/' 18 | 19 | def match(data, family_filters): 20 | return [family for family, words in family_filters.items() 21 | if all(word in data for word in words)] 22 | 23 | def scan_jsons(basename, family_filters): 24 | peexe = set() 25 | with gzip.open(TYPES + basename + '.zip.file.json.gz', 'rb') as types: 26 | for row in json.load(types): 27 | if 'sig' not in row: 28 | # there's a few (1260) entries missing the "file" results. 29 | # they're all ELF's, so it doesn't matter. 30 | continue 31 | if row['sig'].startswith('PE'): 32 | f = row['f'] 33 | if not f.startswith("VirusShare_"): 34 | raise Exception('invalid sample name ' + f) 35 | peexe.add(f[11:].rstrip()) 36 | 37 | with gzip.open(RAW + basename + '.ldjson.tar.gz', 'rb') as tgz: 38 | # the Raw dataset files are single-element tar.gz files, which is a bit 39 | # silly. however, since all tars are single-element, we can just strip 40 | # a single 512-byte tar header block and treat the rest of the file as 41 | # ordinary text. 42 | # since the files always end with a newline, tar's zero-padding of the 43 | # last block appears as a line containing only NUL bytes, which won't 44 | # match the "ransom" filter anyway. 45 | tar_header = tgz.read(512) 46 | if len(tar_header) != 512: 47 | raise Exception('cannot skip the TAR header?') 48 | for line in tgz: 49 | if '\x00' in line: 50 | break 51 | line = line.lower() 52 | # pre filter before parsing to avoid unnexessary parsing 53 | if 'ransom' not in line and len(match(line, family_filters)) == 0: 54 | continue 55 | data = json.loads(line) 56 | # pre-filter for PE executables. mostly meant to remove 57 | # browser-based HTML ransom notices, which cannot affect files 58 | # (in a working browser). 59 | if data['md5'] not in peexe: 60 | continue 61 | yield data 62 | 63 | family_filters = dict() 64 | with io.open('families.md', 'rb') as families: 65 | for line in families: # find the first table 66 | if line.startswith('|---'): 67 | break 68 | for line in families: 69 | if not line.startswith('|'): 70 | break 71 | family = line.split(r'|')[1].strip() 72 | if '~~' not in family: 73 | family_filters[family] = re.split('[ ./-]+', family.lower()) 74 | 75 | if len(sys.argv) < 2: 76 | sys.stderr.write('usage: python filter1.py basename\n') 77 | sys.exit(1) 78 | basename = sys.argv[1] 79 | 80 | with gzip.open('ransomware/' + basename + '.tmp', 'wb') as ransomware: 81 | # stream over the actual metadata files. they're too large to hold in 82 | # memory. 83 | for data in scan_jsons(basename, family_filters): 84 | ransom = False 85 | detections = dict() 86 | families = set() 87 | undetected = [] 88 | for engine, res in data['scans'].items(): 89 | if res['detected']: 90 | result = res['result'] 91 | detections[engine] = result 92 | families.update(match(result, family_filters)) 93 | ransom |= 'ransom' in result 94 | else: 95 | undetected.append(engine) 96 | if not ransom and len(families) == 0: 97 | # happens for prefilter-detected keywords that are on the same 98 | # line, but not in the same engine's detection result 99 | continue 100 | 101 | obj = { key: data[key] for key in ['md5', 'sha1', 'sha256'] } 102 | obj['file'] = basename 103 | # deliberately using the verbose format that avclass can read 104 | obj['scans'] = { engine: { 'detected': True, 'result': res } 105 | for engine, res in detections.items() } 106 | obj['undetected'] = undetected 107 | obj['families'] = list(families) 108 | json.dump(obj, ransomware) 109 | ransomware.write('\n') 110 | os.rename('ransomware/' + basename + '.tmp', 'ransomware/' + basename + '.jsons.gz') 111 | -------------------------------------------------------------------------------- /genericide.md: -------------------------------------------------------------------------------- 1 | | family | samples | labels | ransom | fraction | unicorns | 2 | |----------------|---------|--------|----------|----------|----------| 3 | | Ransom32 | 2 | 0 | 60.93750 | 0.00015 | 1.00000 | 4 | | NotPetya | 1 | 1 | 39.28571 | 0.00008 | 0.00000 | 5 | | Mordor | 1 | 1 | 39.21569 | 0.00008 | 0.00000 | 6 | | MMLocker | 1 | 1 | 23.40426 | 0.00008 | 1.00000 | 7 | | ZCryptor | 1 | 1 | 23.07692 | 0.00008 | 1.00000 | 8 | | Fsociety | 2 | 1 | 23.95833 | 0.00015 | 1.00000 | 9 | | Zerolocker | 2 | 1 | 22.68041 | 0.00015 | 1.00000 | 10 | | WinPlock | 1 | 1 | 7.50000 | 0.00008 | 0.00000 | 11 | | MarsJoke | 3 | 1 | 22.14765 | 0.00023 | 0.66667 | 12 | | 7ev3n | 2 | 2 | 25.45455 | 0.00015 | 0.50000 | 13 | | SamSam | 3 | 2 | 35.33333 | 0.00023 | 1.00000 | 14 | | BadRabbit | 4 | 2 | 32.09877 | 0.00030 | 1.00000 | 15 | | BandarChor | 2 | 2 | 15.00000 | 0.00015 | 0.50000 | 16 | | Coinvault | 4 | 1 | 12.12121 | 0.00030 | 0.75000 | 17 | | ORX-Locker | 3 | 3 | 21.62162 | 0.00023 | 0.66667 | 18 | | CryptoFortress | 2 | 2 | 8.79121 | 0.00015 | 0.50000 | 19 | | Encryptor | 6 | 2 | 23.10231 | 0.00045 | 0.00000 | 20 | | Satana | 5 | 3 | 25.39062 | 0.00038 | 0.80000 | 21 | | DirtyDecrypt | 2 | 2 | 6.15385 | 0.00015 | 1.00000 | 22 | | Maktub | 5 | 3 | 11.58798 | 0.00038 | 0.80000 | 23 | | Radamant | 7 | 5 | 15.45455 | 0.00053 | 1.00000 | 24 | | Dharma | 16 | 5 | 34.64567 | 0.00120 | 0.00000 | 25 | | AlphaCrypt | 17 | 2 | 14.11451 | 0.00128 | 0.05882 | 26 | | Bucbi | 8 | 5 | 10.90343 | 0.00060 | 0.50000 | 27 | | Kraken | 14 | 7 | 13.16998 | 0.00105 | 0.50000 | 28 | | Jigsaw | 35 | 6 | 21.38574 | 0.00263 | 0.80000 | 29 | | DMALocker | 12 | 7 | 8.08950 | 0.00090 | 0.83333 | 30 | | Ransom-FUE | 17 | 5 | 4.27586 | 0.00128 | 0.82353 | 31 | | VaultCrypt | 13 | 8 | 3.83944 | 0.00098 | 0.76923 | 32 | | CrypVault | 16 | 9 | 4.17661 | 0.00120 | 0.75000 | 33 | | VenusLocker | 18 | 8 | 2.36753 | 0.00135 | 0.00000 | 34 | | CryptoMix | 52 | 17 | 13.34392 | 0.00391 | 0.13462 | 35 | | Gpcoder | 68 | 17 | 13.31794 | 0.00511 | 0.80882 | 36 | | Slocker | 28 | 12 | 3.40081 | 0.00210 | 0.35714 | 37 | | Zyklon | 45 | 16 | 6.48402 | 0.00338 | 0.55556 | 38 | | Hidden Tear | 75 | 26 | 15.85802 | 0.00564 | 0.86667 | 39 | | Nemucod | 67 | 21 | 7.30634 | 0.00503 | 0.67164 | 40 | | HydraCrypt | 79 | 19 | 7.67884 | 0.00594 | 0.72152 | 41 | | TorrentLocker | 134 | 26 | 15.89804 | 0.01007 | 0.14179 | 42 | | CryptXXX | 131 | 41 | 19.01347 | 0.00984 | 0.19084 | 43 | | Crysis | 104 | 33 | 11.78563 | 0.00781 | 0.65385 | 44 | | Fantom | 34 | 11 | 1.25698 | 0.00255 | 0.94118 | 45 | | Ryuk | 63 | 44 | 3.67615 | 0.00473 | 0.93651 | 46 | | Jaff | 77 | 37 | 3.24531 | 0.00579 | 0.93506 | 47 | | Exotic | 1 | 1 | 0.00000 | 0.00008 | 1.00000 | 48 | | PayDay | 1 | 1 | 0.00000 | 0.00008 | 1.00000 | 49 | | Petya | 245 | 14 | 2.83861 | 0.01841 | 0.93878 | 50 | | Critroni | 198 | 50 | 6.45869 | 0.01488 | 0.47980 | 51 | | Chanitor | 144 | 35 | 2.82680 | 0.01082 | 0.82639 | 52 | | Troldesh | 548 | 57 | 14.62629 | 0.04117 | 0.39234 | 53 | | CryptoDefense | 286 | 56 | 7.33318 | 0.02149 | 0.36713 | 54 | | Chimera | 65 | 11 | 0.24225 | 0.00488 | 0.98462 | 55 | | Citron | 73 | 12 | 0.23383 | 0.00548 | 1.00000 | 56 | | Pacman | 4 | 1 | 0.00000 | 0.00030 | 1.00000 | 57 | | Hi Buddy | 4 | 1 | 0.00000 | 0.00030 | 1.00000 | 58 | | Serpent | 143 | 19 | 0.54210 | 0.01074 | 0.96503 | 59 | | Ginx | 109 | 40 | 0.67827 | 0.00819 | 0.97248 | 60 | | WildFire | 80 | 4 | 0.03635 | 0.00601 | 1.00000 | 61 | | Hermes | 601 | 70 | 2.51479 | 0.04516 | 0.82696 | 62 | | TeslaCrypt | 2439 | 117 | 14.25607 | 0.18326 | 0.42230 | 63 | | WannaCry | 8154 | 113 | 38.41410 | 0.61267 | 0.97326 | 64 | | Locky | 2915 | 153 | 17.65814 | 0.21902 | 0.44220 | 65 | | Onion | 1499 | 176 | 9.30733 | 0.11263 | 0.47832 | 66 | | Threat Finder | 9 | 4 | 0.00000 | 0.00068 | 1.00000 | 67 | | CTB-Locker | 1422 | 122 | 4.73694 | 0.10684 | 0.79395 | 68 | | CryptoWall | 1623 | 162 | 6.59624 | 0.12195 | 0.61553 | 69 | | CryptoLocker | 6574 | 169 | 27.18724 | 0.49395 | 0.17630 | 70 | | Xorist | 6639 | 159 | 25.79024 | 0.49883 | 0.33469 | 71 | | Power Worm | 15 | 3 | 0.00000 | 0.00113 | 1.00000 | 72 | | Nymaim | 1176 | 135 | 2.64127 | 0.08836 | 0.58418 | 73 | | XRTN | 88 | 35 | 0.02877 | 0.00661 | 1.00000 | 74 | | VirLock | 13563 | 99 | 11.82076 | 1.01908 | 0.84458 | 75 | | Cerber | 5236 | 297 | 11.26451 | 0.39342 | 0.49981 | 76 | | Crowti | 2839 | 279 | 5.46956 | 0.21331 | 0.63332 | 77 | | Pclock | 27 | 6 | 0.00000 | 0.00203 | 1.00000 | 78 | | Domino | 19 | 9 | 0.00000 | 0.00143 | 1.00000 | 79 | | Matsnu | 2627 | 192 | 2.32300 | 0.19738 | 0.86563 | 80 | | Mamba | 42 | 7 | 0.00000 | 0.00316 | 1.00000 | 81 | | Globe | 4448 | 58 | 0.72468 | 0.33421 | 0.80126 | 82 | | Kovter | 6213 | 254 | 4.17811 | 0.46683 | 0.70964 | 83 | | DXXD | 404 | 170 | 0.15010 | 0.03036 | 0.88614 | 84 | | Revenge | 245 | 33 | 0.01022 | 0.01841 | 0.90612 | 85 | | Dumb | 779 | 209 | 0.11583 | 0.05853 | 0.89859 | 86 | | CHIP | 1470 | 123 | 0.11474 | 0.11045 | 0.99116 | 87 | | Bart | 2219 | 176 | 0.13290 | 0.16673 | 0.98963 | 88 | | Nuke | 1850 | 389 | 0.24189 | 0.13900 | 0.96757 | 89 | | Urausy | 32794 | 619 | 5.28424 | 2.46404 | 0.82747 | 90 | | Reveton | 23306 | 596 | 3.53314 | 1.75114 | 0.84099 | 91 | | Ransomlock | 35756 | 1766 | 12.09938 | 2.68659 | 0.70466 | 92 | | Sage | 11680 | 799 | 0.71278 | 0.87760 | 0.95950 | 93 | | Razy | 111950 | 1163 | 0.42245 | 8.41157 | 0.93777 | 94 | | Magic | 4084 | 116 | 0.00000 | 0.30686 | 0.98531 | 95 | | Enigma | 11424 | 1247 | 0.01817 | 0.85836 | 0.99492 | 96 | | Mole | 37496 | 1701 | 0.03619 | 2.81733 | 0.99176 | 97 | | Tox | 60370 | 953 | 0.01621 | 4.53601 | 0.99715 | 98 | | Rex | 719047 | 1544 | 0.00690 | 54.02692 | 0.99175 | 99 | -------------------------------------------------------------------------------- /families.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | *Warning: `filter1.py` wants to read the family names from the first table in 4 | this file! First column must be the family names. 5 | 6 | | family | avclass label | 7 | |-------------------|----------------| 8 | | 7ev3n | TBD | 9 | | Alcatraz-Locker | TBD | 10 | | AlphaCrypt | TBD | 11 | | AngryDuck | TBD | 12 | | BadRabbit | TBD | 13 | | BandarChor | TBD | 14 | | ~~Bart~~ | TBD | 15 | | Browlock | TBD | 16 | | Bucbi | TBD | 17 | | ~~CHIP~~ | TBD | 18 | | CTB-Locker | TBD | 19 | | Citron | TBD | 20 | | Cerber | `cerber` | 21 | | Chanitor | TBD | 22 | | Chimera | TBD | 23 | | Coinvault | TBD | 24 | | Comrade-Circle | TBD | 25 | | Critroni | TBD | 26 | | Crowti | TBD | 27 | | CryLocker | TBD | 28 | | CrypVault | TBD | 29 | | CryptFile2 | TBD | 30 | | CryptInfinite | TBD | 31 | | CryptXXX | TBD | 32 | | CryptoApp | TBD | 33 | | CryptoDefense | TBD | 34 | | CryptoFortress | TBD | 35 | | CryptoJocker | TBD | 36 | | CryptoLocker | TBD | 37 | | CryptoLuck | TBD | 38 | | CryptoMix | TBD | 39 | | CryptoShield | TBD | 40 | | CryptoWall | TBD | 41 | | Cryptvault | TBD | 42 | | Crysis | TBD | 43 | | DMALocker | TBD | 44 | | DXXD | TBD | 45 | | Dharma | TBD | 46 | | DirtyDecrypt | `dircrypt` | 47 | | Disakil | TBD | 48 | | Domino | TBD | 49 | | ~~Dumb~~ | TBD | 50 | | DummyLocker | TBD | 51 | | Encryptor | TBD | 52 | | ~~Enigma~~ | TBD | 53 | | Exotic | TBD | 54 | | Fakben | TBD | 55 | | Fantom | TBD | 56 | | Fsociety | TBD | 57 | | GandCrab | TBD | 58 | | Ginx | TBD | 59 | | ~~Globe~~ | TBD | 60 | | Gomasom | TBD | 61 | | Gpcoder | TBD | 62 | | Gremit | TBD | 63 | | HDDCryptor | TBD | 64 | | HadesLocker | TBD | 65 | | Herbst | TBD | 66 | | Hi Buddy | TBD | 67 | | Hidden Tear | TBD | 68 | | HydraCrypt | TBD | 69 | | Jaff | TBD | 70 | | Jigsaw | `jigsaw` | 71 | | Job Cryptor | TBD | 72 | | Karmen | TBD | 73 | | KeRanger | TBD | 74 | | KillerLocker | TBD | 75 | | Kostya | TBD | 76 | | ~~Kovter~~ | TBD | 77 | | Kraken | TBD | 78 | | LeChiffre | TBD | 79 | | Linkup | TBD | 80 | | Lock93 | TBD | 81 | | LockDroid | TBD | 82 | | LockLock | TBD | 83 | | Locky | TBD | 84 | | LowLevel404 | TBD | 85 | | MBL Advisory | TBD | 86 | | MIRCOP | TBD | 87 | | MMLocker | TBD | 88 | | Mabouia | TBD | 89 | | ~~Magic~~ | TBD | 90 | | Maktub | TBD | 91 | | Mamba | TBD | 92 | | MarsJoke | TBD | 93 | | Matsnu | TBD | 94 | | ~~Mole~~ | TBD | 95 | | Mordor | TBD | 96 | | Nanolocker | TBD | 97 | | Nemucod | TBD | 98 | | ~~Nuke~~ | TBD | 99 | | Nullbyte | TBD | 100 | | Nymaim | TBD | 101 | | ORX-Locker | TBD | 102 | | Onion | TBD | 103 | | Pacman | TBD | 104 | | PayCrypt | TBD | 105 | | PayDay | TBD | 106 | | Pclock | TBD | 107 | | Petya | TBD | 108 | | NotPetya | TBD | 109 | | Phonywall | TBD | 110 | | PoshCoder | TBD | 111 | | Power Worm | TBD | 112 | | Radamant | TBD | 113 | | Ransom-FUE | TBD | 114 | | Ransom32 | TBD | 115 | | ~~Ransomlock~~ | TBD | 116 | | ~~Razy~~ | TBD | 117 | | ~~Revenge~~ | TBD | 118 | | ~~Reveton~~ | TBD | 119 | | ~~Rex~~ | TBD | 120 | | Ryuk | TBD | 121 | | Hermes | TBD | 122 | | ~~Sage~~ | TBD | 123 | | SamSam | TBD | 124 | | Satana | TBD | 125 | | Serpent | TBD | 126 | | SharkRaaS | TBD | 127 | | Simplocker | TBD | 128 | | Slocker | TBD | 129 | | Sodinokibi | TBD | 130 | | Synolocker | TBD | 131 | | TeslaCrypt | `tescrypt` | 132 | | Threat Finder | TBD | 133 | | TorrentLocker | TBD | 134 | | ~~Tox~~ | TBD | 135 | | ToxCrypt | TBD | 136 | | Troldesh | TBD | 137 | | Umbrecrypt | TBD | 138 | | Unix.Ransomcrypt | TBD | 139 | | Unnamed_0 | TBD | 140 | | ~~Urausy~~ | TBD | 141 | | VaultCrypt | TBD | 142 | | VenusLocker | TBD | 143 | | Vipasana | TBD | 144 | | VirLock | TBD | 145 | | Viruscoder | TBD | 146 | | WannaCry | `wannacry` | 147 | | WildFire | TBD | 148 | | WinPlock | TBD | 149 | | XRTN | TBD | 150 | | Xorist | TBD | 151 | | ZCryptor | TBD | 152 | | Zerolocker | TBD | 153 | | Zyklon | TBD | 154 | | n1n1n1 | TBD | 155 | 156 | Struck-out families have been genericided because they (3 out of 4): 157 | 158 | - match a huge number of samples 159 | - contain way too many different avclass labels 160 | - are very rarely detected as something containing `ransom` 161 | - are a common English word 162 | 163 | `genericidy.py` is used to get the numbers and sorts the families according to 164 | the first 3 criteria. The families are then filtered manually. 165 | 166 | | family | samples | labels | ransom | fraction | unicorns | 167 | |----------------|---------|--------|----------|----------|----------| 168 | | Globe | 4448 | 58 | 0.72468 | 0.33421 | 0.80126 | 169 | | Kovter | 6213 | 254 | 4.17811 | 0.46683 | 0.70964 | 170 | | Revenge | 245 | 33 | 0.01022 | 0.01841 | 0.90612 | 171 | | Dumb | 779 | 209 | 0.11583 | 0.05853 | 0.89859 | 172 | | CHIP | 1470 | 123 | 0.11474 | 0.11045 | 0.99116 | 173 | | Bart | 2219 | 176 | 0.13290 | 0.16673 | 0.98963 | 174 | | Nuke | 1850 | 389 | 0.24189 | 0.13900 | 0.96757 | 175 | | Urausy | 32794 | 619 | 5.28424 | 2.46404 | 0.82747 | 176 | | Reveton | 23306 | 596 | 3.53314 | 1.75114 | 0.84099 | 177 | | Ransomlock | 35756 | 1766 | 12.09938 | 2.68659 | 0.70466 | 178 | | Sage | 11680 | 799 | 0.71278 | 0.87760 | 0.95950 | 179 | | Razy | 111950 | 1163 | 0.42245 | 8.41157 | 0.93777 | 180 | | Magic | 4084 | 116 | 0.00000 | 0.30686 | 0.98531 | 181 | | Enigma | 11424 | 1247 | 0.01817 | 0.85836 | 0.99492 | 182 | | Mole | 37496 | 1701 | 0.03619 | 2.81733 | 0.99176 | 183 | | Tox | 60370 | 953 | 0.01621 | 4.53601 | 0.99715 | 184 | | Rex | 719047 | 1544 | 0.00690 | 54.02692 | 0.99175 | 185 | 186 | - [Ransomlock](https://www.symantec.com/security-center/writeup/2009-041513-1400-99) 187 | is a generic label used for many annoying lock-the-desktop type programs. 188 | - [Tox](https://securingtomorrow.mcafee.com/other-blogs/mcafee-labs/meet-tox-ransomware-for-the-rest-of-us/) 189 | is a Ransomware-as-a-Service, but also matches `toxic` and is hence useless. 190 | - [Kovter](https://www.enigmasoftware.com/kovterransomware-removal/), 191 | [Reveton and Urausy](https://www.f-secure.com/en/web/labs_global/removing-police-themed-ransomware) 192 | are families of lock-the-desktop type programs with scary police-themes messages. 193 | - [VirLock](https://www.symantec.com/security-center/writeup/2014-120915-3319-99) 194 | has an incredibly huge number of samples, but that's fully expected for a 195 | fully polymorphic virus. Left in the sample, though sharing the encrypted 196 | files is expected to be problematic when each of them contains a copy of the 197 | malware... 198 | 199 | Some lock-the-desktop ransomware might encrypt files, but finding the single instance 200 | that does that in several 10k samples is impractical. 201 | 202 | # TheZoo collection 203 | 204 | https://github.com/ytisf/theZoo 205 | 206 | | family | directory name | 207 | |---------------|-----------------------------------| 208 | | Matsnu | `Ransomware.Matsnu` | 209 | | Radamant | `Ransomware.Radamant` | 210 | | Petrwrap | `Ransomware.Petrwrap` | 211 | | DirtyDecrypt | `Win32Dircrypt.Trojan.Ransom.ABZ` | 212 | | TeslaCrypt | `Ransomware.TeslaCrypt` | 213 | | Unnamed_0 | `Ransomware.Unnamed_0` | 214 | | Cerber | `Ransomware.Cerber` | 215 | | CryptoWall | `Ransomware.Cryptowall` | 216 | | Jigsaw | `Ransomware.Jigsaw` | 217 | | Locky | `Ransomware.Locky` | 218 | | Mamba | `Ransomware.Mamba` | 219 | | Petya | `Ransomware.Petya` | 220 | | Rex | `Ransomware.Rex` | 221 | | Satana | `Ransomware.Satana` | 222 | | Vipasana | `Ransomware.Vipasana` | 223 | | WannaCry_Plus | `Ransomware.WannaCry_Plus` | 224 | | WannaCry | `Ransomware.WannaCry` | 225 | 226 | # Cuckoo's signatures 227 | 228 | | family | signature filename | 229 | |-----------------|-------------------------------------------------------| 230 | | 7ev3n | `ransomware_fileextensions.py` | 231 | | Alcatraz-Locker | `ransomware_files.py`, `ransomware_fileextensions.py` | 232 | | AlphaCrypt | `ransomware_files.py` | 233 | | AngryDuck | `ransomware_fileextensions.py` | 234 | | Bart | `ransomware_fileextensions.py` | 235 | | CHIP | `ransomware_files.py`, `ransomware_fileextensions.py` | 236 | | CTB-Locker | `ransomware_files.py` | 237 | | Cerber | `ransomware_files.py` | 238 | | Chanitor | `ransom_mutex.py` | 239 | | Chimera | `ransomware_files.py` | 240 | | Comrade-Circle | `ransomware_files.py`, `ransomware_fileextensions.py` | 241 | | CryLocker | `ransomware_fileextensions.py` | 242 | | CrypVault | `ransomware_files.py` | 243 | | CryptFile2 | `ransomware_files.py`, `ransomware_fileextensions.py` | 244 | | CryptXXX | `ransomware_files.py`, `ransomware_fileextensions.py` | 245 | | CryptoLocker | `ransomware_files.py` | 246 | | CryptoLuck | `ransomware_fileextensions.py` | 247 | | CryptoMix | `ransomware_fileextensions.py` | 248 | | CryptoShield | `ransomware_fileextensions.py` | 249 | | CryptoWall | `ransomware_files.py` | 250 | | Crysis | `ransomware_fileextensions.py` | 251 | | DMALocker | `ransomware_files.py` | 252 | | DXXD | `ransomware_fileextensions.py` | 253 | | Dharma | `ransomware_fileextensions.py` | 254 | | Domino | `ransomware_fileextensions.py` | 255 | | DummyLocker | `ransomware_fileextensions.py` | 256 | | Enigma | `ransomware_fileextensions.py` | 257 | | Exotic | `ransomware_fileextensions.py` | 258 | | Fakben | `ransomware_files.py` | 259 | | Fantom | `ransomware_files.py`, `ransomware_fileextensions.py` | 260 | | Fsociety | `ransomware_fileextensions.py` | 261 | | Globe | `ransomware_fileextensions.py` | 262 | | Gremit | `ransomware_fileextensions.py` | 263 | | HadesLocker | `ransomware_files.py`, `ransomware_fileextensions.py` | 264 | | Herbst | `ransomware_fileextensions.py` | 265 | | HydraCrypt | `ransomware_files.py`, `ransomware_fileextensions.py` | 266 | | Jaff | `ransomware_fileextensions.py` | 267 | | Karmen | `ransomware_fileextensions.py` | 268 | | KillerLocker | `ransomware_fileextensions.py` | 269 | | Kostya | `ransomware_fileextensions.py` | 270 | | Kraken | `ransomware_fileextensions.py` | 271 | | LeChiffre | `ransomware_files.py` | 272 | | Lock93 | `ransomware_fileextensions.py` | 273 | | LockLock | `ransomware_fileextensions.py` | 274 | | Locky | `ransomware_files.py`, `ransomware_fileextensions.py` | 275 | | MMLocker | `ransomware_files.py` | 276 | | Maktub | `ransomware_files.py` | 277 | | MarsJoke | `ransomware_files.py` | 278 | | Mole | `ransomware_fileextensions.py` | 279 | | Mordor | `ransomware_fileextensions.py` | 280 | | Nuke | `ransomware_files.py`, `ransomware_fileextensions.py` | 281 | | Nullbyte | `ransomware_fileextensions.py` | 282 | | PayDay | `ransomware_fileextensions.py` | 283 | | Radamant | `ransomware_files.py` | 284 | | Razy | `ransomware_fileextensions.py` | 285 | | Revenge | `ransomware_fileextensions.py` | 286 | | Sage | `ransomware_fileextensions.py` | 287 | | Satana | `ransomware_files.py` | 288 | | Serpent | `ransomware_fileextensions.py` | 289 | | TeslaCrypt | `ransomware_files.py` | 290 | | ToxCrypt | `ransomware_fileextensions.py` | 291 | | Troldesh | `ransomware_fileextensions.py` | 292 | | VenusLocker | `ransomware_fileextensions.py` | 293 | | Viruscoder | `ransomware_viruscoder.py` | 294 | | WannaCry | `ransomware_fileextensions.py` | 295 | | WildFire | `ransomware_files.py` | 296 | | WildFire-Locker | `ransomware_fileextensions.py` | 297 | | WinPlock | `ransomware_files.py` | 298 | | n1n1n1 | `ransomware_files.py` | 299 | 300 | # Symantec ISTRs 301 | 302 | https://www.symantec.com/security-center/archived-publications 303 | 304 | Volume 21/2016 has a complete timeline of ransomware on page 59, going all the 305 | way back to Gpcoder in 2005. 306 | 307 | | Name | ISTR Volume | 308 | +-------------------+------------------+ 309 | | 73v3n | 21/2016 | 310 | | BadRabbit | 23/2018 | 311 | | BandarChor | 21/2016 | 312 | | Browlock | 21/2016 | 313 | | Bucbi | 22/2017 | 314 | | CTB-Locker/Citron | 21/2016 | 315 | | Cerber | 22/2017, 23/2018 | 316 | | Chimera-Locker | 21/2016 | 317 | | Coinvault | 21/2016 | 318 | | CryptInfinite | 21/2016 | 319 | | CryptXXX | 22/2017 | 320 | | CryptoApp | 21/2016 | 321 | | CryptoJocker | 21/2016 | 322 | | CryptoLocker | 22/2017 | 323 | | Cryptolocker2015 | 21/2016 | 324 | | Cryptowall | 23/2018, 21/2016 | 325 | | Cryptvault | 21/2016 | 326 | | Crysis | 24/2019 | 327 | | DMA-Locker | 21/2016 | 328 | | Disakil | 23/2018, 22/2017 | 329 | | Dumb | 21/2016 | 330 | | Encryptor RaaS | 21/2016 | 331 | | Ginx | 21/2016 | 332 | | Gomasom | 21/2016 | 333 | | Gpcoder | 21/2016 | 334 | | HDDCryptor | 22/2017 | 335 | | Hi Buddy | 21/2016 | 336 | | Hidden Tear | 21/2016 | 337 | | Hydracrypt | 21/2016 | 338 | | Job Cryptor | 21/2016 | 339 | | KeRanger | 21/2016, 22/2017 | 340 | | Kovter | 21/2016 | 341 | | LeChiffre | 21/2016 | 342 | | Linkup | 21/2016 | 343 | | LockDroid | 21/2016 | 344 | | Locky | 23/2018, 22/2017, 21/2016 | 345 | | LowLevel404 | 21/2016 | 346 | | MIRCOP | 22/2017 | 347 | | Mabouia OSX POC | 21/2016 | 348 | | Magic | 21/2016 | 349 | | Nanolocker | 21/2016 | 350 | | Nemucod | 22/2017 | 351 | | Nymaim | 21/2016 | 352 | | ORX-Locker | 21/2016 | 353 | | Onion | 21/2016 | 354 | | Pacman | 21/2016 | 355 | | PayCrypt | 21/2016 | 356 | | Pclock | 21/2016 | 357 | | Petya / NotPetya | 23/2018 | 358 | | Phonywall | 23/2018 | 359 | | Power Worm | 21/2016 | 360 | | Radamant | 21/2016 | 361 | | Ransom32 | 21/2016, 22/2017 | 362 | | Ransomlock | 22/2017 | 363 | | Reveton | 21/2016 | 364 | | Ryuk / Hermes | 24/2019 | 365 | | SamSam | 24/2019, 22/2017 | 366 | | SharkRaaS | 22/2017 | 367 | | Simplocker | 21/2016, 22/2017 | 368 | | Slocker | 21/2016 | 369 | | Synolocker | 21/2016 | 370 | | TeslaCrypt | 21/2016 | 371 | | Threat Finder | 21/2016 | 372 | | TorrentLocker | 21/2016, 23/2018 | 373 | | Tox | 21/2016 | 374 | | Troldesh | 21/2016 | 375 | | Umbrecrypt | 21/2016 | 376 | | Unix.Ransomcrypt | 21/2016 | 377 | | Urausy | 21/2016 | 378 | | VaultCrypt | 21/2016 | 379 | | Vipasana | 21/2016 | 380 | | VirLock | 21/2016 | 381 | | Wannacry | 23/2018 | 382 | | XRTN | 21/2016 | 383 | | ZCryptor | 22/2017 | 384 | | Zerolocker | 21/2016 | 385 | | Zyklon | 22/2017 | 386 | 387 | # ShieldFS paper 388 | 389 | https://doi.org/10.1145/2991079.2991110 390 | 391 | | Name | Source | 392 | +---------------+---------+ 393 | | Critroni | both | 394 | | Crowti | Table 2 | 395 | | CryptoDefense | both | 396 | | CryptoLocker | Table 6 | 397 | | CryptoWall | both | 398 | | DirtyDecrypt | Table 6 | 399 | | Locky | Table 6 | 400 | | PayCrypt | Table 6 | 401 | | TeslaCrypt | both | 402 | | TorrentLocker | Table 6 | 403 | | Troldesh | Table 6 | 404 | | ZeroLocker | Table 6 | 405 | 406 | # CryptoLock paper 407 | 408 | https://doi.org/10.1109/ICDCS.2016.46 409 | 410 | "Filecoder" is a generic label by AV software; it matches 12674 samples from 411 | 300 different families. 412 | 413 | | Name | Source | 414 | +---------------------+---------+ 415 | | CryptoDefense | Table 1 | 416 | | CryptoFortress | Table 1 | 417 | | CryptoLocker | Table 1 | 418 | | CryptoTorLocker2015 | Table 1 | 419 | | CryptoWall | Table 1 | 420 | | CTB-Locker | Table 1 | 421 | | Filecoder | Table 1 | 422 | | GPcode | Table 1 | 423 | | MBL Advisory | Table 1 | 424 | | PoshCoder | Table 1 | 425 | | Ransom-FUE 2 | Table 1 | 426 | | TeslaCrypt | Table 1 | 427 | | Virlock | Table 1 | 428 | | Xorist | Table 1 | 429 | 430 | # Other 431 | 432 | | Name | Source | 433 | |------------|---------------------------------------------------------------------------| 434 | | GandCrab | https://www.hornetsecurity.com/de/security-informationen/gandcrab-analyse | 435 | | Sodinokibi | https://heise.de/-4483691 | 436 | --------------------------------------------------------------------------------