├── testdata ├── LAND.MAP ├── embedded_video_quicktime.doc └── README.rst ├── extra_tests ├── spamsum.h ├── Makefile ├── spamsum_compare.c ├── spamsum_main.c ├── from_git_repo.go ├── edit_dist.c └── spamsum.c ├── README.md ├── spamsumwriter.go ├── spamsum_test.go ├── spamsumwriter_test.go ├── spamsum_compare.go ├── spamsum_compare_test.go ├── spamsum.go └── LICENSE /testdata/LAND.MAP: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michielbuddingh/spamsum/HEAD/testdata/LAND.MAP -------------------------------------------------------------------------------- /testdata/embedded_video_quicktime.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michielbuddingh/spamsum/HEAD/testdata/embedded_video_quicktime.doc -------------------------------------------------------------------------------- /extra_tests/spamsum.h: -------------------------------------------------------------------------------- 1 | typedef unsigned u32; 2 | typedef unsigned char uchar; 3 | 4 | #define FLAG_IGNORE_WHITESPACE 1 5 | #define FLAG_IGNORE_HEADERS 2 6 | 7 | char *spamsum_file(const char *, u32, u32); 8 | -------------------------------------------------------------------------------- /testdata/README.rst: -------------------------------------------------------------------------------- 1 | Origins of these files 2 | ~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | embedded_video_quicktime.doc: 5 | from https://github.com/ross-spencer/format-corpus , CC0 6 | 7 | LAND.MAP: 8 | from https://github.com/ross-spencer/format-corpus , CC0 9 | -------------------------------------------------------------------------------- /extra_tests/Makefile: -------------------------------------------------------------------------------- 1 | # Builds the original spamsum tool, and one additional quick hack, 2 | # required to run the from_git_repo.go script 3 | CC = gcc 4 | CFLAGS = -Wall -W 5 | 6 | 
/*
 * Command-line helper that prints the similarity score of two spamsum
 * strings.
 *
 * Usage: spamsum_compare SUM1 SUM2
 *
 * argv[1] and argv[2] are passed straight to spamsum_match() and its result
 * is written to stdout as a bare decimal integer. Extra arguments are
 * ignored. Returns 0 after printing a score and 1 when fewer than two sums
 * were supplied.
 *
 * NOTE(review): spamsum_match() is not declared in the spamsum.h that ships
 * with this tool (it only declares spamsum_file); its prototype should be
 * added there so this call is not an implicit declaration.
 */
int main(int argc, char *argv[]) {
	int result;

	/* Without this guard argv[1] / argv[2] may be NULL. */
	if (argc < 3) {
		fprintf(stderr, "usage: %s SUM1 SUM2\n",
			(argc > 0 && argv[0] != NULL) ? argv[0] : "spamsum_compare");
		return 1;
	}

	result = spamsum_match(argv[1], argv[2]);

	/* Deliberately no trailing newline: the regression script parses the
	   raw output as an integer. */
	printf("%d", result);
	return 0;
}
17 | 18 | How to use 19 | ---------- 20 | 21 | Unfortunately, the default operation for spamsum is to iterate over the data several times to determine an optimal block size, so it's not sensible to implement the `hash.Hash` interface. 22 | 23 | Instead, the package exports the functions `HashBytes(b [] byte)` and `HashReadSeeker(source io.ReadSeeker, length int64)`. 24 | 25 | if file, err := os.Open("filename"); err != nil { 26 | log.Fatal(err) 27 | } else if stat, err := file.Stat(); err != nil { 28 | log.Fatal(err) 29 | } else { 30 | sum, err := spamsum.HashReadSeeker(file, stat.Size()) 31 | // etc. 32 | } 33 | 34 | Any errors returned by `HashReadSeeker` will originate from the `io.ReadSeeker` functions. 35 | 36 | ### Alternatively ### 37 | 38 | If it is acceptable to set a fixed blocksize beforehand, the `SpamSumWriter` type can be used, which _does_ implement the `hash.Hash` interface. The `Sum(b []byte) []byte` method is not terribly useful; it will return a slice where the non-zero bytes contain a base64-encoded 6-bit hash for a `BlockSize()`-sized block. Use the `String()` method to obtain a more useful representation. 39 | 40 | ### License ### 41 | 42 | Use of this code is governed by version 2.0 or later of the Apache 43 | License, available at 44 | 45 | The `extra_tests` directory contains modified sources of the original [spamsum tool](https://junkcode.samba.org/ftp/unpacked/junkcode/spamsum/) as part of a regression test suite. This code is joint licensed under the Perl Artistic License and the GPL version 2 or later. 46 | -------------------------------------------------------------------------------- /extra_tests/spamsum_main.c: -------------------------------------------------------------------------------- 1 | /* 2 | this is a checksum routine that is specifically designed for spam. 3 | Copyright Andrew Tridgell 2002 4 | 5 | This code is released under the GNU General Public License version 2 6 | or later. 
Alteratively, you may also use this code under the terms 7 | of the Perl Artistic license. 8 | 9 | If you wish to distribute this code under the terms of a different 10 | free software license then please ask me. If there is a good reason 11 | then I will probably say yes. 12 | */ 13 | #include 14 | #include 15 | #include 16 | #include "spamsum.h" 17 | 18 | static void show_help(void) 19 | { 20 | printf(""); 21 | } 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | char *sum; 26 | extern char *optarg; 27 | extern int optind; 28 | int c; 29 | char *dbname = NULL; 30 | u32 score; 31 | int i; 32 | u32 flags = 0; 33 | u32 block_size = 0; 34 | u32 threshold = 90; 35 | 36 | while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) { 37 | switch (c) { 38 | case 'W': 39 | flags |= FLAG_IGNORE_WHITESPACE; 40 | break; 41 | 42 | case 'H': 43 | flags |= FLAG_IGNORE_HEADERS; 44 | break; 45 | 46 | case 'd': 47 | dbname = optarg; 48 | break; 49 | 50 | case 'B': 51 | block_size = atoi(optarg); 52 | break; 53 | 54 | case 'T': 55 | threshold = atoi(optarg); 56 | break; 57 | 58 | case 'c': 59 | if (!dbname) { 60 | show_help(); 61 | exit(1); 62 | } 63 | score = spamsum_match_db(dbname, optarg, 64 | threshold); 65 | printf("%u\n", score); 66 | exit(score >= threshold ? 0 : 2); 67 | 68 | case 'C': 69 | if (!dbname) { 70 | show_help(); 71 | exit(1); 72 | } 73 | score = spamsum_match_db(dbname, 74 | spamsum_file(optarg, flags, 75 | block_size), 76 | threshold); 77 | printf("%u\n", score); 78 | exit(score >= threshold ? 
0 : 2); 79 | 80 | case 'h': 81 | default: 82 | show_help(); 83 | exit(0); 84 | } 85 | } 86 | 87 | argc -= optind; 88 | argv += optind; 89 | 90 | if (argc == 0) { 91 | show_help(); 92 | return 0; 93 | } 94 | 95 | /* compute the spamsum on a list of files */ 96 | for (i=0;i 1 { 55 | for _, commit := range commits { 56 | contents, err := exec.Command("git", "show", "--format=raw", string(commit[1])+":"+filename).Output() 57 | 58 | if err == nil && len(contents) > 0 && !strings.HasPrefix(string(contents), "fatal") { 59 | sum1 := createSpamSum(contents) 60 | sum2 := createOriginalSpamSum(contents) 61 | if sum1 != sum2 { 62 | log.Printf("revision %s of file %s has differing spamsums", string(commit[1]), filename) 63 | } 64 | count++ 65 | sums = append(sums, sum1) 66 | } 67 | } 68 | } 69 | 70 | for idx, left := range sums { 71 | for i := idx + 1; i < len(sums); i++ { 72 | first := compareSpamSum(left, sums[i]) 73 | second := compareOriginalSpamSum(left, sums[i]) 74 | if first != second { 75 | log.Printf("Difference in comparison between %s and %s, %d, %d\n", left, sums[i], first, second) 76 | } 77 | comparisoncount++ 78 | } 79 | } 80 | 81 | } 82 | 83 | func compareSpamSum(left, right string) int { 84 | var leftSum, rightSum spamsum.SpamSum 85 | fmt.Sscan(left, &leftSum) 86 | fmt.Sscan(right, &rightSum) 87 | score := leftSum.Compare(rightSum) 88 | return int(score) 89 | } 90 | 91 | func compareOriginalSpamSum(left, right string) int { 92 | scoretext, _ := exec.Command(spamsum_comparepath, left, right).Output() 93 | var score int 94 | score, _ = strconv.Atoi(string(scoretext)) 95 | return score 96 | } 97 | 98 | func createSpamSum(contents []byte) string { 99 | reader := bytes.NewReader(contents) 100 | sum, _ := spamsum.HashReadSeeker(reader, int64(len(contents))) 101 | return sum.String() 102 | } 103 | 104 | func createOriginalSpamSum(contents []byte) string { 105 | reader := bytes.NewReader(contents) 106 | cmd := exec.Command(spamsumpath, "-") 107 | cmd.Stdin = reader 108 
| if sumbytes, err := cmd.Output(); err == nil { 109 | return strings.TrimSpace(string(sumbytes)) 110 | } 111 | return "nil" 112 | } 113 | -------------------------------------------------------------------------------- /spamsumwriter_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013, Michiel Buddingh, All rights reserved. 2 | // Use of this code is governed by version 2.0 or later of the Apache 3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | package spamsum 6 | 7 | import ( 8 | "bufio" 9 | "encoding/binary" 10 | "math/rand" 11 | "os" 12 | "path/filepath" 13 | "testing" 14 | ) 15 | 16 | func TestWriter(t *testing.T) { 17 | tests := []struct { 18 | seed int64 19 | length int 20 | blocksize uint32 21 | expected string 22 | }{ 23 | {42, 16384, 384, "384:PnwCSZ6yE9r4UCZ1he34xas/E8AhHgdd2yM:PbSZ6yE9rGfExx"}, 24 | {1000, 2048, 48, "48:Zo+v/bCSly4VhreHwHJdkHTzF7sjBU1YuD/QtFsByxoSJW+QiLlH:uSWSFteQHJd+Tp79mqSqyCt+5LlH"}, 25 | {1000, 1048576, 24576, "24576:xL2L/P40/cnWGr7tsP+mgdQGvnb1UV+gQ8ZwU:ErPP/2WItsPTgdD/bqQ4yU"}, 26 | {71268, 24, 3, "3:N0n6xmcFctn:7xmptn"}, 27 | } 28 | 29 | for _, test := range tests { 30 | generator := rand.New(rand.NewSource(test.seed)) 31 | writer := StartFixedBlocksize(test.blocksize) 32 | for i := 0; i < test.length/4; i++ { 33 | binary.Write(writer, binary.BigEndian, generator.Uint32()) 34 | } 35 | if writer.String() != test.expected { 36 | t.Errorf("Expected %v, result was %v", test.expected, writer) 37 | } 38 | } 39 | } 40 | 41 | func TestWriterIntermediate(t *testing.T) { 42 | tests := []struct { 43 | filename string 44 | initialLength int 45 | expectedIntermediate string 46 | expectedFinal string 47 | blockSize uint32 48 | }{ 49 | { 50 | "LAND.MAP", 51 | 131072, 52 | "768:tlBecdq6/+dgZUTp+gAdAm:3", 53 | "768:tlBecdq6/+dgZUTp+gAdA3T9Y02xEFshHOl3O98FzbXfBfhPcGxGB3whvm9HvMB1:O", 54 | 768, 55 | }, 56 | { 57 | "embedded_video_quicktime.doc", 58 | 12288, 
59 | "192:o50PBwxGc+Zrnn:G8cOb", 60 | "192:o50PBwxGc+ZrnCe9pz1aZ8GHiLUd0935:G8cOz9pzJ3", 61 | 192, 62 | }, 63 | } 64 | 65 | for _, test := range tests { 66 | writer := StartFixedBlocksize(test.blockSize) 67 | 68 | path := filepath.Join("testdata", test.filename) 69 | file, openerr := os.Open(path) 70 | if openerr != nil { 71 | t.Fatal(openerr) 72 | } 73 | defer file.Close() 74 | 75 | reader := bufio.NewReader(file) 76 | 77 | buf4k := make([]byte, test.initialLength) 78 | _, readerr := reader.Read(buf4k) 79 | if readerr != nil { 80 | t.Fatal(readerr) 81 | } 82 | 83 | writer.Write(buf4k) 84 | 85 | if writer.String() != test.expectedIntermediate { 86 | t.Errorf("Expected intermediate result %s, got %s", 87 | test.expectedIntermediate, 88 | writer.String()) 89 | } 90 | 91 | reader.WriteTo(writer) 92 | 93 | if writer.String() != test.expectedFinal { 94 | t.Errorf("Expected final result %s, got %s", 95 | test.expectedFinal, 96 | writer.String()) 97 | } 98 | } 99 | } 100 | 101 | func TestWriterReset(t *testing.T) { 102 | generator := rand.New(rand.NewSource(3181)) 103 | writer := StartFixedBlocksize(768) 104 | emtpySlice := make([]byte, 0) 105 | 106 | for i := 0; i < 4096; i++ { 107 | binary.Write(writer, binary.BigEndian, generator.Uint32()) 108 | } 109 | 110 | beforeReset := writer.String() 111 | beforeResetBinary := writer.Sum(emtpySlice) 112 | 113 | writer.Reset() 114 | generator = rand.New(rand.NewSource(3181)) 115 | 116 | for i := 0; i < 4096; i++ { 117 | binary.Write(writer, binary.BigEndian, generator.Uint32()) 118 | } 119 | 120 | afterReset := writer.String() 121 | afterResetBinary := writer.Sum(emtpySlice) 122 | 123 | if beforeReset != afterReset { 124 | t.Errorf("Same data written to the same writer, but different results!") 125 | } 126 | 127 | if len(afterResetBinary) != len(beforeResetBinary) { 128 | t.Errorf("Binary spamsums are not even the same size") 129 | } 130 | 131 | for i, _ := range beforeResetBinary { 132 | if beforeResetBinary[i] != 
afterResetBinary[i] { 133 | t.Errorf("Binary spamsums before and after reset differ at byte %d", i) 134 | break 135 | } 136 | } 137 | } 138 | 139 | func TestSize(t *testing.T) { 140 | writer := StartFixedBlocksize(16) 141 | if writer.Size() != SpamsumLength { 142 | t.Errorf("Max result size should always be equal to SpamsumLength, which is %d\n", SpamsumLength) 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /spamsum_compare.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013, Michiel Buddingh, All rights reserved. 2 | // Use of this code is governed by version 2.0 or later of the Apache 3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | package spamsum 6 | 7 | import ( 8 | "math" 9 | ) 10 | 11 | const ( 12 | insCost = 1 13 | delCost = 1 14 | changeCost = 3 15 | ) 16 | 17 | // Compare two SpamSums, returning a value between 0 and 100. 18 | // This method is currently not bug-for-bug compatible with the 19 | // original spamsum. 
20 | func (from SpamSum) Compare(to SpamSum) (similarity uint32) { 21 | q := float32(from.blocksize) / float32(to.blocksize) 22 | if q == 1 { 23 | similarity = uint32(max( 24 | score(from.leftPart[:from.leftIndex], 25 | to.leftPart[:to.leftIndex], 26 | int(from.blocksize)), 27 | score(from.rightPart[:from.rightIndex], 28 | to.rightPart[:to.rightIndex], 29 | int(to.blocksize)))) 30 | 31 | } else if q == 2 { 32 | similarity = uint32(score( 33 | from.leftPart[:from.leftIndex], 34 | to.rightPart[:to.rightIndex], 35 | int(from.blocksize))) 36 | } else if q == 0.5 { 37 | similarity = uint32(score( 38 | from.rightPart[:from.rightIndex], 39 | to.leftPart[:to.leftIndex], 40 | int(to.blocksize))) 41 | } else { 42 | similarity = 0 43 | } 44 | return 45 | } 46 | 47 | func score(from, to []byte, blocksize int) (score int) { 48 | if !hasCommonSubstring(from, to) { 49 | return 0 50 | } 51 | 52 | from = eliminateRepetition(from) 53 | to = eliminateRepetition(to) 54 | 55 | score = editDistance(from, to) 56 | 57 | score *= SpamsumLength 58 | score /= len(from) + len(to) 59 | 60 | score = (score * 100) / 64 61 | 62 | score = 100 - score 63 | 64 | maxscore := blocksize / minBlockSize * min(len(from), len(to)) 65 | score = min(score, maxscore) 66 | 67 | return score 68 | } 69 | 70 | func editDistance(from, to []byte) int { 71 | // memoize turns a recursive levenshtein function into one that uses an 72 | // array to cache results. Uses |from| * |to| ints of memory. 
73 | memoize := func(calculate func(a, b []byte) int) func(a, b []byte) int { 74 | var memo []int 75 | ffl, ttl := len(from), len(to) 76 | memo = make([]int, ffl*ttl) 77 | 78 | return func(from, to []byte) int { 79 | fl, tl := len(from), len(to) 80 | 81 | if fl == 0 { 82 | return tl 83 | } 84 | if tl == 0 { 85 | return fl 86 | } 87 | 88 | index := ((tl - 1) * ffl) + fl - 1 89 | if memo[index] == 0 { 90 | memo[index] = calculate(from, to) 91 | 92 | } 93 | return memo[index] 94 | } 95 | } 96 | 97 | var levenshteinRecursive func(from, to []byte) int 98 | 99 | // to see uncached results, just remove the memoize() 100 | levenshteinRecursive = memoize(func(from, to []byte) (distance int) { 101 | // This algorithm is not tuned for anything but legibility, complexity 102 | // is O(|from| * |to|). The original code has the option of swapping 103 | // adjacent characters; as far as I can deduce, this is never used due 104 | // to the cost penalty, so it is omitted here. 105 | fl, tl := len(from), len(to) 106 | 107 | if fl == 0 { 108 | return tl 109 | } 110 | if tl == 0 { 111 | return fl 112 | } 113 | 114 | var cost = changeCost 115 | 116 | if from[fl-1] == to[tl-1] { 117 | cost = 0 118 | } 119 | 120 | return min( 121 | levenshteinRecursive(from[:fl-1], to)+delCost, 122 | levenshteinRecursive(from, to[:tl-1])+delCost, 123 | levenshteinRecursive(from[:fl-1], to[:tl-1])+cost) 124 | }) 125 | 126 | return levenshteinRecursive(from, to) 127 | } 128 | 129 | // eliminateRepetition reduces sequences of repeating bytes 130 | // longer than 3 bytes to length 3. 
// eliminateRepetition returns a copy of from in which every run of more
// than three identical bytes is shortened to exactly three. The input is
// never modified, and inputs shorter than three bytes (including nil) are
// returned as an unchanged copy.
func eliminateRepetition(from []byte) (to []byte) {
	to = make([]byte, 0, len(from))
	for i, b := range from {
		// Drop b only when the three bytes before it in the input are all
		// equal to it, i.e. it is at least the fourth byte of a run.
		if i >= 3 && from[i-3] == b && from[i-2] == b && from[i-1] == b {
			continue
		}
		to = append(to, b)
	}
	return to
}

// hasCommonSubstring returns true if the two byte slices
// passed have a common substring of at least seven bytes.
//
// Every relative alignment ("shift") of the two slices that leaves at least
// seven overlapping positions is scanned for a run of seven equal bytes.
func hasCommonSubstring(seq1, seq2 []byte) (found bool) {
	const window = 7

	for shift := len(seq1) - window; shift >= window-len(seq2); shift-- {
		// A positive shift skips the head of seq1, a negative one the head
		// of seq2.
		i, j := max(0, shift), max(0, -shift)
		run := 0
		for ; i < len(seq1) && j < len(seq2); i, j = i+1, j+1 {
			if seq1[i] != seq2[j] {
				run = 0
				continue
			}
			run++
			if run == window {
				return true
			}
		}
	}
	return false
}

// min returns the minimum of its arguments. Called without arguments it
// returns math.MaxInt32.
func min(args ...int) int {
	if len(args) == 0 {
		return int(math.MaxInt32)
	}
	smallest := args[0]
	for _, candidate := range args[1:] {
		if candidate < smallest {
			smallest = candidate
		}
	}
	return smallest
}

// max returns the maximum of its arguments. Called without arguments it
// returns -math.MaxInt32.
func max(args ...int) int {
	if len(args) == 0 {
		return int(-math.MaxInt32)
	}
	largest := args[0]
	for _, candidate := range args[1:] {
		if candidate > largest {
			largest = candidate
		}
	}
	return largest
}
2 | // Use of this code is governed by version 2.0 or later of the Apache 3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | package spamsum 6 | 7 | import ( 8 | "fmt" 9 | "testing" 10 | ) 11 | 12 | func TestEliminateRepetition(t *testing.T) { 13 | teststrings := []struct { 14 | input, expected string 15 | }{ 16 | {"AAAABC", "AAABC"}, 17 | {"Qddddddddd", "Qddd"}, 18 | {"AtrU||||v*****pn", "AtrU|||v***pn"}, 19 | } 20 | 21 | for _, pair := range teststrings { 22 | shortened := string(eliminateRepetition([]byte(pair.input))) 23 | if shortened != pair.expected { 24 | t.Errorf("%v shortened should be %v, is %v", pair.input, pair.expected, shortened) 25 | } 26 | } 27 | } 28 | 29 | func TestHasCommonSubstring(t *testing.T) { 30 | tests := []struct { 31 | left, right string 32 | expected bool 33 | }{ 34 | {"Hello, world", "Hello there", false}, 35 | {"abcdefg", "abcdefg", true}, 36 | {"", "", false}, 37 | {"0123456789ABCDEF", "ABCDEF0123456789", true}, 38 | {"321abcdefg321", "abcdefg", true}, 39 | {"123b4567", "123c4567", false}, 40 | } 41 | 42 | for _, test := range tests { 43 | result := hasCommonSubstring([]byte(test.left), []byte(test.right)) 44 | if result != test.expected { 45 | condition := "not " 46 | if test.expected { 47 | condition = "" 48 | } 49 | t.Errorf("\"%v\" and \"%v\" should %shave a common substring of length 7", test.left, test.right, condition) 50 | } 51 | mirroredResult := hasCommonSubstring([]byte(test.right), []byte(test.left)) 52 | if mirroredResult != result { 53 | t.Errorf("Symmetry error for %v and %v", test.left, test.right) 54 | } 55 | } 56 | } 57 | 58 | func TestEditDistance(t *testing.T) { 59 | tests := []struct { 60 | left, right string 61 | dist_expected int 62 | }{ 63 | {"abcdefg", "abcdefg", 0}, 64 | {"abcdefg", "abcqefg", 2}, 65 | {"ABCDEFG", "ABCEDFG", 2}, 66 | {"ooooAAA", "AAAoooo", 6}, 67 | {"oAoooAA", "AAoooAo", 4}, 68 | {"", "1234567", 7}, 69 | {"", "", 0}, 70 | {"HIJKLMN", "JKLMNOPQRST", 8}, 71 | 
{"UVxeXup8VuH8rD//pcrHBrlG5FWgYJ70A", 72 | "kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A", 7}, 73 | {"O4XuptH8D//pcrHmgfL", "e4XuptH8D//4crHMmUfL", 7}, 74 | {"kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A", 75 | "kVxeXup8VuH8rD//4crHBrlGXm5WGYJ70A", 2}, 76 | {"2Ewd+NvN88y3GdkvBC+9lKMHhDh", 77 | "2Ewd+NvNrgdkvBC+9lKMHhDh", 7}, 78 | {"vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC", 79 | "xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX", 51}, 80 | } 81 | 82 | for _, test := range tests { 83 | result := editDistance([]byte(test.left), []byte(test.right)) 84 | if result != test.dist_expected { 85 | t.Errorf("\"%v\" and \"%v\" should have a distance of %d, was %d", test.left, test.right, test.dist_expected, result) 86 | } 87 | mirroredResult := editDistance([]byte(test.left), []byte(test.right)) 88 | if mirroredResult != result { 89 | t.Errorf("Symmetry error, editDistance(%s, %s) should be editDistance(%s, %s)", test.left, test.right, test.right, test.left) 90 | } 91 | } 92 | } 93 | 94 | func TestScore(t *testing.T) { 95 | tests := []struct { 96 | left, right string 97 | blocksize int 98 | score_expected int 99 | }{ 100 | {"2Ewd+NvN88y3GdkvBC+9lKMHhDh", 101 | "2Ewd+NvNrgdkvBC+9lKMHhDh", 6, 48}, 102 | {"7iExTmgeXCcGYX1CRRX1PRRX88p0RRpdV/ISGcEvNOk+l/oX9QUopsAoX9QUopIo", 103 | "7iExTmgeXCcGYX1CRRX1PRRXrZGcEvNOk+l/oX9QUopsAoX9QUopIHKl057DRMHD", 104 | 12, 80}, 105 | {"vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC", 106 | "xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX", 24, 43}, 107 | } 108 | for _, test := range tests { 109 | result := score([]byte(test.left), []byte(test.right), test.blocksize) 110 | if result != test.score_expected { 111 | t.Errorf("\"%v\" and \"%v\" should have a score of %d, was %d", test.left, test.right, test.score_expected, result) 112 | } 113 | } 114 | 115 | } 116 | 117 | func TestCompare(t *testing.T) { 118 | tests := []struct { 119 | left, right string 120 | similarity_expected uint32 121 | }{ 122 | // these are not values produced by the original spamsum 123 | // score algorithm 124 
| { 125 | 126 | "12582912:UVxeXup8VuH8rD//pcrHBrlG5FWgYJ70A:O4XuptH8D//pcrHmgfL", 127 | "12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL", 128 | 91}, 129 | 130 | {"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL", 131 | "12582912:kVxeXup8VuH8rD//4crHBrlGXm5WGYJ70A:e4XuptH8D//4crHMMUfL", 132 | 99}, 133 | // different block sizes 134 | {"96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK", 135 | "192:aaf6PU8YMnElrcULdSWgbqs0uBb1IIAfsR6OZWjZDx:aaf6PUcYrfLdSWgms0uBb1TA0lZ8ZDx", 80}, 136 | // different block sizes reversed 137 | {"192:aaf6PU8YMnElrcULdSWgbqs0uBb1IIAfsR6OZWjZDx:aaf6PUcYrfLdSWgms0uBb1TA0lZ8ZDx", 138 | "96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK", 80}, 139 | // Uncomparable due to different sizes, should be 0 140 | {"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL", 141 | "96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK", 0}, 142 | 143 | {"48:wX0GLBZET14EHWFIUXs0hPbaL3RdNhI6h0:wPLBS4EecWT6hdNhs", 144 | "48:w+wNj5GLBX/8jrT14EHWFIUXs0hPbaL3qd9hI6h0:w+zLBX/w14EecWT6ad9hs", 77}, 145 | {"12:7iExTmgeXCcGYX1CRRX1PRRX88p0RRpdV/ISGcEvNOk+l/oX9QUopsAoX9QUopIo:2Ewd+NvN88y3GdkvBC+9lKMHhDh", "12:7iExTmgeXCcGYX1CRRX1PRRXrZGcEvNOk+l/oX9QUopsAoX9QUopIHKl057DRMHD:2Ewd+NvNrgdkvBC+9lKMHhDh", 88}, 146 | {"24:R9mMhMDnWm8m86dmW4zm8mW4zm/mhkcnZ/uLkcHrBCaDrvNQxhwQmq8SywwboX+6:vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC", 147 | "48:xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX:NWHH6dQHQHHHHHHHH4CnV1QeSyj8j4hG", 148 | 43}, 149 | } 150 | 151 | for _, test := range tests { 152 | var left, right SpamSum 153 | if _, err := fmt.Sscan(test.left, &left); err != nil { 154 | t.Errorf("Could not scan string %s, %v", test.left, err) 155 | } 156 | if _, err := fmt.Sscan(test.right, &right); err != nil { 157 | t.Errorf("Could not scan string %s, %v", test.right, err) 158 | } 159 | similarity := 
left.Compare(right) 160 | if similarity != test.similarity_expected { 161 | t.Errorf("%v, %v\nSimilariy score should be %d, was %d", left, right, test.similarity_expected, similarity) 162 | } 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /extra_tests/edit_dist.c: -------------------------------------------------------------------------------- 1 | /* 2 | This edit distance code is taken from trn3.6. A few minor 3 | modifications have been made by Andrew Tridgell 4 | for use in spamsum. 5 | */ 6 | 7 | 8 | /***************************************************************************/ 9 | 10 | 11 | /* The authors make no claims as to the fitness or correctness of this software 12 | * for any use whatsoever, and it is provided as is. Any use of this software 13 | * is at the user's own risk. 14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | /* edit_dist -- returns the minimum edit distance between two strings 21 | 22 | Program by: Mark Maimone CMU Computer Science 13 Nov 89 23 | Last Modified: 28 Jan 90 24 | 25 | If the input strings have length n and m, the algorithm runs in time 26 | O(nm) and space O(min(m,n)). 27 | 28 | HISTORY 29 | 13 Nov 89 (mwm) Created edit_dist() and set_costs(). 30 | 31 | 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD 32 | computations will work even when THRESHOLD is not a multiple of 33 | sizeof(int). 34 | 35 | 17 May 93 (mwm) Improved performance when used with trn's newsgroup 36 | processing; assume all costs are 1, and you can terminate when a 37 | threshold is exceeded. 38 | */ 39 | 40 | #define MIN_DIST 100 41 | 42 | #define TRN_SPEEDUP /* Use a less-general version of the 43 | routine, one that's better for trn. 
/*
 * Weighted edit distance between two byte strings.
 *
 * Derived from the trn edit_dist code (Mark Maimone, CMU Computer Science)
 * with the modifications made for spamsum: fixed costs, and early
 * termination once the distance is known to exceed MIN_DIST.
 */
#include <limits.h>
#include <stdlib.h>

#ifndef MIN_DIST
#define MIN_DIST 100	/* stop once a whole matrix row exceeds this */
#endif

#define THRESHOLD 4000	/* worry about allocating more memory only
			   when this # of bytes is exceeded */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

/* Fixed edit costs. */
enum {
	COST_INSERT = 1,
	COST_DELETE = 1,
	COST_CHANGE = 3,
	COST_SWAP = 5
};

static int min2_int(int a, int b)
{
	return a < b ? a : b;
}

static int min3_int(int a, int b, int c)
{
	return min2_int(min2_int(a, b), c);
}

/*
 * edit_distn -- returns the edit distance between two strings, or -1 on
 * failure.
 *
 * from, from_len: first string and its length in bytes
 * to, to_len:     second string and its length in bytes
 *
 * A NULL pointer or a length <= 0 is treated as the empty string. The
 * strings need not be NUL-terminated; only the first *_len bytes are read.
 *
 * The dynamic-programming matrix has one row per byte of `to` and one
 * column per byte of `from`. Only the last two rows are needed, so the
 * cells live in a ring buffer of 2 * from_len + 3 ints addressed modulo
 * `radix`. Row 0 and column 0 of the matrix (the empty prefixes) are not
 * stored; their values are computed on the fly.
 *
 * If every cell of a row is larger than MIN_DIST the scan stops early and
 * the last cell computed so far is returned, so results above MIN_DIST are
 * only a lower bound.
 *
 * Inputs whose shorter side fits STRLENTHRESHOLD use a static buffer, so
 * the function must not be called concurrently; longer inputs use the heap.
 * -1 is returned when that allocation fails or the buffer size would
 * overflow an int.
 */
int edit_distn(char *from, int from_len, char *to, int to_len)
{
	static int store[THRESHOLD / sizeof (int)];
	int *buffer;
	int radix, index, row, col, low, result;

	/* Handle trivial cases when one string is empty */
	if (from == NULL || from_len <= 0) {
		if (to == NULL || to_len <= 0)
			return 0;
		return to_len * COST_INSERT;
	}
	if (to == NULL || to_len <= 0)
		return from_len * COST_DELETE;

	/* Make from short enough to fit in the static storage, if it's at
	   all possible. Insert and delete cost the same, so exchanging the
	   strings does not change the distance. */
	if (from_len > to_len && from_len > STRLENTHRESHOLD) {
		char *tmp_str = from;
		int tmp_len = from_len;

		from = to;
		from_len = to_len;
		to = tmp_str;
		to_len = tmp_len;
	}

	/* The ring size must match the row length actually used, so it is
	   derived only after the strings may have been exchanged. */
	if (from_len > (INT_MAX - 3) / 2)
		return -1;
	radix = 2 * from_len + 3;

	/* Allocate the array storage (from the heap if necessary) */
	if (from_len <= STRLENTHRESHOLD) {
		buffer = store;
	} else {
		buffer = malloc((size_t) radix * sizeof *buffer);
		if (buffer == NULL)
			return -1;
	}

	/* First row: to[0] against every prefix of from. */
	index = 0;
	buffer[index++] = min2_int(COST_INSERT + COST_DELETE,
				   (from[0] == to[0]) ? 0 : COST_CHANGE);
	low = buffer[0];

	for (col = 1; col < from_len; col++) {
		buffer[index] = min3_int(
			col * COST_DELETE + ((from[col] == to[0]) ? 0 : COST_CHANGE),
			(col + 1) * COST_DELETE + COST_INSERT,
			buffer[index - 1] + COST_DELETE);
		if (buffer[index] < low)
			low = buffer[index];
		index++;
	}

	/* Now handle the rest of the matrix */
	for (row = 1; row < to_len; row++) {
		for (col = 0; col < from_len; col++) {
			/* Neighbours of the current cell; the ones in the
			   implicit column 0 are computed, not stored. */
			int diag = (col == 0)
				? row * COST_INSERT
				: buffer[(index + from_len + 2) % radix];
			int above = buffer[(index + from_len + 3) % radix];
			int left = (col == 0)
				? (row + 1) * COST_INSERT
				: buffer[(index + radix - 1) % radix];
			int best = min3_int(
				diag + ((from[col] == to[row]) ? 0 : COST_CHANGE),
				above + COST_INSERT,
				left + COST_DELETE);

			/* Transposition of two adjacent characters. */
			if (col > 0 && from[col] == to[row - 1] &&
			    from[col - 1] == to[row]) {
				int diag2;

				if (row == 1)
					diag2 = (col - 1) * COST_DELETE;
				else if (col == 1)
					diag2 = (row - 1) * COST_INSERT;
				else
					diag2 = buffer[(index + 1) % radix];
				best = min2_int(best, diag2 + COST_SWAP);
			}

			buffer[index] = best;

			/* low tracks the smallest value of the current row */
			if (best < low || col == 0)
				low = best;

			index = (index + 1) % radix;
		}

		if (low > MIN_DIST)
			break;
	}

	/* The most recently written cell holds the answer. */
	result = buffer[(index + radix - 1) % radix];
	if (buffer != store)
		free(buffer);
	return result;
} /* edit_distn */
-------------------------------------------------------------------------------- 1 | // Copyright 2013, Michiel Buddingh, All rights reserved. Use of this 2 | // code is governed by version 2.0 or later of the Apache License, 3 | // available at http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | // Package spamsum implements Andrew Tridgell's fuzzy hash for spam 6 | // detection. The output should be identical to that produced by 7 | // existing tools like ssdeep. 8 | package spamsum 9 | 10 | import ( 11 | "bytes" 12 | "errors" 13 | "fmt" 14 | "io" 15 | "strconv" 16 | "unicode" 17 | ) 18 | 19 | const ( 20 | rollingWindow = 7 21 | minBlockSize = 3 22 | SpamsumLength = 64 23 | ReadSize = 8192 24 | offset32 = uint32(0x28021967) 25 | prime32 = uint32(16777619) 26 | ) 27 | 28 | type SpamSum struct { 29 | blocksize uint32 30 | leftPart [SpamsumLength]byte 31 | rightPart [SpamsumLength / 2]byte 32 | leftIndex, rightIndex int 33 | } 34 | 35 | // String produces the canonical representation of a spamsum. a 36 | // positive number indicating the block size, up to 64 base64 37 | // characters each encoding a 6-bit hash of an (approximately) 38 | // BlockSize()-sized block, up to 32 base64 characters encoding a 39 | // 6-bit hash for BlockSize() * 2. The three fields are separated by 40 | // a colon. 41 | func (ss *SpamSum) String() string { 42 | return fmt.Sprintf("%d:%s:%s", 43 | ss.blocksize, 44 | string(ss.leftPart[:nonZeroLength(ss.leftPart[:])]), 45 | string(ss.rightPart[:nonZeroLength(ss.rightPart[:])])) 46 | } 47 | 48 | // BlockSize returns the approximate block size used in this sum. 49 | // Note that this size is only the expected value. The individual 50 | // 6-bit block hashes may encode far smaller or far larger blocks. 51 | func (ss *SpamSum) BlockSize() int { 52 | return int(ss.blocksize) 53 | } 54 | 55 | // HashBytes takes a byte slice, and takes its SpamSum, calculating 56 | // the optimal block size in several passes. 
Since adding more data 57 | // to such a sum would invalidate the block size calculation, this 58 | // SpamSum can not be added to. 59 | func HashBytes(b []byte) *SpamSum { 60 | wrapper := io.NewSectionReader(bytes.NewReader(b), 0, int64(len(b))) 61 | // we discard the error, since they won't be produced 62 | // for an in-memory byte slice 63 | result, _ := HashReadSeeker(wrapper, wrapper.Size()) 64 | return result 65 | } 66 | 67 | // HashReadSeeker requires an implementation of io.ReadSeeker, and a length 68 | // value indicating its size, and takes its SpamSum, calculating 69 | // the optimal block size in several passes. It is assumed that Seeks upto 70 | // the specified length are allowed. Since adding more data 71 | // to such a sum would invalidate the block size calculation, this 72 | // SpamSum can not be added to. Any errors returned will originate 73 | // from the implementation of ReadSeeker. 74 | func HashReadSeeker(source io.ReadSeeker, length int64) (*SpamSum, error) { 75 | sum := new(SpamSum) 76 | sum.blocksize = minBlockSize 77 | 78 | for int64(sum.blocksize*SpamsumLength) < length { 79 | sum.blocksize *= 2 80 | } 81 | 82 | sss := spamsumState{} 83 | source_iteration: 84 | for { 85 | sss.reset() 86 | sum.reset() 87 | 88 | if _, err := source.Seek(0, 0); err != nil { 89 | return nil, err 90 | } 91 | block := make([]byte, ReadSize) 92 | 93 | block_read_loop: 94 | for { 95 | var num int 96 | var err error 97 | if num, err = source.Read(block); num == 0 { 98 | break block_read_loop 99 | } else { 100 | processBlock(block, num, &sss, sum) 101 | } 102 | 103 | if err != nil { 104 | return nil, err 105 | } 106 | } 107 | 108 | writeTail(&sss, sum) 109 | 110 | if sum.blocksize > minBlockSize && sum.leftIndex < (SpamsumLength/2) { 111 | sum.blocksize /= 2 112 | } else { 113 | break source_iteration 114 | } 115 | } 116 | 117 | return sum, nil 118 | } 119 | 120 | type spamsumState struct { 121 | // fields for the rolling hash 122 | window [rollingWindow]byte 123 
| rollingSum, h2, shiftHash, position uint32 124 | 125 | // FNV-1 style hash fields 126 | left, right uint32 127 | } 128 | 129 | const b64 string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" 130 | 131 | func processBlock(block []byte, length int, sss *spamsumState, sum *SpamSum) { 132 | for i := 0; i < length; i++ { 133 | sss.h2 -= sss.rollingSum 134 | sss.h2 += rollingWindow * uint32(block[i]) 135 | 136 | sss.rollingSum += uint32(block[i]) 137 | sss.rollingSum -= uint32(sss.window[sss.position%rollingWindow]) 138 | 139 | sss.window[sss.position%rollingWindow] = block[i] 140 | sss.position += 1 141 | 142 | sss.shiftHash <<= 5 143 | sss.shiftHash ^= uint32(block[i]) 144 | 145 | roll := sss.rollingSum + sss.h2 + sss.shiftHash 146 | 147 | // left and right are Fowler/Noll/Vo-1 hashes with a 148 | // slightly different starting value. 149 | sss.left *= prime32 150 | sss.left ^= uint32(block[i]) 151 | 152 | sss.right *= prime32 153 | sss.right ^= uint32(block[i]) 154 | 155 | // Assuming the output of the rolling sum is uniformly 156 | // distributed, this condition will occur once every 157 | // blocksize bytes. This means that the expected value 158 | // for the length of the blocks hashed is blocksize. 159 | if roll%sum.blocksize == (sum.blocksize - 1) { 160 | sum.leftPart[sum.leftIndex] = b64[sss.left%64] 161 | // Note that this means that the first 63 bytes of the 162 | // hash will encode the first 63*blocksize blocks, 163 | // and the last byte will encode the remainder, be it 164 | // one block, or 4GB. 
165 | if sum.leftIndex < SpamsumLength-1 { 166 | sum.leftIndex += 1 167 | sss.left = offset32 168 | } 169 | } 170 | 171 | // As for the previous condition, but for blocksize * 2 172 | if roll%(sum.blocksize*2) == ((sum.blocksize * 2) - 1) { 173 | sum.rightPart[sum.rightIndex] = b64[sss.right%64] 174 | if sum.rightIndex < (SpamsumLength/2)-1 { 175 | sum.rightIndex += 1 176 | sss.right = offset32 177 | } 178 | } 179 | } 180 | } 181 | 182 | func writeTail(sss *spamsumState, sum *SpamSum) { 183 | roll := sss.rollingSum + sss.h2 + sss.shiftHash 184 | if roll != 0 { 185 | sum.leftPart[sum.leftIndex] = b64[sss.left%64] 186 | sum.rightPart[sum.rightIndex] = b64[sss.right%64] 187 | } 188 | } 189 | 190 | func (sss *spamsumState) reset() { 191 | for i := range sss.window { 192 | sss.window[i] = 0 193 | } 194 | 195 | sss.rollingSum = 0 196 | sss.h2 = 0 197 | sss.shiftHash = 0 198 | sss.position = 0 199 | 200 | sss.left = offset32 201 | sss.right = offset32 202 | } 203 | 204 | func (sum *SpamSum) reset() { 205 | for i := range sum.leftPart { 206 | sum.leftPart[i] = 0 207 | } 208 | 209 | for i := range sum.rightPart { 210 | sum.rightPart[i] = 0 211 | } 212 | 213 | sum.leftIndex, sum.rightIndex = 0, 0 214 | } 215 | 216 | func nonZeroLength(array []byte) (r int) { 217 | for i := range array { 218 | if array[i] == 0 { 219 | break 220 | } 221 | r += 1 222 | } 223 | return r 224 | } 225 | 226 | func (sum *SpamSum) Scan(state fmt.ScanState, verb rune) error { 227 | var blocksize int 228 | var leftPart, rightPart, blockPart, buffer []byte 229 | var err error 230 | 231 | if blockPart, err = state.Token(false, // do not skip spaces 232 | func(r rune) bool { 233 | return unicode.IsDigit(r) 234 | }); err != nil { 235 | return err 236 | } else if len(blockPart) == 0 { 237 | return errors.New("Cannot read block size.") 238 | } 239 | 240 | if blocksize, err = strconv.Atoi(string(blockPart)); err != nil { 241 | return err 242 | } else if blocksize < 3 { 243 | return errors.New("Block size too 
small") 244 | } 245 | 246 | if r, _, err := state.ReadRune(); err != nil { 247 | return err 248 | } else if r != ':' { 249 | return errors.New("Invalid token delimiter") 250 | } 251 | 252 | if buffer, err = state.Token(false, // do not skip spaces 253 | func(r rune) bool { 254 | return (bytes.IndexRune([]byte(b64), r) != -1) 255 | }); err != nil { 256 | return err 257 | } else if len(buffer) > SpamsumLength { 258 | return errors.New("First base64 string too long") 259 | } 260 | 261 | leftPart = make([]byte, len(buffer)) 262 | copy(leftPart, buffer[:]) 263 | 264 | if r, _, err := state.ReadRune(); err != nil { 265 | return err 266 | } else if r != ':' { 267 | return errors.New("Invalid token delimiter") 268 | } 269 | 270 | if buffer, err = state.Token(false, // do not skip spaces 271 | func(r rune) bool { 272 | return (bytes.IndexRune([]byte(b64), r) != -1) 273 | }); err != nil { 274 | return err 275 | } else if len(buffer) > (SpamsumLength / 2) { 276 | return errors.New("Second base64 string too long") 277 | } 278 | 279 | rightPart = make([]byte, len(buffer)) 280 | copy(rightPart[:], buffer) 281 | 282 | sum.blocksize = uint32(blocksize) 283 | copy(sum.leftPart[:], leftPart) 284 | copy(sum.rightPart[:], rightPart) 285 | sum.leftIndex = len(leftPart) 286 | sum.rightIndex = len(rightPart) 287 | 288 | return nil 289 | } 290 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /extra_tests/spamsum.c: -------------------------------------------------------------------------------- 1 | /* 2 | this is a checksum routine that is specifically designed for spam. 3 | Copyright Andrew Tridgell 2002 4 | 5 | This code is released under the GNU General Public License version 2 6 | or later. Alteratively, you may also use this code under the terms 7 | of the Perl Artistic license. 8 | 9 | If you wish to distribute this code under the terms of a different 10 | free software license then please ask me. If there is a good reason 11 | then I will probably say yes. 
12 | */ 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include "spamsum.h" 23 | 24 | /* the output is a string of length 64 in base64 */ 25 | #define SPAMSUM_LENGTH 64 26 | 27 | #define MIN_BLOCKSIZE 3 28 | #define HASH_PRIME 0x01000193 29 | #define HASH_INIT 0x28021967 30 | #define ROLLING_WINDOW 7 31 | 32 | #ifndef MIN 33 | #define MIN(a,b) ((a)<(b)?(a):(b)) 34 | #endif 35 | 36 | #ifndef MAX 37 | #define MAX(a,b) ((a)>(b)?(a):(b)) 38 | #endif 39 | 40 | 41 | static struct { 42 | uchar window[ROLLING_WINDOW]; 43 | u32 h1, h2, h3; 44 | u32 n; 45 | } roll_state; 46 | 47 | /* 48 | a rolling hash, based on the Adler checksum. By using a rolling hash 49 | we can perform auto resynchronisation after inserts/deletes 50 | 51 | internally, h1 is the sum of the bytes in the window and h2 52 | is the sum of the bytes times the index 53 | 54 | h3 is a shift/xor based rolling hash, and is mostly needed to ensure that 55 | we can cope with large blocksize values 56 | */ 57 | static inline u32 roll_hash(uchar c) 58 | { 59 | roll_state.h2 -= roll_state.h1; 60 | roll_state.h2 += ROLLING_WINDOW * c; 61 | 62 | roll_state.h1 += c; 63 | roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW]; 64 | 65 | roll_state.window[roll_state.n % ROLLING_WINDOW] = c; 66 | roll_state.n++; 67 | 68 | roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF; 69 | roll_state.h3 ^= c; 70 | 71 | return roll_state.h1 + roll_state.h2 + roll_state.h3; 72 | } 73 | 74 | /* 75 | reset the state of the rolling hash and return the initial rolling hash value 76 | */ 77 | static u32 roll_reset(void) 78 | { 79 | memset(&roll_state, 0, sizeof(roll_state)); 80 | return 0; 81 | } 82 | 83 | /* a simple non-rolling hash, based on the FNV hash */ 84 | static inline u32 sum_hash(uchar c, u32 h) 85 | { 86 | h *= HASH_PRIME; 87 | h ^= c; 88 | return h; 89 | } 90 | 91 | /* 92 | take a message of length 'length' and return a string 
representing a hash of that message, 93 | prefixed by the selected blocksize 94 | */ 95 | char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize) 96 | { 97 | const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 98 | char *ret, *p; 99 | u32 total_chars; 100 | u32 h, h2, h3; 101 | u32 j, n, i, k; 102 | u32 block_size; 103 | uchar ret2[SPAMSUM_LENGTH/2 + 1]; 104 | 105 | /* if we are ignoring email headers then skip past them now */ 106 | if (flags & FLAG_IGNORE_HEADERS) { 107 | const uchar *s = strstr(in, "\n\n"); 108 | if (s) { 109 | length -= (s+2 - in); 110 | in = s+2; 111 | } 112 | } 113 | 114 | if (flags & FLAG_IGNORE_WHITESPACE) { 115 | /* count the non-ignored chars */ 116 | for (n=0, i=0; i MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) { 210 | block_size = block_size / 2; 211 | goto again; 212 | } 213 | 214 | return ret; 215 | } 216 | 217 | 218 | /* 219 | we only accept a match if we have at least one common substring in 220 | the signature of length ROLLING_WINDOW. This dramatically drops the 221 | false positive rate for low score thresholds while having 222 | negligable affect on the rate of spam detection. 223 | 224 | return 1 if the two strings do have a common substring, 0 otherwise 225 | */ 226 | static int has_common_substring(const char *s1, const char *s2) 227 | { 228 | int i, j; 229 | int num_hashes; 230 | u32 hashes[SPAMSUM_LENGTH]; 231 | 232 | /* there are many possible algorithms for common substring 233 | detection. 
In this case I am re-using the rolling hash code 234 | to act as a filter for possible substring matches */ 235 | 236 | roll_reset(); 237 | memset(hashes, 0, sizeof(hashes)); 238 | 239 | /* first compute the windowed rolling hash at each offset in 240 | the first string */ 241 | for (i=0;s1[i];i++) { 242 | hashes[i] = roll_hash((uchar)s1[i]); 243 | } 244 | num_hashes = i; 245 | 246 | roll_reset(); 247 | 248 | /* now for each offset in the second string compute the 249 | rolling hash and compare it to all of the rolling hashes 250 | for the first string. If one matches then we have a 251 | candidate substring match. We then confirm that match with 252 | a direct string comparison */ 253 | for (i=0;s2[i];i++) { 254 | u32 h = roll_hash((uchar)s2[i]); 255 | if (i < ROLLING_WINDOW-1) continue; 256 | for (j=ROLLING_WINDOW-1;j= ROLLING_WINDOW && 260 | strncmp(s2+i-(ROLLING_WINDOW-1), 261 | s1+j-(ROLLING_WINDOW-1), 262 | ROLLING_WINDOW) == 0) { 263 | return 1; 264 | } 265 | } 266 | } 267 | } 268 | 269 | return 0; 270 | } 271 | 272 | 273 | /* 274 | eliminate sequences of longer than 3 identical characters. These 275 | sequences contain very little information so they tend to just bias 276 | the result unfairly 277 | */ 278 | static char *eliminate_sequences(const char *str) 279 | { 280 | char *ret; 281 | int i, j, len; 282 | 283 | ret = strdup(str); 284 | if (!ret) return NULL; 285 | 286 | len = strlen(str); 287 | 288 | for (i=j=3;i SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) { 317 | /* not a real spamsum signature? */ 318 | return 0; 319 | } 320 | 321 | /* the two strings must have a common substring of length 322 | ROLLING_WINDOW to be candidates */ 323 | if (has_common_substring(s1, s2) == 0) { 324 | return 0; 325 | } 326 | 327 | /* compute the edit distance between the two strings. 
The edit distance gives 328 | us a pretty good idea of how closely related the two strings are */ 329 | score = edit_distn(s1, len1, s2, len2); 330 | 331 | /* scale the edit distance by the lengths of the two 332 | strings. This changes the score to be a measure of the 333 | proportion of the message that has changed rather than an 334 | absolute quantity. It also copes with the variability of 335 | the string lengths. */ 336 | score = (score * SPAMSUM_LENGTH) / (len1 + len2); 337 | 338 | /* at this stage the score occurs roughly on a 0-64 scale, 339 | * with 0 being a good match and 64 being a complete 340 | * mismatch */ 341 | 342 | /* rescale to a 0-100 scale (friendlier to humans) */ 343 | score = (100 * score) / 64; 344 | 345 | /* it is possible to get a score above 100 here, but it is a 346 | really terrible match */ 347 | if (score >= 100) return 0; 348 | 349 | /* now re-scale on a 0-100 scale with 0 being a poor match and 350 | 100 being a excellent match. */ 351 | score = 100 - score; 352 | 353 | /* when the blocksize is small we don't want to exaggerate the match size */ 354 | if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) { 355 | score = block_size/MIN_BLOCKSIZE * MIN(len1, len2); 356 | } 357 | 358 | return score; 359 | }

/*
  given two spamsum strings return a value indicating the degree to
  which they match.

  Returns 0 when either argument is NULL, when a signature is malformed,
  when the block sizes are neither equal nor a factor of two apart, or
  when memory for the working copies cannot be allocated.
*/
u32 spamsum_match(const char *str1, const char *str2)
{
	u32 block_size1, block_size2;
	u32 score = 0;
	char *s1, *s2;
	char *s1_1, *s1_2;
	char *s2_1, *s2_2;

	if (!str1 || !str2) {
		return 0;
	}

	/* each spamsum is prefixed by its block size */
	if (sscanf(str1, "%u:", &block_size1) != 1 ||
	    sscanf(str2, "%u:", &block_size2) != 1) {
		return 0;
	}

	/* if the blocksizes don't match then we are comparing
	   apples to oranges ... */
	if (block_size1 != block_size2 &&
	    block_size1 != block_size2*2 &&
	    block_size2 != block_size1*2) {
		return 0;
	}

	/* move past the prefix */
	str1 = strchr(str1, ':');
	str2 = strchr(str2, ':');

	if (!str1 || !str2) {
		/* badly formed ... */
		return 0;
	}

	/* there is very little information content in sequences of
	   the same character like 'LLLLL'. Eliminate any sequences
	   longer than 3. This is especially important when combined
	   with the has_common_substring() test below. */
	s1 = eliminate_sequences(str1+1);
	s2 = eliminate_sequences(str2+1);

	if (!s1 || !s2) {
		/* one of the two copies may have been allocated;
		   free(NULL) is a no-op */
		free(s1);
		free(s2);
		return 0;
	}

	/* now break them into the two pieces */
	s1_1 = s1;
	s2_1 = s2;

	s1_2 = strchr(s1, ':');
	s2_2 = strchr(s2, ':');

	if (!s1_2 || !s2_2) {
		/* a signature is malformed - it doesn't have 2 parts */
		free(s1); free(s2);
		return 0;
	}

	*s1_2++ = 0;
	*s2_2++ = 0;

	/* each signature has a string for two block sizes. We now
	   choose how to combine the two block sizes. We checked above
	   that they have at least one block size in common */
	if (block_size1 == block_size2) {
		u32 score1, score2;
		score1 = score_strings(s1_1, s2_1, block_size1);
		score2 = score_strings(s1_2, s2_2, block_size2);
		score = MAX(score1, score2);
	} else if (block_size1 == block_size2*2) {
		score = score_strings(s1_1, s2_2, block_size1);
	} else {
		score = score_strings(s1_2, s2_1, block_size2);
	}

	free(s1);
	free(s2);

	return score;
}

/*
  return the maximum match for a file containing a list of spamsums

  Reading stops early once a score of at least 'threshold' is found.
  Returns 0 when the file cannot be opened or an argument is NULL.
*/
u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
{
	FILE *f;
	/* a full signature is up to 10 digits, ':', SPAMSUM_LENGTH chars,
	   ':', SPAMSUM_LENGTH/2 chars and a newline - about 110 bytes, so
	   a 100 byte buffer would split it over two reads */
	char line[256];
	u32 best = 0;

	if (!fname || !sum) return 0;

	f = fopen(fname, "r");
	if (!f) return 0;

	/* on each line of the database we compute the spamsum match
	   score. We then pick the best score */
	while (fgets(line, sizeof(line), f)) {
		u32 score;

		/* cut at the line terminator; unlike indexing line[len-1]
		   this is also safe when the line starts with a NUL byte */
		line[strcspn(line, "\r\n")] = 0;

		score = spamsum_match(sum, line);

		if (score > best) {
			best = score;
			if (best >= threshold) break;
		}
	}

	fclose(f);

	return best;
}

/*
  return the spamsum on stdin

  The whole input is collected in memory first. A read error other than
  EINTR ends the input, and whatever was read up to that point is hashed.
  Returns NULL when memory runs out.
*/
static char *spamsum_stdin(u32 flags, u32 block_size)
{
	uchar buf[10*1024];
	uchar *msg;
	size_t length = 0;
	ssize_t n;
	char *sum;

	msg = malloc(sizeof(buf));
	if (!msg) return NULL;

	/* load the file, expanding the allocation as needed. */
	while (1) {
		uchar *grown;

		n = read(0, buf, sizeof(buf));
		if (n == -1 && errno == EINTR) continue;
		if (n <= 0) break;

		/* keep the old pointer until realloc has succeeded, so
		   the buffer can still be released on failure */
		grown = realloc(msg, length + n);
		if (!grown) {
			free(msg);
			return NULL;
		}
		msg = grown;

		memcpy(msg+length, buf, n);
		length += n;
	}

	sum = spamsum(msg, length, flags, block_size);

	free(msg);

	return sum;
}


/*
  return the spamsum on a file

  A file name of "-" selects stdin. On failure a message is written with
  perror() and NULL is returned; the descriptor is closed on every path.
*/
char *spamsum_file(const char *fname, u32 flags, u32 block_size)
{
	int fd;
	char *sum;
	struct stat st;
	uchar *msg;

	if (strcmp(fname, "-") == 0) {
		return spamsum_stdin(flags, block_size);
	}

	fd = open(fname, O_RDONLY);
	if (fd == -1) {
		perror(fname);
		return NULL;
	}

	if (fstat(fd, &st) == -1) {
		/* report first: close() may overwrite errno */
		perror("fstat");
		close(fd);
		return NULL;
	}

	msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
	if (msg == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return NULL;
	}
	/* the mapping stays valid after the descriptor is closed */
	close(fd);

	sum = spamsum(msg, st.st_size, flags, block_size);

	munmap(msg, st.st_size);

	return sum;
}
--------------------------------------------------------------------------------