├── testdata
    ├── LAND.MAP
    ├── embedded_video_quicktime.doc
    └── README.rst
├── extra_tests
    ├── spamsum.h
    ├── Makefile
    ├── spamsum_compare.c
    ├── spamsum_main.c
    ├── from_git_repo.go
    ├── edit_dist.c
    └── spamsum.c
├── README.md
├── spamsumwriter.go
├── spamsum_test.go
├── spamsumwriter_test.go
├── spamsum_compare.go
├── spamsum_compare_test.go
├── spamsum.go
└── LICENSE


/testdata/LAND.MAP:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michielbuddingh/spamsum/HEAD/testdata/LAND.MAP


--------------------------------------------------------------------------------
/testdata/embedded_video_quicktime.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michielbuddingh/spamsum/HEAD/testdata/embedded_video_quicktime.doc


--------------------------------------------------------------------------------
/extra_tests/spamsum.h:
--------------------------------------------------------------------------------
#ifndef SPAMSUM_H
#define SPAMSUM_H

/* Short aliases for the unsigned types used by the spamsum sources. */
typedef unsigned u32;
typedef unsigned char uchar;

/* Bit flags that may be OR-ed together and passed to spamsum_file(). */
#define FLAG_IGNORE_WHITESPACE 1
#define FLAG_IGNORE_HEADERS 2

/*
 * Computes the spamsum of a file.  The command-line tool calls this with
 * the file name, a FLAG_* bitmask and a block size, prints the returned
 * string and releases it with free().
 */
char *spamsum_file(const char *, u32, u32);

#endif /* SPAMSUM_H */
8 | 


--------------------------------------------------------------------------------
/testdata/README.rst:
--------------------------------------------------------------------------------
1 | Origins of these files
2 | ~~~~~~~~~~~~~~~~~~~~~~
3 | 
4 | embedded_video_quicktime.doc:
5 |    from https://github.com/ross-spencer/format-corpus , CC0
6 | 
7 | LAND.MAP:
8 |    from https://github.com/ross-spencer/format-corpus , CC0
9 | 


--------------------------------------------------------------------------------
/extra_tests/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the original spamsum tool, and one additional quick hack,
 2 | # required to run the from_git_repo.go script
 3 | CC = gcc
 4 | CFLAGS = -Wall -W
 5 | 
 6 | OBJS = spamsum.o edit_dist.o
 7 | 
 8 | all:	spamsum spamsum_compare
 9 | 
10 | spamsum: $(OBJS)
11 | 	$(CC) -o spamsum spamsum_main.c $(OBJS)
12 | 
13 | spamsum_compare: $(OBJS)
14 | 	$(CC) -o spamsum_compare spamsum_compare.c $(OBJS)
15 | 
16 | clean:
17 | 	@rm -f $(OBJS) spamsum spamsum_compare *~
18 | 


--------------------------------------------------------------------------------
/extra_tests/spamsum_compare.c:
--------------------------------------------------------------------------------
 1 | /* Copyright 2013, Michiel Buddingh, All rights reserved.  Use of this
 2 |    code is governed by version 2.0 or later of the Apache License,
 3 |    available at http://www.apache.org/licenses/LICENSE-2.0
 4 | 
 5 |    This is a quick-and-dirty hack to compare two spamsums from the
 6 |    command line */
 7 | 
 8 | #include <stdio.h>
 9 | #include <string.h>
10 | #include <unistd.h>
11 | #include "spamsum.h"
12 | 
/*
 * Passes the two spamsum strings given as argv[1] and argv[2] to
 * spamsum_match() and prints its result to stdout without a trailing
 * newline.
 *
 * Returns 0 on success, or 1 (after printing a usage line to stderr) when
 * fewer than two sums were supplied on the command line.
 */
int main(int argc, char *argv[]) {
    if (argc < 3) {
        /* Without this guard argv[1]/argv[2] would be NULL or out of range. */
        fprintf(stderr, "usage: %s SUM1 SUM2\n",
                argc > 0 ? argv[0] : "spamsum_compare");
        return 1;
    }

    int result = spamsum_match(argv[1], argv[2]);
    printf("%d", result);
    return 0;
}
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | spamsum
 2 | =======
 3 | 
 4 | #### A fuzzy checksum for matching spam ####
 5 | 
 6 | This is a native go implementation of spamsum.
 7 | 
 8 | spamsum was developed by Andrew Tridgell to hash email messages for computationally inexpensive SPAM detection. See <http://junkcode.samba.org/#spamsum>.
 9 | 
10 | The state of this package
11 | -------------------------
12 | 
13 | * Ready for production use.
14 | * It seems to generate results identical to that of the [spamsum tool](https://junkcode.samba.org/ftp/unpacked/junkcode/spamsum/) and [ssdeep](http://ssdeep.sf.net).  This has only been tested on a small number of files.
15 | * It is about twice as slow as the spamsum tool; about 40 MB/s on a 3 GHz Core i3.  Use `gccgo` to make the speed difference disappear.
16 | * Fuzzy comparison may be slower than the spamsum tool.  Benchmark forthcoming.
17 | 
18 | How to use
19 | ----------
20 | 
21 | Unfortunately, the default operation for spamsum is to iterate over the data several times to determine an optimal block size, so it's not sensible to implement the `hash.Hash` interface.
22 | 
23 | Instead, the package exports the functions `HashBytes(b [] byte)` and `HashReadSeeker(source io.ReadSeeker, length int64)`.
24 | 
25 | 	if file, err := os.Open("filename"); err != nil {
26 | 		log.Fatal(err)
27 | 	} else if stat, err := file.Stat(); err != nil {
28 | 		log.Fatal(err)
29 | 	} else {
30 | 		sum, err := spamsum.HashReadSeeker(file, stat.Size())
31 | 		// etc.
32 | 	}
33 | 
34 | Any errors returned by `HashReadSeeker` will originate from the `io.ReadSeeker` functions.
35 | 
36 | ### Alternatively ###
37 | 
38 | If it is acceptable to set a fixed blocksize beforehand, the `SpamSumWriter` type can be used, which _does_ implement the `hash.Hash` interface.  The `Sum(b []byte) []byte` method is not terribly useful; it will return a slice where the non-zero bytes contain a base64-encoded 6-bit hash for a `BlockSize()`-sized block. Use the `String()` method to obtain a more useful representation.
39 | 
40 | ### License ###
41 | 
42 | Use of this code is governed by version 2.0 or later of the Apache
43 | License, available at <http://www.apache.org/licenses/LICENSE-2.0>
44 | 
45 | The `extra_tests` directory contains modified sources of the original [spamsum tool](https://junkcode.samba.org/ftp/unpacked/junkcode/spamsum/) as part of a regression test suite.  This code is dual-licensed under the Perl Artistic License and the GPL version 2 or later.
46 | 


--------------------------------------------------------------------------------
/extra_tests/spamsum_main.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   this is a checksum routine that is specifically designed for spam.
  3 |   Copyright Andrew Tridgell <tridge@samba.org> 2002
  4 | 
  5 |   This code is released under the GNU General Public License version 2
  6 |   or later.  Alteratively, you may also use this code under the terms
  7 |   of the Perl Artistic license.
  8 | 
  9 |   If you wish to distribute this code under the terms of a different
 10 |   free software license then please ask me. If there is a good reason
 11 |   then I will probably say yes.
 12 | */
 13 | #include <stdio.h>
 14 | #include <unistd.h>
 15 | #include <stdlib.h>
 16 | #include "spamsum.h"
 17 | 
 18 | static void show_help(void)
 19 | {
 20 | 	printf("");
 21 | }
 22 | 
 23 | int main(int argc, char *argv[])
 24 | {
 25 | 	char *sum;
 26 | 	extern char *optarg;
 27 | 	extern int optind;
 28 | 	int c;
 29 | 	char *dbname = NULL;
 30 | 	u32 score;
 31 | 	int i;
 32 | 	u32 flags = 0;
 33 | 	u32 block_size = 0;
 34 | 	u32 threshold = 90;
 35 | 
 36 | 	while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
 37 | 		switch (c) {
 38 | 		case 'W':
 39 | 			flags |= FLAG_IGNORE_WHITESPACE;
 40 | 			break;
 41 | 
 42 | 		case 'H':
 43 | 			flags |= FLAG_IGNORE_HEADERS;
 44 | 			break;
 45 | 
 46 | 		case 'd':
 47 | 			dbname = optarg;
 48 | 			break;
 49 | 
 50 | 		case 'B':
 51 | 			block_size = atoi(optarg);
 52 | 			break;
 53 | 
 54 | 		case 'T':
 55 | 			threshold = atoi(optarg);
 56 | 			break;
 57 | 
 58 | 		case 'c':
 59 | 			if (!dbname) {
 60 | 				show_help();
 61 | 				exit(1);
 62 | 			}
 63 | 			score = spamsum_match_db(dbname, optarg,
 64 | 						 threshold);
 65 | 			printf("%u\n", score);
 66 | 			exit(score >= threshold ? 0 : 2);
 67 | 
 68 | 		case 'C':
 69 | 			if (!dbname) {
 70 | 				show_help();
 71 | 				exit(1);
 72 | 			}
 73 | 			score = spamsum_match_db(dbname,
 74 | 						 spamsum_file(optarg, flags,
 75 | 							      block_size),
 76 | 						 threshold);
 77 | 			printf("%u\n", score);
 78 | 			exit(score >= threshold ? 0 : 2);
 79 | 
 80 | 		case 'h':
 81 | 		default:
 82 | 			show_help();
 83 | 			exit(0);
 84 | 		}
 85 | 	}
 86 | 
 87 | 	argc -= optind;
 88 | 	argv += optind;
 89 | 
 90 | 	if (argc == 0) {
 91 | 		show_help();
 92 | 		return 0;
 93 | 	}
 94 | 
 95 | 	/* compute the spamsum on a list of files */
 96 | 	for (i=0;i<argc;i++) {
 97 | 		sum = spamsum_file(argv[i], flags, block_size);
 98 | 		printf("%s\n", sum);
 99 | 		free(sum);
100 | 	}
101 | 
102 | 	return 0;
103 | }
104 | 


--------------------------------------------------------------------------------
/spamsumwriter.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2013, Michiel Buddingh, All rights reserved.
 2 | // Use of this code is governed by version 2.0 or later of the Apache
 3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0
 4 | 
 5 | package spamsum
 6 | 
 7 | import (
 8 | //	"hash"
 9 | )
10 | 
11 | type SpamSumWriter struct {
12 | 	SpamSum
13 | 	spamsumState
14 | }
15 | 
16 | // StartFixedBlocksize creates a SpamSumWriter with a fixed block size,
17 | // that implements the hash.Hash interface, and accepts an arbitrary
18 | // number of bytes through Write().  Note that the SpamSum algorithm
19 | // does not handle arbitrary length inputs well.  If the input stream
20 | // is significantly longer than SpamLength * blocksize, the tail end
21 | // of the stream will, for most intents and purposes, not generate
22 | // hash blocks.  Please consider the HashBytes or HashReadSeeker
23 | // functions instead.
24 | func StartFixedBlocksize(blockSize uint32) *SpamSumWriter {
25 | 	sum := new(SpamSumWriter)
26 | 
27 | 	sum.SpamSum.reset()
28 | 	sum.spamsumState.reset()
29 | 
30 | 	sum.blocksize = blockSize
31 | 	return sum
32 | }
33 | 
34 | // Reset sets the state of the SpamSumWriter to its initial value,
35 | // while keeping the blocksize parameter as is.
36 | func (sss *SpamSumWriter) Reset() {
37 | 	sss.spamsumState.reset()
38 | 	sss.SpamSum.reset()
39 | }
40 | 
41 | func (sss *SpamSumWriter) Size() int {
42 | 	return SpamsumLength
43 | }
44 | 
45 | // Write a byte slice to the SpamSumWriter.  Returns the length of the
46 | // byte slice, and nil.
47 | func (sss *SpamSumWriter) Write(block []byte) (int, error) {
48 | 	processBlock(block, len(block), &sss.spamsumState, &sss.SpamSum)
49 | 	return len(block), nil
50 | }
51 | 
52 | func (sss *SpamSumWriter) String() (result string) {
53 | 	writeTail(&sss.spamsumState, &sss.SpamSum)
54 | 	result = sss.SpamSum.String()
55 | 	return
56 | }
57 | 
58 | // Sum is implemented mostly for the sake of compatibility with
59 | // hash.Hash.  While the SpamSum algorithm creates variable-length
60 | // hashes, Sum is supposed to return a fixed-length slice of Size()
61 | // bytes.  The implementation returns a slice where the non-zero bytes
62 | // contain a base64-encoded 6-bit hash for a `BlockSize()`-sized
63 | // block.  The block hashes continue up to the end of the slice, or up
64 | // to the first zero byte.
65 | func (sss *SpamSumWriter) Sum(block []byte) (result []byte) {
66 | 	var cloneState spamsumState = sss.spamsumState
67 | 	var cloneSum SpamSum = sss.SpamSum
68 | 
69 | 	processBlock(block, len(block), &cloneState, &cloneSum)
70 | 
71 | 	writeTail(&cloneState, &cloneSum)
72 | 
73 | 	result = make([]byte, SpamsumLength)
74 | 	copy(result, cloneSum.leftPart[:cloneSum.leftIndex])
75 | 	return
76 | }
77 | 


--------------------------------------------------------------------------------
/spamsum_test.go:
--------------------------------------------------------------------------------
  1 | package spamsum
  2 | 
  3 | import (
  4 | 	"encoding/binary"
  5 | 	"fmt"
  6 | 	"math/rand"
  7 | 	"os"
  8 | 	"path/filepath"
  9 | 	"testing"
 10 | )
 11 | 
 12 | func TestScan(t *testing.T) {
 13 | 	tests := []struct {
 14 | 		input      string
 15 | 		shouldfail bool
 16 | 	}{
 17 | 		{"49152:dihMNzhZt62oh9+onrqMPr/KwJsvD/mMplt:Hxxpj", false},
 18 | 		{"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL", false},
 19 | 		{"18446744073709551616:dihMNzhZt62oh9+onrqMPr/KwJsvD/mMplt:H.soa", true},
 20 | 		{"49152:dihMNzhZt62oh9+onrqMPr/KwJsvD/mMplt.Hxxpj", true},
 21 | 		{"22:i3wkMEgPthpID7YoQDjrdAjGBwBIg8Qow0iLSAhIi3AQSItCCEiLUhBIOch1MEiJBCRIiVQkCEiJ:UxUp", true},
 22 | 	}
 23 | 
 24 | 	for _, test := range tests {
 25 | 		var sum SpamSum
 26 | 		_, err := fmt.Sscan(test.input, &sum)
 27 | 		if test.shouldfail && err == nil {
 28 | 			t.Errorf("Should not be able to parse %s\n", test.input)
 29 | 		}
 30 | 		if !test.shouldfail && err != nil {
 31 | 			t.Errorf("Parse failed with error: %v", err)
 32 | 		}
 33 | 		if !test.shouldfail && sum.String() != test.input {
 34 | 			t.Errorf("scanned sum %s is not equal to input string %s\n", sum.String(), test.input)
 35 | 		}
 36 | 	}
 37 | }
 38 | 
 39 | func TestHashReadSeeker(t *testing.T) {
 40 | 	tests := []struct {
 41 | 		filename string
 42 | 		expected string
 43 | 	}{
 44 | 		{"LAND.MAP", "768:tlBecdq6/+dgZUTp+gAdA3T9Y02xEFshHOl3O98FzbXfBfhPcGxGB3whvm9HvMB1:O"},
 45 | 		{"embedded_video_quicktime.doc", "192:o50PBwxGc+ZrnCe9pz1aZ8GHiLUd0935:G8cOz9pzJ3"},
 46 | 	}
 47 | 
 48 | 	for _, test := range tests {
 49 | 		path := filepath.Join("testdata", test.filename)
 50 | 		file, openerr := os.Open(path)
 51 | 		if openerr != nil {
 52 | 			t.Fatal(openerr)
 53 | 		}
 54 | 		defer file.Close()
 55 | 		stat, staterr := file.Stat()
 56 | 		if staterr != nil {
 57 | 			t.Fatal(staterr)
 58 | 		}
 59 | 
 60 | 		sum, sumerr := HashReadSeeker(file, stat.Size())
 61 | 		if sumerr != nil {
 62 | 			t.Fatal(sumerr)
 63 | 		}
 64 | 
 65 | 		if sum.String() != test.expected {
 66 | 			t.Errorf("Expected %s hashing %s, result was %v", test.expected, test.filename, sum)
 67 | 		}
 68 | 	}
 69 | }
 70 | 
 71 | func TestHashBytes(t *testing.T) {
 72 | 	tests := []struct {
 73 | 		seed      int64
 74 | 		length    int
 75 | 		blocksize uint32
 76 | 		expected  string
 77 | 	}{
 78 | 		{6065, 1024, 24, "24:D4JsKhbN85qJzgs+JLY4DffT9hhD6Wa333cRhDEPVreO:LKFN85qJMHJj1hkuDEPVeO"},
 79 | 		{1029936, 1025, 12, "12:RePpJA8PW0JP1uTMCa9qpRCwtnacOzIayUpkmp6v12qVVIFSpNKDrASjqPoOaY1L:UPPWE4TMY37nYzslTN2gTDZpaCji8ZmM"},
 80 | 		{1252877, 22624, 192, "192:V5cZcnyOVaMvLF4f8mkfu4u95tgALGPVxn8QhXSd1CsvQ+D3QMfFiz/uxuVge/7P:nIyvGkWN/iHImSc6vAzWgyeIiBzPbgzk"},
 81 | 		{1497046, 22624, 192, "192:BTBLFZFxOyNbTjMRjkLOBKiDKe2cRfzKQACMTsZGJRaYjx44gkX2iJ4nURozFp9S:B5OyN/QjkL6KiH2JgoNIDreMuxqRkxJZ"},
 82 | 	}
 83 | 
 84 | 	for _, test := range tests {
 85 | 		byteSlice := make([]byte, test.length)
 86 | 		generator := rand.New(rand.NewSource(test.seed))
 87 | 
 88 | 		generator.Read(byteSlice)
 89 | 
 90 | 		sum := HashBytes(byteSlice)
 91 | 
 92 | 		if sum.String() != test.expected {
 93 | 			t.Errorf("Expected %v, result was %v", test.expected, sum)
 94 | 		}
 95 | 	}
 96 | }
 97 | 
 98 | func TestBlocksizeAdjustment(t *testing.T) {
 99 | 	byteSlice := make([]byte, 17921)
100 | 	generator := rand.New(rand.NewSource(191))
101 | 
102 | 	i := 0
103 | 	for ; i < 24; i++ {
104 | 		binary.BigEndian.PutUint32(byteSlice[i*4:], generator.Uint32())
105 | 	}
106 | 
107 | 	for ; i < 17921; i++ {
108 | 		byteSlice[i] = 0
109 | 	}
110 | 
111 | 	sum := HashBytes(byteSlice)
112 | 	expected := "3:Bl5KOiWl/:ldZ/"
113 | 
114 | 	if sum.String() != expected {
115 | 		t.Errorf("Expected %v, result was %v", expected, sum)
116 | 	}
117 | }
118 | 


--------------------------------------------------------------------------------
/extra_tests/from_git_repo.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013, Michiel Buddingh, All rights reserved.  Use of this
  2 | // code is governed by version 2.0 or later of the Apache License,
  3 | // available at http://www.apache.org/licenses/LICENSE-2.0
  4 | 
  5 | // This is a really ugly script that turns any git repository into a
  6 | // test suite for the Go spamsum implementation.  It inspects all
  7 | // revisions of all files in the current head, takes their Spamsum,
  8 | // and compares this against the result of the original spamsum tool.
  9 | //
 10 | // Note that this script requires the C sources of the original
 11 | // spamsum tool to work, and suffers from bad error handling in many
 12 | // places.  The time to run scales quadratically with the number of
 13 | // revisions, so it's best suited for mid-sized git repositories.
 14 | package main
 15 | 
 16 | import (
 17 | 	"bytes"
 18 | 	"fmt"
 19 | 	"github.com/michielbuddingh/spamsum"
 20 | 	"log"
 21 | 	"os/exec"
 22 | 	"regexp"
 23 | 	"strconv"
 24 | 	"strings"
 25 | )
 26 | 
 27 | const spamsumpath = "./spamsum"
 28 | const spamsum_comparepath = "./spamsum_compare"
 29 | 
 30 | var count = 0
 31 | var comparisoncount = 0
 32 | 
 33 | func main() {
 34 | 	files, err := exec.Command("git", "ls-files").Output()
 35 | 	if err != nil {
 36 | 		println(err)
 37 | 	}
 38 | 	filelist := strings.Split(string(files), "\n")
 39 | 	for _, file := range filelist {
 40 | 		// Iterate over all revisions of a file.
 41 | 		allRevisions(file)
 42 | 	}
 43 | 	log.Printf("%d files processed\n", count)
 44 | 	log.Printf("%d comparisons\n", comparisoncount)
 45 | }
 46 | 
 47 | func allRevisions(filename string) {
 48 | 	grabCommit := regexp.MustCompile("commit ([0-9a-f]*)")
 49 | 	commitlog, _ := exec.Command("git", "log", "--pretty=short", filename).Output()
 50 | 	commits := grabCommit.FindAllSubmatch(commitlog, -1)
 51 | 
 52 | 	sums := make([]string, 0)
 53 | 
 54 | 	if len(commits) > 1 {
 55 | 		for _, commit := range commits {
 56 | 			contents, err := exec.Command("git", "show", "--format=raw", string(commit[1])+":"+filename).Output()
 57 | 
 58 | 			if err == nil && len(contents) > 0 && !strings.HasPrefix(string(contents), "fatal") {
 59 | 				sum1 := createSpamSum(contents)
 60 | 				sum2 := createOriginalSpamSum(contents)
 61 | 				if sum1 != sum2 {
 62 | 					log.Printf("revision %s of file %s has differing spamsums", string(commit[1]), filename)
 63 | 				}
 64 | 				count++
 65 | 				sums = append(sums, sum1)
 66 | 			}
 67 | 		}
 68 | 	}
 69 | 
 70 | 	for idx, left := range sums {
 71 | 		for i := idx + 1; i < len(sums); i++ {
 72 | 			first := compareSpamSum(left, sums[i])
 73 | 			second := compareOriginalSpamSum(left, sums[i])
 74 | 			if first != second {
 75 | 				log.Printf("Difference in comparison between %s and %s, %d, %d\n", left, sums[i], first, second)
 76 | 			}
 77 | 			comparisoncount++
 78 | 		}
 79 | 	}
 80 | 
 81 | }
 82 | 
 83 | func compareSpamSum(left, right string) int {
 84 | 	var leftSum, rightSum spamsum.SpamSum
 85 | 	fmt.Sscan(left, &leftSum)
 86 | 	fmt.Sscan(right, &rightSum)
 87 | 	score := leftSum.Compare(rightSum)
 88 | 	return int(score)
 89 | }
 90 | 
 91 | func compareOriginalSpamSum(left, right string) int {
 92 | 	scoretext, _ := exec.Command(spamsum_comparepath, left, right).Output()
 93 | 	var score int
 94 | 	score, _ = strconv.Atoi(string(scoretext))
 95 | 	return score
 96 | }
 97 | 
 98 | func createSpamSum(contents []byte) string {
 99 | 	reader := bytes.NewReader(contents)
100 | 	sum, _ := spamsum.HashReadSeeker(reader, int64(len(contents)))
101 | 	return sum.String()
102 | }
103 | 
104 | func createOriginalSpamSum(contents []byte) string {
105 | 	reader := bytes.NewReader(contents)
106 | 	cmd := exec.Command(spamsumpath, "-")
107 | 	cmd.Stdin = reader
108 | 	if sumbytes, err := cmd.Output(); err == nil {
109 | 		return strings.TrimSpace(string(sumbytes))
110 | 	}
111 | 	return "nil"
112 | }
113 | 


--------------------------------------------------------------------------------
/spamsumwriter_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013, Michiel Buddingh, All rights reserved.
  2 | // Use of this code is governed by version 2.0 or later of the Apache
  3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0
  4 | 
  5 | package spamsum
  6 | 
  7 | import (
  8 | 	"bufio"
  9 | 	"encoding/binary"
 10 | 	"math/rand"
 11 | 	"os"
 12 | 	"path/filepath"
 13 | 	"testing"
 14 | )
 15 | 
 16 | func TestWriter(t *testing.T) {
 17 | 	tests := []struct {
 18 | 		seed      int64
 19 | 		length    int
 20 | 		blocksize uint32
 21 | 		expected  string
 22 | 	}{
 23 | 		{42, 16384, 384, "384:PnwCSZ6yE9r4UCZ1he34xas/E8AhHgdd2yM:PbSZ6yE9rGfExx"},
 24 | 		{1000, 2048, 48, "48:Zo+v/bCSly4VhreHwHJdkHTzF7sjBU1YuD/QtFsByxoSJW+QiLlH:uSWSFteQHJd+Tp79mqSqyCt+5LlH"},
 25 | 		{1000, 1048576, 24576, "24576:xL2L/P40/cnWGr7tsP+mgdQGvnb1UV+gQ8ZwU:ErPP/2WItsPTgdD/bqQ4yU"},
 26 | 		{71268, 24, 3, "3:N0n6xmcFctn:7xmptn"},
 27 | 	}
 28 | 
 29 | 	for _, test := range tests {
 30 | 		generator := rand.New(rand.NewSource(test.seed))
 31 | 		writer := StartFixedBlocksize(test.blocksize)
 32 | 		for i := 0; i < test.length/4; i++ {
 33 | 			binary.Write(writer, binary.BigEndian, generator.Uint32())
 34 | 		}
 35 | 		if writer.String() != test.expected {
 36 | 			t.Errorf("Expected %v, result was %v", test.expected, writer)
 37 | 		}
 38 | 	}
 39 | }
 40 | 
 41 | func TestWriterIntermediate(t *testing.T) {
 42 | 	tests := []struct {
 43 | 		filename             string
 44 | 		initialLength        int
 45 | 		expectedIntermediate string
 46 | 		expectedFinal        string
 47 | 		blockSize            uint32
 48 | 	}{
 49 | 		{
 50 | 			"LAND.MAP",
 51 | 			131072,
 52 | 			"768:tlBecdq6/+dgZUTp+gAdAm:3",
 53 | 			"768:tlBecdq6/+dgZUTp+gAdA3T9Y02xEFshHOl3O98FzbXfBfhPcGxGB3whvm9HvMB1:O",
 54 | 			768,
 55 | 		},
 56 | 		{
 57 | 			"embedded_video_quicktime.doc",
 58 | 			12288,
 59 | 			"192:o50PBwxGc+Zrnn:G8cOb",
 60 | 			"192:o50PBwxGc+ZrnCe9pz1aZ8GHiLUd0935:G8cOz9pzJ3",
 61 | 			192,
 62 | 		},
 63 | 	}
 64 | 
 65 | 	for _, test := range tests {
 66 | 		writer := StartFixedBlocksize(test.blockSize)
 67 | 
 68 | 		path := filepath.Join("testdata", test.filename)
 69 | 		file, openerr := os.Open(path)
 70 | 		if openerr != nil {
 71 | 			t.Fatal(openerr)
 72 | 		}
 73 | 		defer file.Close()
 74 | 
 75 | 		reader := bufio.NewReader(file)
 76 | 
 77 | 		buf4k := make([]byte, test.initialLength)
 78 | 		_, readerr := reader.Read(buf4k)
 79 | 		if readerr != nil {
 80 | 			t.Fatal(readerr)
 81 | 		}
 82 | 
 83 | 		writer.Write(buf4k)
 84 | 
 85 | 		if writer.String() != test.expectedIntermediate {
 86 | 			t.Errorf("Expected intermediate result %s, got %s",
 87 | 				test.expectedIntermediate,
 88 | 				writer.String())
 89 | 		}
 90 | 
 91 | 		reader.WriteTo(writer)
 92 | 
 93 | 		if writer.String() != test.expectedFinal {
 94 | 			t.Errorf("Expected final result %s, got %s",
 95 | 				test.expectedFinal,
 96 | 				writer.String())
 97 | 		}
 98 | 	}
 99 | }
100 | 
101 | func TestWriterReset(t *testing.T) {
102 | 	generator := rand.New(rand.NewSource(3181))
103 | 	writer := StartFixedBlocksize(768)
104 | 	emtpySlice := make([]byte, 0)
105 | 
106 | 	for i := 0; i < 4096; i++ {
107 | 		binary.Write(writer, binary.BigEndian, generator.Uint32())
108 | 	}
109 | 
110 | 	beforeReset := writer.String()
111 | 	beforeResetBinary := writer.Sum(emtpySlice)
112 | 
113 | 	writer.Reset()
114 | 	generator = rand.New(rand.NewSource(3181))
115 | 
116 | 	for i := 0; i < 4096; i++ {
117 | 		binary.Write(writer, binary.BigEndian, generator.Uint32())
118 | 	}
119 | 
120 | 	afterReset := writer.String()
121 | 	afterResetBinary := writer.Sum(emtpySlice)
122 | 
123 | 	if beforeReset != afterReset {
124 | 		t.Errorf("Same data written to the same writer, but different results!")
125 | 	}
126 | 
127 | 	if len(afterResetBinary) != len(beforeResetBinary) {
128 | 		t.Errorf("Binary spamsums are not even the same size")
129 | 	}
130 | 
131 | 	for i, _ := range beforeResetBinary {
132 | 		if beforeResetBinary[i] != afterResetBinary[i] {
133 | 			t.Errorf("Binary spamsums before and after reset differ at byte %d", i)
134 | 			break
135 | 		}
136 | 	}
137 | }
138 | 
139 | func TestSize(t *testing.T) {
140 | 	writer := StartFixedBlocksize(16)
141 | 	if writer.Size() != SpamsumLength {
142 | 		t.Errorf("Max result size should always be equal to SpamsumLength, which is %d\n", SpamsumLength)
143 | 	}
144 | }
145 | 


--------------------------------------------------------------------------------
/spamsum_compare.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013, Michiel Buddingh, All rights reserved.
  2 | // Use of this code is governed by version 2.0 or later of the Apache
  3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0
  4 | 
  5 | package spamsum
  6 | 
  7 | import (
  8 | 	"math"
  9 | )
 10 | 
 11 | const (
 12 | 	insCost    = 1
 13 | 	delCost    = 1
 14 | 	changeCost = 3
 15 | )
 16 | 
 17 | // Compare two SpamSums, returning a value between 0 and 100.
 18 | // This method is currently not bug-for-bug compatible with the
 19 | // original spamsum.
 20 | func (from SpamSum) Compare(to SpamSum) (similarity uint32) {
 21 | 	q := float32(from.blocksize) / float32(to.blocksize)
 22 | 	if q == 1 {
 23 | 		similarity = uint32(max(
 24 | 			score(from.leftPart[:from.leftIndex],
 25 | 				to.leftPart[:to.leftIndex],
 26 | 				int(from.blocksize)),
 27 | 			score(from.rightPart[:from.rightIndex],
 28 | 				to.rightPart[:to.rightIndex],
 29 | 				int(to.blocksize))))
 30 | 
 31 | 	} else if q == 2 {
 32 | 		similarity = uint32(score(
 33 | 			from.leftPart[:from.leftIndex],
 34 | 			to.rightPart[:to.rightIndex],
 35 | 			int(from.blocksize)))
 36 | 	} else if q == 0.5 {
 37 | 		similarity = uint32(score(
 38 | 			from.rightPart[:from.rightIndex],
 39 | 			to.leftPart[:to.leftIndex],
 40 | 			int(to.blocksize)))
 41 | 	} else {
 42 | 		similarity = 0
 43 | 	}
 44 | 	return
 45 | }
 46 | 
 47 | func score(from, to []byte, blocksize int) (score int) {
 48 | 	if !hasCommonSubstring(from, to) {
 49 | 		return 0
 50 | 	}
 51 | 
 52 | 	from = eliminateRepetition(from)
 53 | 	to = eliminateRepetition(to)
 54 | 
 55 | 	score = editDistance(from, to)
 56 | 
 57 | 	score *= SpamsumLength
 58 | 	score /= len(from) + len(to)
 59 | 
 60 | 	score = (score * 100) / 64
 61 | 
 62 | 	score = 100 - score
 63 | 
 64 | 	maxscore := blocksize / minBlockSize * min(len(from), len(to))
 65 | 	score = min(score, maxscore)
 66 | 
 67 | 	return score
 68 | }
 69 | 
 70 | func editDistance(from, to []byte) int {
 71 | 	// memoize turns a recursive levenshtein function into one that uses an
 72 | 	// array to cache results.  Uses |from| * |to| ints of memory.
 73 | 	memoize := func(calculate func(a, b []byte) int) func(a, b []byte) int {
 74 | 		var memo []int
 75 | 		ffl, ttl := len(from), len(to)
 76 | 		memo = make([]int, ffl*ttl)
 77 | 
 78 | 		return func(from, to []byte) int {
 79 | 			fl, tl := len(from), len(to)
 80 | 
 81 | 			if fl == 0 {
 82 | 				return tl
 83 | 			}
 84 | 			if tl == 0 {
 85 | 				return fl
 86 | 			}
 87 | 
 88 | 			index := ((tl - 1) * ffl) + fl - 1
 89 | 			if memo[index] == 0 {
 90 | 				memo[index] = calculate(from, to)
 91 | 
 92 | 			}
 93 | 			return memo[index]
 94 | 		}
 95 | 	}
 96 | 
 97 | 	var levenshteinRecursive func(from, to []byte) int
 98 | 
 99 | 	// to see uncached results, just remove the memoize()
100 | 	levenshteinRecursive = memoize(func(from, to []byte) (distance int) {
101 | 		// This algorithm is not tuned for anything but legibility, complexity
102 | 		// is O(|from| * |to|).  The original code has the option of swapping
103 | 		// adjacent characters; as far as I can deduce, this is never used due
104 | 		// to the cost penalty, so it is omitted here.
105 | 		fl, tl := len(from), len(to)
106 | 
107 | 		if fl == 0 {
108 | 			return tl
109 | 		}
110 | 		if tl == 0 {
111 | 			return fl
112 | 		}
113 | 
114 | 		var cost = changeCost
115 | 
116 | 		if from[fl-1] == to[tl-1] {
117 | 			cost = 0
118 | 		}
119 | 
120 | 		return min(
121 | 			levenshteinRecursive(from[:fl-1], to)+delCost,
122 | 			levenshteinRecursive(from, to[:tl-1])+delCost,
123 | 			levenshteinRecursive(from[:fl-1], to[:tl-1])+cost)
124 | 	})
125 | 
126 | 	return levenshteinRecursive(from, to)
127 | }
128 | 
// eliminateRepetition reduces sequences of repeating bytes
// longer than 3 bytes to length 3.  The input is left untouched; the
// result is a newly allocated slice.  Inputs shorter than three bytes,
// including empty ones, are returned as an unchanged copy.
func eliminateRepetition(from []byte) (to []byte) {
	to = make([]byte, 0, len(from))

	for i, b := range from {
		// Drop a byte only when the three bytes before it in the input are
		// all identical to it.
		if i >= 3 && from[i-3] == b && from[i-2] == b && from[i-1] == b {
			continue
		}
		to = append(to, b)
	}

	return to
}
147 | 
// hasCommonSubstring returns true if the two byte slices
// passed have a common substring of at least seven bytes.
//
// It slides seq2 along seq1 one offset at a time and, for every
// alignment, counts the current run of matching bytes.
func hasCommonSubstring(seq1, seq2 []byte) (found bool) {
	for shift := len(seq1) - 7; shift >= 7-len(seq2); shift-- {
		// A positive shift skips the start of seq1, a negative one skips
		// the start of seq2.
		run := 0
		for i, j := max(0, shift), max(0, -shift); j < len(seq2) && i < len(seq1); i, j = i+1, j+1 {
			switch {
			case seq1[i] != seq2[j]:
				run = 0
			case run == 6:
				// Six matches already counted plus this one makes seven.
				return true
			default:
				run++
			}
		}
	}
	return false
}
169 | 
170 | // min returns the minimum of its arguments
171 | func min(args ...int) int {
172 | 	min := int(math.MaxInt32)
173 | 	for _, m := range args {
174 | 		if m < min {
175 | 			min = m
176 | 		}
177 | 	}
178 | 	return min
179 | }
180 | 
181 | // max returns the maximum of its arguments
182 | func max(args ...int) int {
183 | 	max := int(-math.MaxInt32)
184 | 	for _, m := range args {
185 | 		if m > max {
186 | 			max = m
187 | 		}
188 | 	}
189 | 	return max
190 | }
191 | 


--------------------------------------------------------------------------------
/spamsum_compare_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013, Michiel Buddingh, All rights reserved.
  2 | // Use of this code is governed by version 2.0 or later of the Apache
  3 | // License, available at http://www.apache.org/licenses/LICENSE-2.0
  4 | 
  5 | package spamsum
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"testing"
 10 | )
 11 | 
 12 | func TestEliminateRepetition(t *testing.T) {
 13 | 	teststrings := []struct {
 14 | 		input, expected string
 15 | 	}{
 16 | 		{"AAAABC", "AAABC"},
 17 | 		{"Qddddddddd", "Qddd"},
 18 | 		{"AtrU||||v*****pn", "AtrU|||v***pn"},
 19 | 	}
 20 | 
 21 | 	for _, pair := range teststrings {
 22 | 		shortened := string(eliminateRepetition([]byte(pair.input)))
 23 | 		if shortened != pair.expected {
 24 | 			t.Errorf("%v shortened should be %v, is %v", pair.input, pair.expected, shortened)
 25 | 		}
 26 | 	}
 27 | }
 28 | 
 29 | func TestHasCommonSubstring(t *testing.T) {
 30 | 	tests := []struct {
 31 | 		left, right string
 32 | 		expected    bool
 33 | 	}{
 34 | 		{"Hello, world", "Hello there", false},
 35 | 		{"abcdefg", "abcdefg", true},
 36 | 		{"", "", false},
 37 | 		{"0123456789ABCDEF", "ABCDEF0123456789", true},
 38 | 		{"321abcdefg321", "abcdefg", true},
 39 | 		{"123b4567", "123c4567", false},
 40 | 	}
 41 | 
 42 | 	for _, test := range tests {
 43 | 		result := hasCommonSubstring([]byte(test.left), []byte(test.right))
 44 | 		if result != test.expected {
 45 | 			condition := "not "
 46 | 			if test.expected {
 47 | 				condition = ""
 48 | 			}
 49 | 			t.Errorf("\"%v\" and \"%v\" should %shave a common substring of length 7", test.left, test.right, condition)
 50 | 		}
 51 | 		mirroredResult := hasCommonSubstring([]byte(test.right), []byte(test.left))
 52 | 		if mirroredResult != result {
 53 | 			t.Errorf("Symmetry error for %v and %v", test.left, test.right)
 54 | 		}
 55 | 	}
 56 | }
 57 | 
 58 | func TestEditDistance(t *testing.T) {
 59 | 	tests := []struct {
 60 | 		left, right   string
 61 | 		dist_expected int
 62 | 	}{
 63 | 		{"abcdefg", "abcdefg", 0},
 64 | 		{"abcdefg", "abcqefg", 2},
 65 | 		{"ABCDEFG", "ABCEDFG", 2},
 66 | 		{"ooooAAA", "AAAoooo", 6},
 67 | 		{"oAoooAA", "AAoooAo", 4},
 68 | 		{"", "1234567", 7},
 69 | 		{"", "", 0},
 70 | 		{"HIJKLMN", "JKLMNOPQRST", 8},
 71 | 		{"UVxeXup8VuH8rD//pcrHBrlG5FWgYJ70A",
 72 | 			"kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A", 7},
 73 | 		{"O4XuptH8D//pcrHmgfL", "e4XuptH8D//4crHMmUfL", 7},
 74 | 		{"kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A",
 75 | 			"kVxeXup8VuH8rD//4crHBrlGXm5WGYJ70A", 2},
 76 | 		{"2Ewd+NvN88y3GdkvBC+9lKMHhDh",
 77 | 			"2Ewd+NvNrgdkvBC+9lKMHhDh", 7},
 78 | 		{"vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC",
 79 | 			"xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX", 51},
 80 | 	}
 81 | 
 82 | 	for _, test := range tests {
 83 | 		result := editDistance([]byte(test.left), []byte(test.right))
 84 | 		if result != test.dist_expected {
 85 | 			t.Errorf("\"%v\" and \"%v\" should have a distance of %d, was %d", test.left, test.right, test.dist_expected, result)
 86 | 		}
 87 | 		mirroredResult := editDistance([]byte(test.left), []byte(test.right))
 88 | 		if mirroredResult != result {
 89 | 			t.Errorf("Symmetry error, editDistance(%s, %s) should be editDistance(%s, %s)", test.left, test.right, test.right, test.left)
 90 | 		}
 91 | 	}
 92 | }
 93 | 
 94 | func TestScore(t *testing.T) {
 95 | 	tests := []struct {
 96 | 		left, right    string
 97 | 		blocksize      int
 98 | 		score_expected int
 99 | 	}{
100 | 		{"2Ewd+NvN88y3GdkvBC+9lKMHhDh",
101 | 			"2Ewd+NvNrgdkvBC+9lKMHhDh", 6, 48},
102 | 		{"7iExTmgeXCcGYX1CRRX1PRRX88p0RRpdV/ISGcEvNOk+l/oX9QUopsAoX9QUopIo",
103 | 			"7iExTmgeXCcGYX1CRRX1PRRXrZGcEvNOk+l/oX9QUopsAoX9QUopIHKl057DRMHD",
104 | 			12, 80},
105 | 		{"vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC",
106 | 			"xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX", 24, 43},
107 | 	}
108 | 	for _, test := range tests {
109 | 		result := score([]byte(test.left), []byte(test.right), test.blocksize)
110 | 		if result != test.score_expected {
111 | 			t.Errorf("\"%v\" and \"%v\" should have a score of %d, was %d", test.left, test.right, test.score_expected, result)
112 | 		}
113 | 	}
114 | 
115 | }
116 | 
117 | func TestCompare(t *testing.T) {
118 | 	tests := []struct {
119 | 		left, right         string
120 | 		similarity_expected uint32
121 | 	}{
122 | 		// these are not values produced by the original spamsum
123 | 		// score algorithm
124 | 		{
125 | 
126 | 			"12582912:UVxeXup8VuH8rD//pcrHBrlG5FWgYJ70A:O4XuptH8D//pcrHmgfL",
127 | 			"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL",
128 | 			91},
129 | 
130 | 		{"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL",
131 | 			"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WGYJ70A:e4XuptH8D//4crHMMUfL",
132 | 			99},
133 | 		// different block sizes
134 | 		{"96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK",
135 | 			"192:aaf6PU8YMnElrcULdSWgbqs0uBb1IIAfsR6OZWjZDx:aaf6PUcYrfLdSWgms0uBb1TA0lZ8ZDx", 80},
136 | 		// different block sizes reversed
137 | 		{"192:aaf6PU8YMnElrcULdSWgbqs0uBb1IIAfsR6OZWjZDx:aaf6PUcYrfLdSWgms0uBb1TA0lZ8ZDx",
138 | 			"96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK", 80},
139 | 		// Uncomparable due to different sizes, should be 0
140 | 		{"12582912:kVxeXup8VuH8rD//4crHBrlGXm5WgYJ70A:e4XuptH8D//4crHMmUfL",
141 | 			"96:aaUi0DTEnLMZMVd2jnEMyFrsdy9LdeGatg3Uogbqs0uBUZoXLn1IvwwDaK:aaf0PU8YMnElrcULdSWgbqs0uBb1IIK", 0},
142 | 
143 | 		{"48:wX0GLBZET14EHWFIUXs0hPbaL3RdNhI6h0:wPLBS4EecWT6hdNhs",
144 | 			"48:w+wNj5GLBX/8jrT14EHWFIUXs0hPbaL3qd9hI6h0:w+zLBX/w14EecWT6ad9hs", 77},
145 | 		{"12:7iExTmgeXCcGYX1CRRX1PRRX88p0RRpdV/ISGcEvNOk+l/oX9QUopsAoX9QUopIo:2Ewd+NvN88y3GdkvBC+9lKMHhDh", "12:7iExTmgeXCcGYX1CRRX1PRRXrZGcEvNOk+l/oX9QUopsAoX9QUopIHKl057DRMHD:2Ewd+NvNrgdkvBC+9lKMHhDh", 88},
146 | 		{"24:R9mMhMDnWm8m86dmW4zm8mW4zm/mhkcnZ/uLkcHrBCaDrvNQxhwQmq8SywwboX+6:vEnWHH6d/4H/4Z2fvNoF8Sy2yt/YUC",
147 | 			"48:xLnWHH6d/4H/4HHHHHHHH4CnrJuN0QhsSyjTU9/j4hbp96khuYhwX:NWHH6dQHQHHHHHHHH4CnV1QeSyj8j4hG",
148 | 			43},
149 | 	}
150 | 
151 | 	for _, test := range tests {
152 | 		var left, right SpamSum
153 | 		if _, err := fmt.Sscan(test.left, &left); err != nil {
154 | 			t.Errorf("Could not scan string %s, %v", test.left, err)
155 | 		}
156 | 		if _, err := fmt.Sscan(test.right, &right); err != nil {
157 | 			t.Errorf("Could not scan string %s, %v", test.right, err)
158 | 		}
159 | 		similarity := left.Compare(right)
160 | 		if similarity != test.similarity_expected {
161 | 			t.Errorf("%v, %v\nSimilariy score should be %d, was %d", left, right, test.similarity_expected, similarity)
162 | 		}
163 | 	}
164 | }
165 | 


--------------------------------------------------------------------------------
/extra_tests/edit_dist.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   This edit distance code is taken from trn3.6. A few minor
  3 |   modifications have been made by Andrew Tridgell <tridge@samba.org>
  4 |   for use in spamsum.
  5 | */
  6 | 
  7 | 
  8 | /***************************************************************************/
  9 | 
 10 | 
 11 | /* The authors make no claims as to the fitness or correctness of this software
 12 |  * for any use whatsoever, and it is provided as is. Any use of this software
 13 |  * is at the user's own risk.
 14 |  */
 15 | 
 16 | #include <stdio.h>
 17 | #include <unistd.h>
 18 | #include <stdlib.h>
 19 | 
 20 | /* edit_dist -- returns the minimum edit distance between two strings
 21 | 
 22 | 	Program by:  Mark Maimone   CMU Computer Science   13 Nov 89
 23 | 	Last Modified:  28 Jan 90
 24 | 
 25 |    If the input strings have length n and m, the algorithm runs in time
 26 |    O(nm) and space O(min(m,n)).
 27 | 
 28 | HISTORY
 29 |    13 Nov 89 (mwm) Created edit_dist() and set_costs().
 30 | 
 31 |    28 Jan 90 (mwm) Added view_costs().  Should verify that THRESHOLD
 32 |    computations will work even when THRESHOLD is not a multiple of
 33 |    sizeof(int).
 34 | 
 35 |    17 May 93 (mwm) Improved performance when used with trn's newsgroup
 36 |    processing; assume all costs are 1, and you can terminate when a
 37 |    threshold is exceeded.
 38 | */
 39 | 
 40 | #define MIN_DIST 100
 41 | 
 42 | #define	TRN_SPEEDUP		/* Use a less-general version of the
 43 | 				   routine, one that's better for trn.
 44 | 				   All change costs are 1, and it's okay
 45 | 				   to terminate if the edit distance is
 46 | 				   known to exceed MIN_DIST */
 47 | 
 48 | #define THRESHOLD 4000		/* worry about allocating more memory only
 49 | 				   when this # of bytes is exceeded */
 50 | #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
 51 | 
 52 | #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
 53 | 
 54 | #define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
 55 | #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
 56 | #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
 57 | #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
 58 | 
 59 | 
 60 | static int insert_cost = 1;
 61 | static int delete_cost = 1;
 62 | #ifndef TRN_SPEEDUP
 63 | static int change_cost = 1;
 64 | static int swap_cost   = 1;
 65 | #endif
 66 | 
 67 | static int _iswap;			/* swap_int temp variable */
 68 | static char *_cswap;			/* swap_char temp variable */
 69 | static int _mx, _my, _mz;		/* min2, min3 temp variables */
 70 | 
 71 | 
 72 | 
 73 | /* edit_distn -- returns the edit distance between two strings, or -1 on
 74 |    failure */
 75 | 
 76 | int
 77 | edit_distn(from, from_len, to, to_len)
 78 | char *from, *to;
 79 | register int from_len, to_len;
 80 | {
 81 | #ifndef TRN_SPEEDUP
 82 |     register int ins, del, ch;	  	/* local copies of edit costs */
 83 | #endif
 84 |     register int row, col, index;	/* dynamic programming counters */
 85 |     register int radix;			/* radix for modular indexing */
 86 | #ifdef TRN_SPEEDUP
 87 |     register int low;
 88 | #endif
 89 |     int *buffer;			/* pointer to storage for one row
 90 | 					   of the d.p. array */
 91 |     static int store[THRESHOLD / sizeof (int)];
 92 | 					/* a small amount of static
 93 | 					   storage, to be used when the
 94 | 					   input strings are small enough */
 95 | 
 96 | /* Handle trivial cases when one string is empty */
 97 | 
 98 |     if (from == NULL || !from_len)
 99 | 	if (to == NULL || !to_len)
100 | 	    return 0;
101 | 	else
102 | 	    return to_len * insert_cost;
103 |     else if (to == NULL || !to_len)
104 | 	return from_len * delete_cost;
105 | 
106 | /* Initialize registers */
107 | 
108 |     radix = 2 * from_len + 3;
109 | #ifdef TRN_SPEEDUP
110 | #define ins 1
111 | #define del 1
112 | #define ch 3
113 | #define swap_cost 5
114 | #else
115 |     ins  = insert_cost;
116 |     del  = delete_cost;
117 |     ch   = change_cost;
118 | #endif
119 | 
120 | /* Make   from   short enough to fit in the static storage, if it's at all
121 |    possible */
122 | 
123 |     if (from_len > to_len && from_len > STRLENTHRESHOLD) {
124 | 	swap_int(from_len, to_len);
125 | 	swap_char(from, to);
126 | #ifndef TRN_SPEEDUP
127 | 	swap_int(ins, del);
128 | #endif
129 |     } /* if from_len > to_len */
130 | 
131 | /* Allocate the array storage (from the heap if necessary) */
132 | 
133 |     if (from_len <= STRLENTHRESHOLD)
134 | 	buffer = store;
135 |     else
136 | 	buffer = (int *) malloc(radix * sizeof (int));
137 | 
138 | /* Here's where the fun begins.  We will find the minimum edit distance
139 |    using dynamic programming.  We only need to store two rows of the matrix
140 |    at a time, since we always progress down the matrix.  For example,
141 |    given the strings "one" and "two", and insert, delete and change costs
142 |    equal to 1:
143 | 
144 | 	   _  o  n  e
145 | 	_  0  1  2  3
146 | 	t  1  1  2  3
147 | 	w  2  2  2  3
148 | 	o  3  2  3  3
149 | 
150 |    The dynamic programming recursion is defined as follows:
151 | 
152 | 	ar(x,0) := x * insert_cost
153 | 	ar(0,y) := y * delete_cost
154 | 	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
155 | 		       a(x - 1, y) + insert_cost,
156 | 		       a(x, y - 1) + delete_cost,
157 | 		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
158 | 					  from[x-1] == to[y] ? swap_cost :
159 | 					  infinity))
160 | 
161 |    Since this only looks at most two rows and three columns back, we need
162 |    only store the values for the two preceeding rows.  In this
163 |    implementation, we do not explicitly store the zero column, so only 2 *
164 |    from_len + 2   words are needed.  However, in the implementation of the
165 |    swap_cost   check, the current matrix value is used as a buffer; we
166 |    can't overwrite the earlier value until the   swap_cost   check has
167 |    been performed.  So we use   2 * from_len + 3   elements in the buffer.
168 | */
169 | 
170 | #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
171 | 	buffer[mod(index)]))
172 | #define NW(x,y)	  ar(x, y, index + from_len + 2)
173 | #define N(x,y)	  ar(x, y, index + from_len + 3)
174 | #define W(x,y)	  ar(x, y, index + radix - 1)
175 | #define NNWW(x,y) ar(x, y, index + 1)
176 | #define mod(x) ((x) % radix)
177 | 
178 |     index = 0;
179 | 
180 | #ifdef DEBUG_EDITDIST
181 |     printf("      ");
182 |     for (col = 0; col < from_len; col++)
183 | 	printf(" %c ", from[col]);
184 |     printf("\n   ");
185 | 
186 |     for (col = 0; col <= from_len; col++)
187 | 	printf("%2d ", col * del);
188 | #endif
189 | 
190 | /* Row 0 is handled implicitly; its value at a given column is   col*del.
191 |    The loop below computes the values for Row 1.  At this point we know the
192 |    strings are nonempty.  We also don't need to consider swap costs in row
193 |    1.
194 | 
195 |    COMMENT:  the indicies   row and col   below point into the STRING, so
196 |    the corresponding MATRIX indicies are   row+1 and col+1.
197 | */
198 | 
199 |     buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
200 | #ifdef TRN_SPEEDUP
201 |     low = buffer[mod(index + radix - 1)];
202 | #endif
203 | 
204 | #ifdef DEBUG_EDITDIST
205 |     printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
206 | #endif
207 | 
208 |     for (col = 1; col < from_len; col++) {
209 | 	buffer[index] = min3(
210 | 		col * del + ((from[col] == to[0]) ? 0 : ch),
211 | 		(col + 1) * del + ins,
212 | 		buffer[index - 1] + del);
213 | #ifdef TRN_SPEEDUP
214 | 	if (buffer[index] < low)
215 | 	    low = buffer[index];
216 | #endif
217 | 	index++;
218 | 
219 | #ifdef DEBUG_EDITDIST
220 | 	printf("%2d ", buffer[index - 1]);
221 | #endif
222 | 
223 |     } /* for col = 1 */
224 | 
225 | #ifdef DEBUG_EDITDIST
226 |     printf("\n %c %2d ", to[1], 2 * ins);
227 | #endif
228 | 
229 | /* Now handle the rest of the matrix */
230 | 
231 |     for (row = 1; row < to_len; row++) {
232 | 	for (col = 0; col < from_len; col++) {
233 | 	    buffer[index] = min3(
234 | 		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
235 | 		    N(row, col + 1) + ins,
236 | 		    W(row + 1, col) + del);
237 | 	    if (from[col] == to[row - 1] && col > 0 &&
238 | 		    from[col - 1] == to[row])
239 | 		buffer[index] = min2(buffer[index],
240 | 			NNWW(row - 1, col - 1) + swap_cost);
241 | 
242 | #ifdef DEBUG_EDITDIST
243 | 	    printf("%2d ", buffer[index]);
244 | #endif
245 | #ifdef TRN_SPEEDUP
246 | 	    if (buffer[index] < low || col == 0)
247 | 		low = buffer[index];
248 | #endif
249 | 
250 | 	    index = mod(index + 1);
251 | 	} /* for col = 1 */
252 | #ifdef DEBUG_EDITDIST
253 | 	if (row < to_len - 1)
254 | 	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
255 | 	else
256 | 	    printf("\n");
257 | #endif
258 | #ifdef TRN_SPEEDUP
259 | 	if (low > MIN_DIST)
260 | 	    break;
261 | #endif
262 |     } /* for row = 1 */
263 | 
264 |     row = buffer[mod(index + radix - 1)];
265 |     if (buffer != store)
266 | 	free((char *) buffer);
267 |     return row;
268 | } /* edit_distn */
269 | 


--------------------------------------------------------------------------------
/spamsum.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2013, Michiel Buddingh, All rights reserved.  Use of this
  2 | // code is governed by version 2.0 or later of the Apache License,
  3 | // available at http://www.apache.org/licenses/LICENSE-2.0
  4 | 
  5 | // Package spamsum implements Andrew Tridgell's fuzzy hash for spam
  6 | // detection.  The output should be identical to that produced by
  7 | // existing tools like ssdeep.
  8 | package spamsum
  9 | 
 10 | import (
 11 | 	"bytes"
 12 | 	"errors"
 13 | 	"fmt"
 14 | 	"io"
 15 | 	"strconv"
 16 | 	"unicode"
 17 | )
 18 | 
const (
	// rollingWindow is the size, in bytes, of the window kept by the
	// rolling hash in spamsumState.
	rollingWindow = 7
	// minBlockSize is the block size HashReadSeeker starts from and never
	// goes below.
	minBlockSize  = 3
	// SpamsumLength is the capacity of the first digest part; the second
	// part holds half as many characters.
	SpamsumLength = 64
	// ReadSize is the size of the buffer HashReadSeeker reads into.
	ReadSize      = 8192
	// offset32 is the value the two block hashes are (re)initialised to.
	offset32      = uint32(0x28021967)
	// prime32 is the multiplier applied to the block hashes for each
	// input byte.
	prime32       = uint32(16777619)
)
 27 | 
// SpamSum holds a spamsum: a block size plus two digest parts, the first
// built with that block size and the second with twice that block size.
type SpamSum struct {
	// blocksize is the block size the left part was computed with.
	blocksize             uint32
	// leftPart and rightPart hold the base64 digest characters; unused
	// trailing bytes are zero.
	leftPart              [SpamsumLength]byte
	rightPart             [SpamsumLength / 2]byte
	// leftIndex and rightIndex are the positions in leftPart and rightPart
	// that the next digest character is written to.
	leftIndex, rightIndex int
}
 34 | 
 35 | // String produces the canonical representation of a spamsum. a
 36 | // positive number indicating the block size, up to 64 base64
 37 | // characters each encoding a 6-bit hash of an (approximately)
 38 | // BlockSize()-sized block, up to 32 base64 characters encoding a
 39 | // 6-bit hash for BlockSize() * 2.  The three fields are separated by
 40 | // a colon.
 41 | func (ss *SpamSum) String() string {
 42 | 	return fmt.Sprintf("%d:%s:%s",
 43 | 		ss.blocksize,
 44 | 		string(ss.leftPart[:nonZeroLength(ss.leftPart[:])]),
 45 | 		string(ss.rightPart[:nonZeroLength(ss.rightPart[:])]))
 46 | }
 47 | 
 48 | // BlockSize returns the approximate block size used in this sum.
 49 | // Note that this size is only the expected value.  The individual
 50 | // 6-bit block hashes may encode far smaller or far larger blocks.
 51 | func (ss *SpamSum) BlockSize() int {
 52 | 	return int(ss.blocksize)
 53 | }
 54 | 
 55 | // HashBytes takes a byte slice, and takes its SpamSum, calculating
 56 | // the optimal block size in several passes.  Since adding more data
 57 | // to such a sum would invalidate the block size calculation, this
 58 | // SpamSum can not be added to.
 59 | func HashBytes(b []byte) *SpamSum {
 60 | 	wrapper := io.NewSectionReader(bytes.NewReader(b), 0, int64(len(b)))
 61 | 	// we discard the error, since they won't be produced
 62 | 	// for an in-memory byte slice
 63 | 	result, _ := HashReadSeeker(wrapper, wrapper.Size())
 64 | 	return result
 65 | }
 66 | 
 67 | // HashReadSeeker requires an implementation of io.ReadSeeker, and a length
 68 | // value indicating its size, and takes its SpamSum, calculating
 69 | // the optimal block size in several passes.  It is assumed that Seeks upto
 70 | // the specified length are allowed. Since adding more data
 71 | // to such a sum would invalidate the block size calculation, this
 72 | // SpamSum can not be added to.  Any errors returned will originate
 73 | // from the implementation of ReadSeeker.
 74 | func HashReadSeeker(source io.ReadSeeker, length int64) (*SpamSum, error) {
 75 | 	sum := new(SpamSum)
 76 | 	sum.blocksize = minBlockSize
 77 | 
 78 | 	for int64(sum.blocksize*SpamsumLength) < length {
 79 | 		sum.blocksize *= 2
 80 | 	}
 81 | 
 82 | 	sss := spamsumState{}
 83 | source_iteration:
 84 | 	for {
 85 | 		sss.reset()
 86 | 		sum.reset()
 87 | 
 88 | 		if _, err := source.Seek(0, 0); err != nil {
 89 | 			return nil, err
 90 | 		}
 91 | 		block := make([]byte, ReadSize)
 92 | 
 93 | 	block_read_loop:
 94 | 		for {
 95 | 			var num int
 96 | 			var err error
 97 | 			if num, err = source.Read(block); num == 0 {
 98 | 				break block_read_loop
 99 | 			} else {
100 | 				processBlock(block, num, &sss, sum)
101 | 			}
102 | 
103 | 			if err != nil {
104 | 				return nil, err
105 | 			}
106 | 		}
107 | 
108 | 		writeTail(&sss, sum)
109 | 
110 | 		if sum.blocksize > minBlockSize && sum.leftIndex < (SpamsumLength/2) {
111 | 			sum.blocksize /= 2
112 | 		} else {
113 | 			break source_iteration
114 | 		}
115 | 	}
116 | 
117 | 	return sum, nil
118 | }
119 | 
// spamsumState carries the running hash state while input is processed.
type spamsumState struct {
	// fields for the rolling hash: window holds the most recent
	// rollingWindow input bytes and position counts the bytes seen so
	// far; rollingSum, h2 and shiftHash are the three components that
	// are added together to form the rolling hash value.
	window                              [rollingWindow]byte
	rollingSum, h2, shiftHash, position uint32

	// FNV-1 style hash fields: left feeds the first digest part, right
	// the second (double block size) part.
	left, right uint32
}
128 | 
// b64 is the 64-character alphabet used for digest characters; a block
// hash is encoded as b64[hash%64].
const b64 string = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
130 | 
131 | func processBlock(block []byte, length int, sss *spamsumState, sum *SpamSum) {
132 | 	for i := 0; i < length; i++ {
133 | 		sss.h2 -= sss.rollingSum
134 | 		sss.h2 += rollingWindow * uint32(block[i])
135 | 
136 | 		sss.rollingSum += uint32(block[i])
137 | 		sss.rollingSum -= uint32(sss.window[sss.position%rollingWindow])
138 | 
139 | 		sss.window[sss.position%rollingWindow] = block[i]
140 | 		sss.position += 1
141 | 
142 | 		sss.shiftHash <<= 5
143 | 		sss.shiftHash ^= uint32(block[i])
144 | 
145 | 		roll := sss.rollingSum + sss.h2 + sss.shiftHash
146 | 
147 | 		// left and right are Fowler/Noll/Vo-1 hashes with a
148 | 		// slightly different starting value.
149 | 		sss.left *= prime32
150 | 		sss.left ^= uint32(block[i])
151 | 
152 | 		sss.right *= prime32
153 | 		sss.right ^= uint32(block[i])
154 | 
155 | 		// Assuming the output of the rolling sum is uniformly
156 | 		// distributed, this condition will occur once every
157 | 		// blocksize bytes.  This means that the expected value
158 | 		// for the length of the blocks hashed is blocksize.
159 | 		if roll%sum.blocksize == (sum.blocksize - 1) {
160 | 			sum.leftPart[sum.leftIndex] = b64[sss.left%64]
161 | 			// Note that this means that the first 63 bytes of the
162 | 			// hash will encode the first 63*blocksize blocks,
163 | 			// and the last byte will encode the remainder, be it
164 | 			// one block, or 4GB.
165 | 			if sum.leftIndex < SpamsumLength-1 {
166 | 				sum.leftIndex += 1
167 | 				sss.left = offset32
168 | 			}
169 | 		}
170 | 
171 | 		// As for the previous condition, but for blocksize * 2
172 | 		if roll%(sum.blocksize*2) == ((sum.blocksize * 2) - 1) {
173 | 			sum.rightPart[sum.rightIndex] = b64[sss.right%64]
174 | 			if sum.rightIndex < (SpamsumLength/2)-1 {
175 | 				sum.rightIndex += 1
176 | 				sss.right = offset32
177 | 			}
178 | 		}
179 | 	}
180 | }
181 | 
182 | func writeTail(sss *spamsumState, sum *SpamSum) {
183 | 	roll := sss.rollingSum + sss.h2 + sss.shiftHash
184 | 	if roll != 0 {
185 | 		sum.leftPart[sum.leftIndex] = b64[sss.left%64]
186 | 		sum.rightPart[sum.rightIndex] = b64[sss.right%64]
187 | 	}
188 | }
189 | 
190 | func (sss *spamsumState) reset() {
191 | 	for i := range sss.window {
192 | 		sss.window[i] = 0
193 | 	}
194 | 
195 | 	sss.rollingSum = 0
196 | 	sss.h2 = 0
197 | 	sss.shiftHash = 0
198 | 	sss.position = 0
199 | 
200 | 	sss.left = offset32
201 | 	sss.right = offset32
202 | }
203 | 
204 | func (sum *SpamSum) reset() {
205 | 	for i := range sum.leftPart {
206 | 		sum.leftPart[i] = 0
207 | 	}
208 | 
209 | 	for i := range sum.rightPart {
210 | 		sum.rightPart[i] = 0
211 | 	}
212 | 
213 | 	sum.leftIndex, sum.rightIndex = 0, 0
214 | }
215 | 
// nonZeroLength returns the number of bytes in array that come before the
// first zero byte, or len(array) if it contains none.
func nonZeroLength(array []byte) (r int) {
	for r < len(array) && array[r] != 0 {
		r++
	}
	return r
}
225 | 
226 | func (sum *SpamSum) Scan(state fmt.ScanState, verb rune) error {
227 | 	var blocksize int
228 | 	var leftPart, rightPart, blockPart, buffer []byte
229 | 	var err error
230 | 
231 | 	if blockPart, err = state.Token(false, // do not skip spaces
232 | 		func(r rune) bool {
233 | 			return unicode.IsDigit(r)
234 | 		}); err != nil {
235 | 		return err
236 | 	} else if len(blockPart) == 0 {
237 | 		return errors.New("Cannot read block size.")
238 | 	}
239 | 
240 | 	if blocksize, err = strconv.Atoi(string(blockPart)); err != nil {
241 | 		return err
242 | 	} else if blocksize < 3 {
243 | 		return errors.New("Block size too small")
244 | 	}
245 | 
246 | 	if r, _, err := state.ReadRune(); err != nil {
247 | 		return err
248 | 	} else if r != ':' {
249 | 		return errors.New("Invalid token delimiter")
250 | 	}
251 | 
252 | 	if buffer, err = state.Token(false, // do not skip spaces
253 | 		func(r rune) bool {
254 | 			return (bytes.IndexRune([]byte(b64), r) != -1)
255 | 		}); err != nil {
256 | 		return err
257 | 	} else if len(buffer) > SpamsumLength {
258 | 		return errors.New("First base64 string too long")
259 | 	}
260 | 
261 | 	leftPart = make([]byte, len(buffer))
262 | 	copy(leftPart, buffer[:])
263 | 
264 | 	if r, _, err := state.ReadRune(); err != nil {
265 | 		return err
266 | 	} else if r != ':' {
267 | 		return errors.New("Invalid token delimiter")
268 | 	}
269 | 
270 | 	if buffer, err = state.Token(false, // do not skip spaces
271 | 		func(r rune) bool {
272 | 			return (bytes.IndexRune([]byte(b64), r) != -1)
273 | 		}); err != nil {
274 | 		return err
275 | 	} else if len(buffer) > (SpamsumLength / 2) {
276 | 		return errors.New("Second base64 string too long")
277 | 	}
278 | 
279 | 	rightPart = make([]byte, len(buffer))
280 | 	copy(rightPart[:], buffer)
281 | 
282 | 	sum.blocksize = uint32(blocksize)
283 | 	copy(sum.leftPart[:], leftPart)
284 | 	copy(sum.rightPart[:], rightPart)
285 | 	sum.leftIndex = len(leftPart)
286 | 	sum.rightIndex = len(rightPart)
287 | 
288 | 	return nil
289 | }
290 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/extra_tests/spamsum.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   this is a checksum routine that is specifically designed for spam.
  3 |   Copyright Andrew Tridgell <tridge@samba.org> 2002
  4 | 
  5 |   This code is released under the GNU General Public License version 2
  6 |   or later.  Alternatively, you may also use this code under the terms
  7 |   of the Perl Artistic license.
  8 | 
  9 |   If you wish to distribute this code under the terms of a different
 10 |   free software license then please ask me. If there is a good reason
 11 |   then I will probably say yes.
 12 | */
 13 | #include <stdio.h>
 14 | #include <stdlib.h>
 15 | #include <string.h>
 16 | #include <fcntl.h>
 17 | #include <errno.h>
 18 | #include <sys/mman.h>
 19 | #include <sys/stat.h>
 20 | #include <unistd.h>
 21 | #include <ctype.h>
 22 | #include "spamsum.h"
 23 | 
 24 | /* the output is a string of length 64 in base64 */
 25 | #define SPAMSUM_LENGTH 64
 26 | 
 27 | #define MIN_BLOCKSIZE 3
 28 | #define HASH_PRIME 0x01000193
 29 | #define HASH_INIT 0x28021967
 30 | #define ROLLING_WINDOW 7
 31 | 
 32 | #ifndef MIN
 33 | #define MIN(a,b) ((a)<(b)?(a):(b))
 34 | #endif
 35 | 
 36 | #ifndef MAX
 37 | #define MAX(a,b) ((a)>(b)?(a):(b))
 38 | #endif
 39 | 
 40 | 
 41 | static struct {
 42 | 	uchar window[ROLLING_WINDOW];
 43 | 	u32 h1, h2, h3;
 44 | 	u32 n;
 45 | } roll_state;
 46 | 
 47 | /*
 48 |   a rolling hash, based on the Adler checksum. By using a rolling hash
 49 |   we can perform auto resynchronisation after inserts/deletes
 50 | 
 51 |   internally, h1 is the sum of the bytes in the window and h2
 52 |   is the sum of the bytes times the index
 53 | 
 54 |   h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
 55 |   we can cope with large blocksize values
 56 | */
 57 | static inline u32 roll_hash(uchar c)
 58 | {
 59 | 	roll_state.h2 -= roll_state.h1;
 60 | 	roll_state.h2 += ROLLING_WINDOW * c;
 61 | 
 62 | 	roll_state.h1 += c;
 63 | 	roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
 64 | 
 65 | 	roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
 66 | 	roll_state.n++;
 67 | 
 68 | 	roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
 69 | 	roll_state.h3 ^= c;
 70 | 
 71 | 	return roll_state.h1 + roll_state.h2 + roll_state.h3;
 72 | }
 73 | 
 74 | /*
 75 |   reset the state of the rolling hash and return the initial rolling hash value
 76 | */
 77 | static u32 roll_reset(void)
 78 | {
 79 | 	memset(&roll_state, 0, sizeof(roll_state));
 80 | 	return 0;
 81 | }
 82 | 
 83 | /* a simple non-rolling hash, based on the FNV hash */
 84 | static inline u32 sum_hash(uchar c, u32 h)
 85 | {
 86 | 	h *= HASH_PRIME;
 87 | 	h ^= c;
 88 | 	return h;
 89 | }
 90 | 
 91 | /*
 92 |   take a message of length 'length' and return a string representing a hash of that message,
 93 |   prefixed by the selected blocksize
 94 | */
 95 | char *spamsum(const uchar *in, size_t length, u32 flags, u32 bsize)
 96 | {
 97 | 	const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 98 | 	char *ret, *p;
 99 | 	u32 total_chars;
100 | 	u32 h, h2, h3;
101 | 	u32 j, n, i, k;
102 | 	u32 block_size;
103 | 	uchar ret2[SPAMSUM_LENGTH/2 + 1];
104 | 
105 | 	/* if we are ignoring email headers then skip past them now */
106 | 	if (flags & FLAG_IGNORE_HEADERS) {
107 | 		const uchar *s = strstr(in, "\n\n");
108 | 		if (s) {
109 | 			length -= (s+2 - in);
110 | 			in = s+2;
111 | 		}
112 | 	}
113 | 
114 | 	if (flags & FLAG_IGNORE_WHITESPACE) {
115 | 		/* count the non-ignored chars */
116 | 		for (n=0, i=0; i<length; i++) {
117 | 			if (isspace(in[i])) continue;
118 | 			n++;
119 | 		}
120 | 		total_chars = n;
121 | 	} else {
122 | 		total_chars = length;
123 | 	}
124 | 
125 | 	if (bsize == 0) {
126 | 	/* guess a reasonable block size */
127 | 		block_size = MIN_BLOCKSIZE;
128 | 		while (block_size * SPAMSUM_LENGTH < total_chars) {
129 | 			block_size = block_size * 2;
130 | 		}
131 | 	} else {
132 | 		block_size = bsize;
133 | 	}
134 | 
135 | 	ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
136 | 	if (!ret) return NULL;
137 | 
138 | again:
139 | 	/* the first part of the spamsum signature is the blocksize */
140 | 	snprintf(ret, 12, "%u:", block_size);
141 | 	p = ret + strlen(ret);
142 | 
143 | 	memset(p, 0, SPAMSUM_LENGTH+1);
144 | 	memset(ret2, 0, sizeof(ret2));
145 | 
146 | 	k = j = 0;
147 | 	h3 = h2 = HASH_INIT;
148 | 	h = roll_reset();
149 | 
150 | 	for (i=0; i<length; i++) {
151 | 		if ((flags & FLAG_IGNORE_WHITESPACE) &&
152 | 		    isspace(in[i])) continue;
153 | 
154 | 		/*
155 | 		   at each character we update the rolling hash and
156 | 		   the normal hash. When the rolling hash hits the
157 | 		   reset value then we emit the normal hash as a
158 | 		   element of the signature and reset both hashes
159 | 		*/
160 | 		h = roll_hash(in[i]);
161 | 		h2 = sum_hash(in[i], h2);
162 | 		h3 = sum_hash(in[i], h3);
163 | 
164 | 		if (h % block_size == (block_size-1)) {
165 | 			/* we have hit a reset point. We now emit a
166 | 			   hash which is based on all chacaters in the
167 | 			   piece of the message between the last reset
168 | 			   point and this one */
169 | 			p[j] = b64[h2 % 64];
170 | 			if (j < SPAMSUM_LENGTH-1) {
171 | 				/* we can have a problem with the tail
172 | 				   overflowing. The easiest way to
173 | 				   cope with this is to only reset the
174 | 				   second hash if we have room for
175 | 				   more characters in our
176 | 				   signature. This has the effect of
177 | 				   combining the last few pieces of
178 | 				   the message into a single piece */
179 | 				h2 = HASH_INIT;
180 | 				j++;
181 | 			}
182 | 		}
183 | 
184 | 		/* this produces a second signature with a block size
185 | 		   of block_size*2. By producing dual signatures in
186 | 		   this way the effect of small changes in the message
187 | 		   size near a block size boundary is greatly reduced. */
188 | 		if (h % (block_size*2) == ((block_size*2)-1)) {
189 | 			ret2[k] = b64[h3 % 64];
190 | 			if (k < SPAMSUM_LENGTH/2-1) {
191 | 				h3 = HASH_INIT;
192 | 				k++;
193 | 			}
194 | 		}
195 | 	}
196 | 
197 | 	/* if we have anything left then add it to the end. This
198 | 	   ensures that the last part of the message is always
199 | 	   considered */
200 | 	if (h != 0) {
201 | 		p[j] = b64[h2 % 64];
202 | 		ret2[k] = b64[h3 % 64];
203 | 	}
204 | 
205 | 	strcat(p+j, ":");
206 | 	strcat(p+j, ret2);
207 | 
208 | 	/* our blocksize guess may have been way off - repeat if necessary */
209 | 	if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
210 | 		block_size = block_size / 2;
211 | 		goto again;
212 | 	}
213 | 
214 | 	return ret;
215 | }
216 | 
217 | 
218 | /*
219 |    we only accept a match if we have at least one common substring in
220 |    the signature of length ROLLING_WINDOW. This dramatically drops the
221 |    false positive rate for low score thresholds while having
222 |    negligable affect on the rate of spam detection.
223 | 
224 |    return 1 if the two strings do have a common substring, 0 otherwise
225 | */
226 | static int has_common_substring(const char *s1, const char *s2)
227 | {
228 | 	int i, j;
229 | 	int num_hashes;
230 | 	u32 hashes[SPAMSUM_LENGTH];
231 | 
232 | 	/* there are many possible algorithms for common substring
233 | 	   detection. In this case I am re-using the rolling hash code
234 | 	   to act as a filter for possible substring matches */
235 | 
236 | 	roll_reset();
237 | 	memset(hashes, 0, sizeof(hashes));
238 | 
239 | 	/* first compute the windowed rolling hash at each offset in
240 | 	   the first string */
241 | 	for (i=0;s1[i];i++) {
242 | 		hashes[i] = roll_hash((uchar)s1[i]);
243 | 	}
244 | 	num_hashes = i;
245 | 
246 | 	roll_reset();
247 | 
248 | 	/* now for each offset in the second string compute the
249 | 	   rolling hash and compare it to all of the rolling hashes
250 | 	   for the first string. If one matches then we have a
251 | 	   candidate substring match. We then confirm that match with
252 | 	   a direct string comparison */
253 | 	for (i=0;s2[i];i++) {
254 | 		u32 h = roll_hash((uchar)s2[i]);
255 | 		if (i < ROLLING_WINDOW-1) continue;
256 | 		for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {
257 | 			if (hashes[j] != 0 && hashes[j] == h) {
258 | 				/* we have a potential match - confirm it */
259 | 				if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
260 | 				    strncmp(s2+i-(ROLLING_WINDOW-1),
261 | 					    s1+j-(ROLLING_WINDOW-1),
262 | 					    ROLLING_WINDOW) == 0) {
263 | 					return 1;
264 | 				}
265 | 			}
266 | 		}
267 | 	}
268 | 
269 | 	return 0;
270 | }
271 | 
272 | 
/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly

  The first three characters are always kept. Every later character is
  kept unless it equals all three characters that precede it in 'str'.
  Strings shorter than three characters are copied unchanged.

  Returns a newly allocated string which the caller must free(), or
  NULL if the allocation fails.
*/
static char *eliminate_sequences(const char *str)
{
	const size_t len = strlen(str);
	/* number of leading characters that are copied unconditionally.
	   It must not exceed len, otherwise the terminator would be
	   written behind the end of the copy */
	const size_t head = (len < 3) ? len : 3;
	size_t i, j;
	char *ret;

	ret = malloc(len + 1);
	if (!ret) return NULL;

	memcpy(ret, str, head);
	j = head;

	for (i = 3; i < len; i++) {
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}
300 | 
301 | /*
302 |   this is the low level string scoring algorithm. It takes two strings
303 |   and scores them on a scale of 0-100 where 0 is a terrible match and
304 |   100 is a great match. The block_size is used to cope with very small
305 |   messages.
306 | */
307 | static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
308 | {
309 | 	u32 score;
310 | 	u32 len1, len2;
311 | 	int edit_distn(const char *from, int from_len, const char *to, int to_len);
312 | 
313 | 	len1 = strlen(s1);
314 | 	len2 = strlen(s2);
315 | 
316 | 	if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
317 | 		/* not a real spamsum signature? */
318 | 		return 0;
319 | 	}
320 | 
321 | 	/* the two strings must have a common substring of length
322 | 	   ROLLING_WINDOW to be candidates */
323 | 	if (has_common_substring(s1, s2) == 0) {
324 | 		return 0;
325 | 	}
326 | 
327 | 	/* compute the edit distance between the two strings. The edit distance gives
328 | 	   us a pretty good idea of how closely related the two strings are */
329 | 	score = edit_distn(s1, len1, s2, len2);
330 | 
331 | 	/* scale the edit distance by the lengths of the two
332 | 	   strings. This changes the score to be a measure of the
333 | 	   proportion of the message that has changed rather than an
334 | 	   absolute quantity. It also copes with the variability of
335 | 	   the string lengths. */
336 | 	score = (score * SPAMSUM_LENGTH) / (len1 + len2);
337 | 
338 | 	/* at this stage the score occurs roughly on a 0-64 scale,
339 | 	 * with 0 being a good match and 64 being a complete
340 | 	 * mismatch */
341 | 
342 | 	/* rescale to a 0-100 scale (friendlier to humans) */
343 | 	score = (100 * score) / 64;
344 | 
345 | 	/* it is possible to get a score above 100 here, but it is a
346 | 	   really terrible match */
347 | 	if (score >= 100) return 0;
348 | 
349 | 	/* now re-scale on a 0-100 scale with 0 being a poor match and
350 | 	   100 being a excellent match. */
351 | 	score = 100 - score;
352 | 
353 | 	/* when the blocksize is small we don't want to exaggerate the match size */
354 | 	if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {
355 | 		score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
356 | 	}
357 | 
358 | 	return score;
359 | }
360 | 
361 | /*
362 |   given two spamsum strings return a value indicating the degree to which they match.
363 | */
364 | u32 spamsum_match(const char *str1, const char *str2)
365 | {
366 | 	u32 block_size1, block_size2;
367 | 	u32 score = 0;
368 | 	char *s1, *s2;
369 | 	char *s1_1, *s1_2;
370 | 	char *s2_1, *s2_2;
371 | 
372 | 	/* each spamsum is prefixed by its block size */
373 | 	if (sscanf(str1, "%u:", &block_size1) != 1 ||
374 | 	    sscanf(str2, "%u:", &block_size2) != 1) {
375 | 		return 0;
376 | 	}
377 | 
378 | 	/* if the blocksizes don't match then we are comparing
379 | 	   apples to oranges ... */
380 | 	if (block_size1 != block_size2 &&
381 | 	    block_size1 != block_size2*2 &&
382 | 	    block_size2 != block_size1*2) {
383 | 		return 0;
384 | 	}
385 | 
386 | 	/* move past the prefix */
387 | 	str1 = strchr(str1, ':');
388 | 	str2 = strchr(str2, ':');
389 | 
390 | 	if (!str1 || !str2) {
391 | 		/* badly formed ... */
392 | 		return 0;
393 | 	}
394 | 
395 | 	/* there is very little information content is sequences of
396 | 	   the same character like 'LLLLL'. Eliminate any sequences
397 | 	   longer than 3. This is especially important when combined
398 | 	   with the has_common_substring() test below. */
399 | 	s1 = eliminate_sequences(str1+1);
400 | 	s2 = eliminate_sequences(str2+1);
401 | 
402 | 	if (!s1 || !s2) return 0;
403 | 
404 | 	/* now break them into the two pieces */
405 | 	s1_1 = s1;
406 | 	s2_1 = s2;
407 | 
408 | 	s1_2 = strchr(s1, ':');
409 | 	s2_2 = strchr(s2, ':');
410 | 
411 | 	if (!s1_2 || !s2_2) {
412 | 		/* a signature is malformed - it doesn't have 2 parts */
413 | 		free(s1); free(s2);
414 | 		return 0;
415 | 	}
416 | 
417 | 	*s1_2++ = 0;
418 | 	*s2_2++ = 0;
419 | 
420 | 	/* each signature has a string for two block sizes. We now
421 | 	   choose how to combine the two block sizes. We checked above
422 | 	   that they have at least one block size in common */
423 | 	if (block_size1 == block_size2) {
424 | 		u32 score1, score2;
425 | 		score1 = score_strings(s1_1, s2_1, block_size1);
426 | 		score2 = score_strings(s1_2, s2_2, block_size2);
427 | 		score = MAX(score1, score2);
428 | 	} else if (block_size1 == block_size2*2) {
429 | 		score = score_strings(s1_1, s2_2, block_size1);
430 | 	} else {
431 | 		score = score_strings(s1_2, s2_1, block_size2);
432 | 	}
433 | 
434 | 	free(s1);
435 | 	free(s2);
436 | 
437 | 	return score;
438 | }
439 | 
440 | /*
441 |   return the maximum match for a file containing a list of spamsums
442 | */
443 | u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
444 | {
445 | 	FILE *f;
446 | 	char line[100];
447 | 	u32 best = 0;
448 | 
449 | 	f = fopen(fname, "r");
450 | 	if (!f) return 0;
451 | 
452 | 	/* on each line of the database we compute the spamsum match
453 | 	   score. We then pick the best score */
454 | 	while (fgets(line, sizeof(line)-1, f)) {
455 | 		u32 score;
456 | 		int len;
457 | 		len = strlen(line);
458 | 		if (line[len-1] == '\n') line[len-1] = 0;
459 | 
460 | 		score = spamsum_match(sum, line);
461 | 
462 | 		if (score > best) {
463 | 			best = score;
464 | 			if (best >= threshold) break;
465 | 		}
466 | 	}
467 | 
468 | 	fclose(f);
469 | 
470 | 	return best;
471 | }
472 | 
473 | /*
474 |   return the spamsum on stdin
475 | */
476 | static char *spamsum_stdin(u32 flags, u32 block_size)
477 | {
478 | 	uchar buf[10*1024];
479 | 	uchar *msg;
480 | 	size_t length = 0;
481 | 	int n;
482 | 	char *sum;
483 | 
484 | 	msg = malloc(sizeof(buf));
485 | 	if (!msg) return NULL;
486 | 
487 | 	/* load the file, expanding the allocation as needed. */
488 | 	while (1) {
489 | 		n = read(0, buf, sizeof(buf));
490 | 		if (n == -1 && errno == EINTR) continue;
491 | 		if (n <= 0) break;
492 | 
493 | 		msg = realloc(msg, length + n);
494 | 		if (!msg) return NULL;
495 | 
496 | 		memcpy(msg+length, buf, n);
497 | 		length += n;
498 | 	}
499 | 
500 | 	sum = spamsum(msg, length, flags, block_size);
501 | 
502 | 	free(msg);
503 | 
504 | 	return sum;
505 | }
506 | 
507 | 
508 | /*
509 |   return the spamsum on a file
510 | */
511 | char *spamsum_file(const char *fname, u32 flags, u32 block_size)
512 | {
513 | 	int fd;
514 | 	char *sum;
515 | 	struct stat st;
516 | 	uchar *msg;
517 | 
518 | 	if (strcmp(fname, "-") == 0) {
519 | 		return spamsum_stdin(flags, block_size);
520 | 	}
521 | 
522 | 	fd = open(fname, O_RDONLY);
523 | 	if (fd == -1) {
524 | 		perror(fname);
525 | 		return NULL;
526 | 	}
527 | 
528 | 	if (fstat(fd, &st) == -1) {
529 | 		perror("fstat");
530 | 		return NULL;
531 | 	}
532 | 
533 | 	msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
534 | 	if (msg == (uchar *)-1) {
535 | 		perror("mmap");
536 | 		return NULL;
537 | 	}
538 | 	close(fd);
539 | 
540 | 	sum = spamsum(msg, st.st_size, flags, block_size);
541 | 
542 | 	munmap(msg, st.st_size);
543 | 
544 | 	return sum;
545 | }
546 | 


--------------------------------------------------------------------------------