├── renderer ├── resource │ ├── dummy.go │ ├── utils.js │ ├── headers.js │ ├── render.js │ ├── events.js │ └── extractors.js ├── noscript_test.go ├── phantomjs_test.go ├── base_test.go ├── base.go ├── noscript.go └── phantomjs.go ├── go.mod ├── fuzzer ├── dummy │ ├── dummy_test.go │ └── dummy.go ├── arachni │ ├── arachni_test.go │ └── arachni.go └── sqlmap │ ├── sqlmap_test.go │ └── sqlmap.go ├── data ├── memory_test.go ├── store.go ├── store_test.go └── memory.go ├── .gitignore ├── util.go ├── cmd ├── gryffin-standalone │ ├── main_test.go │ └── main.go └── gryffin-distributed │ ├── main_test.go │ └── main.go ├── global.go ├── Makefile ├── .github └── workflows │ └── linux.yml ├── html-distance ├── bktree_test.go ├── bktree.go ├── README.md ├── feature.go └── feature_test.go ├── go.sum ├── session_test.go ├── LICENSE ├── serialize.go ├── README.md ├── session.go ├── gryffin_test.go └── gryffin.go /renderer/resource/dummy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/yahoo/gryffin 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 7 | github.com/nsqio/go-nsq v1.0.8 8 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0 9 | ) 10 | -------------------------------------------------------------------------------- /renderer/noscript_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package renderer 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestNoScriptCrawlAsync(t *testing.T) { 12 | t.Parallel() 13 | r := &NoScriptRenderer{} 14 | testCrawlAsync(t, r) 15 | } 16 | -------------------------------------------------------------------------------- /renderer/phantomjs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestPhantomJSCrawlAsync(t *testing.T) { 12 | t.Parallel() 13 | r := &PhantomJSRenderer{Timeout: 30} 14 | testCrawlAsync(t, r) 15 | } 16 | -------------------------------------------------------------------------------- /fuzzer/dummy/dummy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/yahoo/gryffin" 11 | ) 12 | 13 | func TestFuzzer(t *testing.T) { 14 | 15 | f := &Fuzzer{} 16 | scan := gryffin.NewScan("GET", "http://www.yahoo.com", "") 17 | _, err := f.Fuzz(scan) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /data/memory_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
// GenRandomID generates a random ID: 16 bytes read from crypto/rand,
// rendered as 32 upper-case hexadecimal characters.
//
// It panics if the system random source cannot supply the bytes. The
// function has no error return, and silently handing out a predictable
// (zero-filled) identifier would be worse than failing loudly.
func GenRandomID() string {
	// UUID generation is trivial per RSC in https://groups.google.com/d/msg/golang-dev/zwB0k2mpshc/l3zS3oxXuNwJ
	buf := make([]byte, 16)
	if _, err := io.ReadFull(rand.Reader, buf); err != nil {
		panic(fmt.Sprintf("gryffin: reading random bytes for ID: %v", err))
	}
	return fmt.Sprintf("%X", buf)
}
All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package data provides an interface for common data store operations. 6 | package data 7 | 8 | // Store is an interface that capture all methods supported for a data store. 9 | type Store interface { 10 | Get(key string) (value interface{}, ok bool) 11 | Set(key string, value interface{}) bool 12 | IncrBy(key string, delta int64) (newVal int64) 13 | Publish(key string, value interface{}) 14 | } 15 | -------------------------------------------------------------------------------- /renderer/base_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func testCrawlAsync(t *testing.T, r gryffin.Renderer) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | 19 | url := "https://www.yahoo.com/" 20 | 21 | s := gryffin.NewScan("GET", url, "") 22 | r.Do(s) 23 | <-r.GetRequestBody() 24 | for link := range r.GetLinks() { 25 | t.Logf("Got link %s", link.Request.URL) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /fuzzer/dummy/dummy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | // Fuzzer is the handle for the fuzzing methods. 15 | type Fuzzer struct{} 16 | 17 | // Fuzz runs a dummy scan. 
18 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 19 | 20 | cmd := exec.Command("echo", g.Request.URL.Host) 21 | _, err = cmd.Output() 22 | 23 | g.Logm("Dummy.Scan", fmt.Sprintf("Echo return %t", cmd.ProcessState.Success())) 24 | return 0, err 25 | 26 | } 27 | -------------------------------------------------------------------------------- /fuzzer/arachni/arachni_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package arachni 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func TestFuzzer(t *testing.T) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | s := &Fuzzer{} 19 | scan := gryffin.NewScan("GET", "http://127.0.0.1:8081/xss/reflect/full1?in=change_me", "") 20 | c, err := s.Fuzz(scan) 21 | if err != nil { 22 | t.Error(err) 23 | } 24 | if c == 0 { 25 | t.Error("No issue detected.") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /fuzzer/sqlmap/sqlmap_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package sqlmap 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func TestFuzzer(t *testing.T) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | 19 | s := &Fuzzer{} 20 | scan := gryffin.NewScan("GET", "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit", "") 21 | c, err := s.Fuzz(scan) 22 | if err != nil { 23 | t.Error(err) 24 | } 25 | if c == 0 { 26 | t.Error("No issue detected.") 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /cmd/gryffin-standalone/main_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "net/http" 9 | "net/http/httptest" 10 | "os" 11 | "testing" 12 | 13 | "github.com/yahoo/gryffin" 14 | ) 15 | 16 | var h = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 17 | w.Write([]byte("Hello World")) 18 | }) 19 | 20 | var ts = httptest.NewServer(h) 21 | 22 | func TestMain(t *testing.T) { 23 | if os.Getenv("INTEGRATION") == "" { 24 | t.Skip("Skip integration tests.") 25 | } 26 | scan := gryffin.NewScan("GET", ts.URL, "") 27 | linkChannels(scan) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /global.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package gryffin 6 | 7 | import ( 8 | "io" 9 | "sync" 10 | ) 11 | 12 | var ( 13 | memoryStore *GryffinStore 14 | logWriter io.Writer 15 | memoryStoreMu sync.Mutex 16 | logWriterMu sync.Mutex 17 | ) 18 | 19 | // SetMemoryStore sets the package internal global variable 20 | // for the memory store. 21 | func SetMemoryStore(m *GryffinStore) { 22 | memoryStoreMu.Lock() 23 | memoryStore = m 24 | memoryStoreMu.Unlock() 25 | } 26 | 27 | // SetLogWriter sets the log writer. 28 | func SetLogWriter(w io.Writer) { 29 | logWriterMu.Lock() 30 | logWriter = w 31 | logWriterMu.Unlock() 32 | } 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This Makefile is adopted from https://github.com/hashicorp/consul/blob/master/Makefile 3 | 4 | all: format build 5 | 6 | cov: 7 | gocov test | gocov-html > /tmp/coverage.html 8 | open /tmp/coverage.html 9 | 10 | build: test 11 | cd cmd/gryffin-standalone; go build 12 | 13 | test: 14 | go test ./... 15 | @$(MAKE) vet 16 | 17 | test-mono: 18 | go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8081" 19 | go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit" 20 | 21 | 22 | test-integration: 23 | INTEGRATION=1 go test ./... 24 | 25 | test-cover: 26 | go test --cover ./... 27 | 28 | format: 29 | @gofmt -l . 30 | 31 | vet: 32 | @go vet ./... 33 | 34 | .PHONY: all cov build test vet web web-push 35 | -------------------------------------------------------------------------------- /data/store_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package data 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func testStore(t *testing.T, s Store) { 12 | s.Set("hello", "world") 13 | if v, ok := s.Get("hello"); !ok || v != "world" { 14 | t.Error("Get and Set is inconsistent.", v) 15 | } 16 | 17 | s.Set("foo", 100) 18 | if n := s.IncrBy("foo", 10); n != 110 { 19 | t.Error("Incr failed.") 20 | } 21 | if v, ok := s.Get("foo"); v.(int64) != 110 { 22 | t.Errorf("Incr is inconsistent %t, %t and %s", ok, v.(int64) == 110, v) 23 | } 24 | 25 | } 26 | 27 | func benchStore(b *testing.B, s Store) { 28 | s.Set("hello", "world") 29 | s.Set("foo", 100) 30 | s.IncrBy("foo", 10) 31 | } 32 | -------------------------------------------------------------------------------- /cmd/gryffin-distributed/main_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | // Unit test for gryffin-distributed is still on todo list. 8 | // 9 | // import ( 10 | // "net/http" 11 | // "net/http/httptest" 12 | // "os" 13 | // "testing" 14 | 15 | // "github.com/yahoo/gryffin" 16 | // ) 17 | 18 | // var handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 19 | // w.Write([]byte("Hello World")) 20 | // }) 21 | 22 | // var ts = httptest.NewServer(handler) 23 | 24 | // func TestMain(t *testing.T) { 25 | // if os.Getenv("INTEGRATION") == "" { 26 | // t.Skip("Skip integration tests.") 27 | // } 28 | // scan := gryffin.NewScan("GET", ts.URL, "") 29 | // linkChannels(scan) 30 | 31 | // } 32 | -------------------------------------------------------------------------------- /renderer/base.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "github.com/yahoo/gryffin" 9 | ) 10 | 11 | type BaseRenderer struct { 12 | chanResponse chan *gryffin.Scan 13 | chanLinks chan *gryffin.Scan 14 | done chan string // done, notify with a string of the "reason", e.g. terminated, completed, etc. 15 | } 16 | 17 | func (r *BaseRenderer) Do(s *gryffin.Scan) { 18 | // Dummy operation, just close the channels. 19 | defer close(r.chanResponse) 20 | defer close(r.chanLinks) 21 | defer close(r.done) 22 | } 23 | 24 | func (r *BaseRenderer) GetRequestBody() <-chan *gryffin.Scan { 25 | return r.chanResponse 26 | } 27 | 28 | func (r *BaseRenderer) GetLinks() <-chan *gryffin.Scan { 29 | return r.chanLinks 30 | } 31 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Linux 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | go: [ '1.14.2', '1.13' ] 17 | name: Go ${{ matrix.go }} build 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Setup go 21 | uses: actions/setup-go@v1 22 | with: 23 | go-version: ${{ matrix.go }} 24 | 25 | - name: go vet 26 | run: go vet -v ./... 27 | 28 | - name: Basic build 29 | run: go build ./cmd/... 30 | 31 | - name: Run tests on linux 32 | run: go test ./... 33 | 34 | - name: Run tests with race detector 35 | run: go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... 
36 | 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v1 39 | with: 40 | file: ./coverage.txt 41 | -------------------------------------------------------------------------------- /html-distance/bktree_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package distance 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestNewOracle(t *testing.T) { 12 | // just add 0 and 1. 13 | oracle := NewOracle() 14 | for i := uint64(1); i < 2; i++ { 15 | oracle.See(i) 16 | } 17 | r := uint8(2) 18 | for i := uint64(0); i < 30; i++ { 19 | t.Logf("Has the oracle seen anything closed to %02d (%08b) within distance of %d? %t", i, i, r, oracle.Seen(i, r)) 20 | } 21 | 22 | } 23 | 24 | func BenchmarkOracleSee(b *testing.B) { 25 | oracle := NewOracle() 26 | for i := 0; i < b.N; i++ { 27 | // for i := uint64(1); i < 10000; i++ { 28 | oracle.See(uint64(i)) 29 | // } 30 | } 31 | } 32 | 33 | func BenchmarkOracleSeen(b *testing.B) { 34 | oracle := NewOracle() 35 | for i := uint64(1); i < 1000000; i++ { 36 | oracle.See(i) 37 | } 38 | b.ResetTimer() 39 | r := uint8(2) 40 | for i := 0; i < b.N; i++ { 41 | oracle.Seen(uint64(i), r) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= 2 | github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 3 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 h1:bjfMeqxWEJ6IRUvGkiTkSwx0a6UdQJsbirRSoXogteY= 4 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6/go.mod h1:WVJJvUw/pIOcwu2O8ZzHEhmigq2jzwRNfJVRMJB7bR8= 5 | github.com/nsqio/go-nsq v1.0.8 
h1:3L2F8tNLlwXXlp2slDUrUWSBn2O3nMh8R1/KEDFTHPk= 6 | github.com/nsqio/go-nsq v1.0.8/go.mod h1:vKq36oyeVXgsS5Q8YEO7WghqidAVXQlcFxzQbQTuDEY= 7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 8 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0 h1:Jcxah/M+oLZ/R4/z5RzfPzGbPXnVDPkEDtf2JnuxN+U= 9 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 10 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 11 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 12 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 13 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 14 | -------------------------------------------------------------------------------- /session_test.go: -------------------------------------------------------------------------------- 1 | package gryffin 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestNewGryffinStore(t *testing.T) { 10 | 11 | t.Parallel() 12 | 13 | store1 := NewSharedGryffinStore() 14 | store2 := NewSharedGryffinStore() 15 | 16 | var wg sync.WaitGroup 17 | wg.Add(1) 18 | 19 | go func() { 20 | store1.See("foo", "oracle", uint64(0x1234)) 21 | b := <-store1.GetSndChan() 22 | t.Log("Store1 got ", string(b)) 23 | store2.GetRcvChan() <- b 24 | 25 | store1.See("foo", "hash", uint64(0x5678)) 26 | b = <-store1.GetSndChan() 27 | t.Log("Store1 got ", string(b)) 28 | store2.GetRcvChan() <- b 29 | wg.Done() 30 | }() 31 | 32 | wg.Wait() 33 | for i := 0; i < 100000; i++ { 34 | if store2.Seen("foo", "oracle", uint64(0x1234), 2) { 35 | t.Logf("Store2 see the new oracle value in %d microseconds.", i) 36 | break 37 | } 38 | time.Sleep(1 * time.Microsecond) 39 | } 40 | 41 | if !store2.Seen("foo", "oracle", uint64(0x1234), 2) { 42 | t.Error("2nd store 
should see the oracle value in oracle.", store2.Oracles) 43 | } 44 | 45 | for i := 0; i < 100000; i++ { 46 | if store2.Seen("foo", "hash", uint64(0x5678), 2) { 47 | t.Logf("Store2 see the new hash value in %d microseconds.", i) 48 | break 49 | } 50 | time.Sleep(1 * time.Microsecond) 51 | } 52 | 53 | if !store2.Seen("foo", "hash", uint64(0x5678), 2) { 54 | t.Error("2nd store should see the hash value in hashes.", store2.Hashes) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Yahoo Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Yahoo Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /serialize.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gryffin 6 | 7 | import ( 8 | "encoding/json" 9 | "log" 10 | "net/http" 11 | ) 12 | 13 | // NewScanFromJson creates a Scan from the passed JSON blob. 14 | func NewScanFromJson(b []byte) *Scan { 15 | // ensure we got a memory store.. 16 | if memoryStore == nil { 17 | memoryStore = NewGryffinStore() 18 | } 19 | 20 | var scan Scan 21 | json.Unmarshal(b, &scan) 22 | return &scan 23 | } 24 | 25 | // Json serializes Scan as JSON. 26 | func (s *Scan) Json() []byte { 27 | ss := &SerializableScan{ 28 | s, 29 | &SerializableRequest{s.Request, ""}, 30 | &SerializableResponse{ 31 | s.Response, 32 | &SerializableRequest{s.Request, ""}, 33 | }, 34 | } 35 | b, err := json.Marshal(ss) 36 | if err != nil { 37 | log.Printf("Scan.Json: err=%v", err) 38 | s.Error("Json", err) 39 | } 40 | return b 41 | 42 | } 43 | 44 | // SerializableScan is a Scan extended with serializable 45 | // request and response fields. 
46 | type SerializableScan struct { 47 | *Scan 48 | Request *SerializableRequest 49 | Response *SerializableResponse 50 | } 51 | 52 | // SerializableResponse is a Scan extended with serializable 53 | // response field. 54 | type SerializableResponse struct { 55 | *http.Response 56 | Request *SerializableRequest 57 | } 58 | 59 | // SerializableRequest is a Scan extended with serializable 60 | // request field. 61 | type SerializableRequest struct { 62 | *http.Request 63 | Cancel string 64 | } 65 | -------------------------------------------------------------------------------- /fuzzer/arachni/arachni.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package arachni 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | "strings" 11 | 12 | "github.com/yahoo/gryffin" 13 | ) 14 | 15 | // Fuzzer is the handle for the fuzzing methods. 16 | type Fuzzer struct{} 17 | 18 | // Fuzz runs an Arachni scan. 
19 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 20 | var cookies []string 21 | // for _, c := range g.CookieJar.Cookies(g.Request.URL) { 22 | for _, c := range g.Cookies { 23 | cookies = append(cookies, c.String()) 24 | } 25 | 26 | args := []string{ 27 | "--checks", "xss*", 28 | "--output-only-positives", 29 | "--http-request-concurrency", "1", 30 | "--http-request-timeout", "10000", 31 | "--timeout", "00:03:00", 32 | "--scope-dom-depth-limit", "0", 33 | "--scope-directory-depth-limit", "0", 34 | "--scope-page-limit", "1", 35 | "--audit-with-both-methods", 36 | "--report-save-path", "/dev/null", 37 | "--snapshot-save-path", "/dev/null", 38 | } 39 | 40 | // TODO: Post method 41 | 42 | // Cookie 43 | if len(cookies) > 0 { 44 | args = append(args, "--http-cookie-string", strings.Join(cookies, ";")) 45 | } 46 | 47 | args = append(args, g.Request.URL.String()) 48 | 49 | cmd := exec.Command("arachni", args...) 50 | 51 | g.Logm("Arachni.Scan", fmt.Sprintf("Run as %s", cmd.Args)) 52 | 53 | output, err := cmd.Output() 54 | 55 | count = s.extract(g, string(output)) 56 | 57 | if err != nil { 58 | return 59 | } 60 | 61 | g.Logm("Arachni.Scan", fmt.Sprintf("Arachni return %t", cmd.ProcessState.Success())) 62 | return 63 | 64 | } 65 | 66 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) { 67 | for _, l := range strings.Split(output, "\n") { 68 | l = strings.TrimSpace(l) 69 | switch { 70 | case strings.HasPrefix(l, "[~] Affected page"): 71 | g.Logm("Arachni.Findings", l) 72 | count++ 73 | } 74 | } 75 | 76 | return 77 | } 78 | -------------------------------------------------------------------------------- /renderer/noscript.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package renderer 6 | 7 | import ( 8 | "fmt" 9 | "log" 10 | "net/http" 11 | "strings" 12 | "time" 13 | 14 | // "sync" 15 | 16 | "github.com/yahoo/gryffin" 17 | "golang.org/x/net/html" 18 | ) 19 | 20 | // allow 100 crawling in the machine (regardless of domains) 21 | 22 | type NoScriptRenderer struct { 23 | BaseRenderer 24 | } 25 | 26 | func (r *NoScriptRenderer) Do(s *gryffin.Scan) { 27 | r.chanResponse = make(chan *gryffin.Scan, 10) 28 | r.chanLinks = make(chan *gryffin.Scan, 10) 29 | 30 | crawl := func() { 31 | 32 | defer close(r.chanResponse) 33 | defer close(r.chanLinks) 34 | 35 | client := &http.Client{} 36 | 37 | client.Timeout = time.Duration(3) * time.Second 38 | 39 | if response, err := client.Do(s.Request); err == nil { 40 | s.Response = response 41 | } else { 42 | s.Logm("NoScriptRenderer", fmt.Sprintf("error in building request: %s", err)) 43 | return 44 | } 45 | 46 | s.ReadResponseBody() 47 | 48 | if s.IsDuplicatedPage() { 49 | return 50 | } 51 | 52 | tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody)) 53 | 54 | r.chanResponse <- s 55 | 56 | for { 57 | t := tokenizer.Next() 58 | 59 | switch t { 60 | 61 | case html.ErrorToken: 62 | return 63 | 64 | case html.StartTagToken: 65 | token := tokenizer.Token() 66 | if token.DataAtom.String() == "a" { 67 | for _, attr := range token.Attr { 68 | if attr.Key == "href" { 69 | link := s.Spawn() 70 | // TODO - we drop relative URL as it would drop "#". 71 | // Yet, how about real relative URLs? 72 | if req, err := http.NewRequest("GET", attr.Val, nil); err == nil { 73 | if true { 74 | // || req.URL.IsAbs() { 75 | link.MergeRequest(req) 76 | if link.IsScanAllowed() { 77 | r.chanLinks <- link 78 | } 79 | } 80 | // else { 81 | // FIXME: ignore relative URL. 82 | // } 83 | } else { 84 | log.Printf("error in building request: %s", err) 85 | } 86 | } 87 | } 88 | } 89 | } 90 | } 91 | 92 | // parse and find links. 
93 | 94 | } 95 | 96 | go crawl() 97 | } 98 | -------------------------------------------------------------------------------- /html-distance/bktree.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package distance is a go library for computing the proximity of the HTML pages. 6 | // The implementation similiarity fingerprint is Charikar's simhash. 7 | // 8 | // Distance is the hamming distance of the fingerprints. Since fingerprint is 9 | // of size 64 (inherited from hash/fnv), Similiarity is defined as 1 - d / 64. 10 | // 11 | // In normal scenario, similarity > 95% (i.e. d>3) could be considered as duplicated html pages. 12 | package distance 13 | 14 | import ( 15 | "sync" 16 | 17 | "github.com/mfonda/simhash" 18 | ) 19 | 20 | // Oracle answers the query if a fingerprint has been seen. 21 | type Oracle struct { 22 | fingerprint uint64 // node value. 23 | nodes [65]*Oracle // leaf nodes 24 | mu sync.Mutex 25 | } 26 | 27 | // NewOracle return an oracle that could tell if the fingerprint has been seen or not. 28 | func NewOracle() *Oracle { 29 | return newNode(0) 30 | } 31 | 32 | func newNode(f uint64) *Oracle { 33 | return &Oracle{fingerprint: f} 34 | } 35 | 36 | // Distance return the similarity distance between two fingerprint. 37 | func Distance(a, b uint64) uint8 { 38 | return simhash.Compare(a, b) 39 | } 40 | 41 | // See asks the oracle to see the fingerprint. 42 | func (n *Oracle) See(f uint64) *Oracle { 43 | d := Distance(n.fingerprint, f) 44 | 45 | if d == 0 { 46 | // current node with same fingerprint. 
47 | return n 48 | } 49 | 50 | // the target node is already set, 51 | n.mu.Lock() 52 | defer n.mu.Unlock() 53 | if c := n.nodes[d]; c != nil { 54 | return c.See(f) 55 | } 56 | 57 | n.nodes[d] = newNode(f) 58 | return n.nodes[d] 59 | } 60 | 61 | // Seen asks the oracle if anything closed to the fingerprint in a range (r) is seen before. 62 | func (n *Oracle) Seen(f uint64, r uint8) bool { 63 | d := Distance(n.fingerprint, f) 64 | if d < r { 65 | return true 66 | } 67 | 68 | // TODO - should search from d, d-1, d+1, ... until d-r and d+r, for best performance 69 | for k := d - r; k <= d+r; k++ { 70 | if k > 64 { 71 | break 72 | } 73 | n.mu.Lock() 74 | c := n.nodes[k] 75 | n.mu.Unlock() 76 | if c != nil { 77 | if c.Seen(f, r) { 78 | return true 79 | } 80 | } 81 | } 82 | return false 83 | } 84 | -------------------------------------------------------------------------------- /html-distance/README.md: -------------------------------------------------------------------------------- 1 | # html-distance 2 | 3 | html-distance is a go library for computing the proximity of the HTML pages. The implementation similiarity fingerprint is Charikar's simhash. 4 | 5 | We used BK Tree (Burkhard and Keller) for verifying if a fingerprint is closed to a set of fingerprint within a defined proximity distance. 6 | 7 | Distance is the hamming distance of the fingerprints. Since fingerprint is of size 64 (inherited from hash/fnv), Similiarity is defined as 1 - d / 64. 8 | 9 | In normal scenario, similarity > 95% (i.e. d>3) could be considered as duplicated html pages. 10 | 11 | 12 | ## Get the source 13 | 14 | ``` 15 | go get github.com/yahoo/gryffin/html-distance/... 
16 | ``` 17 | 18 | ## Install 19 | 20 | ``` 21 | go install github.com/yahoo/gryffin/html-distance/cmd/html-distance 22 | ``` 23 | 24 | ## Command Line Interface 25 | 26 | ``` 27 | Usage of html-distance: 28 | 29 | html-distance url1 url2 30 | ``` 31 | 32 | Example 1 33 | ``` 34 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/ 35 | 36 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200 37 | Fetching https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/, Got 200 38 | Feature distance is 0. HTML Similarity is 100.00% 39 | ``` 40 | 41 | Example 2 42 | ``` 43 | $ html-distance https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html 44 | 45 | Fetching https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html, Got 200 46 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200 47 | Feature distance is 2. HTML Similarity is 96.88% 48 | ``` 49 | 50 | Example 3 51 | ``` 52 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html 53 | 54 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200 55 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200 56 | Feature distance is 9. HTML Similarity is 85.94% 57 | ``` 58 | -------------------------------------------------------------------------------- /renderer/resource/utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Yahoo Inc. All rights reserved. 
3 | * Use of this source code is governed by a BSD-style 4 | * license that can be found in the LICENSE file. 5 | * 6 | 7 | */ 8 | 9 | var re_hostname = /^(?:https?|ftp):\/\/([^:\/\?]+)/i, 10 | re_extensionFilter = /\.(?:css|pdf|svg|ttf|zip|tar|gz|pkg|exe)(?:[\?#;][^\?#;]*)?$/i, 11 | re_jsAnalyticsFilter = /^https?:\/\/(?:\w+\.)?yimg\.com\/mi(?:\/[^\/]+)?\/ywa\.js$/i, 12 | re_whitelistedRedirectionDomains = /(?:yahoo\.com?(?:\.\w\w)?|yimg\.com|flickr\.com|y-cloud\.net|yahoodns\.net|yahoofs\.com|zenfs\.com)$/; 13 | 14 | exports.getHostname = function(url) { 15 | url = url.match(re_hostname); 16 | return url ? url[1] : null; 17 | } 18 | exports.invalidUrl = function(url, allowedDomains) { 19 | url = exports.getHostname(url); 20 | return (url === null || (allowedDomains && allowedDomains.indexOf(url) === -1)); 21 | } 22 | exports.blacklistedUrl = function(url) { 23 | return re_extensionFilter.test(url) || re_jsAnalyticsFilter.test(url); 24 | } 25 | exports.whitelistedRedirectionDomains = function(url) { 26 | return re_whitelistedRedirectionDomains.test(exports.getHostname(url)); 27 | } 28 | 29 | exports.cleanResponseBody = function(body) { 30 | return (body == '
') ? '' : body; 31 | } 32 | 33 | // to repackage headers as a dict format, as required by scrappy 34 | exports.prepareResponse = function(response, headersFilter) { 35 | return { 36 | headers: headersFilter(response.headers), 37 | contentType: response.contentType, 38 | status: response.status, 39 | url: response.url 40 | } 41 | } 42 | 43 | // TODO: add to redis 44 | exports.pageChanges = (function() { 45 | var changes = {}; 46 | return { 47 | fetch: function(eventName) { 48 | var ret = changes[eventName] || []; 49 | changes[eventName] = []; 50 | return ret; 51 | }, 52 | fetchAll: function() { 53 | var ret = changes; 54 | changes = {}; 55 | return ret; 56 | }, 57 | push: function(eventName, obj) { 58 | changes[eventName] = changes[eventName] || []; 59 | changes[eventName].push(obj); 60 | } 61 | } 62 | })(); 63 | 64 | var JSONSignature = '==lXlKfYWch7H9VdJgPCmJ=='; 65 | 66 | exports.printJSON = function(type, output) { 67 | output['msgType'] = type; 68 | output['signature'] = JSONSignature; 69 | console.log(JSON.stringify(output)); 70 | // console.log(['{'+type, JSON.stringify(output), type+'}'].join(JSONSignature)); 71 | } -------------------------------------------------------------------------------- /renderer/resource/headers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Yahoo Inc. All rights reserved. 3 | * Use of this source code is governed by a BSD-style 4 | * license that can be found in the LICENSE file. 5 | * 6 | 7 | */ 8 | 9 | exports.init = function(phantom, page){ 10 | 11 | function setReqHeaders(headers, cookieHostname){ 12 | phantom.clearCookies(); 13 | // console.log("DEBUG HEADERS... 
" + cookieHostname) 14 | 15 | // for(var i in headers) { 16 | // console.log("headers " + i) 17 | // console.log(headers[i]) 18 | // } 19 | 20 | 21 | if (!headers || typeof(headers) != 'object') return {}; 22 | 23 | // avoid requesting for gzipped/compressed content, i.e., Accept-Encoding and Accept request headers unconfigurable 24 | // gzip decompression is problematic: https://github.com/ariya/phantomjs/issues/10930 25 | // the following headers modification is moved to phantomjs.py 26 | // headers['Accept-Encoding'] = "identity"; 27 | // delete headers['Accept']; 28 | 29 | // make cookies available for subresources requests of the same hostname, otherwise, only the main page will receive cookie 30 | if (headers['Cookie']) { 31 | headers['Cookie'].split(';').forEach(function(cookie){ 32 | var eqIndex = cookie.indexOf('='); 33 | phantom.addCookie({ 34 | name: cookie.substr(0, eqIndex).trim(), 35 | value: cookie.substr(eqIndex + 1).trim(), 36 | domain: cookieHostname, // already defaulted to hostname of current page 37 | path: '/', httponly: true, secure: false 38 | }); 39 | }); 40 | delete headers['Cookie']; 41 | } 42 | 43 | 44 | // User-Agent in request header must be explicitly configured thru settings.userAgent 45 | Object.keys(headers).forEach(function(headerName){ 46 | if (headerName.toLowerCase() == 'user-agent') { 47 | page.settings.userAgent = headers[headerName]; 48 | delete headers[headerName]; 49 | } 50 | }); 51 | 52 | return headers; 53 | } 54 | 55 | 56 | function getRespHeaders(headers) { 57 | var out = {}; 58 | headers && headers.forEach(function(h){ 59 | // the following headers are stripped to prevent decoding twice by scrapy 60 | var name = h.name.toLowerCase(), value = h.value.toLowerCase(); 61 | if ((name == 'content-encoding' && ['gzip','deflate'].indexOf(value) != -1) 62 | || (name == 'transfer-encoding' && value == 'chunked')) 63 | return; 64 | 65 | name = h.name; 66 | out[name] = out[name] || []; 67 | out[name].push(h.value); 68 | }); 69 | 
return out; 70 | } 71 | 72 | 73 | return { 74 | 'setReqHeaders': setReqHeaders, 75 | 'getRespHeaders': getRespHeaders 76 | }; 77 | } -------------------------------------------------------------------------------- /fuzzer/sqlmap/sqlmap.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package sqlmap 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | "strconv" 11 | "strings" 12 | 13 | "github.com/yahoo/gryffin" 14 | ) 15 | 16 | // Fuzzer is the handle for the fuzzing methods. 17 | type Fuzzer struct{} 18 | 19 | // Fuzz runs an sqlmap scan. 20 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 21 | 22 | var cookies []string 23 | 24 | // for _, c := range g.CookieJar.Cookies(g.Request.URL) { 25 | for _, c := range g.Cookies { 26 | cookies = append(cookies, c.String()) 27 | } 28 | 29 | args := []string{ 30 | "--batch", 31 | "--timeout=2", 32 | "--retries=3", 33 | "--crawl=0", 34 | "--disable-coloring", 35 | "-o", 36 | "--text-only", 37 | // "--threads=4", 38 | "-v", "0", 39 | "--level=1", 40 | "--risk=1", 41 | "--smart", 42 | "--fresh-queries", 43 | "--purge-output", 44 | "--os=Linux", 45 | "--dbms=MySQL", 46 | "--delay=0.1", 47 | "--time-sec=1", 48 | } 49 | 50 | // TODO: Post method 51 | // if g.RequestBody != "" { 52 | // args = append(args, fmt.Sprintf("--data=..." 53 | // } 54 | 55 | // only for integer based injection. 
56 | var testable []string 57 | for k, vs := range g.Request.URL.Query() { 58 | for _, v := range vs { 59 | _, err := strconv.ParseInt(v, 10, 64) 60 | if err == nil { 61 | // query param value is an integer 62 | testable = append(testable, k) 63 | } 64 | } 65 | } 66 | if len(testable) > 0 { 67 | args = append(args, "-p", strings.Join(testable, ",")) 68 | } 69 | 70 | // Cookie 71 | if len(cookies) > 0 { 72 | fmt.Println(cookies) 73 | args = append(args, "--cookie", strings.Join(cookies, ";")) 74 | } 75 | 76 | args = append(args, "-u", g.Request.URL.String()) 77 | 78 | cmd := exec.Command("sqlmap", args...) 79 | 80 | g.Logm("SQLMap.Scan", fmt.Sprintf("Run as %s", cmd.Args)) 81 | 82 | output, err := cmd.Output() 83 | 84 | if err != nil { 85 | return 86 | } 87 | 88 | count = s.extract(g, string(output)) 89 | 90 | g.Logm("SQLMap.Scan", fmt.Sprintf("SQLMap return %t", cmd.ProcessState.Success())) 91 | return 92 | 93 | } 94 | 95 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) { 96 | 97 | for _, l := range strings.Split(output, "\n") { 98 | l = strings.TrimSpace(l) 99 | switch { 100 | case strings.HasPrefix(l, "Payload: "): 101 | g.Logm("SQLMap.Findings", l) 102 | count++ 103 | } 104 | } 105 | 106 | return 107 | } 108 | -------------------------------------------------------------------------------- /data/memory.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package data 6 | 7 | import ( 8 | // "log" 9 | // "reflect" 10 | "strings" 11 | "sync/atomic" 12 | ) 13 | 14 | // MemoryStore is an implementation for memory based data store. 15 | type MemoryStore struct { 16 | heap map[string]interface{} 17 | } 18 | 19 | // Set stores the key value pair. 
20 | func (m *MemoryStore) Set(key string, value interface{}) bool { 21 | switch value.(type) { 22 | 23 | case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64: 24 | s, _ := convertIntToPtr(value) 25 | m.heap[key] = s 26 | 27 | default: 28 | m.heap[key] = value 29 | } 30 | return true 31 | } 32 | 33 | // Get retrieves the value pointed by the key. 34 | func (m *MemoryStore) Get(key string) (value interface{}, ok bool) { 35 | value, ok = m.heap[key] 36 | switch value.(type) { 37 | case *int, *int8, *int16, *int32, *int64, *uint, *uint8, *uint16, *uint32, *uint64: 38 | s, ok := convertPtrToInt(value) 39 | return s, ok 40 | default: 41 | return value, ok 42 | } 43 | } 44 | 45 | // IncrBy increments the value pointed by key with the delta, and return the new value. 46 | func (m *MemoryStore) IncrBy(key string, delta int64) (newVal int64) { 47 | newVal = atomic.AddInt64(m.heap[key].(*int64), delta) 48 | return 49 | 50 | } 51 | 52 | // DelPrefix deletes records from the MemoryStore's heap 53 | // when the keys match the given prefix. 54 | func (m *MemoryStore) DelPrefix(prefix string) { 55 | for k := range m.heap { 56 | if strings.HasPrefix(k, prefix) { 57 | delete(m.heap, k) 58 | } 59 | } 60 | } 61 | 62 | // Publish is a dummy no-op method. 63 | func (m *MemoryStore) Publish(k string, d interface{}) { 64 | 65 | } 66 | 67 | // NewMemoryStore creates the new store. 
68 | func NewMemoryStore() *MemoryStore { 69 | m := MemoryStore{ 70 | heap: make(map[string]interface{}), 71 | } 72 | return &m 73 | } 74 | 75 | func convertIntToPtr(v interface{}) (s *int64, ok bool) { 76 | var t int64 77 | 78 | switch v := v.(type) { 79 | 80 | case int: 81 | t = int64(v) 82 | case int8: 83 | t = int64(v) 84 | case int16: 85 | t = int64(v) 86 | case int32: 87 | t = int64(v) 88 | case int64: 89 | t = v 90 | case uint: 91 | t = int64(v) 92 | case uint8: 93 | t = int64(v) 94 | case uint16: 95 | t = int64(v) 96 | case uint32: 97 | t = int64(v) 98 | case uint64: 99 | t = int64(v) 100 | } 101 | 102 | return &t, ok 103 | } 104 | 105 | func convertPtrToInt(v interface{}) (s int64, ok bool) { 106 | 107 | switch v := v.(type) { 108 | 109 | case *int: 110 | return int64(*v), true 111 | case *int8: 112 | return int64(*v), true 113 | case *int16: 114 | return int64(*v), true 115 | case *int32: 116 | return int64(*v), true 117 | case *int64: 118 | return *v, true 119 | 120 | case *uint: 121 | return int64(*v), true 122 | case *uint8: 123 | return int64(*v), true 124 | case *uint16: 125 | return int64(*v), true 126 | case *uint32: 127 | return int64(*v), true 128 | case *uint64: 129 | return int64(*v), true 130 | } 131 | 132 | return 133 | 134 | } 135 | -------------------------------------------------------------------------------- /html-distance/feature.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package distance 6 | 7 | import ( 8 | "bytes" 9 | "io" 10 | 11 | "github.com/mfonda/simhash" 12 | "golang.org/x/net/html" 13 | ) 14 | 15 | // Fingerprint generates the fingerprint of an HTML from the io.Reader r and a shingle factor. 16 | // Shingle refers to the level of shuffling. 17 | // E.g. 
with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c" 18 | func Fingerprint(r io.Reader, shingle int) uint64 { 19 | if shingle < 1 { 20 | shingle = 1 21 | } 22 | // collect the features via this cf channel. 23 | cf := make(chan string, 1000) 24 | cs := make(chan uint64, 1000) 25 | v := simhash.Vector{} 26 | 27 | // Tokenize and then Generate Features. . 28 | go func() { 29 | defer close(cf) 30 | z := html.NewTokenizer(r) 31 | // TODO - export the max token count as an function argument. 32 | count := 0 33 | for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() { 34 | t := z.Token() 35 | count++ 36 | genFeatures(&t, cf) 37 | } 38 | 39 | }() 40 | 41 | // Collect the features. 42 | go func() { 43 | defer close(cs) 44 | a := make([][]byte, shingle) 45 | for f := <-cf; f != ""; f = <-cf { 46 | // shingle: generate the k-gram token as a single feature. 47 | a = append(a[1:], []byte(f)) 48 | // fmt.Printf("%#v\n", a) 49 | // fmt.Printf("%s\n", bytes.Join(a, []byte(" "))) 50 | cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum() 51 | // cs <- simhash.NewFeature([]byte(f)).Sum() 52 | } 53 | }() 54 | 55 | // from the checksum (of feature), append to vector. 
56 | for s := <-cs; s != 0; s = <-cs { 57 | for i := uint8(0); i < 64; i++ { 58 | bit := ((s >> i) & 1) 59 | if bit == 1 { 60 | v[i]++ 61 | } else { 62 | v[i]-- 63 | } 64 | } 65 | } 66 | 67 | return simhash.Fingerprint(v) 68 | 69 | } 70 | 71 | func genFeatures(t *html.Token, cf chan<- string) { 72 | 73 | s := "" 74 | 75 | switch t.Type { 76 | case html.StartTagToken: 77 | s = "A:" + t.DataAtom.String() 78 | case html.EndTagToken: 79 | s = "B:" + t.DataAtom.String() 80 | case html.SelfClosingTagToken: 81 | s = "C:" + t.DataAtom.String() 82 | case html.DoctypeToken: 83 | s = "D:" + t.DataAtom.String() 84 | case html.CommentToken: 85 | s = "E:" + t.DataAtom.String() 86 | case html.TextToken: 87 | s = "F:" + t.DataAtom.String() 88 | case html.ErrorToken: 89 | s = "Z:" + t.DataAtom.String() 90 | } 91 | // fmt.Println(s) 92 | cf <- s 93 | 94 | for _, attr := range t.Attr { 95 | switch attr.Key { 96 | case "class": 97 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 98 | // case "id": 99 | // s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 100 | case "name": 101 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 102 | case "rel": 103 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 104 | default: 105 | s = "G:" + t.DataAtom.String() + ":" + attr.Key 106 | } 107 | // fmt.Println(s) 108 | cf <- s 109 | } 110 | 111 | // fmt.Println(s) 112 | 113 | } 114 | -------------------------------------------------------------------------------- /html-distance/feature_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package distance 6 | 7 | import ( 8 | "bytes" 9 | "io/ioutil" 10 | "net/http" 11 | "strings" 12 | "testing" 13 | ) 14 | 15 | // var input = "" 16 | // var input = "
te<&;xt
" 17 | var input = ` 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 |2
3 50 | 13 51 | 123 52 | 1| 2 | 3
| B | C