├── renderer ├── resource │ ├── dummy.go │ ├── utils.js │ ├── headers.js │ ├── render.js │ ├── events.js │ └── extractors.js ├── noscript_test.go ├── phantomjs_test.go ├── base_test.go ├── base.go ├── noscript.go └── phantomjs.go ├── go.mod ├── fuzzer ├── dummy │ ├── dummy_test.go │ └── dummy.go ├── arachni │ ├── arachni_test.go │ └── arachni.go └── sqlmap │ ├── sqlmap_test.go │ └── sqlmap.go ├── data ├── memory_test.go ├── store.go ├── store_test.go └── memory.go ├── .gitignore ├── util.go ├── cmd ├── gryffin-standalone │ ├── main_test.go │ └── main.go └── gryffin-distributed │ ├── main_test.go │ └── main.go ├── global.go ├── Makefile ├── .github └── workflows │ └── linux.yml ├── html-distance ├── bktree_test.go ├── bktree.go ├── README.md ├── feature.go └── feature_test.go ├── go.sum ├── session_test.go ├── LICENSE ├── serialize.go ├── README.md ├── session.go ├── gryffin_test.go └── gryffin.go /renderer/resource/dummy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/yahoo/gryffin 2 | 3 | go 1.14 4 | 5 | require ( 6 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 7 | github.com/nsqio/go-nsq v1.0.8 8 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0 9 | ) 10 | -------------------------------------------------------------------------------- /renderer/noscript_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package renderer 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestNoScriptCrawlAsync(t *testing.T) { 12 | t.Parallel() 13 | r := &NoScriptRenderer{} 14 | testCrawlAsync(t, r) 15 | } 16 | -------------------------------------------------------------------------------- /renderer/phantomjs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestPhantomJSCrawlAsync(t *testing.T) { 12 | t.Parallel() 13 | r := &PhantomJSRenderer{Timeout: 30} 14 | testCrawlAsync(t, r) 15 | } 16 | -------------------------------------------------------------------------------- /fuzzer/dummy/dummy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/yahoo/gryffin" 11 | ) 12 | 13 | func TestFuzzer(t *testing.T) { 14 | 15 | f := &Fuzzer{} 16 | scan := gryffin.NewScan("GET", "http://www.yahoo.com", "") 17 | _, err := f.Fuzz(scan) 18 | if err != nil { 19 | t.Error(err) 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /data/memory_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package data 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestMemoryStore(t *testing.T) { 12 | t.Parallel() 13 | testStore(t, NewMemoryStore()) 14 | } 15 | 16 | func BenchmarkMemoryStore(b *testing.B) { 17 | s := NewMemoryStore() 18 | b.ResetTimer() 19 | for i := 0; i < b.N; i++ { 20 | benchStore(b, s) 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | 26 | # NSQ temporary files. 27 | *.dat 28 | 29 | # logstashes 30 | *.log 31 | logstash-forwarder.crt 32 | logstash-forwarder.key 33 | .logstash-forwarder -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gryffin 6 | 7 | import ( 8 | "crypto/rand" 9 | "fmt" 10 | "io" 11 | ) 12 | 13 | // GenRandomID generates a random ID. 14 | func GenRandomID() string { 15 | // UUID generation is trivial per RSC in https://groups.google.com/d/msg/golang-dev/zwB0k2mpshc/l3zS3oxXuNwJ 16 | buf := make([]byte, 16) 17 | io.ReadFull(rand.Reader, buf) 18 | return fmt.Sprintf("%X", buf) 19 | } 20 | -------------------------------------------------------------------------------- /data/store.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. 
All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package data provides an interface for common data store operations. 6 | package data 7 | 8 | // Store is an interface that capture all methods supported for a data store. 9 | type Store interface { 10 | Get(key string) (value interface{}, ok bool) 11 | Set(key string, value interface{}) bool 12 | IncrBy(key string, delta int64) (newVal int64) 13 | Publish(key string, value interface{}) 14 | } 15 | -------------------------------------------------------------------------------- /renderer/base_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func testCrawlAsync(t *testing.T, r gryffin.Renderer) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | 19 | url := "https://www.yahoo.com/" 20 | 21 | s := gryffin.NewScan("GET", url, "") 22 | r.Do(s) 23 | <-r.GetRequestBody() 24 | for link := range r.GetLinks() { 25 | t.Logf("Got link %s", link.Request.URL) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /fuzzer/dummy/dummy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package dummy 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | // Fuzzer is the handle for the fuzzing methods. 15 | type Fuzzer struct{} 16 | 17 | // Fuzz runs a dummy scan. 
18 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 19 | 20 | cmd := exec.Command("echo", g.Request.URL.Host) 21 | _, err = cmd.Output() 22 | 23 | g.Logm("Dummy.Scan", fmt.Sprintf("Echo return %t", cmd.ProcessState.Success())) 24 | return 0, err 25 | 26 | } 27 | -------------------------------------------------------------------------------- /fuzzer/arachni/arachni_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package arachni 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func TestFuzzer(t *testing.T) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | s := &Fuzzer{} 19 | scan := gryffin.NewScan("GET", "http://127.0.0.1:8081/xss/reflect/full1?in=change_me", "") 20 | c, err := s.Fuzz(scan) 21 | if err != nil { 22 | t.Error(err) 23 | } 24 | if c == 0 { 25 | t.Error("No issue detected.") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /fuzzer/sqlmap/sqlmap_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package sqlmap 6 | 7 | import ( 8 | "os" 9 | "testing" 10 | 11 | "github.com/yahoo/gryffin" 12 | ) 13 | 14 | func TestFuzzer(t *testing.T) { 15 | if os.Getenv("INTEGRATION") == "" { 16 | t.Skip("Skip integration tests.") 17 | } 18 | 19 | s := &Fuzzer{} 20 | scan := gryffin.NewScan("GET", "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit", "") 21 | c, err := s.Fuzz(scan) 22 | if err != nil { 23 | t.Error(err) 24 | } 25 | if c == 0 { 26 | t.Error("No issue detected.") 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /cmd/gryffin-standalone/main_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "net/http" 9 | "net/http/httptest" 10 | "os" 11 | "testing" 12 | 13 | "github.com/yahoo/gryffin" 14 | ) 15 | 16 | var h = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 17 | w.Write([]byte("Hello World")) 18 | }) 19 | 20 | var ts = httptest.NewServer(h) 21 | 22 | func TestMain(t *testing.T) { 23 | if os.Getenv("INTEGRATION") == "" { 24 | t.Skip("Skip integration tests.") 25 | } 26 | scan := gryffin.NewScan("GET", ts.URL, "") 27 | linkChannels(scan) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /global.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package gryffin 6 | 7 | import ( 8 | "io" 9 | "sync" 10 | ) 11 | 12 | var ( 13 | memoryStore *GryffinStore 14 | logWriter io.Writer 15 | memoryStoreMu sync.Mutex 16 | logWriterMu sync.Mutex 17 | ) 18 | 19 | // SetMemoryStore sets the package internal global variable 20 | // for the memory store. 21 | func SetMemoryStore(m *GryffinStore) { 22 | memoryStoreMu.Lock() 23 | memoryStore = m 24 | memoryStoreMu.Unlock() 25 | } 26 | 27 | // SetLogWriter sets the log writer. 28 | func SetLogWriter(w io.Writer) { 29 | logWriterMu.Lock() 30 | logWriter = w 31 | logWriterMu.Unlock() 32 | } 33 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This Makefile is adopted from https://github.com/hashicorp/consul/blob/master/Makefile 3 | 4 | all: format build 5 | 6 | cov: 7 | gocov test | gocov-html > /tmp/coverage.html 8 | open /tmp/coverage.html 9 | 10 | build: test 11 | cd cmd/gryffin-standalone; go build 12 | 13 | test: 14 | go test ./... 15 | @$(MAKE) vet 16 | 17 | test-mono: 18 | go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8081" 19 | go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit" 20 | 21 | 22 | test-integration: 23 | INTEGRATION=1 go test ./... 24 | 25 | test-cover: 26 | go test --cover ./... 27 | 28 | format: 29 | @gofmt -l . 30 | 31 | vet: 32 | @go vet ./... 33 | 34 | .PHONY: all cov build test vet web web-push 35 | -------------------------------------------------------------------------------- /data/store_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package data 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func testStore(t *testing.T, s Store) { 12 | s.Set("hello", "world") 13 | if v, ok := s.Get("hello"); !ok || v != "world" { 14 | t.Error("Get and Set is inconsistent.", v) 15 | } 16 | 17 | s.Set("foo", 100) 18 | if n := s.IncrBy("foo", 10); n != 110 { 19 | t.Error("Incr failed.") 20 | } 21 | if v, ok := s.Get("foo"); v.(int64) != 110 { 22 | t.Errorf("Incr is inconsistent %t, %t and %s", ok, v.(int64) == 110, v) 23 | } 24 | 25 | } 26 | 27 | func benchStore(b *testing.B, s Store) { 28 | s.Set("hello", "world") 29 | s.Set("foo", 100) 30 | s.IncrBy("foo", 10) 31 | } 32 | -------------------------------------------------------------------------------- /cmd/gryffin-distributed/main_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | // Unit test for gryffin-distributed is still on todo list. 8 | // 9 | // import ( 10 | // "net/http" 11 | // "net/http/httptest" 12 | // "os" 13 | // "testing" 14 | 15 | // "github.com/yahoo/gryffin" 16 | // ) 17 | 18 | // var handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 19 | // w.Write([]byte("Hello World")) 20 | // }) 21 | 22 | // var ts = httptest.NewServer(handler) 23 | 24 | // func TestMain(t *testing.T) { 25 | // if os.Getenv("INTEGRATION") == "" { 26 | // t.Skip("Skip integration tests.") 27 | // } 28 | // scan := gryffin.NewScan("GET", ts.URL, "") 29 | // linkChannels(scan) 30 | 31 | // } 32 | -------------------------------------------------------------------------------- /renderer/base.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package renderer 6 | 7 | import ( 8 | "github.com/yahoo/gryffin" 9 | ) 10 | 11 | type BaseRenderer struct { 12 | chanResponse chan *gryffin.Scan 13 | chanLinks chan *gryffin.Scan 14 | done chan string // done, notify with a string of the "reason", e.g. terminated, completed, etc. 15 | } 16 | 17 | func (r *BaseRenderer) Do(s *gryffin.Scan) { 18 | // Dummy operation, just close the channels. 19 | defer close(r.chanResponse) 20 | defer close(r.chanLinks) 21 | defer close(r.done) 22 | } 23 | 24 | func (r *BaseRenderer) GetRequestBody() <-chan *gryffin.Scan { 25 | return r.chanResponse 26 | } 27 | 28 | func (r *BaseRenderer) GetLinks() <-chan *gryffin.Scan { 29 | return r.chanLinks 30 | } 31 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Linux 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | go: [ '1.14.2', '1.13' ] 17 | name: Go ${{ matrix.go }} build 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Setup go 21 | uses: actions/setup-go@v1 22 | with: 23 | go-version: ${{ matrix.go }} 24 | 25 | - name: go vet 26 | run: go vet -v ./... 27 | 28 | - name: Basic build 29 | run: go build ./cmd/... 30 | 31 | - name: Run tests on linux 32 | run: go test ./... 33 | 34 | - name: Run tests with race detector 35 | run: go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... 
36 | 37 | - name: Upload coverage to Codecov 38 | uses: codecov/codecov-action@v1 39 | with: 40 | file: ./coverage.txt 41 | -------------------------------------------------------------------------------- /html-distance/bktree_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package distance 6 | 7 | import ( 8 | "testing" 9 | ) 10 | 11 | func TestNewOracle(t *testing.T) { 12 | // just add 0 and 1. 13 | oracle := NewOracle() 14 | for i := uint64(1); i < 2; i++ { 15 | oracle.See(i) 16 | } 17 | r := uint8(2) 18 | for i := uint64(0); i < 30; i++ { 19 | t.Logf("Has the oracle seen anything closed to %02d (%08b) within distance of %d? %t", i, i, r, oracle.Seen(i, r)) 20 | } 21 | 22 | } 23 | 24 | func BenchmarkOracleSee(b *testing.B) { 25 | oracle := NewOracle() 26 | for i := 0; i < b.N; i++ { 27 | // for i := uint64(1); i < 10000; i++ { 28 | oracle.See(uint64(i)) 29 | // } 30 | } 31 | } 32 | 33 | func BenchmarkOracleSeen(b *testing.B) { 34 | oracle := NewOracle() 35 | for i := uint64(1); i < 1000000; i++ { 36 | oracle.See(i) 37 | } 38 | b.ResetTimer() 39 | r := uint8(2) 40 | for i := 0; i < b.N; i++ { 41 | oracle.Seen(uint64(i), r) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= 2 | github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= 3 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 h1:bjfMeqxWEJ6IRUvGkiTkSwx0a6UdQJsbirRSoXogteY= 4 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6/go.mod h1:WVJJvUw/pIOcwu2O8ZzHEhmigq2jzwRNfJVRMJB7bR8= 5 | github.com/nsqio/go-nsq v1.0.8 
h1:3L2F8tNLlwXXlp2slDUrUWSBn2O3nMh8R1/KEDFTHPk= 6 | github.com/nsqio/go-nsq v1.0.8/go.mod h1:vKq36oyeVXgsS5Q8YEO7WghqidAVXQlcFxzQbQTuDEY= 7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 8 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0 h1:Jcxah/M+oLZ/R4/z5RzfPzGbPXnVDPkEDtf2JnuxN+U= 9 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= 10 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 11 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 12 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 13 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 14 | -------------------------------------------------------------------------------- /session_test.go: -------------------------------------------------------------------------------- 1 | package gryffin 2 | 3 | import ( 4 | "sync" 5 | "testing" 6 | "time" 7 | ) 8 | 9 | func TestNewGryffinStore(t *testing.T) { 10 | 11 | t.Parallel() 12 | 13 | store1 := NewSharedGryffinStore() 14 | store2 := NewSharedGryffinStore() 15 | 16 | var wg sync.WaitGroup 17 | wg.Add(1) 18 | 19 | go func() { 20 | store1.See("foo", "oracle", uint64(0x1234)) 21 | b := <-store1.GetSndChan() 22 | t.Log("Store1 got ", string(b)) 23 | store2.GetRcvChan() <- b 24 | 25 | store1.See("foo", "hash", uint64(0x5678)) 26 | b = <-store1.GetSndChan() 27 | t.Log("Store1 got ", string(b)) 28 | store2.GetRcvChan() <- b 29 | wg.Done() 30 | }() 31 | 32 | wg.Wait() 33 | for i := 0; i < 100000; i++ { 34 | if store2.Seen("foo", "oracle", uint64(0x1234), 2) { 35 | t.Logf("Store2 see the new oracle value in %d microseconds.", i) 36 | break 37 | } 38 | time.Sleep(1 * time.Microsecond) 39 | } 40 | 41 | if !store2.Seen("foo", "oracle", uint64(0x1234), 2) { 42 | t.Error("2nd store 
should see the oracle value in oracle.", store2.Oracles) 43 | } 44 | 45 | for i := 0; i < 100000; i++ { 46 | if store2.Seen("foo", "hash", uint64(0x5678), 2) { 47 | t.Logf("Store2 see the new hash value in %d microseconds.", i) 48 | break 49 | } 50 | time.Sleep(1 * time.Microsecond) 51 | } 52 | 53 | if !store2.Seen("foo", "hash", uint64(0x5678), 2) { 54 | t.Error("2nd store should see the hash value in hashes.", store2.Hashes) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Yahoo Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Yahoo Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /serialize.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gryffin 6 | 7 | import ( 8 | "encoding/json" 9 | "log" 10 | "net/http" 11 | ) 12 | 13 | // NewScanFromJson creates a Scan from the passed JSON blob. 14 | func NewScanFromJson(b []byte) *Scan { 15 | // ensure we got a memory store.. 16 | if memoryStore == nil { 17 | memoryStore = NewGryffinStore() 18 | } 19 | 20 | var scan Scan 21 | json.Unmarshal(b, &scan) 22 | return &scan 23 | } 24 | 25 | // Json serializes Scan as JSON. 26 | func (s *Scan) Json() []byte { 27 | ss := &SerializableScan{ 28 | s, 29 | &SerializableRequest{s.Request, ""}, 30 | &SerializableResponse{ 31 | s.Response, 32 | &SerializableRequest{s.Request, ""}, 33 | }, 34 | } 35 | b, err := json.Marshal(ss) 36 | if err != nil { 37 | log.Printf("Scan.Json: err=%v", err) 38 | s.Error("Json", err) 39 | } 40 | return b 41 | 42 | } 43 | 44 | // SerializableScan is a Scan extended with serializable 45 | // request and response fields. 
46 | type SerializableScan struct { 47 | *Scan 48 | Request *SerializableRequest 49 | Response *SerializableResponse 50 | } 51 | 52 | // SerializableResponse is a Scan extended with serializable 53 | // response field. 54 | type SerializableResponse struct { 55 | *http.Response 56 | Request *SerializableRequest 57 | } 58 | 59 | // SerializableRequest is a Scan extended with serializable 60 | // request field. 61 | type SerializableRequest struct { 62 | *http.Request 63 | Cancel string 64 | } 65 | -------------------------------------------------------------------------------- /fuzzer/arachni/arachni.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package arachni 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | "strings" 11 | 12 | "github.com/yahoo/gryffin" 13 | ) 14 | 15 | // Fuzzer is the handle for the fuzzing methods. 16 | type Fuzzer struct{} 17 | 18 | // Fuzz runs an Arachni scan. 
19 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 20 | var cookies []string 21 | // for _, c := range g.CookieJar.Cookies(g.Request.URL) { 22 | for _, c := range g.Cookies { 23 | cookies = append(cookies, c.String()) 24 | } 25 | 26 | args := []string{ 27 | "--checks", "xss*", 28 | "--output-only-positives", 29 | "--http-request-concurrency", "1", 30 | "--http-request-timeout", "10000", 31 | "--timeout", "00:03:00", 32 | "--scope-dom-depth-limit", "0", 33 | "--scope-directory-depth-limit", "0", 34 | "--scope-page-limit", "1", 35 | "--audit-with-both-methods", 36 | "--report-save-path", "/dev/null", 37 | "--snapshot-save-path", "/dev/null", 38 | } 39 | 40 | // TODO: Post method 41 | 42 | // Cookie 43 | if len(cookies) > 0 { 44 | args = append(args, "--http-cookie-string", strings.Join(cookies, ";")) 45 | } 46 | 47 | args = append(args, g.Request.URL.String()) 48 | 49 | cmd := exec.Command("arachni", args...) 50 | 51 | g.Logm("Arachni.Scan", fmt.Sprintf("Run as %s", cmd.Args)) 52 | 53 | output, err := cmd.Output() 54 | 55 | count = s.extract(g, string(output)) 56 | 57 | if err != nil { 58 | return 59 | } 60 | 61 | g.Logm("Arachni.Scan", fmt.Sprintf("Arachni return %t", cmd.ProcessState.Success())) 62 | return 63 | 64 | } 65 | 66 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) { 67 | for _, l := range strings.Split(output, "\n") { 68 | l = strings.TrimSpace(l) 69 | switch { 70 | case strings.HasPrefix(l, "[~] Affected page"): 71 | g.Logm("Arachni.Findings", l) 72 | count++ 73 | } 74 | } 75 | 76 | return 77 | } 78 | -------------------------------------------------------------------------------- /renderer/noscript.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package renderer 6 | 7 | import ( 8 | "fmt" 9 | "log" 10 | "net/http" 11 | "strings" 12 | "time" 13 | 14 | // "sync" 15 | 16 | "github.com/yahoo/gryffin" 17 | "golang.org/x/net/html" 18 | ) 19 | 20 | // allow 100 crawling in the machine (regardless of domains) 21 | 22 | type NoScriptRenderer struct { 23 | BaseRenderer 24 | } 25 | 26 | func (r *NoScriptRenderer) Do(s *gryffin.Scan) { 27 | r.chanResponse = make(chan *gryffin.Scan, 10) 28 | r.chanLinks = make(chan *gryffin.Scan, 10) 29 | 30 | crawl := func() { 31 | 32 | defer close(r.chanResponse) 33 | defer close(r.chanLinks) 34 | 35 | client := &http.Client{} 36 | 37 | client.Timeout = time.Duration(3) * time.Second 38 | 39 | if response, err := client.Do(s.Request); err == nil { 40 | s.Response = response 41 | } else { 42 | s.Logm("NoScriptRenderer", fmt.Sprintf("error in building request: %s", err)) 43 | return 44 | } 45 | 46 | s.ReadResponseBody() 47 | 48 | if s.IsDuplicatedPage() { 49 | return 50 | } 51 | 52 | tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody)) 53 | 54 | r.chanResponse <- s 55 | 56 | for { 57 | t := tokenizer.Next() 58 | 59 | switch t { 60 | 61 | case html.ErrorToken: 62 | return 63 | 64 | case html.StartTagToken: 65 | token := tokenizer.Token() 66 | if token.DataAtom.String() == "a" { 67 | for _, attr := range token.Attr { 68 | if attr.Key == "href" { 69 | link := s.Spawn() 70 | // TODO - we drop relative URL as it would drop "#". 71 | // Yet, how about real relative URLs? 72 | if req, err := http.NewRequest("GET", attr.Val, nil); err == nil { 73 | if true { 74 | // || req.URL.IsAbs() { 75 | link.MergeRequest(req) 76 | if link.IsScanAllowed() { 77 | r.chanLinks <- link 78 | } 79 | } 80 | // else { 81 | // FIXME: ignore relative URL. 82 | // } 83 | } else { 84 | log.Printf("error in building request: %s", err) 85 | } 86 | } 87 | } 88 | } 89 | } 90 | } 91 | 92 | // parse and find links. 
93 | 94 | } 95 | 96 | go crawl() 97 | } 98 | -------------------------------------------------------------------------------- /html-distance/bktree.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package distance is a go library for computing the proximity of the HTML pages. 6 | // The implementation similiarity fingerprint is Charikar's simhash. 7 | // 8 | // Distance is the hamming distance of the fingerprints. Since fingerprint is 9 | // of size 64 (inherited from hash/fnv), Similiarity is defined as 1 - d / 64. 10 | // 11 | // In normal scenario, similarity > 95% (i.e. d>3) could be considered as duplicated html pages. 12 | package distance 13 | 14 | import ( 15 | "sync" 16 | 17 | "github.com/mfonda/simhash" 18 | ) 19 | 20 | // Oracle answers the query if a fingerprint has been seen. 21 | type Oracle struct { 22 | fingerprint uint64 // node value. 23 | nodes [65]*Oracle // leaf nodes 24 | mu sync.Mutex 25 | } 26 | 27 | // NewOracle return an oracle that could tell if the fingerprint has been seen or not. 28 | func NewOracle() *Oracle { 29 | return newNode(0) 30 | } 31 | 32 | func newNode(f uint64) *Oracle { 33 | return &Oracle{fingerprint: f} 34 | } 35 | 36 | // Distance return the similarity distance between two fingerprint. 37 | func Distance(a, b uint64) uint8 { 38 | return simhash.Compare(a, b) 39 | } 40 | 41 | // See asks the oracle to see the fingerprint. 42 | func (n *Oracle) See(f uint64) *Oracle { 43 | d := Distance(n.fingerprint, f) 44 | 45 | if d == 0 { 46 | // current node with same fingerprint. 
47 | return n 48 | } 49 | 50 | // the target node is already set, 51 | n.mu.Lock() 52 | defer n.mu.Unlock() 53 | if c := n.nodes[d]; c != nil { 54 | return c.See(f) 55 | } 56 | 57 | n.nodes[d] = newNode(f) 58 | return n.nodes[d] 59 | } 60 | 61 | // Seen asks the oracle if anything closed to the fingerprint in a range (r) is seen before. 62 | func (n *Oracle) Seen(f uint64, r uint8) bool { 63 | d := Distance(n.fingerprint, f) 64 | if d < r { 65 | return true 66 | } 67 | 68 | // TODO - should search from d, d-1, d+1, ... until d-r and d+r, for best performance 69 | for k := d - r; k <= d+r; k++ { 70 | if k > 64 { 71 | break 72 | } 73 | n.mu.Lock() 74 | c := n.nodes[k] 75 | n.mu.Unlock() 76 | if c != nil { 77 | if c.Seen(f, r) { 78 | return true 79 | } 80 | } 81 | } 82 | return false 83 | } 84 | -------------------------------------------------------------------------------- /html-distance/README.md: -------------------------------------------------------------------------------- 1 | # html-distance 2 | 3 | html-distance is a go library for computing the proximity of the HTML pages. The implementation similiarity fingerprint is Charikar's simhash. 4 | 5 | We used BK Tree (Burkhard and Keller) for verifying if a fingerprint is closed to a set of fingerprint within a defined proximity distance. 6 | 7 | Distance is the hamming distance of the fingerprints. Since fingerprint is of size 64 (inherited from hash/fnv), Similiarity is defined as 1 - d / 64. 8 | 9 | In normal scenario, similarity > 95% (i.e. d>3) could be considered as duplicated html pages. 10 | 11 | 12 | ## Get the source 13 | 14 | ``` 15 | go get github.com/yahoo/gryffin/html-distance/... 
16 | ``` 17 | 18 | ## Install 19 | 20 | ``` 21 | go install github.com/yahoo/gryffin/html-distance/cmd/html-distance 22 | ``` 23 | 24 | ## Command Line Interface 25 | 26 | ``` 27 | Usage of html-distance: 28 | 29 | html-distance url1 url2 30 | ``` 31 | 32 | Example 1 33 | ``` 34 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/ 35 | 36 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200 37 | Fetching https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/, Got 200 38 | Feature distance is 0. HTML Similarity is 100.00% 39 | ``` 40 | 41 | Example 2 42 | ``` 43 | $ html-distance https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html 44 | 45 | Fetching https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html, Got 200 46 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200 47 | Feature distance is 2. HTML Similarity is 96.88% 48 | ``` 49 | 50 | Example 3 51 | ``` 52 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html 53 | 54 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200 55 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200 56 | Feature distance is 9. HTML Similarity is 85.94% 57 | ``` 58 | -------------------------------------------------------------------------------- /renderer/resource/utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Yahoo Inc. All rights reserved. 
/**
 * Copyright 2015, Yahoo Inc. All rights reserved.
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 *
 * Shared helpers for the PhantomJS render scripts: URL parsing/filtering,
 * response repackaging, a buffer of page changes, and the signed-JSON line
 * format written to stdout.
 */

// Pre-compiled patterns:
//  - re_hostname: captures the hostname of an http/https/ftp URL.
//  - re_extensionFilter: static-asset extensions that are not worth crawling.
//  - re_jsAnalyticsFilter: the yimg.com ywa.js analytics script.
//  - re_whitelistedRedirectionDomains: Yahoo-owned domain suffixes.
var re_hostname = /^(?:https?|ftp):\/\/([^:\/\?]+)/i,
    re_extensionFilter = /\.(?:css|pdf|svg|ttf|zip|tar|gz|pkg|exe)(?:[\?#;][^\?#;]*)?$/i,
    re_jsAnalyticsFilter = /^https?:\/\/(?:\w+\.)?yimg\.com\/mi(?:\/[^\/]+)?\/ywa\.js$/i,
    re_whitelistedRedirectionDomains = /(?:yahoo\.com?(?:\.\w\w)?|yimg\.com|flickr\.com|y-cloud\.net|yahoodns\.net|yahoofs\.com|zenfs\.com)$/;

/**
 * Extract the hostname from an http/https/ftp URL.
 * @param {string} url
 * @returns {?string} hostname, or null when the URL does not match.
 */
exports.getHostname = function(url) {
    url = url.match(re_hostname);
    return url ? url[1] : null;
}
/**
 * A URL is invalid when no hostname can be extracted, or when an
 * allowedDomains list is given and the hostname is not in it.
 * @param {string} url
 * @param {?string[]} allowedDomains - exact hostnames; falsy disables the check.
 * @returns {boolean}
 */
exports.invalidUrl = function(url, allowedDomains) {
    url = exports.getHostname(url);
    return (url === null || (allowedDomains && allowedDomains.indexOf(url) === -1));
}
/**
 * True for URLs that should never be fetched: static-asset extensions or
 * the ywa.js analytics script.
 * @param {string} url
 * @returns {boolean}
 */
exports.blacklistedUrl = function(url) {
    return re_extensionFilter.test(url) || re_jsAnalyticsFilter.test(url);
}
/**
 * True when the URL's hostname ends with a Yahoo-owned domain suffix
 * (redirections to these are allowed).
 * @param {string} url
 * @returns {boolean}
 */
exports.whitelistedRedirectionDomains = function(url) {
    return re_whitelistedRedirectionDomains.test(exports.getHostname(url));
}

/**
 * Currently an identity pass-through (both branches return the body
 * unchanged for non-empty input); kept as a hook for response scrubbing.
 * @param {string} body
 * @returns {string}
 */
exports.cleanResponseBody = function(body) {
    return (body == '') ?
        '' : body;
}

// to repackage headers as a dict format, as required by scrappy
/**
 * @param {Object} response - PhantomJS network response object.
 * @param {Function} headersFilter - maps the raw headers array to a dict.
 * @returns {{headers: Object, contentType: string, status: number, url: string}}
 */
exports.prepareResponse = function(response, headersFilter) {
    return {
        headers: headersFilter(response.headers),
        contentType: response.contentType,
        status: response.status,
        url: response.url
    }
}

// TODO: add to redis
// Module-level buffer of page changes, keyed by event name.
// fetch() drains and returns one event's queue; fetchAll() drains everything;
// push() appends an entry. Draining resets the buffer so each change is
// reported at most once.
exports.pageChanges = (function() {
    var changes = {};
    return {
        fetch: function(eventName) {
            var ret = changes[eventName] || [];
            changes[eventName] = [];
            return ret;
        },
        fetchAll: function() {
            var ret = changes;
            changes = {};
            return ret;
        },
        push: function(eventName, obj) {
            changes[eventName] = changes[eventName] || [];
            changes[eventName].push(obj);
        }
    }
})();

// Marker embedded in every message; presumably lets the consumer pick
// Gryffin messages out of mixed console output — confirm against the reader.
var JSONSignature = '==lXlKfYWch7H9VdJgPCmJ==';

/**
 * Stamp the output object with its message type and the signature, then
 * write it to stdout as a single JSON line.
 * @param {string} type - message type tag, stored under 'msgType'.
 * @param {Object} output - mutated in place before serialisation.
 */
exports.printJSON = function(type, output) {
    output['msgType'] = type;
    output['signature'] = JSONSignature;
    console.log(JSON.stringify(output));
    // console.log(['{'+type, JSON.stringify(output), type+'}'].join(JSONSignature));
}
" + cookieHostname) 14 | 15 | // for(var i in headers) { 16 | // console.log("headers " + i) 17 | // console.log(headers[i]) 18 | // } 19 | 20 | 21 | if (!headers || typeof(headers) != 'object') return {}; 22 | 23 | // avoid requesting for gzipped/compressed content, i.e., Accept-Encoding and Accept request headers unconfigurable 24 | // gzip decompression is problematic: https://github.com/ariya/phantomjs/issues/10930 25 | // the following headers modification is moved to phantomjs.py 26 | // headers['Accept-Encoding'] = "identity"; 27 | // delete headers['Accept']; 28 | 29 | // make cookies available for subresources requests of the same hostname, otherwise, only the main page will receive cookie 30 | if (headers['Cookie']) { 31 | headers['Cookie'].split(';').forEach(function(cookie){ 32 | var eqIndex = cookie.indexOf('='); 33 | phantom.addCookie({ 34 | name: cookie.substr(0, eqIndex).trim(), 35 | value: cookie.substr(eqIndex + 1).trim(), 36 | domain: cookieHostname, // already defaulted to hostname of current page 37 | path: '/', httponly: true, secure: false 38 | }); 39 | }); 40 | delete headers['Cookie']; 41 | } 42 | 43 | 44 | // User-Agent in request header must be explicitly configured thru settings.userAgent 45 | Object.keys(headers).forEach(function(headerName){ 46 | if (headerName.toLowerCase() == 'user-agent') { 47 | page.settings.userAgent = headers[headerName]; 48 | delete headers[headerName]; 49 | } 50 | }); 51 | 52 | return headers; 53 | } 54 | 55 | 56 | function getRespHeaders(headers) { 57 | var out = {}; 58 | headers && headers.forEach(function(h){ 59 | // the following headers are stripped to prevent decoding twice by scrapy 60 | var name = h.name.toLowerCase(), value = h.value.toLowerCase(); 61 | if ((name == 'content-encoding' && ['gzip','deflate'].indexOf(value) != -1) 62 | || (name == 'transfer-encoding' && value == 'chunked')) 63 | return; 64 | 65 | name = h.name; 66 | out[name] = out[name] || []; 67 | out[name].push(h.value); 68 | }); 69 | 
return out; 70 | } 71 | 72 | 73 | return { 74 | 'setReqHeaders': setReqHeaders, 75 | 'getRespHeaders': getRespHeaders 76 | }; 77 | } -------------------------------------------------------------------------------- /fuzzer/sqlmap/sqlmap.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package sqlmap 6 | 7 | import ( 8 | "fmt" 9 | "os/exec" 10 | "strconv" 11 | "strings" 12 | 13 | "github.com/yahoo/gryffin" 14 | ) 15 | 16 | // Fuzzer is the handle for the fuzzing methods. 17 | type Fuzzer struct{} 18 | 19 | // Fuzz runs an sqlmap scan. 20 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) { 21 | 22 | var cookies []string 23 | 24 | // for _, c := range g.CookieJar.Cookies(g.Request.URL) { 25 | for _, c := range g.Cookies { 26 | cookies = append(cookies, c.String()) 27 | } 28 | 29 | args := []string{ 30 | "--batch", 31 | "--timeout=2", 32 | "--retries=3", 33 | "--crawl=0", 34 | "--disable-coloring", 35 | "-o", 36 | "--text-only", 37 | // "--threads=4", 38 | "-v", "0", 39 | "--level=1", 40 | "--risk=1", 41 | "--smart", 42 | "--fresh-queries", 43 | "--purge-output", 44 | "--os=Linux", 45 | "--dbms=MySQL", 46 | "--delay=0.1", 47 | "--time-sec=1", 48 | } 49 | 50 | // TODO: Post method 51 | // if g.RequestBody != "" { 52 | // args = append(args, fmt.Sprintf("--data=..." 53 | // } 54 | 55 | // only for integer based injection. 
56 | var testable []string 57 | for k, vs := range g.Request.URL.Query() { 58 | for _, v := range vs { 59 | _, err := strconv.ParseInt(v, 10, 64) 60 | if err == nil { 61 | // query param value is an integer 62 | testable = append(testable, k) 63 | } 64 | } 65 | } 66 | if len(testable) > 0 { 67 | args = append(args, "-p", strings.Join(testable, ",")) 68 | } 69 | 70 | // Cookie 71 | if len(cookies) > 0 { 72 | fmt.Println(cookies) 73 | args = append(args, "--cookie", strings.Join(cookies, ";")) 74 | } 75 | 76 | args = append(args, "-u", g.Request.URL.String()) 77 | 78 | cmd := exec.Command("sqlmap", args...) 79 | 80 | g.Logm("SQLMap.Scan", fmt.Sprintf("Run as %s", cmd.Args)) 81 | 82 | output, err := cmd.Output() 83 | 84 | if err != nil { 85 | return 86 | } 87 | 88 | count = s.extract(g, string(output)) 89 | 90 | g.Logm("SQLMap.Scan", fmt.Sprintf("SQLMap return %t", cmd.ProcessState.Success())) 91 | return 92 | 93 | } 94 | 95 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) { 96 | 97 | for _, l := range strings.Split(output, "\n") { 98 | l = strings.TrimSpace(l) 99 | switch { 100 | case strings.HasPrefix(l, "Payload: "): 101 | g.Logm("SQLMap.Findings", l) 102 | count++ 103 | } 104 | } 105 | 106 | return 107 | } 108 | -------------------------------------------------------------------------------- /data/memory.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package data 6 | 7 | import ( 8 | // "log" 9 | // "reflect" 10 | "strings" 11 | "sync/atomic" 12 | ) 13 | 14 | // MemoryStore is an implementation for memory based data store. 15 | type MemoryStore struct { 16 | heap map[string]interface{} 17 | } 18 | 19 | // Set stores the key value pair. 
// MemoryStore is an implementation for memory based data store.
//
// Integer values of any built-in width are normalised to an internally
// stored *int64 so that IncrBy can update them atomically; Get converts
// them back to a plain int64 transparently. The map itself is not
// synchronised — concurrent Set/Get/DelPrefix need external locking
// (IncrBy is atomic only with respect to other IncrBy calls on a key
// that is already present).
type MemoryStore struct {
	heap map[string]interface{}
}

// Set stores the key value pair and always returns true.
// Integer values (signed or unsigned) are stored as *int64.
// NOTE: uint64 values above math.MaxInt64 wrap to negative numbers.
func (m *MemoryStore) Set(key string, value interface{}) bool {
	switch value.(type) {

	case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64:
		s, _ := convertIntToPtr(value)
		m.heap[key] = s

	default:
		m.heap[key] = value
	}
	return true
}

// Get retrieves the value pointed by the key.
// Values stored as integer pointers are dereferenced back to int64;
// ok is false when the key is absent.
func (m *MemoryStore) Get(key string) (value interface{}, ok bool) {
	value, ok = m.heap[key]
	switch value.(type) {
	case *int, *int8, *int16, *int32, *int64, *uint, *uint8, *uint16, *uint32, *uint64:
		s, ok := convertPtrToInt(value)
		return s, ok
	default:
		return value, ok
	}
}

// IncrBy increments the value pointed by key with the delta, and return the new value.
// A missing key is initialised to zero first, so IncrBy on a fresh key
// returns delta (previously this panicked on a nil type assertion).
// An existing non-integer value still panics, as before.
func (m *MemoryStore) IncrBy(key string, delta int64) (newVal int64) {
	v, ok := m.heap[key]
	if !ok {
		v = new(int64)
		m.heap[key] = v
	}
	return atomic.AddInt64(v.(*int64), delta)
}

// DelPrefix deletes records from the MemoryStore's heap
// when the keys match the given prefix.
func (m *MemoryStore) DelPrefix(prefix string) {
	for k := range m.heap {
		if strings.HasPrefix(k, prefix) {
			delete(m.heap, k)
		}
	}
}

// Publish is a dummy no-op method (kept so MemoryStore satisfies the
// same interface as shared stores).
func (m *MemoryStore) Publish(k string, d interface{}) {

}

// NewMemoryStore creates the new store with an empty heap.
func NewMemoryStore() *MemoryStore {
	m := MemoryStore{
		heap: make(map[string]interface{}),
	}
	return &m
}

// convertIntToPtr converts any built-in integer value to a *int64 suitable
// for atomic updates. ok reports whether v actually was an integer
// (previously ok was never assigned and was always false).
func convertIntToPtr(v interface{}) (s *int64, ok bool) {
	var t int64

	switch v := v.(type) {

	case int:
		t = int64(v)
	case int8:
		t = int64(v)
	case int16:
		t = int64(v)
	case int32:
		t = int64(v)
	case int64:
		t = v
	case uint:
		t = int64(v)
	case uint8:
		t = int64(v)
	case uint16:
		t = int64(v)
	case uint32:
		t = int64(v)
	case uint64:
		// NOTE: values above math.MaxInt64 wrap to negative numbers.
		t = int64(v)
	default:
		// not an integer type: report failure with a zero-valued pointer.
		return &t, false
	}

	return &t, true
}

// convertPtrToInt dereferences any built-in integer pointer back to int64.
// ok is false (with s == 0) for non-integer-pointer inputs.
func convertPtrToInt(v interface{}) (s int64, ok bool) {

	switch v := v.(type) {

	case *int:
		return int64(*v), true
	case *int8:
		return int64(*v), true
	case *int16:
		return int64(*v), true
	case *int32:
		return int64(*v), true
	case *int64:
		return *v, true

	case *uint:
		return int64(*v), true
	case *uint8:
		return int64(*v), true
	case *uint16:
		return int64(*v), true
	case *uint32:
		return int64(*v), true
	case *uint64:
		return int64(*v), true
	}

	return

}
with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c" 18 | func Fingerprint(r io.Reader, shingle int) uint64 { 19 | if shingle < 1 { 20 | shingle = 1 21 | } 22 | // collect the features via this cf channel. 23 | cf := make(chan string, 1000) 24 | cs := make(chan uint64, 1000) 25 | v := simhash.Vector{} 26 | 27 | // Tokenize and then Generate Features. . 28 | go func() { 29 | defer close(cf) 30 | z := html.NewTokenizer(r) 31 | // TODO - export the max token count as an function argument. 32 | count := 0 33 | for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() { 34 | t := z.Token() 35 | count++ 36 | genFeatures(&t, cf) 37 | } 38 | 39 | }() 40 | 41 | // Collect the features. 42 | go func() { 43 | defer close(cs) 44 | a := make([][]byte, shingle) 45 | for f := <-cf; f != ""; f = <-cf { 46 | // shingle: generate the k-gram token as a single feature. 47 | a = append(a[1:], []byte(f)) 48 | // fmt.Printf("%#v\n", a) 49 | // fmt.Printf("%s\n", bytes.Join(a, []byte(" "))) 50 | cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum() 51 | // cs <- simhash.NewFeature([]byte(f)).Sum() 52 | } 53 | }() 54 | 55 | // from the checksum (of feature), append to vector. 
56 | for s := <-cs; s != 0; s = <-cs { 57 | for i := uint8(0); i < 64; i++ { 58 | bit := ((s >> i) & 1) 59 | if bit == 1 { 60 | v[i]++ 61 | } else { 62 | v[i]-- 63 | } 64 | } 65 | } 66 | 67 | return simhash.Fingerprint(v) 68 | 69 | } 70 | 71 | func genFeatures(t *html.Token, cf chan<- string) { 72 | 73 | s := "" 74 | 75 | switch t.Type { 76 | case html.StartTagToken: 77 | s = "A:" + t.DataAtom.String() 78 | case html.EndTagToken: 79 | s = "B:" + t.DataAtom.String() 80 | case html.SelfClosingTagToken: 81 | s = "C:" + t.DataAtom.String() 82 | case html.DoctypeToken: 83 | s = "D:" + t.DataAtom.String() 84 | case html.CommentToken: 85 | s = "E:" + t.DataAtom.String() 86 | case html.TextToken: 87 | s = "F:" + t.DataAtom.String() 88 | case html.ErrorToken: 89 | s = "Z:" + t.DataAtom.String() 90 | } 91 | // fmt.Println(s) 92 | cf <- s 93 | 94 | for _, attr := range t.Attr { 95 | switch attr.Key { 96 | case "class": 97 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 98 | // case "id": 99 | // s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 100 | case "name": 101 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 102 | case "rel": 103 | s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val 104 | default: 105 | s = "G:" + t.DataAtom.String() + ":" + attr.Key 106 | } 107 | // fmt.Println(s) 108 | cf <- s 109 | } 110 | 111 | // fmt.Println(s) 112 | 113 | } 114 | -------------------------------------------------------------------------------- /html-distance/feature_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package distance 6 | 7 | import ( 8 | "bytes" 9 | "io/ioutil" 10 | "net/http" 11 | "strings" 12 | "testing" 13 | ) 14 | 15 | // var input = "

" 16 | // var input = "

te<&;xt

" 17 | var input = ` 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | net/token_test.go at master · golang/net 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | ` 35 | 36 | func TestCreateFingerprint(t *testing.T) { 37 | r := strings.NewReader(input) 38 | f := Fingerprint(r, 2) 39 | t.Logf("%064b", f) 40 | } 41 | 42 | func TestSee(t *testing.T) { 43 | 44 | oracle := NewOracle() 45 | 46 | tests := strings.Split(`b 47 | d 48 |

49 | 1

23

50 | 1 51 | 123 52 | 1
2
34
5
53 | 1

23

54 |

55 |

56 |

57 |

123

45 58 |
13
2
59 | AC
B
60 |
`, "\n") 61 | 62 | for _, test := range tests { 63 | r := strings.NewReader(test) 64 | f := Fingerprint(r, 2) 65 | oracle.See(f) 66 | t.Logf(" ---- for %064b %s.", f, test) 67 | } 68 | 69 | for _, test := range tests { 70 | _ = test 71 | ntest := "d" 72 | r := strings.NewReader(ntest) 73 | f := Fingerprint(r, 2) 74 | t.Logf("%t for %064b %s.", oracle.Seen(f, 2), f, ntest) 75 | } 76 | 77 | } 78 | 79 | func TestSeenWithExternalHTML(t *testing.T) { 80 | 81 | t.Skip("skip htmlsample test ..") 82 | oracle := NewOracle() 83 | 84 | f1, _ := ioutil.ReadFile("./htmlsamples/flickr001.html") 85 | f2, _ := ioutil.ReadFile("./htmlsamples/flickr002.html") 86 | f3, _ := ioutil.ReadFile("./htmlsamples/yahoo001.html") 87 | 88 | { 89 | r := bytes.NewReader(f1) 90 | f := Fingerprint(r, 2) 91 | oracle.See(f) 92 | } 93 | 94 | { 95 | r := bytes.NewReader(f2) 96 | f := Fingerprint(r, 2) 97 | t.Logf("found? %t", oracle.Seen(f, 2)) 98 | 99 | } 100 | 101 | { 102 | r := bytes.NewReader(f3) 103 | f := Fingerprint(r, 2) 104 | t.Logf("found? 
%t", oracle.Seen(f, 2)) 105 | 106 | } 107 | 108 | } 109 | 110 | func BenchmarkFingerprint(b *testing.B) { 111 | for i := 0; i < b.N; i++ { 112 | r := strings.NewReader(input) 113 | Fingerprint(r, 2) 114 | } 115 | } 116 | 117 | func BenchmarkFingerprintWithExternalHTML(b *testing.B) { 118 | 119 | b.Skip("Skip external dependent tests.") 120 | resp, err := http.Get("https://www.yahoo.com/") 121 | if err != nil { 122 | b.Fatal(err) 123 | } 124 | defer resp.Body.Close() 125 | input, err := ioutil.ReadAll(resp.Body) 126 | if err != nil { 127 | b.Fatal(err) 128 | } 129 | 130 | b.ResetTimer() 131 | 132 | for i := 0; i < b.N; i++ { 133 | r := bytes.NewReader(input) 134 | Fingerprint(r, 2) 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ARCHIVED 2 | 3 | 4 | Gryffin (beta) [![Build Status](https://travis-ci.org/yahoo/gryffin.svg?branch=master)](https://travis-ci.org/yahoo/gryffin) [![GoDoc](https://godoc.org/github.com/yahoo/gryffin?status.svg)](https://godoc.org/github.com/yahoo/gryffin) 5 | ========== 6 | 7 | Gryffin is a large scale web security scanning platform. It is not yet another scanner. It was written to solve two specific problems with existing scanners: coverage and scale. 8 | 9 | Better coverage translates to fewer false negatives. Inherent scalability translates to capability of scanning, and supporting a large elastic application infrastructure. Simply put, the ability to scan 1000 applications today to 100,000 applications tomorrow by straightforward horizontal scaling. 10 | 11 | ## Coverage 12 | Coverage has two dimensions - one during crawl and the other during fuzzing. In crawl phase, coverage implies being able to find as much of the application footprint. In scan phase, or while fuzzing, it implies being able to test each part of the application for an applied set of vulnerabilities in a deep. 
13 | 14 | #### Crawl Coverage 15 | Today a large number of web applications are template-driven, meaning the same code or path generates millions of URLs. For a security scanner, it just needs one of the millions of URLs generated by the same code or path. Gryffin's crawler does just that. 16 | 17 | ##### Page Deduplication 18 | At the heart of Gryffin is a deduplication engine that compares a new page with already seen pages. If the HTML structure of the new page is similar to those already seen, it is classified as a duplicate and not crawled further. 19 | 20 | ##### DOM Rendering and Navigation 21 | A large number of applications today are rich applications. They are heavily driven by client-side JavaScript. In order to discover links and code paths in such applications, Gryffin's crawler uses PhantomJS for DOM rendering and navigation. 22 | 23 | #### Scan Coverage 24 | As Gryffin is a scanning platform, not a scanner, it does not have its own fuzzer modules, even for fuzzing common web vulnerabilities like XSS and SQL Injection. 25 | 26 | It's not wise to reinvent the wheel where you do not have to. Gryffin at production scale at Yahoo uses open source and custom fuzzers. Some of these custom fuzzers might be open sourced in the future, and might or might not be part of the Gryffin repository. 27 | 28 | For demonstration purposes, Gryffin comes integrated with sqlmap and arachni. It does not endorse them or any other scanner in particular. 29 | 30 | The philosophy is to improve scan coverage by being able to fuzz for just what you need. 31 | 32 | ## Scale 33 | While Gryffin is available as a standalone package, it's primarily built for scale. 34 | 35 | Gryffin is built on the publisher-subscriber model. Each component is either a publisher, or a subscriber, or both. This allows Gryffin to scale horizontally by simply adding more subscriber or publisher nodes. 36 | 37 | ## Operating Gryffin 38 | 39 | ### Pre-requisites 40 | 41 | 1. Go - `go1.13` or later 42 | 2. 
PhantomJS, v2 43 | 3. Sqlmap (for fuzzing SQLi) 44 | 4. Arachni (for fuzzing XSS and web vulnerabilities) 45 | 5. NSQ , 46 | - running lookupd at port 4160,4161 47 | - running nsqd at port 4150,4151 48 | - with `--max-msg-size=5000000` 49 | 6. Kibana and Elastic search, for dashboarding 50 | - listening to JSON over port 5000 51 | - Preconfigured docker image available in https://hub.docker.com/r/yukinying/elk/ 52 | 53 | 54 | ### Installation 55 | 56 | ``` 57 | go get -u github.com/yahoo/gryffin/... 58 | ``` 59 | 60 | ### Run 61 | 62 | (WIP) 63 | 64 | ## TODO 65 | 66 | 1. Mobile browser user agent 67 | 2. Preconfigured docker images 68 | 3. Redis for sharing states across machines 69 | 4. Instruction to run gryffin (distributed or standalone) 70 | 5. Documentation for html-distance 71 | 6. Implement a JSON serializable cookiejar. 72 | 7. Identify duplicate url patterns based on simhash result. 73 | 74 | ## Talks and Slides 75 | 76 | - AppsecUSA 2015: [abstract](http://sched.co/3Vgm), [slide](http://go-talks.appspot.com/github.com/yukinying/talks/gryffin/gryffin.slide), [recording](https://youtu.be/IWiR2CPOHvc) 77 | 78 | ## Credits 79 | 80 | - Adonis Fung @ Yahoo, for the asynchronous phantomjs based crawler and DOM event navigator. 81 | - [Simhash algorithm](http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf) by Moses Charikar 82 | - Simhash implementation provided by [mfonda/simhash](https://github.com/mfonda/simhash). 83 | - [Sqlmap](http://sqlmap.org/) 84 | - [Arachni](http://www.arachni-scanner.com/) 85 | 86 | 87 | ## Licence 88 | 89 | Code licensed under the BSD-style license. See LICENSE file for terms. 90 | -------------------------------------------------------------------------------- /session.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package gryffin 6 | 7 | import ( 8 | "encoding/json" 9 | "fmt" 10 | "strconv" 11 | "sync" 12 | "time" 13 | 14 | distance "github.com/yahoo/gryffin/html-distance" 15 | ) 16 | 17 | // GryffinStore includes data and handles for Gryffin message processing, 18 | type GryffinStore struct { 19 | Oracles map[string]*distance.Oracle 20 | Hashes map[string]bool 21 | Hits map[string]int 22 | Mu sync.RWMutex 23 | // store data.Store - currently unused, TODO: use or remove 24 | snd chan []byte 25 | rcv chan []byte 26 | } 27 | 28 | // PublishMessage is the data in the messages handled by Gryffin. 29 | type PublishMessage struct { 30 | F string // function, i.e. See or Seen 31 | T string // type (kind), i.e. oracle or hash 32 | K string // key 33 | V string // value 34 | } 35 | 36 | func NewSharedGryffinStore() *GryffinStore { 37 | return newGryffinStore(true) 38 | } 39 | 40 | func NewGryffinStore() *GryffinStore { 41 | return newGryffinStore(false) 42 | } 43 | 44 | func newGryffinStore(shared bool) *GryffinStore { 45 | 46 | store := GryffinStore{ 47 | Oracles: make(map[string]*distance.Oracle), 48 | Hashes: make(map[string]bool), 49 | Hits: make(map[string]int), 50 | } 51 | 52 | if shared { 53 | store.snd = make(chan []byte, 10) 54 | store.rcv = make(chan []byte, 10) 55 | } 56 | 57 | // start a go rountine to read from the channel 58 | go store.processRcvMsg() 59 | 60 | return &store 61 | } 62 | 63 | func (s *GryffinStore) GetRcvChan() chan []byte { 64 | return s.rcv 65 | } 66 | 67 | func (s *GryffinStore) GetSndChan() chan []byte { 68 | return s.snd 69 | } 70 | 71 | func (s *GryffinStore) processRcvMsg() { 72 | for jsonPayload := range s.rcv { 73 | var m PublishMessage 74 | err := json.Unmarshal(jsonPayload, &m) 75 | if err != nil { 76 | fmt.Println("Error in processRcvMsg") 77 | continue 78 | } 79 | fmt.Println("Got a RcvMsg: ", m) // DEBUG 80 | if m.F == 
"See" { 81 | v, _ := strconv.ParseUint(m.V, 16, 64) 82 | switch m.T { 83 | case "hash": 84 | s.hashesSee(m.K, v, true) 85 | case "oracle": 86 | s.oracleSee(m.K, v, true) 87 | } 88 | } 89 | } 90 | } 91 | 92 | func (s *GryffinStore) See(prefix string, kind string, v uint64) { 93 | 94 | if kind == "oracle" { 95 | s.oracleSee(prefix, v, false) 96 | return 97 | } 98 | if kind == "hash" { 99 | s.hashesSee(prefix, v, false) 100 | return 101 | } 102 | } 103 | 104 | func (s *GryffinStore) Seen(prefix string, kind string, v uint64, r uint8) bool { 105 | 106 | switch kind { 107 | case "oracle": 108 | s.Mu.RLock() 109 | if oracle, ok := s.Oracles[prefix]; ok { 110 | s.Mu.RUnlock() 111 | return oracle.Seen(v, r) 112 | } 113 | s.Mu.RUnlock() 114 | case "hash": 115 | k := prefix + "/" + strconv.FormatUint(v, 10) 116 | s.Mu.RLock() 117 | _, ok := s.Hashes[k] 118 | s.Mu.RUnlock() 119 | return ok 120 | } 121 | return false 122 | } 123 | 124 | func (s *GryffinStore) oracleSee(prefix string, f uint64, localOnly bool) { 125 | k := prefix 126 | // Local update 127 | s.Mu.RLock() 128 | oracle, ok := s.Oracles[k] 129 | s.Mu.RUnlock() 130 | if !ok { 131 | s.Mu.Lock() 132 | s.Oracles[k] = distance.NewOracle() 133 | oracle = s.Oracles[k] 134 | s.Mu.Unlock() 135 | } 136 | oracle.See(f) 137 | 138 | // Remote update 139 | if !localOnly && s.snd != nil { 140 | go func() { 141 | jsonPayload, _ := json.Marshal(&PublishMessage{F: "See", T: "oracle", K: prefix, V: fmt.Sprintf("%x", f)}) 142 | // fmt.Println("Sending... 
", s.snd, string(jsonPayload)) 143 | s.snd <- jsonPayload 144 | }() 145 | } 146 | } 147 | 148 | func (s *GryffinStore) hashesSee(prefix string, h uint64, localOnly bool) { 149 | k := prefix + "/" + strconv.FormatUint(h, 10) 150 | s.Mu.Lock() 151 | s.Hashes[k] = true 152 | s.Mu.Unlock() 153 | // Remote update 154 | if !localOnly && s.snd != nil { 155 | go func() { 156 | jsonPayload, _ := json.Marshal(&PublishMessage{F: "See", T: "hash", K: prefix, V: fmt.Sprintf("%x", h)}) 157 | s.snd <- jsonPayload 158 | }() 159 | } 160 | } 161 | 162 | func (s *GryffinStore) Hit(prefix string) bool { 163 | // prefix is domain. 164 | ts := time.Now().Truncate(5 * time.Second).Unix() 165 | k := prefix + "/" + strconv.FormatInt(ts, 10) 166 | s.Mu.Lock() 167 | defer s.Mu.Unlock() 168 | if v, ok := s.Hits[k]; ok { 169 | if v >= 5 { 170 | return false 171 | } 172 | s.Hits[k]++ 173 | return true 174 | } 175 | s.Hits[k] = 1 176 | return true 177 | } 178 | -------------------------------------------------------------------------------- /cmd/gryffin-standalone/main.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "io" 11 | "net" 12 | "net/http" 13 | "os" 14 | "sync" 15 | "time" 16 | 17 | "github.com/yahoo/gryffin" 18 | "github.com/yahoo/gryffin/fuzzer/arachni" 19 | "github.com/yahoo/gryffin/fuzzer/sqlmap" 20 | "github.com/yahoo/gryffin/renderer" 21 | ) 22 | 23 | var method = flag.String("method", "GET", "the HTTP method for the request.") 24 | var url string 25 | var body = flag.String("data", "", "the data used in a (POST) request.") 26 | 27 | func usage() { 28 | fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) 29 | fmt.Fprintf(os.Stderr, "\tgryffin-standalone [flags] seed-url\n") 30 | fmt.Fprintf(os.Stderr, "Flags:\n") 31 | flag.PrintDefaults() 32 | } 33 | 34 | // THIS IS BAD CODE per https://blog.golang.org/pipelines, and is created for prototyping. 35 | // In production, we will move the channels out and use message queue instead. 36 | func linkChannels(s *gryffin.Scan) { 37 | 38 | var wg sync.WaitGroup 39 | 40 | chanStart := make(chan *gryffin.Scan, 10) 41 | chanRateLimit := make(chan *gryffin.Scan, 10) 42 | chanCrawl := make(chan *gryffin.Scan, 10) 43 | chanFuzz := make(chan *gryffin.Scan, 10) 44 | // defer close(chanStart) 45 | defer close(chanRateLimit) 46 | defer close(chanCrawl) 47 | defer close(chanFuzz) 48 | 49 | // TODO - name all of these functions. 50 | 51 | // Crawl -> Filter by Domain / Rate Limit 52 | go func() { 53 | 54 | for scan := range chanCrawl { 55 | r := &renderer.PhantomJSRenderer{Timeout: 10} 56 | scan.CrawlAsync(r) 57 | 58 | go func() { 59 | if s := <-r.GetRequestBody(); s != nil { 60 | // add two workers (two fuzzers) 61 | wg.Add(2) 62 | chanFuzz <- s 63 | } 64 | 65 | }() 66 | 67 | scan := scan // prevent capturing by goroutine below 68 | go func() { 69 | // 70 | // Renderer will close all channels when a page is duplicated. 
71 | // Therefore we don't need to test whether the link is coming 72 | // from a duplicated page or not 73 | for newScan := range r.GetLinks() { 74 | if ok := newScan.ShouldCrawl(); ok { 75 | // add one workers (a new crawl) 76 | wg.Add(1) 77 | chanRateLimit <- newScan 78 | } 79 | } 80 | // remove one worker (finish crawl) 81 | wg.Done() 82 | scan.Logm("Get Links", "Finished") 83 | 84 | }() 85 | 86 | } 87 | 88 | }() 89 | 90 | go func() { 91 | for scan := range chanFuzz { 92 | scan := scan // prevent capture by func literal below 93 | go func() { 94 | f := &arachni.Fuzzer{} 95 | f.Fuzz(scan) 96 | // remove a fuzzer worker. 97 | wg.Done() 98 | }() 99 | go func() { 100 | f := &sqlmap.Fuzzer{} 101 | f.Fuzz(scan) 102 | // remove a fuzzer worker. 103 | wg.Done() 104 | }() 105 | } 106 | 107 | }() 108 | 109 | // Rate Limit -> Crawl 110 | go func() { 111 | for scan := range chanRateLimit { 112 | if delay := scan.RateLimit(); delay != 0 { 113 | go func() { 114 | time.Sleep(time.Duration(delay) * time.Second) 115 | chanRateLimit <- scan 116 | }() 117 | // TODO queue it again. 
118 | continue 119 | } 120 | chanCrawl <- scan 121 | } 122 | }() 123 | 124 | // Start, Poke -> RateLimit 125 | go func() { 126 | for scan := range chanStart { 127 | // TODO: add error handling 128 | // err := scan.Poke(&http.Client{}) 129 | _ = scan.Poke(&http.Client{}) 130 | // if err != nil { 131 | // if scan.HitCount <= 5 { 132 | // go func() { 133 | // time.Sleep(5 * time.Second) 134 | // chanStart <- scan 135 | // }() 136 | // } 137 | // continue 138 | // } 139 | chanRateLimit <- scan 140 | } 141 | }() 142 | 143 | chanStart <- s 144 | close(chanStart) 145 | 146 | // add one worker (start crawl) 147 | wg.Add(1) 148 | wg.Wait() 149 | } 150 | 151 | func main() { 152 | 153 | flag.Usage = usage 154 | flag.Parse() 155 | 156 | switch flag.NArg() { 157 | case 1: 158 | url = flag.Arg(0) 159 | default: 160 | usage() 161 | return 162 | 163 | } 164 | 165 | fmt.Println("=== Running Gryffin ===") 166 | 167 | var w io.Writer 168 | // TCP port listening messages. 169 | tcpout, err := net.Dial("tcp", "localhost:5000") 170 | if err != nil { 171 | // fmt.Println("Cannot establish tcp connection to log listener.") 172 | w = os.Stdout 173 | } else { 174 | w = io.MultiWriter(os.Stdout, tcpout) 175 | } 176 | 177 | gryffin.SetLogWriter(w) 178 | 179 | scan := gryffin.NewScan(*method, url, *body) 180 | scan.Logm("Main", "Started") 181 | 182 | linkChannels(scan) 183 | 184 | fmt.Println("=== End Running Gryffin ===") 185 | 186 | } 187 | -------------------------------------------------------------------------------- /renderer/phantomjs.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
package renderer

import (
	"encoding/json"
	"io"
	"io/ioutil"
	"net/http"
	"net/url"
	"os"
	"os/exec"
	"reflect"
	"strconv"
	"strings"
	"time"

	"github.com/yahoo/gryffin"
	_ "github.com/yahoo/gryffin/renderer/resource"
)

/* all of these are the JSON struct for phantomjs render.js */

// PhantomJSRenderer drives an external phantomjs process running
// renderer/resource/render.js and turns its JSON output stream into scans.
type PhantomJSRenderer struct {
	BaseRenderer
	// Timeout is the maximum crawl duration in seconds before the phantomjs
	// process is killed (see wait()).
	Timeout int
	// process is the running phantomjs process, kept so kill() can stop it.
	process *os.Process
}

// input is the JSON options argument passed to render.js on the command line.
type input struct {
	Method         string       `json:"method"`
	AllowedDomains []string     `json:"allowed_domains,omitempty"`
	Headers        inputHeaders `json:"headers"`
}

// inputHeaders are the HTTP request headers forwarded to render.js.
type inputHeaders struct {
	AcceptEncoding string `json:"Accept-Encoding"`
	AcceptLanguage string `json:"Accept-Language"`
	Cookie         string
	UserAgent      string `json:"User-Agent"`
}

// details groups the link and form collections reported by render.js.
// Every field must be []link or []form: parseDetails walks the struct by
// reflection and only recognizes those two slice types.
type details struct {
	Links        []link
	Forms        []form
	ChildFrames  []link
	SubResources []link
	Redirects    []link
	MainFrame    []link
}

// link is a single anchor (text + URL) discovered by render.js.
type link struct {
	Text string
	Url  string
}

// form is a single HTML form discovered by render.js.
type form struct {
	Data     string
	DataType string
	Method   string
	Url      string
}

// response mirrors the "response" object emitted by render.js.
type response struct {
	Headers     map[string][]string
	Body        string
	ContentType string
	Status      int
	Url         string
	Details     details
}

// responseMessage is the top-level message carrying the page response.
type responseMessage struct {
	Response response
	Elapsed  int
	Ok       int
}

// domMessage is the top-level message describing DOM events and JS errors.
type domMessage struct {
	Action   string
	Events   []string
	KeyChain []string
	JSError  []string
}

// message is the union of all message shapes render.js can emit; the
// embedded pointers are nil for the variants not present in a given message.
type message struct {
	*responseMessage
	*domMessage
	*details
	Signature string
	MsgType   string
}

// noCloseReader adapts an io.Reader into a no-op io.ReadCloser so that an
// in-memory string body can be installed as http.Response.Body.
type noCloseReader struct {
	io.Reader
}

// Close is a no-op; the underlying reader is an in-memory string.
func (r noCloseReader) Close() error {
	return nil
}

// fill converts the render.js response into an *http.Response on the scan
// and reads the response body into scan.ResponseBody.
func (m *response) fill(s *gryffin.Scan) {

	/*
		Example of the raw message this was decoded from:
		{"response":{"headers":{"Date":["Thu, 30 Jul 2015 00:13:43 GMT"],"Set-Cookie":["B=82j3nrdarir1n&b=3&s=23; expires=Sun, 30-Jul-2017 00:13:43 GMT; path=/; domain=.yahoo.com"]

	*/
	// NOTE(review): Status is set to the bare numeric code as a string
	// (e.g. "200"); net/http convention is "200 OK". Confirm nothing
	// downstream expects the reason phrase.
	resp := &http.Response{
		Request:    s.Request,
		StatusCode: m.Status,
		Status:     strconv.FormatInt(int64(m.Status), 10),
		Proto:      "HTTP/1.1",
		ProtoMajor: 1,
		ProtoMinor: 1,
		Header:     m.Headers,
		Body:       noCloseReader{strings.NewReader(m.Body)},
	}

	s.Response = resp
	s.ReadResponseBody()

}

// toScan builds a child scan from a captured form submission. POST data
// becomes the request body; for any other method the data becomes the URL
// query string. Returns nil when the URL is invalid.
func (f *form) toScan(parent *gryffin.Scan) *gryffin.Scan {
	m := strings.ToUpper(f.Method)
	u := f.Url
	var r io.Reader
	if m == "POST" {
		r = ioutil.NopCloser(strings.NewReader(f.Data))
	} else {
		parsed, err := url.Parse(u)
		if err == nil {
			parsed.RawQuery = f.Data
			u = parsed.String()
		}
	}

	if req, err := http.NewRequest(m, u, r); err == nil {
		s := parent.Spawn()
		s.MergeRequest(req)
		return s
	}
	// invalid url
	return nil
}

// toScan builds a child GET scan for a discovered link.
// Returns nil when the URL is invalid.
func (l *link) toScan(parent *gryffin.Scan) *gryffin.Scan {
	if req, err := http.NewRequest("GET", l.Url, nil); err == nil {
		s := parent.Spawn()
		s.MergeRequest(req)
		return s
	}
	// invalid url
	return nil
}

// extract decodes the JSON message stream from phantomjs stdout, forwarding
// the page response and discovered links to the renderer channels. It closes
// r.done on EOF or when the page turns out to be a duplicate, which lets
// wait() clean up the process.
func (r *PhantomJSRenderer) extract(stdout io.ReadCloser, s *gryffin.Scan) {
	defer close(r.done)

	dec := json.NewDecoder(stdout)
	for {
		var m message
		err := dec.Decode(&m)
		if err == io.EOF {
			return
		}
		// NOTE(review): non-EOF decode errors are silently ignored; a
		// persistent syntax error from the decoder would make this loop
		// spin. Verify render.js output is always well-formed, or return
		// on any error.
		if m.responseMessage != nil {
			m.Response.fill(s)
			if s.IsDuplicatedPage() {
				return
			}
			r.chanResponse <- s
			r.parseDetails(&m.Response.Details, s)
		}

		if m.details != nil {
			r.parseDetails(m.details, s)
		}
	}
}

// parseDetails walks every field of details by reflection and, for each
// []link or []form entry that yields an allowed scan, queues it on
// r.chanLinks.
func (r *PhantomJSRenderer) parseDetails(d *details, s *gryffin.Scan) {
	v := reflect.ValueOf(*d)
	for i := 0; i < v.NumField(); i++ {
		if links, ok := v.Field(i).Interface().([]link); ok {
			for _, link := range links {
				if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
					r.chanLinks <- newScan
				}
			}
		}
		if forms, ok := v.Field(i).Interface().([]form); ok {
			for _, form := range forms {
				if newScan := form.toScan(s); newScan != nil && newScan.IsScanAllowed() {
					r.chanLinks <- newScan
				}
			}
		}
	}
}

// kill terminates the phantomjs process, logging the reason on success.
func (r *PhantomJSRenderer) kill(reason string, s *gryffin.Scan) {
	if err := r.process.Kill(); err == nil {
		s.Logmf("PhantomjsRenderer.Do", "[%s] Terminating the crawl process.", reason)
	}
}

// wait blocks until extract() finishes or the Timeout elapses, kills the
// phantomjs process either way, and then closes the output channels.
// NOTE(review): on the timeout path extract() may still be running and could
// send on a channel closed here — confirm the kill reliably unblocks the
// decoder before the close.
func (r *PhantomJSRenderer) wait(s *gryffin.Scan) {

	select {
	case <-r.done:
		r.kill("Cleanup", s)
	case <-time.After(time.Duration(r.Timeout) * time.Second):
		r.kill("Timeout", s)
	}
	close(r.chanResponse)
	close(r.chanLinks)
}

// Do launches phantomjs on the scan's URL and returns immediately; results
// are delivered asynchronously via GetRequestBody() and GetLinks().
func (r *PhantomJSRenderer) Do(s *gryffin.Scan) {

	r.chanResponse = make(chan *gryffin.Scan, 10)
	r.chanLinks = make(chan *gryffin.Scan, 10)
	r.done = make(chan string)

	// Construct the command.
	// render.js http(s)://<host>[:port][/path] [{"method":"post", "data":"a=1&b=2"}]
	url := s.Request.URL.String()
	cookies := make([]string, 0)
	// ua := s.Request.UserAgent()
	ua := "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

	for _, c := range s.Cookies {
		cookies = append(cookies, c.String())
	}

	arg := input{
		Method: s.Request.Method,
		Headers: inputHeaders{
			UserAgent: ua,
			Cookie:    strings.Join(cookies, ";"),
		},
	}

	opt, err := json.Marshal(arg)
	if err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	// s.Logmf("PhantomjsRenderer.Do", "Running: render.js %s '%s'", url, string(opt))
	s.Logmf("PhantomjsRenderer.Do", "Running: render.js")

	// NOTE(review): the render.js path is resolved via GOPATH, which breaks
	// under module builds where the source does not live in GOPATH — confirm
	// deployment layout or ship render.js alongside the binary.
	cmd := exec.Command(
		"phantomjs",
		"--ssl-protocol=any",
		"--ignore-ssl-errors=true",
		os.Getenv("GOPATH")+"/src/github.com/yahoo/gryffin/renderer/resource/render.js",
		url,
		string(opt))

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	if err := cmd.Start(); err != nil {
		s.Error("PhantomjsRenderer.Do", err)
		return
	}

	r.process = cmd.Process

	// wait until done or timeout.
	go r.extract(stdout, s)
	go r.wait(s)

	// cmd.Wait will close the stdout pipe.
	go cmd.Wait()

}
--------------------------------------------------------------------------------
/gryffin_test.go:
--------------------------------------------------------------------------------
// Copyright 2015, Yahoo Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gryffin

import (
	"net/http"
	"net/http/httptest"
	"net/url"
	"os"
	"reflect"
	"strings"
	"testing"
)

// h returns a fixed "Hello World" body so that the fingerprint and
// response assertions below are stable.
var h = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
	w.Write([]byte("Hello World"))
})

// ts is a package-wide test server shared by all tests in this file.
var ts = httptest.NewServer(h)

// TestGenRandomID verifies GenRandomID returns a non-empty identifier.
func TestGenRandomID(t *testing.T) {
	t.Parallel()
	id := GenRandomID()
	if len(id) == 0 {
		t.Error("Empty ID from GenRandomID.")
	}
}

// TestNewScan verifies a scan can be built from a valid URL.
func TestNewScan(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	if s == nil {
		t.Error("Scan is nil.")
	}
	// TODO - verify s.DomainAllowed.
}

// TestNewScanInvalid verifies NewScan returns nil for an unparsable URL.
func TestNewScanInvalid(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", "%a", "")
	if s != nil {
		t.Error("Scan is not nil with invalid URL.", s.Request)
	}
}

// this test fails due to JSON Marshal of http.Response.Body
// func TestNewScanFromJson(t *testing.T) {
// 	t.Parallel()

// 	// Test arbritary url.
// 	s := NewScan("GET", ts.URL, "")
// 	if err := s.Poke(&http.Client{}); err != nil {
// 		t.Fatalf("error in s.Poke: %v", err)
// 	}
// 	j := s.Json()
// 	if j == nil {
// 		t.Fatalf("scan.Json: got %v, want a json string - ts.URL=%v", j, ts.URL)
// 	}

// 	s2 := NewScanFromJson(j)
// 	if s2 == nil {
// 		t.Error("NewScanFromJson should return a scan.")
// 	}
// 	t.Log(s2)
// }

// TestGetOrigin verifies getOrigin strips path and query, keeping
// scheme://host:port.
func TestGetOrigin(t *testing.T) {
	t.Parallel()
	u, _ := url.Parse("http://127.0.0.1:1234/foo/bar?")
	o := getOrigin(u)
	if o != "http://127.0.0.1:1234" {
		t.Error("getOrigin is not valid", u, o)
	}
}

// TestScanPoke verifies Poke succeeds against a live server.
func TestScanPoke(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	err := s.Poke(&http.Client{})
	if err != nil {
		t.Error(err)
	}
}

// TestScanPokeInvalidURL verifies Poke reports an error for a
// scheme-less (relative) URL.
func TestScanPokeInvalidURL(t *testing.T) {
	t.Parallel()
	client := &http.Client{}
	s := NewScan("GET", "/foo", "")
	err := s.Poke(client)
	if err == nil {
		t.Error("Expect an error with invalid scheme.")
	}
	t.Log("Negative test: Invalid url got ", err)
}

// TestScanSpawn verifies a spawned scan shares the parent's request URL.
func TestScanSpawn(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	s.Poke(&http.Client{})
	s2 := s.Spawn()
	if s.Request.URL != s2.Request.URL {
		t.Error("Spawn gives a request with different URL.")
	}
}

// TestScanMergeRequest verifies MergeRequest keeps headers (User-Agent)
// from the previous request when adopting a new one.
func TestScanMergeRequest(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "foo=bar")
	s.Poke(&http.Client{})
	s.Request.Header.Set("User-Agent", "foo")
	s.Cookies = []*http.Cookie{
		&http.Cookie{Name: "cookie-name-1", Value: "cookie-value-1"},
	}

	r, _ := http.NewRequest("GET", ts.URL, strings.NewReader("quz=quxx"))
	s.MergeRequest(r)
	if s.Request.UserAgent() != "foo" {
		t.Errorf("Merge request got a different user agent: %s", s.Request.UserAgent())
	}
}

// TestScanMergeRequestRelative verifies a relative URL is resolved
// against the previous request's URL.
func TestScanMergeRequestRelative(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	s.Request.Header.Set("User-Agent", "foo")
	r, _ := http.NewRequest("GET", "/#", nil)
	s.MergeRequest(r)

	if s.Request.URL.String() != ts.URL+"/" {
		t.Errorf("Merge request cannot resolve relative url: %s", s.Request.URL)
	}
}

// TestScanReadResponseBody verifies the response body is captured after a poke.
func TestScanReadResponseBody(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	s.Poke(&http.Client{})
	s.ReadResponseBody()
	if s.ResponseBody == "" {
		t.Error("Empty ResponseBody")
	}
	// t.Log(s.ResponseBody)
}

// TestScanUpdateFingerprint pins the request-side hash values for a
// fixed URL (golden values).
func TestScanUpdateFingerprint(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", "http://127.0.0.1", "")
	s.UpdateFingerprint()
	if !reflect.DeepEqual(
		s.Fingerprint,
		Fingerprint{0x7233A9A31DEADAF2, 0x7233A9A31DEADAF2, 0xF8A4322BD612093C, 0, 0}) {
		t.Error("Fingerprint mismatch", s.Fingerprint)
	}
}

// TestScanResponseFingerprint pins the similarity hash of the fixed
// "Hello World" response body (golden value).
func TestScanResponseFingerprint(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	s.Poke(&http.Client{})
	s.UpdateFingerprint()
	if s.Fingerprint.ResponseSimilarity != 0x62C1D0803B2AB139 {
		t.Error("Fingerprint mismatch", s.Fingerprint)
	}
}

// TestScanRateLimit verifies the first five hits are not delayed and the
// sixth one is.
func TestScanRateLimit(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", ts.URL, "")
	for i := 0; i < 5; i++ {
		d := s.RateLimit()
		if d > 0 {
			t.Errorf("Got delayed for %d", d)
		}
	}
	d := s.RateLimit()
	if d == 0 {
		t.Errorf("No delay after 5 request. Got %d", d)
	}
}

// TestScanIsScanAllowed verifies domain allow-listing: a foreign host is
// rejected, the seed host and relative URLs are accepted.
func TestScanIsScanAllowed(t *testing.T) {
	t.Parallel()
	s := NewScan("GET", "http://foo.com", "")

	r, _ := http.NewRequest("GET", "http://bar.com", nil)
	s.MergeRequest(r)
	if s.IsScanAllowed() {
		t.Error("IsScanAllowed should return false", s)
	}

	r, _ = http.NewRequest("GET", "http://foo.com/test", nil)
	s.MergeRequest(r)
	if !s.IsScanAllowed() {
		t.Error("IsScanAllowed should return true", s)
	}

	s2 := NewScan("GET", "/no-domain", "")
	if !s2.IsScanAllowed() {
		t.Error("IsScanAllowed should return true", s2.Request.URL)
	}
}

func TestScanCrawlAsync(t *testing.T) {
	// TODO ...
	t.Parallel()
}

// TestScanIsDuplicatedPage verifies duplicate detection is keyed on both
// the response similarity hash and the Job ID.
func TestScanIsDuplicatedPage(t *testing.T) {
	t.Parallel()
	s1 := NewScan("GET", ts.URL, "")
	_ = s1.Poke(&http.Client{})
	if s1.IsDuplicatedPage() {
		t.Error("IsDuplicatedPage should return false for the first page", s1)
	}

	s2 := s1.Spawn()
	r, _ := http.NewRequest("GET", ts.URL, nil)
	s2.MergeRequest(r)
	_ = s2.Poke(&http.Client{})
	if !s2.IsDuplicatedPage() {
		t.Errorf("IsDuplicatedPage should return true for the second page with same Job ID.\n1st Page: %064b\n2nd Page: %064b\n",
			s1.Fingerprint.ResponseSimilarity, s2.Fingerprint.ResponseSimilarity)
	}

	s3 := Scan(*s1)
	s3.Job.ID = "ABCDEF123456"
	if s3.IsDuplicatedPage() {
		t.Error("IsDuplicatedPage should return false for the a page with new Job ID", s3)
	}

}

func TestScanFuzz(t *testing.T) {
	// TODO ...
	t.Parallel()
}

// TestScanShouldCrawl verifies link dedup is keyed on both the URL hash
// and the Job ID.
func TestScanShouldCrawl(t *testing.T) {
	t.Parallel()
	s1 := NewScan("GET", ts.URL, "")
	if !s1.ShouldCrawl() {
		t.Error("ShouldCrawl should return true for the first page", s1)
	}

	s2 := s1.Spawn()
	r, _ := http.NewRequest("GET", ts.URL, nil)
	s2.MergeRequest(r)

	if s2.ShouldCrawl() {
		t.Errorf("ShouldCrawl should return false for the second page with same Job ID.\n1st Page: %064b\n2nd Page: %064b\n",
			s1.Fingerprint.ResponseSimilarity, s2.Fingerprint.ResponseSimilarity)
	}

	s3 := Scan(*s1)
	s3.Job.ID = "ABCDEF123456"
	if !s3.ShouldCrawl() {
		t.Error("ShouldCrawl should return true for the a page with new Job ID", s3)
	}
}

// TestScanLog smoke-tests the JSON log path against stdout.
func TestScanLog(t *testing.T) {
	t.Parallel()
	SetLogWriter(os.Stdout)
	s := NewScan("GET", ts.URL, "")
	s.Log(s)
}
--------------------------------------------------------------------------------
/cmd/gryffin-distributed/main.go:
--------------------------------------------------------------------------------
// Copyright 2015, Yahoo Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "io" 11 | "math/rand" 12 | "net" 13 | "net/http" 14 | "os" 15 | "os/signal" 16 | "sync" 17 | "syscall" 18 | "time" 19 | 20 | "github.com/nsqio/go-nsq" 21 | 22 | "github.com/yahoo/gryffin" 23 | "github.com/yahoo/gryffin/fuzzer/arachni" 24 | "github.com/yahoo/gryffin/fuzzer/sqlmap" 25 | "github.com/yahoo/gryffin/renderer" 26 | ) 27 | 28 | var ( 29 | // storage is currently unused - TODO: use or remove 30 | // storage = flag.String("storage", "memory", "storag method or the storage url") 31 | service string 32 | url string 33 | wg sync.WaitGroup 34 | wq chan bool 35 | 36 | t *gryffin.Scan 37 | 38 | logWriter io.Writer 39 | store *gryffin.GryffinStore 40 | ) 41 | 42 | // var method = flag.String("method", "GET", "the HTTP method for the request.") 43 | // var url string 44 | // var body = flag.String("data", "", "the data used in a (POST) request.") 45 | 46 | func usage() { 47 | fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) 48 | fmt.Fprintf(os.Stderr, "\tgryffin-distributed --storage=[memory,redis-url] [seed,crawl,fuzz-sqlmap,fuzz-arachni] [url] \n") 49 | fmt.Fprintf(os.Stderr, "Flags:\n") 50 | flag.PrintDefaults() 51 | } 52 | 53 | func captureCtrlC() { 54 | sigChan := make(chan os.Signal, 1) 55 | signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) 56 | wg.Add(1) 57 | 58 | go func() { 59 | <-sigChan 60 | fmt.Println("We got Ctrl-C. 
Stopping.") 61 | wg.Done() 62 | }() 63 | } 64 | 65 | func newProducer() *nsq.Producer { 66 | producer, err := nsq.NewProducer("127.0.0.1:4150", nsq.NewConfig()) 67 | if err != nil { 68 | fmt.Println("Cannot connect to NSQ for producing message", err) 69 | return nil 70 | } 71 | return producer 72 | } 73 | 74 | func newConsumer(topic, channel string, handler nsq.HandlerFunc) *nsq.Consumer { 75 | var err error 76 | consumer, err := nsq.NewConsumer(topic, channel, nsq.NewConfig()) 77 | if err != nil { 78 | fmt.Println("Cannot create consumer", err) 79 | return nil 80 | } 81 | 82 | consumer.AddHandler(handler) 83 | err = consumer.ConnectToNSQLookupd("127.0.0.1:4161") 84 | if err != nil { 85 | fmt.Println("Cannot connect to NSQ for consuming message", err) 86 | return nil 87 | } 88 | return consumer 89 | } 90 | 91 | func seed(url string) { 92 | producer := newProducer() 93 | defer producer.Stop() 94 | 95 | err := t.Poke(&http.Client{}) 96 | if err != nil { 97 | fmt.Println("Site is not up. Ignoring.", t.Request.URL) 98 | return 99 | } 100 | 101 | err = producer.Publish("seed", t.Json()) 102 | if err != nil { 103 | fmt.Println("Could not publish", "seed", err) 104 | } 105 | fmt.Printf("Seed %s injected.\n", url) 106 | 107 | } 108 | 109 | func shareCache() { 110 | 111 | var producer *nsq.Producer 112 | var consumer *nsq.Consumer 113 | 114 | handler := nsq.HandlerFunc(func(m *nsq.Message) error { 115 | store.GetRcvChan() <- m.Body 116 | return nil 117 | }) 118 | 119 | producer = newProducer() 120 | 121 | go func() { 122 | for { 123 | // fmt.Println("SndChan: ", store.GetSndChan(), string(json)) 124 | err := producer.Publish("share-cache", <-store.GetSndChan()) 125 | if err != nil { 126 | fmt.Println("Could not publish", "share-cache", err) 127 | } 128 | } 129 | }() 130 | 131 | rand.Seed(time.Now().UnixNano()) 132 | 133 | consumer = newConsumer("share-cache", fmt.Sprintf("%06d#ephemeral", rand.Int()%999999), handler) 134 | _ = consumer 135 | 136 | // defer producer.Stop() 
137 | // defer consumer.Stop() 138 | 139 | } 140 | 141 | func crawl() { 142 | 143 | var producer *nsq.Producer 144 | var consumer *nsq.Consumer 145 | 146 | handler := nsq.HandlerFunc(func(m *nsq.Message) error { 147 | scan := gryffin.NewScanFromJson(m.Body) 148 | 149 | if delay := scan.RateLimit(); delay != 0 { 150 | go func() { 151 | time.Sleep(time.Duration(delay) * time.Second) 152 | err := producer.Publish("seed", scan.Json()) 153 | if err != nil { 154 | fmt.Println("Could not publish", "fuzz", err) 155 | } 156 | }() 157 | } else { 158 | // TODO - phantom JS timeout should be an input argument. 159 | r := &renderer.PhantomJSRenderer{Timeout: 60} 160 | wq <- true 161 | scan.CrawlAsync(r) 162 | go func() { 163 | if s := <-r.GetRequestBody(); s != nil { 164 | // fmt.Println("Got request body", s.Request.URL) 165 | err := producer.Publish("fuzz", s.Json()) 166 | if err != nil { 167 | fmt.Println("Could not publish", "fuzz", err) 168 | } 169 | } 170 | }() 171 | 172 | go func() { 173 | 174 | // 175 | // Renderer will close all channels when a page is duplicated. 
176 | // Therefore we don't need to test whether the link is coming 177 | // from a duplicated page or not 178 | for s := range r.GetLinks() { 179 | if ok := s.ShouldCrawl(); ok { 180 | err := producer.Publish("seed", s.Json()) 181 | if err != nil { 182 | fmt.Println("Could not publish", "seed", err) 183 | } 184 | } 185 | } 186 | <-wq 187 | }() 188 | } 189 | 190 | return nil 191 | }) 192 | 193 | producer = newProducer() 194 | defer producer.Stop() 195 | consumer = newConsumer("seed", "primary", handler) 196 | defer consumer.Stop() 197 | 198 | wg.Wait() 199 | 200 | } 201 | 202 | func fuzzWithSqlmap() { 203 | var consumer *nsq.Consumer 204 | handler := nsq.HandlerFunc(func(m *nsq.Message) error { 205 | wq <- true 206 | scan := gryffin.NewScanFromJson(m.Body) 207 | f := &sqlmap.Fuzzer{} 208 | f.Fuzz(scan) 209 | <-wq 210 | return nil 211 | }) 212 | consumer = newConsumer("fuzz", "sqlmap", handler) 213 | defer consumer.Stop() 214 | wg.Wait() 215 | } 216 | 217 | func fuzzWithArachni() { 218 | var consumer *nsq.Consumer 219 | handler := nsq.HandlerFunc(func(m *nsq.Message) error { 220 | wq <- true 221 | scan := gryffin.NewScanFromJson(m.Body) 222 | f := &arachni.Fuzzer{} 223 | f.Fuzz(scan) 224 | <-wq 225 | return nil 226 | }) 227 | consumer = newConsumer("fuzz", "arachni", handler) 228 | defer consumer.Stop() 229 | wg.Wait() 230 | } 231 | 232 | func main() { 233 | 234 | flag.Usage = usage 235 | flag.Parse() 236 | 237 | switch flag.NArg() { 238 | case 1: 239 | // gryffin-distributed crawl 240 | service = flag.Arg(0) 241 | case 2: 242 | // gryffin-distributed seed "http://..." 243 | service = flag.Arg(0) 244 | if service == "seed" { 245 | url = flag.Arg(1) 246 | } else { 247 | usage() 248 | return 249 | } 250 | default: 251 | usage() 252 | return 253 | } 254 | 255 | // TCP port listening messages. 
256 | tcpout, err := net.Dial("tcp", "localhost:5000") 257 | if err != nil { 258 | // fmt.Println("Cannot establish tcp connection to log listener.") 259 | logWriter = os.Stdout 260 | } else { 261 | logWriter = io.MultiWriter(os.Stdout, tcpout) 262 | } 263 | 264 | gryffin.SetLogWriter(logWriter) 265 | 266 | // we use a buffered channel to block when max concurrency is reach. 267 | maxconcurrency := 5 268 | wq = make(chan bool, maxconcurrency) 269 | 270 | t = gryffin.NewScan("GET", url, "") 271 | 272 | // seed is unique case that we exit the program immediately 273 | if service == "seed" { 274 | seed(url) 275 | return 276 | } 277 | 278 | store = gryffin.NewSharedGryffinStore() 279 | gryffin.SetMemoryStore(store) 280 | 281 | captureCtrlC() 282 | 283 | switch service { 284 | 285 | case "crawl": 286 | shareCache() 287 | crawl() 288 | 289 | case "fuzz-sqlmap": 290 | fuzzWithSqlmap() 291 | case "fuzz-arachni": 292 | fuzzWithArachni() 293 | 294 | default: 295 | fmt.Println("Unrecognizated service:", service) 296 | usage() 297 | } 298 | 299 | } 300 | -------------------------------------------------------------------------------- /gryffin.go: -------------------------------------------------------------------------------- 1 | // Copyright 2015, Yahoo Inc. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | /* 6 | Package gryffin is an application scanning infrastructure. 7 | */ 8 | package gryffin 9 | 10 | import ( 11 | "bytes" 12 | "encoding/json" 13 | "fmt" 14 | "hash/fnv" 15 | "io/ioutil" 16 | "net" 17 | "net/http" 18 | "net/http/cookiejar" 19 | "net/url" 20 | "strings" 21 | "time" 22 | 23 | distance "github.com/yahoo/gryffin/html-distance" 24 | ) 25 | 26 | // A Scan consists of the job, target, request and response. 27 | type Scan struct { 28 | // ID is a random ID to identify this particular scan. 
29 | // if ID is empty, this scan should not be performed (but record for rate limiting). 30 | ID string 31 | Job *Job 32 | Request *http.Request 33 | RequestBody string 34 | Response *http.Response 35 | ResponseBody string 36 | Cookies []*http.Cookie 37 | Fingerprint Fingerprint 38 | HitCount int 39 | } 40 | 41 | // Job stores the job id and config (if any). 42 | type Job struct { 43 | ID string 44 | DomainsAllowed []string // Domains that we would crawl 45 | } 46 | 47 | // Fingerprint contains all the different types of hash for the Scan (Request & Response) 48 | type Fingerprint struct { 49 | Origin uint64 // origin 50 | URL uint64 // origin + path 51 | Request uint64 // method, url, body 52 | RequestFull uint64 // request + header 53 | ResponseSimilarity uint64 54 | } 55 | 56 | // HTTPDoer interface is to be implemented by http.Client 57 | type HTTPDoer interface { 58 | Do(*http.Request) (*http.Response, error) 59 | } 60 | 61 | // Fuzzer runs the fuzzing. 62 | type Fuzzer interface { 63 | Fuzz(*Scan) (int, error) 64 | } 65 | 66 | // Renderer is an interface for implementation HTML DOM renderer and obtain the response body and links. 67 | // Since DOM construction is very likely to be asynchronous, we return the channels to receive response and links. 68 | type Renderer interface { 69 | Do(*Scan) 70 | GetRequestBody() <-chan *Scan 71 | GetLinks() <-chan *Scan 72 | } 73 | 74 | // LogMessage contains the data fields to be marshalled as JSON for forwarding to the log processor. 75 | type LogMessage struct { 76 | Service string 77 | Msg string 78 | Method string 79 | Url string 80 | JobID string 81 | // Fingerprint Fingerprint 82 | } 83 | 84 | // NewScan creates a scan. 85 | func NewScan(method, url, post string) *Scan { 86 | // ensure we got a memory store.. 
87 | memoryStoreMu.Lock() 88 | if memoryStore == nil { 89 | memoryStore = NewGryffinStore() 90 | } 91 | memoryStoreMu.Unlock() 92 | 93 | id := GenRandomID() 94 | 95 | job := &Job{ID: GenRandomID()} 96 | 97 | req, err := http.NewRequest(method, url, ioutil.NopCloser(strings.NewReader(post))) 98 | if err != nil { 99 | // s.Log("Invalid url for NewScan: %s", err) 100 | return nil 101 | } 102 | 103 | // put the host component of the url as the domains to be allowed 104 | host, _, err := net.SplitHostPort(req.URL.Host) 105 | if err != nil { 106 | job.DomainsAllowed = []string{req.URL.Host} 107 | } else { 108 | job.DomainsAllowed = []string{host} 109 | } 110 | 111 | // Add chrome user agent 112 | req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36") 113 | 114 | return &Scan{ 115 | ID: id, 116 | Job: job, 117 | Request: req, 118 | RequestBody: post, 119 | } 120 | } 121 | 122 | // getOrigin returns the Origin of the URL (scheme, hostname, port ) 123 | func getOrigin(u *url.URL) string { 124 | return u.Scheme + "://" + u.Host 125 | } 126 | 127 | // MergeRequest merge the request field in scan with the existing one. 128 | func (s *Scan) MergeRequest(req *http.Request) { 129 | 130 | // set cookie from response (if it is not done..) 131 | if s.Response != nil { 132 | s.Cookies = append(s.Cookies, s.Response.Cookies()...) 133 | // s.CookieJar.SetCookies(s.Request.URL, s.Response.Cookies()) 134 | } 135 | 136 | // read the request body, and then reset the reader 137 | var post []byte 138 | if req.Body != nil { 139 | if post, err := ioutil.ReadAll(req.Body); err == nil { 140 | req.Body = ioutil.NopCloser(bytes.NewReader(post)) 141 | } else { 142 | // only possible error is bytes.ErrTooLarge from ioutil package. 143 | s.Error("MergeRequest", err) 144 | } 145 | } 146 | 147 | // resolve relative url. 
148 | if !req.URL.IsAbs() { 149 | req.URL = s.Request.URL.ResolveReference(req.URL) 150 | } 151 | 152 | // TODO - drop if Method, URL, Body are same.. 153 | // if req == s.Request { 154 | // s.Logf("Result after merge generate same request.", nil) 155 | // } 156 | 157 | // swap 158 | prevReq := s.Request 159 | s.Request = req 160 | s.RequestBody = string(post) 161 | 162 | // TODO - handle relative URL . 163 | 164 | // Create a cookie jar, add cookie list (so cookie jar reject invalid cookie.) 165 | jar, _ := cookiejar.New(nil) 166 | jar.SetCookies(req.URL, s.Cookies) 167 | 168 | // reset cookies 169 | s.Cookies = make([]*http.Cookie, 0) 170 | for _, c := range jar.Cookies(req.URL) { 171 | req.AddCookie(c) 172 | s.Cookies = append(s.Cookies, c) 173 | } 174 | 175 | // Add user agent 176 | req.Header.Set("User-Agent", prevReq.UserAgent()) 177 | 178 | // Add referrer - TODO, perhaps we don't need this! 179 | 180 | // remove Response. 181 | s.Response = nil 182 | s.ResponseBody = "" 183 | 184 | } 185 | 186 | // Spawn spawns a new scan object with a different ID. 187 | func (s *Scan) Spawn() *Scan { 188 | id := GenRandomID() 189 | job := *s.Job 190 | req := *s.Request // copy the value. 191 | 192 | post := s.RequestBody 193 | s.Request.Body = ioutil.NopCloser(strings.NewReader(post)) 194 | 195 | // get the cookiejar, save the new cookies 196 | // jar := s.CookieJar 197 | cookies := s.Cookies[:] 198 | if s.Response != nil { 199 | cookies = append(cookies, s.Response.Cookies()...) 200 | // jar.SetCookies(s.Request.URL, s.Response.Cookies()) 201 | } 202 | 203 | return &Scan{ 204 | ID: id, 205 | Job: &job, 206 | Request: &req, 207 | RequestBody: post, 208 | Cookies: cookies, 209 | } 210 | } 211 | 212 | // Poke checks if the target is up. 
213 | func (s *Scan) Poke(client HTTPDoer) (err error) { 214 | 215 | s.Logm("Poke", "Poking") 216 | 217 | // Add 5s timeout if it is http.Client 218 | switch client := client.(type) { 219 | case *http.Client: 220 | client.Timeout = time.Duration(3) * time.Second 221 | } 222 | 223 | // delete the similarity case for the domain. 224 | // s.Session.DelPrefix("hash/unique/" + s.Request.URL.Host) 225 | 226 | // http.Request is embeded in a Request embeded in a Scan. 227 | s.Response, err = client.Do(s.Request) 228 | if err != nil { 229 | s.Logm("Poke", "Failed") 230 | return 231 | } 232 | 233 | s.ReadResponseBody() 234 | 235 | s.HitCount++ 236 | return 237 | } 238 | 239 | // ReadResponseBody read Response.Body and fill it to ReadResponseBody. 240 | // It will also reconstruct the io.ReaderCloser stream. 241 | func (s *Scan) ReadResponseBody() { 242 | if s.ResponseBody == "" && s.Response != nil { 243 | if b, err := ioutil.ReadAll(s.Response.Body); err == nil { 244 | s.ResponseBody = string(b) 245 | s.Response.Body = ioutil.NopCloser(bytes.NewReader(b)) 246 | } 247 | } 248 | } 249 | 250 | func hash(s string) uint64 { 251 | h := fnv.New64() 252 | h.Write([]byte(s)) 253 | return h.Sum64() 254 | } 255 | 256 | // UpdateFingerprint updates the fingerprint field. 
257 | func (s *Scan) UpdateFingerprint() { 258 | f := &s.Fingerprint 259 | if s.Request != nil { 260 | if f.Origin == 0 { 261 | f.Origin = hash(getOrigin(s.Request.URL)) 262 | } 263 | if f.URL == 0 { 264 | f.URL = hash(s.Request.URL.String()) 265 | } 266 | if f.Request == 0 { 267 | f.Request = hash(s.Request.URL.String() + "\n" + s.RequestBody) 268 | } 269 | // if f.RequestFull == 0 { 270 | // TODO 271 | // } 272 | } 273 | 274 | if f.ResponseSimilarity == 0 { 275 | if r := strings.NewReader(s.ResponseBody); s.ResponseBody != "" && r != nil { 276 | f.ResponseSimilarity = distance.Fingerprint(r, 3) 277 | s.Logm("Fingerprint", "Computed") 278 | } 279 | } 280 | 281 | } 282 | 283 | // RateLimit checks whether we are under the allowed rate for crawling the site. 284 | // It returns a delay time to wait to check for ReadyToCrawl again. 285 | func (s *Scan) RateLimit() int { 286 | if memoryStore.Hit(s.Request.URL.Host) { 287 | return 0 288 | } 289 | return 5 290 | 291 | // store := s.Session 292 | // // for each 5 second epoch, we create a key and see how many crawls are done. 293 | // ts := time.Now().Truncate(5 * time.Second).Unix() 294 | // k := "rate/" + s.Request.URL.Host + "/" + strconv.FormatInt(ts, 10) 295 | // if v, ok := store.Get(k); ok { 296 | // if v.(int64) >= 5 { 297 | // // s.Logm("RateLimit", "Delay 5 second") 298 | // // s.Logf("Wait for 5 second for %s (v:%d)", s.Request.URL, v) 299 | // return 5 300 | // } 301 | // // ready to crawl. 302 | // // TODO - this is not atomic. 303 | // c, _ := store.Get(k) 304 | // store.Set(k, c.(int64)+1) 305 | // // s.Logm("RateLimit", "No Delay") 306 | // return 0 307 | // } 308 | 309 | // store.Set(k, 1) 310 | // // s.Logm("RateLimit", "No Delay") 311 | // return 0 312 | } 313 | 314 | // IsScanAllowed check if the request URL is allowed per Job.DomainsAllowed. 
315 | func (s *Scan) IsScanAllowed() bool { 316 | // relative URL 317 | if !s.Request.URL.IsAbs() { 318 | return true 319 | } 320 | 321 | host, _, err := net.SplitHostPort(s.Request.URL.Host) 322 | if err != nil { 323 | host = s.Request.URL.Host 324 | } 325 | 326 | for _, allowed := range s.Job.DomainsAllowed { 327 | if host == allowed { 328 | return true 329 | } 330 | } 331 | return false 332 | } 333 | 334 | // CrawlAsync run the crawling asynchronously. 335 | func (s *Scan) CrawlAsync(r Renderer) { 336 | s.Logm("CrawlAsync", "Started") 337 | if s.IsScanAllowed() { 338 | r.Do(s) 339 | } else { 340 | s.Logm("CrawlAsync", "Scan Not Allowed") 341 | } 342 | } 343 | 344 | // IsDuplicatedPage checks if we should proceed based on the Response 345 | func (s *Scan) IsDuplicatedPage() bool { 346 | s.UpdateFingerprint() 347 | f := s.Fingerprint.ResponseSimilarity 348 | if !memoryStore.Seen(s.Job.ID, "oracle", f, 2) { 349 | memoryStore.See(s.Job.ID, "oracle", f) 350 | s.Logm("IsDuplicatedPage", "Unique Page") 351 | return false 352 | } 353 | s.Logm("IsDuplicatedPage", "Duplicate Page") 354 | return true 355 | } 356 | 357 | // Fuzz runs the vulnerability fuzzer, return the issue count. 358 | func (s *Scan) Fuzz(fuzzer Fuzzer) (int, error) { 359 | c, err := fuzzer.Fuzz(s) 360 | return c, err 361 | } 362 | 363 | // // ExtractLinks extracts the list of links found from the responseText in the Scan. 364 | // func (s *Scan) ExtractLinks() (scans []Scan, err error) { 365 | 366 | // return 367 | // } 368 | 369 | // ShouldCrawl checks if the links should be queued for next crawl. 
370 | func (s *Scan) ShouldCrawl() bool { 371 | s.UpdateFingerprint() 372 | f := s.Fingerprint.URL 373 | if !memoryStore.Seen(s.Job.ID, "hash", f, 0) { 374 | memoryStore.See(s.Job.ID, "hash", f) 375 | s.Logm("ShouldCrawl", "Unique Link") 376 | return true 377 | } 378 | s.Logm("ShouldCrawl", "Duplicate Link") 379 | return false 380 | } 381 | 382 | // TODO - LogFmt (fmt string) 383 | // TODO - LogI (interface) 384 | // Error logs the error for the given service. 385 | func (s *Scan) Error(service string, err error) { 386 | errmsg := fmt.Sprint(err) 387 | s.Logm(service, errmsg) 388 | } 389 | 390 | // Logmf logs the message for the given service. 391 | func (s *Scan) Logmf(service, format string, a ...interface{}) { 392 | s.Logm(service, fmt.Sprintf(format, a...)) 393 | } 394 | 395 | // Logm sends a LogMessage to Log processor. 396 | func (s *Scan) Logm(service, msg string) { 397 | // TODO - improve the efficiency of this. 398 | m := &LogMessage{ 399 | Service: service, 400 | Msg: msg, 401 | // Fingerprint: s.Fingerprint, 402 | Method: s.Request.Method, 403 | Url: s.Request.URL.String(), 404 | JobID: s.Job.ID, 405 | } 406 | s.Log(m) 407 | } 408 | 409 | // Logf logs using the given format string. 410 | func (s *Scan) Logf(format string, a ...interface{}) { 411 | str := fmt.Sprintf(format, a...) 412 | s.Log(str) 413 | } 414 | 415 | // Log encodes the given argument as JSON and writes it to 416 | // the log writer. 417 | func (s *Scan) Log(v interface{}) { 418 | if logWriter == nil { 419 | return 420 | } 421 | logWriterMu.Lock() 422 | encoder := json.NewEncoder(logWriter) 423 | encoder.Encode(v) 424 | logWriterMu.Unlock() 425 | } 426 | -------------------------------------------------------------------------------- /renderer/resource/render.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env phantomjs --ssl-protocol=any --ignore-ssl-errors=true 2 | 3 | /** 4 | * Copyright 2015, Yahoo Inc. All rights reserved. 
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 *

 */

// render.js: PhantomJS driver script. Loads the given URL, waits for the
// DOM to become "steady" (via events.js), then prints the harvested
// response, links, forms and page changes as line-delimited JSON on
// stdout for the Go PhantomJSRenderer to consume.
(function() {

var page = require('webpage').create(),
    system = require('system'),
    pageTimeoutTimer;

if (system.args.length === 1) {
    console.log('Usage: render.js http(s)://[:port][/path] [{"method":"post", "data":"a=1&b=2"}]');
    return phantom.exit(1);
}

var utils = require('./utils.js'),
    headers = require('./headers.js').init(phantom, page),
    eventHandler = require('./events.js'),
    events = eventHandler.init(phantom, page),
    t = Date.now(), url = system.args[1], opt, output = {};

// quit flushes the exit-marker JSON and terminates the phantom process.
function quit() {
    try {events.invokeListeners('onExit')}catch(e){};
    utils.printJSON('exit', 0);
    phantom.exit();
}

// ensure that when our code fails, we could die gracefully
phantom.onError = function(message, trace){
    // prepare the JSON to directly die, without going thru events.notifyError()
    utils.printJSON('error', {errorCode:2001, errorString: message + ' \r\n' + JSON.stringify(trace)});
    quit();
}

// log the js error generated by the page
page.onError = function(msg, trace) {
    utils.pageChanges.push('jsError', msg);
}

// the centralized (except phantom.onError) error handler
events.addListener('MainFrameError', function(response) {
    // if (!response.errorCode) return;

    output.elasped = Date.now() - t;
    output.errorCode = response.errorCode;
    output.errorString = response.errorString;

    // http error with a proper status code is considered ok for scrapy
    var jsonType = 'error';
    if (response.status && response.status > 0) {
        output.ok = 1;
        jsonType = 'domSteady';
    }

    // during error, make sure phantom can die no matter what
    try {
        output.response = utils.prepareResponse(response, headers.getRespHeaders);
        output.response.body = utils.cleanResponseBody(page.content);
        output.response.details = utils.pageChanges.fetchAll();
    } catch (e) {}

    utils.printJSON(jsonType, output);
    quit();
});




// validate the url
if (utils.invalidUrl(url))
    return events.notifyError(1000, 'Invalid Url');

// process the extra argument: options
try {
    opt = JSON.parse(system.args[2] || '{}');
} catch(e) {
    return events.notifyError(1001, 'Invalid options');
}


// impose a strict timeout in case this phantomjs does not die properly (180s is the default by scrapy)
opt.timeout = opt.timeout || 180;
// setPageTimeout (re)arms the hard kill timer; called again on
// MainFrameSteady to extend the budget for event enumeration.
function setPageTimeout(timeout) {
    window.clearTimeout(pageTimeoutTimer);
    pageTimeoutTimer = window.setTimeout(function(){
        utils.printJSON('error', {
            errorCode: 4,
            errorString: 'Timeout Error (exceeded ' + opt.timeout + 's)',
            response: {url: url}
        });
        quit();
    }, timeout || (opt.timeout * 1000));
}
setPageTimeout();

opt.debug = opt.debug || false;
opt.method = opt.method || 'get';
opt.data = opt.data || null;
opt.startHostname = utils.getHostname(url);

// whitelist the domain from url when allowed_domains are not provided
opt.allowed_domains = opt.allowed_domains || [opt.startHostname];

// by default no follow pre-redirections (post-redirections are not followed anyway)
opt.followPreRedirections = opt.followPreRedirections || false;

// if enabled, do not quit when utils.whitelistedRedirectionDomains(redirectUrl)
// NOTE(review): `x || true` is always truthy, so relaxFirstRedirection can
// never be disabled here — likely intended `!== false`; confirm before changing.
opt.relaxFirstRedirection = opt.relaxFirstRedirection || true;

// resource timeout should not exceed 30s
page.settings.resourceTimeout = (opt.resourceTimeout || 30) * 1000;

// make loadImages default to false
page.settings.loadImages = (opt.loadImages = (!opt.loadImages === false));

// to handle any headers-related manipulation and configuration
page.customHeaders = headers.setReqHeaders(opt.headers || {}, opt.startHostname);

// if (opt.debug) {
//   console.log('Cookies: ' + JSON.stringify(phantom.cookies));

//   events.addListener('LoadFinished', function(status) {
//     console.log('debug: onLoadFinished');
//     var timeCounter = 1;
//     window.setInterval(function(){console.log('debug: onLoadFinished + '+ (timeCounter++) +'00ms: linkCount=' + page.evaluate(function(){return document.getElementsByTagName('a').length}) )}, 100);
//   });

//   events.addListener('MainFrameSteady', function(response) {
//     console.log('debug: MainFrameSteady - linkCount=' + page.evaluate(function(){return document.getElementsByTagName('a').length}) + '\n\n');
//   });
// }



// stop the first url from navigating to disallowed_domains or disallowed extension (css, zip, etc)
if (utils.invalidUrl(url, opt.allowed_domains))
    events.notifyError(1002, 'Load Failed Error (from disallowed domains)');
else if (utils.blacklistedUrl(url))
    events.notifyError(1003, 'Filetype unsupported/unrendered as derived from file extension');


// log all mainFrame navigations
events.addListener('MainFrameRedirection', function(requestData, networkRequest){
    utils.pageChanges.push('mainFrame', requestData);
});


events.addListener('MainFramePreRedirection', function(requestData, networkRequest){
    // abort any request that attempts to redirect the mainframe away if nofollows is configured
    if (!opt.followPreRedirections) {
        // mainFrameSteady will still be invoked during onLoadFinished
        networkRequest.abort();
        return;
    }

    var redirectUrl = requestData.url;
    // prevent navigations to disallowed domains
    if (utils.invalidUrl(redirectUrl, opt.allowed_domains)) {

        // exception: do not abort the first redirection to some whitelisted domains
        if (opt.relaxFirstRedirection
            && !output.firstRedirectionRelaxed
            && utils.whitelistedRedirectionDomains(redirectUrl)) {
            output.firstRedirectionRelaxed = true;
            return;
        }

        networkRequest.abort();
        events.notifyError(1002, 'Load Failed Error (from disallowed domains)');
    }

    // prevent navigations to some blacklisted extensions (e.g, css, binaries)
    if (utils.blacklistedUrl(redirectUrl)) {
        networkRequest.abort();
        events.notifyError(1003, 'Filetype unsupported/unrendered as derived from file extension');
    }
});

// disable any navigations after reaching its first destination (i.e. no more redirects)
events.addListener('MainFramePostRedirection', function(requestData, networkRequest){
    // further page load will be freezed
    // using page.navigationLocked = true; won't allow us to capture the request
    networkRequest.abort();
});

// extract all childFrames navigations
events.addListener('ChildFrameNavigate', function(requestData, networkRequest, type) {
    // abort any disallowed requests
    if (utils.invalidUrl(requestData.url, opt.allowed_domains) || utils.blacklistedUrl(requestData.url))
        networkRequest.abort();
    utils.pageChanges.push('childFrames', requestData);
});

events.addListener('MainFrameResourceReceived', function(response) {
    // phantomjs does not fetch binaries anyway
    if (response.status && response.status >= 200 && response.status < 300
        && !/(?:^text\/|xml|javascript|json)/i.test(response.contentType))
        events.notifyError(1003, 'Filetype unsupported/unrendered (' + response.contentType + ')');
});

events.addListener('MainFrameNavigationsEnded', function(response) {
    output.response = utils.prepareResponse(response, headers.getRespHeaders);
});

// skip downloading unnecessary subresources according to a known file extension list
events.addListener('SubResourceRequested', function(requestData, networkRequest) {
    // prevent navigations to some blacklisted extensions (e.g, css, binaries)
    if (utils.blacklistedUrl(requestData.url))
        networkRequest.abort();

    // utils.pageChanges.push('subResources', requestData);
});

// in onInitialized, ajax calls are hooked
events.addListener('Initialized', function() {

    // page.injectJs('./incl/jquery-2.1.1.min.js');

    // inject scripts to catch links
    page.injectJs('./extractors.js');
});


// extractDetails collects the accumulated page changes and merges in the
// links/forms harvested in-page by extractors.js.
function extractDetails() {
    // childFrames, subResources, redirects extracted
    var extracted = {}, details = utils.pageChanges.fetchAll();

    extracted = page.evaluate(function(){
        // link, form, and jsLink extractions
        return window._gryffin_onMainFrameReady && window._gryffin_onMainFrameReady();
    }) || {};

    // console.log("DEBUG!!! " + page.title);
    // var cookies = page.cookies;

    // console.log('Listing cookies:');
    // for(var i in cookies) {
    //   console.log(cookies[i].name + '=' + cookies[i].value);
    // }
    details.links = extracted.links || [];
    details.forms = extracted.forms || [];

    details.jsLinkFeedback = extracted.jsLinkFeedback;

    return details;
}

events.addListener('MainFrameSteady', function(response) {
    // extend timeout to allow sufficient time for event enumerations
    setPageTimeout();

    // here we terminate this process with the response we collected
    output.elasped = Date.now() - t;
    output.response.body = utils.cleanResponseBody(response.body);

    if (opt.htmlOnly) {
        console.log(output.response.body);
        phantom.exit();
        return;
    }

    output.response.details = extractDetails();

    // ensure only one JSON is outputed
    if (!output.ok) {
        output.ok = 1;
        // console.log(JSON.stringify(output, function(k, v){
        //   return (typeof v === "string")
        //     ? v.replace(/[\u007f-\uffff]/g, function(c) {
        //         return '\\u'+('0000'+c.charCodeAt(0).toString(16)).slice(-4);
        //       });
        //     : v;
        // }));
        utils.printJSON('domSteady', output);
    }

    // can exit due to lack of jsLinks execution
    if (output.response.details && !output.response.details.jsLinkFeedback)
        quit();
});

// disable any navigations from new windows, instead, capture the request object
events.addListener('PageCreated', function(newPage) {
    var newEvents = eventHandler.init(phantom, newPage);
    newEvents.addListener('ResourceRequested', function(requestData, networkRequest) {
        networkRequest.abort();
        utils.pageChanges.push('childFrames', requestData);
    });
});

// get informed about new link discovery by incl/extractors.js
events.addListener('Callback', function(data){
    if (data.action === 'waitTimer') {
        events.invokeListeners('onSteady-waitTimer', data.timeout);

    } else if (data.action === 'element.triggering') {
        // wait for network steady once an element is being triggered
        events.addListener('onSteady', function() {
            var eventData = page.evaluate(function(){return jsLinks.getData()}),
                // associate other page changes to the recent element triggered
                changes = utils.pageChanges.fetchAll();
                // NOTE(review): the ';' above ends the var statement, so
                // changesKeys below is an implicit global — confirm intended.
                changesKeys = Object.keys(changes);

            // append any pageChanges to the eventData
            changesKeys.forEach(function(k){
                eventData[k] = changes[k];
            });

            // if there exists any dom changes
            if (changesKeys.length > 0 || eventData.links || eventData.forms)
                events.invokeListeners('onDomChanged', eventData);

            // by design, onSteady is called only once even without "return false"
            return false;
        });
        events.invokeListeners('onSteady-wait', 'element-trigger');
    } else if (data.action === 'element.triggered') {
        events.invokeListeners('onSteady-ready', 'element-trigger');
    } else if (data.action === 'done')
        quit();
});

// print the triggered element if new results are available
events.addListener('DomChanged', function(data) {
    utils.printJSON('domChanged', data);
});

// page.onConsoleMessage = function(msg) {
//   console.log('CONSOLE: ' + msg);
// };
page.onConfirm = function(msg){return true};

page.openUrl(url, {
    operation: opt.method,
    data: opt.data // String expected
}, page.settings);

})();
--------------------------------------------------------------------------------
/renderer/resource/events.js:
--------------------------------------------------------------------------------
/**
 * Copyright 2015, Yahoo Inc. All rights reserved.
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 *
 * @author Adon adon@yahoo-inc.com
 * @desc this module exposes more usable events, and a better event handling logic
 *
 * Event Flow:
 * ===================
 * onNavigationRequested
 * onResourceRequested
 * onNavigationRequested + onResourceRequested -> onNavigate, onMainFrameNavigate, onMainFramePreRedirection, onChildFrameNavigate
 * onLoadStarted
 *
 * onResourceReceived
 * onResourceReceived + mainFrame -> onMainFrameResourceReceived
 * [onResourceTimeout/onResourceError] + mainFrame -> onMainFrameError, onMainFrameResourceError
 *
 * onInitialized
 *
 * onMainFrameNavigationsEnded
 *
 * onSubResourceRequested
 * [onMainFramePostRedirection]
 *
 * onLoadFinished
 * onLoadFinished + status=='success' -> onMainFrameLoadSuccess
 * onLoadFinished + status=='fail' -> onMainFrameLoadFailed, onMainFrameError
 *
 * [onMainFrameLoadSuccess] + steadyLogic() -> onMainFrameSteady

 ResourceError
 Codes
 # errorMessage[1] = "Connection Refused Error";
 # errorMessage[2] = "RemoteHost Closed Error";
 # errorMessage[3] = "Host Not Found Error";
 # errorMessage[4] = "Timeout Error";
 # errorMessage[5] = "Operation Canceled Error";
 # errorMessage[6] = "Ssl Handshake Failed Error";
 # errorMessage[7] = "Temporary Network Failure Error";
 # errorMessage[8] = "Network Session Failed Error";
 # errorMessage[9] = "Background Request Not Allowed Error";
 # errorMessage[99] = "Unknown Network Error";
 # errorMessage[101] = "ProxyConnectionRefusedError";
 # errorMessage[102] = "ProxyConnectionClosedError";
 # errorMessage[103] = "ProxyNotFoundError";
 # errorMessage[104] = "ProxyTimeoutError";
 # errorMessage[105] = "ProxyAuthenticationRequiredError";
 # errorMessage[199] = "UnknownProxyError";
 # errorMessage[201] = "ContentAccessDenied";
 # errorMessage[202] = "ContentOperationNotPermittedError";
 # errorMessage[203] = "ContentNotFoundError";
 # errorMessage[204] = "AuthenticationRequiredError";
 # errorMessage[205] = "ContentReSendError";
 # errorMessage[299] = "UnknownContentError";
 # errorMessage[301] = "ProtocolUnknownError"; // after networkRequest.abort()
 # errorMessage[302] = "ProtocolInvalidOperationError";
 # errorMessage[399] = "ProtocolFailure";

*/

// init wires the synthesized event layer onto one PhantomJS `page` and
// returns {addListener, invokeListeners, notifyError, getMainFrameStatus,
// getResources}. All state below is per-page (closure-scoped).
exports.init = function(phantom, page) {

    var callbackList = {},          // eventName -> array of queued callbacks
        resourceDetails = {},       // 'res<id>' -> {req, actions, aborted, resp, err}
        navigationalRequests = {},  // resource ids known to be frame navigations
        mainFrameStatus = {},       // lifecycle flags + current request/response
        mainFrameNetwork = {},      // steady-detection timers and outstanding set
        timerCounter = 0;

    // patch response.redirectURL to take the URL (can be relative) in Location header
    function patchRedirectURL(response) {
        // we honor the location header only if response.status = 3xx
        !response.redirectURL && response.status
            && response.status >= 300 && response.status < 400
            && response.headers && response.headers.some(function(h){
                if (h.name.toLowerCase() == 'location') {
                    response.redirectURL = h.value;
                    return true;
                }
            });
    }

    // a shortcut to invoke the customized listeners
    function invokeListeners(eventName) {
        // copy arguments to a new array, and removes the first element
        var i = 0, key, args = [], handler;
        for (key in arguments)
            args[i++] = arguments[key];
        args.shift();

        handler = page[eventName] || queuedEventCallbacks(eventName);
        return handler && handler.apply(page, args);
    }


    // when all handlers of an event returns false, give up the event listener
    function queuedEventCallbacks(eventName) {
        return function() {
            // disable executing any more event handlers when an error was once thrown
            if (mainFrameStatus.error)
                return;

            // (eventName == 'onCallback') ? console.log(JSON.stringify(arguments[0])) : console.log('debug: ' + eventName + ' ' + (/^onSteady-/.test(eventName) ? arguments[0] + ' ' + JSON.stringify(mainFrameNetwork.outstanding) : arguments[0]&&arguments[0].url));
            // mainFrameStatus.externalError && console.log('extern:' + JSON.stringify(mainFrameStatus.externalError));

            // if an externalError was ever raised, instead of invoking the following events, we raise an onMainFrameError
            if (mainFrameStatus.externalError
                && ['onMainFrameResourceReceived', 'onLoadStarted', 'onInitialized',
                    'onLoadFinished', 'onMainFrameLoadSuccess', 'onMainFrameSteady'].indexOf(eventName) !== -1) {
                var response = mainFrameStatus.response || {};
                response.url = response.url || mainFrameStatus.request.url;
                response.errorCode = mainFrameStatus.externalError.errorCode;
                response.errorString = mainFrameStatus.externalError.errorString;
                invokeListeners('onMainFrameError', response);
                return;
            }

            // run each queued callback; a callback returning false
            // removes itself from the queue
            var eventCallbackList = callbackList[eventName];
            if (eventCallbackList) {
                for (var i = 0, _callback; _callback = eventCallbackList[i]; i++)
                    if (_callback.apply(this, arguments) === false)
                        eventCallbackList.splice(i--, 1);

                if (eventCallbackList.length === 0)
                    page[eventName] = null;
            }

            if (eventName == 'onMainFrameError')
                mainFrameStatus.error = true;
        };
    };

    // callback added from this handler won't overwrite existing ones
    // return false to get itself removed from the event queue
    function addListener(eventName, callback, thirdarg){
        if (!callback)
            return;

        // 'onSteady' is special: it is routed to the steady monitor,
        // with thirdarg interpreted as its timeout
        if (eventName == 'onSteady') {
            mainFrameNetwork.steadyMonitor(callback, thirdarg);
            return;
        }

        if (eventName.indexOf('on') !== 0)
            eventName = 'on' + eventName;
        callbackList[eventName] = callbackList[eventName] || [];
        callbackList[eventName].push(callback);

        // skip adding those events to page that phantomjs won't fire by itself
        if (!/^(?:onMainFrame|onSteady)/.test(eventName) && !page[eventName])
            page[eventName] = queuedEventCallbacks(eventName);
    }

    // keep track of the resource status
    // resourceDetails['req-N'] may have {req, actions, aborted, resp, err}
    addListener('ResourceRequested', function(arg0, arg1){
        var resId = 'res' + arg0.id;
        resourceDetails[resId] = {'req': arg0, 'actions': arg1};
    });
    addListener('ResourceReceived', function(arg0){
        var resId = 'res' + arg0.id, resObj = resourceDetails[resId];

        // ResourceError fires before ResourceReceived
        // make error code and string captured at ResourceError available to ResourceReceived
        if (resObj.err) {
            arg0.errorCode = resObj.err.errorCode;
            arg0.errorString = resObj.err.errorString;
            resObj.aborted && (arg0.aborted = resObj.aborted);

            arg0.url = arg0.url || resObj.req.url;
        }
        patchRedirectURL(arg0);
        resObj.resp = arg0;
    });
    // onResourceTimeout, onResourceError is also fired
    addListener('ResourceError', function(arg0){
        var resId = 'res' + arg0.id, resObj = resourceDetails[resId];
        // Upon abortion, url is stripped, resulting in protocol error (301)
        if (arg0.errorCode === 301 && arg0.url === '')
            resObj.aborted = arg0.aborted = true;
        resObj.resp = resObj.err = arg0;
    });


    // onSteady Algorithm:
    // 1) mainFrameNetwork.monitor(onSteady, timeout=4000ms) initiaites:
    //    a) minSteadyTimer (i.e., Min(300ms, timeout/10))
    //    b) maxSteadyTimer (i.e., maxSteadyTimeout = 4000ms)
    //    c) Steady-ready(resourceId) cancels Steady-wait(resourceId)
    // 2) If nothing fired during minSteadyTimer,
    //    or in case 1(c) above ever happened, cancels minSteadyTimer
    //    finally, each Steady-ready() will see if for 75ms no more Steady-ready(),
    //    and outStandingReqs.length == 0, fires onSteady()
    // 3) If take longer than maxSteadyTimeout, fires onSteady()
    mainFrameNetwork.steadyMonitor = function(onSteady, timeout){
        mainFrameNetwork.outstanding = {'minSteadyTimer':true};

        // install a one-time onSteady listener
        mainFrameNetwork.onSteady = onSteady || function(){};
        // 4000ms based on stats concerning max time users'd normally expect, as suggested by @albertyu
        timeout = parseInt(timeout || 4000);
        mainFrameNetwork.maxTimeout = timeout;

        mainFrameNetwork.minSteadyTimer = setTimeout(function(){
            invokeListeners('onSteady-ready', 'minSteadyTimer');
        }, Math.min(300, timeout/10));

        clearTimeout(mainFrameNetwork.maxSteadyTimer);
        // the max post onloaded time to tolerate: 4 secs as suggested by @albert
        mainFrameNetwork.maxSteadyTimer = setTimeout(function(){
            mainFrameNetwork.onSteady('maxSteadyTimer');
        }, timeout);

        // monitor outstanding requests (hooked only once per page)
        if (!mainFrameStatus.steadyMonitor) {
            mainFrameStatus.steadyMonitor = true;
            addListener('ResourceRequested', function(arg0, arg1){
                invokeListeners('onSteady-wait', 'res' + arg0.id);
            });
            // onResourceTimeout, onResourceError will also be fired
            ['ResourceReceived','ResourceError'].forEach(function(eventName){
                addListener(eventName, function(arg0){
                    (!arg0.stage || arg0.stage == 'end') && invokeListeners('onSteady-ready', 'res' + arg0.id);
                });
            });
        }
    };
    addListener('Steady-wait', function(reason) {
        mainFrameNetwork.outstanding[reason] = true;
        clearTimeout(mainFrameNetwork.finalistTimer);

        // cancel the minSteadyTimer
        if (mainFrameNetwork.outstanding['minSteadyTimer']) {
            delete mainFrameNetwork.outstanding['minSteadyTimer'];
            clearTimeout(mainFrameNetwork.minSteadyTimer);
        }
    })
    addListener('Steady-ready', function(reason){
        delete mainFrameNetwork.outstanding[reason];

        // extend the finialist timer by discarding the previous one (non-atomic operations, but good enough)
        clearTimeout(mainFrameNetwork.finalistTimer);
        // wait for another 20ms to make sure the sea is completely silenced (i.e., no more new requests)
        // NOTE(review): the actual delay below is 75ms, not 20ms as the
        // comment above suggests — confirm which figure is intended.
        mainFrameNetwork.finalistTimer = setTimeout(function(){
            if (Object.keys(mainFrameNetwork.outstanding).length === 0)
                mainFrameNetwork.onSteady('done');
        }, 75);
    })
    // introduce a reason to wait setTimeout/Interval for 'timeout' ms once
    addListener('Steady-waitTimer', function(timeout){
        // directly ignore timeout longer than maxTimeout
        function readyToWait() {
            return mainFrameNetwork.maxTimeout && timeout < mainFrameNetwork.maxTimeout;
        }

        var reason = ['timer', timerCounter++, timeout].join('-');
        if (readyToWait())
            invokeListeners('onSteady-wait', reason);
        else
            addListener('MainFrameSteady', function(){
                readyToWait() && invokeListeners('onSteady-wait', reason);
                return false;
            });
        window.setTimeout(function(){
            readyToWait() && invokeListeners('onSteady-ready', reason);
        }, timeout || 1);
    })


    // LoadStarted fires only for mainFrame
    addListener('LoadStarted', function(){
        mainFrameStatus.loadStarted = true;
    });


    // expose the following customized events:
    // - onNavigate:
    // - onMainFrameNavigate:
    // - onChildFrameNavigate:
    addListener('NavigationRequested', function(url, type, willNavigate, fromMainFrame) {
        if (!url || url === 'about:blank' || !willNavigate)
            return;

        addListener('ResourceRequested', function(requestData, networkRequest) {
            // traceback if such URL is recently recorded as the navigation
            if (decodeURI(url) === decodeURI(requestData.url) || url === requestData.url) {

                // let resourceDetails know whether a particular resource happens in frames
                var resObj = resourceDetails['res' + requestData.id];
                resObj.fromFrame = true;
                resObj.req.fromMainFrame = fromMainFrame;
                resObj.req.navigationType = type;

                // mark this as a navigational request
                navigationalRequests[requestData.id] = resObj;

                invokeListeners('onNavigate', requestData, networkRequest, fromMainFrame, type);
                invokeListeners((fromMainFrame ? 'onMainFrameNavigate' : 'onChildFrameNavigate'),
                    requestData, networkRequest, type);

                return false;
            }
        });
    });

    // expose the following customized events:
    // - onSubResourceRequested: fired for subresource requests (i.e., not from frames/windows)
    addListener('NavigationRequested', function(url, type, willNavigate, fromMainFrame) {
        // the following is setup in NavigationRequested so as to run after the ResourceRequested setup above
        addListener('ResourceRequested', function(requestData, networkRequest) {
            if (!navigationalRequests[requestData.id])
                invokeListeners('onSubResourceRequested', requestData, networkRequest);
        });
        return false;
    });

    // expose the following customized events:
    // - onMainFramePostRedirection:
    // - onMainFramePreRedirection
    addListener('MainFrameNavigate', function(requestData, networkRequest, type) {
        // requestData.id === 1 is the initial request, not a redirection
        if (requestData.id !== 1)
            invokeListeners('onMainFrameRedirection', requestData, networkRequest, type);

        if (mainFrameStatus.navigationsEnded)
            invokeListeners('onMainFramePostRedirection', requestData, networkRequest, type);
        else if (requestData.id !== 1)
            invokeListeners('onMainFramePreRedirection', requestData, networkRequest, type);

        if (mainFrameStatus.externalError)
            return;

        // backup the current mainFrameStatus, in case the new mainFrameStatus is detected being aborted
        var mainFrameStatusBackup = mainFrameStatus;
        // prepare a new mainFrameStatus
        mainFrameStatus = {'requested': true, 'request': requestData};
        if (mainFrameStatusBackup.requested)
            mainFrameStatus.lastBackup = mainFrameStatusBackup;

        // expose the following customized events:
        // - onMainFrameNavigationsEnded: fired once when the MainFrame has no more redirections
        addListener('ResourceReceived', function(response) {
            // ignore subresource's response
            if (mainFrameStatus.request.id !== response.id)
                return;

            // restore the original mainFrameStatus if the current one is aborted
            if (response.aborted) {
                invokeListeners('onMainFrameResourceAborted', response);
                invokeListeners('onMainFrameResourceReceived', response);

                if (mainFrameStatus.lastBackup)
                    mainFrameStatus = mainFrameStatus.lastBackup;
                else
                    mainFrameStatus.response = response;

                if (!mainFrameStatus.navigationsEnded) {
                    mainFrameStatus.navigationsEnded = true;
                    invokeListeners('onMainFrameNavigationsEnded', mainFrameStatus.response);
                }
                return false;
            }

            mainFrameStatus.response = response;

            if (response.errorCode) {
                invokeListeners('onMainFrameResourceError', response);
                invokeListeners('onMainFrameResourceReceived', response);
                invokeListeners('onMainFrameError', response);
                return false;
            }

            invokeListeners('onMainFrameResourceReceived', response);

            // the mainFrame's response[stage=start] that has no further redirections
            if (!mainFrameStatus.navigationsEnded
                && (response.status < 300 || !response.redirectURL)) {
                mainFrameStatus.navigationsEnded = true;
                invokeListeners('onMainFrameNavigationsEnded', response);

                mainFrameStatus.destResponse = response;

                // the corresponding mainFrame's response[stage=end]
            } else {
                delete mainFrameStatus.destResponse;
                // this is important to deactivate this listener once the main response is downloaded
                return false;
            }
        });
    });


    addListener('LoadFinished', function(status) {
        var response = mainFrameStatus.response || {'url': url};

        mainFrameStatus.loadFinished = true;
        // treat 3xx-with-no-follow and pages with content as successes too
        if (status == 'success'
            || (page.content && page.content !== '')
            || (response.status && response.status >= 300 && response.status < 400)) {
            mainFrameStatus.loadSuccess = true;
            invokeListeners('onMainFrameLoadSuccess', response);
        } else {
            mainFrameStatus.loadFailed = true;
            response.errorCode = response.errorCode || 1002;
            response.errorString = response.errorString || 'Load Failed Error (from disallowed domains)';

            invokeListeners('onMainFrameLoadError', response);
            invokeListeners('onMainFrameError', response);
        }
        return false;
    });

    addListener('MainFrameLoadSuccess', function(response){
        addListener('onSteady', function(lastSignal){
            if (!mainFrameStatus.steady) {
                mainFrameStatus.steady = true;

                // note that page.content may not contain all dynamically-generated content
                // for content-type like xml will have no JS execution context, page.evaluate() returns null, so we resort to page.content
                var html = page.evaluate(function(){return document.documentElement ? document.documentElement.outerHTML : ''});
                response.body = html || page.content || response.body;

                invokeListeners('onMainFrameSteady', response, lastSignal);
            }
        });
    });

    // public surface consumed by render.js
    return {
        addListener: addListener,
        invokeListeners: invokeListeners,
        notifyError: function(errorCode, errorString, url){
            mainFrameStatus.externalError = {'errorCode': errorCode, 'errorString': errorString};
            if (url) mainFrameStatus.externalError.url = url;

            // if not even requested, invoke MainFrameError immediately
            !mainFrameStatus.requested && invokeListeners('onMainFrameError', mainFrameStatus.externalError);
        },
        getMainFrameStatus: function(){
            return mainFrameStatus;
        },
        getResources: function() {
            return resourceDetails;
        }
    };
};
--------------------------------------------------------------------------------
/renderer/resource/extractors.js:
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015, Yahoo Inc. All rights reserved. 3 | * Use of this source code is governed by a BSD-style 4 | * license that can be found in the LICENSE file. 5 | * 6 | 7 | */ 8 | 9 | // to call back phantom 10 | function phantomCallback(action, data) { 11 | if (window.callPhantom) { 12 | data = data || {}; 13 | data.action = action; 14 | window.callPhantom(data); 15 | } 16 | } 17 | 18 | // timer as a reason to wait 19 | var _gryffin_setTimeout = window.setTimeout, _gryffin_setInterval = window.setInterval; 20 | window.setTimeout = function(f, t){return phantomCallback('waitTimer', {timeout:t}) || _gryffin_setTimeout.call(this, f, t)}; 21 | window.setInterval = function(f, t){return phantomCallback('waitTimer', {timeout:t}) || _gryffin_setInterval.call(this, f, t)}; 22 | 23 | 24 | // Derived from casperjs 25 | function triggerMouseEvent(el, type) { 26 | 27 | try { 28 | var evt = document.createEvent("MouseEvents"), center_x = 1, center_y = 1; 29 | try { 30 | var pos = el.getBoundingClientRect(); 31 | center_x = Math.floor((pos.left + pos.right) / 2); 32 | center_y = Math.floor((pos.top + pos.bottom) / 2); 33 | } catch(e) {} 34 | evt.initMouseEvent(type, true, true, window, 1, 1, 1, center_x, center_y, false, false, false, false, 0, el); 35 | // dispatchEvent return value is false if at least one of the event 36 | // handlers which handled this event called preventDefault; 37 | // so we cannot return this result as it cannot accurately inform on the status 38 | // of the operation 39 | // let's assume the event has been sent ok if it didn't raise any error 40 | el.dispatchEvent(evt); 41 | return true; 42 | } catch (e) { 43 | return false; 44 | } 45 | }; 46 | 47 | // function getElementByXPath(expression) { 48 | // var a = document.evaluate(expression, document.body, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); 49 | // if (a.snapshotLength > 0) { 50 | // return
a.snapshotItem(0); 51 | // } 52 | // }; 53 | 54 | // other candidates: focus, input, keydown, keypress, keyup, blur 55 | var jsLinks, jsLinkEvents = ['click', 'dblclick', 'change', 'submit', 'scroll', 'mousemove', 'mouseover', 'mousedown', 'mouseup', 'mouseout'], 56 | attributeFilter = ['href', 'action'].concat(jsLinkEvents.map(function(x){return 'on' + x})), 57 | re_js_links = /^javascript:/i, 58 | re_urls = /^(?:https?|ftp):\/\//i, 59 | re_absoluteUrls = /(?:https?|ftp):\/\/[^\s]+/ig, 60 | re_relativeUrls = /[^\s]+\.(?:php[s\d]?|s?html?|aspx?|jsp|cfm)[^\s]*/ig; 61 | 62 | jsLinks = (function(){ 63 | 64 | function getxpath(el) { 65 | if (el===document.body) return 'body'; 66 | if (el.id !== '') return "//*[@id='"+el.id+"']"; 67 | if (!el.parentNode) return el.toString(); 68 | 69 | for (var i = 0, s, cnt = 0, p = el.parentNode, siblings = p.childNodes; s = siblings[i]; i++) { 70 | if (s === el) return [getxpath(p), el.tagName.toLowerCase() + '[' + (cnt + 1) + ']'].join('/'); 71 | if (s.nodeType === 1 && s.tagName === el.tagName) cnt++; 72 | } 73 | }; 74 | 75 | var jsLinkCaptured = {'root':{'jsLinks':[],'trigger':function(){},'parent':null,'arrPtr':0,'key':'root'}}, 76 | triggeringElement = jsLinkCaptured['root'], 77 | elementProto = (window.EventTarget? window.EventTarget : window.HTMLElement ? 
window.HTMLElement : window.Element).prototype, 78 | elementProtoMethods = {'addEventListener': elementProto.addEventListener}; 79 | 80 | // extract DOM Level 0 events 81 | function extractDOM0Events(el) { 82 | el = el || document.body; 83 | function getJsLink(element) { 84 | jsLinkEvents.forEach(function(evt){ 85 | element['on' + evt] && jsLinks.add(evt, element, 'dom0'); 86 | }); 87 | } 88 | getJsLink(el); 89 | [].forEach.call(el.getElementsByTagName('*'), getJsLink); 90 | } 91 | 92 | // extract DOM Level 2 events 93 | elementProto.addEventListener = function(type, fn, capture) { 94 | if (jsLinkEvents.indexOf(type.toLowerCase()) !== -1) 95 | jsLinks.add(type, this, 'addEventListener'); 96 | return elementProtoMethods.addEventListener.call(this, type, fn, capture); 97 | }; 98 | 99 | return { 100 | add: function(eventType, node, triggerSource) { 101 | eventType = eventType.toLowerCase(); 102 | var key = getxpath(node); 103 | 104 | if (jsLinkCaptured[key]) { 105 | if (jsLinkCaptured[key]['events'].indexOf(eventType) === -1) 106 | jsLinkCaptured[key]['events'].push(eventType); 107 | jsLinkCaptured[key]['src'].push(triggerSource); 108 | } else { 109 | jsLinkCaptured[key] = { 110 | 'key': key, 111 | 'keyChain': function() { 112 | var trace = [], element = this; 113 | do { 114 | trace.push(element.key); 115 | } while (element = element.parent); 116 | return trace.reverse(); 117 | }, 118 | 'events': [eventType], 119 | 'trigger': function(onTriggered, delay){ 120 | var self = this, i = 0, 121 | eventsString = self.events.join('|'), 122 | results = {'keyChain':self.keyChain(), 'events': self.events}; 123 | 124 | if (node) { 125 | phantomCallback('element.triggering', results); 126 | 127 | // simulate scroll event 128 | if (self.events.indexOf('scroll') !== -1) 129 | _gryffin_setTimeout.call(window, function(){ 130 | try {node.scrollTop = node.scrollHeight} catch(e) {} 131 | }, i++ * delay); 132 | 133 | // group all mouse and (dbl)click events as follows 134 | if 
(/(?:click|mouse|change)/.test(eventsString)) { 135 | _gryffin_setTimeout.call(window, function(){ 136 | try {node.focus()} catch(e) {}; 137 | triggerMouseEvent(node, 'mousemove'); 138 | triggerMouseEvent(node, 'mouseenter'); 139 | triggerMouseEvent(node, 'mouseover'); 140 | triggerMouseEvent(node, 'mousemove'); 141 | triggerMouseEvent(node, 'mousedown') 142 | triggerMouseEvent(node, 'mouseup'); 143 | }, i * delay); 144 | if (self.events.indexOf('click') !== -1) 145 | _gryffin_setTimeout.call(window, function(){ 146 | triggerMouseEvent(node, 'click'); 147 | }, i++ * delay); 148 | if (self.events.indexOf('dblclick') !== -1) 149 | _gryffin_setTimeout.call(window, function(){ 150 | triggerMouseEvent(node, 'dblclick'); 151 | }, i++ * delay); 152 | if (self.events.indexOf('change') !== -1) { 153 | // for select element 154 | if (node.options) 155 | for (var j = 0, len = node.options.length; j < len; j++) 156 | // cycle through every option 157 | _gryffin_setTimeout.call(window, function(){ 158 | node.selectedIndex = (node.selectedIndex + 1) % node.options.length; 159 | node.dispatchEvent(new Event('change', {bubbles: true, cancelable: true})); 160 | }, i++ * delay); 161 | // for other elements 162 | else 163 | _gryffin_setTimeout.call(window, function(){ 164 | node.dispatchEvent(new Event("change", {bubbles: true, cancelable: true})); 165 | }, i++ * delay); 166 | } 167 | _gryffin_setTimeout.call(window, function(){ 168 | triggerMouseEvent(node, 'mouseout'); 169 | triggerMouseEvent(node, 'mouseleave'); 170 | }, i++ * delay); 171 | } 172 | 173 | // simulate submit event 174 | if (self.events.indexOf('submit') !== -1) 175 | _gryffin_setTimeout.call(window, function(){ 176 | node.dispatchEvent(new Event("submit", {bubbles: true, cancelable: true})); 177 | // dynamically evaluate jsurl of node.action 178 | // TODO: url resolved, enumerate multi-valued form elements? 
179 | if (re_js_links.test(node.action)) 180 | (function(){eval(this.action.substring(11));this.submit()}).call(node); 181 | else 182 | node.submit(); 183 | }, i++ * delay); 184 | 185 | // append discovered links/forms 186 | _gryffin_setTimeout.call(window, function(){ 187 | phantomCallback('element.triggered', results); 188 | }, i * delay); 189 | } 190 | 191 | jsLinks.getData = function() { 192 | // append discovered links/forms 193 | if (self.links.length) 194 | results.links = self.links; 195 | if (self.forms.length) 196 | results.forms = self.forms; 197 | 198 | // trigger next element's events 199 | onTriggered && _gryffin_setTimeout.call(window, onTriggered, 10); 200 | return results; 201 | }; 202 | }, 203 | 'jsLinks': [], 204 | 'arrPtr': 0, 205 | 'parent': triggeringElement, 206 | 'src': [triggerSource], 207 | 'links': [], 208 | 'forms': [] 209 | }; 210 | triggeringElement.jsLinks.push(jsLinkCaptured[key]); 211 | } 212 | }, 213 | depthFirstTrigger: function(element, delay){ 214 | triggeringElement = element; 215 | triggeringElement.trigger(function() { 216 | var nextElement, parentTriggerElement = triggeringElement.parent; 217 | parentTriggerElement.arrPtr++; 218 | 219 | // depth-first approach: go one depth deeper if available 220 | // no more child, execute the immediate sibling 221 | // no more immediate sibling, execute the parent's sibling 222 | nextElement = triggeringElement.jsLinks[0] 223 | || parentTriggerElement.jsLinks[parentTriggerElement.arrPtr] 224 | || (parentTriggerElement.parent && parentTriggerElement.parent.jsLinks[parentTriggerElement.parent.arrPtr]); 225 | 226 | if (nextElement) 227 | jsLinks.depthFirstTrigger(nextElement, delay); 228 | else 229 | phantomCallback('done'); 230 | 231 | }, delay); 232 | }, 233 | triggerAll: function(delay) { 234 | observeDOMChanges(function(newNode){ 235 | extractDOM0Events(newNode); 236 | // append the newly discovered static links and forms 237 | extractRequests(triggeringElement, newNode); 238 | }); 239 | 
240 | // extract jsLinks 241 | extractDOM0Events(); 242 | 243 | if (triggeringElement.jsLinks[0]) 244 | jsLinks.depthFirstTrigger(triggeringElement.jsLinks[0], delay); 245 | else 246 | phantomCallback('done'); 247 | } 248 | }; 249 | })(); 250 | 251 | 252 | function observeDOMChanges(onNewNode) { 253 | // create an observer instance 254 | var observer = new window.MutationObserver(function(mutations) { 255 | mutations.forEach(function(mutation) { 256 | onNewNode && [].forEach.call(mutation.addedNodes || [mutations.target], function(node){ 257 | node && (node.nodeType === 1) && onNewNode.call(this, node, observer); 258 | }); 259 | }); 260 | }); 261 | 262 | // monitor new nodes and attribute changes that involve URLs 263 | observer.observe(document.body, { 264 | subtree: true, 265 | childList: true, 266 | attributes: true, 267 | attributeFilter: attributeFilter 268 | }); 269 | 270 | return observer; 271 | } 272 | 273 | function arrayUnique(arr) { 274 | var result = [], i = 0, key, lastKey, sorted = arr.sort(), len = sorted.length; 275 | for (;key = sorted[i];i++) 276 | if (lastKey !== key) 277 | result.push(lastKey = key); 278 | return result; 279 | } 280 | 281 | 282 | function extractRequests(sink, el) { 283 | sink.links = sink.links || []; 284 | sink.forms = sink.forms || []; 285 | el = el || document.body; 286 | var links = sink.links, forms = sink.forms; 287 | 288 | function getLink(a) { 289 | if (a.hasAttribute('href') || a.href) { 290 | var href = a.href; 291 | if (re_js_links.test(href)) 292 | jsLinks.add('click', a, 'jsurl'); 293 | else if (re_urls.test(href)) 294 | links.push({'url':href, 'text':(a.textContent || a.innerText).replace(/\s+/g, ' ').trim()}); 295 | } 296 | } 297 | 298 | // TODO: ajax forms extractions 299 | function getForm(f) { 300 | var method = f.method ? 
f.method.toLowerCase() : 'get', 301 | url = f.action, 302 | urlparams = [], a, 303 | values = [], submits = [], multiDefaults = {}, dataType = {}, j = 0, input; 304 | 305 | // for javascript-uri submissions, yielding FormRequest with invalid url is meaningless 306 | if (re_js_links.test(url) || typeof url === 'object') { 307 | jsLinks.add('submit', f, 'jsurl'); 308 | return; 309 | } 310 | 311 | for (; input = f.elements[j]; j++) { 312 | var name = encodeURIComponent(input.name), 313 | value = encodeURIComponent(input.value), 314 | nodeName = input.nodeName.toLowerCase(), 315 | type = input.type ? input.type.toLowerCase() : nodeName; 316 | 317 | if (!name) continue; 318 | 319 | // ,