├── renderer
    ├── resource
    │   ├── dummy.go
    │   ├── utils.js
    │   ├── headers.js
    │   ├── render.js
    │   ├── events.js
    │   └── extractors.js
    ├── noscript_test.go
    ├── phantomjs_test.go
    ├── base_test.go
    ├── base.go
    ├── noscript.go
    └── phantomjs.go
├── go.mod
├── fuzzer
    ├── dummy
    │   ├── dummy_test.go
    │   └── dummy.go
    ├── arachni
    │   ├── arachni_test.go
    │   └── arachni.go
    └── sqlmap
    │   ├── sqlmap_test.go
    │   └── sqlmap.go
├── data
    ├── memory_test.go
    ├── store.go
    ├── store_test.go
    └── memory.go
├── .gitignore
├── util.go
├── cmd
    ├── gryffin-standalone
    │   ├── main_test.go
    │   └── main.go
    └── gryffin-distributed
    │   ├── main_test.go
    │   └── main.go
├── global.go
├── Makefile
├── .github
    └── workflows
    │   └── linux.yml
├── html-distance
    ├── bktree_test.go
    ├── bktree.go
    ├── README.md
    ├── feature.go
    └── feature_test.go
├── go.sum
├── session_test.go
├── LICENSE
├── serialize.go
├── README.md
├── session.go
├── gryffin_test.go
└── gryffin.go


/renderer/resource/dummy.go:
--------------------------------------------------------------------------------
1 | // Copyright 2015, Yahoo Inc. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 | 
5 | package dummy
6 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/yahoo/gryffin
 2 | 
 3 | go 1.14
 4 | 
 5 | require (
 6 | 	github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6
 7 | 	github.com/nsqio/go-nsq v1.0.8
 8 | 	golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0
 9 | )
10 | 


--------------------------------------------------------------------------------
/renderer/noscript_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package renderer
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | func TestNoScriptCrawlAsync(t *testing.T) {
12 | 	t.Parallel()
13 | 	r := &NoScriptRenderer{}
14 | 	testCrawlAsync(t, r)
15 | }
16 | 


--------------------------------------------------------------------------------
/renderer/phantomjs_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package renderer
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | func TestPhantomJSCrawlAsync(t *testing.T) {
12 | 	t.Parallel()
13 | 	r := &PhantomJSRenderer{Timeout: 30}
14 | 	testCrawlAsync(t, r)
15 | }
16 | 


--------------------------------------------------------------------------------
/fuzzer/dummy/dummy_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package dummy
 6 | 
 7 | import (
 8 | 	"testing"
 9 | 
10 | 	"github.com/yahoo/gryffin"
11 | )
12 | 
13 | func TestFuzzer(t *testing.T) {
14 | 
15 | 	f := &Fuzzer{}
16 | 	scan := gryffin.NewScan("GET", "http://www.yahoo.com", "")
17 | 	_, err := f.Fuzz(scan)
18 | 	if err != nil {
19 | 		t.Error(err)
20 | 	}
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/data/memory_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package data
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | func TestMemoryStore(t *testing.T) {
12 | 	t.Parallel()
13 | 	testStore(t, NewMemoryStore())
14 | }
15 | 
16 | func BenchmarkMemoryStore(b *testing.B) {
17 | 	s := NewMemoryStore()
18 | 	b.ResetTimer()
19 | 	for i := 0; i < b.N; i++ {
20 | 		benchStore(b, s)
21 | 	}
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | *.test
24 | *.prof
25 | 
26 | # NSQ temporary files.
27 | *.dat 
28 | 
29 | # logstashes
30 | *.log
31 | logstash-forwarder.crt
32 | logstash-forwarder.key
33 | .logstash-forwarder


--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package gryffin
 6 | 
 7 | import (
 8 | 	"crypto/rand"
 9 | 	"fmt"
10 | 	"io"
11 | )
12 | 
// GenRandomID generates a random ID: 16 bytes read from crypto/rand,
// rendered as 32 upper-case hexadecimal characters.
//
// It panics if the random source cannot be read. The signature has no
// error result, and continuing would hand out an all-zero (or partially
// zero) ID that collides with every other ID produced the same way.
func GenRandomID() string {
	// UUID generation is trivial per RSC in https://groups.google.com/d/msg/golang-dev/zwB0k2mpshc/l3zS3oxXuNwJ
	buf := make([]byte, 16)
	if _, err := io.ReadFull(rand.Reader, buf); err != nil {
		panic(fmt.Sprintf("gryffin: reading random bytes for ID: %v", err))
	}
	return fmt.Sprintf("%X", buf)
}
20 | 


--------------------------------------------------------------------------------
/data/store.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Package data provides an interface for common data store operations.
 6 | package data
 7 | 
 8 | // Store is an interface that capture all methods supported for a data store.
 9 | type Store interface {
10 | 	Get(key string) (value interface{}, ok bool)
11 | 	Set(key string, value interface{}) bool
12 | 	IncrBy(key string, delta int64) (newVal int64)
13 | 	Publish(key string, value interface{})
14 | }
15 | 


--------------------------------------------------------------------------------
/renderer/base_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package renderer
 6 | 
 7 | import (
 8 | 	"os"
 9 | 	"testing"
10 | 
11 | 	"github.com/yahoo/gryffin"
12 | )
13 | 
14 | func testCrawlAsync(t *testing.T, r gryffin.Renderer) {
15 | 	if os.Getenv("INTEGRATION") == "" {
16 | 		t.Skip("Skip integration tests.")
17 | 	}
18 | 
19 | 	url := "https://www.yahoo.com/"
20 | 
21 | 	s := gryffin.NewScan("GET", url, "")
22 | 	r.Do(s)
23 | 	<-r.GetRequestBody()
24 | 	for link := range r.GetLinks() {
25 | 		t.Logf("Got link %s", link.Request.URL)
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/fuzzer/dummy/dummy.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package dummy
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"os/exec"
10 | 
11 | 	"github.com/yahoo/gryffin"
12 | )
13 | 
14 | // Fuzzer is the handle for the fuzzing methods.
15 | type Fuzzer struct{}
16 | 
17 | // Fuzz runs a dummy scan.
18 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) {
19 | 
20 | 	cmd := exec.Command("echo", g.Request.URL.Host)
21 | 	_, err = cmd.Output()
22 | 
23 | 	g.Logm("Dummy.Scan", fmt.Sprintf("Echo return %t", cmd.ProcessState.Success()))
24 | 	return 0, err
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/fuzzer/arachni/arachni_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package arachni
 6 | 
 7 | import (
 8 | 	"os"
 9 | 	"testing"
10 | 
11 | 	"github.com/yahoo/gryffin"
12 | )
13 | 
14 | func TestFuzzer(t *testing.T) {
15 | 	if os.Getenv("INTEGRATION") == "" {
16 | 		t.Skip("Skip integration tests.")
17 | 	}
18 | 	s := &Fuzzer{}
19 | 	scan := gryffin.NewScan("GET", "http://127.0.0.1:8081/xss/reflect/full1?in=change_me", "")
20 | 	c, err := s.Fuzz(scan)
21 | 	if err != nil {
22 | 		t.Error(err)
23 | 	}
24 | 	if c == 0 {
25 | 		t.Error("No issue detected.")
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/fuzzer/sqlmap/sqlmap_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package sqlmap
 6 | 
 7 | import (
 8 | 	"os"
 9 | 	"testing"
10 | 
11 | 	"github.com/yahoo/gryffin"
12 | )
13 | 
14 | func TestFuzzer(t *testing.T) {
15 | 	if os.Getenv("INTEGRATION") == "" {
16 | 		t.Skip("Skip integration tests.")
17 | 	}
18 | 
19 | 	s := &Fuzzer{}
20 | 	scan := gryffin.NewScan("GET", "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit", "")
21 | 	c, err := s.Fuzz(scan)
22 | 	if err != nil {
23 | 		t.Error(err)
24 | 	}
25 | 	if c == 0 {
26 | 		t.Error("No issue detected.")
27 | 	}
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/cmd/gryffin-standalone/main_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package main
 6 | 
 7 | import (
 8 | 	"net/http"
 9 | 	"net/http/httptest"
10 | 	"os"
11 | 	"testing"
12 | 
13 | 	"github.com/yahoo/gryffin"
14 | )
15 | 
// h answers every request with the fixed body "Hello World".
var h http.HandlerFunc = func(rw http.ResponseWriter, _ *http.Request) {
	rw.Write([]byte("Hello World"))
}
19 | 
// ts is an httptest server serving h. It is started when the package's
// variables are initialised and is shared by the tests in this file.
var ts = httptest.NewServer(h)
21 | 
22 | func TestMain(t *testing.T) {
23 | 	if os.Getenv("INTEGRATION") == "" {
24 | 		t.Skip("Skip integration tests.")
25 | 	}
26 | 	scan := gryffin.NewScan("GET", ts.URL, "")
27 | 	linkChannels(scan)
28 | 
29 | }
30 | 


--------------------------------------------------------------------------------
/global.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package gryffin
 6 | 
 7 | import (
 8 | 	"io"
 9 | 	"sync"
10 | )
11 | 
12 | var (
13 | 	memoryStore   *GryffinStore
14 | 	logWriter     io.Writer
15 | 	memoryStoreMu sync.Mutex
16 | 	logWriterMu   sync.Mutex
17 | )
18 | 
19 | // SetMemoryStore sets the package internal global variable
20 | // for the memory store.
21 | func SetMemoryStore(m *GryffinStore) {
22 | 	memoryStoreMu.Lock()
23 | 	memoryStore = m
24 | 	memoryStoreMu.Unlock()
25 | }
26 | 
27 | // SetLogWriter sets the log writer.
28 | func SetLogWriter(w io.Writer) {
29 | 	logWriterMu.Lock()
30 | 	logWriter = w
31 | 	logWriterMu.Unlock()
32 | }
33 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # This Makefile is adapted from https://github.com/hashicorp/consul/blob/master/Makefile
 3 | 
 4 | all: format build
 5 | 
 6 | cov:
 7 | 	gocov test | gocov-html > /tmp/coverage.html
 8 | 	open /tmp/coverage.html
 9 | 
10 | build: test
11 | 	cd cmd/gryffin-standalone; go build
12 | 
13 | test:
14 | 	go test ./...
15 | 	@$(MAKE) vet
16 | 
17 | test-mono:
18 | 	go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8081"
19 | 	go run cmd/gryffin-standalone/main.go "http://127.0.0.1:8082/dvwa/vulnerabilities/sqli/?id=1&Submit=Submit"
20 | 
21 | 
22 | test-integration:
23 | 	INTEGRATION=1 go test ./...
24 | 
25 | test-cover:
26 | 	go test --cover ./...
27 | 
28 | format:
29 | 	@gofmt -l .
30 | 
31 | vet:
32 | 	@go vet ./...
33 | 
34 | .PHONY: all cov build test vet web web-push
35 | 


--------------------------------------------------------------------------------
/data/store_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package data
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | func testStore(t *testing.T, s Store) {
12 | 	s.Set("hello", "world")
13 | 	if v, ok := s.Get("hello"); !ok || v != "world" {
14 | 		t.Error("Get and Set is inconsistent.", v)
15 | 	}
16 | 
17 | 	s.Set("foo", 100)
18 | 	if n := s.IncrBy("foo", 10); n != 110 {
19 | 		t.Error("Incr failed.")
20 | 	}
21 | 	if v, ok := s.Get("foo"); v.(int64) != 110 {
22 | 		t.Errorf("Incr is inconsistent %t, %t and %s", ok, v.(int64) == 110, v)
23 | 	}
24 | 
25 | }
26 | 
// benchStore is the workload run once per benchmark iteration: two Set
// calls followed by one IncrBy on s. The b parameter is not used.
func benchStore(b *testing.B, s Store) {
	s.Set("hello", "world")
	// "foo" is set before it is incremented, so the order of these two
	// calls matters.
	s.Set("foo", 100)
	s.IncrBy("foo", 10)
}
32 | 


--------------------------------------------------------------------------------
/cmd/gryffin-distributed/main_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package main
 6 | 
 7 | // Unit test for gryffin-distributed is still on todo list.
 8 | //
 9 | // import (
10 | // 	"net/http"
11 | // 	"net/http/httptest"
12 | // 	"os"
13 | // 	"testing"
14 | 
15 | // 	"github.com/yahoo/gryffin"
16 | // )
17 | 
18 | // var handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
19 | // 	w.Write([]byte("Hello World"))
20 | // })
21 | 
22 | // var ts = httptest.NewServer(handler)
23 | 
24 | // func TestMain(t *testing.T) {
25 | // 	if os.Getenv("INTEGRATION") == "" {
26 | // 		t.Skip("Skip integration tests.")
27 | // 	}
28 | // 	scan := gryffin.NewScan("GET", ts.URL, "")
29 | // 	linkChannels(scan)
30 | 
31 | // }
32 | 


--------------------------------------------------------------------------------
/renderer/base.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package renderer
 6 | 
 7 | import (
 8 | 	"github.com/yahoo/gryffin"
 9 | )
10 | 
11 | type BaseRenderer struct {
12 | 	chanResponse chan *gryffin.Scan
13 | 	chanLinks    chan *gryffin.Scan
14 | 	done         chan string // done, notify with a string of the "reason", e.g. terminated, completed, etc.
15 | }
16 | 
17 | func (r *BaseRenderer) Do(s *gryffin.Scan) {
18 | 	// Dummy operation, just close the channels.
19 | 	defer close(r.chanResponse)
20 | 	defer close(r.chanLinks)
21 | 	defer close(r.done)
22 | }
23 | 
24 | func (r *BaseRenderer) GetRequestBody() <-chan *gryffin.Scan {
25 | 	return r.chanResponse
26 | }
27 | 
28 | func (r *BaseRenderer) GetLinks() <-chan *gryffin.Scan {
29 | 	return r.chanLinks
30 | }
31 | 


--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
 1 | name: Linux
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - '*'
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         go: [ '1.14.2', '1.13' ]
17 |     name: Go ${{ matrix.go }} build
18 |     steps:
19 |       - uses: actions/checkout@v2
20 |       - name: Setup go
21 |         uses: actions/setup-go@v1
22 |         with:
23 |           go-version: ${{ matrix.go }}
24 | 
25 |       - name: go vet
26 |         run: go vet -v ./...
27 | 
28 |       - name: Basic build
29 |         run: go build ./cmd/...
30 | 
31 |       - name: Run tests on linux
32 |         run: go test ./...
33 | 
34 |       - name: Run tests with race detector
35 |         run: go test -v -race -coverprofile=coverage.txt -covermode=atomic ./...
36 | 
37 |       - name: Upload coverage to Codecov
38 |         uses: codecov/codecov-action@v1
39 |         with:
40 |          file: ./coverage.txt
41 | 


--------------------------------------------------------------------------------
/html-distance/bktree_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package distance
 6 | 
 7 | import (
 8 | 	"testing"
 9 | )
10 | 
11 | func TestNewOracle(t *testing.T) {
12 | 	// just add 0 and 1.
13 | 	oracle := NewOracle()
14 | 	for i := uint64(1); i < 2; i++ {
15 | 		oracle.See(i)
16 | 	}
17 | 	r := uint8(2)
18 | 	for i := uint64(0); i < 30; i++ {
19 | 		t.Logf("Has the oracle seen anything closed to %02d (%08b) within distance of %d? %t", i, i, r, oracle.Seen(i, r))
20 | 	}
21 | 
22 | }
23 | 
24 | func BenchmarkOracleSee(b *testing.B) {
25 | 	oracle := NewOracle()
26 | 	for i := 0; i < b.N; i++ {
27 | 		// for i := uint64(1); i < 10000; i++ {
28 | 		oracle.See(uint64(i))
29 | 		// }
30 | 	}
31 | }
32 | 
33 | func BenchmarkOracleSeen(b *testing.B) {
34 | 	oracle := NewOracle()
35 | 	for i := uint64(1); i < 1000000; i++ {
36 | 		oracle.See(i)
37 | 	}
38 | 	b.ResetTimer()
39 | 	r := uint8(2)
40 | 	for i := 0; i < b.N; i++ {
41 | 		oracle.Seen(uint64(i), r)
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
 2 | github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 3 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6 h1:bjfMeqxWEJ6IRUvGkiTkSwx0a6UdQJsbirRSoXogteY=
 4 | github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6/go.mod h1:WVJJvUw/pIOcwu2O8ZzHEhmigq2jzwRNfJVRMJB7bR8=
 5 | github.com/nsqio/go-nsq v1.0.8 h1:3L2F8tNLlwXXlp2slDUrUWSBn2O3nMh8R1/KEDFTHPk=
 6 | github.com/nsqio/go-nsq v1.0.8/go.mod h1:vKq36oyeVXgsS5Q8YEO7WghqidAVXQlcFxzQbQTuDEY=
 7 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 8 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0 h1:Jcxah/M+oLZ/R4/z5RzfPzGbPXnVDPkEDtf2JnuxN+U=
 9 | golang.org/x/net v0.0.0-20200425230154-ff2c4b7c35a0/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
10 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
11 | golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
12 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
13 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
14 | 


--------------------------------------------------------------------------------
/session_test.go:
--------------------------------------------------------------------------------
 1 | package gryffin
 2 | 
 3 | import (
 4 | 	"sync"
 5 | 	"testing"
 6 | 	"time"
 7 | )
 8 | 
 9 | func TestNewGryffinStore(t *testing.T) {
10 | 
11 | 	t.Parallel()
12 | 
13 | 	store1 := NewSharedGryffinStore()
14 | 	store2 := NewSharedGryffinStore()
15 | 
16 | 	var wg sync.WaitGroup
17 | 	wg.Add(1)
18 | 
19 | 	go func() {
20 | 		store1.See("foo", "oracle", uint64(0x1234))
21 | 		b := <-store1.GetSndChan()
22 | 		t.Log("Store1 got ", string(b))
23 | 		store2.GetRcvChan() <- b
24 | 
25 | 		store1.See("foo", "hash", uint64(0x5678))
26 | 		b = <-store1.GetSndChan()
27 | 		t.Log("Store1 got ", string(b))
28 | 		store2.GetRcvChan() <- b
29 | 		wg.Done()
30 | 	}()
31 | 
32 | 	wg.Wait()
33 | 	for i := 0; i < 100000; i++ {
34 | 		if store2.Seen("foo", "oracle", uint64(0x1234), 2) {
35 | 			t.Logf("Store2 see the new oracle value in %d microseconds.", i)
36 | 			break
37 | 		}
38 | 		time.Sleep(1 * time.Microsecond)
39 | 	}
40 | 
41 | 	if !store2.Seen("foo", "oracle", uint64(0x1234), 2) {
42 | 		t.Error("2nd store should see the oracle value in oracle.", store2.Oracles)
43 | 	}
44 | 
45 | 	for i := 0; i < 100000; i++ {
46 | 		if store2.Seen("foo", "hash", uint64(0x5678), 2) {
47 | 			t.Logf("Store2 see the new hash value in %d microseconds.", i)
48 | 			break
49 | 		}
50 | 		time.Sleep(1 * time.Microsecond)
51 | 	}
52 | 
53 | 	if !store2.Seen("foo", "hash", uint64(0x5678), 2) {
54 | 		t.Error("2nd store should see the hash value in hashes.", store2.Hashes)
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015 Yahoo Inc. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Yahoo Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/serialize.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package gryffin
 6 | 
 7 | import (
 8 | 	"encoding/json"
 9 | 	"log"
10 | 	"net/http"
11 | )
12 | 
13 | // NewScanFromJson creates a Scan from the passed JSON blob.
14 | func NewScanFromJson(b []byte) *Scan {
15 | 	// ensure we got a memory store..
16 | 	if memoryStore == nil {
17 | 		memoryStore = NewGryffinStore()
18 | 	}
19 | 
20 | 	var scan Scan
21 | 	json.Unmarshal(b, &scan)
22 | 	return &scan
23 | }
24 | 
25 | // Json serializes Scan as JSON.
26 | func (s *Scan) Json() []byte {
27 | 	ss := &SerializableScan{
28 | 		s,
29 | 		&SerializableRequest{s.Request, ""},
30 | 		&SerializableResponse{
31 | 			s.Response,
32 | 			&SerializableRequest{s.Request, ""},
33 | 		},
34 | 	}
35 | 	b, err := json.Marshal(ss)
36 | 	if err != nil {
37 | 		log.Printf("Scan.Json: err=%v", err)
38 | 		s.Error("Json", err)
39 | 	}
40 | 	return b
41 | 
42 | }
43 | 
// SerializableScan is a Scan extended with serializable
// request and response fields.
//
// Its own Request and Response fields sit at a shallower depth than the
// fields of the same name promoted from the embedded *Scan, so they are
// the ones encoding/json uses.
type SerializableScan struct {
	*Scan
	Request  *SerializableRequest
	Response *SerializableResponse
}
51 | 
// SerializableResponse is an http.Response whose Request field is
// replaced by a SerializableRequest.
type SerializableResponse struct {
	*http.Response
	Request *SerializableRequest
}
58 | 
// SerializableRequest is an http.Request extended with a string Cancel
// field.
//
// NOTE(review): the string field presumably exists to shadow
// http.Request.Cancel, a channel that encoding/json cannot encode —
// confirm.
type SerializableRequest struct {
	*http.Request
	Cancel string
}
65 | 


--------------------------------------------------------------------------------
/fuzzer/arachni/arachni.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package arachni
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"os/exec"
10 | 	"strings"
11 | 
12 | 	"github.com/yahoo/gryffin"
13 | )
14 | 
15 | // Fuzzer is the handle for the fuzzing methods.
16 | type Fuzzer struct{}
17 | 
18 | // Fuzz runs an Arachni scan.
19 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) {
20 | 	var cookies []string
21 | 	// for _, c := range g.CookieJar.Cookies(g.Request.URL) {
22 | 	for _, c := range g.Cookies {
23 | 		cookies = append(cookies, c.String())
24 | 	}
25 | 
26 | 	args := []string{
27 | 		"--checks", "xss*",
28 | 		"--output-only-positives",
29 | 		"--http-request-concurrency", "1",
30 | 		"--http-request-timeout", "10000",
31 | 		"--timeout", "00:03:00",
32 | 		"--scope-dom-depth-limit", "0",
33 | 		"--scope-directory-depth-limit", "0",
34 | 		"--scope-page-limit", "1",
35 | 		"--audit-with-both-methods",
36 | 		"--report-save-path", "/dev/null",
37 | 		"--snapshot-save-path", "/dev/null",
38 | 	}
39 | 
40 | 	// TODO: Post method
41 | 
42 | 	// Cookie
43 | 	if len(cookies) > 0 {
44 | 		args = append(args, "--http-cookie-string", strings.Join(cookies, ";"))
45 | 	}
46 | 
47 | 	args = append(args, g.Request.URL.String())
48 | 
49 | 	cmd := exec.Command("arachni", args...)
50 | 
51 | 	g.Logm("Arachni.Scan", fmt.Sprintf("Run as %s", cmd.Args))
52 | 
53 | 	output, err := cmd.Output()
54 | 
55 | 	count = s.extract(g, string(output))
56 | 
57 | 	if err != nil {
58 | 		return
59 | 	}
60 | 
61 | 	g.Logm("Arachni.Scan", fmt.Sprintf("Arachni return %t", cmd.ProcessState.Success()))
62 | 	return
63 | 
64 | }
65 | 
66 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) {
67 | 	for _, l := range strings.Split(output, "\n") {
68 | 		l = strings.TrimSpace(l)
69 | 		switch {
70 | 		case strings.HasPrefix(l, "[~] Affected page"):
71 | 			g.Logm("Arachni.Findings", l)
72 | 			count++
73 | 		}
74 | 	}
75 | 
76 | 	return
77 | }
78 | 


--------------------------------------------------------------------------------
/renderer/noscript.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package renderer
 6 | 
 7 | import (
 8 | 	"fmt"
 9 | 	"log"
10 | 	"net/http"
11 | 	"strings"
12 | 	"time"
13 | 
14 | 	// "sync"
15 | 
16 | 	"github.com/yahoo/gryffin"
17 | 	"golang.org/x/net/html"
18 | )
19 | 
20 | // allow 100 crawling in the machine (regardless of domains)
21 | 
22 | type NoScriptRenderer struct {
23 | 	BaseRenderer
24 | }
25 | 
26 | func (r *NoScriptRenderer) Do(s *gryffin.Scan) {
27 | 	r.chanResponse = make(chan *gryffin.Scan, 10)
28 | 	r.chanLinks = make(chan *gryffin.Scan, 10)
29 | 
30 | 	crawl := func() {
31 | 
32 | 		defer close(r.chanResponse)
33 | 		defer close(r.chanLinks)
34 | 
35 | 		client := &http.Client{}
36 | 
37 | 		client.Timeout = time.Duration(3) * time.Second
38 | 
39 | 		if response, err := client.Do(s.Request); err == nil {
40 | 			s.Response = response
41 | 		} else {
42 | 			s.Logm("NoScriptRenderer", fmt.Sprintf("error in building request: %s", err))
43 | 			return
44 | 		}
45 | 
46 | 		s.ReadResponseBody()
47 | 
48 | 		if s.IsDuplicatedPage() {
49 | 			return
50 | 		}
51 | 
52 | 		tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody))
53 | 
54 | 		r.chanResponse <- s
55 | 
56 | 		for {
57 | 			t := tokenizer.Next()
58 | 
59 | 			switch t {
60 | 
61 | 			case html.ErrorToken:
62 | 				return
63 | 
64 | 			case html.StartTagToken:
65 | 				token := tokenizer.Token()
66 | 				if token.DataAtom.String() == "a" {
67 | 					for _, attr := range token.Attr {
68 | 						if attr.Key == "href" {
69 | 							link := s.Spawn()
70 | 							// TODO - we drop relative URL as it would drop "#".
71 | 							// Yet, how about real relative URLs?
72 | 							if req, err := http.NewRequest("GET", attr.Val, nil); err == nil {
73 | 								if true {
74 | 									// || req.URL.IsAbs() {
75 | 									link.MergeRequest(req)
76 | 									if link.IsScanAllowed() {
77 | 										r.chanLinks <- link
78 | 									}
79 | 								}
80 | 								// else {
81 | 								// FIXME: ignore relative URL.
82 | 								// }
83 | 							} else {
84 | 								log.Printf("error in building request: %s", err)
85 | 							}
86 | 						}
87 | 					}
88 | 				}
89 | 			}
90 | 		}
91 | 
92 | 		// parse and find links.
93 | 
94 | 	}
95 | 
96 | 	go crawl()
97 | }
98 | 


--------------------------------------------------------------------------------
/html-distance/bktree.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2015, Yahoo Inc. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Package distance is a go library for computing the proximity of the HTML pages.
 6 | // The similarity fingerprint is implemented with Charikar's simhash.
 7 | //
 8 | // Distance is the hamming distance of the fingerprints. Since fingerprint is
 9 | // of size 64 (inherited from hash/fnv), Similarity is defined as 1 - d / 64.
10 | //
11 | // In a normal scenario, pages with similarity > 95% (i.e. d <= 3) can be considered duplicated html pages.
12 | package distance
13 | 
14 | import (
15 | 	"sync"
16 | 
17 | 	"github.com/mfonda/simhash"
18 | )
19 | 
// Oracle answers the query if a fingerprint has been seen.
//
// Every Oracle is one node of a tree: it carries a fingerprint and up to 65
// children, where the child in slot d holds fingerprints at distance d
// from this node.
type Oracle struct {
	fingerprint uint64      // value of this node.
	nodes       [65]*Oracle // children, indexed by distance to fingerprint.
	mu          sync.Mutex  // locked while nodes is read or written.
}

// NewOracle returns an empty oracle: a single node with fingerprint 0 and
// no children.
func NewOracle() *Oracle {
	return &Oracle{}
}

// newNode allocates a childless node carrying fingerprint f.
func newNode(f uint64) *Oracle {
	node := new(Oracle)
	node.fingerprint = f
	return node
}
35 | 
// Distance returns the similarity distance between two fingerprints, as
// computed by simhash.Compare. The package documentation defines this
// distance as the hamming distance of the fingerprints.
func Distance(a, b uint64) uint8 {
	return simhash.Compare(a, b)
}
40 | 
41 | // See asks the oracle to see the fingerprint.
42 | func (n *Oracle) See(f uint64) *Oracle {
43 | 	d := Distance(n.fingerprint, f)
44 | 
45 | 	if d == 0 {
46 | 		// current node with same fingerprint.
47 | 		return n
48 | 	}
49 | 
50 | 	// the target node is already set,
51 | 	n.mu.Lock()
52 | 	defer n.mu.Unlock()
53 | 	if c := n.nodes[d]; c != nil {
54 | 		return c.See(f)
55 | 	}
56 | 
57 | 	n.nodes[d] = newNode(f)
58 | 	return n.nodes[d]
59 | }
60 | 
61 | // Seen asks the oracle if anything closed to the fingerprint in a range (r) is seen before.
62 | func (n *Oracle) Seen(f uint64, r uint8) bool {
63 | 	d := Distance(n.fingerprint, f)
64 | 	if d < r {
65 | 		return true
66 | 	}
67 | 
68 | 	// TODO - should search from d, d-1, d+1, ... until d-r and d+r, for best performance
69 | 	for k := d - r; k <= d+r; k++ {
70 | 		if k > 64 {
71 | 			break
72 | 		}
73 | 		n.mu.Lock()
74 | 		c := n.nodes[k]
75 | 		n.mu.Unlock()
76 | 		if c != nil {
77 | 			if c.Seen(f, r) {
78 | 				return true
79 | 			}
80 | 		}
81 | 	}
82 | 	return false
83 | }
84 | 


--------------------------------------------------------------------------------
/html-distance/README.md:
--------------------------------------------------------------------------------
 1 | # html-distance
 2 | 
 3 | html-distance is a Go library for computing the proximity of HTML pages. The similarity fingerprint is implemented with Charikar's simhash.
 4 | 
 5 | We use a BK-tree (Burkhard and Keller) to check whether a fingerprint is close to any fingerprint in a set, within a defined proximity distance.
 6 | 
 7 | Distance is the Hamming distance of the fingerprints. Since a fingerprint is 64 bits wide (inherited from hash/fnv), similarity is defined as 1 - d / 64.
 8 | 
 9 | In a normal scenario, pages with similarity > 95% (i.e. d ≤ 3) can be considered duplicate HTML pages.
10 | 
11 | 
12 | ## Get the source
13 | 
14 | ```
15 | go get github.com/yahoo/gryffin/html-distance/...
16 | ```
17 | 
18 | ## Install 
19 | 
20 | ```
21 | go install github.com/yahoo/gryffin/html-distance/cmd/html-distance
22 | ```
23 | 
24 | ## Command Line Interface
25 | 
26 | ```
27 | Usage of html-distance:
28 | 
29 |     html-distance url1 url2
30 | ```
31 | 
32 | Example 1
33 | ```
34 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/
35 | 
36 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200
37 | Fetching https://www.flickr.com/photos/120759744@N07/20374523532/in/photostream/, Got 200
38 | Feature distance is 0. HTML Similarity is 100.00%
39 | ```
40 | 
41 | Example 2
42 | ```
43 | $ html-distance https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html
44 | 
45 | Fetching https://www.yahoo.com/politics/kasichs-reception-on-gay-marriage-important-126109300441.html, Got 200
46 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200
47 | Feature distance is 2. HTML Similarity is 96.88%
48 | ```
49 | 
50 | Example 3
51 | ```
52 | $ html-distance https://www.flickr.com/photos/120759744@N07/20389369791/ https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html
53 | 
54 | Fetching https://www.flickr.com/photos/120759744@N07/20389369791/, Got 200
55 | Fetching https://www.yahoo.com/tech/s/verizon-drop-phone-contracts-end-discounted-phones-201530971--finance.html, Got 200
56 | Feature distance is 9. HTML Similarity is 85.94%
57 | ```
58 | 


--------------------------------------------------------------------------------
/renderer/resource/utils.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |   * Copyright 2015, Yahoo Inc. All rights reserved.
 3 |   * Use of this source code is governed by a BSD-style
 4 |   * license that can be found in the LICENSE file.
 5 |   *
 6 |   
 7 | */
 8 | 
 9 | var re_hostname = /^(?:https?|ftp):\/\/([^:\/\?]+)/i,
10 | 	re_extensionFilter = /\.(?:css|pdf|svg|ttf|zip|tar|gz|pkg|exe)(?:[\?#;][^\?#;]*)?$/i,
11 | 	re_jsAnalyticsFilter = /^https?:\/\/(?:\w+\.)?yimg\.com\/mi(?:\/[^\/]+)?\/ywa\.js$/i,
12 | 	re_whitelistedRedirectionDomains = /(?:yahoo\.com?(?:\.\w\w)?|yimg\.com|flickr\.com|y-cloud\.net|yahoodns\.net|yahoofs\.com|zenfs\.com)$/;
13 | 
14 | exports.getHostname = function(url) {
15 | 	url = url.match(re_hostname);
16 | 	return url ? url[1] : null;
17 | }
18 | exports.invalidUrl = function(url, allowedDomains) {
19 | 	url = exports.getHostname(url);
20 | 	return (url === null || (allowedDomains && allowedDomains.indexOf(url) === -1));
21 | }
22 | exports.blacklistedUrl = function(url) {
23 | 	return re_extensionFilter.test(url) || re_jsAnalyticsFilter.test(url);
24 | }
25 | exports.whitelistedRedirectionDomains = function(url) {
26 | 	return re_whitelistedRedirectionDomains.test(exports.getHostname(url));
27 | }
28 | 
29 | exports.cleanResponseBody = function(body) {
30 | 	return (body == '<html><head></head><body></body></html>') ? '' : body;
31 | }
32 | 
33 | // to repackage headers as a dict format, as required by scrappy
34 | exports.prepareResponse = function(response, headersFilter) {
35 | 	return {
36 | 		headers: headersFilter(response.headers),
37 | 		contentType: response.contentType,
38 | 		status: response.status,
39 | 		url: response.url
40 | 	}
41 | }
42 | 
43 | // TODO: add to redis
44 | exports.pageChanges = (function() {
45 | 	var changes = {};
46 | 	return {
47 | 		fetch: function(eventName) {
48 | 			var ret = changes[eventName] || [];
49 | 			changes[eventName] = [];
50 | 			return ret;
51 | 		},
52 | 		fetchAll: function() {
53 | 			var ret = changes;
54 | 			changes = {};
55 | 			return ret;
56 | 		},
57 | 		push: function(eventName, obj) {
58 | 			changes[eventName] = changes[eventName] || [];
59 | 			changes[eventName].push(obj);
60 | 		}
61 | 	}
62 | })();
63 | 
64 | var JSONSignature = '==lXlKfYWch7H9VdJgPCmJ==';
65 | 
66 | exports.printJSON = function(type, output) {
67 | 	output['msgType'] = type;
68 | 	output['signature'] = JSONSignature;
69 | 	console.log(JSON.stringify(output));
70 | 	// console.log(['{'+type, JSON.stringify(output), type+'}'].join(JSONSignature));
71 | }


--------------------------------------------------------------------------------
/renderer/resource/headers.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |   * Copyright 2015, Yahoo Inc. All rights reserved.
 3 |   * Use of this source code is governed by a BSD-style
 4 |   * license that can be found in the LICENSE file.
 5 |   *
 6 |   
 7 | */
 8 | 
 9 | exports.init = function(phantom, page){
10 | 
11 | 	function setReqHeaders(headers, cookieHostname){
12 | 		phantom.clearCookies();
13 | 		// console.log("DEBUG HEADERS... " + cookieHostname)
14 | 
15 | 	  	// for(var i in headers) {
16 | 		  // 	console.log("headers " + i)
17 | 		  // 	console.log(headers[i])
18 |   		// }
19 | 
20 | 
21 | 		if (!headers || typeof(headers) != 'object') return {};
22 | 
23 | 		// avoid requesting for gzipped/compressed content, i.e., Accept-Encoding and Accept request headers unconfigurable
24 | 		// gzip decompression is problematic: https://github.com/ariya/phantomjs/issues/10930
25 | 		// the following headers modification is moved to phantomjs.py
26 | 		// headers['Accept-Encoding'] = "identity";
27 | 		// delete headers['Accept'];
28 | 
29 | 		// make cookies available for subresources requests of the same hostname, otherwise, only the main page will receive cookie
30 | 		if (headers['Cookie']) {
31 | 			headers['Cookie'].split(';').forEach(function(cookie){
32 | 				var eqIndex = cookie.indexOf('=');
33 | 				phantom.addCookie({
34 | 					name: cookie.substr(0, eqIndex).trim(), 
35 | 					value: cookie.substr(eqIndex + 1).trim(),
36 | 					domain: cookieHostname, // already defaulted to hostname of current page
37 | 					path: '/', httponly: true, secure: false
38 | 				});
39 | 			});
40 | 			delete headers['Cookie'];
41 | 		}
42 | 
43 | 
44 | 		// User-Agent in request header must be explicitly configured thru settings.userAgent
45 | 		Object.keys(headers).forEach(function(headerName){
46 | 			if (headerName.toLowerCase() == 'user-agent') {
47 | 				page.settings.userAgent = headers[headerName];
48 | 				delete headers[headerName];
49 | 			}
50 | 		});
51 | 
52 | 		return headers;
53 | 	}
54 | 
55 | 
56 | 	function getRespHeaders(headers) {
57 | 		var out = {};
58 | 		headers && headers.forEach(function(h){
59 | 			// the following headers are stripped to prevent decoding twice by scrapy
60 | 			var name = h.name.toLowerCase(), value = h.value.toLowerCase();
61 | 			if ((name == 'content-encoding' && ['gzip','deflate'].indexOf(value) != -1)
62 | 				|| (name == 'transfer-encoding' && value == 'chunked'))
63 | 				return;
64 | 
65 | 			name = h.name;
66 | 			out[name] = out[name] || [];
67 | 			out[name].push(h.value);
68 | 		});
69 | 		return out;
70 | 	}
71 | 
72 | 
73 | 	return {
74 | 		'setReqHeaders': setReqHeaders,
75 | 		'getRespHeaders': getRespHeaders
76 | 	};
77 | }


--------------------------------------------------------------------------------
/fuzzer/sqlmap/sqlmap.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package sqlmap
  6 | 
  7 | import (
  8 | 	"fmt"
  9 | 	"os/exec"
 10 | 	"strconv"
 11 | 	"strings"
 12 | 
 13 | 	"github.com/yahoo/gryffin"
 14 | )
 15 | 
 16 | // Fuzzer is the handle for the fuzzing methods.
 17 | type Fuzzer struct{}
 18 | 
 19 | // Fuzz runs an sqlmap scan.
 20 | func (s *Fuzzer) Fuzz(g *gryffin.Scan) (count int, err error) {
 21 | 
 22 | 	var cookies []string
 23 | 
 24 | 	// for _, c := range g.CookieJar.Cookies(g.Request.URL) {
 25 | 	for _, c := range g.Cookies {
 26 | 		cookies = append(cookies, c.String())
 27 | 	}
 28 | 
 29 | 	args := []string{
 30 | 		"--batch",
 31 | 		"--timeout=2",
 32 | 		"--retries=3",
 33 | 		"--crawl=0",
 34 | 		"--disable-coloring",
 35 | 		"-o",
 36 | 		"--text-only",
 37 | 		// "--threads=4",
 38 | 		"-v", "0",
 39 | 		"--level=1",
 40 | 		"--risk=1",
 41 | 		"--smart",
 42 | 		"--fresh-queries",
 43 | 		"--purge-output",
 44 | 		"--os=Linux",
 45 | 		"--dbms=MySQL",
 46 | 		"--delay=0.1",
 47 | 		"--time-sec=1",
 48 | 	}
 49 | 
 50 | 	// TODO: Post method
 51 | 	// if g.RequestBody != "" {
 52 | 	// args = append(args, fmt.Sprintf("--data=..."
 53 | 	// }
 54 | 
 55 | 	// only for integer based injection.
 56 | 	var testable []string
 57 | 	for k, vs := range g.Request.URL.Query() {
 58 | 		for _, v := range vs {
 59 | 			_, err := strconv.ParseInt(v, 10, 64)
 60 | 			if err == nil {
 61 | 				// query param value is an integer
 62 | 				testable = append(testable, k)
 63 | 			}
 64 | 		}
 65 | 	}
 66 | 	if len(testable) > 0 {
 67 | 		args = append(args, "-p", strings.Join(testable, ","))
 68 | 	}
 69 | 
 70 | 	// Cookie
 71 | 	if len(cookies) > 0 {
 72 | 		fmt.Println(cookies)
 73 | 		args = append(args, "--cookie", strings.Join(cookies, ";"))
 74 | 	}
 75 | 
 76 | 	args = append(args, "-u", g.Request.URL.String())
 77 | 
 78 | 	cmd := exec.Command("sqlmap", args...)
 79 | 
 80 | 	g.Logm("SQLMap.Scan", fmt.Sprintf("Run as %s", cmd.Args))
 81 | 
 82 | 	output, err := cmd.Output()
 83 | 
 84 | 	if err != nil {
 85 | 		return
 86 | 	}
 87 | 
 88 | 	count = s.extract(g, string(output))
 89 | 
 90 | 	g.Logm("SQLMap.Scan", fmt.Sprintf("SQLMap return %t", cmd.ProcessState.Success()))
 91 | 	return
 92 | 
 93 | }
 94 | 
 95 | func (s *Fuzzer) extract(g *gryffin.Scan, output string) (count int) {
 96 | 
 97 | 	for _, l := range strings.Split(output, "\n") {
 98 | 		l = strings.TrimSpace(l)
 99 | 		switch {
100 | 		case strings.HasPrefix(l, "Payload: "):
101 | 			g.Logm("SQLMap.Findings", l)
102 | 			count++
103 | 		}
104 | 	}
105 | 
106 | 	return
107 | }
108 | 


--------------------------------------------------------------------------------
/data/memory.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package data
  6 | 
  7 | import (
  8 | 	// "log"
  9 | 	// "reflect"
 10 | 	"strings"
 11 | 	"sync/atomic"
 12 | )
 13 | 
 14 | // MemoryStore is an implementation for memory based data store.
 15 | type MemoryStore struct {
 16 | 	heap map[string]interface{}
 17 | }
 18 | 
 19 | // Set stores the key value pair.
 20 | func (m *MemoryStore) Set(key string, value interface{}) bool {
 21 | 	switch value.(type) {
 22 | 
 23 | 	case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64:
 24 | 		s, _ := convertIntToPtr(value)
 25 | 		m.heap[key] = s
 26 | 
 27 | 	default:
 28 | 		m.heap[key] = value
 29 | 	}
 30 | 	return true
 31 | }
 32 | 
 33 | // Get retrieves the value pointed by the key.
 34 | func (m *MemoryStore) Get(key string) (value interface{}, ok bool) {
 35 | 	value, ok = m.heap[key]
 36 | 	switch value.(type) {
 37 | 	case *int, *int8, *int16, *int32, *int64, *uint, *uint8, *uint16, *uint32, *uint64:
 38 | 		s, ok := convertPtrToInt(value)
 39 | 		return s, ok
 40 | 	default:
 41 | 		return value, ok
 42 | 	}
 43 | }
 44 | 
 45 | // IncrBy increments the value pointed by key with the delta, and return the new value.
 46 | func (m *MemoryStore) IncrBy(key string, delta int64) (newVal int64) {
 47 | 	newVal = atomic.AddInt64(m.heap[key].(*int64), delta)
 48 | 	return
 49 | 
 50 | }
 51 | 
 52 | // DelPrefix deletes records from the MemoryStore's heap
 53 | // when the keys match the given prefix.
 54 | func (m *MemoryStore) DelPrefix(prefix string) {
 55 | 	for k := range m.heap {
 56 | 		if strings.HasPrefix(k, prefix) {
 57 | 			delete(m.heap, k)
 58 | 		}
 59 | 	}
 60 | }
 61 | 
 62 | // Publish is a dummy no-op method.
 63 | func (m *MemoryStore) Publish(k string, d interface{}) {
 64 | 
 65 | }
 66 | 
 67 | // NewMemoryStore creates the new store.
 68 | func NewMemoryStore() *MemoryStore {
 69 | 	m := MemoryStore{
 70 | 		heap: make(map[string]interface{}),
 71 | 	}
 72 | 	return &m
 73 | }
 74 | 
 75 | func convertIntToPtr(v interface{}) (s *int64, ok bool) {
 76 | 	var t int64
 77 | 
 78 | 	switch v := v.(type) {
 79 | 
 80 | 	case int:
 81 | 		t = int64(v)
 82 | 	case int8:
 83 | 		t = int64(v)
 84 | 	case int16:
 85 | 		t = int64(v)
 86 | 	case int32:
 87 | 		t = int64(v)
 88 | 	case int64:
 89 | 		t = v
 90 | 	case uint:
 91 | 		t = int64(v)
 92 | 	case uint8:
 93 | 		t = int64(v)
 94 | 	case uint16:
 95 | 		t = int64(v)
 96 | 	case uint32:
 97 | 		t = int64(v)
 98 | 	case uint64:
 99 | 		t = int64(v)
100 | 	}
101 | 
102 | 	return &t, ok
103 | }
104 | 
// convertPtrToInt dereferences a pointer to any built-in signed or unsigned
// integer type and returns the pointed-to value as an int64. ok is false, and
// s is zero, when v is not such a pointer.
func convertPtrToInt(v interface{}) (s int64, ok bool) {
	ok = true

	switch p := v.(type) {
	case *int:
		s = int64(*p)
	case *int8:
		s = int64(*p)
	case *int16:
		s = int64(*p)
	case *int32:
		s = int64(*p)
	case *int64:
		s = *p
	case *uint:
		s = int64(*p)
	case *uint8:
		s = int64(*p)
	case *uint16:
		s = int64(*p)
	case *uint32:
		s = int64(*p)
	case *uint64:
		s = int64(*p)
	default:
		ok = false
	}

	return s, ok
}
135 | 


--------------------------------------------------------------------------------
/html-distance/feature.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package distance
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"io"
 10 | 
 11 | 	"github.com/mfonda/simhash"
 12 | 	"golang.org/x/net/html"
 13 | )
 14 | 
 15 | // Fingerprint generates the fingerprint of an HTML from the io.Reader r and a shingle factor.
 16 | // Shingle refers to the level of shuffling.
 17 | // E.g. with shingle factor =2, input "a", "b", "c" will be converted to "a b", "b c"
 18 | func Fingerprint(r io.Reader, shingle int) uint64 {
 19 | 	if shingle < 1 {
 20 | 		shingle = 1
 21 | 	}
 22 | 	// collect the features via this cf channel.
 23 | 	cf := make(chan string, 1000)
 24 | 	cs := make(chan uint64, 1000)
 25 | 	v := simhash.Vector{}
 26 | 
 27 | 	// Tokenize and then Generate Features. .
 28 | 	go func() {
 29 | 		defer close(cf)
 30 | 		z := html.NewTokenizer(r)
 31 | 		// TODO - export the max token count as an function argument.
 32 | 		count := 0
 33 | 		for tt := z.Next(); count < 5000 && tt != html.ErrorToken; tt = z.Next() {
 34 | 			t := z.Token()
 35 | 			count++
 36 | 			genFeatures(&t, cf)
 37 | 		}
 38 | 
 39 | 	}()
 40 | 
 41 | 	// Collect the features.
 42 | 	go func() {
 43 | 		defer close(cs)
 44 | 		a := make([][]byte, shingle)
 45 | 		for f := <-cf; f != ""; f = <-cf {
 46 | 			// shingle: generate the k-gram token as a single feature.
 47 | 			a = append(a[1:], []byte(f))
 48 | 			// fmt.Printf("%#v\n", a)
 49 | 			// fmt.Printf("%s\n", bytes.Join(a, []byte(" ")))
 50 | 			cs <- simhash.NewFeature(bytes.Join(a, []byte(" "))).Sum()
 51 | 			// cs <- simhash.NewFeature([]byte(f)).Sum()
 52 | 		}
 53 | 	}()
 54 | 
 55 | 	// from the checksum (of feature), append to vector.
 56 | 	for s := <-cs; s != 0; s = <-cs {
 57 | 		for i := uint8(0); i < 64; i++ {
 58 | 			bit := ((s >> i) & 1)
 59 | 			if bit == 1 {
 60 | 				v[i]++
 61 | 			} else {
 62 | 				v[i]--
 63 | 			}
 64 | 		}
 65 | 	}
 66 | 
 67 | 	return simhash.Fingerprint(v)
 68 | 
 69 | }
 70 | 
 71 | func genFeatures(t *html.Token, cf chan<- string) {
 72 | 
 73 | 	s := ""
 74 | 
 75 | 	switch t.Type {
 76 | 	case html.StartTagToken:
 77 | 		s = "A:" + t.DataAtom.String()
 78 | 	case html.EndTagToken:
 79 | 		s = "B:" + t.DataAtom.String()
 80 | 	case html.SelfClosingTagToken:
 81 | 		s = "C:" + t.DataAtom.String()
 82 | 	case html.DoctypeToken:
 83 | 		s = "D:" + t.DataAtom.String()
 84 | 	case html.CommentToken:
 85 | 		s = "E:" + t.DataAtom.String()
 86 | 	case html.TextToken:
 87 | 		s = "F:" + t.DataAtom.String()
 88 | 	case html.ErrorToken:
 89 | 		s = "Z:" + t.DataAtom.String()
 90 | 	}
 91 | 	// fmt.Println(s)
 92 | 	cf <- s
 93 | 
 94 | 	for _, attr := range t.Attr {
 95 | 		switch attr.Key {
 96 | 		case "class":
 97 | 			s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val
 98 | 		// case "id":
 99 | 		// 	s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val
100 | 		case "name":
101 | 			s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val
102 | 		case "rel":
103 | 			s = "G:" + t.DataAtom.String() + ":" + attr.Key + ":" + attr.Val
104 | 		default:
105 | 			s = "G:" + t.DataAtom.String() + ":" + attr.Key
106 | 		}
107 | 		// fmt.Println(s)
108 | 		cf <- s
109 | 	}
110 | 
111 | 	// fmt.Println(s)
112 | 
113 | }
114 | 


--------------------------------------------------------------------------------
/html-distance/feature_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package distance
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"io/ioutil"
 10 | 	"net/http"
 11 | 	"strings"
 12 | 	"testing"
 13 | )
 14 | 
// input is a truncated HTML head section, used as the fixture of
// TestCreateFingerprint and BenchmarkFingerprint.
//
// Alternative, smaller inputs:
// var input = "<p id=0</p>"
// var input = "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>"
var input = `
<html lang="en" class=" is-copy-enabled">
  <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# object: http://ogp.me/ns/object# article: http://ogp.me/ns/article# profile: http://ogp.me/ns/profile#">
    <meta charset='utf-8'>
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta http-equiv="Content-Language" content="en">
    <meta name="viewport" content="width=1020">
    
    
    <title>net/token_test.go at master · golang/net</title>
    <link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="GitHub">
    <link rel="fluid-icon" href="https://github.com/fluidicon.png" title="GitHub">
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-touch-icon-114.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-touch-icon-114.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-touch-icon-144.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-touch-icon-144.png">
    <meta property="fb:app_id" content="1401488693436528">
`
 35 | 
 36 | func TestCreateFingerprint(t *testing.T) {
 37 | 	r := strings.NewReader(input)
 38 | 	f := Fingerprint(r, 2)
 39 | 	t.Logf("%064b", f)
 40 | }
 41 | 
 42 | func TestSee(t *testing.T) {
 43 | 
 44 | 	oracle := NewOracle()
 45 | 
 46 | 	tests := strings.Split(`<a>b<c>
 47 | <a>d<c>
 48 | <a><p></a></p>
 49 | <a>1<p>2</a>3</p>
 50 | <a>1<button>2</a>3</button>
 51 | <a>1<b>2</a>3</b>
 52 | <a>1<div>2<div>3</a>4</div>5</div>
 53 | <table><a>1<p>2</a>3</p>
 54 | <b><b><a><p></a>
 55 | <b><a><b><p></a>
 56 | <a><b><b><p></a>
 57 | <p>1<s id="A">2<b id="B">3</p>4</s>5</b>
 58 | <table><a>1<td>2</td>3</table>
 59 | <table>A<td>B</td>C</table>
 60 | <a><svg><tr><input></a>`, "\n")
 61 | 
 62 | 	for _, test := range tests {
 63 | 		r := strings.NewReader(test)
 64 | 		f := Fingerprint(r, 2)
 65 | 		oracle.See(f)
 66 | 		t.Logf(" ---- for %064b %s.", f, test)
 67 | 	}
 68 | 
 69 | 	for _, test := range tests {
 70 | 		_ = test
 71 | 		ntest := "<a>d<c>"
 72 | 		r := strings.NewReader(ntest)
 73 | 		f := Fingerprint(r, 2)
 74 | 		t.Logf("%t for %064b %s.", oracle.Seen(f, 2), f, ntest)
 75 | 	}
 76 | 
 77 | }
 78 | 
 79 | func TestSeenWithExternalHTML(t *testing.T) {
 80 | 
 81 | 	t.Skip("skip htmlsample test ..")
 82 | 	oracle := NewOracle()
 83 | 
 84 | 	f1, _ := ioutil.ReadFile("./htmlsamples/flickr001.html")
 85 | 	f2, _ := ioutil.ReadFile("./htmlsamples/flickr002.html")
 86 | 	f3, _ := ioutil.ReadFile("./htmlsamples/yahoo001.html")
 87 | 
 88 | 	{
 89 | 		r := bytes.NewReader(f1)
 90 | 		f := Fingerprint(r, 2)
 91 | 		oracle.See(f)
 92 | 	}
 93 | 
 94 | 	{
 95 | 		r := bytes.NewReader(f2)
 96 | 		f := Fingerprint(r, 2)
 97 | 		t.Logf("found? %t", oracle.Seen(f, 2))
 98 | 
 99 | 	}
100 | 
101 | 	{
102 | 		r := bytes.NewReader(f3)
103 | 		f := Fingerprint(r, 2)
104 | 		t.Logf("found? %t", oracle.Seen(f, 2))
105 | 
106 | 	}
107 | 
108 | }
109 | 
110 | func BenchmarkFingerprint(b *testing.B) {
111 | 	for i := 0; i < b.N; i++ {
112 | 		r := strings.NewReader(input)
113 | 		Fingerprint(r, 2)
114 | 	}
115 | }
116 | 
117 | func BenchmarkFingerprintWithExternalHTML(b *testing.B) {
118 | 
119 | 	b.Skip("Skip external dependent tests.")
120 | 	resp, err := http.Get("https://www.yahoo.com/")
121 | 	if err != nil {
122 | 		b.Fatal(err)
123 | 	}
124 | 	defer resp.Body.Close()
125 | 	input, err := ioutil.ReadAll(resp.Body)
126 | 	if err != nil {
127 | 		b.Fatal(err)
128 | 	}
129 | 
130 | 	b.ResetTimer()
131 | 
132 | 	for i := 0; i < b.N; i++ {
133 | 		r := bytes.NewReader(input)
134 | 		Fingerprint(r, 2)
135 | 	}
136 | }
137 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ARCHIVED 
 2 | 
 3 | 
 4 | Gryffin (beta) [![Build Status](https://travis-ci.org/yahoo/gryffin.svg?branch=master)](https://travis-ci.org/yahoo/gryffin) [![GoDoc](https://godoc.org/github.com/yahoo/gryffin?status.svg)](https://godoc.org/github.com/yahoo/gryffin)
 5 | ==========
 6 | 
 7 | Gryffin is a large scale web security scanning platform. It is not yet another scanner. It was written to solve two specific problems with existing scanners: coverage and scale.
 8 | 
 9 | Better coverage translates to fewer false negatives. Inherent scalability translates to capability of scanning, and supporting a large elastic application infrastructure. Simply put, the ability to scan 1000 applications today to 100,000 applications tomorrow by straightforward horizontal scaling.
10 | 
11 | ## Coverage
12 | Coverage has two dimensions - one during crawl and the other during fuzzing. In the crawl phase, coverage implies being able to find as much of the application footprint as possible. In the scan phase, or while fuzzing, it implies being able to test each part of the application in depth for an applied set of vulnerabilities.
13 | 
14 | #### Crawl Coverage
15 | Today a large number of web applications are template-driven, meaning the same code or path generates millions of URLs. For a security scanner, it just needs one of the millions of URLs generated by the same code or path. Gryffin's crawler does just that.
16 | 
17 | ##### Page Deduplication
18 | At the heart of Gryffin is a deduplication engine that compares a new page with already seen pages. If the HTML structure of the new page is similar to those already seen, it is classified as a duplicate and not crawled further.
19 | 
20 | ##### DOM Rendering and Navigation
21 | A large number of applications today are rich applications. They are heavily driven by client-side JavaScript. In order to discover links and code paths in such applications, Gryffin's crawler uses PhantomJS for DOM rendering and navigation.
22 | 
23 | #### Scan Coverage
24 | As Gryffin is a scanning platform, not a scanner, it does not have its own fuzzer modules, even for fuzzing common web vulnerabilities like XSS and SQL Injection.
25 | 
26 | It's not wise to reinvent the wheel where you do not have to. Gryffin at production scale at Yahoo uses open source and custom fuzzers. Some of these custom fuzzers might be open sourced in the future, and might or might not be part of the Gryffin repository.
27 | 
28 | For demonstration purposes, Gryffin comes integrated with sqlmap and arachni. It does not endorse them or any other scanner in particular.
29 | 
30 | The philosophy is to improve scan coverage by being able to fuzz for just what you need.
31 | 
32 | ## Scale
33 | While Gryffin is available as a standalone package, it's primarily built for scale.
34 | 
35 | Gryffin is built on the publisher-subscriber model. Each component is either a publisher, or a subscriber, or both. This allows Gryffin to scale horizontally by simply adding more subscriber or publisher nodes.
36 | 
37 | ## Operating Gryffin
38 | 
39 | ### Pre-requisites
40 | 
41 | 1. Go - `go1.13` or later
42 | 2. PhantomJS, v2
43 | 3. Sqlmap (for fuzzing SQLi)
44 | 4. Arachni (for fuzzing XSS and web vulnerabilities)
45 | 5. NSQ,
46 |     - running lookupd at port 4160,4161
47 |     - running nsqd at port 4150,4151
48 |     - with `--max-msg-size=5000000`
49 | 6. Kibana and Elastic search, for dashboarding
50 |     - listening to JSON over port 5000
51 |     - Preconfigured docker image available in https://hub.docker.com/r/yukinying/elk/
52 | 
53 | 
54 | ### Installation
55 | 
56 | ```
57 | go get -u github.com/yahoo/gryffin/...
58 | ```
59 | 
60 | ### Run
61 | 
62 | (WIP)
63 | 
64 | ## TODO
65 | 
66 | 1. Mobile browser user agent
67 | 2. Preconfigured docker images
68 | 3. Redis for sharing states across machines
69 | 4. Instruction to run gryffin (distributed or standalone)
70 | 5. Documentation for html-distance
71 | 6. Implement a JSON serializable cookiejar.
72 | 7. Identify duplicate url patterns based on simhash result.
73 | 
74 | ## Talks and Slides
75 | 
76 | - AppsecUSA 2015: [abstract](http://sched.co/3Vgm), [slide](http://go-talks.appspot.com/github.com/yukinying/talks/gryffin/gryffin.slide), [recording](https://youtu.be/IWiR2CPOHvc)
77 | 
78 | ## Credits
79 | 
80 | - Adonis Fung @ Yahoo, for the asynchronous phantomjs based crawler and DOM event navigator.
81 | - [Simhash algorithm](http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarEstim.pdf) by Moses Charikar
82 | - Simhash implementation provided by [mfonda/simhash](https://github.com/mfonda/simhash).
83 | - [Sqlmap](http://sqlmap.org/)
84 | - [Arachni](http://www.arachni-scanner.com/)
85 | 
86 | 
87 | ## Licence
88 | 
89 | Code licensed under the BSD-style license. See LICENSE file for terms.
90 | 


--------------------------------------------------------------------------------
/session.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package gryffin
  6 | 
  7 | import (
  8 | 	"encoding/json"
  9 | 	"fmt"
 10 | 	"strconv"
 11 | 	"sync"
 12 | 	"time"
 13 | 
 14 | 	distance "github.com/yahoo/gryffin/html-distance"
 15 | )
 16 | 
 17 | // GryffinStore includes data and handles for Gryffin message processing,
 18 | type GryffinStore struct {
 19 | 	Oracles map[string]*distance.Oracle
 20 | 	Hashes  map[string]bool
 21 | 	Hits    map[string]int
 22 | 	Mu      sync.RWMutex
 23 | 	// store   data.Store - currently unused, TODO: use or remove
 24 | 	snd chan []byte
 25 | 	rcv chan []byte
 26 | }
 27 | 
 28 | // PublishMessage is the data in the messages handled by Gryffin.
 29 | type PublishMessage struct {
 30 | 	F string // function, i.e. See or Seen
 31 | 	T string // type (kind), i.e. oracle or hash
 32 | 	K string // key
 33 | 	V string // value
 34 | }
 35 | 
 36 | func NewSharedGryffinStore() *GryffinStore {
 37 | 	return newGryffinStore(true)
 38 | }
 39 | 
 40 | func NewGryffinStore() *GryffinStore {
 41 | 	return newGryffinStore(false)
 42 | }
 43 | 
 44 | func newGryffinStore(shared bool) *GryffinStore {
 45 | 
 46 | 	store := GryffinStore{
 47 | 		Oracles: make(map[string]*distance.Oracle),
 48 | 		Hashes:  make(map[string]bool),
 49 | 		Hits:    make(map[string]int),
 50 | 	}
 51 | 
 52 | 	if shared {
 53 | 		store.snd = make(chan []byte, 10)
 54 | 		store.rcv = make(chan []byte, 10)
 55 | 	}
 56 | 
 57 | 	// start a go rountine to read from the channel
 58 | 	go store.processRcvMsg()
 59 | 
 60 | 	return &store
 61 | }
 62 | 
 63 | func (s *GryffinStore) GetRcvChan() chan []byte {
 64 | 	return s.rcv
 65 | }
 66 | 
 67 | func (s *GryffinStore) GetSndChan() chan []byte {
 68 | 	return s.snd
 69 | }
 70 | 
 71 | func (s *GryffinStore) processRcvMsg() {
 72 | 	for jsonPayload := range s.rcv {
 73 | 		var m PublishMessage
 74 | 		err := json.Unmarshal(jsonPayload, &m)
 75 | 		if err != nil {
 76 | 			fmt.Println("Error in processRcvMsg")
 77 | 			continue
 78 | 		}
 79 | 		fmt.Println("Got a RcvMsg: ", m) // DEBUG
 80 | 		if m.F == "See" {
 81 | 			v, _ := strconv.ParseUint(m.V, 16, 64)
 82 | 			switch m.T {
 83 | 			case "hash":
 84 | 				s.hashesSee(m.K, v, true)
 85 | 			case "oracle":
 86 | 				s.oracleSee(m.K, v, true)
 87 | 			}
 88 | 		}
 89 | 	}
 90 | }
 91 | 
 92 | func (s *GryffinStore) See(prefix string, kind string, v uint64) {
 93 | 
 94 | 	if kind == "oracle" {
 95 | 		s.oracleSee(prefix, v, false)
 96 | 		return
 97 | 	}
 98 | 	if kind == "hash" {
 99 | 		s.hashesSee(prefix, v, false)
100 | 		return
101 | 	}
102 | }
103 | 
104 | func (s *GryffinStore) Seen(prefix string, kind string, v uint64, r uint8) bool {
105 | 
106 | 	switch kind {
107 | 	case "oracle":
108 | 		s.Mu.RLock()
109 | 		if oracle, ok := s.Oracles[prefix]; ok {
110 | 			s.Mu.RUnlock()
111 | 			return oracle.Seen(v, r)
112 | 		}
113 | 		s.Mu.RUnlock()
114 | 	case "hash":
115 | 		k := prefix + "/" + strconv.FormatUint(v, 10)
116 | 		s.Mu.RLock()
117 | 		_, ok := s.Hashes[k]
118 | 		s.Mu.RUnlock()
119 | 		return ok
120 | 	}
121 | 	return false
122 | }
123 | 
124 | func (s *GryffinStore) oracleSee(prefix string, f uint64, localOnly bool) {
125 | 	k := prefix
126 | 	// Local update
127 | 	s.Mu.RLock()
128 | 	oracle, ok := s.Oracles[k]
129 | 	s.Mu.RUnlock()
130 | 	if !ok {
131 | 		s.Mu.Lock()
132 | 		s.Oracles[k] = distance.NewOracle()
133 | 		oracle = s.Oracles[k]
134 | 		s.Mu.Unlock()
135 | 	}
136 | 	oracle.See(f)
137 | 
138 | 	// Remote update
139 | 	if !localOnly && s.snd != nil {
140 | 		go func() {
141 | 			jsonPayload, _ := json.Marshal(&PublishMessage{F: "See", T: "oracle", K: prefix, V: fmt.Sprintf("%x", f)})
142 | 			// fmt.Println("Sending... ", s.snd, string(jsonPayload))
143 | 			s.snd <- jsonPayload
144 | 		}()
145 | 	}
146 | }
147 | 
148 | func (s *GryffinStore) hashesSee(prefix string, h uint64, localOnly bool) {
149 | 	k := prefix + "/" + strconv.FormatUint(h, 10)
150 | 	s.Mu.Lock()
151 | 	s.Hashes[k] = true
152 | 	s.Mu.Unlock()
153 | 	// Remote update
154 | 	if !localOnly && s.snd != nil {
155 | 		go func() {
156 | 			jsonPayload, _ := json.Marshal(&PublishMessage{F: "See", T: "hash", K: prefix, V: fmt.Sprintf("%x", h)})
157 | 			s.snd <- jsonPayload
158 | 		}()
159 | 	}
160 | }
161 | 
162 | func (s *GryffinStore) Hit(prefix string) bool {
163 | 	// prefix is domain.
164 | 	ts := time.Now().Truncate(5 * time.Second).Unix()
165 | 	k := prefix + "/" + strconv.FormatInt(ts, 10)
166 | 	s.Mu.Lock()
167 | 	defer s.Mu.Unlock()
168 | 	if v, ok := s.Hits[k]; ok {
169 | 		if v >= 5 {
170 | 			return false
171 | 		}
172 | 		s.Hits[k]++
173 | 		return true
174 | 	}
175 | 	s.Hits[k] = 1
176 | 	return true
177 | }
178 | 


--------------------------------------------------------------------------------
/cmd/gryffin-standalone/main.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"net"
 12 | 	"net/http"
 13 | 	"os"
 14 | 	"sync"
 15 | 	"time"
 16 | 
 17 | 	"github.com/yahoo/gryffin"
 18 | 	"github.com/yahoo/gryffin/fuzzer/arachni"
 19 | 	"github.com/yahoo/gryffin/fuzzer/sqlmap"
 20 | 	"github.com/yahoo/gryffin/renderer"
 21 | )
 22 | 
// Command-line inputs. method and body are flags; url is filled by main from
// the single positional argument.
var method = flag.String("method", "GET", "the HTTP method for the request.")
var url string
var body = flag.String("data", "", "the data used in a (POST) request.")
 26 | 
 27 | func usage() {
 28 | 	fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
 29 | 	fmt.Fprintf(os.Stderr, "\tgryffin-standalone [flags] seed-url\n")
 30 | 	fmt.Fprintf(os.Stderr, "Flags:\n")
 31 | 	flag.PrintDefaults()
 32 | }
 33 | 
 34 | // THIS IS BAD CODE per https://blog.golang.org/pipelines, and is created for prototyping.
 35 | // In production, we will move the channels out and use message queue instead.
 36 | func linkChannels(s *gryffin.Scan) {
 37 | 
 38 | 	var wg sync.WaitGroup
 39 | 
 40 | 	chanStart := make(chan *gryffin.Scan, 10)
 41 | 	chanRateLimit := make(chan *gryffin.Scan, 10)
 42 | 	chanCrawl := make(chan *gryffin.Scan, 10)
 43 | 	chanFuzz := make(chan *gryffin.Scan, 10)
 44 | 	// defer close(chanStart)
 45 | 	defer close(chanRateLimit)
 46 | 	defer close(chanCrawl)
 47 | 	defer close(chanFuzz)
 48 | 
 49 | 	// TODO - name all of these functions.
 50 | 
 51 | 	// Crawl -> Filter by Domain / Rate Limit
 52 | 	go func() {
 53 | 
 54 | 		for scan := range chanCrawl {
 55 | 			r := &renderer.PhantomJSRenderer{Timeout: 10}
 56 | 			scan.CrawlAsync(r)
 57 | 
 58 | 			go func() {
 59 | 				if s := <-r.GetRequestBody(); s != nil {
 60 | 					// add two workers (two fuzzers)
 61 | 					wg.Add(2)
 62 | 					chanFuzz <- s
 63 | 				}
 64 | 
 65 | 			}()
 66 | 
 67 | 			scan := scan // prevent capturing by goroutine below
 68 | 			go func() {
 69 | 				//
 70 | 				// Renderer will close all channels when a page is duplicated.
 71 | 				// Therefore we don't need to test whether the link is coming
 72 | 				// from a duplicated page or not
 73 | 				for newScan := range r.GetLinks() {
 74 | 					if ok := newScan.ShouldCrawl(); ok {
 75 | 						// add one workers (a new crawl)
 76 | 						wg.Add(1)
 77 | 						chanRateLimit <- newScan
 78 | 					}
 79 | 				}
 80 | 				// remove one worker (finish crawl)
 81 | 				wg.Done()
 82 | 				scan.Logm("Get Links", "Finished")
 83 | 
 84 | 			}()
 85 | 
 86 | 		}
 87 | 
 88 | 	}()
 89 | 
 90 | 	go func() {
 91 | 		for scan := range chanFuzz {
 92 | 			scan := scan // prevent capture by func literal below
 93 | 			go func() {
 94 | 				f := &arachni.Fuzzer{}
 95 | 				f.Fuzz(scan)
 96 | 				// remove a fuzzer worker.
 97 | 				wg.Done()
 98 | 			}()
 99 | 			go func() {
100 | 				f := &sqlmap.Fuzzer{}
101 | 				f.Fuzz(scan)
102 | 				// remove a fuzzer worker.
103 | 				wg.Done()
104 | 			}()
105 | 		}
106 | 
107 | 	}()
108 | 
109 | 	// Rate Limit -> Crawl
110 | 	go func() {
111 | 		for scan := range chanRateLimit {
112 | 			if delay := scan.RateLimit(); delay != 0 {
113 | 				go func() {
114 | 					time.Sleep(time.Duration(delay) * time.Second)
115 | 					chanRateLimit <- scan
116 | 				}()
117 | 				// TODO queue it again.
118 | 				continue
119 | 			}
120 | 			chanCrawl <- scan
121 | 		}
122 | 	}()
123 | 
124 | 	// Start, Poke -> RateLimit
125 | 	go func() {
126 | 		for scan := range chanStart {
127 | 			// TODO: add error handling
128 | 			// err := scan.Poke(&http.Client{})
129 | 			_ = scan.Poke(&http.Client{})
130 | 			// if err != nil {
131 | 			// if scan.HitCount <= 5 {
132 | 			// 	go func() {
133 | 			// 		time.Sleep(5 * time.Second)
134 | 			// 		chanStart <- scan
135 | 			// 	}()
136 | 			// }
137 | 			// continue
138 | 			// }
139 | 			chanRateLimit <- scan
140 | 		}
141 | 	}()
142 | 
143 | 	chanStart <- s
144 | 	close(chanStart)
145 | 
146 | 	// add one worker (start crawl)
147 | 	wg.Add(1)
148 | 	wg.Wait()
149 | }
150 | 
151 | func main() {
152 | 
153 | 	flag.Usage = usage
154 | 	flag.Parse()
155 | 
156 | 	switch flag.NArg() {
157 | 	case 1:
158 | 		url = flag.Arg(0)
159 | 	default:
160 | 		usage()
161 | 		return
162 | 
163 | 	}
164 | 
165 | 	fmt.Println("=== Running Gryffin ===")
166 | 
167 | 	var w io.Writer
168 | 	// TCP port listening messages.
169 | 	tcpout, err := net.Dial("tcp", "localhost:5000")
170 | 	if err != nil {
171 | 		// fmt.Println("Cannot establish tcp connection to log listener.")
172 | 		w = os.Stdout
173 | 	} else {
174 | 		w = io.MultiWriter(os.Stdout, tcpout)
175 | 	}
176 | 
177 | 	gryffin.SetLogWriter(w)
178 | 
179 | 	scan := gryffin.NewScan(*method, url, *body)
180 | 	scan.Logm("Main", "Started")
181 | 
182 | 	linkChannels(scan)
183 | 
184 | 	fmt.Println("=== End Running Gryffin ===")
185 | 
186 | }
187 | 


--------------------------------------------------------------------------------
/renderer/phantomjs.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package renderer
  6 | 
  7 | import (
  8 | 	"encoding/json"
  9 | 	"io"
 10 | 	"io/ioutil"
 11 | 	"net/http"
 12 | 	"net/url"
 13 | 	"os"
 14 | 	"os/exec"
 15 | 	"reflect"
 16 | 	"strconv"
 17 | 	"strings"
 18 | 	"time"
 19 | 
 20 | 	"github.com/yahoo/gryffin"
 21 | 	_ "github.com/yahoo/gryffin/renderer/resource"
 22 | )
 23 | 
 24 | /* all of these are the JSON struct for phantomjs render.js */
 25 | 
// PhantomJSRenderer renders a scan by running the phantomjs binary on render.js.
type PhantomJSRenderer struct {
	BaseRenderer
	// Timeout is the number of seconds wait allows before killing the process.
	Timeout int
	// process is the started phantomjs process; set by Do, used by kill.
	process *os.Process
}
 31 | 
// input is the options object that Do marshals to JSON and passes to render.js
// as its last command-line argument.
type input struct {
	Method         string       `json:"method"`
	AllowedDomains []string     `json:"allowed_domains,omitempty"`
	Headers        inputHeaders `json:"headers"`
}
 37 | 
// inputHeaders holds the request headers sent to render.js inside input.
// Cookie carries no JSON tag, so it is encoded under the key "Cookie".
type inputHeaders struct {
	AcceptEncoding string `json:"Accept-Encoding"`
	AcceptLanguage string `json:"Accept-Language"`
	Cookie         string
	UserAgent      string `json:"User-Agent"`
}
 44 | 
// details groups the links and forms reported for a page.
// parseDetails walks these fields by reflection, in declaration order, and
// handles every field whose type is []link or []form.
type details struct {
	Links        []link
	Forms        []form
	ChildFrames  []link
	SubResources []link
	Redirects    []link
	MainFrame    []link
}
 53 | 
// link is a single URL entry of details; toScan turns it into a GET scan.
type link struct {
	Text string
	Url  string
}
 58 | 
// form is a single form entry of details; toScan turns it into a scan that
// sends Data as the body (POST) or as the query string (other methods).
type form struct {
	Data     string
	DataType string
	Method   string
	Url      string
}
 65 | 
// response is the HTTP response part of a message decoded from render.js
// output; fill converts it into an *http.Response on the scan.
type response struct {
	Headers     map[string][]string
	Body        string
	ContentType string
	Status      int
	Url         string
	Details     details
}
 74 | 
// responseMessage is the message variant that carries a response.
// NOTE(review): the units of Elapsed and the meaning of Ok are defined by
// render.js and are not visible here.
type responseMessage struct {
	Response response
	Elapsed  int
	Ok       int
}
 80 | 
// domMessage is the message variant describing DOM activity.
// It is decoded as part of message but not otherwise used in this file.
type domMessage struct {
	Action   string
	Events   []string
	KeyChain []string
	JSError  []string
}
 87 | 
// message is the envelope decoded from each JSON value on the phantomjs
// stdout. extract checks which embedded pointer is non-nil to tell the
// variants apart.
//
// NOTE(review): encoding/json (Go 1.10+) refuses to allocate a nil embedded
// pointer to an unexported struct type while decoding and reports an error
// instead - confirm these embedded fields are actually populated.
type message struct {
	*responseMessage
	*domMessage
	*details
	Signature string
	MsgType   string
}
 95 | 
// noCloseReader adds a no-op Close to an io.Reader so that it can be used
// where a closable reader is required (fill uses it as an http.Response Body).
type noCloseReader struct {
	io.Reader
}

// Close does nothing and always returns nil.
func (r noCloseReader) Close() error {
	return nil
}
103 | 
104 | func (m *response) fill(s *gryffin.Scan) {
105 | 
106 | 	/*
107 | 	   {"response":{"headers":{"Date":["Thu, 30 Jul 2015 00:13:43 GMT"],"Set-Cookie":["B=82j3nrdarir1n&b=3&s=23; expires=Sun, 30-Jul-2017 00:13:43 GMT; path=/; domain=.yahoo.com"]
108 | 
109 | 	*/
110 | 	resp := &http.Response{
111 | 		Request:    s.Request,
112 | 		StatusCode: m.Status,
113 | 		Status:     strconv.FormatInt(int64(m.Status), 10),
114 | 		Proto:      "HTTP/1.1",
115 | 		ProtoMajor: 1,
116 | 		ProtoMinor: 1,
117 | 		Header:     m.Headers,
118 | 		Body:       noCloseReader{strings.NewReader(m.Body)},
119 | 	}
120 | 
121 | 	s.Response = resp
122 | 	s.ReadResponseBody()
123 | 
124 | }
125 | 
126 | func (f *form) toScan(parent *gryffin.Scan) *gryffin.Scan {
127 | 	m := strings.ToUpper(f.Method)
128 | 	u := f.Url
129 | 	var r io.Reader
130 | 	if m == "POST" {
131 | 		r = ioutil.NopCloser(strings.NewReader(f.Data))
132 | 	} else {
133 | 		parsed, err := url.Parse(u)
134 | 		if err == nil {
135 | 			parsed.RawQuery = f.Data
136 | 			u = parsed.String()
137 | 		}
138 | 	}
139 | 
140 | 	if req, err := http.NewRequest(m, u, r); err == nil {
141 | 		s := parent.Spawn()
142 | 		s.MergeRequest(req)
143 | 		return s
144 | 	}
145 | 	// invalid url
146 | 	return nil
147 | }
148 | 
149 | func (l *link) toScan(parent *gryffin.Scan) *gryffin.Scan {
150 | 	if req, err := http.NewRequest("GET", l.Url, nil); err == nil {
151 | 		s := parent.Spawn()
152 | 		s.MergeRequest(req)
153 | 		return s
154 | 	}
155 | 	// invalid url
156 | 	return nil
157 | }
158 | 
159 | func (r *PhantomJSRenderer) extract(stdout io.ReadCloser, s *gryffin.Scan) {
160 | 	defer close(r.done)
161 | 
162 | 	dec := json.NewDecoder(stdout)
163 | 	for {
164 | 		var m message
165 | 		err := dec.Decode(&m)
166 | 		if err == io.EOF {
167 | 			return
168 | 		}
169 | 		if m.responseMessage != nil {
170 | 			m.Response.fill(s)
171 | 			if s.IsDuplicatedPage() {
172 | 				return
173 | 			}
174 | 			r.chanResponse <- s
175 | 			r.parseDetails(&m.Response.Details, s)
176 | 		}
177 | 
178 | 		if m.details != nil {
179 | 			r.parseDetails(m.details, s)
180 | 		}
181 | 	}
182 | }
183 | 
184 | func (r *PhantomJSRenderer) parseDetails(d *details, s *gryffin.Scan) {
185 | 	v := reflect.ValueOf(*d)
186 | 	for i := 0; i < v.NumField(); i++ {
187 | 		if links, ok := v.Field(i).Interface().([]link); ok {
188 | 			for _, link := range links {
189 | 				if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
190 | 					r.chanLinks <- newScan
191 | 				}
192 | 			}
193 | 		}
194 | 		if forms, ok := v.Field(i).Interface().([]form); ok {
195 | 			for _, form := range forms {
196 | 				if newScan := form.toScan(s); newScan != nil && newScan.IsScanAllowed() {
197 | 					r.chanLinks <- newScan
198 | 				}
199 | 			}
200 | 		}
201 | 	}
202 | }
203 | 
204 | func (r *PhantomJSRenderer) kill(reason string, s *gryffin.Scan) {
205 | 	if err := r.process.Kill(); err == nil {
206 | 		s.Logmf("PhantomjsRenderer.Do", "[%s] Terminating the crawl process.", reason)
207 | 	}
208 | }
209 | 
210 | func (r *PhantomJSRenderer) wait(s *gryffin.Scan) {
211 | 
212 | 	select {
213 | 	case <-r.done:
214 | 		r.kill("Cleanup", s)
215 | 	case <-time.After(time.Duration(r.Timeout) * time.Second):
216 | 		r.kill("Timeout", s)
217 | 	}
218 | 	close(r.chanResponse)
219 | 	close(r.chanLinks)
220 | }
221 | 
222 | func (r *PhantomJSRenderer) Do(s *gryffin.Scan) {
223 | 
224 | 	r.chanResponse = make(chan *gryffin.Scan, 10)
225 | 	r.chanLinks = make(chan *gryffin.Scan, 10)
226 | 	r.done = make(chan string)
227 | 
228 | 	// Construct the command.
229 | 	// render.js http(s)://<host>[:port][/path] [{"method":"post", "data":"a=1&b=2"}]
230 | 	url := s.Request.URL.String()
231 | 	cookies := make([]string, 0)
232 | 	// ua := s.Request.UserAgent()
233 | 	ua := "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"
234 | 
235 | 	for _, c := range s.Cookies {
236 | 		cookies = append(cookies, c.String())
237 | 	}
238 | 
239 | 	arg := input{
240 | 		Method: s.Request.Method,
241 | 		Headers: inputHeaders{
242 | 			UserAgent: ua,
243 | 			Cookie:    strings.Join(cookies, ";"),
244 | 		},
245 | 	}
246 | 
247 | 	opt, err := json.Marshal(arg)
248 | 	if err != nil {
249 | 		s.Error("PhantomjsRenderer.Do", err)
250 | 		return
251 | 	}
252 | 
253 | 	// s.Logmf("PhantomjsRenderer.Do", "Running: render.js %s '%s'", url, string(opt))
254 | 	s.Logmf("PhantomjsRenderer.Do", "Running: render.js")
255 | 
256 | 	cmd := exec.Command(
257 | 		"phantomjs",
258 | 		"--ssl-protocol=any",
259 | 		"--ignore-ssl-errors=true",
260 | 		os.Getenv("GOPATH")+"/src/github.com/yahoo/gryffin/renderer/resource/render.js",
261 | 		url,
262 | 		string(opt))
263 | 
264 | 	stdout, err := cmd.StdoutPipe()
265 | 	if err != nil {
266 | 		s.Error("PhantomjsRenderer.Do", err)
267 | 		return
268 | 	}
269 | 
270 | 	if err := cmd.Start(); err != nil {
271 | 		s.Error("PhantomjsRenderer.Do", err)
272 | 		return
273 | 	}
274 | 
275 | 	r.process = cmd.Process
276 | 
277 | 	// wait until done or timeout.
278 | 	go r.extract(stdout, s)
279 | 	go r.wait(s)
280 | 
281 | 	// cmd.Wait will close the stdout pipe.
282 | 	go cmd.Wait()
283 | 
284 | }
285 | 


--------------------------------------------------------------------------------
/gryffin_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package gryffin
  6 | 
  7 | import (
  8 | 	"net/http"
  9 | 	"net/http/httptest"
 10 | 	"net/url"
 11 | 	"os"
 12 | 	"reflect"
 13 | 	"strings"
 14 | 	"testing"
 15 | )
 16 | 
// h answers every request with the fixed body "Hello World".
var h = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
	w.Write([]byte("Hello World"))
})

// ts is the test server shared by the tests in this file; it serves h.
var ts = httptest.NewServer(h)
 22 | 
 23 | func TestGenRandomID(t *testing.T) {
 24 | 	t.Parallel()
 25 | 	id := GenRandomID()
 26 | 	if len(id) == 0 {
 27 | 		t.Error("Empty ID from GenRandomID.")
 28 | 	}
 29 | }
 30 | 
 31 | func TestNewScan(t *testing.T) {
 32 | 	t.Parallel()
 33 | 	s := NewScan("GET", ts.URL, "")
 34 | 	if s == nil {
 35 | 		t.Error("Scan is nil.")
 36 | 	}
 37 | 	// TODO - verify s.DomainAllowed.
 38 | }
 39 | 
 40 | func TestNewScanInvalid(t *testing.T) {
 41 | 	t.Parallel()
 42 | 	s := NewScan("GET", "%a", "")
 43 | 	if s != nil {
 44 | 		t.Error("Scan is not nil with invalid URL.", s.Request)
 45 | 	}
 46 | }
 47 | 
 48 | // this test fails due to JSON Marshal of http.Response.Body
 49 | // func TestNewScanFromJson(t *testing.T) {
 50 | // 	t.Parallel()
 51 | 
 52 | // 	// Test arbritary url.
 53 | // 	s := NewScan("GET", ts.URL, "")
 54 | // 	if err := s.Poke(&http.Client{}); err != nil {
 55 | // 		t.Fatalf("error in s.Poke: %v", err)
 56 | // 	}
 57 | // 	j := s.Json()
 58 | // 	if j == nil {
 59 | // 		t.Fatalf("scan.Json: got %v, want a json string - ts.URL=%v", j, ts.URL)
 60 | // 	}
 61 | 
 62 | // 	s2 := NewScanFromJson(j)
 63 | // 	if s2 == nil {
 64 | // 		t.Error("NewScanFromJson should return a scan.")
 65 | // 	}
 66 | // 	t.Log(s2)
 67 | // }
 68 | 
 69 | func TestGetOrigin(t *testing.T) {
 70 | 	t.Parallel()
 71 | 	u, _ := url.Parse("http://127.0.0.1:1234/foo/bar?")
 72 | 	o := getOrigin(u)
 73 | 	if o != "http://127.0.0.1:1234" {
 74 | 		t.Error("getOrigin is not valid", u, o)
 75 | 	}
 76 | }
 77 | 
 78 | func TestScanPoke(t *testing.T) {
 79 | 	t.Parallel()
 80 | 	s := NewScan("GET", ts.URL, "")
 81 | 	err := s.Poke(&http.Client{})
 82 | 	if err != nil {
 83 | 		t.Error(err)
 84 | 	}
 85 | }
 86 | 
 87 | func TestScanPokeInvalidURL(t *testing.T) {
 88 | 	t.Parallel()
 89 | 	client := &http.Client{}
 90 | 	s := NewScan("GET", "/foo", "")
 91 | 	err := s.Poke(client)
 92 | 	if err == nil {
 93 | 		t.Error("Expect an error with invalid scheme.")
 94 | 	}
 95 | 	t.Log("Negative test: Invalid url got ", err)
 96 | }
 97 | 
 98 | func TestScanSpawn(t *testing.T) {
 99 | 	t.Parallel()
100 | 	s := NewScan("GET", ts.URL, "")
101 | 	s.Poke(&http.Client{})
102 | 	s2 := s.Spawn()
103 | 	if s.Request.URL != s2.Request.URL {
104 | 		t.Error("Spawn gives a request with different URL.")
105 | 	}
106 | }
107 | 
108 | func TestScanMergeRequest(t *testing.T) {
109 | 	t.Parallel()
110 | 	s := NewScan("GET", ts.URL, "foo=bar")
111 | 	s.Poke(&http.Client{})
112 | 	s.Request.Header.Set("User-Agent", "foo")
113 | 	s.Cookies = []*http.Cookie{
114 | 		&http.Cookie{Name: "cookie-name-1", Value: "cookie-value-1"},
115 | 	}
116 | 
117 | 	r, _ := http.NewRequest("GET", ts.URL, strings.NewReader("quz=quxx"))
118 | 	s.MergeRequest(r)
119 | 	if s.Request.UserAgent() != "foo" {
120 | 		t.Errorf("Merge request got a different user agent: %s", s.Request.UserAgent())
121 | 	}
122 | }
123 | 
124 | func TestScanMergeRequestRelative(t *testing.T) {
125 | 	t.Parallel()
126 | 	s := NewScan("GET", ts.URL, "")
127 | 	s.Request.Header.Set("User-Agent", "foo")
128 | 	r, _ := http.NewRequest("GET", "/#", nil)
129 | 	s.MergeRequest(r)
130 | 
131 | 	if s.Request.URL.String() != ts.URL+"/" {
132 | 		t.Errorf("Merge request cannot resolve relative url: %s", s.Request.URL)
133 | 	}
134 | }
135 | 
136 | func TestScanReadResponseBody(t *testing.T) {
137 | 	t.Parallel()
138 | 	s := NewScan("GET", ts.URL, "")
139 | 	s.Poke(&http.Client{})
140 | 	s.ReadResponseBody()
141 | 	if s.ResponseBody == "" {
142 | 		t.Error("Empty ResponseBody")
143 | 	}
144 | 	// t.Log(s.ResponseBody)
145 | }
146 | 
147 | func TestScanUpdateFingerprint(t *testing.T) {
148 | 	t.Parallel()
149 | 	s := NewScan("GET", "http://127.0.0.1", "")
150 | 	s.UpdateFingerprint()
151 | 	if !reflect.DeepEqual(
152 | 		s.Fingerprint,
153 | 		Fingerprint{0x7233A9A31DEADAF2, 0x7233A9A31DEADAF2, 0xF8A4322BD612093C, 0, 0}) {
154 | 		t.Error("Fingerprint mismatch", s.Fingerprint)
155 | 	}
156 | }
157 | 
158 | func TestScanResponseFingerprint(t *testing.T) {
159 | 	t.Parallel()
160 | 	s := NewScan("GET", ts.URL, "")
161 | 	s.Poke(&http.Client{})
162 | 	s.UpdateFingerprint()
163 | 	if s.Fingerprint.ResponseSimilarity != 0x62C1D0803B2AB139 {
164 | 		t.Error("Fingerprint mismatch", s.Fingerprint)
165 | 	}
166 | }
167 | 
168 | func TestScanRateLimit(t *testing.T) {
169 | 	t.Parallel()
170 | 	s := NewScan("GET", ts.URL, "")
171 | 	for i := 0; i < 5; i++ {
172 | 		d := s.RateLimit()
173 | 		if d > 0 {
174 | 			t.Errorf("Got delayed for %d", d)
175 | 		}
176 | 	}
177 | 	d := s.RateLimit()
178 | 	if d == 0 {
179 | 		t.Errorf("No delay after 5 request. Got %d", d)
180 | 	}
181 | }
182 | 
183 | func TestScanIsScanAllowed(t *testing.T) {
184 | 	t.Parallel()
185 | 	s := NewScan("GET", "http://foo.com", "")
186 | 
187 | 	r, _ := http.NewRequest("GET", "http://bar.com", nil)
188 | 	s.MergeRequest(r)
189 | 	if s.IsScanAllowed() {
190 | 		t.Error("IsScanAllowed should return false", s)
191 | 	}
192 | 
193 | 	r, _ = http.NewRequest("GET", "http://foo.com/test", nil)
194 | 	s.MergeRequest(r)
195 | 	if !s.IsScanAllowed() {
196 | 		t.Error("IsScanAllowed should return true", s)
197 | 	}
198 | 
199 | 	s2 := NewScan("GET", "/no-domain", "")
200 | 	if !s2.IsScanAllowed() {
201 | 		t.Error("IsScanAllowed should return true", s2.Request.URL)
202 | 	}
203 | }
204 | 
// TestScanCrawlAsync is a placeholder; it asserts nothing yet.
func TestScanCrawlAsync(t *testing.T) {
	// TODO ...
	t.Parallel()
}
209 | 
210 | func TestScanIsDuplicatedPage(t *testing.T) {
211 | 	t.Parallel()
212 | 	s1 := NewScan("GET", ts.URL, "")
213 | 	_ = s1.Poke(&http.Client{})
214 | 	if s1.IsDuplicatedPage() {
215 | 		t.Error("IsDuplicatedPage should return false for the first page", s1)
216 | 	}
217 | 
218 | 	s2 := s1.Spawn()
219 | 	r, _ := http.NewRequest("GET", ts.URL, nil)
220 | 	s2.MergeRequest(r)
221 | 	_ = s2.Poke(&http.Client{})
222 | 	if !s2.IsDuplicatedPage() {
223 | 		t.Errorf("IsDuplicatedPage should return true for the second page with same Job ID.\n1st Page: %064b\n2nd Page: %064b\n",
224 | 			s1.Fingerprint.ResponseSimilarity, s2.Fingerprint.ResponseSimilarity)
225 | 	}
226 | 
227 | 	s3 := Scan(*s1)
228 | 	s3.Job.ID = "ABCDEF123456"
229 | 	if s3.IsDuplicatedPage() {
230 | 		t.Error("IsDuplicatedPage should return false for the a page with new Job ID", s3)
231 | 	}
232 | 
233 | }
234 | 
// TestScanFuzz is a placeholder; it asserts nothing yet.
func TestScanFuzz(t *testing.T) {
	// TODO ...
	t.Parallel()
}
239 | 
240 | func TestScanShouldCrawl(t *testing.T) {
241 | 	t.Parallel()
242 | 	s1 := NewScan("GET", ts.URL, "")
243 | 	if !s1.ShouldCrawl() {
244 | 		t.Error("ShouldCrawl should return true for the first page", s1)
245 | 	}
246 | 
247 | 	s2 := s1.Spawn()
248 | 	r, _ := http.NewRequest("GET", ts.URL, nil)
249 | 	s2.MergeRequest(r)
250 | 
251 | 	if s2.ShouldCrawl() {
252 | 		t.Errorf("ShouldCrawl should return false for the second page with same Job ID.\n1st Page: %064b\n2nd Page: %064b\n",
253 | 			s1.Fingerprint.ResponseSimilarity, s2.Fingerprint.ResponseSimilarity)
254 | 	}
255 | 
256 | 	s3 := Scan(*s1)
257 | 	s3.Job.ID = "ABCDEF123456"
258 | 	if !s3.ShouldCrawl() {
259 | 		t.Error("ShouldCrawl should return true for the a page with new Job ID", s3)
260 | 	}
261 | }
262 | 
// TestScanLog exercises Scan.Log with the log writer pointed at stdout.
// It only verifies that logging a scan does not panic.
func TestScanLog(t *testing.T) {
	t.Parallel()
	SetLogWriter(os.Stdout)
	s := NewScan("GET", ts.URL, "")
	s.Log(s)
}
269 | 


--------------------------------------------------------------------------------
/cmd/gryffin-distributed/main.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"io"
 11 | 	"math/rand"
 12 | 	"net"
 13 | 	"net/http"
 14 | 	"os"
 15 | 	"os/signal"
 16 | 	"sync"
 17 | 	"syscall"
 18 | 	"time"
 19 | 
 20 | 	"github.com/nsqio/go-nsq"
 21 | 
 22 | 	"github.com/yahoo/gryffin"
 23 | 	"github.com/yahoo/gryffin/fuzzer/arachni"
 24 | 	"github.com/yahoo/gryffin/fuzzer/sqlmap"
 25 | 	"github.com/yahoo/gryffin/renderer"
 26 | )
 27 | 
// Process-wide state shared by the service functions below.
var (
	// storage is currently unused - TODO: use or remove
	// storage = flag.String("storage", "memory", "storage method or the storage url")

	// service is the first positional argument; main dispatches on it.
	service string
	// url is the seed URL, the second positional argument of "seed".
	url string
	// wg keeps the long-running services alive; captureCtrlC releases it.
	wg sync.WaitGroup
	// wq is a buffered channel used to cap the number of concurrent jobs.
	wq chan bool

	// t is the scan created in main from url; seed pokes and publishes it.
	t *gryffin.Scan

	logWriter io.Writer
	store     *gryffin.GryffinStore
)
 41 | 
 42 | // var method = flag.String("method", "GET", "the HTTP method for the request.")
 43 | // var url string
 44 | // var body = flag.String("data", "", "the data used in a (POST) request.")
 45 | 
 46 | func usage() {
 47 | 	fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
 48 | 	fmt.Fprintf(os.Stderr, "\tgryffin-distributed --storage=[memory,redis-url] [seed,crawl,fuzz-sqlmap,fuzz-arachni] [url] \n")
 49 | 	fmt.Fprintf(os.Stderr, "Flags:\n")
 50 | 	flag.PrintDefaults()
 51 | }
 52 | 
 53 | func captureCtrlC() {
 54 | 	sigChan := make(chan os.Signal, 1)
 55 | 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 56 | 	wg.Add(1)
 57 | 
 58 | 	go func() {
 59 | 		<-sigChan
 60 | 		fmt.Println("We got Ctrl-C. Stopping.")
 61 | 		wg.Done()
 62 | 	}()
 63 | }
 64 | 
 65 | func newProducer() *nsq.Producer {
 66 | 	producer, err := nsq.NewProducer("127.0.0.1:4150", nsq.NewConfig())
 67 | 	if err != nil {
 68 | 		fmt.Println("Cannot connect to NSQ for producing message", err)
 69 | 		return nil
 70 | 	}
 71 | 	return producer
 72 | }
 73 | 
 74 | func newConsumer(topic, channel string, handler nsq.HandlerFunc) *nsq.Consumer {
 75 | 	var err error
 76 | 	consumer, err := nsq.NewConsumer(topic, channel, nsq.NewConfig())
 77 | 	if err != nil {
 78 | 		fmt.Println("Cannot create consumer", err)
 79 | 		return nil
 80 | 	}
 81 | 
 82 | 	consumer.AddHandler(handler)
 83 | 	err = consumer.ConnectToNSQLookupd("127.0.0.1:4161")
 84 | 	if err != nil {
 85 | 		fmt.Println("Cannot connect to NSQ for consuming message", err)
 86 | 		return nil
 87 | 	}
 88 | 	return consumer
 89 | }
 90 | 
 91 | func seed(url string) {
 92 | 	producer := newProducer()
 93 | 	defer producer.Stop()
 94 | 
 95 | 	err := t.Poke(&http.Client{})
 96 | 	if err != nil {
 97 | 		fmt.Println("Site is not up. Ignoring.", t.Request.URL)
 98 | 		return
 99 | 	}
100 | 
101 | 	err = producer.Publish("seed", t.Json())
102 | 	if err != nil {
103 | 		fmt.Println("Could not publish", "seed", err)
104 | 	}
105 | 	fmt.Printf("Seed %s injected.\n", url)
106 | 
107 | }
108 | 
109 | func shareCache() {
110 | 
111 | 	var producer *nsq.Producer
112 | 	var consumer *nsq.Consumer
113 | 
114 | 	handler := nsq.HandlerFunc(func(m *nsq.Message) error {
115 | 		store.GetRcvChan() <- m.Body
116 | 		return nil
117 | 	})
118 | 
119 | 	producer = newProducer()
120 | 
121 | 	go func() {
122 | 		for {
123 | 			// fmt.Println("SndChan: ", store.GetSndChan(), string(json))
124 | 			err := producer.Publish("share-cache", <-store.GetSndChan())
125 | 			if err != nil {
126 | 				fmt.Println("Could not publish", "share-cache", err)
127 | 			}
128 | 		}
129 | 	}()
130 | 
131 | 	rand.Seed(time.Now().UnixNano())
132 | 
133 | 	consumer = newConsumer("share-cache", fmt.Sprintf("%06d#ephemeral", rand.Int()%999999), handler)
134 | 	_ = consumer
135 | 
136 | 	// defer producer.Stop()
137 | 	// defer consumer.Stop()
138 | 
139 | }
140 | 
141 | func crawl() {
142 | 
143 | 	var producer *nsq.Producer
144 | 	var consumer *nsq.Consumer
145 | 
146 | 	handler := nsq.HandlerFunc(func(m *nsq.Message) error {
147 | 		scan := gryffin.NewScanFromJson(m.Body)
148 | 
149 | 		if delay := scan.RateLimit(); delay != 0 {
150 | 			go func() {
151 | 				time.Sleep(time.Duration(delay) * time.Second)
152 | 				err := producer.Publish("seed", scan.Json())
153 | 				if err != nil {
154 | 					fmt.Println("Could not publish", "fuzz", err)
155 | 				}
156 | 			}()
157 | 		} else {
158 | 			// TODO - phantom JS timeout should be an input argument.
159 | 			r := &renderer.PhantomJSRenderer{Timeout: 60}
160 | 			wq <- true
161 | 			scan.CrawlAsync(r)
162 | 			go func() {
163 | 				if s := <-r.GetRequestBody(); s != nil {
164 | 					// fmt.Println("Got request body", s.Request.URL)
165 | 					err := producer.Publish("fuzz", s.Json())
166 | 					if err != nil {
167 | 						fmt.Println("Could not publish", "fuzz", err)
168 | 					}
169 | 				}
170 | 			}()
171 | 
172 | 			go func() {
173 | 
174 | 				//
175 | 				// Renderer will close all channels when a page is duplicated.
176 | 				// Therefore we don't need to test whether the link is coming
177 | 				// from a duplicated page or not
178 | 				for s := range r.GetLinks() {
179 | 					if ok := s.ShouldCrawl(); ok {
180 | 						err := producer.Publish("seed", s.Json())
181 | 						if err != nil {
182 | 							fmt.Println("Could not publish", "seed", err)
183 | 						}
184 | 					}
185 | 				}
186 | 				<-wq
187 | 			}()
188 | 		}
189 | 
190 | 		return nil
191 | 	})
192 | 
193 | 	producer = newProducer()
194 | 	defer producer.Stop()
195 | 	consumer = newConsumer("seed", "primary", handler)
196 | 	defer consumer.Stop()
197 | 
198 | 	wg.Wait()
199 | 
200 | }
201 | 
202 | func fuzzWithSqlmap() {
203 | 	var consumer *nsq.Consumer
204 | 	handler := nsq.HandlerFunc(func(m *nsq.Message) error {
205 | 		wq <- true
206 | 		scan := gryffin.NewScanFromJson(m.Body)
207 | 		f := &sqlmap.Fuzzer{}
208 | 		f.Fuzz(scan)
209 | 		<-wq
210 | 		return nil
211 | 	})
212 | 	consumer = newConsumer("fuzz", "sqlmap", handler)
213 | 	defer consumer.Stop()
214 | 	wg.Wait()
215 | }
216 | 
217 | func fuzzWithArachni() {
218 | 	var consumer *nsq.Consumer
219 | 	handler := nsq.HandlerFunc(func(m *nsq.Message) error {
220 | 		wq <- true
221 | 		scan := gryffin.NewScanFromJson(m.Body)
222 | 		f := &arachni.Fuzzer{}
223 | 		f.Fuzz(scan)
224 | 		<-wq
225 | 		return nil
226 | 	})
227 | 	consumer = newConsumer("fuzz", "arachni", handler)
228 | 	defer consumer.Stop()
229 | 	wg.Wait()
230 | }
231 | 
232 | func main() {
233 | 
234 | 	flag.Usage = usage
235 | 	flag.Parse()
236 | 
237 | 	switch flag.NArg() {
238 | 	case 1:
239 | 		// gryffin-distributed crawl
240 | 		service = flag.Arg(0)
241 | 	case 2:
242 | 		// gryffin-distributed seed "http://..."
243 | 		service = flag.Arg(0)
244 | 		if service == "seed" {
245 | 			url = flag.Arg(1)
246 | 		} else {
247 | 			usage()
248 | 			return
249 | 		}
250 | 	default:
251 | 		usage()
252 | 		return
253 | 	}
254 | 
255 | 	// TCP port listening messages.
256 | 	tcpout, err := net.Dial("tcp", "localhost:5000")
257 | 	if err != nil {
258 | 		// fmt.Println("Cannot establish tcp connection to log listener.")
259 | 		logWriter = os.Stdout
260 | 	} else {
261 | 		logWriter = io.MultiWriter(os.Stdout, tcpout)
262 | 	}
263 | 
264 | 	gryffin.SetLogWriter(logWriter)
265 | 
266 | 	// we use a buffered channel to block when max concurrency is reach.
267 | 	maxconcurrency := 5
268 | 	wq = make(chan bool, maxconcurrency)
269 | 
270 | 	t = gryffin.NewScan("GET", url, "")
271 | 
272 | 	// seed is unique case that we exit the program immediately
273 | 	if service == "seed" {
274 | 		seed(url)
275 | 		return
276 | 	}
277 | 
278 | 	store = gryffin.NewSharedGryffinStore()
279 | 	gryffin.SetMemoryStore(store)
280 | 
281 | 	captureCtrlC()
282 | 
283 | 	switch service {
284 | 
285 | 	case "crawl":
286 | 		shareCache()
287 | 		crawl()
288 | 
289 | 	case "fuzz-sqlmap":
290 | 		fuzzWithSqlmap()
291 | 	case "fuzz-arachni":
292 | 		fuzzWithArachni()
293 | 
294 | 	default:
295 | 		fmt.Println("Unrecognizated service:", service)
296 | 		usage()
297 | 	}
298 | 
299 | }
300 | 


--------------------------------------------------------------------------------
/gryffin.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2015, Yahoo Inc. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | /*
  6 | Package gryffin is an application scanning infrastructure.
  7 | */
  8 | package gryffin
  9 | 
 10 | import (
 11 | 	"bytes"
 12 | 	"encoding/json"
 13 | 	"fmt"
 14 | 	"hash/fnv"
 15 | 	"io/ioutil"
 16 | 	"net"
 17 | 	"net/http"
 18 | 	"net/http/cookiejar"
 19 | 	"net/url"
 20 | 	"strings"
 21 | 	"time"
 22 | 
 23 | 	distance "github.com/yahoo/gryffin/html-distance"
 24 | )
 25 | 
// A Scan consists of the job, target, request and response.
type Scan struct {
	// ID is a random ID to identify this particular scan.
	// if ID is empty, this scan should not be performed (but record for rate limiting).
	ID string
	// Job is held by pointer, so copies of a Scan share the same Job.
	Job *Job
	// Request is the HTTP request this scan is about.
	Request     *http.Request
	RequestBody string
	// Response is the HTTP response, once one is available.
	Response     *http.Response
	ResponseBody string
	Cookies      []*http.Cookie
	Fingerprint  Fingerprint
	HitCount     int
}
 40 | 
// Job stores the job id and config (if any).
type Job struct {
	// ID identifies the job; NewScan assigns a random one.
	ID             string
	// DomainsAllowed lists the hosts a scan of this job may target.
	// NewScan seeds it with the host (without port) of the initial URL.
	DomainsAllowed []string // Domains that we would crawl
}
 46 | 
// Fingerprint contains all the different types of hash for the Scan (Request & Response).
// A zero value in a field means "not computed yet": UpdateFingerprint only fills
// fields that are still zero.
type Fingerprint struct {
	Origin             uint64 // origin
	URL                uint64 // origin + path
	Request            uint64 // method, url, body
	RequestFull        uint64 // request + header
	// ResponseSimilarity is the similarity hash of the response body,
	// computed with distance.Fingerprint.
	ResponseSimilarity uint64
}
 55 | 
// HTTPDoer interface is to be implemented by http.Client.
// Poke accepts any HTTPDoer, so a stub can stand in for a real client.
type HTTPDoer interface {
	// Do sends the request and returns the response or an error.
	Do(*http.Request) (*http.Response, error)
}
 60 | 
// Fuzzer runs the fuzzing.
type Fuzzer interface {
	// Fuzz fuzzes the given scan. Scan.Fuzz documents the int result as the
	// issue count.
	Fuzz(*Scan) (int, error)
}
 65 | 
// Renderer is an interface for implementation HTML DOM renderer and obtain the response body and links.
// Since DOM construction is very likely to be asynchronous, we return the channels to receive response and links.
type Renderer interface {
	// Do starts rendering the given scan.
	Do(*Scan)
	// GetRequestBody returns a receive-only channel of scans.
	// NOTE(review): the interface comment speaks of the *response* body while the
	// method name says request body — confirm against the implementations.
	GetRequestBody() <-chan *Scan
	// GetLinks returns a receive-only channel of scans, presumably one per
	// discovered link — verify against the implementations.
	GetLinks() <-chan *Scan
}
 73 | 
// LogMessage contains the data fields to be marshalled as JSON for forwarding to the log processor.
// Scan.Logm builds one per log line from the scan's request and job.
type LogMessage struct {
	Service string // name of the component/step emitting the message
	Msg     string // the message text
	Method  string // HTTP method of the scan's request
	Url     string // URL of the scan's request
	JobID   string // ID of the scan's job
	// Fingerprint Fingerprint
}
 83 | 
 84 | // NewScan creates a scan.
 85 | func NewScan(method, url, post string) *Scan {
 86 | 	// ensure we got a memory store..
 87 | 	memoryStoreMu.Lock()
 88 | 	if memoryStore == nil {
 89 | 		memoryStore = NewGryffinStore()
 90 | 	}
 91 | 	memoryStoreMu.Unlock()
 92 | 
 93 | 	id := GenRandomID()
 94 | 
 95 | 	job := &Job{ID: GenRandomID()}
 96 | 
 97 | 	req, err := http.NewRequest(method, url, ioutil.NopCloser(strings.NewReader(post)))
 98 | 	if err != nil {
 99 | 		// s.Log("Invalid url for NewScan: %s", err)
100 | 		return nil
101 | 	}
102 | 
103 | 	// put the host component of the url as the domains to be allowed
104 | 	host, _, err := net.SplitHostPort(req.URL.Host)
105 | 	if err != nil {
106 | 		job.DomainsAllowed = []string{req.URL.Host}
107 | 	} else {
108 | 		job.DomainsAllowed = []string{host}
109 | 	}
110 | 
111 | 	// Add chrome user agent
112 | 	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36")
113 | 
114 | 	return &Scan{
115 | 		ID:          id,
116 | 		Job:         job,
117 | 		Request:     req,
118 | 		RequestBody: post,
119 | 	}
120 | }
121 | 
// getOrigin returns the origin of u, i.e. its scheme and host (the host
// includes the port when the URL has one) joined by "://".
func getOrigin(u *url.URL) string {
	parts := []string{u.Scheme, u.Host}
	return strings.Join(parts, "://")
}
126 | 
127 | // MergeRequest merge the request field in scan with the existing one.
128 | func (s *Scan) MergeRequest(req *http.Request) {
129 | 
130 | 	// set cookie from response (if it is not done..)
131 | 	if s.Response != nil {
132 | 		s.Cookies = append(s.Cookies, s.Response.Cookies()...)
133 | 		// s.CookieJar.SetCookies(s.Request.URL, s.Response.Cookies())
134 | 	}
135 | 
136 | 	// read the request body, and then reset the reader
137 | 	var post []byte
138 | 	if req.Body != nil {
139 | 		if post, err := ioutil.ReadAll(req.Body); err == nil {
140 | 			req.Body = ioutil.NopCloser(bytes.NewReader(post))
141 | 		} else {
142 | 			// only possible error is bytes.ErrTooLarge from ioutil package.
143 | 			s.Error("MergeRequest", err)
144 | 		}
145 | 	}
146 | 
147 | 	// resolve relative url.
148 | 	if !req.URL.IsAbs() {
149 | 		req.URL = s.Request.URL.ResolveReference(req.URL)
150 | 	}
151 | 
152 | 	// TODO - drop if Method, URL, Body are same..
153 | 	// if req == s.Request {
154 | 	// s.Logf("Result after merge generate same request.", nil)
155 | 	// }
156 | 
157 | 	// swap
158 | 	prevReq := s.Request
159 | 	s.Request = req
160 | 	s.RequestBody = string(post)
161 | 
162 | 	// TODO - handle relative URL .
163 | 
164 | 	// Create a cookie jar, add cookie list (so cookie jar reject invalid cookie.)
165 | 	jar, _ := cookiejar.New(nil)
166 | 	jar.SetCookies(req.URL, s.Cookies)
167 | 
168 | 	// reset cookies
169 | 	s.Cookies = make([]*http.Cookie, 0)
170 | 	for _, c := range jar.Cookies(req.URL) {
171 | 		req.AddCookie(c)
172 | 		s.Cookies = append(s.Cookies, c)
173 | 	}
174 | 
175 | 	// Add user agent
176 | 	req.Header.Set("User-Agent", prevReq.UserAgent())
177 | 
178 | 	// Add referrer - TODO, perhaps we don't need this!
179 | 
180 | 	// remove Response.
181 | 	s.Response = nil
182 | 	s.ResponseBody = ""
183 | 
184 | }
185 | 
186 | // Spawn spawns a new scan object with a different ID.
187 | func (s *Scan) Spawn() *Scan {
188 | 	id := GenRandomID()
189 | 	job := *s.Job
190 | 	req := *s.Request // copy the value.
191 | 
192 | 	post := s.RequestBody
193 | 	s.Request.Body = ioutil.NopCloser(strings.NewReader(post))
194 | 
195 | 	// get the cookiejar, save the new cookies
196 | 	// jar := s.CookieJar
197 | 	cookies := s.Cookies[:]
198 | 	if s.Response != nil {
199 | 		cookies = append(cookies, s.Response.Cookies()...)
200 | 		// jar.SetCookies(s.Request.URL, s.Response.Cookies())
201 | 	}
202 | 
203 | 	return &Scan{
204 | 		ID:          id,
205 | 		Job:         &job,
206 | 		Request:     &req,
207 | 		RequestBody: post,
208 | 		Cookies:     cookies,
209 | 	}
210 | }
211 | 
212 | // Poke checks if the target is up.
213 | func (s *Scan) Poke(client HTTPDoer) (err error) {
214 | 
215 | 	s.Logm("Poke", "Poking")
216 | 
217 | 	// Add 5s timeout if it is http.Client
218 | 	switch client := client.(type) {
219 | 	case *http.Client:
220 | 		client.Timeout = time.Duration(3) * time.Second
221 | 	}
222 | 
223 | 	// delete the similarity case for the domain.
224 | 	// s.Session.DelPrefix("hash/unique/" + s.Request.URL.Host)
225 | 
226 | 	// http.Request is embeded in a Request embeded in a Scan.
227 | 	s.Response, err = client.Do(s.Request)
228 | 	if err != nil {
229 | 		s.Logm("Poke", "Failed")
230 | 		return
231 | 	}
232 | 
233 | 	s.ReadResponseBody()
234 | 
235 | 	s.HitCount++
236 | 	return
237 | }
238 | 
239 | // ReadResponseBody read Response.Body and fill it to ReadResponseBody.
240 | // It will also reconstruct the io.ReaderCloser stream.
241 | func (s *Scan) ReadResponseBody() {
242 | 	if s.ResponseBody == "" && s.Response != nil {
243 | 		if b, err := ioutil.ReadAll(s.Response.Body); err == nil {
244 | 			s.ResponseBody = string(b)
245 | 			s.Response.Body = ioutil.NopCloser(bytes.NewReader(b))
246 | 		}
247 | 	}
248 | }
249 | 
// hash returns the 64-bit FNV-1 hash of s.
func hash(s string) uint64 {
	digest := fnv.New64()
	// Write on a hash.Hash never returns an error.
	_, _ = digest.Write([]byte(s))
	sum := digest.Sum64()
	return sum
}
255 | 
256 | // UpdateFingerprint updates the fingerprint field.
257 | func (s *Scan) UpdateFingerprint() {
258 | 	f := &s.Fingerprint
259 | 	if s.Request != nil {
260 | 		if f.Origin == 0 {
261 | 			f.Origin = hash(getOrigin(s.Request.URL))
262 | 		}
263 | 		if f.URL == 0 {
264 | 			f.URL = hash(s.Request.URL.String())
265 | 		}
266 | 		if f.Request == 0 {
267 | 			f.Request = hash(s.Request.URL.String() + "\n" + s.RequestBody)
268 | 		}
269 | 		// if f.RequestFull == 0 {
270 | 		// TODO
271 | 		// }
272 | 	}
273 | 
274 | 	if f.ResponseSimilarity == 0 {
275 | 		if r := strings.NewReader(s.ResponseBody); s.ResponseBody != "" && r != nil {
276 | 			f.ResponseSimilarity = distance.Fingerprint(r, 3)
277 | 			s.Logm("Fingerprint", "Computed")
278 | 		}
279 | 	}
280 | 
281 | }
282 | 
283 | // RateLimit checks whether we are under the allowed rate for crawling the site.
284 | // It returns a delay time to wait to check for ReadyToCrawl again.
285 | func (s *Scan) RateLimit() int {
286 | 	if memoryStore.Hit(s.Request.URL.Host) {
287 | 		return 0
288 | 	}
289 | 	return 5
290 | 
291 | 	// store := s.Session
292 | 	// // for each 5 second epoch, we create a key and see how many crawls are done.
293 | 	// ts := time.Now().Truncate(5 * time.Second).Unix()
294 | 	// k := "rate/" + s.Request.URL.Host + "/" + strconv.FormatInt(ts, 10)
295 | 	// if v, ok := store.Get(k); ok {
296 | 	// 	if v.(int64) >= 5 {
297 | 	// 		// s.Logm("RateLimit", "Delay 5 second")
298 | 	// 		// s.Logf("Wait for 5 second for %s (v:%d)", s.Request.URL, v)
299 | 	// 		return 5
300 | 	// 	}
301 | 	// 	// ready to crawl.
302 | 	// 	// TODO - this is not atomic.
303 | 	// 	c, _ := store.Get(k)
304 | 	// 	store.Set(k, c.(int64)+1)
305 | 	// 	// s.Logm("RateLimit", "No Delay")
306 | 	// 	return 0
307 | 	// }
308 | 
309 | 	// store.Set(k, 1)
310 | 	// // s.Logm("RateLimit", "No Delay")
311 | 	// return 0
312 | }
313 | 
314 | // IsScanAllowed check if the request URL is allowed per Job.DomainsAllowed.
315 | func (s *Scan) IsScanAllowed() bool {
316 | 	// relative URL
317 | 	if !s.Request.URL.IsAbs() {
318 | 		return true
319 | 	}
320 | 
321 | 	host, _, err := net.SplitHostPort(s.Request.URL.Host)
322 | 	if err != nil {
323 | 		host = s.Request.URL.Host
324 | 	}
325 | 
326 | 	for _, allowed := range s.Job.DomainsAllowed {
327 | 		if host == allowed {
328 | 			return true
329 | 		}
330 | 	}
331 | 	return false
332 | }
333 | 
334 | // CrawlAsync run the crawling asynchronously.
335 | func (s *Scan) CrawlAsync(r Renderer) {
336 | 	s.Logm("CrawlAsync", "Started")
337 | 	if s.IsScanAllowed() {
338 | 		r.Do(s)
339 | 	} else {
340 | 		s.Logm("CrawlAsync", "Scan Not Allowed")
341 | 	}
342 | }
343 | 
344 | // IsDuplicatedPage checks if we should proceed based on the Response
345 | func (s *Scan) IsDuplicatedPage() bool {
346 | 	s.UpdateFingerprint()
347 | 	f := s.Fingerprint.ResponseSimilarity
348 | 	if !memoryStore.Seen(s.Job.ID, "oracle", f, 2) {
349 | 		memoryStore.See(s.Job.ID, "oracle", f)
350 | 		s.Logm("IsDuplicatedPage", "Unique Page")
351 | 		return false
352 | 	}
353 | 	s.Logm("IsDuplicatedPage", "Duplicate Page")
354 | 	return true
355 | }
356 | 
357 | // Fuzz runs the vulnerability fuzzer, return the issue count.
358 | func (s *Scan) Fuzz(fuzzer Fuzzer) (int, error) {
359 | 	c, err := fuzzer.Fuzz(s)
360 | 	return c, err
361 | }
362 | 
363 | // // ExtractLinks extracts the list of links found from the responseText in the Scan.
364 | // func (s *Scan) ExtractLinks() (scans []Scan, err error) {
365 | 
366 | // 	return
367 | // }
368 | 
369 | // ShouldCrawl checks if the links should be queued for next crawl.
370 | func (s *Scan) ShouldCrawl() bool {
371 | 	s.UpdateFingerprint()
372 | 	f := s.Fingerprint.URL
373 | 	if !memoryStore.Seen(s.Job.ID, "hash", f, 0) {
374 | 		memoryStore.See(s.Job.ID, "hash", f)
375 | 		s.Logm("ShouldCrawl", "Unique Link")
376 | 		return true
377 | 	}
378 | 	s.Logm("ShouldCrawl", "Duplicate Link")
379 | 	return false
380 | }
381 | 
382 | // TODO - LogFmt (fmt string)
383 | // TODO - LogI (interface)
384 | // Error logs the error for the given service.
385 | func (s *Scan) Error(service string, err error) {
386 | 	errmsg := fmt.Sprint(err)
387 | 	s.Logm(service, errmsg)
388 | }
389 | 
390 | // Logmf logs the message for the given service.
391 | func (s *Scan) Logmf(service, format string, a ...interface{}) {
392 | 	s.Logm(service, fmt.Sprintf(format, a...))
393 | }
394 | 
395 | // Logm sends a LogMessage to Log processor.
396 | func (s *Scan) Logm(service, msg string) {
397 | 	// TODO - improve the efficiency of this.
398 | 	m := &LogMessage{
399 | 		Service: service,
400 | 		Msg:     msg,
401 | 		// Fingerprint: s.Fingerprint,
402 | 		Method: s.Request.Method,
403 | 		Url:    s.Request.URL.String(),
404 | 		JobID:  s.Job.ID,
405 | 	}
406 | 	s.Log(m)
407 | }
408 | 
409 | // Logf logs using the given format string.
410 | func (s *Scan) Logf(format string, a ...interface{}) {
411 | 	str := fmt.Sprintf(format, a...)
412 | 	s.Log(str)
413 | }
414 | 
415 | // Log encodes the given argument as JSON and writes it to
416 | // the log writer.
417 | func (s *Scan) Log(v interface{}) {
418 | 	if logWriter == nil {
419 | 		return
420 | 	}
421 | 	logWriterMu.Lock()
422 | 	encoder := json.NewEncoder(logWriter)
423 | 	encoder.Encode(v)
424 | 	logWriterMu.Unlock()
425 | }
426 | 


--------------------------------------------------------------------------------
/renderer/resource/render.js:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env phantomjs --ssl-protocol=any --ignore-ssl-errors=true
  2 | 
  3 | /**
  4 |   * Copyright 2015, Yahoo Inc. All rights reserved.
  5 |   * Use of this source code is governed by a BSD-style
  6 |   * license that can be found in the LICENSE file.
  7 |   *
  8 |   
  9 | */
 10 | 
 11 | (function() {
 12 | 
// The page to render, PhantomJS' system module (for the command-line
// arguments), and the handle of the overall page timeout timer.
var page = require('webpage').create(),
    system = require('system'),
    pageTimeoutTimer;

// No URL argument was supplied: print the usage and exit with status 1.
if (system.args.length === 1) {
    console.log('Usage: render.js http(s)://<host>[:port][/path] [{"method":"post", "data":"a=1&b=2"}]');
    return phantom.exit(1);
}

// Local helper modules (headers and events are bound to this page), the start
// time used to report the elapsed time, the target url (first argument), the
// options (parsed later from the second argument) and the result object that
// is eventually printed as JSON.
var utils = require('./utils.js'),
    headers = require('./headers.js').init(phantom, page),
    eventHandler = require('./events.js'),
    events = eventHandler.init(phantom, page),
    t = Date.now(), url = system.args[1], opt, output = {};
 27 | 
 28 | function quit() {
 29 |     try {events.invokeListeners('onExit')}catch(e){};
 30 |     utils.printJSON('exit', 0);
 31 |     phantom.exit();
 32 | }
 33 | 
 34 | // ensure that when our code fails, we could die gracefully
 35 | phantom.onError = function(message, trace){
 36 |     // prepare the JSON to directly die, without going thru events.notifyError()
 37 |     utils.printJSON('error', {errorCode:2001, errorString: message + ' \r\n' + JSON.stringify(trace)});
 38 |     quit();
 39 | }
 40 | 
 41 | // log the js error generated by the page
 42 | page.onError = function(msg, trace) {
 43 |     utils.pageChanges.push('jsError', msg);
 44 | }
 45 | 
 46 | // the centralized (except phantom.onError) error handler
 47 | events.addListener('MainFrameError', function(response) {
 48 |     // if (!response.errorCode) return;
 49 | 
 50 |     output.elasped = Date.now() - t;
 51 |     output.errorCode = response.errorCode;
 52 |     output.errorString = response.errorString;
 53 | 
 54 |     // http error with a proper status code is considered ok for scrapy
 55 |     var jsonType = 'error';
 56 |     if (response.status && response.status > 0) {
 57 |         output.ok = 1;
 58 |         jsonType = 'domSteady';
 59 |     }
 60 | 
 61 |     // during error, make sure phantom can die no matter what
 62 |     try {
 63 |         output.response = utils.prepareResponse(response, headers.getRespHeaders);
 64 |         output.response.body = utils.cleanResponseBody(page.content);
 65 |         output.response.details = utils.pageChanges.fetchAll();
 66 |     } catch (e) {}
 67 | 
 68 |     utils.printJSON(jsonType, output);
 69 |     quit();
 70 | });
 71 | 
 72 | 
 73 | 
 74 | 
// validate the url; on failure report error 1000 and stop executing the
// rest of this script (the return leaves the enclosing function)
if (utils.invalidUrl(url)) 
    return events.notifyError(1000, 'Invalid Url');

// process the extra argument: options 
// (a JSON object in the second command-line argument; defaults to {} when
// absent, error 1001 when it is not valid JSON)
try {
    opt = JSON.parse(system.args[2] || '{}');
} catch(e) {
    return events.notifyError(1001, 'Invalid options');
}
 85 | 
 86 | 
 87 | // impose a strict timeout in case this phantomjs does not die properly (180s is the default by scrapy)
 88 | opt.timeout = opt.timeout || 180;
 89 | function setPageTimeout(timeout) {
 90 |     window.clearTimeout(pageTimeoutTimer);
 91 |     pageTimeoutTimer = window.setTimeout(function(){
 92 |         utils.printJSON('error', {
 93 |             errorCode: 4, 
 94 |             errorString: 'Timeout Error (exceeded ' + opt.timeout + 's)', 
 95 |             response: {url: url}
 96 |         });
 97 |         quit();
 98 |     }, timeout || (opt.timeout * 1000));
 99 | }
100 | setPageTimeout();
101 | 
// ---- option defaults ----
opt.debug = opt.debug || false;
opt.method = opt.method || 'get';
opt.data = opt.data || null;
opt.startHostname = utils.getHostname(url);

// whitelist the domain from url when allowed_domains are not provided
opt.allowed_domains = opt.allowed_domains || [opt.startHostname];

// by default no follow pre-redirections (post-redirections are not followed anyway)
opt.followPreRedirections = opt.followPreRedirections || false;

// if enabled, do not quit when utils.whitelistedRedirectionDomains(redirectUrl).
// Enabled by default; `x || true` would always be true and make it impossible
// to switch the relaxation off, so only an absent option falls back to true.
opt.relaxFirstRedirection = (opt.relaxFirstRedirection === undefined)
        ? true
        : !!opt.relaxFirstRedirection;

// resource timeout defaults to 30s (opt.resourceTimeout is given in seconds)
page.settings.resourceTimeout = (opt.resourceTimeout || 30) * 1000;

// make loadImages default to false, and normalize it to a boolean
opt.loadImages = !!opt.loadImages;
page.settings.loadImages = opt.loadImages;

// to handle any headers-related manipulation and configuration
page.customHeaders = headers.setReqHeaders(opt.headers || {}, opt.startHostname);
124 | 
125 | // if (opt.debug) {
126 |     // console.log('Cookies: ' + JSON.stringify(phantom.cookies));
127 | 
128 | //     events.addListener('LoadFinished', function(status) {
129 | //         console.log('debug: onLoadFinished');
130 | //         var timeCounter = 1;
131 | //         window.setInterval(function(){console.log('debug: onLoadFinished + '+ (timeCounter++) +'00ms: linkCount=' + page.evaluate(function(){return document.getElementsByTagName('a').length}) )}, 100);
132 | //     });
133 | 
134 | //     events.addListener('MainFrameSteady', function(response) {
135 | //         console.log('debug: MainFrameSteady - linkCount=' + page.evaluate(function(){return document.getElementsByTagName('a').length}) + '\n\n');
136 | //     });
137 | // }
138 | 
139 | 
140 | 
// stop the first url from navigating to disallowed_domains or disallowed extension (css, zip, etc).
// Only the error is raised here; the script keeps running afterwards.
if (utils.invalidUrl(url, opt.allowed_domains)) {
    events.notifyError(1002, 'Load Failed Error (from disallowed domains)');
} else if (utils.blacklistedUrl(url)) {
    events.notifyError(1003, 'Filetype unsupported/unrendered as derived from file extension');
}


// record every mainFrame navigation as a 'mainFrame' page change
events.addListener('MainFrameRedirection', function(navigation, networkRequest) {
    utils.pageChanges.push('mainFrame', navigation);
});
152 | 
153 | 
// Decides whether a redirection of the main frame, requested before the page
// reached its first destination, may proceed.
events.addListener('MainFramePreRedirection', function(requestData, networkRequest){
    // abort any request that attempts to redirect the mainframe away if nofollows is configured
    if (!opt.followPreRedirections) {
        // mainFrameSteady will still be invoked during onLoadFinished
        networkRequest.abort();
        return;
    }

    var redirectUrl = requestData.url;
    // prevent navigations to disallowed domains
    if (utils.invalidUrl(redirectUrl, opt.allowed_domains)) {

        // exception: do not abort the first redirection to some whitelisted domains
        // (the flag on output makes sure the exception is granted only once)
        if (opt.relaxFirstRedirection 
                && !output.firstRedirectionRelaxed 
                && utils.whitelistedRedirectionDomains(redirectUrl)) {
            output.firstRedirectionRelaxed = true;
            return;
        }

        networkRequest.abort();
        events.notifyError(1002, 'Load Failed Error (from disallowed domains)');
        // NOTE(review): execution falls through to the extension check below, so
        // a disallowed url with a blacklisted extension is aborted and reported
        // twice — confirm that events.notifyError tolerates this.
    }

    // prevent navigations to some blacklisted extensions (e.g, css, binaries)
    if (utils.blacklistedUrl(redirectUrl)) {
        networkRequest.abort();
        events.notifyError(1003, 'Filetype unsupported/unrendered as derived from file extension');
    }
});
184 | 
// disable any navigations after reaching its first destination (i.e. no more redirects)
events.addListener('MainFramePostRedirection', function(request, networkRequest) {
    // freeze any further page load by aborting the request;
    // page.navigationLocked = true would not let us capture the request
    networkRequest.abort();
});

// record all childFrames navigations, aborting the disallowed ones
events.addListener('ChildFrameNavigate', function(request, networkRequest, type) {
    var disallowed = utils.invalidUrl(request.url, opt.allowed_domains)
            || utils.blacklistedUrl(request.url);

    if (disallowed) {
        networkRequest.abort();
    }
    utils.pageChanges.push('childFrames', request);
});

// a successful (2xx) main frame response must carry a textual content type
events.addListener('MainFrameResourceReceived', function(response) {
    var succeeded = response.status && response.status >= 200 && response.status < 300;
    if (!succeeded) {
        return;
    }

    // phantomjs does not fetch binaries anyway
    if (!/(?:^text\/|xml|javascript|json)/i.test(response.contentType)) {
        events.notifyError(1003, 'Filetype unsupported/unrendered (' + response.contentType + ')');
    }
});

// keep the prepared main frame response once its navigations have ended
events.addListener('MainFrameNavigationsEnded', function(response) {
    output.response = utils.prepareResponse(response, headers.getRespHeaders);
});

// skip downloading unnecessary subresources according to a known file extension list
events.addListener('SubResourceRequested', function(request, networkRequest) {
    // blacklisted extensions (e.g, css, binaries) are not fetched
    if (utils.blacklistedUrl(request.url)) {
        networkRequest.abort();
    }

    // utils.pageChanges.push('subResources', request);
});

// in onInitialized, ajax calls are hooked
events.addListener('Initialized', function() {
    // page.injectJs('./incl/jquery-2.1.1.min.js');

    // inject the scripts that catch links
    page.injectJs('./extractors.js');
});
228 | 
229 | 
// Collects the details reported with the response: the page changes recorded
// so far (childFrames, subResources, redirects) plus the links, forms and
// jsLinkFeedback extracted inside the page by window._gryffin_onMainFrameReady
// (when that hook has been injected).
function extractDetails() {
    var details = utils.pageChanges.fetchAll(),
        found = page.evaluate(function() {
            // link, form, and jsLink extractions
            return window._gryffin_onMainFrameReady && window._gryffin_onMainFrameReady();
        }) || {};

    details.links = found.links || [];
    details.forms = found.forms || [];
    details.jsLinkFeedback = found.jsLinkFeedback;

    return details;
}
253 | 
// The main frame is steady: emit the collected response as a 'domSteady'
// record, and quit unless the in-page extraction reported jsLinkFeedback.
events.addListener('MainFrameSteady', function(response) {
    // extend timeout to allow sufficient time for event enumerations
    setPageTimeout();

    // here we terminate this process with the response we collected
    // NOTE(review): output.response is assigned by the MainFrameNavigationsEnded
    // and MainFrameError listeners; this presumes one of them ran first — confirm.
    output.elasped = Date.now() - t;
    output.response.body = utils.cleanResponseBody(response.body);

    // htmlOnly mode: print the bare body instead of JSON and exit right away
    // (without going through quit())
    if (opt.htmlOnly) {
        console.log(output.response.body);
        phantom.exit();
        return;
    }

    output.response.details = extractDetails();

    // ensure only one JSON is outputed
    if (!output.ok) {
        output.ok = 1;
        // console.log(JSON.stringify(output, function(k, v){
        //     return (typeof v === "string")
        //             ? v.replace(/[\u007f-\uffff]/g, function(c) {
        //                     return '\\u'+('0000'+c.charCodeAt(0).toString(16)).slice(-4);
        //                 });
        //             : v;
        // }));
        utils.printJSON('domSteady', output);
    }
    
    // can exit due to lack of jsLinks execution
    // (otherwise the process stays alive; the Callback listener quits on the 'done' action)
    if (output.response.details && !output.response.details.jsLinkFeedback)
        quit();
});
287 | 
// disable any navigations from new windows, instead, capture the request object:
// every request of a newly created page is aborted and recorded as a
// 'childFrames' page change.
events.addListener('PageCreated', function(childPage) {
    var childEvents = eventHandler.init(phantom, childPage);

    childEvents.addListener('ResourceRequested', function(request, networkRequest) {
        networkRequest.abort();
        utils.pageChanges.push('childFrames', request);
    });
});
296 | 
// get informed about new link discovery by incl/extractors.js
// The in-page script reports its progress through data.action:
//   waitTimer          - forwarded as 'onSteady-waitTimer' with data.timeout
//   element.triggering - an element is about to be triggered: wait for the
//                        network to become steady, then report the changes
//   element.triggered  - the trigger has been performed
//   done               - all elements have been processed: quit
events.addListener('Callback', function(data){
    if (data.action === 'waitTimer') {
        events.invokeListeners('onSteady-waitTimer', data.timeout);

    } else if (data.action === 'element.triggering') {
        // wait for network steady once an element is being triggered 
        events.addListener('onSteady', function() {
            // all three are local: a stray ';' in the declaration list would
            // turn the last one into an implicit global
            var eventData = page.evaluate(function(){return jsLinks.getData()}),
                // associate other page changes to the recent element triggered
                changes = utils.pageChanges.fetchAll(),
                changesKeys = Object.keys(changes);

            // append any pageChanges to the eventData
            changesKeys.forEach(function(k){
                eventData[k] = changes[k];
            });

            // if there exists any dom changes
            if (changesKeys.length > 0 || eventData.links || eventData.forms)
                events.invokeListeners('onDomChanged', eventData);

            // by design, onSteady is called only once even without "return false"
            return false;
        });
        events.invokeListeners('onSteady-wait', 'element-trigger');
    } else if (data.action === 'element.triggered') {
        events.invokeListeners('onSteady-ready', 'element-trigger');
    } else if (data.action === 'done')
        quit();
});
328 | 
// print the triggered element if new results are available
events.addListener('DomChanged', function(eventData) {
    utils.printJSON('domChanged', eventData);
});

// page.onConsoleMessage = function(msg) {
//     console.log('CONSOLE: ' + msg);
// };

// always accept confirm() dialogs raised by the page
page.onConfirm = function(msg) {
    return true;
};

// finally, load the url with the configured method and request body
var openOptions = {
    operation: opt.method,
    data: opt.data    // String expected
};
page.openUrl(url, openOptions, page.settings);
343 | 
344 | })();
345 | 


--------------------------------------------------------------------------------
/renderer/resource/events.js:
--------------------------------------------------------------------------------
  1 | /** 
  2 |   * Copyright 2015, Yahoo Inc. All rights reserved.
  3 |   * Use of this source code is governed by a BSD-style
  4 |   * license that can be found in the LICENSE file.
  5 |   *
  6 |   * @author Adon adon@yahoo-inc.com
  7 |   * @desc this module exposes more usable events, and a better event handling logic
  8 |   *
  9 |   *  Event Flow:
 10 |   * ===================
 11 |   * onNavigationRequested
 12 |   * onResourceRequested
 13 |   * onNavigationRequested + onResourceRequested -> onNavigate, onMainFrameNavigate, onMainFramePreRedirection, onChildFrameNavigate
 14 |   * onLoadStarted
 15 |   *
 16 |   * onResourceReceived 
 17 |   * onResourceReceived + mainFrame -> onMainFrameResourceReceived
 18 |   * [onResourceTimeout/onResourceError] + mainFrame -> onMainFrameError, onMainFrameResourceError
 19 |   *
 20 |   * onInitialized
 21 |   * 
 22 |   * onMainFrameNavigationsEnded
 23 |   * 
 24 |   * onSubResourceRequested
 25 |   * [onMainFramePostRedirection]
 26 |   *
 27 |   * onLoadFinished
 28 |   * onLoadFinished + status=='success' -> onMainFrameLoadSuccess
 29 |   * onLoadFinished + status=='fail' -> onMainFrameLoadFailed, onMainFrameError
 30 |   *
 31 |   * [onMainFrameLoadSuccess] + steadyLogic() -> onMainFrameSteady
 32 | 
 33 | ResourceError Codes
 34 | # errorMessage[1] = "Connection Refused Error";
 35 | # errorMessage[2] = "RemoteHost Closed Error";
 36 | # errorMessage[3] = "Host Not Found Error";
 37 | # errorMessage[4] = "Timeout Error";
 38 | # errorMessage[5] = "Operation Canceled Error";
 39 | # errorMessage[6] = "Ssl Handshake Failed Error";
 40 | # errorMessage[7] = "Temporary Network Failure Error";
 41 | # errorMessage[8] = "Network Session Failed Error";
 42 | # errorMessage[9] = "Background Request Not Allowed Error";
 43 | # errorMessage[99] = "Unknown Network Error";
 44 | # errorMessage[101] = "ProxyConnectionRefusedError";
 45 | # errorMessage[102] = "ProxyConnectionClosedError";
 46 | # errorMessage[103] = "ProxyNotFoundError";
 47 | # errorMessage[104] = "ProxyTimeoutError";
 48 | # errorMessage[105] = "ProxyAuthenticationRequiredError";
 49 | # errorMessage[199] = "UnknownProxyError";
 50 | # errorMessage[201] = "ContentAccessDenied";
 51 | # errorMessage[202] = "ContentOperationNotPermittedError";
 52 | # errorMessage[203] = "ContentNotFoundError";
 53 | # errorMessage[204] = "AuthenticationRequiredError";
 54 | # errorMessage[205] = "ContentReSendError";
 55 | # errorMessage[299] = "UnknownContentError";
 56 | # errorMessage[301] = "ProtocolUnknownError";  // after networkRequest.abort()
 57 | # errorMessage[302] = "ProtocolInvalidOperationError";
 58 | # errorMessage[399] = "ProtocolFailure";
 59 | 
 60 | */
 61 | 
 62 | exports.init = function(phantom, page) {
 63 | 
 64 |     var callbackList = {},
 65 |         resourceDetails = {},
 66 |         navigationalRequests = {},
 67 |         mainFrameStatus = {},
 68 |         mainFrameNetwork = {},
 69 |         timerCounter = 0;
 70 | 
 71 |     // patch response.redirectURL to take the URL (can be relative) in Location header 
 72 |     function patchRedirectURL(response) {
 73 |         // we honor the location header only if response.status = 3xx
 74 |         !response.redirectURL && response.status 
 75 |         && response.status >= 300 && response.status < 400 
 76 |         && response.headers && response.headers.some(function(h){
 77 |             if (h.name.toLowerCase() == 'location') {
 78 |                 response.redirectURL = h.value;
 79 |                 return true;
 80 |             }
 81 |         });
 82 |     }
 83 | 
 84 |     // a shortcut to invoke the customized listeners
 85 |     function invokeListeners(eventName) {
 86 |         // copy arguments to a new array, and removes the first element
 87 |         var i = 0, key, args = [], handler;
 88 |         for (key in arguments)
 89 |             args[i++] = arguments[key];
 90 |         args.shift();
 91 | 
 92 |         handler = page[eventName] || queuedEventCallbacks(eventName);
 93 |         return handler && handler.apply(page, args);
 94 |     }
 95 | 
 96 | 
 97 |     // Build the dispatcher for eventName: it runs every queued callback, drops those returning false, and clears page[eventName] once none is left
 98 |     function queuedEventCallbacks(eventName) {
 99 |         return function() {
100 |             // once onMainFrameError has been dispatched (flag set at the bottom), no more event handlers are executed
101 |             if (mainFrameStatus.error)
102 |                 return;
103 |             
104 |             // (eventName == 'onCallback') ? console.log(JSON.stringify(arguments[0])) : console.log('debug: ' + eventName + ' ' + (/^onSteady-/.test(eventName) ? arguments[0] + ' ' + JSON.stringify(mainFrameNetwork.outstanding) : arguments[0]&&arguments[0].url));
105 |             // mainFrameStatus.externalError && console.log('extern:' + JSON.stringify(mainFrameStatus.externalError));
106 | 
107 |             // if an externalError was ever raised, the events listed below are not dispatched; an onMainFrameError carrying the external error is raised instead
108 |             if (mainFrameStatus.externalError
109 |                     && ['onMainFrameResourceReceived', 'onLoadStarted', 'onInitialized', 
110 |                         'onLoadFinished', 'onMainFrameLoadSuccess', 'onMainFrameSteady'].indexOf(eventName) !== -1) {
111 |                 var response = mainFrameStatus.response || {};
112 |                 response.url = response.url || mainFrameStatus.request.url;
113 |                 response.errorCode = mainFrameStatus.externalError.errorCode;
114 |                 response.errorString = mainFrameStatus.externalError.errorString;
115 |                 invokeListeners('onMainFrameError', response);
116 |                 return;
117 |             }
118 | 
119 |             var eventCallbackList = callbackList[eventName];
120 |             if (eventCallbackList) {
121 |                 for (var i = 0, _callback; _callback = eventCallbackList[i]; i++) 
122 |                     if (_callback.apply(this, arguments) === false)
123 |                         eventCallbackList.splice(i--, 1);   // step back so the callback shifted into slot i is not skipped
124 |                 // no callback left: detach the dispatcher from page
125 |                 if (eventCallbackList.length === 0)
126 |                     page[eventName] = null;
127 |             }
128 |             // remember that the main frame failed, so that every later event is ignored (see the top of this function)
129 |             if (eventName == 'onMainFrameError')
130 |                 mainFrameStatus.error = true;
131 |         };
132 |     };
133 | 
134 |     // callback added from this handler won't overwrite existing ones
135 |     // return false to get itself removed from the event queue
136 |     function addListener(eventName, callback, thirdarg){
137 |         if (!callback)
138 |             return;
139 | 
140 |         if (eventName == 'onSteady') {
141 |             mainFrameNetwork.steadyMonitor(callback, thirdarg);
142 |             return;
143 |         }
144 | 
145 |         if (eventName.indexOf('on') !== 0)
146 |             eventName = 'on' + eventName;
147 |         callbackList[eventName] = callbackList[eventName] || [];
148 |         callbackList[eventName].push(callback);
149 | 
150 |         // skip adding those events to page that phantomjs won't fire by itself
151 |         if (!/^(?:onMainFrame|onSteady)/.test(eventName) && !page[eventName])
152 |             page[eventName] = queuedEventCallbacks(eventName);
153 |     }
154 | 
155 |     // keep track of the resource status
156 |     // resourceDetails['req-N'] may have {req, actions, aborted, resp, err}
157 |     addListener('ResourceRequested', function(arg0, arg1){
158 |         var resId = 'res' + arg0.id;
159 |         resourceDetails[resId] = {'req': arg0, 'actions': arg1};
160 |     });
161 |     addListener('ResourceReceived', function(arg0){
162 |         var resId = 'res' + arg0.id, resObj = resourceDetails[resId];
163 | 
164 |         // ResourceError fires before ResourceReceived
165 |         // make error code and string captured at ResourceError available to ResourceReceived 
166 |         if (resObj.err) {
167 |             arg0.errorCode = resObj.err.errorCode;
168 |             arg0.errorString = resObj.err.errorString;
169 |             resObj.aborted && (arg0.aborted = resObj.aborted);
170 | 
171 |             arg0.url = arg0.url || resObj.req.url; 
172 |         }
173 |         patchRedirectURL(arg0);
174 |         resObj.resp = arg0;
175 |     });
176 |     // onResourceTimeout, onResourceError is also fired
177 |     addListener('ResourceError', function(arg0){
178 |         var resId = 'res' + arg0.id, resObj = resourceDetails[resId];
179 |         // Upon abortion, url is stripped, resulting in protocol error (301)
180 |         if (arg0.errorCode === 301 && arg0.url === '') 
181 |             resObj.aborted = arg0.aborted = true;
182 |         resObj.resp = resObj.err = arg0;
183 |     });
184 | 
185 | 
186 |     // onSteady Algorithm: 
187 |     //  1) mainFrameNetwork.monitor(onSteady, timeout=4000ms) initiaites:
188 |     //        a) minSteadyTimer (i.e., Min(300ms, timeout/10))
189 |     //        b) maxSteadyTimer (i.e., maxSteadyTimeout = 4000ms)
190 |     //      c) Steady-ready(resourceId) cancels Steady-wait(resourceId)
191 |     //  2) If nothing fired during minSteadyTimer, 
192 |     //       or in case 1(c) above ever happened, cancels minSteadyTimer
193 |     //     finally, each Steady-ready() will see if for 75ms no more Steady-ready(), 
194 |     //              and outStandingReqs.length == 0, fires onSteady()
195 |     //  3) If take longer than maxSteadyTimeout, fires onSteady()
196 |     mainFrameNetwork.steadyMonitor = function(onSteady, timeout){
197 |         mainFrameNetwork.outstanding = {'minSteadyTimer':true};
198 | 
199 |         // install a one-time onSteady listener
200 |         mainFrameNetwork.onSteady = onSteady || function(){};
201 |         // 4000ms based on stats concerning max time users'd normally expect, as suggested by @albertyu
202 |         timeout = parseInt(timeout || 4000);
203 |         mainFrameNetwork.maxTimeout = timeout;
204 |         
205 |         mainFrameNetwork.minSteadyTimer = setTimeout(function(){
206 |             invokeListeners('onSteady-ready', 'minSteadyTimer');
207 |         }, Math.min(300, timeout/10));
208 | 
209 |         clearTimeout(mainFrameNetwork.maxSteadyTimer);
210 |         // the max post onloaded time to tolerate: 4 secs as suggested by @albert
211 |         mainFrameNetwork.maxSteadyTimer = setTimeout(function(){
212 |             mainFrameNetwork.onSteady('maxSteadyTimer');
213 |         }, timeout);
214 | 
215 |         // monitor outstanding requests
216 |         if (!mainFrameStatus.steadyMonitor) {
217 |             mainFrameStatus.steadyMonitor = true;
218 |             addListener('ResourceRequested', function(arg0, arg1){
219 |                 invokeListeners('onSteady-wait', 'res' + arg0.id);
220 |             });
221 |             // onResourceTimeout, onResourceError will also be fired
222 |             ['ResourceReceived','ResourceError'].forEach(function(eventName){
223 |                 addListener(eventName, function(arg0){
224 |                     (!arg0.stage || arg0.stage == 'end') && invokeListeners('onSteady-ready', 'res' + arg0.id);
225 |                 });
226 |             });
227 |         }
228 |     };
229 |     addListener('Steady-wait', function(reason) {
230 |         mainFrameNetwork.outstanding[reason] = true;
231 |         clearTimeout(mainFrameNetwork.finalistTimer);
232 | 
233 |         // cancel the minSteadyTimer
234 |         if (mainFrameNetwork.outstanding['minSteadyTimer']) {
235 |             delete mainFrameNetwork.outstanding['minSteadyTimer'];
236 |             clearTimeout(mainFrameNetwork.minSteadyTimer);
237 |         }
238 |     })
239 |     addListener('Steady-ready', function(reason){
240 |         delete mainFrameNetwork.outstanding[reason];
241 | 
242 |         // extend the finialist timer by discarding the previous one (non-atomic operations, but good enough)
243 |         clearTimeout(mainFrameNetwork.finalistTimer);
244 |         // wait for another 20ms to make sure the sea is completely silenced (i.e., no more new requests)
245 |         mainFrameNetwork.finalistTimer = setTimeout(function(){
246 |             if (Object.keys(mainFrameNetwork.outstanding).length === 0)
247 |                 mainFrameNetwork.onSteady('done');
248 |         }, 75);
249 |     })
250 |     // introduce a reason to wait setTimeout/Interval for 'timeout' ms once
251 |     addListener('Steady-waitTimer', function(timeout){
252 |     	// directly ignore timeout longer than maxTimeout
253 |     	function readyToWait() {
254 |     		return mainFrameNetwork.maxTimeout && timeout < mainFrameNetwork.maxTimeout;
255 |     	}
256 | 
257 |         var reason = ['timer', timerCounter++, timeout].join('-');
258 |         if (readyToWait())
259 |             invokeListeners('onSteady-wait', reason);
260 |         else 
261 |             addListener('MainFrameSteady', function(){
262 |                 readyToWait() && invokeListeners('onSteady-wait', reason);
263 |                 return false;
264 |             });
265 |         window.setTimeout(function(){
266 |             readyToWait() && invokeListeners('onSteady-ready', reason);
267 |         }, timeout || 1);
268 |     })
269 |     
270 | 
271 |     // LoadStarted fires only for mainFrame; flag it on the current mainFrameStatus
272 |     addListener('LoadStarted', function(){
273 |         mainFrameStatus.loadStarted = true;
274 |     });
275 | 
276 | 
277 |     // expose the following customized events:
278 |     //  - onNavigate:
279 |     //  - onMainFrameNavigate: 
280 |     //  - onChildFrameNavigate: 
281 |     addListener('NavigationRequested', function(url, type, willNavigate, fromMainFrame) {
282 |         if (!url || url === 'about:blank' || !willNavigate)
283 |             return;
284 | 
285 |         addListener('ResourceRequested', function(requestData, networkRequest) {
286 |             // traceback if such URL is recently recorded as the navigation
287 |             if (decodeURI(url) === decodeURI(requestData.url) || url === requestData.url) {
288 | 
289 |                 // let resourceDetails know whether a particular resource happens in frames
290 |                 var resObj = resourceDetails['res' + requestData.id];
291 |                 resObj.fromFrame = true;
292 |                 resObj.req.fromMainFrame = fromMainFrame;
293 |                 resObj.req.navigationType = type;
294 | 
295 |                 // mark this as a navigational request
296 |                 navigationalRequests[requestData.id] = resObj;
297 | 
298 |                 invokeListeners('onNavigate', requestData, networkRequest, fromMainFrame, type);
299 |                 invokeListeners((fromMainFrame ? 'onMainFrameNavigate' : 'onChildFrameNavigate'),
300 |                                 requestData, networkRequest, type);
301 | 
302 |                 return false;
303 |             }
304 |         });
305 |     });
306 | 
307 |     // expose the following customized events:
308 |     //  - onSubResourceRequested: fired for subresource requests (i.e., not from frames/windows)
309 |     addListener('NavigationRequested', function(url, type, willNavigate, fromMainFrame) {
310 |         // the following is setup in NavigationRequested so as to run after the ResourceRequested setup above
311 |         addListener('ResourceRequested', function(requestData, networkRequest) {
312 |             if (!navigationalRequests[requestData.id])
313 |                 invokeListeners('onSubResourceRequested', requestData, networkRequest);
314 |         });
315 |         return false;
316 |     });
317 | 
318 |     // expose the following customized events (each receives requestData, networkRequest, type):
319 |     //  - onMainFrameRedirection: any request whose id is not 1; onMainFramePreRedirection: the same, but only while navigationsEnded is unset
320 |     //  - onMainFramePostRedirection: any request seen once navigationsEnded is set
321 |     addListener('MainFrameNavigate', function(requestData, networkRequest, type) {
322 |         if (requestData.id !== 1)
323 |             invokeListeners('onMainFrameRedirection', requestData, networkRequest, type);
324 | 
325 |         if (mainFrameStatus.navigationsEnded)
326 |             invokeListeners('onMainFramePostRedirection', requestData, networkRequest, type);
327 |         else if (requestData.id !== 1)
328 |             invokeListeners('onMainFramePreRedirection', requestData, networkRequest, type);
329 |         // after an external error the current mainFrameStatus is left untouched
330 |         if (mainFrameStatus.externalError)
331 |             return;
332 | 
333 |         // backup the current mainFrameStatus, in case the new mainFrameStatus is detected being aborted
334 |         var mainFrameStatusBackup = mainFrameStatus;
335 |         // prepare a new mainFrameStatus
336 |         mainFrameStatus = {'requested': true, 'request': requestData};
337 |         if (mainFrameStatusBackup.requested)
338 |         	mainFrameStatus.lastBackup = mainFrameStatusBackup;
339 | 
340 |         // expose the following customized events:
341 |         //  - onMainFrameNavigationsEnded: fired once when the MainFrame has no more redirections
342 |         addListener('ResourceReceived', function(response) {
343 |             // ignore responses that do not belong to the current main-frame request
344 |             if (mainFrameStatus.request.id !== response.id)
345 |                 return;
346 | 
347 |             // restore the original mainFrameStatus if the current one is aborted
348 |             if (response.aborted) {
349 |                 invokeListeners('onMainFrameResourceAborted', response);
350 |                 invokeListeners('onMainFrameResourceReceived', response);
351 | 
352 |                 if (mainFrameStatus.lastBackup)
353 |                 	mainFrameStatus = mainFrameStatus.lastBackup;
354 |                 else
355 |                 	mainFrameStatus.response = response;
356 | 
357 |                 if (!mainFrameStatus.navigationsEnded) {
358 |                 	mainFrameStatus.navigationsEnded = true;
359 |                 	invokeListeners('onMainFrameNavigationsEnded', mainFrameStatus.response);
360 |                 }
361 |                 return false;
362 |             }
363 | 
364 |             mainFrameStatus.response = response;
365 |             // a failed main-frame response ends with onMainFrameError, and this listener is removed
366 |             if (response.errorCode) {
367 |                 invokeListeners('onMainFrameResourceError', response);
368 |                 invokeListeners('onMainFrameResourceReceived', response);
369 |                 invokeListeners('onMainFrameError', response);
370 |                 return false;
371 |             }
372 |             
373 |             invokeListeners('onMainFrameResourceReceived', response);
374 | 
375 |             // the mainFrame's response[stage=start] that has no further redirections
376 |             if (!mainFrameStatus.navigationsEnded 
377 |                     && (response.status < 300 || !response.redirectURL)) {
378 |                 mainFrameStatus.navigationsEnded = true;
379 |                 invokeListeners('onMainFrameNavigationsEnded', response);
380 | 
381 |                 mainFrameStatus.destResponse = response;
382 | 
383 |             // the corresponding mainFrame's response[stage=end], or a response that redirects further
384 |             } else {
385 |                 delete mainFrameStatus.destResponse;
386 |                 // this is important to deactivate this listener once the main response is downloaded
387 |                 return false;
388 |             }
389 |         });
390 |     });
391 | 
392 | 
393 |     addListener('LoadFinished', function(status) {
394 |         var response = mainFrameStatus.response || {'url': url};
395 |         
396 |         mainFrameStatus.loadFinished = true;
397 |         if (status == 'success' 
398 |                 || (page.content && page.content !== '<html><head></head><body></body></html>')
399 |                 || (response.status && response.status >= 300 && response.status < 400)) {
400 |             mainFrameStatus.loadSuccess = true;
401 |             invokeListeners('onMainFrameLoadSuccess', response);
402 |         } else {
403 |             mainFrameStatus.loadFailed = true;
404 |             response.errorCode = response.errorCode || 1002;
405 |             response.errorString = response.errorString || 'Load Failed Error (from disallowed domains)';
406 |             
407 |             invokeListeners('onMainFrameLoadError', response);
408 |             invokeListeners('onMainFrameError', response);
409 |         }
410 |         return false;
411 |     });
412 | 
413 |     addListener('MainFrameLoadSuccess', function(response){
414 |         addListener('onSteady', function(lastSignal){
415 |             if (!mainFrameStatus.steady) {
416 |                 mainFrameStatus.steady = true;
417 | 
418 |                 // note that page.content may not contain all dynamically-generated content
419 |                 // for content-type like xml will have no JS execution context, page.evaluate() returns null, so we resort to page.content
420 |                 var html = page.evaluate(function(){return document.documentElement ? document.documentElement.outerHTML : ''});
421 |                 response.body = html || page.content || response.body;
422 | 
423 |                 invokeListeners('onMainFrameSteady', response, lastSignal);
424 |             }
425 |         });
426 |     });
427 | 
428 |     return {
429 |         addListener: addListener,
430 |         invokeListeners: invokeListeners,
431 |         notifyError: function(errorCode, errorString, url){
432 |             mainFrameStatus.externalError = {'errorCode': errorCode, 'errorString': errorString};
433 |             if (url) mainFrameStatus.externalError.url = url;
434 | 
435 |             // if not even requested, invoke MainFrameError immediately
436 |             !mainFrameStatus.requested && invokeListeners('onMainFrameError', mainFrameStatus.externalError);
437 |         },
438 |         getMainFrameStatus: function(){
439 |             return mainFrameStatus;
440 |         },
441 |         getResources: function() {
442 |             return resourceDetails;
443 |         }
444 |     };
445 | };


--------------------------------------------------------------------------------
/renderer/resource/extractors.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |   * Copyright 2015, Yahoo Inc. All rights reserved.
  3 |   * Use of this source code is governed by a BSD-style
  4 |   * license that can be found in the LICENSE file.
  5 |   *
  6 |   
  7 | */
  8 | 
  9 | // to call back phantom
 10 | function phantomCallback(action, data) {
 11 |     if (window.callPhantom) {
 12 |         data = data || {};
 13 |         data.action = action;
 14 |         window.callPhantom(data);
 15 |     }
 16 | }
 17 | 
 18 | // timer as a reason to wait
 19 | var _gryffin_setTimeout = window.setTimeout, _gryffin_setInterval = window.setInterval;
 20 | window.setTimeout = function(f, t){return phantomCallback('waitTimer', {timeout:t}) || _gryffin_setTimeout.call(this, f, t)};
 21 | window.setInterval = function(f, t){return phantomCallback('waitTimer', {timeout:t}) || _gryffin_setInterval.call(this, f, t)};
 22 | 
 23 | 
 24 | // Derived from casperjs
 25 | function triggerMouseEvent(el, type) {
 26 | 
 27 |     try {
 28 |         var evt = document.createEvent("MouseEvents"), center_x = 1, center_y = 1;
 29 |         try {
 30 |             var pos = el.getBoundingClientRect();
 31 |             center_x = Math.floor((pos.left + pos.right) / 2);
 32 |             center_y = Math.floor((pos.top + pos.bottom) / 2);
 33 |         } catch(e) {}
 34 |         evt.initMouseEvent(type, true, true, window, 1, 1, 1, center_x, center_y, false, false, false, false, 0, el);
 35 |         // dispatchEvent return value is false if at least one of the event
 36 |         // handlers which handled this event called preventDefault;
 37 |         // so we cannot returns this results as it cannot accurately informs on the status
 38 |         // of the operation
 39 |         // let's assume the event has been sent ok it didn't raise any error
 40 |         el.dispatchEvent(evt);
 41 |         return true;
 42 |     } catch (e) {
 43 |         return false;
 44 |     }
 45 | };
 46 | 
 47 | // function getElementByXPath(expression) {
 48 | //     var a = document.evaluate(expression, document.body, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
 49 | //     if (a.snapshotLength > 0) {
 50 | //         return a.snapshotItem(0);
 51 | //     }
 52 | // };
 53 | 
 54 | // DOM events whose handlers are captured as "JS links" (other candidates: focus, input, keydown, keypress, keyup, blur)
 55 | var jsLinks, jsLinkEvents = ['click', 'dblclick', 'change', 'submit', 'scroll', 'mousemove', 'mouseover', 'mousedown', 'mouseup', 'mouseout'],
 56 |     attributeFilter = ['href', 'action'].concat(jsLinkEvents.map(function(x){return 'on' + x})), // href, action and every on<event> attribute
 57 |     re_js_links = /^javascript:/i, // javascript: URLs
 58 |     re_urls = /^(?:https?|ftp):\/\//i, // strings starting with an http(s)/ftp scheme
 59 |     re_absoluteUrls = /(?:https?|ftp):\/\/[^\s]+/ig, // http(s)/ftp URLs found anywhere in a text
 60 |     re_relativeUrls = /[^\s]+\.(?:php[s\d]?|s?html?|aspx?|jsp|cfm)[^\s]*/ig; // whitespace-free tokens containing a web page extension
 61 | 
 62 | jsLinks = (function(){
 63 | 
 64 |     function getxpath(el) {
 65 |         if (el===document.body) return 'body';
 66 |         if (el.id !== '') return "//*[@id='"+el.id+"']";
 67 |         if (!el.parentNode) return el.toString();
 68 | 
 69 |         for (var i = 0, s, cnt = 0, p = el.parentNode, siblings = p.childNodes; s = siblings[i]; i++) {
 70 |             if (s === el) return [getxpath(p), el.tagName.toLowerCase() + '[' + (cnt + 1) + ']'].join('/');
 71 |             if (s.nodeType === 1 && s.tagName === el.tagName) cnt++;
 72 |         }
 73 |     };
 74 | 
 75 |     var jsLinkCaptured = {'root':{'jsLinks':[],'trigger':function(){},'parent':null,'arrPtr':0,'key':'root'}}, // captured entries by key; 'root' is the parentless top entry
 76 |         triggeringElement = jsLinkCaptured['root'], // entry being triggered; newly captured entries become its children
 77 |         elementProto = (window.EventTarget? window.EventTarget : window.HTMLElement ? window.HTMLElement : window.Element).prototype, // first available of EventTarget, HTMLElement, Element
 78 |         elementProtoMethods = {'addEventListener': elementProto.addEventListener}; // the original method, kept for the wrapper installed below
 79 | 
 80 |     // extract DOM Level 0 events
 81 |     function extractDOM0Events(el) {
 82 |         el = el || document.body;
 83 |         function getJsLink(element) {
 84 |             jsLinkEvents.forEach(function(evt){
 85 |                 element['on' + evt] && jsLinks.add(evt, element, 'dom0');
 86 |             });
 87 |         }
 88 |         getJsLink(el);
 89 |         [].forEach.call(el.getElementsByTagName('*'), getJsLink);
 90 |     }
 91 | 
 92 |     // extract DOM Level 2 events
 93 |     elementProto.addEventListener = function(type, fn, capture) {
 94 |         if (jsLinkEvents.indexOf(type.toLowerCase()) !== -1)
 95 |             jsLinks.add(type, this, 'addEventListener');
 96 |         return elementProtoMethods.addEventListener.call(this, type, fn, capture);
 97 |     };
 98 | 
 99 |     return {
100 |         add: function(eventType, node, triggerSource) {   // capture eventType for node (keyed by getxpath); triggerSource tells how it was found
101 |             eventType = eventType.toLowerCase();
102 |             var key = getxpath(node);
103 |             // already captured under this key: merge the event type and remember the additional source
104 |             if (jsLinkCaptured[key]) {
105 |                 if (jsLinkCaptured[key]['events'].indexOf(eventType) === -1)
106 |                     jsLinkCaptured[key]['events'].push(eventType);
107 |                 jsLinkCaptured[key]['src'].push(triggerSource);
108 |             } else {
109 |                 jsLinkCaptured[key] = {
110 |                     'key': key,
111 |                     'keyChain': function() {   // keys from the top entry down to this one
112 |                         var trace = [], element = this;
113 |                         do {
114 |                             trace.push(element.key);
115 |                         } while (element = element.parent);
116 |                         return trace.reverse();
117 |                     },
118 |                     'events': [eventType],
119 |                     'trigger': function(onTriggered, delay){   // schedule the captured events in steps that are `delay` ms apart
120 |                         var self = this, i = 0,
121 |                                 eventsString = self.events.join('|'),
122 |                                 results = {'keyChain':self.keyChain(), 'events': self.events};
123 |                         
124 |                         if (node) {
125 |                             phantomCallback('element.triggering', results);
126 | 
127 |                             // simulate scroll event
128 |                             if (self.events.indexOf('scroll') !== -1)
129 |                                 _gryffin_setTimeout.call(window, function(){
130 |                                     try {node.scrollTop = node.scrollHeight} catch(e) {}
131 |                                 }, i++ * delay);
132 | 
133 |                             // group all mouse and (dbl)click events as follows
134 |                             if (/(?:click|mouse|change)/.test(eventsString)) {
135 |                                 _gryffin_setTimeout.call(window, function(){
136 |                                     try {node.focus()} catch(e) {};
137 |                                     triggerMouseEvent(node, 'mousemove');
138 |                                     triggerMouseEvent(node, 'mouseenter');
139 |                                     triggerMouseEvent(node, 'mouseover');
140 |                                     triggerMouseEvent(node, 'mousemove');
141 |                                     triggerMouseEvent(node, 'mousedown')
142 |                                     triggerMouseEvent(node, 'mouseup');
143 |                                 }, i * delay);   // i is not advanced: the next scheduled step shares this time slot
144 |                                 if (self.events.indexOf('click') !== -1) 
145 |                                     _gryffin_setTimeout.call(window, function(){
146 |                                         triggerMouseEvent(node, 'click');
147 |                                     }, i++ * delay);
148 |                                 if (self.events.indexOf('dblclick') !== -1) 
149 |                                     _gryffin_setTimeout.call(window, function(){
150 |                                         triggerMouseEvent(node, 'dblclick');
151 |                                     }, i++ * delay);
152 |                                 if (self.events.indexOf('change') !== -1) {
153 |                                     // for select element
154 |                                     if (node.options)
155 |                                         for (var j = 0, len = node.options.length; j < len; j++)
156 |                                             // cycle through every option, one step per option
157 |                                             _gryffin_setTimeout.call(window, function(){
158 |                                                 node.selectedIndex = (node.selectedIndex + 1) % node.options.length;
159 |                                                 node.dispatchEvent(new Event('change', {bubbles: true, cancelable: true}));
160 |                                             }, i++ * delay);
161 |                                     // for other elements
162 |                                     else
163 |                                         _gryffin_setTimeout.call(window, function(){
164 |                                             node.dispatchEvent(new Event("change", {bubbles: true, cancelable: true}));
165 |                                         }, i++ * delay);
166 |                                 }
167 |                                 _gryffin_setTimeout.call(window, function(){
168 |                                     triggerMouseEvent(node, 'mouseout');
169 |                                     triggerMouseEvent(node, 'mouseleave');
170 |                                 }, i++ * delay);
171 |                             }
172 | 
173 |                             // simulate submit event
174 |                             if (self.events.indexOf('submit') !== -1)
175 |                                 _gryffin_setTimeout.call(window, function(){
176 |                                     node.dispatchEvent(new Event("submit", {bubbles: true, cancelable: true}));
177 |                                     // a javascript: action is eval'ed (page-supplied code, with the 11-character 'javascript:' prefix cut off) before submitting
178 |                                     // TODO: url resolved, enumerate multi-valued form elements?
179 |                                     if (re_js_links.test(node.action))
180 |                                         (function(){eval(this.action.substring(11));this.submit()}).call(node);
181 |                                     else
182 |                                         node.submit();
183 |                                 }, i++ * delay);
184 |                         
185 |                             // after the steps scheduled above, tell phantom that this element has been triggered
186 |                             _gryffin_setTimeout.call(window, function(){
187 |                                 phantomCallback('element.triggered', results);
188 |                             }, i * delay);
189 |                         }
190 |                         // getData is replaced on every trigger(): it reports on the element triggered last
191 |                         jsLinks.getData = function() {
192 |                             // append discovered links/forms
193 |                             if (self.links.length)
194 |                                 results.links = self.links;
195 |                             if (self.forms.length)
196 |                                 results.forms = self.forms;
197 | 
198 |                             // schedule onTriggered 10ms later to move on to the next element's events
199 |                             onTriggered && _gryffin_setTimeout.call(window, onTriggered, 10);
200 |                             return results;
201 |                         };
202 |                     },
203 |                     'jsLinks': [],
204 |                     'arrPtr': 0,
205 |                     'parent': triggeringElement,
206 |                     'src': [triggerSource],
207 |                     'links': [],
208 |                     'forms': []
209 |                 };
210 |                 triggeringElement.jsLinks.push(jsLinkCaptured[key]);   // becomes a child of the entry being triggered
211 |             }
212 |         },
213 |         depthFirstTrigger: function(element, delay){
214 |             triggeringElement = element;   // entries captured from now on get this element as their parent
215 |             triggeringElement.trigger(function() {
216 |                 var nextElement, parentTriggerElement = triggeringElement.parent;
217 |                 parentTriggerElement.arrPtr++;   // the parent's pointer now designates the sibling after this element
218 | 
219 |                 // depth-first approach: go one depth deeper if available
220 |                 // no more child, execute the immediate sibling
221 |                 // no more immediate sibling, execute the parent's sibling
222 |                 nextElement = triggeringElement.jsLinks[0] 
223 |                         || parentTriggerElement.jsLinks[parentTriggerElement.arrPtr]
224 |                         || (parentTriggerElement.parent && parentTriggerElement.parent.jsLinks[parentTriggerElement.parent.arrPtr]);
225 |                 // nothing left to trigger: report 'done' to phantom
226 |                 if (nextElement)
227 |                     jsLinks.depthFirstTrigger(nextElement, delay);
228 |                 else
229 |                     phantomCallback('done');
230 | 
231 |             }, delay);
232 |         },
233 |         triggerAll: function(delay) {
234 |             observeDOMChanges(function(newNode){
235 |                 extractDOM0Events(newNode);
236 |                 // append the newly discovered static links and forms
237 |                 extractRequests(triggeringElement, newNode);
238 |             });
239 | 
240 |             // extract jsLinks
241 |             extractDOM0Events();
242 | 
243 |             if (triggeringElement.jsLinks[0])
244 |                 jsLinks.depthFirstTrigger(triggeringElement.jsLinks[0], delay);
245 |             else
246 |                 phantomCallback('done');
247 |         }
248 |     };
249 | })();
250 | 
251 | 
252 | function observeDOMChanges(onNewNode) {
253 |     // create an observer instance
254 |     var observer = new window.MutationObserver(function(mutations) {
255 |         mutations.forEach(function(mutation) {
256 |             onNewNode && [].forEach.call(mutation.addedNodes || [mutations.target], function(node){
257 |                 node && (node.nodeType === 1) && onNewNode.call(this, node, observer);
258 |             });
259 |         });
260 |     });
261 | 
262 |     // monitor new nodes and attribute changes that involve URLs
263 |     observer.observe(document.body, {
264 |         subtree: true, 
265 |         childList: true, 
266 |         attributes: true, 
267 |         attributeFilter: attributeFilter
268 |     });
269 | 
270 |     return observer;
271 | }
272 | 
273 | function arrayUnique(arr) {
274 |     var result = [], i = 0, key, lastKey, sorted = arr.sort(), len = sorted.length;
275 |     for (;key = sorted[i];i++)
276 |         if (lastKey !== key)
277 |             result.push(lastKey = key);
278 |     return result;
279 | }
280 | 
281 | 
// Collects the static requests reachable from `el` and appends them to `sink`.
//
//  - sink.links gets one {url, text} entry per <a>/<area> whose href matches re_urls
//  - sink.forms gets one {method, url, data, dataType} entry per submittable
//    combination of every <form> (see getForm below)
//  - javascript: links and forms with a javascript:/shadowed action are not
//    recorded here; they are registered with jsLinks instead
//
// `sink.links` / `sink.forms` are created when absent, so the same sink can be
// passed repeatedly to accumulate results. `el` defaults to document.body and
// is itself examined in addition to its descendants.
function extractRequests(sink, el) {
    sink.links = sink.links || [];
    sink.forms = sink.forms || [];
    el = el || document.body;
    var links = sink.links, forms = sink.forms;

    // records a single <a>/<area>-like element, if it carries a usable href
    function getLink(a) {
        if (a.hasAttribute('href') || a.href) {
            var href = a.href;
            if (re_js_links.test(href))
                jsLinks.add('click', a, 'jsurl');
            else if (re_urls.test(href))
                // fall back to '' so an element exposing neither text property cannot throw
                links.push({'url':href, 'text':(a.textContent || a.innerText || '').replace(/\s+/g, ' ').trim()});
        }
    }

    // TODO: ajax forms extractions
    // records one entry in `forms` per combination of submit button and
    // multi-valued control (radio/checkbox group, select) of the form `f`
    function getForm(f) {
        // a control named "method" shadows f.method with the element itself;
        // read the attribute in that case instead of calling toLowerCase on an element
        var rawMethod = (typeof f.method === 'string') ? f.method : f.getAttribute('method'),
            method = rawMethod ? rawMethod.toLowerCase() : 'get',
            url = f.action,
            urlparams = [], a,
            values = [], submits = [], multiDefaults = {}, dataType = {}, j = 0, input;

        // for javascript-uri submissions, yielding FormRequest with invalid url is meaningless
        // (typeof 'object' covers a control named "action" shadowing f.action)
        if (re_js_links.test(url) || typeof url === 'object') {
            jsLinks.add('submit', f, 'jsurl');
            return;
        }

        for (; input = f.elements[j]; j++) {
            var rawName = input.name,
                name = encodeURIComponent(rawName),
                value = encodeURIComponent(input.value),
                nodeName = input.nodeName.toLowerCase(),
                type = input.type ? input.type.toLowerCase() : nodeName,
                group;

            if (!name) continue;

            // <input type=submit|image>, <button type=submit|image>
            if (['input','button'].indexOf(nodeName) !== -1 && ['submit','image'].indexOf(type) !== -1) {
                submits.push([[name, value].join('=')]);

            // <input type!=reset|button>, <textarea>, <keygen> element
            } else if ((nodeName === 'input' && ['reset','button'].indexOf(type) === -1)
                    || ['textarea','keygen'].indexOf(nodeName) !== -1) {

                // look the control(s) up by the RAW name: the encoded name differs for
                // names such as "user[email]" and would yield undefined
                group = f[rawName] || input;

                if (typeof(group.length) === 'undefined') // an unique element with such a 'name'
                    values.push([name, value].join('='));
                else if (!dataType[name]) // i.e., radio/checkbox, and the first time being recorded
                    multiDefaults[name] = [].map.call(group, function(opt){return [name, encodeURIComponent(opt.value)].join('=')});

            // <select> element
            } else if (nodeName === 'select') {

                if (input.options.length)
                    // same raw-name lookup as above; fall back to this <select> itself
                    multiDefaults[name] = [].map.call(f[rawName] || input, function(opt){return [name, encodeURIComponent(opt.value)].join('=')});
                else
                    values.push(name + '=');
            }

            // TODO: if no default value, supply one depending on type? (e.g., email)
            dataType[name] = type;
        }

        // for unknown/empty urls
        if (!re_urls.test(url))
            url = window.location.href;

        a = document.createElement('a');
        a.href = url;
        // process any parameters given as part of the url
        if (a.search.length > 1) {
            // url's params will be later combined with those collected in form, and used for deduplication
            // (empty segments, e.g. from "a=1&&b=2", carry no parameter and are dropped)
            urlparams = a.search.substring(1).split('&').filter(function(param){return param.length > 0});

            // url's params are considered of hidden types
            urlparams.forEach(function(param){
                if (param = param.split('=')[0])
                    dataType[param] = 'hidden';
            });

            // for GET method, transfer url's params to values for deduplication
            if (method === 'get') {
                values = urlparams.concat(values);
                a.search = '';
                url = a.href;
            }
        }

        // in case no name for submit button, simply let other values be serialized
        if (submits.length === 0)
            submits.push([]);

        // enumerate all possibilities for every multiDefaults, with submits as default
        // Example:
        //  - submits = [['submit=save'], ['submit=send']]
        //  - multiDefaults = {'sex':['sex=M', 'sex=F'], 'relationship': ['relationship=complicated', 'relationship=single', 'relationship=married']}
        // results: [
        //         ["submit=save", "sex=M", "relationship=complicated"],
        //         ["submit=save", "sex=M", "relationship=single"],
        //         ["submit=save", "sex=M", "relationship=married"],
        //         ["submit=save", "sex=F", "relationship=complicated"],
        //         ["submit=save", "sex=F", "relationship=single"],
        //         ["submit=save", "sex=F", "relationship=married"],
        //         ["submit=send", "sex=M", "relationship=complicated"],
        //         ["submit=send", "sex=M", "relationship=single"],
        //         ["submit=send", "sex=M", "relationship=married"],
        //         ["submit=send", "sex=F", "relationship=complicated"],
        //         ["submit=send", "sex=F", "relationship=single"],
        //         ["submit=send", "sex=F", "relationship=married"]
        // ]
        multiDefaults = Object.keys(multiDefaults).reduce(function(previousValues, currentKey){
            var currentValues = multiDefaults[currentKey];
            return previousValues.map(function(previousValue){
                return currentValues.map(function(currentValue){return previousValue.concat(currentValue)});
            // the [] seed keeps the flattening safe even if there is nothing to flatten
            }).reduce(function(a, b){return a.concat(b)}, []);
        }, submits);

        // each submit button can correspond to a different name/value pair to submit
        // then, concat all params collected, de-duplicate tuples of "[key]=[value]", finally join with &
        multiDefaults.forEach(function(combinator){
            forms.push({
                method: method,
                url: url,
                data: arrayUnique(values.concat(combinator)).join('&'),
                dataType: dataType
            });
        });
    }

    // extracts URLs from the text of a comment and records them as links with
    // the text '__comments'
    // NOTE(review): nothing in this function calls getCommentedLinks at the moment
    function getCommentedLinks(comment) {
        var commentedLinks;
        // for absolute URLs
        if (commentedLinks = comment.match(re_absoluteUrls))
            commentedLinks.forEach(function(url){links.push({'url':url, 'text':'__comments'})});

        // for relative URLs - to avoid false positives, must ends with extensions known to give html
        if (commentedLinks = comment.match(re_relativeUrls)) {
            var a = document.createElement('a');
            commentedLinks.forEach(function(url){
                if (!re_urls.test(url)) {
                    // let the detached anchor resolve the relative url
                    a.href = url;
                    links.push({'url': a.href, 'text':'__comments'})
                }
            });
        }
    }


    // the el itself could be a link
    getLink(el);
    // links extraction from a/area tags
    [].forEach.call(el.getElementsByTagName('a'), getLink);
    [].forEach.call(el.getElementsByTagName('area'), getLink);


    // the el itself could be a form
    if (el.tagName.toLowerCase() == 'form')
        getForm(el);
    // forms extraction
    [].forEach.call(el.getElementsByTagName('form'), getForm);
}
443 | 
// Entry point invoked once the main frame is ready: gathers the static links
// and forms of the page, then kicks off the triggering of javascript links.
// Returns the results object; it stays empty for a page without document.body.
window._gryffin_onMainFrameReady = function() {
    var results = {};

    // a page may have no document.body, in which case there is nothing to do
    if (document.body) {
        // always true now because we now hardcoded (at least) the scroll event
        results.jsLinkFeedback = true;

        extractRequests(results);

        jsLinks.add('scroll', document.body, 'hardcoded');
        jsLinks.triggerAll(250);
    }

    return results;
}
459 | 
460 | 


--------------------------------------------------------------------------------