├── spider ├── github │ ├── testdata │ │ ├── sub │ │ │ └── README │ │ ├── README │ │ ├── pkg_test.go │ │ └── pkg.go │ ├── utils.go │ └── github_test.go ├── godocorg │ ├── packages_test.go │ └── packages.go ├── ranking_test.go ├── filecache_test.go ├── filecache.go └── ranking.go ├── pipelines ├── spider │ ├── spider │ └── spider.go ├── indexer │ ├── imain.go │ └── index.go ├── crawler │ ├── imports.go │ ├── person.go │ └── cmain.go ├── tocrawl │ └── ghup.go └── mergedocs │ └── mergedocs.go ├── service ├── web │ ├── .DS_Store │ ├── static │ │ └── robots.txt │ ├── images │ │ ├── logo-16.png │ │ ├── logo-32.png │ │ ├── logo-error-64.png │ │ ├── glyphicons-halflings.png │ │ └── glyphicons-halflings-white.png │ ├── resource │ │ ├── icon.psd │ │ ├── logo.png │ │ ├── gopher.png │ │ ├── logo-128.png │ │ ├── logo-16.png │ │ ├── logo-256.png │ │ ├── logo-32.png │ │ ├── logo-64.png │ │ ├── error-logo.png │ │ ├── logo-error.png │ │ ├── logo-error.psd │ │ ├── gplus-cover.png │ │ ├── logo-error-16.png │ │ ├── logo-error-64.png │ │ ├── twitter-cover.png │ │ ├── logo-error-128.png │ │ ├── round-logo-256.png │ │ └── magnifying_glass_black.png │ ├── web │ │ ├── 404.html │ │ ├── searchbox.html │ │ ├── crawlhistory.html │ │ ├── badgepage.html │ │ ├── tops.html │ │ ├── add.html │ │ ├── footer.html │ │ ├── header.html │ │ ├── index.html │ │ ├── search.html │ │ ├── infoapi.html │ │ ├── about.html │ │ └── view.html │ ├── db_test.go │ ├── add.go │ ├── crawlhistory.go │ ├── view.go │ ├── tops.go │ ├── db.go │ ├── api.go │ └── css │ │ └── gc.css └── stored │ └── stored.go ├── scripts ├── backup-conf.json.template ├── crawler.gs ├── web.gs ├── stored.gs ├── gen_proto.gs ├── testall.gs ├── backup.gs └── install.gs ├── chrome-app ├── dist │ ├── screenshot-1.png │ ├── screenshot-2.png │ └── promo-440x280-1.png └── go-search │ ├── logo-128.png │ ├── logo-16.png │ └── manifest.json ├── crawler.bat ├── indexer.bat ├── server.bat ├── shared └── proto │ ├── store.go │ ├── spider.go │ ├── stored.proto 
│ ├── store.proto │ ├── spider.proto │ └── stored.pb.go ├── gcse.go ├── bi.go ├── ACKNOWLEDGEMENTS ├── utils ├── utils_test.go ├── json.go ├── utils.go ├── segment_test.go └── segment.go ├── .gitignore ├── tokenize_test.go ├── utils.go ├── conf.json.template ├── tools ├── fillfound │ ├── fillfound.go │ └── fillfound_test.go ├── countdocs │ └── countdocs.go ├── exps │ └── importsents.go ├── fixcrawldb │ └── fixcrawldb.go └── dump │ └── dump.go ├── LICENSE ├── db_test.go ├── README.md ├── license.txt ├── data_test.go ├── store ├── repo_test.go ├── repo.go ├── store_test.go ├── history.go ├── history_test.go └── store.go ├── text_test.go ├── crawlerdb.go ├── configs └── configs.go ├── crawler_test.go ├── data.go └── index_test.go /spider/github/testdata/sub/README: -------------------------------------------------------------------------------- 1 | The sub folder of testdata. 2 | -------------------------------------------------------------------------------- /spider/github/testdata/README: -------------------------------------------------------------------------------- 1 | This folder is used for Github spider testing. 
2 | -------------------------------------------------------------------------------- /pipelines/spider/spider: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/pipelines/spider/spider -------------------------------------------------------------------------------- /service/web/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/.DS_Store -------------------------------------------------------------------------------- /scripts/backup-conf.json.template: -------------------------------------------------------------------------------- 1 | { 2 | gdrive: { 3 | folder: { 4 | // id: "" 5 | } 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /service/web/static/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /search 3 | Disallow: /add 4 | Disallow: /api 5 | -------------------------------------------------------------------------------- /service/web/images/logo-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-16.png -------------------------------------------------------------------------------- /service/web/images/logo-32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-32.png -------------------------------------------------------------------------------- /service/web/resource/icon.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/icon.psd 
-------------------------------------------------------------------------------- /service/web/resource/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo.png -------------------------------------------------------------------------------- /chrome-app/dist/screenshot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/screenshot-1.png -------------------------------------------------------------------------------- /chrome-app/dist/screenshot-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/screenshot-2.png -------------------------------------------------------------------------------- /chrome-app/go-search/logo-128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/go-search/logo-128.png -------------------------------------------------------------------------------- /chrome-app/go-search/logo-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/go-search/logo-16.png -------------------------------------------------------------------------------- /service/web/resource/gopher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/gopher.png -------------------------------------------------------------------------------- /service/web/resource/logo-128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-128.png 
-------------------------------------------------------------------------------- /service/web/resource/logo-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-16.png -------------------------------------------------------------------------------- /service/web/resource/logo-256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-256.png -------------------------------------------------------------------------------- /service/web/resource/logo-32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-32.png -------------------------------------------------------------------------------- /service/web/resource/logo-64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-64.png -------------------------------------------------------------------------------- /chrome-app/dist/promo-440x280-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/promo-440x280-1.png -------------------------------------------------------------------------------- /service/web/resource/error-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/error-logo.png -------------------------------------------------------------------------------- /service/web/resource/logo-error.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error.png -------------------------------------------------------------------------------- /service/web/resource/logo-error.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error.psd -------------------------------------------------------------------------------- /service/web/images/logo-error-64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-error-64.png -------------------------------------------------------------------------------- /service/web/resource/gplus-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/gplus-cover.png -------------------------------------------------------------------------------- /service/web/resource/logo-error-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-16.png -------------------------------------------------------------------------------- /service/web/resource/logo-error-64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-64.png -------------------------------------------------------------------------------- /service/web/resource/twitter-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/twitter-cover.png -------------------------------------------------------------------------------- /crawler.bat: 
-------------------------------------------------------------------------------- 1 | go install github.com/daviddengcn/gcse/crawler 2 | @if errorlevel 1 goto exit 3 | %GOPATH%\bin\crawler 4 | 5 | :exit 6 | -------------------------------------------------------------------------------- /indexer.bat: -------------------------------------------------------------------------------- 1 | go install github.com/daviddengcn/gcse/indexer 2 | @if errorlevel 1 goto exit 3 | %GOPATH%\bin\indexer 4 | 5 | :exit 6 | -------------------------------------------------------------------------------- /server.bat: -------------------------------------------------------------------------------- 1 | go install github.com/daviddengcn/gcse/server 2 | @if errorlevel 1 goto exit 3 | %GOPATH%\bin\server 4 | 5 | :exit 6 | -------------------------------------------------------------------------------- /service/web/resource/logo-error-128.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-128.png -------------------------------------------------------------------------------- /service/web/resource/round-logo-256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/round-logo-256.png -------------------------------------------------------------------------------- /service/web/images/glyphicons-halflings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/glyphicons-halflings.png -------------------------------------------------------------------------------- /service/web/resource/magnifying_glass_black.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/magnifying_glass_black.png -------------------------------------------------------------------------------- /spider/github/testdata/pkg_test.go: -------------------------------------------------------------------------------- 1 | package pkg 2 | 3 | import ( 4 | "github.com/golangplus/testing/assert" 5 | ) 6 | 7 | var _ = assert.Equal 8 | -------------------------------------------------------------------------------- /service/web/images/glyphicons-halflings-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/glyphicons-halflings-white.png -------------------------------------------------------------------------------- /shared/proto/store.go: -------------------------------------------------------------------------------- 1 | package gcsepb 2 | 3 | func (m *Repository) PutPackage(path string, pkg *Package) { 4 | if m.Packages == nil { 5 | m.Packages = make(map[string]*Package) 6 | } 7 | m.Packages[path] = pkg 8 | } 9 | -------------------------------------------------------------------------------- /spider/github/testdata/pkg.go: -------------------------------------------------------------------------------- 1 | package pkg 2 | 3 | import ( 4 | "github.com/daviddengcn/gcse/spider/github" 5 | strs "github.com/golangplus/strings" 6 | ) 7 | 8 | var _ = github.ErrInvalidPackage 9 | var _ = strs.Get 10 | -------------------------------------------------------------------------------- /gcse.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package gcse is the core supporting library for go-code-search-engine (GCSE). 3 | Its exported types and functions are mainly for sub packages. If you want 4 | to use some of its functions, copy the code into your own project.
5 | */ 6 | package gcse 7 | -------------------------------------------------------------------------------- /bi.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "github.com/daviddengcn/go-easybi" 5 | ) 6 | 7 | func AddBiValueAndProcess(aggr bi.AggregateMethod, name string, value int) { 8 | bi.AddValue(aggr, name, value) 9 | bi.Flush() 10 | bi.Process() 11 | } 12 | -------------------------------------------------------------------------------- /scripts/crawler.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | APPS := []string { 4 | "tocrawl", "crawler", "mergedocs", "indexer", 5 | } 6 | 7 | for { 8 | for _, app := range APPS { 9 | Printf("Running %s...\n", app) 10 | Bash(app) 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /scripts/web.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | import "time" 4 | import "github.com/daviddengcn/gcse/configs" 5 | 6 | Printfln("Logging to %q...", configs.LogDir) 7 | 8 | for { 9 | Bash("web -log_dir %s", configs.LogDir) 10 | time.Sleep(time.Second) 11 | } 12 | -------------------------------------------------------------------------------- /scripts/stored.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | import "time" 4 | import "github.com/daviddengcn/gcse/configs" 5 | 6 | Printfln("Logging to %q...", configs.LogDir) 7 | 8 | for { 9 | Bash("stored -log_dir %s", configs.LogDir) 10 | time.Sleep(time.Second) 11 | } 12 | -------------------------------------------------------------------------------- /ACKNOWLEDGEMENTS: -------------------------------------------------------------------------------- 1 | (sorted by names) 2 | Alif Rachmawadi(subosito) Fix a bug on www server. 
3 | mipearson Creates the step-by-step document and some optimization that makes the command more robust. 4 | Robert Melton(@robertmeta) Textual refining and code refactoring. 5 | -------------------------------------------------------------------------------- /service/web/web/404.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "404" "404"}} 2 |
3 |

4 | 5 | Page "{{.Path}}" not found, please go to another one. 6 |

7 |
8 | {{template "footer.html"}} 9 | -------------------------------------------------------------------------------- /service/web/db_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/testing/assert" 7 | ) 8 | 9 | func TestFindFullPackage_NotFound(t *testing.T) { 10 | db := &searcherDB{} 11 | _, found := db.FindFullPackage("abc") 12 | assert.False(t, "found", found) 13 | } 14 | -------------------------------------------------------------------------------- /chrome-app/go-search/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Go Search", 3 | "description": "Find popular and relevant Go packages.", 4 | "version": "1.0", 5 | "manifest_version": 2, 6 | "app": { 7 | "launch": { 8 | "web_url": "http://go-search.org/" 9 | } 10 | }, 11 | "icons": { 12 | "16": "logo-16.png", 13 | "128": "logo-128.png" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /spider/godocorg/packages_test.go: -------------------------------------------------------------------------------- 1 | package godocorg 2 | 3 | import ( 4 | "net/http" 5 | "testing" 6 | 7 | "github.com/golangplus/testing/assert" 8 | ) 9 | 10 | func TestFetchAllPackagesInGodoc(t *testing.T) { 11 | pkgs, err := FetchAllPackagesInGodoc(http.DefaultClient) 12 | assert.NoError(t, err) 13 | 14 | if len(pkgs) == 0 { 15 | t.Errorf("No packages returned!") 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /service/web/web/searchbox.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | 6 |
7 |
8 |
9 | -------------------------------------------------------------------------------- /scripts/gen_proto.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | import "path/filepath" 4 | 5 | const GCSE = "github.com/daviddengcn/gcse" 6 | 7 | protoPath, _ := filepath.Abs("shared/proto/*.proto") 8 | 9 | gopath := Eval("go", "env", "GOPATH") + "/src" 10 | Printfln("protoc --proto_path %[1]s --go_out %[1]s %s", gopath, protoPath) 11 | Bash("protoc --proto_path %[1]s --go_out plugins=grpc:%[1]s %s ", gopath, protoPath) 12 | -------------------------------------------------------------------------------- /shared/proto/spider.go: -------------------------------------------------------------------------------- 1 | package gcsepb 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/golang/protobuf/ptypes" 7 | ) 8 | 9 | func (ci *CrawlingInfo) CrawlingTimeAsTime() time.Time { 10 | t, _ := ptypes.Timestamp(ci.GetCrawlingTime()) 11 | return t 12 | } 13 | 14 | func (ci *CrawlingInfo) SetCrawlingTime(t time.Time) *CrawlingInfo { 15 | ci.CrawlingTime, _ = ptypes.TimestampProto(t) 16 | return ci 17 | } 18 | -------------------------------------------------------------------------------- /shared/proto/stored.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package gcse; 4 | 5 | import "github.com/daviddengcn/gcse/shared/proto/spider.proto"; 6 | 7 | option go_package = "gcsepb"; 8 | 9 | message PackageCrawlHistoryReq { 10 | string package = 1; 11 | } 12 | 13 | message PackageCrawlHistoryResp { 14 | HistoryInfo info = 1; 15 | } 16 | 17 | service StoreService { 18 | rpc PackageCrawlHistory(PackageCrawlHistoryReq) returns (PackageCrawlHistoryResp); 19 | } 20 | -------------------------------------------------------------------------------- /spider/github/utils.go: -------------------------------------------------------------------------------- 1 | package github 
2 | 3 | import ( 4 | "github.com/google/go-github/github" 5 | ) 6 | 7 | func getString(s *string) string { 8 | if s == nil { 9 | return "" 10 | } 11 | return *s 12 | } 13 | 14 | func getInt(i *int) int { 15 | if i == nil { 16 | return 0 17 | } 18 | return *i 19 | } 20 | 21 | func getTimestamp(ts *github.Timestamp) github.Timestamp { 22 | if ts == nil { 23 | return github.Timestamp{} 24 | } 25 | return *ts 26 | } 27 | -------------------------------------------------------------------------------- /pipelines/indexer/imain.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "runtime" 6 | 7 | "github.com/daviddengcn/gcse/configs" 8 | ) 9 | 10 | func main() { 11 | runtime.GOMAXPROCS(2) 12 | log.Println("indexer started...") 13 | 14 | if err := configs.IndexSegments().ClearUndones(); err != nil { 15 | log.Printf("ClearUndones failed: %v", err) 16 | } 17 | 18 | if err := clearOutdatedIndex(); err != nil { 19 | log.Printf("clearOutdatedIndex failed: %v", err) 20 | } 21 | doIndex() 22 | 23 | log.Println("indexer exits...") 24 | } 25 | -------------------------------------------------------------------------------- /utils/utils_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/testing/assert" 7 | ) 8 | 9 | func TestSplitPackage(t *testing.T) { 10 | for _, c := range []struct { 11 | pkg string 12 | site string 13 | path string 14 | }{ 15 | {"github.com/daviddengcn", "github.com", "daviddengcn"}, 16 | {"github.com", "github.com", ""}, 17 | {"", "", ""}, 18 | } { 19 | site, path := SplitPackage(c.pkg) 20 | assert.Equal(t, "site", site, c.site) 21 | assert.Equal(t, "path", path, c.path) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /utils/json.go: -------------------------------------------------------------------------------- 
1 | package utils 2 | 3 | import ( 4 | "encoding/json" 5 | "os" 6 | ) 7 | 8 | func WriteJsonFile(fn string, data interface{}) error { 9 | f, err := os.Create(fn) 10 | if err != nil { 11 | return err 12 | } 13 | defer f.Close() 14 | 15 | enc := json.NewEncoder(f) 16 | return enc.Encode(data) 17 | } 18 | 19 | func ReadJsonFile(fn string, data interface{}) error { 20 | f, err := os.Open(fn) 21 | if err != nil { 22 | return err 23 | } 24 | defer f.Close() 25 | 26 | dec := json.NewDecoder(f) 27 | return dec.Decode(data) 28 | } 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.gob 24 | *.bolt 25 | 26 | /conf.json 27 | /data/ 28 | /logs/ 29 | /scripts/backup-conf.json 30 | /server/server 31 | /pipelines/crawler/crawler 32 | /pipelines/indexer/indexer 33 | /pipelines/mergedocs/mergedocs 34 | /pipelines/tocrawl/tocrawl 35 | -------------------------------------------------------------------------------- /tokenize_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/strings" 7 | "github.com/golangplus/testing/assert" 8 | ) 9 | 10 | func TestTokenize(t *testing.T) { 11 | text := []byte("abc 3d 中文输入") 12 | tokens := AppendTokens(nil, text) 13 | assert.Equal(t, "tokens", tokens, 14 | stringsp.NewSet("abc", "3", "d", "3-d", "中", "文", "输", "入", "中文", "文输", "输入")) 15 | } 16 | 17 | func TestTokenize2(t *testing.T) { 18 | text := []byte("PubSubHub") 19 | tokens := AppendTokens(nil, text) 20 | 
assert.Equal(t, "tokens", tokens, 21 | stringsp.NewSet("pub", "sub", "hub", "pubsub", "subhub", "pubsubhub")) 22 | } 23 | -------------------------------------------------------------------------------- /scripts/testall.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | GCSE := "github.com/daviddengcn/gcse" 4 | 5 | APPS := []string { 6 | "service/web", "service/stored", "pipelines/tocrawl", "pipelines/crawler", "pipelines/mergedocs", "pipelines/indexer", "pipelines/spider", "store", "spider", 7 | } 8 | 9 | Exec("go", "fmt", GCSE) 10 | Printfln("Testing %s ...", GCSE) 11 | MustSucc(Bash("go test %s", GCSE)) 12 | 13 | for _, app := range APPS { 14 | Exec("go", "fmt", S("%s/%s", GCSE, app)) 15 | MustSucc(Bash("go vet %s/*.go", app)) 16 | Printf("Testing %s ...\n", app) 17 | MustSucc(Bash("go test %s/%s", GCSE, app)) 18 | } 19 | 20 | Println("All tests passed!") 21 | -------------------------------------------------------------------------------- /service/web/web/crawlhistory.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "Crawl History" "crawlhistory"}} 2 | {{if .FoundTime}} 3 |
4 | Found at {{.FoundTime}} by {{.FoundWay}}
5 | {{if .LatestSuccess}}
Latest success at {{.LatestSuccess}}{{end}} 6 | {{if .LatestFailed}}
Latest failed at {{.LatestFailed}}{{end}} 7 |
8 |
9 |
{{end}} 10 | 11 | 12 | 13 | 14 | 15 | {{range .Events}} 16 | 17 | {{end}} 18 | 19 |
TimeAction
{{.Time}}{{.Action}}
20 | {{template "footer.html"}} 21 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "time" 5 | 6 | "github.com/howeyc/fsnotify" 7 | ) 8 | 9 | func ClearWatcherEvents(watcher *fsnotify.Watcher) { 10 | return 11 | /* 12 | for { 13 | select { 14 | case <-watcher.Event: 15 | case err := <-watcher.Error: 16 | log.Printf("Wather.Error: %v", err) 17 | default: 18 | break 19 | } 20 | } 21 | */ 22 | } 23 | 24 | func WaitForWatcherEvents(watcher *fsnotify.Watcher) { 25 | time.Sleep(10 * time.Second) 26 | return 27 | /* 28 | for { 29 | select { 30 | case <-watcher.Event: 31 | case err := <-watcher.Error: 32 | log.Println("Wather.Error: %v", err) 33 | } 34 | } 35 | */ 36 | } 37 | -------------------------------------------------------------------------------- /conf.json.template: -------------------------------------------------------------------------------- 1 | { 2 | web: { 3 | // addr: ":8080" 4 | // root: "./server/" 5 | // loadtemplatepass: "" 6 | // autoloadtemplate: false 7 | } 8 | 9 | back: { 10 | // dbroot: "./data/" 11 | } 12 | 13 | // log: { 14 | // dir: "/tmp" 15 | // } 16 | crawler: { 17 | // due_per_run: "1h" 18 | // godoc: true 19 | // github_update: true 20 | // noncrawl_hosts: [] 21 | // github: { 22 | // clientid: "" 23 | // clientsecret: "" 24 | // personal: "" 25 | // } 26 | } 27 | 28 | docdb: { 29 | // nonstore_regexps: [] 30 | } 31 | 32 | bi: { 33 | // data_path: "/tmp/gcse.bolt" 34 | // web_path: "/bi" 35 | } 36 | 37 | // stored: { 38 | // addr: ":8081" 39 | // } 40 | } 41 | -------------------------------------------------------------------------------- /service/web/web/badgepage.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "Badge" "badge"}} 2 | 3 | 8 | 9 |

10 | 11 |
12 |
13 |
14 | 15 | 16 |
17 |
18 | 19 | 20 |
21 |
22 |
23 | 24 | 25 | {{template "footer.html"}} -------------------------------------------------------------------------------- /tools/fillfound/fillfound.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/golangplus/time" 7 | 8 | "github.com/daviddengcn/gcse" 9 | "github.com/daviddengcn/gcse/store" 10 | "github.com/daviddengcn/gcse/utils" 11 | 12 | sppb "github.com/daviddengcn/gcse/proto/spider" 13 | ) 14 | 15 | func doFill() error { 16 | cDB := gcse.LoadCrawlerDB() 17 | return cDB.PackageDB.Iterate(func(pkg string, val interface{}) error { 18 | ent, ok := val.(gcse.CrawlingEntry) 19 | if !ok { 20 | log.Printf("Wrong entry, ignored: %+v", ent) 21 | return nil 22 | } 23 | site, path := utils.SplitPackage(pkg) 24 | return store.AppendPackageEvent(site, path, "unknown", ent.ScheduleTime.Add(-10*timep.Day), sppb.HistoryEvent_Action_None) 25 | }) 26 | } 27 | 28 | func main() { 29 | if err := doFill(); err != nil { 30 | log.Fatalf("doFill failed: %v", err) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /pipelines/crawler/imports.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/daviddengcn/gcse" 7 | "github.com/daviddengcn/gcse/configs" 8 | ) 9 | 10 | // processImports processes submitted packages (from the go-search.org/add path). 11 | func processImports() error { 12 | dones, err := configs.ImportSegments().ListDones() 13 | if err != nil { 14 | return err 15 | } 16 | for _, segm := range dones { 17 | log.Printf("Processing done segment %v ...", segm) 18 | pkgs, err := gcse.ReadPackages(segm) 19 | if err != nil { 20 | log.Printf("ReadPackages %v failed: %v", segm, err) 21 | } 22 | if len(pkgs) > 0 { 23 | log.Printf("Importing %d packages ...", len(pkgs)) 24 | for _, pkg := range pkgs { 25 | appendNewPackage(pkg, "web") 26 | } 27 | } 28 | if err :=
segm.Remove(); err != nil { 29 | log.Printf("Remove %v failed: %v", segm, err) 30 | } 31 | } 32 | syncDatabases() 33 | 34 | return nil 35 | } 36 | -------------------------------------------------------------------------------- /service/web/web/tops.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "Top" "top"}} 2 | 3 |
4 | {{range .Lists}} 5 |
6 |
7 |
{{.Info}}
8 |
{{.Name}}
9 |
10 | 19 |
20 | {{end}} 21 |
22 | show more 23 | | JSON 24 |
25 |
26 | {{template "footer.html"}} 27 | -------------------------------------------------------------------------------- /tools/countdocs/countdocs.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/golangplus/fmt" 7 | 8 | "github.com/daviddengcn/gcse" 9 | "github.com/daviddengcn/sophie" 10 | "github.com/daviddengcn/sophie/kv" 11 | ) 12 | 13 | func main() { 14 | // path := "data/docs" 15 | path := "data/docs-updated" 16 | kvDir := kv.DirInput(sophie.LocalFsPath(path)) 17 | 18 | cnt, err := kvDir.PartCount() 19 | if err != nil { 20 | log.Fatalf("kvDir.PartCount failed: %v", err) 21 | } 22 | 23 | totalEntries := 0 24 | for i := 0; i < cnt; i++ { 25 | it, err := kvDir.Iterator(i) 26 | if err != nil { 27 | log.Fatalf("kvDir.Collector(%d) failed: %v", i, err) 28 | } 29 | 30 | var key sophie.RawString 31 | var val gcse.DocInfo 32 | for { 33 | if err := it.Next(&key, &val); err != nil { 34 | if err == sophie.EOF { 35 | break 36 | } 37 | log.Fatalf("it.Next failed %v", err) 38 | } 39 | totalEntries++ 40 | } 41 | 42 | it.Close() 43 | } 44 | 45 | fmtp.Printfln("Total %d files, %d entries.", cnt, totalEntries) 46 | } 47 | -------------------------------------------------------------------------------- /service/web/web/add.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "Add Packages" "add"}} 2 | {{if .Message }} 3 |
4 | 8 |
9 | {{end}} 10 |
11 |
12 |
13 | 14 | 15 |

One package a line. No quotes.

16 |
17 | 18 |
19 |
20 | 25 | {{template "footer.html"}} 26 | -------------------------------------------------------------------------------- /tools/fillfound/fillfound_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/golang/protobuf/ptypes" 8 | "github.com/golangplus/testing/assert" 9 | "github.com/golangplus/time" 10 | 11 | "github.com/daviddengcn/gcse" 12 | "github.com/daviddengcn/gcse/configs" 13 | "github.com/daviddengcn/gcse/store" 14 | 15 | sppb "github.com/daviddengcn/gcse/proto/spider" 16 | ) 17 | 18 | func init() { 19 | configs.SetTestingDataPath() 20 | } 21 | 22 | func TestDoFill(t *testing.T) { 23 | const ( 24 | site = "github.com" 25 | path = "daviddengcn/gcse" 26 | ) 27 | tm := time.Now().Add(-20 * timep.Day) 28 | cDB := gcse.LoadCrawlerDB() 29 | cDB.PackageDB.Put(site+"/"+path, gcse.CrawlingEntry{ 30 | ScheduleTime: tm.Add(10 * timep.Day), 31 | }) 32 | assert.NoError(t, cDB.Sync()) 33 | 34 | assert.NoError(t, doFill()) 35 | 36 | h, err := store.ReadPackageHistory(site, path) 37 | assert.NoError(t, err) 38 | ts, _ := ptypes.TimestampProto(tm) 39 | assert.Equal(t, "h", h, &sppb.HistoryInfo{ 40 | FoundTime: ts, 41 | FoundWay: "unknown", 42 | }) 43 | } 44 | -------------------------------------------------------------------------------- /scripts/backup.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | import "flag" 4 | import "github.com/daviddengcn/go-villa" 5 | import "github.com/daviddengcn/go-ljson-conf" 6 | 7 | backupFolders := flag.String("folder", "docs:crawler", "Colon-delimited folders to backup.") 8 | 9 | flag.Parse() 10 | 11 | dir := villa.Path(ScriptDir()) 12 | 13 | conf, _ := ljconf.Load(dir.Join("backup-conf.json").S()) 14 | 15 | fdid := conf.String("gdrive.folder.id", "") 16 | if fdid == "" { 17 | Fatalf("Please set gdrive.folder.id in configuration!") 18 | } 19 | 20 | today 
:= Now().Format("2006-01-02") 21 | Printf("Backup to %s\n", today) 22 | 23 | folders := Split(*backupFolders, ":") 24 | 25 | Println("Compressing files") 26 | for _, folder := range folders { 27 | Printfln("Compressing data/%s into data/%s.%s.tar.gz", folder, folder, today) 28 | MustSucc(Bash("tar czf data/%s.%s.tar.gz data/%s", folder, today, folder)) 29 | } 30 | 31 | Println("Uploading to GDrive") 32 | for _, folder := range folders { 33 | MustSucc(Bash("gdrive upload -f data/%s.%s.tar.gz -p %s", folder, today, fdid)) 34 | Bash("rm data/%s.%s.tar.gz", folder, today) 35 | } 36 | 37 | Println("Backup finished") 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Yi Deng 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 7 | -------------------------------------------------------------------------------- /scripts/install.gs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env gosl 2 | 3 | import "flag" 4 | 5 | goGet := flag.Bool("go_get", true, `Whether do "go get" before installing`) 6 | doTest := flag.Bool("do_test", false, `Whether do "go test" on essential packages`) 7 | compileAll := flag.Bool("a", true, `Whether use -a in go install command`) 8 | 9 | flag.Parse() 10 | 11 | const GCSE = "github.com/daviddengcn/gcse" 12 | APPS := []string { 13 | "pipelines/tocrawl", "pipelines/crawler", "pipelines/mergedocs", "pipelines/indexer", "service/stored", "service/web", 14 | } 15 | 16 | if *goGet { 17 | Printfln("go get -u -v %s", GCSE) 18 | MustSucc(Bash("go get -u -v %s", GCSE)) 19 | for _, a := range APPS { 20 | Printfln("go get -u -v %s/%s", GCSE, a) 21 | MustSucc(Bash("go get -u -v %s/%s", GCSE, a)) 22 | } 23 | } 24 | 25 | if *doTest { 26 | Println("go test -a") 27 | MustSucc(Bash("go test -a")) 28 | Println("go test store/*.go") 29 | MustSucc(Bash("go test store/*.go")) 30 | Println("go test spider/*.go") 31 | MustSucc(Bash("go test spider/*.go")) 32 | } 33 | 34 | buildFlags := "" 35 | if *compileAll { 36 | buildFlags += " -a" 37 | } 38 | 39 | for _, a := range APPS { 40 | Printfln("go install %s %s/%s", buildFlags, GCSE, a) 41 | MustSucc(Bash("go install %s %s/%s", buildFlags, GCSE, a)) 42 | } 43 | 44 | 
-------------------------------------------------------------------------------- /service/stored/stored.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | "net" 7 | 8 | "github.com/daviddengcn/gcse/configs" 9 | "github.com/daviddengcn/gcse/store" 10 | "github.com/daviddengcn/gcse/utils" 11 | "github.com/golang/glog" 12 | "google.golang.org/grpc" 13 | 14 | gpb "github.com/daviddengcn/gcse/shared/proto" 15 | ) 16 | 17 | type server struct { 18 | } 19 | 20 | var _ gpb.StoreServiceServer = (*server)(nil) 21 | 22 | func (s *server) PackageCrawlHistory(_ context.Context, req *gpb.PackageCrawlHistoryReq) (*gpb.PackageCrawlHistoryResp, error) { 23 | site, path := utils.SplitPackage(req.Package) 24 | info, err := store.ReadPackageHistory(site, path) 25 | if err != nil { 26 | glog.Errorf("ReadPackageHistoryOf %q %q failed: %v", site, path, err) 27 | return nil, err 28 | } 29 | return &gpb.PackageCrawlHistoryResp{Info: info}, nil 30 | } 31 | 32 | func main() { 33 | addr := flag.String("addr", configs.StoreDAddr, "addr to listen") 34 | 35 | flag.Parse() 36 | 37 | glog.Infof("Listening to %s", *addr) 38 | lis, err := net.Listen("tcp", *addr) 39 | if err != nil { 40 | glog.Fatalf("failed to listen: %v", err) 41 | } 42 | grpcServer := grpc.NewServer() 43 | gpb.RegisterStoreServiceServer(grpcServer, &server{}) 44 | grpcServer.Serve(lis) 45 | } 46 | -------------------------------------------------------------------------------- /shared/proto/store.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package gcse; 4 | 5 | option go_package = "gcsepb"; 6 | 7 | import "github.com/daviddengcn/gcse/shared/proto/spider.proto"; 8 | 9 | message PackageInfo { 10 | string name = 1; 11 | string package = 2; 12 | string author = 3; 13 | int32 stars = 4; 14 | string synopsis = 5; 15 | string description = 6; 16 | string project_url = 
7; 17 | string readme_fn = 8; 18 | string readme_data = 9; 19 | repeated string imports = 10; 20 | repeated string test_imports = 11; 21 | repeated string exported = 12; 22 | repeated string references = 18; 23 | 24 | CrawlingInfo crawling_info = 17; 25 | 26 | // Available if the package is not the repo's root. 27 | FolderInfo folder_info = 14; 28 | 29 | // Available if the package is the repo's root. 30 | RepoInfo repo_info = 15; 31 | } 32 | 33 | message PersonInfo { 34 | CrawlingInfo crawling_info = 1; 35 | } 36 | 37 | message Repository { 38 | string branch = 6; 39 | string signature = 7; 40 | 41 | // map from relative path, e.g. "proto/store", to Package 42 | map packages = 8; 43 | 44 | string ReadmeFn = 2; // No directory info 45 | string ReadmeData = 3; // Raw content, cound be md, txt, etc. 46 | int32 stars = 4; 47 | 48 | CrawlingInfo crawling_info = 5; 49 | } 50 | -------------------------------------------------------------------------------- /spider/godocorg/packages.go: -------------------------------------------------------------------------------- 1 | package godocorg 2 | 3 | import ( 4 | "encoding/json" 5 | "net/http" 6 | 7 | "github.com/daviddengcn/gddo/doc" 8 | "github.com/golangplus/errors" 9 | ) 10 | 11 | const ( 12 | godocApiUrl = "http://api.godoc.org/packages" 13 | ) 14 | 15 | // FetchAllPackagesInGodoc fetches the list of all packages on godoc.org 16 | func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error) { 17 | req, err := http.NewRequest("GET", godocApiUrl, nil) 18 | if err != nil { 19 | return nil, errorsp.WithStacksAndMessage(err, "new request for %v failed", godocApiUrl) 20 | } 21 | resp, err := httpClient.Do(req) 22 | if err != nil { 23 | return nil, errorsp.WithStacksAndMessage(err, "fetching %v failed", godocApiUrl) 24 | } 25 | defer resp.Body.Close() 26 | if resp.StatusCode != 200 { 27 | return nil, errorsp.NewWithStacks("StatusCode: %d", resp.StatusCode) 28 | } 29 | var results struct { 30 | Results []struct { 31 
| Path string 32 | } 33 | } 34 | dec := json.NewDecoder(resp.Body) 35 | 36 | if err := dec.Decode(&results); err != nil { 37 | return nil, errorsp.WithStacks(err) 38 | } 39 | list := make([]string, 0, len(results.Results)) 40 | for _, res := range results.Results { 41 | list = append(list, res.Path) 42 | } 43 | return list, nil 44 | } 45 | -------------------------------------------------------------------------------- /db_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/testing/assert" 7 | 8 | "github.com/daviddengcn/go-villa" 9 | ) 10 | 11 | func TestMemDB_Bug_Sync(t *testing.T) { 12 | path := villa.Path(".").Join("testmemdb.gob") 13 | if path.Exists() { 14 | path.Remove() 15 | } 16 | 17 | db := NewMemDB(".", "testmemdb") 18 | db.Put("s", 1) 19 | err := db.Sync() 20 | if err != nil { 21 | t.Error(err) 22 | } 23 | 24 | assert.Equal(t, "Exists", path.Exists(), true) 25 | if err := path.Remove(); err != nil { 26 | t.Error(err) 27 | } 28 | assert.Equal(t, "Exists", path.Exists(), false) 29 | 30 | //if err := db.Load(); err != nil { 31 | // t.Error(err) 32 | //} 33 | } 34 | 35 | func TestMemDB_Recover(t *testing.T) { 36 | path := villa.Path(".").Join("testmemdb.gob") 37 | if path.Exists() { 38 | path.Remove() 39 | } 40 | 41 | db := NewMemDB(".", "testmemdb") 42 | db.Put("s", 1) 43 | if err := db.Sync(); err != nil { 44 | t.Error(err) 45 | return 46 | } 47 | 48 | if err := path.Rename(path + ".new"); err != nil { 49 | t.Error(err) 50 | return 51 | } 52 | // Now in the status of fn.new exists, fn not exist 53 | 54 | if err := db.Load(); err != nil { 55 | t.Error(err) 56 | return 57 | } 58 | var vl int 59 | if ok := db.Get("s", &vl); !ok { 60 | t.Error("Recover failed!") 61 | return 62 | } 63 | assert.Equal(t, "vl", vl, 1) 64 | } 65 | -------------------------------------------------------------------------------- /service/web/add.go: 
-------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "net/http" 7 | "strings" 8 | 9 | "github.com/daviddengcn/gcse" 10 | "github.com/daviddengcn/gddo/doc" 11 | ) 12 | 13 | func filterPackages(pkgs []string) (res []string) { 14 | for _, pkg := range pkgs { 15 | pkg = gcse.TrimPackageName(pkg) 16 | if !doc.IsValidRemotePath(pkg) { 17 | continue 18 | } 19 | res = append(res, pkg) 20 | } 21 | return 22 | } 23 | 24 | func pageAdd(w http.ResponseWriter, r *http.Request) { 25 | w.Header().Set("Content-Type", "text/html") 26 | 27 | pkgsStr := r.FormValue("pkg") 28 | pkgMessage := "" 29 | msgCls := "success" 30 | taValue := "" 31 | if pkgsStr != "" { 32 | pkgs := filterPackages(strings.Split(pkgsStr, "\n")) 33 | if len(pkgs) > 0 { 34 | log.Printf("%d packages added!", len(pkgs)) 35 | pkgMessage = fmt.Sprintf("Totally %d package(s) added!", len(pkgs)) 36 | gcse.AppendPackages(pkgs) 37 | } else { 38 | msgCls = "danger" 39 | pkgMessage = "No package added! Check the format you submitted, please." 40 | taValue = pkgsStr 41 | } 42 | } 43 | err := templates.ExecuteTemplate(w, "add.html", struct { 44 | UIUtils 45 | Message string 46 | MsgCls string 47 | TAValue string 48 | }{ 49 | Message: pkgMessage, 50 | MsgCls: msgCls, 51 | TAValue: taValue, 52 | }) 53 | if err != nil { 54 | http.Error(w, err.Error(), http.StatusInternalServerError) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Go Search [![GoSearch](http://go-search.org/badge?id=github.com%2Fdaviddengcn%2Fgcse)](http://go-search.org/view?id=github.com%2Fdaviddengcn%2Fgcse) 2 | ========= 3 | 4 | A keyword search engine helping people to find popular and relevant Go packages. 5 | 6 | Online service: [Go Search](http://go-search.org/) 7 | 8 | This is the root package with shared functions. 
9 | 10 | Sub packages are commands for running: 11 | 12 | * [HTTP Server](http://github.com/daviddengcn/gcse/server): Searching and web service. 13 | * [ToCrawl](http://github.com/daviddengcn/gcse/tocrawl): Find packages to crawl. 14 | * [Crawler](http://github.com/daviddengcn/gcse/crawler): Crawling package files. 15 | * [MergeDocs](http://github.com/daviddengcn/gcse/mergedocs): Merge crawled package files with doc DB. 16 | * [Indexer](http://github.com/daviddengcn/gcse/indexer): Analyzing package information and generating indexed data for searching. 17 | 18 | Development 19 | ----------- 20 | 21 | You'll need to perform the following steps to get a basic server running: 22 | 23 | 1. Create a basic `conf.json` file, limiting the crawler to a one minute run: `{ "crawler": { "due_per_run": "1m" } }` 24 | 1. Run the package finder: `go run tocrawl/*.go` 25 | 1. Run the crawler: `go run crawler/*.go` 26 | 1. Merge the crawled docs: `go run mergedocs/*.go` 27 | 1. Run the indexer: `go run indexer/*.go` 28 | 1. Run the server: `go run server/*.go` 29 | 1. Visit [http://localhost:8080](http://localhost:8080) in your browser 30 | 31 | 32 | LICENSE 33 | ------- 34 | BSD license. 35 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, David Deng 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 3. 
Neither the name of the PostgreSQL Global Development Group nor the names 13 | of its contributors may be used to endorse or promote products derived 14 | from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /utils/utils.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "runtime" 7 | "strings" 8 | ) 9 | 10 | func SplitPackage(pkg string) (site, path string) { 11 | parts := strings.SplitN(pkg, "/", 2) 12 | if len(parts) > 0 { 13 | site = parts[0] 14 | } 15 | if len(parts) > 1 { 16 | path = parts[1] 17 | } 18 | return site, path 19 | } 20 | 21 | // LogError is used to ignore an error but log it. 
func LogError(err error, format string, args ...interface{}) {
	if err == nil {
		return
	}
	// The caller's formatted message comes first, followed by the error text.
	log.Printf("%s: %v", fmt.Sprintf(format, args...), err)
}

// Size is a byte count that formats itself in a compact, human-readable form.
type Size int64

// String renders s scaled by the largest power of 1024 that does not exceed
// it, followed by a one-letter unit ("", K, M, G, T, P or E).
//
// The scaled value is printed with two decimals when its integer part is
// below 10, one decimal when below 100, and as a plain integer otherwise,
// e.g. "1.50K", "20.0K", "200M".
func (s Size) String() string {
	var unit string
	var base int64
	switch {
	case s < 1<<10:
		unit, base = "", 1
	case s < 1<<20:
		unit, base = "K", 1<<10
	case s < 1<<30:
		unit, base = "M", 1<<20
	case s < 1<<40:
		unit, base = "G", 1<<30
	case s < 1<<50:
		unit, base = "T", 1<<40
	case s < 1<<60:
		unit, base = "P", 1<<50
	default:
		// Values of 1024^6 and above used to match no case, leaving base at
		// zero and making the division below panic.
		unit, base = "E", 1<<60
	}

	scaled := float64(s) / float64(base)
	switch whole := int64(s) / base; {
	case whole < 10:
		return fmt.Sprintf("%.2f%s", scaled, unit)
	case whole < 100:
		return fmt.Sprintf("%.1f%s", scaled, unit)
	default:
		return fmt.Sprintf("%d%s", whole, unit)
	}
}

// DumpMemStats logs the Alloc, TotalAlloc and Sys figures reported by
// runtime.ReadMemStats, formatted as Size values, together with the current
// number of goroutines.
func DumpMemStats() {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	log.Printf("[MemStats] Alloc: %v, TotalAlloc: %v, Sys: %v, Go: %d",
		Size(ms.Alloc), Size(ms.TotalAlloc), Size(ms.Sys),
		runtime.NumGoroutine())
}
29 | countAll, countReadme, countHasSents := 0, 0, 0 30 | countSents := 0 31 | 32 | f, err := villa.Path("exps/notfound.txt").Create() 33 | if err != nil { 34 | log.Fatal(err) 35 | } 36 | defer f.Close() 37 | 38 | log.Printf("Start processing ...") 39 | if err := docDB.Iterate(func(key string, val interface{}) error { 40 | countAll++ 41 | 42 | d := val.(gcse.DocInfo) 43 | if d.ReadmeData != "" { 44 | countReadme++ 45 | 46 | readme := gcse.ReadmeToText(d.ReadmeFn, d.ReadmeData) 47 | 48 | sents := gcse.ChooseImportantSentenses(readme, d.Name, d.Package) 49 | if len(sents) > 0 { 50 | countSents += len(sents) 51 | countHasSents++ 52 | } else { 53 | fmt.Fprintln(f, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") 54 | fmt.Fprintf(f, "%s - %s - %s\n", d.Name, d.Package, d.ReadmeFn) 55 | fmt.Fprintf(f, "%s\n", readme) 56 | } 57 | } 58 | 59 | return nil 60 | }); err != nil { 61 | log.Fatalf("docDB.Iterate failed: %v", err) 62 | } 63 | 64 | log.Printf("%d documents processed.", countAll) 65 | log.Printf("%d have readme.", countReadme) 66 | log.Printf("%d found %d important sentenses.", countHasSents, countSents) 67 | } 68 | -------------------------------------------------------------------------------- /spider/ranking_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | "time" 7 | 8 | "github.com/golang/protobuf/ptypes" 9 | "github.com/golangplus/testing/assert" 10 | "github.com/golangplus/time" 11 | 12 | gpb "github.com/daviddengcn/gcse/shared/proto" 13 | ) 14 | 15 | func TestLikeGoSubFolder(t *testing.T) { 16 | pos_cases := []string{ 17 | "go", "v8", "v-8", 18 | } 19 | for _, c := range pos_cases { 20 | assert.True(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c)) 21 | } 22 | neg_cases := []string{ 23 | "js", "1234", "1234-5678", "1234_5678", 24 | } 25 | for _, c := range neg_cases { 26 | assert.False(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c)) 
27 | } 28 | } 29 | 30 | func TestCheckPackageStatus(t *testing.T) { 31 | // No crawling info, new package 32 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{}, nil), OutOfDate) 33 | pkgCrawlTime, _ := ptypes.TimestampProto(time.Now().Add(-5 * timep.Day)) 34 | 35 | newRepoInfoCrawlTime, _ := ptypes.TimestampProto(time.Now().Add(-3 * timep.Day)) 36 | newPkgUpdateTime, _ := ptypes.TimestampProto(time.Now().Add(-4 * timep.Day)) 37 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{ 38 | CrawlingInfo: &gpb.CrawlingInfo{ 39 | CrawlingTime: pkgCrawlTime, 40 | }, 41 | }, &gpb.RepoInfo{ 42 | CrawlingTime: newRepoInfoCrawlTime, 43 | LastUpdated: newPkgUpdateTime, 44 | }), OutOfDate) 45 | 46 | newPkgUpdateTime, _ = ptypes.TimestampProto(time.Now().Add(-6 * timep.Day)) 47 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{ 48 | CrawlingInfo: &gpb.CrawlingInfo{ 49 | CrawlingTime: pkgCrawlTime, 50 | }, 51 | }, &gpb.RepoInfo{ 52 | CrawlingTime: newRepoInfoCrawlTime, 53 | LastUpdated: newPkgUpdateTime, 54 | }), UpToDate) 55 | } 56 | -------------------------------------------------------------------------------- /data_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | 7 | "github.com/golangplus/bytes" 8 | "github.com/golangplus/testing/assert" 9 | 10 | "github.com/daviddengcn/go-index" 11 | ) 12 | 13 | func TestDocInfo(t *testing.T) { 14 | src := DocInfo{ 15 | Name: "gcse", 16 | Package: "github.com/daviddengcn/gcse", 17 | Author: "github.com/daviddengcn", 18 | LastUpdated: time.Now().Round(0), 19 | StarCount: 10, 20 | Synopsis: "Go Package Search Engine", 21 | Description: "More details about GCSE", 22 | ProjectURL: "http://github.com/daviddengcn/gcse", 23 | ReadmeFn: "readme.txt", 24 | ReadmeData: "Just read me", 25 | Imports: []string{ 26 | "github.com/daviddengcn/go-villa", 27 | 
"github.com/daviddengcn/sophie", 28 | }, 29 | TestImports: []string{ 30 | "github.com/daviddengcn/go-check", 31 | }, 32 | Exported: []string{ 33 | "DocInfo", "CheckRuneType", 34 | }, 35 | } 36 | var buf bytesp.Slice 37 | assert.NoError(t, src.WriteTo(&buf)) 38 | 39 | var dst DocInfo 40 | assert.NoError(t, dst.ReadFrom(&buf, -1)) 41 | dst.LastUpdated = dst.LastUpdated.Round(0) 42 | 43 | assert.StringEqual(t, "dst", dst, src) 44 | 45 | // checking the bug introduced by reusing slice 46 | dst2 := dst 47 | assert.StringEqual(t, "dst2.Imports[0]", dst2.Imports[0], 48 | "github.com/daviddengcn/go-villa") 49 | 50 | src.Imports[0] = "github.com/daviddengcn/go-assert" 51 | buf = nil 52 | assert.NoError(t, src.WriteTo(&buf)) 53 | assert.NoError(t, dst.ReadFrom(&buf, -1)) 54 | assert.StringEqual(t, "dst", dst, src) 55 | 56 | assert.StringEqual(t, "dst2.Imports[0]", dst2.Imports[0], 57 | "github.com/daviddengcn/go-villa") 58 | } 59 | 60 | func TestCheckRuneType_BOM(t *testing.T) { 61 | tp := CheckRuneType('A', 0xfeff) 62 | assert.Equal(t, "CheckRuneType(A, 0xfeff)", tp, index.TokenSep) 63 | } 64 | -------------------------------------------------------------------------------- /service/web/web/footer.html: -------------------------------------------------------------------------------- 1 |
2 | 19 | 20 | 21 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /service/web/crawlhistory.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "net/http" 5 | "strings" 6 | "time" 7 | 8 | "github.com/golang/glog" 9 | "github.com/golang/protobuf/ptypes" 10 | 11 | gpb "github.com/daviddengcn/gcse/shared/proto" 12 | ) 13 | 14 | func (s *server) pageCrawlHistory(w http.ResponseWriter, r *http.Request) { 15 | ctx := r.Context() 16 | 17 | w.Header().Set("Content-Type", "text/html") 18 | 19 | pkg := strings.ToLower(r.FormValue("id")) 20 | resp, err := s.storeClient.PackageCrawlHistory(ctx, &gpb.PackageCrawlHistoryReq{ 21 | Package: pkg, 22 | }) 23 | if err != nil { 24 | glog.Errorf("PackageCrawlHistory %q failed: %v", pkg, err) 25 | pageNotFound(w, r) 26 | return 27 | } 28 | hi := resp.Info 29 | type Event struct { 30 | Time time.Time 31 | Action string 32 | } 33 | events := make([]Event, 0, len(hi.Events)) 34 | for _, e := range hi.Events { 35 | t, _ := ptypes.Timestamp(e.Timestamp) 36 | events = append(events, Event{ 37 | Time: t, 38 | Action: e.Action.String(), 39 | }) 40 | } 41 | var foundTm, succTm, failedTm *time.Time 42 | if hi.FoundTime != nil { 43 | foundTm = &time.Time{} 44 | *foundTm, _ = ptypes.Timestamp(hi.FoundTime) 45 | } 46 | if hi.LatestSuccess != nil { 47 | succTm := &time.Time{} 48 | *succTm, _ = ptypes.Timestamp(hi.LatestSuccess) 49 | } 50 | if hi.LatestFailed != nil { 51 | failedTm := &time.Time{} 52 | *failedTm, _ = ptypes.Timestamp(hi.LatestFailed) 53 | } 54 | if err := templates.ExecuteTemplate(w, "crawlhistory.html", struct { 55 | UIUtils 56 | FoundTime *time.Time 57 | FoundWay string 58 | LatestSuccess *time.Time 59 | LatestFailed *time.Time 60 | Events []Event 61 | }{ 62 | FoundTime: foundTm, 63 | FoundWay: hi.FoundWay, 64 | LatestSuccess: succTm, 65 | LatestFailed: failedTm, 66 | Events: events, 67 | }); err != nil { 68 | 
http.Error(w, err.Error(), http.StatusInternalServerError) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /service/web/web/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | {{if index . 0}}{{index . 0}} - Go Search{{else}}Go Search - Find popular and relevant Go packages!{{end}} 7 | 8 | 9 | 10 | 11 | 12 | 32 |
33 | top 34 |
35 | 36 |
37 | -------------------------------------------------------------------------------- /store/repo_test.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/testing/assert" 7 | 8 | gpb "github.com/daviddengcn/gcse/shared/proto" 9 | ) 10 | 11 | func TestUpdateReadDeleteRepository(t *testing.T) { 12 | const ( 13 | site = "TestUpdateReadDeleteRepository.com" 14 | user = "daviddengcn" 15 | repo = "gcse" 16 | ) 17 | assert.NoError(t, UpdateRepository(site, user, repo, func(doc *gpb.Repository) error { 18 | assert.Equal(t, "doc", doc, &gpb.Repository{}) 19 | doc.Stars = 10 20 | return nil 21 | })) 22 | r, err := ReadRepository(site, user, repo) 23 | assert.NoError(t, err) 24 | assert.Equal(t, "r", r, &gpb.Repository{Stars: 10}) 25 | 26 | assert.NoError(t, DeleteRepository(site, user, repo)) 27 | 28 | r, err = ReadRepository(site, user, repo) 29 | assert.NoError(t, err) 30 | assert.Equal(t, "r", r, &gpb.Repository{}) 31 | } 32 | 33 | func TestForEachRepositorySite(t *testing.T) { 34 | cleanDatabase(t) 35 | 36 | const ( 37 | site = "TestForEachRepositorySite.com" 38 | user = "daviddengcn" 39 | repo = "gcse" 40 | ) 41 | assert.NoError(t, UpdateRepository(site, user, repo, func(doc *gpb.Repository) error { 42 | return nil 43 | })) 44 | var sites []string 45 | assert.NoError(t, ForEachRepositorySite(func(site string) error { 46 | sites = append(sites, site) 47 | return nil 48 | })) 49 | assert.Equal(t, "sites", sites, []string{site}) 50 | } 51 | 52 | func TestForEachRepositoryOfSite(t *testing.T) { 53 | const ( 54 | site = "TestForEachRepositoryOfSite.com" 55 | user = "daviddengcn" 56 | repo = "gcse" 57 | ) 58 | assert.NoError(t, UpdateRepository(site, user, repo, func(doc *gpb.Repository) error { 59 | doc.ReadmeData = "hello" 60 | return nil 61 | })) 62 | assert.NoError(t, ForEachRepositoryOfSite(site, func(u, r string, doc *gpb.Repository) error { 63 | 
assert.Equal(t, "user", u, user) 64 | assert.Equal(t, "repo", r, repo) 65 | assert.Equal(t, "doc", doc, &gpb.Repository{ReadmeData: "hello"}) 66 | return nil 67 | })) 68 | } 69 | -------------------------------------------------------------------------------- /spider/filecache_test.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | "github.com/golangplus/testing/assert" 9 | 10 | "github.com/daviddengcn/bolthelper" 11 | 12 | gpb "github.com/daviddengcn/gcse/shared/proto" 13 | ) 14 | 15 | func TestNullFileCache(t *testing.T) { 16 | c := NullFileCache{} 17 | c.Set("", nil) 18 | assert.False(t, "c.Get", c.Get("", nil)) 19 | } 20 | 21 | func TestBoltFileCache(t *testing.T) { 22 | fn := filepath.Join(os.TempDir(), "TestBoltFileCache.bolt") 23 | assert.NoErrorOrDie(t, os.RemoveAll(fn)) 24 | 25 | db, err := bh.Open(fn, 0755, nil) 26 | assert.NoErrorOrDie(t, err) 27 | 28 | counter := make(map[string]int) 29 | c := BoltFileCache{ 30 | DB: db, 31 | IncCounter: func(name string) { 32 | counter[name] = counter[name] + 1 33 | }, 34 | } 35 | const ( 36 | sign1 = "abc" 37 | sign2 = "def" 38 | sign3 = "ghi" 39 | gofile = "file.go" 40 | rootfolder = "root" 41 | sub = "sub" 42 | subfolder = "root/sub" 43 | ) 44 | fi := &gpb.GoFileInfo{} 45 | 46 | ////////////////////////////////////////////////////////////// 47 | // New file found. 48 | ////////////////////////////////////////////////////////////// 49 | // Get before set, should return false 50 | assert.False(t, "c.Get", c.Get(sign1, fi)) 51 | assert.Equal(t, "counter", counter, map[string]int{ 52 | "crawler.filecache.missed": 1, 53 | }) 54 | // Set the info. 
55 | c.Set(sign1, &gpb.GoFileInfo{Status: gpb.GoFileInfo_ShouldIgnore}) 56 | assert.Equal(t, "counter", counter, map[string]int{ 57 | "crawler.filecache.missed": 1, 58 | "crawler.filecache.sign_saved": 1, 59 | }) 60 | // Now, should fetch the cache 61 | assert.True(t, "c.Get", c.Get(sign1, fi)) 62 | assert.Equal(t, "fi", fi, &gpb.GoFileInfo{Status: gpb.GoFileInfo_ShouldIgnore}) 63 | assert.Equal(t, "counter", counter, map[string]int{ 64 | "crawler.filecache.missed": 1, 65 | "crawler.filecache.sign_saved": 1, 66 | "crawler.filecache.hit": 1, 67 | }) 68 | } 69 | -------------------------------------------------------------------------------- /pipelines/tocrawl/ghup.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "strings" 8 | "time" 9 | 10 | "github.com/daviddengcn/gcse" 11 | "github.com/daviddengcn/gcse/store" 12 | "github.com/daviddengcn/gddo/doc" 13 | "github.com/golang/glog" 14 | 15 | gpb "github.com/daviddengcn/gcse/shared/proto" 16 | ) 17 | 18 | // touchPackage forces a package to update if it was not crawled before a 19 | // specific time. 
20 | func touchPackage(pkg string, crawledBefore time.Time, pkgUTs map[string]time.Time) { 21 | pkg = strings.TrimSpace(pkg) 22 | if !doc.IsValidRemotePath(pkg) { 23 | //log.Printf(" [touchPackage] Not a valid remote path: %s", pkg) 24 | return 25 | } 26 | 27 | ut, ok := pkgUTs[pkg] 28 | if ok && ut.After(crawledBefore) { 29 | return 30 | } 31 | 32 | // set Etag to "" to force updating 33 | cDB.PushToCrawlPackage(pkg) 34 | } 35 | 36 | func touchByGithubUpdates(ctx context.Context, pkgUTs map[string]time.Time) { 37 | log.Printf("touchByGithubUpdates ...") 38 | 39 | rs, err := gcse.GithubSpider.SearchRepositories(ctx, "") 40 | if err != nil { 41 | log.Printf("SearchRepositories failed: %v", err) 42 | return 43 | } 44 | count := 0 45 | now := time.Now() 46 | emptyOwnerOrUpdatedAt, emptyUserOrPath := 0, 0 47 | for _, r := range rs { 48 | if r.Owner == nil || r.UpdatedAt == nil { 49 | emptyOwnerOrUpdatedAt++ 50 | continue 51 | } 52 | user := r.Owner.GetName() 53 | if user == "" { 54 | user = r.Owner.GetLogin() 55 | } 56 | path := r.GetName() 57 | if user == "" || path == "" { 58 | emptyUserOrPath++ 59 | continue 60 | } 61 | touchPackage(fmt.Sprintf("github.com/%s/%s", user, path), r.UpdatedAt.Time, pkgUTs) 62 | if err := store.AppendPackageEvent("github.com", user+"/"+path, "githubhupdate", now, gpb.HistoryEvent_Action_None); err != nil { 63 | log.Printf("UpdatePackageHistory %s %s failed: %v", "github.com", user+"/"+path, err) 64 | } 65 | count++ 66 | } 67 | glog.Infof("%d updates found!", count) 68 | glog.Infof("Total: %d, emptyOwnerOrUpdatedAt: %d, emptyUserOrPath: %d", len(rs), emptyOwnerOrUpdatedAt, emptyUserOrPath) 69 | } 70 | -------------------------------------------------------------------------------- /service/web/web/index.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "" "home"}} 2 |
3 | 16 |
17 |

18 | Go Search 19 |

20 | 21 | 22 | {{template "searchbox.html" .UIUtils.Slice "" true}} 23 | {{if .}} 24 |
25 | {{.TotalDocs}} golang packages in {{.TotalProjects}} projects indexed, 26 | last updated {{.IndexAge}} ago. 27 |
28 |
29 |
30 |
31 | 38 | 39 | 50 | {{end}} 51 | {{template "footer.html"}} 52 | -------------------------------------------------------------------------------- /spider/filecache.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/golangplus/bytes" 7 | "github.com/golangplus/errors" 8 | 9 | "github.com/daviddengcn/bolthelper" 10 | "github.com/golang/protobuf/proto" 11 | ) 12 | 13 | type FileCache interface { 14 | Get(signature string, contents proto.Message) bool 15 | Set(signature string, contents proto.Message) 16 | } 17 | 18 | type NullFileCache struct{} 19 | 20 | func (NullFileCache) Get(string, proto.Message) bool { return false } 21 | func (NullFileCache) Set(string, proto.Message) {} 22 | 23 | var _ FileCache = NullFileCache{} 24 | 25 | type BoltFileCache struct { 26 | bh.DB 27 | IncCounter func(string) 28 | } 29 | 30 | var _ FileCache = BoltFileCache{} 31 | 32 | // Filecache folders: 33 | // s/ - signature of this path 34 | // c/ - contents of a signagure 35 | // p// - list of paths referencing this signature 36 | 37 | var ( 38 | cacheSignatureKey = []byte("s") 39 | cacheContentsKey = []byte("c") 40 | cachePathsKey = []byte("p") 41 | ) 42 | 43 | func (bc BoltFileCache) inc(name string) { 44 | if bc.IncCounter == nil { 45 | return 46 | } 47 | bc.IncCounter(name) 48 | } 49 | 50 | func (bc BoltFileCache) Get(sign string, contents proto.Message) bool { 51 | found := false 52 | if err := bc.View(func(tx bh.Tx) error { 53 | return tx.Value([][]byte{cacheContentsKey, []byte(sign)}, func(v bytesp.Slice) error { 54 | found = true 55 | return errorsp.WithStacks(proto.Unmarshal(v, contents)) 56 | }) 57 | }); err != nil { 58 | log.Printf("Reading from file cache DB for %v failed: %v", sign, err) 59 | bc.inc("crawler.filecache.get_error") 60 | return false 61 | } 62 | if found { 63 | bc.inc("crawler.filecache.hit") 64 | } else { 65 | bc.inc("crawler.filecache.missed") 66 | } 67 | return 
found 68 | } 69 | 70 | func (bc BoltFileCache) Set(signature string, contents proto.Message) { 71 | if err := bc.Update(func(tx bh.Tx) error { 72 | bs, err := proto.Marshal(contents) 73 | if err != nil { 74 | return errorsp.WithStacksAndMessage(err, "Marshal %v failed", contents) 75 | } 76 | return tx.Put([][]byte{cacheContentsKey, []byte(signature)}, bs) 77 | }); err != nil { 78 | bc.inc("crawler.filecache.set_error") 79 | log.Printf("Updating to file cache DB for %v failed: %v", signature, err) 80 | } 81 | bc.inc("crawler.filecache.sign_saved") 82 | } 83 | -------------------------------------------------------------------------------- /pipelines/indexer/index.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "runtime" 7 | 8 | "github.com/daviddengcn/gcse" 9 | "github.com/daviddengcn/gcse/configs" 10 | "github.com/daviddengcn/gcse/store" 11 | "github.com/daviddengcn/gcse/utils" 12 | "github.com/daviddengcn/go-easybi" 13 | "github.com/daviddengcn/sophie/kv" 14 | ) 15 | 16 | func clearOutdatedIndex() error { 17 | segm, err := configs.IndexSegments().FindMaxDone() 18 | if err != nil { 19 | return err 20 | } 21 | all, err := configs.IndexSegments().ListAll() 22 | if err != nil { 23 | return err 24 | } 25 | for _, s := range all { 26 | if s == segm { 27 | continue 28 | } 29 | err := s.Remove() 30 | if err != nil { 31 | return err 32 | } 33 | log.Printf("Outdated segment %v removed!", s) 34 | } 35 | return nil 36 | } 37 | 38 | func doIndex() bool { 39 | idxSegm, err := configs.IndexSegments().GenMaxSegment() 40 | if err != nil { 41 | log.Printf("GenMaxSegment failed: %v", err) 42 | return false 43 | } 44 | 45 | runtime.GC() 46 | utils.DumpMemStats() 47 | 48 | log.Printf("Indexing to %v ...", idxSegm) 49 | 50 | fpDocDB := configs.DocsDBFsPath() 51 | ts, err := gcse.Index(kv.DirInput(fpDocDB), string(idxSegm)) 52 | if err != nil { 53 | log.Printf("Indexing failed: %v", err) 54 | 
return false 55 | } 56 | 57 | if !func() bool { 58 | f, err := os.Create(idxSegm.Join(gcse.IndexFn)) 59 | if err != nil { 60 | log.Printf("Create index file failed: %v", err) 61 | return false 62 | } 63 | defer f.Close() 64 | 65 | log.Printf("Saving index to %v ...", idxSegm) 66 | if err := ts.Save(f); err != nil { 67 | log.Printf("ts.Save failed: %v", err) 68 | return false 69 | } 70 | return true 71 | }() { 72 | return false 73 | } 74 | runtime.GC() 75 | utils.DumpMemStats() 76 | 77 | storePath := idxSegm.Join(configs.FnStore) 78 | log.Printf("Saving store snapshot to %v", storePath) 79 | if err := store.SaveSnapshot(storePath); err != nil { 80 | log.Printf("SaveSnapshot %v failed: %v", storePath, err) 81 | } 82 | 83 | if err := idxSegm.Done(); err != nil { 84 | log.Printf("segm.Done failed: %v", err) 85 | return false 86 | } 87 | 88 | log.Printf("Indexing success: %s (%d)", idxSegm, ts.DocCount()) 89 | gcse.AddBiValueAndProcess(bi.Average, "index.doc-count", ts.DocCount()) 90 | 91 | ts = nil 92 | utils.DumpMemStats() 93 | runtime.GC() 94 | utils.DumpMemStats() 95 | 96 | return true 97 | } 98 | -------------------------------------------------------------------------------- /service/web/web/search.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice (.Q) ("search") }} 2 | {{template "searchbox.html" .UIUtils.Slice .Q false}} 3 |
4 |
5 | {{if .Results.TotalResults}} 6 | Total {{.Results.TotalResults}} packages{{if .Results.Folded}} ({{.Results.Folded}} folded){{end}} 7 | {{else}} 8 | No packages 9 | {{end}} 10 | related to {{.Q}}, {{.SearchTime}} 11 |
12 |
    13 | {{range .Results.Docs}} 14 |
  1. 15 |
    16 |
    {{.Index}}.
    {{if .MarkedName}}{{.MarkedName}}{{else}}({{.MarkedPackage}}){{end}} 17 | - {{.ImportedLen}}+{{.TestImportedLen}} refs 18 | - {{.StarCount}} stars 19 |
    20 |
    {{.Summary}}
    21 | {{if .Subs }} 22 |
    sub: 23 | {{range .Subs}} 24 | 25 | {{.MarkedName}}({{.SubPath}}) 26 | 27 | {{end}} 28 |
    29 | {{end}} 30 |
    31 | {{.MarkedPackage}} 32 | - GoDoc 33 | - {{printf "%.2f" .Score}} ({{printf "M: %.2f" .MatchScore}}, {{printf "S: %.2f" .StaticScore}}) 34 |
    35 |
  2. 36 | {{end}} 37 |
38 |
39 | {{if .TotalPages}} 40 |
    {{$q := .Q}} 41 |
  • {{with .PrevPage}} « {{end}}
  • 42 | {{range .BeforePages}} 43 |
  • {{.}}
  • 44 | {{end}} 45 |
  • {{.CurrentPage}} (current)
  • 46 | {{range .AfterPages}} 47 |
  • {{.}}
  • 48 | {{end}} 49 |
  • {{with .NextPage}} » {{end}}
  • 50 |
51 | {{end}} 52 | {{if .BottomQ}} 53 | {{template "searchbox.html" .UIUtils.Slice .Q false}} 54 | {{end}} 55 | 62 | {{template "footer.html"}} 63 | -------------------------------------------------------------------------------- /tools/fixcrawldb/fixcrawldb.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "io" 5 | "log" 6 | "strings" 7 | 8 | "github.com/golangplus/errors" 9 | "github.com/golangplus/fmt" 10 | "github.com/golangplus/strings" 11 | 12 | "github.com/daviddengcn/gcse" 13 | "github.com/daviddengcn/gcse/configs" 14 | "github.com/daviddengcn/gcse/spider" 15 | "github.com/daviddengcn/sophie" 16 | "github.com/daviddengcn/sophie/kv" 17 | ) 18 | 19 | func loadDocsPkgs(in kv.DirInput) (stringsp.Set, error) { 20 | var pkgs stringsp.Set 21 | cnt, err := in.PartCount() 22 | if err != nil { 23 | return nil, err 24 | } 25 | for part := 0; part < cnt; part++ { 26 | c, err := in.Iterator(part) 27 | if err != nil { 28 | return nil, err 29 | } 30 | for { 31 | var key sophie.RawString 32 | var val gcse.DocInfo 33 | if err := c.Next(&key, &val); err != nil { 34 | if errorsp.Cause(err) == io.EOF { 35 | break 36 | } 37 | return nil, err 38 | } 39 | pkgs.Add(string(key)) 40 | // value is ignored 41 | } 42 | } 43 | return pkgs, nil 44 | } 45 | 46 | func main() { 47 | dryRun := false 48 | // Load CrawlerDB 49 | cDB := gcse.LoadCrawlerDB() 50 | fpDataRoot := sophie.FsPath{ 51 | Fs: sophie.LocalFS, 52 | Path: configs.DataRoot.S(), 53 | } 54 | pkgs, err := loadDocsPkgs(kv.DirInput(fpDataRoot.Join(configs.FnDocs))) 55 | if err != nil { 56 | log.Fatalf("loadDocsPkgs failed: %v", err) 57 | } 58 | db := cDB.PackageDB 59 | var toDelete []string 60 | if err := db.Iterate(func(id string, val interface{}) error { 61 | if pkgs.Contain(id) { 62 | // If the pacakge is already in docs, do not touch it. 63 | return nil 64 | } 65 | parts := strings.Split(id, "/") 66 | if len(parts) >= 4 { 67 | // Check last part. 
68 | // github.com/user/repo/sub 69 | name := parts[len(parts)-1] 70 | if !spider.LikeGoSubFolder(name) { 71 | toDelete = append(toDelete, id) 72 | return nil 73 | } 74 | } 75 | if len(parts) < 6 || len(parts)%2 != 0 { 76 | return nil 77 | } 78 | l := (len(parts) - 4) / 2 79 | a := parts[3 : 3+l] 80 | b := parts[3+l : 3+l+l] 81 | for i := range a { 82 | if a[i] != b[i] { 83 | return nil 84 | } 85 | } 86 | toDelete = append(toDelete, id) 87 | return nil 88 | }); err != nil { 89 | log.Fatalf("Iterate failed: %v", err) 90 | } 91 | fmtp.Printfln("Total: %d", len(toDelete)) 92 | if dryRun { 93 | return 94 | } 95 | for _, id := range toDelete { 96 | db.Delete(id) 97 | } 98 | log.Printf("Synchronizing databases to disk...") 99 | if err := cDB.Sync(); err != nil { 100 | log.Fatalf("cdb.Sync() failed: %v", err) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /spider/ranking.go: -------------------------------------------------------------------------------- 1 | package spider 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | "time" 7 | 8 | "github.com/golang/protobuf/ptypes" 9 | "github.com/golangplus/strings" 10 | "github.com/golangplus/time" 11 | 12 | gpb "github.com/daviddengcn/gcse/shared/proto" 13 | ) 14 | 15 | const ( 16 | maxFolderInfoDue = timep.Day * 10 17 | maxRepoInfoDue = timep.Day * 10 18 | maxPackageInfoDue = timep.Day * 5 19 | ) 20 | 21 | var nonGoSubFolders = stringsp.NewSet( 22 | "android", 23 | "bin", "binary", 24 | "c", "cmd", "cpp", "css", 25 | "doc", "dll", 26 | "faq", "font", "fonts", 27 | "gif", "django", 28 | "help", "html", 29 | "image", "images", "icon", "icons", 30 | "java", "javascript", "js", "jpg", "jpeg", 31 | "lib", "less", 32 | "nodejs", 33 | "pdf", "python", 34 | "r", "readme", 35 | "src", "script", "scripts", "static", 36 | "themes", "templates", "tex", 37 | "vendor", 38 | "wav", 39 | "xml", 40 | "zip", 41 | ) 42 | 43 | var nonGoSubPattern = regexp.MustCompile(`^[0-9\-_]+$`) 44 | 45 | 
func LikeGoSubFolder(folder string) bool { 46 | folder = strings.ToLower(folder) 47 | if nonGoSubFolders.Contain(folder) { 48 | return false 49 | } 50 | if nonGoSubPattern.MatchString(folder) { 51 | return false 52 | } 53 | if strings.ContainsAny(folder, ".") { 54 | return false 55 | } 56 | if folder[0] < 'a' || folder[0] > 'z' { 57 | return false 58 | } 59 | if strings.Contains(folder, "nodejs") { 60 | return false 61 | } 62 | return true 63 | } 64 | 65 | type PackageStatus int 66 | 67 | const ( 68 | OutOfDate PackageStatus = iota 69 | UpToDate 70 | ) 71 | 72 | func (s PackageStatus) String() string { 73 | switch s { 74 | case OutOfDate: 75 | return "out-of-date" 76 | case UpToDate: 77 | return "up-to-date" 78 | } 79 | return "-" 80 | } 81 | 82 | func repoInfoAvailable(info *gpb.RepoInfo) bool { 83 | if info == nil { 84 | return false 85 | } 86 | t, _ := ptypes.Timestamp(info.CrawlingTime) 87 | return t.After(time.Now().Add(-maxRepoInfoDue)) 88 | } 89 | 90 | func folderInfoAvailable(info *gpb.FolderInfo) bool { 91 | if info == nil { 92 | return false 93 | } 94 | t, _ := ptypes.Timestamp(info.CrawlingTime) 95 | return t.After(time.Now().Add(-maxFolderInfoDue)) 96 | } 97 | 98 | func CheckPackageStatus(pkg *gpb.PackageInfo, repo *gpb.RepoInfo) PackageStatus { 99 | if pkg.CrawlingInfo == nil { 100 | return OutOfDate 101 | } 102 | ct, _ := ptypes.Timestamp(pkg.CrawlingInfo.CrawlingTime) 103 | if repoInfoAvailable(repo) { 104 | lu, _ := ptypes.Timestamp(repo.LastUpdated) 105 | if lu.After(ct) { 106 | return OutOfDate 107 | } 108 | return UpToDate 109 | } 110 | if ct.After(time.Now().Add(-maxPackageInfoDue)) { 111 | return UpToDate 112 | } 113 | return OutOfDate 114 | } 115 | -------------------------------------------------------------------------------- /utils/segment_test.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "os" 5 | "path/filepath" 6 | "testing" 7 | 8 | 
"github.com/golangplus/testing/assert" 9 | ) 10 | 11 | func TestSegment(t *testing.T) { 12 | const ( 13 | name = "1" 14 | dataFn = "data.txt" 15 | subDir = "sub" 16 | ) 17 | 18 | path := filepath.Join(os.TempDir(), name) 19 | assert.NoErrorOrDie(t, os.RemoveAll(path)) 20 | 21 | s := Segment(path) 22 | assert.Equal(t, "name", s.Name(), name) 23 | assert.Equal(t, "join", s.Join(""), path) 24 | assert.Equal(t, "join", s.Join(dataFn), filepath.Join(path, dataFn)) 25 | assert.False(t, "is-done", s.IsDone()) 26 | assert.NoError(t, s.Done()) 27 | assert.True(t, "is-done", s.IsDone()) 28 | 29 | // Check ListFiles returns sub directories. 30 | assert.NoError(t, Segment(s.Join(subDir)).Make()) 31 | files, err := s.ListFiles() 32 | assert.NoError(t, err) 33 | assert.Equal(t, "files", files, []string{s.Join(subDir)}) 34 | } 35 | 36 | func TestSegments(t *testing.T) { 37 | path := filepath.Join(os.TempDir(), "TestSegments") 38 | assert.NoErrorOrDie(t, os.RemoveAll(path)) 39 | 40 | ss := Segments(path) 41 | s0, err := ss.GenNewSegment() 42 | assert.NoError(t, err) 43 | assert.Equal(t, "s0", s0, Segment(filepath.Join(path, "0"))) 44 | assert.NoError(t, s0.Done()) 45 | 46 | s1, err := ss.GenNewSegment() 47 | assert.NoError(t, err) 48 | assert.Equal(t, "s1", s1, Segment(filepath.Join(path, "1"))) 49 | 50 | // Create a file under path, should not be returned by ListAll() 51 | f, err := os.Create(filepath.Join(path, "a.txt")) 52 | assert.NoError(t, err) 53 | assert.NoError(t, f.Close()) 54 | sa, err := ss.ListAll() 55 | assert.NoError(t, err) 56 | assert.Equal(t, "sa", sa, []Segment{s0, s1}) 57 | 58 | sa, err = ss.ListDones() 59 | assert.NoError(t, err) 60 | assert.Equal(t, "sa", sa, []Segment{s0}) 61 | 62 | s2, err := ss.GenMaxSegment() 63 | assert.NoError(t, err) 64 | assert.Equal(t, "s2", s2, ss.Join("2")) 65 | 66 | ms, err := ss.FindMaxDone() 67 | assert.NoError(t, err) 68 | assert.Equal(t, "ms", ms, ss.Join("0")) 69 | 70 | assert.NoError(t, s2.Done()) 71 | ms, err = 
ss.FindMaxDone() 72 | assert.NoError(t, err) 73 | assert.Equal(t, "ms", ms, ss.Join("2")) 74 | 75 | assert.NoError(t, ss.ClearUndones()) 76 | sa, err = ss.ListAll() 77 | assert.NoError(t, err) 78 | assert.Equal(t, "sa", sa, []Segment{s0, s2}) 79 | } 80 | 81 | func TestSegments_GenMaxSegment(t *testing.T) { 82 | path := filepath.Join(os.TempDir(), "TestSegments_GenMaxSegment") 83 | assert.NoErrorOrDie(t, os.RemoveAll(path)) 84 | assert.NoErrorOrDie(t, os.MkdirAll(path, 0755)) 85 | 86 | ss := Segments(path) 87 | 88 | s, err := ss.GenMaxSegment() 89 | assert.NoError(t, err) 90 | assert.Equal(t, "s", s, ss.Join("0")) 91 | assert.NoError(t, s.Remove()) 92 | 93 | assert.NoError(t, os.MkdirAll(filepath.Join(path, "word"), 0755)) 94 | s, err = ss.GenMaxSegment() 95 | assert.NoError(t, err) 96 | assert.Equal(t, "s", s, ss.Join("0")) 97 | assert.NoError(t, s.Remove()) 98 | } 99 | -------------------------------------------------------------------------------- /service/web/view.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "go/doc" 6 | "html/template" 7 | "net/http" 8 | "strings" 9 | 10 | "github.com/golangplus/bytes" 11 | 12 | "github.com/ajstarks/svgo" 13 | "github.com/daviddengcn/gcse" 14 | ) 15 | 16 | func pageView(w http.ResponseWriter, r *http.Request) { 17 | w.Header().Set("Content-Type", "text/html") 18 | 19 | id := strings.TrimSpace(r.FormValue("id")) 20 | if id != "" { 21 | db := getDatabase() 22 | d, found := db.FindFullPackage(id) 23 | if !found { 24 | pageNotFound(w, r) 25 | return 26 | } 27 | if d.StarCount < 0 { 28 | d.StarCount = 0 29 | } 30 | var descHTML bytesp.Slice 31 | doc.ToHTML(&descHTML, d.Description, nil) 32 | 33 | if err := templates.ExecuteTemplate(w, "view.html", struct { 34 | UIUtils 35 | gcse.HitInfo 36 | DescHTML template.HTML 37 | TotalDocCount int 38 | StaticRank int 39 | ShowReadme bool 40 | }{ 41 | HitInfo: d, 42 | DescHTML: template.HTML(descHTML), 43 | 
TotalDocCount: db.PackageCount(), 44 | StaticRank: d.StaticRank + 1, 45 | ShowReadme: len(d.Description) < 10 && len(d.ReadmeData) > 0, 46 | }); err != nil { 47 | http.Error(w, err.Error(), http.StatusInternalServerError) 48 | } 49 | } 50 | } 51 | 52 | func pageBadgePage(w http.ResponseWriter, r *http.Request) { 53 | w.Header().Set("Content-Type", "text/html") 54 | id := strings.TrimSpace(r.FormValue("id")) 55 | if id != "" { 56 | doc, found := getDatabase().FindFullPackage(id) 57 | if !found { 58 | http.Error(w, fmt.Sprintf("Package %s not found!", id), http.StatusNotFound) 59 | return 60 | } 61 | badgeUrl := "http://go-search.org/badge?id=" + template.URLQueryEscaper(doc.Package) 62 | viewUrl := "http://go-search.org/view?id=" + template.URLQueryEscaper(doc.Package) 63 | 64 | htmlCode := fmt.Sprintf(`GoSearch`, viewUrl, badgeUrl) 65 | mdCode := fmt.Sprintf(`[![GoSearch](%s)](%s)`, badgeUrl, viewUrl) 66 | 67 | if err := templates.ExecuteTemplate(w, "badgepage.html", struct { 68 | UIUtils 69 | gcse.HitInfo 70 | HTMLCode string 71 | MDCode string 72 | }{ 73 | HitInfo: doc, 74 | HTMLCode: htmlCode, 75 | MDCode: mdCode, 76 | }); err != nil { 77 | http.Error(w, err.Error(), http.StatusInternalServerError) 78 | } 79 | } 80 | } 81 | 82 | func pageBadge(w http.ResponseWriter, r *http.Request) { 83 | id := strings.TrimSpace(r.FormValue("id")) 84 | if id != "" { 85 | doc, found := getDatabase().FindFullPackage(id) 86 | if !found { 87 | http.Error(w, fmt.Sprintf("Package %s not found!", id), http.StatusNotFound) 88 | return 89 | } 90 | w.Header().Set("Content-Type", "image/svg+xml") 91 | 92 | W, H := 100, 22 93 | 94 | s := svg.New(w) 95 | s.Start(W, H) 96 | s.Roundrect(1, 1, W-2, H-2, 4, 4, "fill:#5bc0de") 97 | 98 | s.Text(5, 15, fmt.Sprintf("GoSearch #%d", doc.StaticRank+1), 99 | `font-size:10;fill:white;font-weight:bold;font-family:Arial, Helvetica, sans-serif`) 100 | s.End() 101 | } 102 | } 103 | 
-------------------------------------------------------------------------------- /shared/proto/spider.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package gcse; 4 | 5 | option go_package = "gcsepb"; 6 | 7 | import "github.com/golang/protobuf/ptypes/timestamp/timestamp.proto"; 8 | 9 | message GoFileInfo { 10 | enum Status { 11 | Unknown = 0; 12 | ParseSuccess = 1; 13 | ParseFailed = 2; 14 | ShouldIgnore = 3; 15 | } 16 | Status status = 1; 17 | 18 | string name = 2; 19 | string description = 3; 20 | bool is_test = 4; 21 | repeated string imports = 5; 22 | } 23 | 24 | message RepoInfo { 25 | // The timestamp this repo-info is crawled 26 | google.protobuf.Timestamp crawling_time = 1; 27 | 28 | int32 stars = 2; 29 | string description = 3; 30 | // Where this project was forked from, full path 31 | string source = 5; 32 | // As far as we know, when this repo was updated 33 | google.protobuf.Timestamp last_updated = 4; 34 | } 35 | 36 | // Information for a non-repository folder. 37 | message FolderInfo { 38 | // E.g. "sub" 39 | string name = 1; 40 | 41 | // E.g. 
"spider/sub" 42 | string path = 2; 43 | 44 | string sha = 3; 45 | string html_url = 4; 46 | 47 | // The timestamp this folder-info is crawled 48 | google.protobuf.Timestamp crawling_time = 5; 49 | } 50 | 51 | message CrawlingInfo { 52 | // The timestamp the related entry was crawled 53 | google.protobuf.Timestamp crawling_time = 1; 54 | string etag = 2; 55 | } 56 | 57 | message HistoryEvent { 58 | message Action { 59 | enum Enum { 60 | None = 0; // No action 61 | Success = 1; // Success crawling and the package is valid 62 | Failed = 2; // Failed crawling, do not know validity of the package 63 | Invalid = 3; // Success crawling and the package is invalid 64 | } 65 | } 66 | google.protobuf.Timestamp timestamp = 1; 67 | Action.Enum action = 2; 68 | } 69 | 70 | message HistoryInfo { 71 | repeated HistoryEvent events = 1; 72 | 73 | google.protobuf.Timestamp found_time = 2; 74 | // Possible value: 75 | // web added from web 76 | // user: found from user crawling 77 | // parent found by crawling his parent 78 | // imported: imported by a 79 | // testimported: test imported by a 80 | // package: 81 | // reference: referenced in the readme file of 82 | // godoc found by godoc.org/api 83 | string found_way = 3; 84 | 85 | google.protobuf.Timestamp latest_success = 4; 86 | google.protobuf.Timestamp latest_failed = 5; 87 | } 88 | 89 | message Package { 90 | // package "name" 91 | string Name = 1; 92 | 93 | // Relative path to the repository, "" for root repository, "/sub" for a sub package. 94 | // Full path: site + "/" + user + "/" + repo + path 95 | string Path = 2; 96 | 97 | string Synopsis = 9; 98 | string Description = 3; 99 | // No directory info 100 | string ReadmeFn = 4; 101 | 102 | // Raw content, cound be md, txt, etc. 103 | string ReadmeData = 5; 104 | 105 | repeated string Imports = 6; 106 | repeated string TestImports = 7; 107 | 108 | // URL to the package source code. 
109 | string url = 8; 110 | } 111 | -------------------------------------------------------------------------------- /store/repo.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/golang/protobuf/proto" 7 | "github.com/golangplus/bytes" 8 | "github.com/golangplus/errors" 9 | 10 | "github.com/daviddengcn/bolthelper" 11 | 12 | gpb "github.com/daviddengcn/gcse/shared/proto" 13 | ) 14 | 15 | // Returns an empty (non-nil) PackageInfo if not found. 16 | func ReadRepository(site, user, repo string) (*gpb.Repository, error) { 17 | doc := &gpb.Repository{} 18 | if err := box.View(func(tx bh.Tx) error { 19 | return tx.Value([][]byte{reposRoot, []byte(site), []byte(user), []byte(repo)}, func(bs bytesp.Slice) error { 20 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, doc), "Unmarshal %d bytes failed", len(bs)); err != nil { 21 | log.Printf("Unmarshal failed: %v", err) 22 | *doc = gpb.Repository{} 23 | } 24 | return nil 25 | }) 26 | }); err != nil { 27 | return nil, err 28 | } 29 | return doc, nil 30 | } 31 | 32 | func UpdateRepository(site, user, repo string, f func(doc *gpb.Repository) error) error { 33 | return box.Update(func(tx bh.Tx) error { 34 | b, err := tx.CreateBucketIfNotExists([][]byte{reposRoot, []byte(site), []byte(user)}) 35 | if err != nil { 36 | return err 37 | } 38 | doc := &gpb.Repository{} 39 | if err := b.Value([][]byte{[]byte(repo)}, func(bs bytesp.Slice) error { 40 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, doc), "Unmarshal %d bytes", len(bs)); err != nil { 41 | log.Printf("Unmarshaling failed: %v", err) 42 | *doc = gpb.Repository{} 43 | } 44 | return nil 45 | }); err != nil { 46 | return err 47 | } 48 | if err := errorsp.WithStacks(f(doc)); err != nil { 49 | return err 50 | } 51 | bs, err := proto.Marshal(doc) 52 | if err != nil { 53 | return errorsp.WithStacksAndMessage(err, "marshaling %v failed: %v", doc, err) 54 | } 55 | 
return b.Put([][]byte{[]byte(repo)}, bs) 56 | }) 57 | } 58 | 59 | func DeleteRepository(site, user, repo string) error { 60 | return box.Update(func(tx bh.Tx) error { 61 | return tx.Delete([][]byte{reposRoot, []byte(site), []byte(user), []byte(repo)}) 62 | }) 63 | } 64 | 65 | func ForEachRepositorySite(f func(string) error) error { 66 | return box.View(func(tx bh.Tx) error { 67 | return tx.ForEach([][]byte{reposRoot}, func(_ bh.Bucket, k, v bytesp.Slice) error { 68 | if v != nil { 69 | log.Printf("Unexpected value %q for key %q, ignored", string(v), string(k)) 70 | return nil 71 | } 72 | return errorsp.WithStacks(f(string(k))) 73 | }) 74 | }) 75 | } 76 | 77 | func ForEachRepositoryOfSite(site string, f func(user, name string, doc *gpb.Repository) error) error { 78 | return box.View(func(tx bh.Tx) error { 79 | return tx.ForEach([][]byte{reposRoot, []byte(site)}, func(b bh.Bucket, user, v bytesp.Slice) error { 80 | if v != nil { 81 | log.Printf("Unexpected value %q for key %q, ignored", string(v), string(user)) 82 | return nil 83 | } 84 | return b.ForEach([][]byte{user}, func(name, bs bytesp.Slice) error { 85 | if bs == nil { 86 | log.Printf("Unexpected nil value for key %q, ignored", string(name)) 87 | return nil 88 | } 89 | doc := &gpb.Repository{} 90 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, doc), "Unmarshal %d bytes", len(bs)); err != nil { 91 | log.Printf("Unmarshaling value for %v failed, ignored: %v", name, err) 92 | return nil 93 | } 94 | return errorsp.WithStacks(f(string(user), string(name), doc)) 95 | }) 96 | }) 97 | }) 98 | } 99 | -------------------------------------------------------------------------------- /pipelines/crawler/person.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "log" 7 | "math/rand" 8 | "strings" 9 | "time" 10 | 11 | "github.com/golangplus/time" 12 | 13 | "github.com/daviddengcn/gcse" 14 | 
"github.com/daviddengcn/gcse/configs" 15 | "github.com/daviddengcn/gddo/doc" 16 | "github.com/daviddengcn/go-easybi" 17 | "github.com/daviddengcn/sophie" 18 | "github.com/daviddengcn/sophie/kv" 19 | "github.com/daviddengcn/sophie/mr" 20 | ) 21 | 22 | const ( 23 | DefaultPersonAge = 100 * timep.Day 24 | ) 25 | 26 | type PersonCrawler struct { 27 | crawlerMapper 28 | 29 | part int 30 | failCount int 31 | httpClient doc.HttpClient 32 | } 33 | 34 | func pushPerson(p *gcse.Person) { 35 | for _, pkg := range p.Packages { 36 | appendNewPackage(pkg, "user:"+p.Id) 37 | } 38 | cDB.SchedulePerson(p.Id, time.Now().Add(time.Duration(float64(DefaultPersonAge)*(1+(rand.Float64()-0.5)*0.2)))) 39 | } 40 | 41 | // OnlyMapper.Map 42 | func (pc *PersonCrawler) Map(key, val sophie.SophieWriter, 43 | c []sophie.Collector) error { 44 | ctx := context.Background() 45 | if time.Now().After(AppStopTime) { 46 | log.Printf("[Part %d] Timeout(key = %v), PersonCrawler returns EOM", pc.part, key) 47 | return mr.EOM 48 | } 49 | id := string(*key.(*sophie.RawString)) 50 | // ent := val.(*gcse.CrawlingEntry) 51 | log.Printf("[Part %d] Crawling person %v\n", pc.part, id) 52 | 53 | p, err := gcse.CrawlPerson(ctx, pc.httpClient, id) 54 | if err != nil { 55 | bi.AddValue(bi.Sum, "crawler.person.failed", 1) 56 | pc.failCount++ 57 | log.Printf("[Part %d] Crawling person %s failed: %v", pc.part, id, err) 58 | 59 | cDB.SchedulePerson(id, time.Now().Add(12*time.Hour)) 60 | 61 | if pc.failCount >= 10 || strings.Contains(err.Error(), "403") { 62 | durToSleep := 10 * time.Minute 63 | if time.Now().Add(durToSleep).After(AppStopTime) { 64 | log.Printf("[Part %d] Timeout(key = %v), PersonCrawler returns EOM", pc.part, key) 65 | return mr.EOM 66 | } 67 | 68 | log.Printf("[Part %d] Last ten crawling persons failed, sleep for a while...(current: %s)", pc.part, id) 69 | time.Sleep(durToSleep) 70 | pc.failCount = 0 71 | } 72 | return nil 73 | } 74 | bi.AddValue(bi.Sum, "crawler.person.success", 1) 75 | 
log.Printf("[Part %d] Crawled person %s success!", pc.part, id) 76 | pushPerson(p) 77 | log.Printf("[Part %d] Push person %s success", pc.part, id) 78 | pc.failCount = 0 79 | 80 | time.Sleep(10 * time.Second) 81 | 82 | return nil 83 | } 84 | 85 | type PeresonCrawlerFactory struct { 86 | httpClient doc.HttpClient 87 | } 88 | 89 | func (pcf PeresonCrawlerFactory) NewMapper(part int) mr.OnlyMapper { 90 | return &PersonCrawler{part: part, httpClient: pcf.httpClient} 91 | } 92 | 93 | // crawl packages, send error back to end 94 | func crawlPersons(httpClient doc.HttpClient, fpToCrawlPsn sophie.FsPath, end chan error) { 95 | time.AfterFunc(configs.CrawlerDuePerRun+time.Minute*10, func() { 96 | end <- errors.New("Crawling persons timeout!") 97 | }) 98 | end <- func() error { 99 | job := mr.MapOnlyJob{ 100 | Source: []mr.Input{ 101 | kv.DirInput(fpToCrawlPsn), 102 | }, 103 | NewMapperF: func(src, part int) mr.OnlyMapper { 104 | return &PersonCrawler{ 105 | part: part, 106 | httpClient: httpClient, 107 | } 108 | }, 109 | } 110 | if err := job.Run(); err != nil { 111 | log.Printf("crawlPersons: job.Run failed: %v", err) 112 | return err 113 | } 114 | return nil 115 | }() 116 | } 117 | -------------------------------------------------------------------------------- /text_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/testing/assert" 7 | ) 8 | 9 | func TestSplitSentences(t *testing.T) { 10 | TEXT := ` 11 | Package gcse is the core supporting library for go-code-search-engine (GCSE). 12 | Its exported types and functions are mainly for sub packages. If you want 13 | some of the function, copy the code away. 14 | 15 | Sub-projects 16 | 17 | crawler crawling packages 18 | 19 | indexer creating index data for web-server 20 | 21 | --== Godit - a very religious text editor ==-- 22 | 23 | server providing web services, including home/top/search services. 
24 | ` 25 | SENTS := []string{ 26 | `Package gcse is the core supporting library for go-code-search-engine (GCSE).`, 27 | `Its exported types and functions are mainly for sub packages.`, 28 | `If you want some of the function, copy the code away.`, 29 | `Sub-projects`, 30 | `crawler crawling packages`, 31 | `indexer creating index data for web-server`, 32 | `Godit - a very religious text editor`, 33 | `server providing web services, including home/top/search services.`, 34 | } 35 | sents := SplitSentences(TEXT) 36 | assert.StringEqual(t, "Sentences", sents, SENTS) 37 | } 38 | 39 | func TestChooseImportantSentenses(t *testing.T) { 40 | TEXT := ` 41 | gcse implements something. If you want some of the function, copy the code away. 42 | 43 | Package gcse provides something 44 | 45 | daviddengcn/core is a something 46 | 47 | github/daviddengcn/core is more than a something 48 | ------------------------------------------------- 49 | This is a something 50 | 51 | gcse是一个something 52 | 53 | gcse 是一个something 54 | 55 | is a framework to compare the performance of go 1.0 (go 1.0.3) and go 1.1 (go +tip). 56 | 57 | 这是一个something 58 | 59 | 非这是一个something2 60 | 61 | the core package provides something 62 | 63 | Go language implementation of selected algorithms from the 64 | 65 | A simple pluggable lexer package. 
66 | ` 67 | IMPORTANTS := []string{ 68 | `gcse implements something.`, 69 | `Package gcse provides something`, 70 | `daviddengcn/core is a something`, 71 | `github/daviddengcn/core is more than a something`, 72 | `This is a something`, 73 | `gcse是一个something`, 74 | `gcse 是一个something`, 75 | `is a framework to compare the performance of go 1.0 (go 1.0.3) and go 1.1 (go +tip).`, 76 | `这是一个something`, 77 | `the core package provides something`, 78 | `Go language implementation of selected algorithms from the`, 79 | `A simple pluggable lexer package.`, 80 | } 81 | importants := ChooseImportantSentenses(TEXT, "gcse", "github/daviddengcn/core") 82 | assert.StringEqual(t, "importants", importants, IMPORTANTS) 83 | } 84 | 85 | func TestChooseImportantSentenses_GoBot(t *testing.T) { 86 | TEXT := ` 87 | GoBot is an IRC Bot programmed in Golang![Build Status](https://secure.travis-ci.org/prometheus/client_golang.png?branch=master). It is designed to be lightweight and fast. 88 | ` 89 | IMPORTANTS := []string{ 90 | `GoBot is an IRC Bot programmed in Golang.`, 91 | } 92 | importants := ChooseImportantSentenses(TEXT, "main", "github.com/wei2912/GoBot") 93 | assert.StringEqual(t, "importants", importants, IMPORTANTS) 94 | } 95 | 96 | func TestChooseImportantSentenses_PackageEscape(t *testing.T) { 97 | TEXT := ` 98 | GoBot is an IRC Bot programmed. 
99 | ` 100 | IMPORTANTS := []string{ 101 | `GoBot is an IRC Bot programmed.`, 102 | } 103 | importants := ChooseImportantSentenses(TEXT, "main", "github.com/+wei2912/GoBot") 104 | assert.StringEqual(t, "importants", importants, IMPORTANTS) 105 | } 106 | -------------------------------------------------------------------------------- /tools/dump/dump.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | 8 | "github.com/golangplus/fmt" 9 | 10 | "github.com/daviddengcn/gcse" 11 | "github.com/daviddengcn/gcse/configs" 12 | "github.com/daviddengcn/go-index" 13 | "github.com/daviddengcn/sophie" 14 | "github.com/daviddengcn/sophie/kv" 15 | ) 16 | 17 | func help() { 18 | fmt.Fprintln(os.Stderr, `Usage: dump docs|index|crawler [keys...]`) 19 | } 20 | 21 | func dumpDocs(keys []string) { 22 | path := configs.DataRoot.Join(configs.FnDocs).S() 23 | kvDir := kv.DirInput(sophie.LocalFsPath(path)) 24 | cnt, err := kvDir.PartCount() 25 | if err != nil { 26 | log.Fatalf("kvDir.PartCount() failed: %v", err) 27 | } 28 | 29 | parts := make(map[int]map[string]bool) 30 | for _, key := range keys { 31 | part := gcse.CalcPackagePartition(key, gcse.DOCS_PARTS) 32 | if parts[part] == nil { 33 | parts[part] = make(map[string]bool) 34 | } 35 | 36 | parts[part][key] = true 37 | } 38 | 39 | var key sophie.RawString 40 | var val gcse.DocInfo 41 | for part := 0; part < cnt; part++ { 42 | if len(keys) > 0 && parts[part] == nil { 43 | continue 44 | } 45 | 46 | it, err := kvDir.Iterator(part) 47 | if err != nil { 48 | log.Fatalf("kvDir.Collector(%d) failed: %v", part, err) 49 | } 50 | 51 | func() { 52 | defer it.Close() 53 | 54 | for { 55 | if err := it.Next(&key, &val); err != nil { 56 | if err == sophie.EOF { 57 | break 58 | } 59 | log.Fatalf("it.Next failed %v", err) 60 | } 61 | pkg := key.String() 62 | if len(keys) > 0 && !parts[part][pkg] { 63 | continue 64 | } 65 | fmtp.Printfln("%v -> %+v", key, 
val) 66 | } 67 | 68 | it.Close() 69 | }() 70 | } 71 | } 72 | 73 | func dumpIndex(keys []string) { 74 | segm, err := gcse.IndexSegments.FindMaxDone() 75 | if segm == nil || err != nil { 76 | log.Fatalf("gcse.IndexSegments.FindMaxDone() failed: %v", err) 77 | } 78 | 79 | db := &index.TokenSetSearcher{} 80 | f, err := segm.Join(gcse.IndexFn).Open() 81 | if err != nil { 82 | log.Fatalf("%v.Join(%s).Open() failed: %v", segm, gcse.IndexFn, err) 83 | } 84 | defer f.Close() 85 | 86 | if err := db.Load(f); err != nil { 87 | log.Fatalf("db.Open() failed: %v", err) 88 | } 89 | 90 | for _, key := range keys { 91 | db.Search(index.SingleFieldQuery(gcse.IndexPkgField, key), 92 | func(docID int32, data interface{}) error { 93 | info, _ := data.(gcse.HitInfo) 94 | fmtp.Printfln("%s:%s -> %+v", gcse.IndexPkgField, key, info) 95 | return nil 96 | }) 97 | db.Search(index.SingleFieldQuery(gcse.IndexTextField, key), 98 | func(docID int32, data interface{}) error { 99 | info, _ := data.(gcse.HitInfo) 100 | fmtp.Printfln("%s:%s -> %+v", gcse.IndexTextField, key, info) 101 | return nil 102 | }) 103 | } 104 | } 105 | 106 | func dumpCrawler(keys []string) { 107 | cDB := gcse.LoadCrawlerDB() 108 | if len(keys) == 0 { 109 | // Full dump 110 | log.Printf("Dumping PackageDB...") 111 | cDB.PackageDB.Iterate(func(k string, v interface{}) error { 112 | fmtp.Printfln("Package %v: %+v", k, v) 113 | return nil 114 | }) 115 | return 116 | } 117 | for _, key := range keys { 118 | var ent gcse.CrawlingEntry 119 | if cDB.PackageDB.Get(key, &ent) { 120 | fmtp.Printfln("Package %v: %+v", key, ent) 121 | } 122 | } 123 | } 124 | 125 | func main() { 126 | if len(os.Args) < 2 { 127 | help() 128 | return 129 | } 130 | 131 | switch os.Args[1] { 132 | case "docs": 133 | dumpDocs(os.Args[2:]) 134 | case "index": 135 | dumpIndex(os.Args[2:]) 136 | case "crawler": 137 | dumpCrawler(os.Args[2:]) 138 | default: 139 | help() 140 | } 141 | } 142 | 
-------------------------------------------------------------------------------- /store/store_test.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | 8 | "github.com/golang/protobuf/ptypes" 9 | "github.com/golangplus/testing/assert" 10 | 11 | "github.com/daviddengcn/gcse/configs" 12 | 13 | gpb "github.com/daviddengcn/gcse/shared/proto" 14 | ) 15 | 16 | func init() { 17 | configs.SetTestingDataPath() 18 | } 19 | 20 | func cleanDatabase(t *testing.T) { 21 | assert.NoErrorOrDie(t, os.RemoveAll(configs.StoreBoltPath())) 22 | } 23 | 24 | func TestRepoInfoAge(t *testing.T) { 25 | ts, _ := ptypes.TimestampProto(time.Now().Add(-time.Hour)) 26 | age := RepoInfoAge(&gpb.RepoInfo{ 27 | CrawlingTime: ts, 28 | }) 29 | assert.ValueShould(t, "age", age, age >= time.Hour && age < time.Hour+time.Minute, "age out of expected range") 30 | } 31 | 32 | func TestForEachPackageSite(t *testing.T) { 33 | cleanDatabase(t) 34 | 35 | const ( 36 | site1 = "TestForEachPackageSite1.com" 37 | site2 = "github.com" 38 | path = "gcse" 39 | name = "pkgname" 40 | ) 41 | assert.NoError(t, UpdatePackage(site1, path, func(info *gpb.PackageInfo) error { 42 | return nil 43 | })) 44 | assert.NoError(t, UpdatePackage(site2, path, func(info *gpb.PackageInfo) error { 45 | return nil 46 | })) 47 | var sites []string 48 | assert.NoError(t, ForEachPackageSite(func(site string) error { 49 | sites = append(sites, site) 50 | return nil 51 | })) 52 | assert.Equal(t, "sites", sites, []string{site1, site2}) 53 | } 54 | 55 | func TestForEachPackageOfSite(t *testing.T) { 56 | cleanDatabase(t) 57 | 58 | const ( 59 | site = "TestForEachPackageOfSite.com" 60 | path1 = "gcse" 61 | name1 = "pkgname" 62 | path2 = "gcse2" 63 | name2 = "TestForEachPackageOfSite" 64 | ) 65 | assert.NoError(t, UpdatePackage(site, path1, func(info *gpb.PackageInfo) error { 66 | info.Name = name1 67 | return nil 68 | })) 69 | assert.NoError(t, 
UpdatePackage(site, path2, func(info *gpb.PackageInfo) error { 70 | info.Name = name2 71 | return nil 72 | })) 73 | var paths, names []string 74 | assert.NoError(t, ForEachPackageOfSite(site, func(path string, info *gpb.PackageInfo) error { 75 | paths = append(paths, path) 76 | names = append(names, info.Name) 77 | return nil 78 | })) 79 | assert.Equal(t, "paths", paths, []string{path1, path2}) 80 | assert.Equal(t, "names", names, []string{name1, name2}) 81 | } 82 | 83 | func TestUpdateReadDeletePackage(t *testing.T) { 84 | const ( 85 | site = "TestUpdateReadPackage.com" 86 | path = "gcse" 87 | name = "pkgname" 88 | ) 89 | assert.NoError(t, UpdatePackage(site, path, func(info *gpb.PackageInfo) error { 90 | assert.Equal(t, "info", info, &gpb.PackageInfo{}) 91 | info.Name = name 92 | return nil 93 | })) 94 | pkg, err := ReadPackage(site, path) 95 | assert.NoError(t, err) 96 | assert.Equal(t, "pkg", pkg, &gpb.PackageInfo{Name: name}) 97 | 98 | assert.NoError(t, DeletePackage(site, path)) 99 | 100 | pkg, err = ReadPackage(site, path) 101 | assert.NoError(t, err) 102 | assert.Equal(t, "pkg", pkg, &gpb.PackageInfo{}) 103 | } 104 | 105 | func TestUpdateReadDeletePerson(t *testing.T) { 106 | const ( 107 | site = "TestUpdateReadDeletePerson.com" 108 | id = "daviddengcn" 109 | etag = "tag" 110 | ) 111 | assert.NoError(t, UpdatePerson(site, id, func(info *gpb.PersonInfo) error { 112 | assert.Equal(t, "info", info, &gpb.PersonInfo{}) 113 | info.CrawlingInfo = &gpb.CrawlingInfo{ 114 | Etag: etag, 115 | } 116 | return nil 117 | })) 118 | p, err := ReadPerson(site, id) 119 | assert.NoError(t, err) 120 | assert.Equal(t, "p", p, &gpb.PersonInfo{CrawlingInfo: &gpb.CrawlingInfo{Etag: etag}}) 121 | 122 | assert.NoError(t, DeletePerson(site, id)) 123 | 124 | p, err = ReadPerson(site, id) 125 | assert.NoError(t, err) 126 | assert.Equal(t, "p", p, &gpb.PersonInfo{}) 127 | } 128 | -------------------------------------------------------------------------------- /crawlerdb.go: 
-------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "log" 5 | "strings" 6 | "time" 7 | 8 | "github.com/daviddengcn/gcse/configs" 9 | "github.com/daviddengcn/gddo/doc" 10 | ) 11 | 12 | const ( 13 | KindIndex = "index" 14 | KindDocDB = "docdb" 15 | KindPackage = "package" 16 | KindPerson = "person" 17 | KindToCheck = "tocheck" 18 | IndexFn = KindIndex + ".gob" 19 | ) 20 | 21 | /* 22 | * CrawlerDB including all crawler entires database. 23 | */ 24 | type CrawlerDB struct { 25 | PackageDB *MemDB 26 | PersonDB *MemDB 27 | } 28 | 29 | // LoadCrawlerDB loads PackageDB and PersonDB and returns a new *CrawlerDB 30 | func LoadCrawlerDB() *CrawlerDB { 31 | root := configs.CrawlerDBPath() 32 | 33 | log.Printf("Loading CrawlerDB from %s", root) 34 | 35 | return &CrawlerDB{ 36 | PackageDB: NewMemDB(root, KindPackage), 37 | PersonDB: NewMemDB(root, KindPerson), 38 | } 39 | } 40 | 41 | // Sync syncs both PackageDB and PersonDB. Returns error if any of the sync 42 | // failed. 43 | func (cdb *CrawlerDB) Sync() error { 44 | if err := cdb.PackageDB.Sync(); err != nil { 45 | log.Printf("cdb.PackageDB.Sync failed: %v", err) 46 | return err 47 | } 48 | if err := cdb.PersonDB.Sync(); err != nil { 49 | log.Printf("cdb.PersonDB.Sync failed: %v", err) 50 | return err 51 | } 52 | 53 | return nil 54 | } 55 | 56 | // SchedulePackage schedules a package to be crawled at a specific time. 57 | func (cdb *CrawlerDB) SchedulePackage(pkg string, sTime time.Time, etag string) error { 58 | ent := CrawlingEntry{ 59 | ScheduleTime: sTime, 60 | Version: CrawlerVersion, 61 | Etag: etag, 62 | } 63 | 64 | cdb.PackageDB.Put(pkg, ent) 65 | 66 | // log.Printf("Schedule package %s to %v", pkg, sTime) 67 | return nil 68 | } 69 | 70 | // SchedulePackage schedules a package to be crawled at a specific time if not specified earlier. 
71 | func (cdb *CrawlerDB) PushToCrawlPackage(pkg string) { 72 | now := time.Now() 73 | var ent CrawlingEntry 74 | if cdb.PackageDB.Get(pkg, &ent) { 75 | if ent.ScheduleTime.Before(now) { 76 | // The package has been scheduled to an earlier time. 77 | return 78 | } 79 | } 80 | ent.ScheduleTime = now 81 | cdb.PackageDB.Put(pkg, ent) 82 | } 83 | 84 | func TrimPackageName(pkg string) string { 85 | return strings.TrimFunc(strings.TrimSpace(pkg), func(r rune) bool { 86 | return r > rune(128) 87 | }) 88 | } 89 | 90 | // AppendPackage appends a package. If the package did not exist in either 91 | // PackageDB or Docs, schedule it (immediately). 92 | func (cdb *CrawlerDB) AppendPackage(pkg string, inDocs func(pkg string) bool) { 93 | pkg = TrimPackageName(pkg) 94 | if !doc.IsValidRemotePath(pkg) { 95 | return 96 | } 97 | var ent CrawlingEntry 98 | if cdb.PackageDB.Get(pkg, &ent) { 99 | if ent.ScheduleTime.Before(time.Now()) || inDocs(pkg) { 100 | return 101 | } 102 | // if the docs is missing in Docs, schedule it earlier 103 | log.Printf("Scheduling a package with missing docs: %v", pkg) 104 | } else { 105 | log.Printf("Scheduling new package: %v", pkg) 106 | } 107 | cdb.SchedulePackage(pkg, time.Now(), "") 108 | } 109 | 110 | // SchedulePerson schedules a person to be crawled at a specific time. 
111 | func (cdb *CrawlerDB) SchedulePerson(id string, sTime time.Time) error { 112 | ent := CrawlingEntry{ 113 | ScheduleTime: sTime, 114 | Version: CrawlerVersion, 115 | } 116 | 117 | cdb.PersonDB.Put(id, ent) 118 | 119 | log.Printf("Schedule person %s to %v", id, sTime) 120 | return nil 121 | } 122 | 123 | // AppendPerson appends a person to the PersonDB, schedules to crawl 124 | // immediately for a new person 125 | func (cdb *CrawlerDB) AppendPerson(site, username string) bool { 126 | id := IdOfPerson(site, username) 127 | 128 | var ent CrawlingEntry 129 | exists := cdb.PersonDB.Get(id, &ent) 130 | if exists { 131 | // already scheduled 132 | return false 133 | } 134 | 135 | return cdb.SchedulePerson(id, time.Now()) == nil 136 | } 137 | -------------------------------------------------------------------------------- /store/history.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "log" 5 | "time" 6 | 7 | "github.com/golang/protobuf/proto" 8 | "github.com/golang/protobuf/ptypes" 9 | "github.com/golangplus/bytes" 10 | "github.com/golangplus/errors" 11 | 12 | "github.com/daviddengcn/bolthelper" 13 | 14 | gpb "github.com/daviddengcn/gcse/shared/proto" 15 | ) 16 | 17 | func SaveSnapshot(path string) error { 18 | return box.Update(func(tx bh.Tx) error { 19 | return tx.CopyFile(path, 0644) 20 | }) 21 | } 22 | 23 | const ( 24 | maxHistoryEvents = 10 25 | ) 26 | 27 | func readHistoryOf(box *bh.RefCountBox, root []byte, site, idOrPath string) (*gpb.HistoryInfo, error) { 28 | info := &gpb.HistoryInfo{} 29 | if err := box.View(func(tx bh.Tx) error { 30 | return tx.Value([][]byte{historyRoot, root, []byte(site), []byte(idOrPath)}, func(bs bytesp.Slice) error { 31 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes failed", len(bs)); err != nil { 32 | log.Printf("Unmarshal failed: %v", err) 33 | *info = gpb.HistoryInfo{} 34 | } 35 | return nil 36 | }) 37 | }); err != 
nil { 38 | return nil, err 39 | } 40 | return info, nil 41 | } 42 | 43 | func readHistory(root []byte, site, idOrPath string) (*gpb.HistoryInfo, error) { 44 | return readHistoryOf(box, root, site, idOrPath) 45 | } 46 | 47 | func ReadPackageHistory(site, path string) (*gpb.HistoryInfo, error) { 48 | return readHistory(pkgsRoot, site, path) 49 | } 50 | 51 | func ReadPackageHistoryOf(box *bh.RefCountBox, site, path string) (*gpb.HistoryInfo, error) { 52 | return readHistoryOf(box, pkgsRoot, site, path) 53 | } 54 | 55 | func ReadPersonHistory(site, path string) (*gpb.HistoryInfo, error) { 56 | return readHistory(personsRoot, site, path) 57 | } 58 | 59 | func updateHistory(root []byte, site, idOrPath string, f func(*gpb.HistoryInfo) error) error { 60 | return box.Update(func(tx bh.Tx) error { 61 | b, err := tx.CreateBucketIfNotExists([][]byte{historyRoot, root, []byte(site)}) 62 | if err != nil { 63 | return err 64 | } 65 | info := &gpb.HistoryInfo{} 66 | if err := b.Value([][]byte{[]byte(idOrPath)}, func(bs bytesp.Slice) error { 67 | err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes", len(bs)) 68 | if err != nil { 69 | log.Printf("Unmarshaling failed: %v", err) 70 | *info = gpb.HistoryInfo{} 71 | } 72 | return nil 73 | }); err != nil { 74 | return err 75 | } 76 | if err := errorsp.WithStacks(f(info)); err != nil { 77 | return err 78 | } 79 | bs, err := proto.Marshal(info) 80 | if err != nil { 81 | return errorsp.WithStacksAndMessage(err, "marshaling %v failed: %v", info, err) 82 | } 83 | return b.Put([][]byte{[]byte(idOrPath)}, bs) 84 | }) 85 | } 86 | 87 | func UpdatePackageHistory(site, path string, f func(*gpb.HistoryInfo) error) error { 88 | return updateHistory(pkgsRoot, site, path, f) 89 | } 90 | 91 | func AppendPackageEvent(site, path, foundWay string, t time.Time, a gpb.HistoryEvent_Action_Enum) error { 92 | return UpdatePackageHistory(site, path, func(hi *gpb.HistoryInfo) error { 93 | if hi.FoundTime == nil { 94 | // The first 
time the package was found 95 | hi.FoundTime, _ = ptypes.TimestampProto(t) 96 | hi.FoundWay = foundWay 97 | } 98 | if a == gpb.HistoryEvent_Action_None { 99 | return nil 100 | } 101 | // Insert the event 102 | tsp, _ := ptypes.TimestampProto(t) 103 | hi.Events = append([]*gpb.HistoryEvent{{ 104 | Action: a, 105 | Timestamp: tsp, 106 | }}, hi.Events...) 107 | if len(hi.Events) > maxHistoryEvents { 108 | hi.Events = hi.Events[:maxHistoryEvents] 109 | } 110 | switch a { 111 | case gpb.HistoryEvent_Action_Success: 112 | hi.LatestSuccess = tsp 113 | case gpb.HistoryEvent_Action_Failed: 114 | hi.LatestFailed = tsp 115 | } 116 | return nil 117 | }) 118 | } 119 | 120 | func UpdatePersonHistory(site, path string, f func(*gpb.HistoryInfo) error) error { 121 | return updateHistory(personsRoot, site, path, f) 122 | } 123 | 124 | func deleteHistory(root []byte, site, idOrPath string) error { 125 | return box.Update(func(tx bh.Tx) error { 126 | return tx.Delete([][]byte{historyRoot, root, []byte(site), []byte(idOrPath)}) 127 | }) 128 | } 129 | 130 | func DeletePackageHistory(site, path string) error { 131 | return deleteHistory(pkgsRoot, site, path) 132 | } 133 | 134 | func DeletePersonHistory(site, path string) error { 135 | return deleteHistory(personsRoot, site, path) 136 | } 137 | -------------------------------------------------------------------------------- /store/history_test.go: -------------------------------------------------------------------------------- 1 | package store 2 | 3 | import ( 4 | "os" 5 | "testing" 6 | "time" 7 | 8 | "github.com/golang/protobuf/ptypes" 9 | "github.com/golangplus/testing/assert" 10 | 11 | "github.com/daviddengcn/bolthelper" 12 | "github.com/daviddengcn/go-villa" 13 | 14 | gpb "github.com/daviddengcn/gcse/shared/proto" 15 | ) 16 | 17 | func TestUpdateReadDeletePackageHistory(t *testing.T) { 18 | const ( 19 | site = "TestUpdateReadDeletePackageHistory.com" 20 | path = "gcse" 21 | foundWay = "testing" 22 | ) 23 | assert.NoError(t, 
UpdatePackageHistory(site, path, func(info *gpb.HistoryInfo) error { 24 | assert.Equal(t, "info", info, &gpb.HistoryInfo{}) 25 | info.FoundWay = foundWay 26 | return nil 27 | })) 28 | h, err := ReadPackageHistory(site, path) 29 | assert.NoError(t, err) 30 | assert.Equal(t, "h", h, &gpb.HistoryInfo{FoundWay: foundWay}) 31 | 32 | assert.NoError(t, DeletePackageHistory(site, path)) 33 | 34 | h, err = ReadPackageHistory(site, path) 35 | assert.NoError(t, err) 36 | assert.Equal(t, "h", h, &gpb.HistoryInfo{}) 37 | } 38 | 39 | func TestAppendPackageEvent(t *testing.T) { 40 | const ( 41 | site = "TestAppendPackageEvent.com" 42 | path = "gcse" 43 | foundWay = "test" 44 | ) 45 | // Insert a found only event, no action. 46 | foundTm := time.Now() 47 | foundTs, _ := ptypes.TimestampProto(foundTm) 48 | assert.NoError(t, AppendPackageEvent(site, path, "test", foundTm, gpb.HistoryEvent_Action_None)) 49 | h, err := ReadPackageHistory(site, path) 50 | assert.NoError(t, err) 51 | assert.Equal(t, "h", h, &gpb.HistoryInfo{FoundWay: foundWay, FoundTime: foundTs}) 52 | 53 | // Inser a Success action 54 | succTm := foundTm.Add(time.Hour) 55 | succTs, _ := ptypes.TimestampProto(succTm) 56 | assert.NoError(t, AppendPackageEvent(site, path, "non-test", succTm, gpb.HistoryEvent_Action_Success)) 57 | h, err = ReadPackageHistory(site, path) 58 | assert.NoError(t, err) 59 | assert.Equal(t, "h", h, &gpb.HistoryInfo{ 60 | FoundWay: foundWay, 61 | FoundTime: foundTs, 62 | Events: []*gpb.HistoryEvent{{ 63 | Timestamp: succTs, 64 | Action: gpb.HistoryEvent_Action_Success, 65 | }}, 66 | LatestSuccess: succTs, 67 | }) 68 | // Inser a Failed action 69 | failedTm := succTm.Add(time.Hour) 70 | failedTs, _ := ptypes.TimestampProto(failedTm) 71 | assert.NoError(t, AppendPackageEvent(site, path, "", failedTm, gpb.HistoryEvent_Action_Failed)) 72 | h, err = ReadPackageHistory(site, path) 73 | assert.NoError(t, err) 74 | assert.Equal(t, "h", h, &gpb.HistoryInfo{ 75 | FoundWay: foundWay, 76 | FoundTime: 
foundTs, 77 | Events: []*gpb.HistoryEvent{{ 78 | Timestamp: failedTs, 79 | Action: gpb.HistoryEvent_Action_Failed, 80 | }, { 81 | Timestamp: succTs, 82 | Action: gpb.HistoryEvent_Action_Success, 83 | }}, 84 | LatestSuccess: succTs, 85 | LatestFailed: failedTs, 86 | }) 87 | } 88 | 89 | func TestUpdateReadDeletePersonHistory(t *testing.T) { 90 | const ( 91 | site = "TestUpdateReadDeletePersonHistory.com" 92 | id = "daviddengcn" 93 | foundWay = "testing" 94 | ) 95 | assert.NoError(t, UpdatePersonHistory(site, id, func(info *gpb.HistoryInfo) error { 96 | assert.Equal(t, "info", info, &gpb.HistoryInfo{}) 97 | info.FoundWay = foundWay 98 | return nil 99 | })) 100 | h, err := ReadPersonHistory(site, id) 101 | assert.NoError(t, err) 102 | assert.Equal(t, "h", h, &gpb.HistoryInfo{FoundWay: foundWay}) 103 | 104 | assert.NoError(t, DeletePersonHistory(site, id)) 105 | 106 | h, err = ReadPersonHistory(site, id) 107 | assert.NoError(t, err) 108 | assert.Equal(t, "h", h, &gpb.HistoryInfo{}) 109 | } 110 | 111 | func TestSaveSnapshot(t *testing.T) { 112 | const ( 113 | site = "TestUpdateReadDeletePackageHistory.com" 114 | path = "gcse" 115 | foundWay = "testing" 116 | ) 117 | assert.NoError(t, UpdatePackageHistory(site, path, func(info *gpb.HistoryInfo) error { 118 | assert.Equal(t, "info", info, &gpb.HistoryInfo{}) 119 | info.FoundWay = foundWay 120 | return nil 121 | })) 122 | h, err := ReadPackageHistory(site, path) 123 | assert.NoError(t, err) 124 | assert.Equal(t, "h", h, &gpb.HistoryInfo{FoundWay: foundWay}) 125 | 126 | outPath := villa.Path(os.TempDir()).Join("TestSaveSnapshot").S() 127 | assert.NoError(t, SaveSnapshot(outPath)) 128 | box := &bh.RefCountBox{ 129 | DataPath: func() string { 130 | return outPath 131 | }, 132 | } 133 | h, err = ReadPackageHistoryOf(box, site, path) 134 | assert.NoError(t, err) 135 | assert.Equal(t, "h", h, &gpb.HistoryInfo{FoundWay: foundWay}) 136 | } 137 | -------------------------------------------------------------------------------- 
/configs/configs.go: -------------------------------------------------------------------------------- 1 | // Package configs define and load all configurations. It depends on no othe GCSE packages. 2 | package configs 3 | 4 | import ( 5 | "log" 6 | "os" 7 | "time" 8 | 9 | "github.com/golangplus/strings" 10 | 11 | "github.com/daviddengcn/gcse/utils" 12 | "github.com/daviddengcn/go-easybi" 13 | "github.com/daviddengcn/go-ljson-conf" 14 | "github.com/daviddengcn/go-villa" 15 | "github.com/daviddengcn/sophie" 16 | ) 17 | 18 | const ( 19 | fnCrawlerDB = "crawler" 20 | 21 | fnToCrawl = "tocrawl" 22 | FnPackage = "package" 23 | FnPerson = "person" 24 | // key: RawString, value: DocInfo 25 | FnDocs = "docs" 26 | FnNewDocs = "newdocs" 27 | 28 | FnStore = "store" 29 | ) 30 | 31 | var ( 32 | ServerAddr = ":8080" 33 | ServerRoot = villa.Path("./service/web") 34 | 35 | LoadTemplatePass = "" 36 | AutoLoadTemplate = false 37 | 38 | DataRoot = villa.Path("./data/") 39 | 40 | // producer: server, consumer: crawler 41 | ImportPath villa.Path 42 | 43 | // producer: crawler, consumer: indexer 44 | DBOutPath villa.Path 45 | 46 | // configures of crawler 47 | CrawlByGodocApi = true 48 | CrawlGithubUpdate = true 49 | CrawlerDuePerRun = 1 * time.Hour 50 | CrawlerGithubClientID = "" 51 | CrawlerGithubClientSecret = "" 52 | CrawlerGithubPersonal = "" 53 | 54 | BiWebPath = "/bi" 55 | 56 | NonCrawlHosts = stringsp.Set{} 57 | NonStorePackageRegexps = []string{} 58 | 59 | StoreDAddr = ":8081" 60 | 61 | LogDir = "/tmp" 62 | ) 63 | 64 | func init() { 65 | log.SetFlags(log.Flags() | log.Lshortfile) 66 | 67 | conf, err := ljconf.Load("conf.json") 68 | if err != nil { 69 | // we must make sure configuration exist 70 | log.Fatal(err) 71 | } 72 | ServerAddr = conf.String("web.addr", ServerAddr) 73 | ServerRoot = conf.Path("web.root", ServerRoot) 74 | LoadTemplatePass = conf.String("web.loadtemplatepass", LoadTemplatePass) 75 | AutoLoadTemplate = conf.Bool("web.autoloadtemplate", AutoLoadTemplate) 76 | 
77 | DataRoot = conf.Path("back.dbroot", DataRoot) 78 | 79 | ImportPath = DataRoot.Join("imports") 80 | ImportPath.MkdirAll(0755) 81 | 82 | DBOutPath = DataRoot.Join("dbout") 83 | DBOutPath.MkdirAll(0755) 84 | 85 | CrawlByGodocApi = conf.Bool("crawler.godoc", CrawlByGodocApi) 86 | CrawlGithubUpdate = conf.Bool("crawler.github_update", CrawlGithubUpdate) 87 | CrawlerDuePerRun = conf.Duration("crawler.due_per_run", CrawlerDuePerRun) 88 | 89 | ncHosts := conf.StringList("crawler.noncrawl_hosts", nil) 90 | NonCrawlHosts.Add(ncHosts...) 91 | 92 | CrawlerGithubClientID = conf.String("crawler.github.clientid", "") 93 | CrawlerGithubClientSecret = conf.String("crawler.github.clientsecret", "") 94 | CrawlerGithubPersonal = conf.String("crawler.github.personal", "") 95 | 96 | NonStorePackageRegexps = conf.StringList("docdb.nonstore_regexps", nil) 97 | 98 | bi.DataPath = conf.String("bi.data_path", "/tmp/gcse.bolt") 99 | BiWebPath = conf.String("bi.web_path", BiWebPath) 100 | 101 | StoreDAddr = conf.String("stored.addr", StoreDAddr) 102 | 103 | LogDir = conf.String("log.dir", LogDir) 104 | } 105 | 106 | func DataRootFsPath() sophie.FsPath { 107 | return sophie.LocalFsPath(DataRoot.S()) 108 | } 109 | 110 | func CrawlerDBPath() villa.Path { 111 | return DataRoot.Join(fnCrawlerDB) 112 | } 113 | 114 | func CrawlerDBFsPath() sophie.FsPath { 115 | return DataRootFsPath().Join(fnCrawlerDB) 116 | } 117 | 118 | func DocsDBPath() string { 119 | return DataRoot.Join(FnDocs).S() 120 | } 121 | 122 | func DocsDBFsPath() sophie.FsPath { 123 | return DataRootFsPath().Join(FnDocs) 124 | } 125 | 126 | func ToCrawlPath() string { 127 | return DataRoot.Join(fnToCrawl).S() 128 | } 129 | 130 | func ToCrawlFsPath() sophie.FsPath { 131 | return DataRootFsPath().Join(fnToCrawl) 132 | } 133 | 134 | func IndexPath() villa.Path { 135 | return DataRoot.Join("index") 136 | } 137 | 138 | func StoreBoltPath() string { 139 | return DataRoot.Join("store.bolt").S() 140 | } 141 | 142 | func FileCacheBoltPath() 
string { 143 | return DataRoot.Join("filecache.bolt").S() 144 | } 145 | 146 | func SetTestingDataPath() { 147 | DataRoot = villa.Path(os.TempDir()).Join("gcse_testing") 148 | DataRoot.RemoveAll() 149 | DataRoot.MkdirAll(0755) 150 | log.Printf("DataRoot: %v", DataRoot) 151 | } 152 | 153 | // Returns the segments imported from web site. 154 | func ImportSegments() utils.Segments { 155 | return utils.Segments(ImportPath) 156 | } 157 | 158 | func DBOutSegments() utils.Segments { 159 | return utils.Segments(DBOutPath) 160 | } 161 | 162 | func IndexSegments() utils.Segments { 163 | return utils.Segments(IndexPath()) 164 | } 165 | -------------------------------------------------------------------------------- /pipelines/spider/spider.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "time" 7 | 8 | "github.com/golangplus/container/heap" 9 | "github.com/golangplus/errors" 10 | "github.com/golangplus/time" 11 | 12 | "github.com/daviddengcn/gcse/configs" 13 | "github.com/daviddengcn/gcse/spider/github" 14 | "github.com/daviddengcn/gcse/store" 15 | 16 | gpb "github.com/daviddengcn/gcse/shared/proto" 17 | ) 18 | 19 | type RepositoryInfo struct { 20 | *gpb.Repository 21 | 22 | User string 23 | Name string 24 | } 25 | 26 | func needCrawl(r *gpb.Repository) bool { 27 | if r.CrawlingInfo == nil { 28 | return true 29 | } 30 | return r.CrawlingInfo.CrawlingTimeAsTime().Before(time.Now().Add(-timep.Day)) 31 | } 32 | 33 | func shouldCrawlLater(a, b *RepositoryInfo) bool { 34 | if a.CrawlingInfo == nil { 35 | if b.CrawlingInfo == nil { 36 | return a.Name+a.User < b.Name+b.User 37 | } 38 | return false 39 | } 40 | if b.CrawlingInfo == nil { 41 | return true 42 | } 43 | return a.CrawlingInfo.CrawlingTimeAsTime().After(b.CrawlingInfo.CrawlingTimeAsTime()) 44 | } 45 | 46 | func selectRepos(site string, maxCrawl int) ([]*RepositoryInfo, error) { 47 | repos := heap.NewInterfaces(func(x, y 
interface{}) bool { 48 | return shouldCrawlLater(x.(*RepositoryInfo), y.(*RepositoryInfo)) 49 | }, maxCrawl) 50 | if err := store.ForEachRepositoryOfSite(site, func(user, name string, doc *gpb.Repository) error { 51 | if !needCrawl(doc) { 52 | return nil 53 | } 54 | ri := &RepositoryInfo{ 55 | User: user, 56 | Name: name, 57 | Repository: doc, 58 | } 59 | repos.TopNPush(ri) 60 | return nil 61 | }); err != nil { 62 | return nil, err 63 | } 64 | res := make([]*RepositoryInfo, 0, repos.Len()) 65 | for _, r := range repos.PopAll() { 66 | res = append(res, r.(*RepositoryInfo)) 67 | } 68 | return res, nil 69 | } 70 | 71 | var githubSpider *github.Spider 72 | var now timep.NowFunc = time.Now 73 | 74 | func crawlRepo(ctx context.Context, site string, repo *RepositoryInfo) error { 75 | if site != "github.com" { 76 | return errorsp.NewWithStacks("Cannot crawl the repository in %v", site) 77 | } 78 | repo.CrawlingInfo = &gpb.CrawlingInfo{} 79 | repo.CrawlingInfo.SetCrawlingTime(now()) 80 | 81 | sha, err := githubSpider.RepoBranchSHA(ctx, repo.User, repo.Name, repo.Branch) 82 | if err != nil { 83 | return err 84 | } 85 | if repo.Signature == sha { 86 | return nil 87 | } 88 | repo.Signature = sha 89 | 90 | repo.Packages = make(map[string]*gpb.Package) 91 | if err := githubSpider.ReadRepo(ctx, repo.User, repo.Name, repo.Signature, func(path string, doc *gpb.Package) error { 92 | log.Printf("Package: %v", doc) 93 | repo.Packages[path] = doc 94 | return nil 95 | }); err != nil { 96 | return err 97 | } 98 | return nil 99 | } 100 | 101 | func crawlAndSaveRepo(ctx context.Context, site string, repo *RepositoryInfo) error { 102 | if err := crawlRepo(ctx, site, repo); err != nil { 103 | if errorsp.Cause(err) == github.ErrInvalidRepository { 104 | // Remove the repo entry. 
105 | return store.DeleteRepository(site, repo.User, repo.Name) 106 | } 107 | return err 108 | } 109 | return store.UpdateRepository(site, repo.User, repo.Name, func(doc *gpb.Repository) error { 110 | *doc = *repo.Repository 111 | return nil 112 | }) 113 | } 114 | 115 | func crawl(ctx context.Context, site string, out chan error, maxCrawl int, dur time.Duration) { 116 | repos, err := selectRepos(site, maxCrawl) 117 | if err != nil { 118 | out <- err 119 | return 120 | } 121 | log.Printf("%d repos selected", len(repos)) 122 | var anyErr error 123 | for _, repo := range repos { 124 | if err := crawlAndSaveRepo(ctx, site, repo); err != nil { 125 | anyErr = err 126 | log.Printf("crawlAndSaveRepo %v %v %v failed: %v", site, repo.User, repo.Name, err) 127 | } 128 | } 129 | out <- anyErr 130 | } 131 | 132 | func exec(maxCrawl int, dur time.Duration) error { 133 | out := make(chan error) 134 | n := 0 135 | anyErr := store.ForEachRepositorySite(func(site string) error { 136 | n++ 137 | go crawl(context.Background(), site, out, maxCrawl, dur) 138 | return nil 139 | }) 140 | if anyErr != nil { 141 | log.Printf("ForEachRepositorySite failed: %v", anyErr) 142 | } 143 | log.Printf("Waiting for %d site(s)...", n) 144 | for ; n > 0; n-- { 145 | if e := <-out; e != nil { 146 | log.Printf("Error from out: %v", e) 147 | anyErr = e 148 | } 149 | } 150 | return anyErr 151 | } 152 | 153 | func main() { 154 | log.Printf("Using Github personal token: %v", configs.CrawlerGithubPersonal) 155 | githubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal) 156 | 157 | if err := exec(1000, configs.CrawlerDuePerRun); err != nil { 158 | log.Fatalf("exec failed: %v", err) 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /utils/segment.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "io/ioutil" 5 | "log" 6 | "os" 7 | "path/filepath" 8 | "strconv" 9 | 10 | 
"github.com/golangplus/errors" 11 | "github.com/golangplus/strings" 12 | ) 13 | 14 | const ( 15 | fnDone = ".done" 16 | ) 17 | 18 | type Segment string 19 | 20 | func (s Segment) Make() error { 21 | return os.MkdirAll(string(s), 0755) 22 | } 23 | 24 | func (s Segment) Name() string { 25 | return filepath.Base(string(s)) 26 | } 27 | 28 | func (s Segment) Join(name string) string { 29 | if name == "" { 30 | return string(s) 31 | } 32 | return filepath.Join(string(s), name) 33 | } 34 | 35 | func (s Segment) IsDone() bool { 36 | _, err := os.Stat(s.Join(fnDone)) 37 | return err == nil 38 | } 39 | 40 | func (s Segment) Done() error { 41 | if err := os.MkdirAll(string(s), 0755); err != nil { 42 | return err 43 | } 44 | f, err := os.Create(s.Join(fnDone)) 45 | if err != nil { 46 | return errorsp.WithStacks(err) 47 | } 48 | return errorsp.WithStacks(f.Close()) 49 | } 50 | 51 | func (s Segment) ListFiles() ([]string, error) { 52 | files, err := ioutil.ReadDir(string(s)) 53 | if err != nil { 54 | return nil, errorsp.WithStacks(err) 55 | } 56 | list := make([]string, 0, len(files)) 57 | for _, f := range files { 58 | if f.Name() == fnDone { 59 | continue 60 | } 61 | list = append(list, filepath.Join(string(s), f.Name())) 62 | } 63 | return list, nil 64 | } 65 | 66 | func (s Segment) Remove() error { 67 | return errorsp.WithStacks(os.RemoveAll(string(s))) 68 | } 69 | 70 | type Segments string 71 | 72 | func (ss Segments) Join(sub string) Segment { 73 | return Segment(filepath.Join(string(ss), sub)) 74 | } 75 | 76 | func (ss Segments) ListAll() ([]Segment, error) { 77 | files, err := ioutil.ReadDir(string(ss)) 78 | if err != nil { 79 | if os.IsNotExist(errorsp.Cause(err)) { 80 | // Returns empty slice if the folder does not exist. 81 | return nil, nil 82 | } 83 | return nil, errorsp.WithStacks(err) 84 | } 85 | segms := make([]Segment, 0, len(files)) 86 | for _, f := range files { 87 | if !f.IsDir() { 88 | // A segment is always a folder. 
89 | continue 90 | } 91 | segms = append(segms, ss.Join(f.Name())) 92 | } 93 | return segms, nil 94 | } 95 | 96 | func (ss Segments) ListDones() ([]Segment, error) { 97 | segms, err := ss.ListAll() 98 | if err != nil { 99 | return nil, err 100 | } 101 | dones := make([]Segment, 0, len(segms)) 102 | for _, s := range segms { 103 | if s.IsDone() { 104 | dones = append(dones, s) 105 | } 106 | } 107 | return dones, nil 108 | } 109 | 110 | func SegmentLess(a, b Segment) bool { 111 | numA, errA := strconv.Atoi(a.Name()) 112 | numB, errB := strconv.Atoi(b.Name()) 113 | 114 | if errA != nil { 115 | if errB != nil { 116 | // both non-numbers 117 | return a.Name() < b.Name() 118 | } 119 | // non < number 120 | return true 121 | } 122 | if errB != nil { 123 | // number > non 124 | return false 125 | } 126 | // number comparison 127 | return numA < numB 128 | } 129 | 130 | func (ss Segments) FindMaxDone() (Segment, error) { 131 | var maxS Segment 132 | dones, err := ss.ListDones() 133 | if err != nil { 134 | return "", errorsp.WithStacks(err) 135 | } 136 | for _, s := range dones { 137 | if maxS == "" || SegmentLess(maxS, s) { 138 | maxS = s 139 | } 140 | } 141 | return maxS, nil 142 | } 143 | 144 | func makeSegment(s Segment) (Segment, error) { 145 | return s, os.MkdirAll(string(s), 0755) 146 | } 147 | 148 | func (ss Segments) GenNewSegment() (Segment, error) { 149 | curSs, err := ss.ListAll() 150 | if err != nil { 151 | return "", errorsp.WithStacks(err) 152 | } 153 | 154 | var nset stringsp.Set 155 | for _, s := range curSs { 156 | nset.Add(s.Name()) 157 | } 158 | 159 | for i := 0; ; i++ { 160 | fn := strconv.Itoa(i) 161 | if nset.Contain(fn) { 162 | continue 163 | } 164 | return makeSegment(ss.Join(fn)) 165 | } 166 | } 167 | 168 | func (ss Segments) GenMaxSegment() (Segment, error) { 169 | var maxS Segment 170 | dones, err := ss.ListAll() 171 | if err != nil { 172 | return "", errorsp.WithStacks(err) 173 | } 174 | for _, s := range dones { 175 | if maxS == "" || 
SegmentLess(maxS, s) { 176 | maxS = s 177 | } 178 | } 179 | if maxS == "" { 180 | return makeSegment(ss.Join("0")) 181 | } 182 | num, err := strconv.Atoi(maxS.Name()) 183 | if err != nil { 184 | return makeSegment(ss.Join("0")) 185 | } 186 | return makeSegment(ss.Join(strconv.Itoa(num + 1))) 187 | } 188 | 189 | func (ss Segments) ClearUndones() error { 190 | segms, err := ss.ListAll() 191 | if err != nil { 192 | return err 193 | } 194 | for _, segm := range segms { 195 | if !segm.IsDone() { 196 | if err := segm.Remove(); err != nil { 197 | return err 198 | } 199 | log.Printf("Undone segment %v is removed!", segm) 200 | } 201 | } 202 | return nil 203 | } 204 | -------------------------------------------------------------------------------- /pipelines/mergedocs/mergedocs.go: -------------------------------------------------------------------------------- 1 | // Input 2 | // FnDocs 3 | // FnNewDocs 4 | package main 5 | 6 | import ( 7 | // "fmt" 8 | "io" 9 | "log" 10 | "regexp" 11 | "sync/atomic" 12 | 13 | "github.com/golangplus/errors" 14 | "github.com/golangplus/strings" 15 | 16 | "github.com/daviddengcn/gcse" 17 | "github.com/daviddengcn/gcse/configs" 18 | "github.com/daviddengcn/go-villa" 19 | "github.com/daviddengcn/sophie" 20 | "github.com/daviddengcn/sophie/kv" 21 | "github.com/daviddengcn/sophie/mr" 22 | ) 23 | 24 | func main() { 25 | log.Println("Merging new crawled docs back...") 26 | 27 | var nonStorePackage *regexp.Regexp 28 | if len(configs.NonStorePackageRegexps) > 0 { 29 | nonStorePackage = regexp.MustCompile( 30 | stringsp.FullJoin(configs.NonStorePackageRegexps, "(", ")|(", ")")) 31 | } 32 | 33 | fpDataRoot := sophie.LocalFsPath(configs.DataRoot.S()) 34 | 35 | fpCrawler := configs.CrawlerDBFsPath() 36 | outDocsUpdated := kv.DirOutput(fpDataRoot.Join("docs-updated")) 37 | outDocsUpdated.Clean() 38 | 39 | var cntDeleted, cntUpdated, cntNew, cntUnchanged int64 40 | 41 | job := mr.MrJob{ 42 | Source: []mr.Input{ 43 | 
kv.DirInput(fpDataRoot.Join(configs.FnDocs)), // 0 44 | kv.DirInput(fpCrawler.Join(configs.FnNewDocs)), // 1 45 | }, 46 | 47 | NewMapperF: func(src, part int) mr.Mapper { 48 | if src == 0 { 49 | // Mapper for docs 50 | return &mr.MapperStruct{ 51 | NewKeyF: sophie.NewRawString, 52 | NewValF: gcse.NewDocInfo, 53 | MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { 54 | pkg := key.(*sophie.RawString).String() 55 | di := val.(*gcse.DocInfo) 56 | act := gcse.NewDocAction{ 57 | Action: gcse.NDA_ORIGINAL, 58 | DocInfo: *di, 59 | } 60 | part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) 61 | return c.CollectTo(part, key, &act) 62 | }, 63 | } 64 | } 65 | // Mapper for new docs 66 | return &mr.MapperStruct{ 67 | NewKeyF: sophie.NewRawString, 68 | NewValF: gcse.NewNewDocAction, 69 | MapF: func(key, val sophie.SophieWriter, c mr.PartCollector) error { 70 | pkg := string(*key.(*sophie.RawString)) 71 | part := gcse.CalcPackagePartition(pkg, gcse.DOCS_PARTS) 72 | return c.CollectTo(part, key, val) 73 | }, 74 | } 75 | }, 76 | 77 | Sorter: mr.NewFileSorter(fpDataRoot.Join("tmp")), 78 | 79 | NewReducerF: func(part int) mr.Reducer { 80 | return &mr.ReducerStruct{ 81 | NewKeyF: sophie.NewRawString, 82 | NewValF: gcse.NewNewDocAction, 83 | ReduceF: func(key sophie.SophieWriter, 84 | nextVal mr.SophierIterator, c []sophie.Collector) error { 85 | 86 | if nonStorePackage != nil { 87 | pkg := string(*key.(*sophie.RawString)) 88 | if nonStorePackage.MatchString(pkg) { 89 | log.Printf("Ignoring non-store pkg: %s", pkg) 90 | return nil 91 | } 92 | } 93 | 94 | var act gcse.DocInfo 95 | isSet := false 96 | isUpdated := false 97 | hasOriginal := false 98 | for { 99 | val, err := nextVal() 100 | if errorsp.Cause(err) == io.EOF { 101 | break 102 | } 103 | if err != nil { 104 | return err 105 | } 106 | 107 | cur := val.(*gcse.NewDocAction) 108 | switch cur.Action { 109 | case gcse.NDA_DEL: 110 | // not collect out to delete it 111 | atomic.AddInt64(&cntDeleted, 1) 112 | 
return nil 113 | 114 | case gcse.NDA_ORIGINAL: 115 | hasOriginal = true 116 | } 117 | 118 | if !isSet { 119 | isSet = true 120 | act = cur.DocInfo 121 | } else { 122 | if cur.LastUpdated.After(act.LastUpdated) { 123 | isUpdated = true 124 | act = cur.DocInfo 125 | } 126 | } 127 | } 128 | 129 | if isSet { 130 | if isUpdated { 131 | atomic.AddInt64(&cntUpdated, 1) 132 | } else if hasOriginal { 133 | atomic.AddInt64(&cntUnchanged, 1) 134 | } else { 135 | atomic.AddInt64(&cntNew, 1) 136 | } 137 | return c[0].Collect(key, &act) 138 | } else { 139 | return nil 140 | } 141 | }, 142 | } 143 | }, 144 | 145 | Dest: []mr.Output{ 146 | outDocsUpdated, 147 | }, 148 | } 149 | 150 | if err := job.Run(); err != nil { 151 | log.Fatalf("job.Run failed: %v", err) 152 | } 153 | 154 | log.Printf("Deleted: %v", cntDeleted) 155 | log.Printf("Updated: %v", cntUpdated) 156 | log.Printf("New: %v", cntNew) 157 | log.Printf("Unchanged: %v", cntUnchanged) 158 | 159 | pDocs := villa.Path(configs.DocsDBPath()) 160 | pUpdated := configs.DataRoot.Join("docs-updated") 161 | pTmp := configs.DataRoot.Join("docs-tmp") 162 | 163 | pTmp.RemoveAll() 164 | if err := pDocs.Rename(pTmp); err != nil { 165 | log.Fatalf("rename %v to %v failed: %v", pDocs, pTmp, err) 166 | } 167 | if err := pUpdated.Rename(pDocs); err != nil { 168 | log.Fatalf("rename %v to %v failed: %v", pUpdated, pDocs, err) 169 | } 170 | 171 | log.Println("Merging success...") 172 | } 173 | -------------------------------------------------------------------------------- /crawler_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "net/http" 7 | "strings" 8 | "testing" 9 | "time" 10 | 11 | "github.com/golangplus/bytes" 12 | "github.com/golangplus/testing/assert" 13 | 14 | "github.com/daviddengcn/gcse/configs" 15 | "github.com/daviddengcn/gcse/spider/github" 16 | "github.com/daviddengcn/gddo/doc" 17 | "github.com/daviddengcn/go-villa" 18 | 
) 19 | 20 | func TestReadmeToText(t *testing.T) { 21 | text := strings.TrimSpace(ReadmeToText("a.md", "#abc")) 22 | assert.Equal(t, "text", text, "abc") 23 | } 24 | 25 | func TestReadmeToText_Panic(t *testing.T) { 26 | ReadmeToText("a.md", "* [[t]](/t)") 27 | } 28 | 29 | func TestPlusone(t *testing.T) { 30 | url := "http://www.google.com/" 31 | cnt, err := Plusone(http.DefaultClient, url) 32 | assert.NoError(t, err) 33 | t.Logf("Plusone of %s: %d", url, cnt) 34 | // if cnt <= 0 { 35 | // t.Errorf("Zero Plusone count for %s", url) 36 | // } 37 | } 38 | 39 | func TestLikeButton(t *testing.T) { 40 | url := "http://www.facebook.com/" 41 | cnt, err := LikeButton(http.DefaultClient, url) 42 | if err != nil { 43 | t.Error(err) 44 | return 45 | } 46 | t.Logf("LikeButton of %s: %d", url, cnt) 47 | if cnt <= 0 { 48 | // t.Errorf("Zero LikeButton count for %s", url) 49 | } 50 | } 51 | 52 | func TestCrawlPackage(t *testing.T) { 53 | ctx := context.Background() 54 | 55 | if configs.CrawlerGithubClientID != "" { 56 | t.Logf("Github clientid: %s", configs.CrawlerGithubClientID) 57 | t.Logf("Github clientsecret: %s", configs.CrawlerGithubClientSecret) 58 | doc.SetGithubCredentials(configs.CrawlerGithubClientID, configs.CrawlerGithubClientSecret) 59 | } 60 | GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal) 61 | 62 | pkg := "github.com/daviddengcn/gcse" 63 | httpClient := GenHttpClient("") 64 | p, _, err := CrawlPackage(ctx, httpClient, pkg, "") 65 | if err != nil { 66 | if strings.Index(err.Error(), "403") == -1 { 67 | t.Error(err) 68 | } 69 | } else { 70 | assert.Equal(t, "pkg", p.Package, pkg) 71 | } 72 | 73 | // pkg = "git.gitorious.org/go-pkg/epubgo.git" 74 | // p, err = CrawlPackage(httpClient, pkg, "") 75 | // if err != nil { 76 | // if strings.Index(err.Error(), "403") == -1 { 77 | // t.Error(err) 78 | // } 79 | // } else { 80 | // assert.Equal(t, "pkg", p.Package, pkg) 81 | // } 82 | 83 | pkg = "thezombie.net/libgojira" 84 | p, _, err = 
CrawlPackage(ctx, httpClient, pkg, "") 85 | if err != nil { 86 | if !IsBadPackage(err) { 87 | t.Errorf("%s should be an invalid package", pkg) 88 | } 89 | } else { 90 | t.Errorf("%s should be an invalid package", pkg) 91 | } 92 | } 93 | 94 | func TestDocDB(t *testing.T) { 95 | var db DocDB = PackedDocDB{NewMemDB("", "")} 96 | 97 | info := DocInfo{ 98 | Name: "github.com/daviddengcn/gcse", 99 | } 100 | db.Put("hello", info) 101 | var info2 DocInfo 102 | if ok := db.Get("hello", &info2); !ok { 103 | t.Error("db.Get failed!") 104 | return 105 | } 106 | assert.StringEqual(t, "hello", info2, info) 107 | 108 | if err := db.Iterate(func(key string, val interface{}) error { 109 | info3, ok := val.(DocInfo) 110 | if !ok { 111 | return errors.New("errNotDocInfo") 112 | } 113 | 114 | assert.StringEqual(t, key, info3, info) 115 | return nil 116 | }); err != nil { 117 | t.Errorf("db.Iterate failed: %v", err) 118 | } 119 | } 120 | 121 | func TestDocDB_Export(t *testing.T) { 122 | var db DocDB = PackedDocDB{NewMemDB("", "")} 123 | 124 | info := DocInfo{ 125 | Name: "github.com/daviddengcn/gcse", 126 | } 127 | 128 | db.Put("go", info) 129 | 130 | if err := db.Export(villa.Path("."), "testexport_db"); err != nil { 131 | t.Errorf("db.Export failed: %v", err) 132 | return 133 | } 134 | 135 | var newDB DocDB = PackedDocDB{NewMemDB(villa.Path("."), "testexport_db")} 136 | count := 0 137 | if err := newDB.Iterate(func(key string, val interface{}) error { 138 | info, ok := val.(DocInfo) 139 | if !ok { 140 | return errors.New("Not a DocInfo object") 141 | } 142 | assert.StringEqual(t, "info.Name", info.Name, 143 | "github.com/daviddengcn/gcse") 144 | count++ 145 | return nil 146 | }); err != nil { 147 | t.Errorf("newDB.Iterate failed: %v", err) 148 | } 149 | 150 | assert.Equal(t, "count", count, 1) 151 | } 152 | 153 | func TestCrawlingEntry(t *testing.T) { 154 | src := CrawlingEntry{ 155 | ScheduleTime: time.Now(), 156 | Version: 19, 157 | Etag: "Hello", 158 | } 159 | 160 | var buf 
bytesp.Slice 161 | assert.NoError(t, src.WriteTo(&buf)) 162 | 163 | var dst CrawlingEntry 164 | assert.NoError(t, dst.ReadFrom(&buf, -1)) 165 | 166 | if got, want := dst.ScheduleTime, src.ScheduleTime; !got.Equal(want) { 167 | t.Errorf("dst.ScheduleTime = %v, want %v", got, want) 168 | } 169 | assert.Equal(t, "dst.Version", dst.Version, src.Version) 170 | assert.Equal(t, "dst.Etag", dst.Etag, src.Etag) 171 | } 172 | 173 | func TestFullProjectOfPackage(t *testing.T) { 174 | DATA := []string{ 175 | "github.com/daviddengcn/gcse", "github.com/daviddengcn/gcse", 176 | "github.com/daviddengcn/gcse/index", "github.com/daviddengcn/gcse", 177 | "code.google.com/p/go.net/websocket", "code.google.com/p/go.net", 178 | } 179 | 180 | for i := 0; i < len(DATA); i += 2 { 181 | pkg, prj := DATA[i], DATA[i+1] 182 | assert.Equal(t, "FullProjectOfPackage "+pkg, FullProjectOfPackage(pkg), prj) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /service/web/tops.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "net/http" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/golangplus/container/heap" 10 | "github.com/golangplus/strings" 11 | 12 | "github.com/daviddengcn/gcse" 13 | ) 14 | 15 | type StatItem struct { 16 | Index int 17 | Name string 18 | Package string 19 | Link string // no package, specify a link 20 | Info string 21 | } 22 | type StatList struct { 23 | Name string 24 | Info string 25 | Items []StatItem 26 | } 27 | 28 | type TopN struct { 29 | less func(a, b interface{}) bool 30 | pq heap.Interfaces 31 | n int 32 | } 33 | 34 | func NewTopN(less func(a, b interface{}) bool, n int) *TopN { 35 | return &TopN{ 36 | less: less, 37 | pq: heap.NewInterfaces(less, n), 38 | n: n, 39 | } 40 | } 41 | 42 | func (t *TopN) Append(item interface{}) { 43 | if t.pq.Len() < t.n { 44 | t.pq.Push(item) 45 | } else if t.less(t.pq.Peek(), item) { 46 | t.pq.Pop() 47 
| t.pq.Push(item) 48 | } 49 | } 50 | 51 | func (t *TopN) PopAll() []interface{} { 52 | return t.pq.PopAll() 53 | } 54 | 55 | func (t *TopN) Len() int { 56 | return t.pq.Len() 57 | } 58 | 59 | func inProjects(projs stringsp.Set, pkg string) bool { 60 | for { 61 | if projs.Contain(pkg) { 62 | return true 63 | } 64 | p := strings.LastIndex(pkg, "/") 65 | if p < 0 { 66 | break 67 | } 68 | pkg = pkg[:p] 69 | } 70 | 71 | return false 72 | } 73 | 74 | func statTops(N int) []StatList { 75 | db := getDatabase() 76 | if db == nil { 77 | return nil 78 | } 79 | var topStaticScores []gcse.HitInfo 80 | var tssProjects stringsp.Set 81 | 82 | topImported := NewTopN(func(a, b interface{}) bool { 83 | ia, ib := a.(gcse.HitInfo), b.(gcse.HitInfo) 84 | return ia.ImportedLen+ia.TestImportedLen < ib.ImportedLen+ib.TestImportedLen 85 | }, N) 86 | 87 | topTestStatic := NewTopN(func(a, b interface{}) bool { 88 | return a.(gcse.HitInfo).TestStaticScore < b.(gcse.HitInfo).TestStaticScore 89 | }, N) 90 | 91 | sites := make(map[string]int) 92 | 93 | db.Search(nil, func(_ int32, data interface{}) error { 94 | hit := data.(gcse.HitInfo) 95 | orgName := hit.Name 96 | hit.Name = packageShowName(hit.Name, hit.Package) 97 | 98 | // assuming all packages has been sorted by static-scores. 
99 | if len(topStaticScores) < N { 100 | if hit.ImportedLen > 0 && orgName != "" && orgName != "main" && !inProjects(tssProjects, hit.ProjectURL) { 101 | topStaticScores = append(topStaticScores, hit) 102 | tssProjects.Add(hit.ProjectURL) 103 | } 104 | } 105 | if hit.TestImportedLen > 0 { 106 | topTestStatic.Append(hit) 107 | } 108 | topImported.Append(hit) 109 | 110 | host := strings.ToLower(gcse.HostOfPackage(hit.Package)) 111 | if host != "" { 112 | sites[host] = sites[host] + 1 113 | } 114 | return nil 115 | }) 116 | tlStaticScore := StatList{ 117 | Name: "Hot", 118 | Info: "refs stars", 119 | Items: make([]StatItem, 0, len(topStaticScores)), 120 | } 121 | for idx, hit := range topStaticScores { 122 | tlStaticScore.Items = append(tlStaticScore.Items, StatItem{ 123 | Index: idx + 1, 124 | Name: hit.Name, 125 | Package: hit.Package, 126 | Info: fmt.Sprintf("%d %d", hit.ImportedLen, hit.StarCount), 127 | }) 128 | } 129 | tlTestStatic := StatList{ 130 | Name: "Hot Test", 131 | Info: "refs stars", 132 | Items: make([]StatItem, 0, topTestStatic.Len()), 133 | } 134 | for idx, item := range topTestStatic.PopAll() { 135 | hit := item.(gcse.HitInfo) 136 | tlTestStatic.Items = append(tlTestStatic.Items, StatItem{ 137 | Index: idx + 1, 138 | Name: hit.Name, 139 | Package: hit.Package, 140 | Info: fmt.Sprintf("%d %d", hit.TestImportedLen, hit.StarCount), 141 | }) 142 | } 143 | tlImported := StatList{ 144 | Name: "Most Imported", 145 | Info: "refs", 146 | Items: make([]StatItem, 0, topImported.Len()), 147 | } 148 | for idx, item := range topImported.PopAll() { 149 | hit := item.(gcse.HitInfo) 150 | tlImported.Items = append(tlImported.Items, StatItem{ 151 | Index: idx + 1, 152 | Name: hit.Name, 153 | Package: hit.Package, 154 | Info: fmt.Sprintf("%d", hit.ImportedLen+hit.TestImportedLen), 155 | }) 156 | } 157 | topSites := NewTopN(func(a, b interface{}) bool { 158 | return sites[a.(string)] < sites[b.(string)] 159 | }, N) 160 | for site := range sites { 161 | 
topSites.Append(site) 162 | } 163 | tlSites := StatList{ 164 | Name: "Sites", 165 | Info: "packages", 166 | Items: make([]StatItem, 0, topSites.Len()), 167 | } 168 | for idx, st := range topSites.PopAll() { 169 | site := st.(string) 170 | cnt := sites[site] 171 | tlSites.Items = append(tlSites.Items, StatItem{ 172 | Index: idx + 1, 173 | Name: site, 174 | Link: "http://" + site, 175 | Info: fmt.Sprintf("%d", cnt), 176 | }) 177 | } 178 | return []StatList{ 179 | tlStaticScore, tlTestStatic, tlImported, tlSites, 180 | } 181 | } 182 | 183 | func pageTops(w http.ResponseWriter, r *http.Request) { 184 | w.Header().Set("Content-Type", "text/html") 185 | 186 | N, _ := strconv.Atoi(r.FormValue("len")) 187 | if N < 20 { 188 | N = 20 189 | } else if N > 100 { 190 | N = 100 191 | } 192 | if err := templates.ExecuteTemplate(w, "tops.html", struct { 193 | UIUtils 194 | Lists []StatList 195 | }{ 196 | Lists: statTops(N), 197 | }); err != nil { 198 | http.Error(w, err.Error(), http.StatusInternalServerError) 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /service/web/web/infoapi.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "Go Search API" "api"}} 2 | {{define "apibody"}} 3 | ### Introduction 4 | 5 | Go Search API, or GSAPI, returns structured data with json or jsonp format. 6 | 7 | The path of GSAPI is "`/api`". 8 | 9 | Please put a link to http://go-search.org/ on you website if this API helps you. 10 | 11 | ### Shared parameters 12 | 13 | Field | Value 14 | -----------|------------------------------------------------------------------ 15 | `action` | Possible values: `package`, `tops`, `packages` 16 | `callback` | (optional) If provided, return jsonp code with this as the callback function.
The callback function takes two parameters: the first is an integer code, and the second is the returned value object.
[example](/api?action=tops&callback=myfunc) 17 | 18 | ### "package" Action 19 | 20 | Returns the information of a package. [example](/api?action=package&id=github.com%2fdaviddengcn%2fgcse) 21 | 22 | * Parameters 23 | 24 | Field | Value 25 | ---------|----------------------------------------------------------------- 26 | `action` | `package` 27 | `id` | The ID of the package. Same as the string used to import 28 | 29 | * Return value 30 | 31 | Field | Type | Value 32 | --------------|------------|----------------------------------------------- 33 | `Package` | `string` | Import path, ID of a package in GS 34 | `Name` | `string` | Package name 35 | `StarCount` | `int` | Number of stars(github, bitbucket, plus/like buttons) 36 | `Synopsis` | `string` | Synopsis(from package comment or documentation) 37 | `Description` | `string` | Detailed documents 38 | `TestImported`| `[]string` | List of packages that import this package only in test cases. 39 | `Imported` | `[]string` | List of packages that import this package 40 | `TestImports` | `[]string` | List of packages this package (only) test imports 41 | `Imports` | `[]string` | List of packages this package imports 42 | `ProjectURL` | `string` | URL of the project of this package 43 | `StaticRank` | `int` | Static rank of this package. One-based. 44 | 45 | 46 | ### "tops" Action 47 | 48 | Returns the [tops](/tops) tables. [example](/api?action=tops) 49 | 50 | * Parameters 51 | 52 | Key | Value 53 | ---------|------------------------------------------------------------------ 54 | `action` | `tops` 55 | `len` | (optional) The maximum number of entries in each table. Limited to [20, 100]. 56 | 57 | * Return value (An array of tables) 58 | 59 | Field | Type | Value 60 | --------|------------|----------------------------------------------- 61 | `Name` | `string` | Table title 62 | `Info` | `string` | Top right comments 63 | `Items` | `[]` | Items of the table. For each item:
`Name` is the anchor text,
`Package` is the package import path,
`Link` is the URL if the item is not a package,
`Info` is the information text on the second column 64 | 65 | 66 | ### "packages" Action 67 | 68 | Returns the ID array of all packages. [link](/api?action=packages) 69 | 70 | * Parameters 71 | 72 | Key | Value 73 | ---------|------------------------------------------------------------------ 74 | `action` | `packages` 75 | 76 | * Return values 77 | 78 | An array of strings, each of which is the ID (or import path) of a package. 79 | 80 | 81 | ### "package_depends" Action 82 | 83 | Returns an array of dependency information of all packages. [link](/api?action=package_depends) 84 | 85 | * Parameters 86 | 87 | Key | Value 88 | ---------|------------------------------------------------------------------ 89 | `action` | `package_depends` 90 | 91 | * Return values 92 | 93 | An array of the following struct: 94 | 95 | Field | Type | Value 96 | ---------------|------------|----------------------------------------------------------------- 97 | `Package` | `string` | Import path, ID of a package in GS 98 | `Name` | `string` | Package name 99 | `TestImported`| `[]string` | List of packages that import this package only in test cases. 100 | `Imported` | `[]string` | List of packages that import this package 101 | `TestImports` | `[]string` | List of packages this package (only) test imports 102 | `Imports` | `[]string` | List of packages this package imports 103 | 104 | 105 | ### "search" Action 106 | 107 | Returns the search result. [example](/api?action=search&q=gcse) 108 | 109 | * Parameters 110 | 111 | Key | Value 112 | ---------|------------------------------------------------------------------ 113 | `action` | `search` 114 | `q` | the query 115 | 116 | * Return values 117 | 118 | Field | Type | Value 119 | --------|------------|----------------------------------------------- 120 | `query` | `string` | the search query 121 | `hits` | `[]` | Hit entries. For each item:
`name` is the name of the project,
`package` is the package import path,
`projecturl` is the URL of the project,
`author` is the author name of the project,
`synopsis` is the brief introduction of the project,
`description` is the detailed introduction of the project. 122 | 123 | 124 | {{end}} 125 |
126 | {{markdown "apibody"}} 127 |
128 | {{template "footer.html"}} 129 | -------------------------------------------------------------------------------- /service/web/db.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "log" 6 | "os" 7 | "runtime" 8 | "sync/atomic" 9 | "time" 10 | 11 | "github.com/golangplus/strings" 12 | 13 | "github.com/daviddengcn/bolthelper" 14 | "github.com/daviddengcn/gcse" 15 | "github.com/daviddengcn/gcse/configs" 16 | "github.com/daviddengcn/gcse/utils" 17 | "github.com/daviddengcn/go-easybi" 18 | "github.com/daviddengcn/go-index" 19 | ) 20 | 21 | var ( 22 | databaseValue atomic.Value 23 | indexSegment utils.Segment 24 | ) 25 | 26 | type database interface { 27 | PackageCount() int 28 | ProjectCount() int 29 | IndexUpdated() time.Time 30 | Close() 31 | 32 | FindFullPackage(id string) (hit gcse.HitInfo, found bool) 33 | ForEachFullPackage(func(gcse.HitInfo) error) error 34 | PackageCountOfToken(field, token string) int 35 | Search(q map[string]stringsp.Set, out func(docID int32, data interface{}) error) error 36 | } 37 | 38 | type searcherDB struct { 39 | ts index.TokenSetSearcher 40 | hits *index.ConstArrayReader 41 | 42 | projectCount int 43 | indexUpdated time.Time 44 | 45 | storeDB *bh.RefCountBox 46 | } 47 | 48 | func (db *searcherDB) PackageCount() int { 49 | if db == nil { 50 | return 0 51 | } 52 | return db.ts.DocCount() 53 | } 54 | 55 | func (db *searcherDB) ProjectCount() int { 56 | if db == nil { 57 | return 0 58 | } 59 | return db.projectCount 60 | } 61 | 62 | func (db *searcherDB) IndexUpdated() time.Time { 63 | if db == nil { 64 | return time.Now() 65 | } 66 | return db.indexUpdated 67 | } 68 | 69 | func (db *searcherDB) Close() { 70 | if db == nil { 71 | return 72 | } 73 | db.hits.Close() 74 | } 75 | 76 | var notFoundInHits = errors.New("Not found in hits") 77 | 78 | func (db *searcherDB) FindFullPackage(id string) (gcse.HitInfo, bool) { 79 | if db == nil { 80 | 
log.Print("Database not loaded!") 81 | return gcse.HitInfo{}, false 82 | } 83 | var hit gcse.HitInfo 84 | found := false 85 | if err := db.ts.Search(index.SingleFieldQuery(gcse.IndexPkgField, id), func(docID int32, _ interface{}) error { 86 | h, err := db.hits.GetGob(int(docID)) 87 | if err != nil { 88 | return err 89 | } 90 | hit = h.(gcse.HitInfo) 91 | found = true 92 | return nil 93 | }); err != nil { 94 | return gcse.HitInfo{}, false 95 | } 96 | if !found { 97 | return gcse.HitInfo{}, false 98 | } 99 | return hit, true 100 | } 101 | 102 | func (db *searcherDB) ForEachFullPackage(out func(gcse.HitInfo) error) error { 103 | if db == nil { 104 | return nil 105 | } 106 | return db.hits.ForEachGob(func(_ int, hit interface{}) error { 107 | return out(hit.(gcse.HitInfo)) 108 | }) 109 | } 110 | 111 | func (db *searcherDB) PackageCountOfToken(field, token string) int { 112 | if db == nil { 113 | return 0 114 | } 115 | return len(db.ts.TokenDocList(field, token)) 116 | } 117 | 118 | func (db *searcherDB) Search(q map[string]stringsp.Set, out func(docID int32, data interface{}) error) error { 119 | if db == nil { 120 | return nil 121 | } 122 | return db.ts.Search(q, out) 123 | } 124 | 125 | func getDatabase() database { 126 | db, ok := databaseValue.Load().(database) 127 | if !ok { 128 | return (*searcherDB)(nil) 129 | } 130 | return db 131 | } 132 | 133 | func loadIndex() error { 134 | segm, err := configs.IndexSegments().FindMaxDone() 135 | if segm == "" || err != nil { 136 | return err 137 | } 138 | if indexSegment != "" && !utils.SegmentLess(indexSegment, segm) { 139 | // no new index 140 | return nil 141 | } 142 | db := &searcherDB{} 143 | if err := func() error { 144 | f, err := os.Open(segm.Join(gcse.IndexFn)) 145 | if err != nil { 146 | return err 147 | } 148 | defer f.Close() 149 | 150 | return db.ts.Load(f) 151 | }(); err != nil { 152 | return err 153 | } 154 | db.storeDB = &bh.RefCountBox{ 155 | DataPath: func() string { 156 | return segm.Join(configs.FnStore) 
157 | }, 158 | } 159 | hitsPath := segm.Join(gcse.HitsArrFn) 160 | if db.hits, err = index.OpenConstArray(hitsPath); err != nil { 161 | log.Printf("OpenConstArray %v failed: %v", hitsPath, err) 162 | return err 163 | } 164 | // Calculate db.projectCount 165 | var projects stringsp.Set 166 | db.ts.Search(nil, func(docID int32, data interface{}) error { 167 | hit := data.(gcse.HitInfo) 168 | projects.Add(hit.ProjectURL) 169 | return nil 170 | }) 171 | db.projectCount = len(projects) 172 | gcse.AddBiValueAndProcess(bi.Max, "index.proj-count", db.projectCount) 173 | 174 | // Update db.indexUpdated 175 | db.indexUpdated = time.Now() 176 | if st, err := os.Stat(segm.Join(gcse.IndexFn)); err == nil { 177 | db.indexUpdated = st.ModTime() 178 | } 179 | indexSegment = segm 180 | log.Printf("Load index from %v (%d packages)", segm, db.PackageCount()) 181 | 182 | // Exchange new/old database and close the old one. 183 | oldDB := getDatabase() 184 | databaseValue.Store(db) 185 | oldDB.Close() 186 | oldDB = nil 187 | utils.DumpMemStats() 188 | 189 | runtime.GC() 190 | utils.DumpMemStats() 191 | 192 | return nil 193 | } 194 | 195 | func loadIndexLoop() { 196 | for { 197 | time.Sleep(30 * time.Second) 198 | 199 | if err := loadIndex(); err != nil { 200 | log.Printf("loadIndex failed: %v", err) 201 | } 202 | bi.AddValue(bi.Max, "search.age_in_hours", int(time.Now().Sub(getDatabase().IndexUpdated()).Hours())) 203 | bi.AddValue(bi.Max, "search.age_in_mins", int(time.Now().Sub(getDatabase().IndexUpdated()).Minutes())) 204 | } 205 | } 206 | 207 | func processBi() { 208 | for { 209 | bi.Process() 210 | time.Sleep(time.Minute) 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /spider/github/github_test.go: -------------------------------------------------------------------------------- 1 | package github 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/golangplus/errors" 7 | "github.com/golangplus/testing/assert" 8 | 9 | sppb 
"github.com/daviddengcn/gcse/proto/spider" 10 | ) 11 | 12 | //func TestReadUser(t *testing.T) { 13 | // s := NewSpiderWithToken("") 14 | // assert.Should(t, s != nil, "s == nil") 15 | 16 | // da, err := s.ReadUser("daviddengcn") 17 | // assert.NoErrorOrDie(t, err) 18 | // assert.ValueShould(t, "len(da.Repos)", len(da.Repos), len(da.Repos) > 0, "> 0") 19 | //} 20 | 21 | //func TestReadRepository(t *testing.T) { 22 | // s := NewSpiderWithToken("") 23 | // assert.Should(t, s != nil, "s == nil") 24 | 25 | // repo, err := s.ReadRepository("daviddengcn", "gosl") 26 | // assert.NoErrorOrDie(t, err) 27 | // assert.ValueShould(t, "repo.Stars", repo.Stars, repo.Stars > 0, "> 0") 28 | //} 29 | 30 | //func TestReadPackage(t *testing.T) { 31 | // s := NewSpiderWithToken("") 32 | // assert.Should(t, s != nil, "s == nil") 33 | 34 | // pkg, folders, err := s.ReadPackage("daviddengcn", "gcse", "spider/github/testdata") 35 | // assert.NoErrorOrDie(t, err) 36 | // assert.Equal(t, "pkg.Name", pkg.Name, "pkg") 37 | // sort.Strings(pkg.Imports) 38 | // assert.Equal(t, "pkg.Imports", pkg.Imports, []string{ 39 | // "github.com/daviddengcn/gcse/spider/github", 40 | // "github.com/golangplus/strings", 41 | // }) 42 | // assert.Equal(t, "pkg.TestImports", pkg.TestImports, []string{"github.com/golangplus/testing/assert"}) 43 | // assert.Equal(t, "len(folders)", len(folders), 1) 44 | // assert.Equal(t, "folders[0].Name", folders[0].Name, "sub") 45 | // assert.Equal(t, "folders[0].Path", folders[0].Path, "spider/github/testdata/sub") 46 | //} 47 | 48 | //func TestSearchRepositories(t *testing.T) { 49 | // s := NewSpiderWithToken("") 50 | // assert.Should(t, s != nil, "s == nil") 51 | 52 | // rs, err := s.SearchRepositories("") 53 | // assert.NoErrorOrDie(t, err) 54 | // assert.ValueShould(t, "len(rs)", len(rs), len(rs) > 0, "> 0") 55 | //} 56 | 57 | func TestParseGoFile(t *testing.T) { 58 | fi := &sppb.GoFileInfo{} 59 | parseGoFile("g.go", []byte(` 60 | package main 61 | `+`// +build ignore 62 
| `), fi) 63 | assert.Equal(t, "fi", fi, &sppb.GoFileInfo{Status: sppb.GoFileInfo_ShouldIgnore}) 64 | } 65 | 66 | func TestRepoBranchSHA(t *testing.T) { 67 | s := NewSpiderWithContents(map[string]string{ 68 | "/repos/daviddengcn/repo-branch-sha/branches/master": ` 69 | { 70 | "name": "master", 71 | "commit": { 72 | "sha": "sha-1" 73 | } 74 | } 75 | `, 76 | }) 77 | sha, err := s.RepoBranchSHA("daviddengcn", "repo-branch-sha", "master") 78 | assert.NoError(t, err) 79 | assert.Equal(t, "sha", sha, "sha-1") 80 | } 81 | 82 | func TestRepoBranchSHA_NotFound(t *testing.T) { 83 | s := NewSpiderWithContents(map[string]string{}) 84 | _, err := s.RepoBranchSHA("noone", "nothing", "master") 85 | assert.Equal(t, "err", errorsp.Cause(err), ErrInvalidRepository) 86 | } 87 | 88 | func TestReadRepo(t *testing.T) { 89 | s := NewSpiderWithContents(map[string]string{ 90 | "/repos/daviddengcn/readrepo/branches/master": ` 91 | { 92 | "name": "master", 93 | "commit": { 94 | "sha": "sha-1" 95 | } 96 | } 97 | `, 98 | "/repos/daviddengcn/readrepo/git/trees/sha-1?recursive=1": ` 99 | { 100 | "sha": "sha-1", 101 | "tree": [ 102 | { 103 | "path": "a.go", 104 | "type": "blob", 105 | "sha": "sha-2" 106 | }, 107 | { 108 | "path": "sub/a.go", 109 | "type": "blob", 110 | "sha": "sha-2" 111 | } 112 | ], 113 | "truncated": false 114 | }`, 115 | "/repos/daviddengcn/readrepo/contents/a.go": ` 116 | { 117 | "name": "bi.go", 118 | "path": "bi.go", 119 | "sha": "sha-2", 120 | "content": "cGFja2FnZSBnY3NlCgppbXBvcnQgKAoJImdpdGh1Yi5jb20vZGF2aWRkZW5n\nY24vZ28tZWFzeWJpIgopCgpmdW5jIEFkZEJpVmFsdWVBbmRQcm9jZXNzKGFn\nZ3IgYmkuQWdncmVnYXRlTWV0aG9kLCBuYW1lIHN0cmluZywgdmFsdWUgaW50\nKSB7CgliaS5BZGRWYWx1ZShhZ2dyLCBuYW1lLCB2YWx1ZSkKCWJpLkZsdXNo\nKCkKCWJpLlByb2Nlc3MoKQp9Cg==\n", 121 | "encoding": "base64", 122 | "type": "file" 123 | } 124 | `, 125 | "/repos/daviddengcn/readrepo/contents/sub/a.go": ` 126 | { 127 | "name": "bi.go", 128 | "path": "bi.go", 129 | "sha": "sha-2", 130 | "content": 
"cGFja2FnZSBnY3NlCgppbXBvcnQgKAoJImdpdGh1Yi5jb20vZGF2aWRkZW5n\nY24vZ28tZWFzeWJpIgopCgpmdW5jIEFkZEJpVmFsdWVBbmRQcm9jZXNzKGFn\nZ3IgYmkuQWdncmVnYXRlTWV0aG9kLCBuYW1lIHN0cmluZywgdmFsdWUgaW50\nKSB7CgliaS5BZGRWYWx1ZShhZ2dyLCBuYW1lLCB2YWx1ZSkKCWJpLkZsdXNo\nKCkKCWJpLlByb2Nlc3MoKQp9Cg==\n", 131 | "encoding": "base64", 132 | "type": "file" 133 | } 134 | `, 135 | }) 136 | pkgs := make(map[string]*sppb.Package) 137 | assert.NoError(t, s.ReadRepo("daviddengcn", "readrepo", "sha-1", func(path string, pkg *sppb.Package) error { 138 | pkgs[path] = pkg 139 | return nil 140 | })) 141 | assert.Equal(t, "pkgs", pkgs, map[string]*sppb.Package{ 142 | "": &sppb.Package{ 143 | Name: "gcse", 144 | Path: "", 145 | Imports: []string{"github.com/daviddengcn/go-easybi"}, 146 | TestImports: []string{}, 147 | }, 148 | "/sub": &sppb.Package{ 149 | Name: "gcse", 150 | Path: "/sub", 151 | Imports: []string{"github.com/daviddengcn/go-easybi"}, 152 | TestImports: []string{}, 153 | }, 154 | }) 155 | } 156 | 157 | func TestReadRepo_NotFound(t *testing.T) { 158 | s := NewSpiderWithContents(map[string]string{}) 159 | assert.Equal(t, "err", errorsp.Cause(s.ReadRepo("noone", "nothing", "sha-1", nil)), ErrInvalidRepository) 160 | } 161 | -------------------------------------------------------------------------------- /pipelines/crawler/cmain.go: -------------------------------------------------------------------------------- 1 | /* 2 | GCSE Crawler background program. 
3 | */ 4 | package main 5 | 6 | import ( 7 | "context" 8 | "flag" 9 | "io" 10 | "log" 11 | "runtime" 12 | "time" 13 | 14 | "github.com/golangplus/errors" 15 | "github.com/golangplus/fmt" 16 | 17 | "github.com/daviddengcn/bolthelper" 18 | "github.com/daviddengcn/gcse" 19 | "github.com/daviddengcn/gcse/configs" 20 | "github.com/daviddengcn/gcse/spider" 21 | "github.com/daviddengcn/gcse/spider/github" 22 | "github.com/daviddengcn/gcse/utils" 23 | "github.com/daviddengcn/gddo/doc" 24 | "github.com/daviddengcn/go-easybi" 25 | "github.com/daviddengcn/go-villa" 26 | "github.com/daviddengcn/sophie" 27 | "github.com/daviddengcn/sophie/kv" 28 | ) 29 | 30 | var ( 31 | AppStopTime time.Time 32 | cDB *gcse.CrawlerDB 33 | ) 34 | 35 | func init() { 36 | if configs.CrawlerGithubClientID != "" { 37 | log.Printf("Github clientid: %s", configs.CrawlerGithubClientID) 38 | log.Printf("Github clientsecret: %s", configs.CrawlerGithubClientSecret) 39 | doc.SetGithubCredentials(configs.CrawlerGithubClientID, configs.CrawlerGithubClientSecret) 40 | } 41 | doc.SetUserAgent("Go-Search(http://go-search.org/)") 42 | } 43 | 44 | func syncDatabases() { 45 | utils.DumpMemStats() 46 | log.Printf("Synchronizing databases to disk...") 47 | if err := cDB.Sync(); err != nil { 48 | log.Fatalf("cdb.Sync() failed: %v", err) 49 | } 50 | utils.DumpMemStats() 51 | runtime.GC() 52 | utils.DumpMemStats() 53 | } 54 | 55 | func loadAllDocsPkgs(in kv.DirInput) error { 56 | cnt, err := in.PartCount() 57 | if err != nil { 58 | return err 59 | } 60 | for part := 0; part < cnt; part++ { 61 | c, err := in.Iterator(part) 62 | if err != nil { 63 | return err 64 | } 65 | for { 66 | var key sophie.RawString 67 | var val gcse.DocInfo 68 | if err := c.Next(&key, &val); err != nil { 69 | if errorsp.Cause(err) == io.EOF { 70 | break 71 | } 72 | return err 73 | } 74 | allDocsPkgs.Add(string(key)) 75 | // value is ignored 76 | } 77 | } 78 | return nil 79 | } 80 | 81 | type crawlerMapper struct { 82 | } 83 | 84 | // Mapper 
interface 85 | func (crawlerMapper) NewKey() sophie.Sophier { 86 | return new(sophie.RawString) 87 | } 88 | 89 | // Mapper interface 90 | func (crawlerMapper) NewVal() sophie.Sophier { 91 | return new(gcse.CrawlingEntry) 92 | } 93 | 94 | // Mapper interface 95 | func (crawlerMapper) MapEnd(c []sophie.Collector) error { 96 | return nil 97 | } 98 | 99 | func cleanTempDir() { 100 | tmpFn := villa.Path("/tmp/gddo") 101 | if err := tmpFn.RemoveAll(); err != nil { 102 | log.Printf("Delete %v failed: %v", tmpFn, err) 103 | } 104 | } 105 | 106 | func main() { 107 | ctx := context.Background() 108 | runtime.GOMAXPROCS(2) 109 | 110 | log.Printf("Using personal: %v", configs.CrawlerGithubPersonal) 111 | gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal) 112 | 113 | if db, err := bh.Open(configs.FileCacheBoltPath(), 0644, nil); err == nil { 114 | log.Print("Using file cache!") 115 | gcse.GithubSpider.FileCache = spider.BoltFileCache{ 116 | DB: db, 117 | IncCounter: bi.Inc, 118 | } 119 | } else { 120 | log.Printf("Open file cache failed: %v", err) 121 | } 122 | 123 | cleanTempDir() 124 | defer cleanTempDir() 125 | 126 | singlePackage := flag.String("pkg", "", "Crawling a single package") 127 | singleETag := flag.String("etag", "", "ETag for the single package crawling") 128 | singlePerson := flag.String("person", "", "Crawling a single person") 129 | 130 | flag.Parse() 131 | 132 | httpClient := gcse.GenHttpClient("") 133 | 134 | if *singlePerson != "" { 135 | log.Printf("Crawling single person %s ...", *singlePerson) 136 | p, err := gcse.CrawlPerson(ctx, httpClient, *singlePerson) 137 | if err != nil { 138 | fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err) 139 | } else { 140 | fmtp.Printfln("Person %s: %+v", *singlePerson, p) 141 | } 142 | } 143 | if *singlePackage != "" { 144 | log.Printf("Crawling single package %s ...", *singlePackage) 145 | p, flds, err := gcse.CrawlPackage(ctx, httpClient, *singlePackage, *singleETag) 146 | if 
err != nil { 147 | fmtp.Printfln("Crawling package %s failed: %v\nfolders: %v", *singlePackage, err, flds) 148 | } else { 149 | fmtp.Printfln("Package %s: %+v\nfolders: %v", *singlePackage, p, flds) 150 | } 151 | } 152 | if *singlePackage != "" || *singlePerson != "" { 153 | return 154 | } 155 | 156 | log.Println("crawler started...") 157 | 158 | // Load CrawlerDB 159 | cDB = gcse.LoadCrawlerDB() 160 | 161 | fpDocs := configs.DocsDBFsPath() 162 | if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil { 163 | log.Fatalf("loadAllDocsPkgs: %v", err) 164 | } 165 | log.Printf("%d docs loaded!", len(allDocsPkgs)) 166 | 167 | AppStopTime = time.Now().Add(configs.CrawlerDuePerRun) 168 | 169 | //pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl) 170 | fpCrawler := configs.CrawlerDBFsPath() 171 | fpToCrawl := configs.ToCrawlFsPath() 172 | 173 | fpNewDocs := fpCrawler.Join(configs.FnNewDocs) 174 | fpNewDocs.Remove() 175 | 176 | if err := processImports(); err != nil { 177 | log.Printf("processImports failed: %v", err) 178 | } 179 | 180 | pkgEnd := make(chan error, 1) 181 | go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd) 182 | 183 | psnEnd := make(chan error, 1) 184 | go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd) 185 | 186 | errPkg, errPsn := <-pkgEnd, <-psnEnd 187 | bi.Flush() 188 | bi.Process() 189 | syncDatabases() 190 | if errPkg != nil || errPsn != nil { 191 | log.Fatalf("Some job may failed, package: %v, person: %v", errPkg, errPsn) 192 | } 193 | log.Println("crawler stopped...") 194 | } 195 | -------------------------------------------------------------------------------- /store/store.go: -------------------------------------------------------------------------------- 1 | // Package store handlings all the storage in GCSE backend. 
2 | package store 3 | 4 | import ( 5 | "log" 6 | "time" 7 | 8 | "github.com/golangplus/bytes" 9 | "github.com/golangplus/errors" 10 | 11 | "github.com/daviddengcn/bolthelper" 12 | "github.com/daviddengcn/gcse/configs" 13 | "github.com/golang/protobuf/proto" 14 | "github.com/golang/protobuf/ptypes" 15 | 16 | gpb "github.com/daviddengcn/gcse/shared/proto" 17 | ) 18 | 19 | var ( 20 | // pkgs 21 | // - 22 | // - -> PackageInfo 23 | // persons 24 | // - 25 | // - -> PersonInfo 26 | // history 27 | // - pkgs 28 | // - -> HistoryInfo 29 | // - persons 30 | // - -> HistoryInfo 31 | // repos 32 | // - 33 | // - 34 | // - -> Repository 35 | pkgsRoot = []byte("pkgs") 36 | personsRoot = []byte("persons") 37 | historyRoot = []byte("history") 38 | reposRoot = []byte("repos") 39 | ) 40 | 41 | var box = &bh.RefCountBox{ 42 | DataPath: configs.StoreBoltPath, 43 | } 44 | 45 | func RepoInfoAge(r *gpb.RepoInfo) time.Duration { 46 | t, _ := ptypes.Timestamp(r.CrawlingTime) 47 | return time.Now().Sub(t) 48 | } 49 | 50 | // Returns all the sites one by one by calling the provided func. 
51 | func ForEachPackageSite(f func(string) error) error { 52 | return box.View(func(tx bh.Tx) error { 53 | return tx.ForEach([][]byte{pkgsRoot}, func(_ bh.Bucket, k, v bytesp.Slice) error { 54 | if v != nil { 55 | log.Printf("Unexpected value %q for key %q, ignored", string(v), string(k)) 56 | return nil 57 | } 58 | return errorsp.WithStacks(f(string(k))) 59 | }) 60 | }) 61 | } 62 | 63 | func ForEachPackageOfSite(site string, f func(string, *gpb.PackageInfo) error) error { 64 | return box.View(func(tx bh.Tx) error { 65 | return tx.ForEach([][]byte{pkgsRoot, []byte(site)}, func(_ bh.Bucket, k, v bytesp.Slice) error { 66 | if v == nil { 67 | log.Printf("Unexpected nil value for key %q, ignored", string(k)) 68 | return nil 69 | } 70 | info := &gpb.PackageInfo{} 71 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(v, info), "Unmarshal %d bytes failed", len(v)); err != nil { 72 | log.Printf("Unmarshal failed: %v, ignored", err) 73 | return nil 74 | } 75 | return errorsp.WithStacks(f(string(k), info)) 76 | }) 77 | }) 78 | } 79 | 80 | // Returns an empty (non-nil) PackageInfo if not found. 
81 | func ReadPackage(site, path string) (*gpb.PackageInfo, error) { 82 | info := &gpb.PackageInfo{} 83 | if err := box.View(func(tx bh.Tx) error { 84 | return tx.Value([][]byte{pkgsRoot, []byte(site), []byte(path)}, func(bs bytesp.Slice) error { 85 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes failed", len(bs)); err != nil { 86 | log.Printf("Unmarshal failed: %v", err) 87 | *info = gpb.PackageInfo{} 88 | } 89 | return nil 90 | }) 91 | }); err != nil { 92 | return nil, err 93 | } 94 | return info, nil 95 | } 96 | 97 | func UpdatePackage(site, path string, f func(*gpb.PackageInfo) error) error { 98 | return box.Update(func(tx bh.Tx) error { 99 | b, err := tx.CreateBucketIfNotExists([][]byte{pkgsRoot, []byte(site)}) 100 | if err != nil { 101 | return err 102 | } 103 | info := &gpb.PackageInfo{} 104 | if err := b.Value([][]byte{[]byte(path)}, func(bs bytesp.Slice) error { 105 | if err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes", len(bs)); err != nil { 106 | log.Printf("Unmarshaling failed: %v", err) 107 | *info = gpb.PackageInfo{} 108 | } 109 | return nil 110 | }); err != nil { 111 | return err 112 | } 113 | if err := errorsp.WithStacks(f(info)); err != nil { 114 | return err 115 | } 116 | bs, err := proto.Marshal(info) 117 | if err != nil { 118 | return errorsp.WithStacksAndMessage(err, "marshaling %v failed: %v", info, err) 119 | } 120 | return b.Put([][]byte{[]byte(path)}, bs) 121 | }) 122 | } 123 | 124 | func DeletePackage(site, path string) error { 125 | return box.Update(func(tx bh.Tx) error { 126 | return tx.Delete([][]byte{pkgsRoot, []byte(site), []byte(path)}) 127 | }) 128 | } 129 | 130 | func ReadPerson(site, id string) (*gpb.PersonInfo, error) { 131 | info := &gpb.PersonInfo{} 132 | if err := box.View(func(tx bh.Tx) error { 133 | return tx.Value([][]byte{personsRoot, []byte(site), []byte(id)}, func(bs bytesp.Slice) error { 134 | if err := 
errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes failed", len(bs)); err != nil { 135 | log.Printf("Unmarshal failed: %v", err) 136 | *info = gpb.PersonInfo{} 137 | } 138 | return nil 139 | }) 140 | }); err != nil { 141 | return nil, err 142 | } 143 | return info, nil 144 | } 145 | 146 | func UpdatePerson(site, id string, f func(*gpb.PersonInfo) error) error { 147 | return box.Update(func(tx bh.Tx) error { 148 | b, err := tx.CreateBucketIfNotExists([][]byte{personsRoot, []byte(site)}) 149 | if err != nil { 150 | return err 151 | } 152 | info := &gpb.PersonInfo{} 153 | if err := b.Value([][]byte{[]byte(id)}, func(bs bytesp.Slice) error { 154 | err := errorsp.WithStacksAndMessage(proto.Unmarshal(bs, info), "Unmarshal %d bytes", len(bs)) 155 | if err != nil { 156 | log.Printf("Unmarshaling failed: %v", err) 157 | *info = gpb.PersonInfo{} 158 | } 159 | return nil 160 | }); err != nil { 161 | return err 162 | } 163 | if err := errorsp.WithStacks(f(info)); err != nil { 164 | return err 165 | } 166 | bs, err := proto.Marshal(info) 167 | if err != nil { 168 | return errorsp.WithStacksAndMessage(err, "marshaling %v failed: %v", info, err) 169 | } 170 | return b.Put([][]byte{[]byte(id)}, bs) 171 | }) 172 | } 173 | 174 | func DeletePerson(site, id string) error { 175 | return box.Update(func(tx bh.Tx) error { 176 | return tx.Delete([][]byte{personsRoot, []byte(site), []byte(id)}) 177 | }) 178 | } 179 | -------------------------------------------------------------------------------- /data.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "encoding/gob" 5 | "regexp" 6 | "strings" 7 | "time" 8 | "unicode" 9 | 10 | "github.com/golangplus/bytes" 11 | "github.com/golangplus/errors" 12 | "github.com/golangplus/strings" 13 | 14 | "github.com/agonopol/go-stem" 15 | "github.com/daviddengcn/go-index" 16 | "github.com/daviddengcn/sophie" 17 | ) 18 | 19 | // DocInfo is the information stored 
// isTermSep reports whether r separates terms: any Unicode punctuation or
// symbol rune, plus U+FEFF.
func isTermSep(r rune) bool {
	switch {
	case r == 0xfeff:
		return true
	case unicode.IsPunct(r):
		return true
	default:
		return unicode.IsSymbol(r)
	}
}
// isCamel reports whether token consists solely of letters and contains at
// least one upper-case letter and at least one letter that is not upper-case.
//
// Every rune is validated. An earlier version stopped scanning as soon as an
// upper-case letter followed a lower-case one, so a token such as "aB1" was
// accepted while "Ab1" was rejected.
func isCamel(token string) bool {
	hasUpper, hasLower := false, false
	for _, r := range token {
		if !unicode.IsLetter(r) {
			return false
		}
		if unicode.IsUpper(r) {
			hasUpper = true
		} else {
			hasLower = true
		}
	}
	return hasUpper && hasLower
}
// CalcPackagePartition maps a package path to a partition index in
// [0, totalParts) using a multiply-by-33 rolling hash over the bytes of pkg.
//
// A non-positive totalParts has no valid partition; 0 is returned in that
// case instead of dividing by zero.
func CalcPackagePartition(pkg string, totalParts int) int {
	if totalParts <= 0 {
		return 0
	}
	hash := 0
	for i := 0; i < len(pkg); i++ {
		hash = hash*33 + int(pkg[i])
		// Reduce with >=, not >: a hash equal to totalParts must wrap to 0,
		// otherwise the function can return totalParts itself, which is one
		// past the last valid partition.
		if hash >= totalParts {
			hash %= totalParts
		}
	}
	return hash
}
utf8.RuneLen(r) 25 | for _, r := range s[i:] { 26 | if !f(r) { 27 | buf.WriteRune(r) 28 | } 29 | } 30 | return string(buf) 31 | } 32 | } 33 | return s 34 | } 35 | 36 | type SearchApiHit struct { 37 | Name string `json:"name"` 38 | Package string `json:"package"` 39 | Author string `json:"author"` 40 | Synopsis string `json:"synopsis"` 41 | Description string `json:"description"` 42 | ProjectURL string `json:"projecturl"` 43 | } 44 | 45 | type SearchApiStruct struct { 46 | Q string `json:"query"` 47 | Hits []*SearchApiHit `json:"hits"` 48 | } 49 | 50 | const MAX_API_SEARCH_HITS = 100 51 | 52 | func SearchResultToApi(q string, res *SearchResult) *SearchApiStruct { 53 | apiRes := SearchApiStruct{ 54 | Q: q, 55 | } 56 | for i, hit := range res.Hits { 57 | if i >= MAX_API_SEARCH_HITS { 58 | break 59 | } 60 | apiHit := &SearchApiHit{ 61 | Name: hit.Name, 62 | Package: hit.Package, 63 | Author: hit.Author, 64 | Synopsis: hit.Synopsis, 65 | Description: hit.Description, 66 | ProjectURL: hit.ProjectURL, 67 | } 68 | apiRes.Hits = append(apiRes.Hits, apiHit) 69 | } 70 | return &apiRes 71 | } 72 | 73 | func apiContent(w http.ResponseWriter, code int, obj interface{}, callback string) error { 74 | if callback == "" { 75 | w.Header().Set("Content-Type", "application/json; charset=utf-8") 76 | w.WriteHeader(code) 77 | _, err := w.Write(jsonp.MarshalIgnoreError(obj)) 78 | return err 79 | } 80 | w.Header().Set("Content-Type", "application/javascript; charset=utf-8") 81 | /* 82 | (, ); 83 | */ 84 | if _, err := w.Write([]byte(fmt.Sprintf("%s(%d, ", callback, code))); err != nil { 85 | return err 86 | } 87 | if _, err := w.Write(jsonp.MarshalIgnoreError(obj)); err != nil { 88 | return err 89 | } 90 | if _, err := w.Write([]byte(");")); err != nil { 91 | return err 92 | } 93 | return nil 94 | } 95 | 96 | type PackageDependenceInfo struct { 97 | Name string 98 | Package string 99 | Imports []string 100 | TestImports []string 101 | Imported []string 102 | TestImported []string 103 | } 
104 | 105 | func pageApi(w http.ResponseWriter, r *http.Request) { 106 | tr := trace.New("pageApi", r.URL.Path) 107 | defer tr.Finish() 108 | 109 | action := strings.ToLower(r.FormValue("action")) 110 | callback := strings.TrimSpace(r.FormValue("callback")) 111 | callback = filterFunc(callback, func(r rune) bool { 112 | if r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z' || r >= '0' && r <= '9' { 113 | return false 114 | } 115 | if r == '_' || r == '$' { 116 | return false 117 | } 118 | return true 119 | }) 120 | switch action { 121 | case "package": 122 | bi.Inc("api.package") 123 | id := r.FormValue("id") 124 | 125 | db := getDatabase() 126 | doc, found := db.FindFullPackage(id) 127 | if !found { 128 | apiContent(w, http.StatusNotFound, fmt.Sprintf("Package %s not found!", id), callback) 129 | return 130 | } 131 | apiContent(w, http.StatusOK, struct { 132 | Package string 133 | Name string 134 | StarCount int 135 | Synopsis string 136 | Description string 137 | Imported []string 138 | TestImported []string 139 | Imports []string 140 | TestImports []string 141 | ProjectURL string 142 | StaticRank int 143 | }{ 144 | doc.Package, 145 | doc.Name, 146 | doc.StarCount, 147 | doc.Synopsis, 148 | doc.Description, 149 | doc.Imported, 150 | doc.TestImported, 151 | doc.Imports, 152 | doc.TestImports, 153 | doc.ProjectURL, 154 | doc.StaticRank + 1, 155 | }, callback) 156 | 157 | case "tops": 158 | bi.Inc("api.tops") 159 | N, _ := strconv.Atoi(r.FormValue("len")) 160 | if N < 20 { 161 | N = 20 162 | } else if N > 100 { 163 | N = 100 164 | } 165 | apiContent(w, http.StatusOK, statTops(N), callback) 166 | 167 | case "packages": 168 | bi.Inc("api.packages") 169 | db := getDatabase() 170 | var pkgs []string 171 | if db != nil { 172 | pkgs = make([]string, 0, db.PackageCount()) 173 | db.Search(nil, func(docID int32, data interface{}) error { 174 | doc := data.(gcse.HitInfo) 175 | pkgs = append(pkgs, doc.Package) 176 | 177 | return nil 178 | }) 179 | } 180 | apiContent(w, 
http.StatusOK, pkgs, callback) 181 | 182 | case "package_depends": 183 | bi.Inc("api.package_depends") 184 | db := getDatabase() 185 | var pkgs []PackageDependenceInfo 186 | if db != nil { 187 | pkgs = make([]PackageDependenceInfo, 0, db.PackageCount()) 188 | if err := db.ForEachFullPackage(func(doc gcse.HitInfo) error { 189 | pkgs = append(pkgs, PackageDependenceInfo{ 190 | Name: doc.Name, 191 | Package: doc.Package, 192 | Imports: doc.Imports, 193 | TestImports: doc.TestImports, 194 | Imported: doc.Imported, 195 | TestImported: doc.TestImported, 196 | }) 197 | return nil 198 | }); err != nil { 199 | log.Printf("ForEachFullPackage failed: %v", err) 200 | } 201 | } 202 | apiContent(w, http.StatusOK, pkgs, callback) 203 | 204 | case "search": 205 | bi.Inc("api.search") 206 | q := strings.TrimSpace(r.FormValue("q")) 207 | results, _, err := search(tr, getDatabase(), q) 208 | if err != nil { 209 | apiContent(w, http.StatusInternalServerError, err.Error(), callback) 210 | return 211 | } 212 | apiContent(w, http.StatusOK, SearchResultToApi(q, results), callback) 213 | 214 | default: 215 | bi.Inc("api.unknown") 216 | apiContent(w, http.StatusBadRequest, fmt.Sprintf("Unknown action: %s", action), callback) 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /shared/proto/stored.pb.go: -------------------------------------------------------------------------------- 1 | // Code generated by protoc-gen-go. DO NOT EDIT. 2 | // source: github.com/daviddengcn/gcse/shared/proto/stored.proto 3 | 4 | package gcsepb 5 | 6 | import proto "github.com/golang/protobuf/proto" 7 | import fmt "fmt" 8 | import math "math" 9 | 10 | import ( 11 | context "golang.org/x/net/context" 12 | grpc "google.golang.org/grpc" 13 | ) 14 | 15 | // Reference imports to suppress errors if they are not otherwise used. 
16 | var _ = proto.Marshal 17 | var _ = fmt.Errorf 18 | var _ = math.Inf 19 | 20 | type PackageCrawlHistoryReq struct { 21 | Package string `protobuf:"bytes,1,opt,name=package" json:"package,omitempty"` 22 | } 23 | 24 | func (m *PackageCrawlHistoryReq) Reset() { *m = PackageCrawlHistoryReq{} } 25 | func (m *PackageCrawlHistoryReq) String() string { return proto.CompactTextString(m) } 26 | func (*PackageCrawlHistoryReq) ProtoMessage() {} 27 | func (*PackageCrawlHistoryReq) Descriptor() ([]byte, []int) { return fileDescriptor2, []int{0} } 28 | 29 | func (m *PackageCrawlHistoryReq) GetPackage() string { 30 | if m != nil { 31 | return m.Package 32 | } 33 | return "" 34 | } 35 | 36 | type PackageCrawlHistoryResp struct { 37 | Info *HistoryInfo `protobuf:"bytes,1,opt,name=info" json:"info,omitempty"` 38 | } 39 | 40 | func (m *PackageCrawlHistoryResp) Reset() { *m = PackageCrawlHistoryResp{} } 41 | func (m *PackageCrawlHistoryResp) String() string { return proto.CompactTextString(m) } 42 | func (*PackageCrawlHistoryResp) ProtoMessage() {} 43 | func (*PackageCrawlHistoryResp) Descriptor() ([]byte, []int) { return fileDescriptor2, []int{1} } 44 | 45 | func (m *PackageCrawlHistoryResp) GetInfo() *HistoryInfo { 46 | if m != nil { 47 | return m.Info 48 | } 49 | return nil 50 | } 51 | 52 | func init() { 53 | proto.RegisterType((*PackageCrawlHistoryReq)(nil), "gcse.PackageCrawlHistoryReq") 54 | proto.RegisterType((*PackageCrawlHistoryResp)(nil), "gcse.PackageCrawlHistoryResp") 55 | } 56 | 57 | // Reference imports to suppress errors if they are not otherwise used. 58 | var _ context.Context 59 | var _ grpc.ClientConn 60 | 61 | // This is a compile-time assertion to ensure that this generated file 62 | // is compatible with the grpc package it is being compiled against. 
63 | const _ = grpc.SupportPackageIsVersion4 64 | 65 | // Client API for StoreService service 66 | 67 | type StoreServiceClient interface { 68 | PackageCrawlHistory(ctx context.Context, in *PackageCrawlHistoryReq, opts ...grpc.CallOption) (*PackageCrawlHistoryResp, error) 69 | } 70 | 71 | type storeServiceClient struct { 72 | cc *grpc.ClientConn 73 | } 74 | 75 | func NewStoreServiceClient(cc *grpc.ClientConn) StoreServiceClient { 76 | return &storeServiceClient{cc} 77 | } 78 | 79 | func (c *storeServiceClient) PackageCrawlHistory(ctx context.Context, in *PackageCrawlHistoryReq, opts ...grpc.CallOption) (*PackageCrawlHistoryResp, error) { 80 | out := new(PackageCrawlHistoryResp) 81 | err := grpc.Invoke(ctx, "/gcse.StoreService/PackageCrawlHistory", in, out, c.cc, opts...) 82 | if err != nil { 83 | return nil, err 84 | } 85 | return out, nil 86 | } 87 | 88 | // Server API for StoreService service 89 | 90 | type StoreServiceServer interface { 91 | PackageCrawlHistory(context.Context, *PackageCrawlHistoryReq) (*PackageCrawlHistoryResp, error) 92 | } 93 | 94 | func RegisterStoreServiceServer(s *grpc.Server, srv StoreServiceServer) { 95 | s.RegisterService(&_StoreService_serviceDesc, srv) 96 | } 97 | 98 | func _StoreService_PackageCrawlHistory_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 99 | in := new(PackageCrawlHistoryReq) 100 | if err := dec(in); err != nil { 101 | return nil, err 102 | } 103 | if interceptor == nil { 104 | return srv.(StoreServiceServer).PackageCrawlHistory(ctx, in) 105 | } 106 | info := &grpc.UnaryServerInfo{ 107 | Server: srv, 108 | FullMethod: "/gcse.StoreService/PackageCrawlHistory", 109 | } 110 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 111 | return srv.(StoreServiceServer).PackageCrawlHistory(ctx, req.(*PackageCrawlHistoryReq)) 112 | } 113 | return interceptor(ctx, in, info, handler) 114 | } 115 | 116 | var 
_StoreService_serviceDesc = grpc.ServiceDesc{ 117 | ServiceName: "gcse.StoreService", 118 | HandlerType: (*StoreServiceServer)(nil), 119 | Methods: []grpc.MethodDesc{ 120 | { 121 | MethodName: "PackageCrawlHistory", 122 | Handler: _StoreService_PackageCrawlHistory_Handler, 123 | }, 124 | }, 125 | Streams: []grpc.StreamDesc{}, 126 | Metadata: "github.com/daviddengcn/gcse/shared/proto/stored.proto", 127 | } 128 | 129 | func init() { 130 | proto.RegisterFile("github.com/daviddengcn/gcse/shared/proto/stored.proto", fileDescriptor2) 131 | } 132 | 133 | var fileDescriptor2 = []byte{ 134 | // 210 bytes of a gzipped FileDescriptorProto 135 | 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xe2, 0x32, 0x4d, 0xcf, 0x2c, 0xc9, 136 | 0x28, 0x4d, 0xd2, 0x4b, 0xce, 0xcf, 0xd5, 0x4f, 0x49, 0x2c, 0xcb, 0x4c, 0x49, 0x49, 0xcd, 0x4b, 137 | 0x4f, 0xce, 0xd3, 0x4f, 0x4f, 0x2e, 0x4e, 0xd5, 0x2f, 0xce, 0x48, 0x2c, 0x4a, 0x4d, 0xd1, 0x2f, 138 | 0x28, 0xca, 0x2f, 0xc9, 0xd7, 0x2f, 0x2e, 0xc9, 0x2f, 0x4a, 0x4d, 0xd1, 0x03, 0x73, 0x84, 0x58, 139 | 0x40, 0xf2, 0x52, 0x24, 0x68, 0x2e, 0xc8, 0x4c, 0x49, 0x2d, 0x82, 0x68, 0x56, 0x32, 0xe2, 0x12, 140 | 0x0b, 0x48, 0x4c, 0xce, 0x4e, 0x4c, 0x4f, 0x75, 0x2e, 0x4a, 0x2c, 0xcf, 0xf1, 0xc8, 0x04, 0x19, 141 | 0x5d, 0x19, 0x94, 0x5a, 0x28, 0x24, 0xc1, 0xc5, 0x5e, 0x00, 0x91, 0x91, 0x60, 0x54, 0x60, 0xd4, 142 | 0xe0, 0x0c, 0x82, 0x71, 0x95, 0x1c, 0xb8, 0xc4, 0xb1, 0xea, 0x29, 0x2e, 0x10, 0x52, 0xe5, 0x62, 143 | 0xc9, 0xcc, 0x4b, 0xcb, 0x07, 0xeb, 0xe0, 0x36, 0x12, 0xd4, 0x03, 0xd9, 0xae, 0x07, 0x55, 0xe0, 144 | 0x99, 0x97, 0x96, 0x1f, 0x04, 0x96, 0x36, 0x4a, 0xe2, 0xe2, 0x09, 0x06, 0x79, 0x21, 0x38, 0xb5, 145 | 0xa8, 0x2c, 0x33, 0x39, 0x55, 0x28, 0x88, 0x4b, 0x18, 0x8b, 0x89, 0x42, 0x32, 0x10, 0xfd, 0xd8, 146 | 0x1d, 0x28, 0x25, 0x8b, 0x47, 0xb6, 0xb8, 0xc0, 0x89, 0x23, 0x8a, 0x0d, 0x24, 0x5f, 0x90, 0x94, 147 | 0xc4, 0x06, 0xf6, 0xaa, 0x31, 0x20, 0x00, 0x00, 0xff, 0xff, 0x4d, 0x64, 0x41, 0x27, 0x60, 0x01, 148 | 0x00, 0x00, 149 | } 150 | 
-------------------------------------------------------------------------------- /index_test.go: -------------------------------------------------------------------------------- 1 | package gcse 2 | 3 | import ( 4 | "io" 5 | "path" 6 | "testing" 7 | 8 | "github.com/golangplus/strings" 9 | "github.com/golangplus/testing/assert" 10 | 11 | "github.com/daviddengcn/go-index" 12 | "github.com/daviddengcn/sophie" 13 | "github.com/daviddengcn/sophie/mr" 14 | ) 15 | 16 | func TestIndex(t *testing.T) { 17 | const ( 18 | package0 = "github.com/daviddengcn/gcse" 19 | package1 = "github.com/daviddengcn/gcse/indexer" 20 | package2 = "github.com/daviddengcn/go-villa" 21 | ) 22 | 23 | docs := []DocInfo{ 24 | { 25 | Package: package0, 26 | Name: "gcse", 27 | TestImports: []string{ 28 | package2, package0, 29 | }, 30 | }, { 31 | Package: package1, 32 | Name: "main", 33 | Imports: []string{ 34 | package0, 35 | package2, 36 | package1, 37 | }, 38 | }, { 39 | Package: package2, 40 | Name: "villa", 41 | }, 42 | } 43 | ts, err := Index(&mr.InputStruct{ 44 | PartCountF: func() (int, error) { 45 | return 1, nil 46 | }, 47 | IteratorF: func(int) (sophie.IterateCloser, error) { 48 | index := 0 49 | return &sophie.IterateCloserStruct{ 50 | NextF: func(key, val sophie.SophieReader) error { 51 | if index >= len(docs) { 52 | return io.EOF 53 | } 54 | *key.(*sophie.RawString) = sophie.RawString( 55 | docs[index].Package) 56 | *val.(*DocInfo) = docs[index] 57 | val.(*DocInfo).Imports = append([]string{}, docs[index].Imports...) 58 | val.(*DocInfo).TestImports = append([]string{}, docs[index].TestImports...) 
59 | 60 | index++ 61 | return nil 62 | }, 63 | }, nil 64 | }, 65 | }, "./tmp") 66 | assert.NoErrorOrDie(t, err) 67 | 68 | hitsArr, err := index.OpenConstArray(path.Join("./tmp", HitsArrFn)) 69 | for _, doc := range docs { 70 | idx := -1 71 | ts.Search(index.SingleFieldQuery(IndexPkgField, doc.Package), func(docID int32, data interface{}) error { 72 | idx = int(docID) 73 | return nil 74 | }) 75 | d, err := hitsArr.GetGob(idx) 76 | assert.NoError(t, err) 77 | assert.Equal(t, "d.Package", d.(HitInfo).Package, doc.Package) 78 | } 79 | numDocs := ts.DocCount() 80 | assert.Equal(t, "DocCount", numDocs, 3) 81 | 82 | var pkgs []string 83 | if err := ts.Search(map[string]stringsp.Set{IndexTextField: nil}, 84 | func(docID int32, data interface{}) error { 85 | hit := data.(HitInfo) 86 | pkgs = append(pkgs, hit.Package) 87 | return nil 88 | }, 89 | ); err != nil { 90 | t.Error(err) 91 | return 92 | } 93 | assert.StringEqual(t, "all", pkgs, 94 | []string{ 95 | "github.com/daviddengcn/gcse", 96 | "github.com/daviddengcn/go-villa", 97 | "github.com/daviddengcn/gcse/indexer", 98 | }) 99 | 100 | var gcseInfo HitInfo 101 | if err := ts.Search(map[string]stringsp.Set{ 102 | IndexPkgField: stringsp.NewSet("github.com/daviddengcn/gcse"), 103 | }, func(docID int32, data interface{}) error { 104 | gcseInfo = data.(HitInfo) 105 | return nil 106 | }); err != nil { 107 | t.Errorf("ts.Search: %v", err) 108 | return 109 | } 110 | assert.Equal(t, "gcseInfo.Imported", gcseInfo.Imported, []string(nil)) 111 | assert.Equal(t, "gcseInfo.ImportedLen", gcseInfo.ImportedLen, 1) 112 | assert.Equal(t, "gcseInfo.TestImports", gcseInfo.TestImports, []string{"github.com/daviddengcn/go-villa"}) 113 | 114 | var indexerInfo HitInfo 115 | if err := ts.Search(map[string]stringsp.Set{ 116 | IndexPkgField: stringsp.NewSet("github.com/daviddengcn/gcse/indexer"), 117 | }, func(docID int32, data interface{}) error { 118 | gcseInfo = data.(HitInfo) 119 | return nil 120 | }); err != nil { 121 | t.Errorf("ts.Search: 
%v", err) 122 | return 123 | } 124 | assert.StringEqual(t, "indexerInfo.Imported", 125 | indexerInfo.Imported, []string{}) 126 | assert.StringEqual(t, "indexerInfo.Imports", 127 | indexerInfo.Imports, []string{}) 128 | 129 | if err := ts.Search(map[string]stringsp.Set{ 130 | IndexPkgField: stringsp.NewSet("github.com/daviddengcn/go-villa"), 131 | }, func(docID int32, data interface{}) error { 132 | gcseInfo = data.(HitInfo) 133 | return nil 134 | }); err != nil { 135 | t.Errorf("ts.Search: %v", err) 136 | return 137 | } 138 | assert.Equal(t, "indexerInfo.Imported", indexerInfo.Imported, []string(nil)) 139 | assert.Equal(t, "gcseInfo.TestImportedLen", gcseInfo.TestImportedLen, 1) 140 | assert.Equal(t, "gcseInfo.TestImported", gcseInfo.TestImported, []string(nil)) 141 | } 142 | 143 | func TestAppendTokens_filter(t *testing.T) { 144 | SRC_DST := []interface{}{ 145 | "My address is http://go-search.org", []string{"my", "address", "is"}, 146 | "Hello david_deng-cn.123@gmail-yahoo.com", []string{"hello"}, 147 | } 148 | 149 | for i := 0; i < len(SRC_DST); i += 2 { 150 | SRC := SRC_DST[i].(string) 151 | DST := stringsp.NewSet(SRC_DST[i+1].([]string)...) 
152 | 153 | assert.Equal(t, "Tokens of "+SRC, AppendTokens(nil, []byte(SRC)), DST) 154 | } 155 | } 156 | 157 | func search(ts *index.TokenSetSearcher, field string, text string) ([]HitInfo, error) { 158 | var hits []HitInfo 159 | err := ts.Search(map[string]stringsp.Set{field: AppendTokens(nil, []byte(text))}, func(_ int32, data interface{}) error { 160 | hits = append(hits, data.(HitInfo)) 161 | return nil 162 | }) 163 | return hits, err 164 | } 165 | 166 | func TestIndex_DescNotIndexedBug(t *testing.T) { 167 | const ( 168 | description = "description" 169 | readme = "readme" 170 | ) 171 | hits := []HitInfo{{ 172 | DocInfo: DocInfo{ 173 | Package: "github.com/daviddengcn/gcse", 174 | Name: "gcse", 175 | Description: description, 176 | ReadmeData: readme, 177 | }, 178 | }} 179 | idxs := []int{0} 180 | fullHitSaved := 0 181 | ts := &index.TokenSetSearcher{} 182 | assert.NoError(t, indexAndSaveHits(ts, hits, idxs, func(hit *HitInfo) error { 183 | fullHitSaved++ 184 | assert.Equal(t, "Description", hit.Description, description) 185 | assert.Equal(t, "Readme", hit.ReadmeData, readme) 186 | return nil 187 | })) 188 | assert.Equal(t, "fullHitSaved", fullHitSaved, 1) 189 | results, err := search(ts, IndexTextField, description) 190 | assert.NoError(t, err) 191 | assert.Equal(t, "results", results, hits) 192 | 193 | results, err = search(ts, IndexTextField, readme) 194 | assert.NoError(t, err) 195 | assert.Equal(t, "results", results, hits) 196 | } 197 | -------------------------------------------------------------------------------- /service/web/web/about.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice "About GCSE" "about"}} 2 |
3 | 16 | {{define "aboutbody"}} 17 | ### Motivation 18 | 19 | When I began to learn [Go](http://golang.org/) (or golang), I loved it. The integrated tooling for 20 | importing directly from version control systems is exceptionally useful. It allows 21 | projects to be connected in an elegant manner. 22 | 23 | With so many brilliant people contributing to the Go ecosystem, there is a lot of 24 | "reinventing the wheel" happening. Due to the way Go does imports, a central 25 | clearing house like Maven or NPM is simply not needed. However, what is needed 26 | is a way to find the packages you need to import, and that is why GS, the 27 | _Go Search_, was created. 28 | 29 | A search engine specifically designed for Go can help in the following ways: 30 | 31 | * By analyzing the importing relations between projects, popular projects can be 32 | found. This gives a better ranking than a general search engine. 33 | * Structured data for each package can be extracted: package path, name, 34 | synopsis, documentation, etc. Matching can be done in a better way. 35 | * Package comments can be parsed and indexed. 36 | * Stars (or watchers) of some sites are crawled to further help ranking. 37 | 38 | ### Project 39 | 40 | This is an [open source project](https://github.com/daviddengcn/gcse) hosted 41 | on [GitHub](http://github.com/). 42 | 43 | GS mainly aims to provide search rather than to host the documents. So, the links to the [GoDoc](http://godoc.org/) 44 | of that package and to the original project are provided on the view page in a very convenient place. 45 | 46 | If you find a bug, have a new idea or any other suggestion, please submit 47 | an [issue](https://github.com/daviddengcn/gcse/issues). 48 | 49 | While creating GS, several supporting projects were created and are published. 50 | Feel free to use them! 51 | 52 | * [go-index](https://github.com/daviddengcn/go-index): a text indexing package, 53 | serving the index/search function in GS. 
54 | * [go-rpc](https://github.com/daviddengcn/go-rpc): an RPC service over the HTTP 55 | protocol. 56 | 57 | ### Acknowledgement 58 | 59 | GS [imports](/view?id={{urlquery "github.com/daviddengcn/gcse"}}#imports) (or forks) the 60 | following projects: 61 | 62 | * [gddo](https://github.com/garyburd/gddo) by 63 | [Gary Burd](http://gary.beagledreams.com/) for crawling. It's forked (for 64 | modification) to my [gddo](https://github.com/daviddengcn/gddo) project. 65 | * [fsnotify](https://github.com/howeyc/fsnotify) by 66 | [Chris Howey](http://chris.howey.me) for monitoring folders. 67 | * [go-stem](https://github.com/agonopol/go-stem) by 68 | [Alex Gonopolskiy](https://github.com/agonopol) for word stemming. 69 | * [Bootstrap](http://getbootstrap.com/) for its free UI framework. 70 | 71 | Special thanks to the following contributors: 72 | 73 | * [Robert Melton](https://github.com/robertmeta) for text refinement and code refactoring. 74 | * [mipearson](https://github.com/mipearson) for README enhancements. 75 | * [Michael Nagel](https://github.com/mnagel) for some good suggestions on the UI. 76 | 77 | ### Contact us 78 | 79 | {{end}} 80 | 81 | {{markdown "aboutbody"}} 82 |
83 | Follow the Facebook page for GS:
84 |
85 | 86 |
87 | Share it with your friends who also need it: 88 |
89 |
90 |
91 | 92 |
93 |
94 | 113 | 114 |
115 |
116 | 117 | {{template "searchbox.html" .UIUtils.Slice "" false}} 118 | 119 | 126 | 127 | 138 | {{template "footer.html"}} 139 | -------------------------------------------------------------------------------- /service/web/web/view.html: -------------------------------------------------------------------------------- 1 | {{template "header.html" .UIUtils.Slice (printf "%s - Package" .Name) "view"}} 2 |
3 | 16 |
17 | 22 | 23 | 44 | 45 |
46 | 47 | 50 | 51 | 52 |
53 | 54 | {{if .Description}} 55 |
56 |
57 | {{.DescHTML}} 58 |
59 |
60 | {{end}}{{if .ShowReadme}}
({{.ReadmeFn}})
 61 | {{.ReadmeData}}
 62 | 
{{end}} 63 | 64 | {{if len .Imported}} 65 |

Imported by {{len .Imported}} package(s)

66 |
    67 | {{range .Imported}} 68 |
  1. {{.}}
  2. 69 | {{end}} 70 |
71 | {{end}} 72 | {{if len .TestImported}} 73 |

Imported only in test by {{len .TestImported}} package(s)

74 |
    75 | {{range .TestImported}} 76 |
  1. {{.}}
  2. 77 | {{end}} 78 |
79 | {{end}} 80 | {{if len .Imports}} 81 |

Imports {{len .Imports}} package(s)

82 |
    83 | {{range .Imports}} 84 |
  1. {{.}}
  2. 85 | {{end}} 86 |
87 | {{end}} 88 | {{if len .TestImports}} 89 |

Test imports {{len .TestImports}} package(s)

90 |
    91 | {{range .TestImports}} 92 |
  1. {{.}}
  2. 93 | {{end}} 94 |
95 | {{end}} 96 |
97 | 98 |
99 | 118 | 119 |
120 |
121 | 122 |
123 | {{template "searchbox.html" .UIUtils.Slice "" false}} 124 | 125 | 126 | 129 | 130 | 137 | 138 | 153 | {{template "footer.html"}} 154 |
-------------------------------------------------------------------------------- /service/web/css/gc.css: --------------------------------------------------------------------------------
/* Page-wide layout. */

/* NOTE(review): the 70px top padding presumably clears a fixed top bar;
   confirm against header.html. */
body {
    padding-top: 70px;
}

div.info {
    margin-top: 10px;
    margin-bottom: 10px;
}

div.schblock {
    margin-top: 10px;
}

/* "Top" lists: fixed-width boxes floated next to each other. */

div.toplist {
    float: left;
    width: 280px;
    margin-right: 5px;
    margin-bottom: 5px;
}

/* This selector used to be declared twice in a row, the first time only with
   "width: 175px". The second rule overrode that width, so the two rules are
   merged here and the effective width stays 160px. */
div.toplist div.pkg {
    width: 160px;
    height: 20px;
    overflow: hidden;
    text-overflow: ellipsis;
    vertical-align: bottom;
}

div.toplist div.tableinfo {
    float: right;
    height: 18px;
    text-align: right;
    overflow: hidden;
    text-overflow: ellipsis;
    font-size: 90%;
}

div.toplist div.topnum {
    margin-top: 2px;
    margin-left: -18px;
    margin-right: 5px;
    float: left;
    width: 30px;
    text-align: right;
    font-size: 90%;
}

div.toplist a:visited {
    color: #12c;
}

/* Search result list. */

ol.schres div.title {
    margin-bottom: 5px;
}

ol.schres div.summary {
    line-height: 1.3em;
}

ol.schres div.info {
    margin-top: 5px;
    margin-bottom: 10px;
}

/* "Back to top" button pinned to the bottom-right corner of the viewport. */
div#totopbtn {
    position: fixed;
    bottom: 5px;
    right: 20px;
}

a#top {
    display: block;
    position: absolute;
    top: 0px;
}

textarea#pkg {
    height: 500px;
}

img.logo {
    vertical-align: bottom;
}

input.query-box {
    width: 500px;
}

div.content {
    text-align: left;
}

ol.schres {
    line-height: 1.2em;
}

ol.schres li {
    margin-bottom: 10px;
}

ol.schres div.title a {
    font-size: medium;
}

/* The negative left margin moves the result number out to the left of the
   list item's content. */
.schres div.num {
    display: inline-block;
    width: 2em;
    margin-left: -2.5em;
    margin-right: 0.5em;
    text-align: right;
}

.schres li {
    list-style-type: none;
}

.schres .title a, .schres .title a:link, .schres .title a:visited {
    color: #12c;
}

.schres .info, .schres .info a {
    color: #093;
}

.schres .info a {
    text-decoration: none;
}

/* Pagination. */

div.pages {
    margin-bottom: 10px;
}

.pages span.prevpage {
    display: inline-block;
    width: 1em;
    text-align: center;
}

.pages span.prevpage a {
    text-decoration: none;
}

.pages .page {
    padding-left: 0.2em;
    padding-right: 0.2em;
}

.pages a.page {
    border: 1px solid gray;
    border-radius: 5px;
    text-decoration: none;
}

.pages a.page:hover {
    background: gray;
    color: white;
}

/* The family name is quoted and given generic fallbacks so the text stays
   monospaced when Courier New is not installed. */
.code {
    font-family: "Courier New", Courier, monospace;
}

/* Header, header search box and footer. */

header {
    border-bottom: 1px solid gray;
    margin-top: 5px;
    margin-bottom: 10px;
    height: 25px;
    padding-left: 2px;
}

header a, footer a {
    text-decoration: none;
}

div.hdsch {
    float: right;
    margin-top: -2px;
}

/* The header search box is narrow and half-transparent until focused. */
div.hdsch input[type="search"] {
    width: 80px;
    transition: all 0.1s ease-in;
    opacity: 0.5;
}

/* A "width: auto" declaration used to precede "width: 300px" here; it was
   always overridden by the later declaration and has been removed. */
div.hdsch input[type="search"]:focus {
    background: white;
    width: 300px;
    opacity: 1.0;
}

div.hdsch input[type="search"]~button {
    opacity: 0.5;
}

div.hdsch input[type="search"]:focus~button {
    opacity: 1.0;
}

div.rightalign {
    text-align: right;
}

/* Heading anchors are fully transparent until the heading is hovered or the
   anchor itself is focused. */
h4 a.anchor {
    text-decoration: none;
    color: rgba(0, 0, 0, 0);
}

h4:hover a.anchor, h4 a.anchor:focus {
    color: inherit;
}

footer {
    margin-top: 15px;
    border-top: 1px solid gray;
    padding-bottom: 20px;
    text-align: center;
    font-size: 90%;
    line-height: 150%;
    margin-bottom: 10px;
}

footer div.block {
    display: inline-block;
    height: 50px;
    text-align: left;
    vertical-align: top;
    margin-top: 10px;
    padding: 0px 10px;
}

footer div.rightline {
    border-right: 1px solid gray;
}

/* Tables. */

tfoot tr td {
    border-top: 1px solid black;
    border-bottom: 3px double black;
}

td.numcell {
    font-family: "Courier New", Courier, monospace;
    text-align: right;
}

div.half {
    width: 49%;
    height: auto;
    display: inline-block;
}

/* Package description and readme. */

div.desc pre {
    background: #e0e0e0;
    margin: 0 0 10px;
    padding: 5px;
    font-size: 13px;
    border-radius: 5px;

    font-family: monospace;
}

pre.readme {
    word-break: break-word;
    font-size: 13px;
}

div.toplist div.listname {
    border-bottom: 1px solid gray;
}

div.toplist ol {
    margin: 0px;
    padding: 0px;
    padding-left: 25px;
}

div.toplist li.line {
    height: 25px;
    vertical-align: bottom;
}

.clearboth {
    clear: both;
}

div.view-bottom {
    color: gray;
    margin-top: 0px;
    margin-bottom: 10px;
    font-size: 75%;
}

div.view-bottom a:link {
    color: gray;
}

div.bottom-search {
    margin-top: 5px;
}

h4 {
    margin: 1em 0px 0px 0px;
}

div.markdown ul {
    padding-left: 20px;
}

div.markdown thead td {
    font-weight: bold;
}

div.markdown td {
    vertical-align: top;
    padding: 0px 10px 0px 0px;
}

.import-box {
    margin-bottom: 1.5rem;
}
.import-box input {
    font-family: monospace;
}

/* Overrides for small (phone-sized) screens. */
@media only screen and (max-device-width: 480px) {
    body {
        margin: 0px auto;
        width: 100%;
    }

    div.main {
        padding: 0px 0px 0px 2px;
        overflow: hidden;
    }
    input.query-box {
        width: auto;
    }

    div.hdsch {
        padding-right: 2px;
    }

    div.hdsch input[type="search"]:focus {
        background: white;
        width: 150px;
    }

    /* NOTE(review): "__h1" matches no standard HTML element, so the next two
       rules look like h1 rules that were disabled by renaming the selector.
       Confirm, then either restore or delete them. */
    __h1 {
        font-size: 1em;
        padding-left: 2px;
    }

    __h1 img {
        height: 16px;
    }

    div.taframe {
        padding: 0px;
    }

    textarea#pkg {
        width: 95%;
        height: 200px;
    }

    div#totopbtn {
        display: none;
    }
}
--------------------------------------------------------------------------------