├── spider
├── github
│ ├── testdata
│ │ ├── sub
│ │ │ └── README
│ │ ├── README
│ │ ├── pkg_test.go
│ │ └── pkg.go
│ ├── utils.go
│ └── github_test.go
├── godocorg
│ ├── packages_test.go
│ └── packages.go
├── ranking_test.go
├── filecache_test.go
├── filecache.go
└── ranking.go
├── pipelines
├── spider
│ ├── spider
│ └── spider.go
├── indexer
│ ├── imain.go
│ └── index.go
├── crawler
│ ├── imports.go
│ ├── person.go
│ └── cmain.go
├── tocrawl
│ └── ghup.go
└── mergedocs
│ └── mergedocs.go
├── service
├── web
│ ├── .DS_Store
│ ├── static
│ │ └── robots.txt
│ ├── images
│ │ ├── logo-16.png
│ │ ├── logo-32.png
│ │ ├── logo-error-64.png
│ │ ├── glyphicons-halflings.png
│ │ └── glyphicons-halflings-white.png
│ ├── resource
│ │ ├── icon.psd
│ │ ├── logo.png
│ │ ├── gopher.png
│ │ ├── logo-128.png
│ │ ├── logo-16.png
│ │ ├── logo-256.png
│ │ ├── logo-32.png
│ │ ├── logo-64.png
│ │ ├── error-logo.png
│ │ ├── logo-error.png
│ │ ├── logo-error.psd
│ │ ├── gplus-cover.png
│ │ ├── logo-error-16.png
│ │ ├── logo-error-64.png
│ │ ├── twitter-cover.png
│ │ ├── logo-error-128.png
│ │ ├── round-logo-256.png
│ │ └── magnifying_glass_black.png
│ ├── web
│ │ ├── 404.html
│ │ ├── searchbox.html
│ │ ├── crawlhistory.html
│ │ ├── badgepage.html
│ │ ├── tops.html
│ │ ├── add.html
│ │ ├── footer.html
│ │ ├── header.html
│ │ ├── index.html
│ │ ├── search.html
│ │ ├── infoapi.html
│ │ ├── about.html
│ │ └── view.html
│ ├── db_test.go
│ ├── add.go
│ ├── crawlhistory.go
│ ├── view.go
│ ├── tops.go
│ ├── db.go
│ ├── api.go
│ └── css
│ │ └── gc.css
└── stored
│ └── stored.go
├── scripts
├── backup-conf.json.template
├── crawler.gs
├── web.gs
├── stored.gs
├── gen_proto.gs
├── testall.gs
├── backup.gs
└── install.gs
├── chrome-app
├── dist
│ ├── screenshot-1.png
│ ├── screenshot-2.png
│ └── promo-440x280-1.png
└── go-search
│ ├── logo-128.png
│ ├── logo-16.png
│ └── manifest.json
├── crawler.bat
├── indexer.bat
├── server.bat
├── shared
└── proto
│ ├── store.go
│ ├── spider.go
│ ├── stored.proto
│ ├── store.proto
│ ├── spider.proto
│ └── stored.pb.go
├── gcse.go
├── bi.go
├── ACKNOWLEDGEMENTS
├── utils
├── utils_test.go
├── json.go
├── utils.go
├── segment_test.go
└── segment.go
├── .gitignore
├── tokenize_test.go
├── utils.go
├── conf.json.template
├── tools
├── fillfound
│ ├── fillfound.go
│ └── fillfound_test.go
├── countdocs
│ └── countdocs.go
├── exps
│ └── importsents.go
├── fixcrawldb
│ └── fixcrawldb.go
└── dump
│ └── dump.go
├── LICENSE
├── db_test.go
├── README.md
├── license.txt
├── data_test.go
├── store
├── repo_test.go
├── repo.go
├── store_test.go
├── history.go
├── history_test.go
└── store.go
├── text_test.go
├── crawlerdb.go
├── configs
└── configs.go
├── crawler_test.go
├── data.go
└── index_test.go
/spider/github/testdata/sub/README:
--------------------------------------------------------------------------------
1 | The sub folder of testdata.
2 |
--------------------------------------------------------------------------------
/spider/github/testdata/README:
--------------------------------------------------------------------------------
1 | This folder is used for Github spider testing.
2 |
--------------------------------------------------------------------------------
/pipelines/spider/spider:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/pipelines/spider/spider
--------------------------------------------------------------------------------
/service/web/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/.DS_Store
--------------------------------------------------------------------------------
/scripts/backup-conf.json.template:
--------------------------------------------------------------------------------
1 | {
2 | gdrive: {
3 | folder: {
4 | // id: ""
5 | }
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/service/web/static/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Disallow: /search
3 | Disallow: /add
4 | Disallow: /api
5 |
--------------------------------------------------------------------------------
/service/web/images/logo-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-16.png
--------------------------------------------------------------------------------
/service/web/images/logo-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-32.png
--------------------------------------------------------------------------------
/service/web/resource/icon.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/icon.psd
--------------------------------------------------------------------------------
/service/web/resource/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo.png
--------------------------------------------------------------------------------
/chrome-app/dist/screenshot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/screenshot-1.png
--------------------------------------------------------------------------------
/chrome-app/dist/screenshot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/screenshot-2.png
--------------------------------------------------------------------------------
/chrome-app/go-search/logo-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/go-search/logo-128.png
--------------------------------------------------------------------------------
/chrome-app/go-search/logo-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/go-search/logo-16.png
--------------------------------------------------------------------------------
/service/web/resource/gopher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/gopher.png
--------------------------------------------------------------------------------
/service/web/resource/logo-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-128.png
--------------------------------------------------------------------------------
/service/web/resource/logo-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-16.png
--------------------------------------------------------------------------------
/service/web/resource/logo-256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-256.png
--------------------------------------------------------------------------------
/service/web/resource/logo-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-32.png
--------------------------------------------------------------------------------
/service/web/resource/logo-64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-64.png
--------------------------------------------------------------------------------
/chrome-app/dist/promo-440x280-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/chrome-app/dist/promo-440x280-1.png
--------------------------------------------------------------------------------
/service/web/resource/error-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/error-logo.png
--------------------------------------------------------------------------------
/service/web/resource/logo-error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error.png
--------------------------------------------------------------------------------
/service/web/resource/logo-error.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error.psd
--------------------------------------------------------------------------------
/service/web/images/logo-error-64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/logo-error-64.png
--------------------------------------------------------------------------------
/service/web/resource/gplus-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/gplus-cover.png
--------------------------------------------------------------------------------
/service/web/resource/logo-error-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-16.png
--------------------------------------------------------------------------------
/service/web/resource/logo-error-64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-64.png
--------------------------------------------------------------------------------
/service/web/resource/twitter-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/twitter-cover.png
--------------------------------------------------------------------------------
/crawler.bat:
--------------------------------------------------------------------------------
1 | go install github.com/daviddengcn/gcse/crawler
2 | @if errorlevel 1 goto exit
3 | %GOPATH%\bin\crawler
4 |
5 | :exit
6 |
--------------------------------------------------------------------------------
/indexer.bat:
--------------------------------------------------------------------------------
1 | go install github.com/daviddengcn/gcse/indexer
2 | @if errorlevel 1 goto exit
3 | %GOPATH%\bin\indexer
4 |
5 | :exit
6 |
--------------------------------------------------------------------------------
/server.bat:
--------------------------------------------------------------------------------
1 | go install github.com/daviddengcn/gcse/server
2 | @if errorlevel 1 goto exit
3 | %GOPATH%\bin\server
4 |
5 | :exit
6 |
--------------------------------------------------------------------------------
/service/web/resource/logo-error-128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/logo-error-128.png
--------------------------------------------------------------------------------
/service/web/resource/round-logo-256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/round-logo-256.png
--------------------------------------------------------------------------------
/service/web/images/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/glyphicons-halflings.png
--------------------------------------------------------------------------------
/service/web/resource/magnifying_glass_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/resource/magnifying_glass_black.png
--------------------------------------------------------------------------------
/spider/github/testdata/pkg_test.go:
--------------------------------------------------------------------------------
1 | package pkg
2 |
3 | import (
4 | "github.com/golangplus/testing/assert"
5 | )
6 |
7 | var _ = assert.Equal
8 |
--------------------------------------------------------------------------------
/service/web/images/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daviddengcn/gcse/HEAD/service/web/images/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/shared/proto/store.go:
--------------------------------------------------------------------------------
1 | package gcsepb
2 |
3 | func (m *Repository) PutPackage(path string, pkg *Package) {
4 | if m.Packages == nil {
5 | m.Packages = make(map[string]*Package)
6 | }
7 | m.Packages[path] = pkg
8 | }
9 |
--------------------------------------------------------------------------------
/spider/github/testdata/pkg.go:
--------------------------------------------------------------------------------
1 | package pkg
2 |
3 | import (
4 | "github.com/daviddengcn/gcse/spider/github"
5 | strs "github.com/golangplus/strings"
6 | )
7 |
8 | var _ = github.ErrInvalidPackage
9 | var _ = strs.Get
10 |
--------------------------------------------------------------------------------
/gcse.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package gcse is the core supporting library for go-code-search-engine (GCSE).
3 | Its exported types and functions are mainly intended for its sub packages. If you
4 | need some of the functions elsewhere, copy the code into your own project.
5 | */
6 | package gcse
7 |
--------------------------------------------------------------------------------
/bi.go:
--------------------------------------------------------------------------------
1 | package gcse
2 |
3 | import (
4 | "github.com/daviddengcn/go-easybi"
5 | )
6 |
7 | func AddBiValueAndProcess(aggr bi.AggregateMethod, name string, value int) {
8 | bi.AddValue(aggr, name, value)
9 | bi.Flush()
10 | bi.Process()
11 | }
12 |
--------------------------------------------------------------------------------
/scripts/crawler.gs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env gosl
2 |
3 | APPS := []string {
4 | "tocrawl", "crawler", "mergedocs", "indexer",
5 | }
6 |
7 | for {
8 | for _, app := range APPS {
9 | Printf("Running %s...\n", app)
10 | Bash(app)
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/scripts/web.gs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env gosl
2 |
3 | import "time"
4 | import "github.com/daviddengcn/gcse/configs"
5 |
6 | Printfln("Logging to %q...", configs.LogDir)
7 |
8 | for {
9 | Bash("web -log_dir %s", configs.LogDir)
10 | time.Sleep(time.Second)
11 | }
12 |
--------------------------------------------------------------------------------
/scripts/stored.gs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env gosl
2 |
3 | import "time"
4 | import "github.com/daviddengcn/gcse/configs"
5 |
6 | Printfln("Logging to %q...", configs.LogDir)
7 |
8 | for {
9 | Bash("stored -log_dir %s", configs.LogDir)
10 | time.Sleep(time.Second)
11 | }
12 |
--------------------------------------------------------------------------------
/ACKNOWLEDGEMENTS:
--------------------------------------------------------------------------------
1 | (sorted by names)
2 | Alif Rachmawadi(subosito) Fix a bug on www server.
3 | mipearson Created the step-by-step document and some optimizations that make the command more robust.
4 | Robert Melton(@robertmeta) Textual refining and code refactoring.
5 |
--------------------------------------------------------------------------------
/service/web/web/404.html:
--------------------------------------------------------------------------------
1 | {{template "header.html" .UIUtils.Slice "404" "404"}}
2 |
4 | {{range .Lists}}
5 |
6 |
7 |
{{.Info}}
8 |
{{.Name}}
9 |
10 |
11 | {{range .Items}}
12 |
13 | {{.Index}}.
14 | {{.Info}}
15 |
16 |
17 | {{end}}
18 |
19 |
20 | {{end}}
21 |
25 |
26 | {{template "footer.html"}}
27 |
--------------------------------------------------------------------------------
/tools/countdocs/countdocs.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 |
6 | "github.com/golangplus/fmt"
7 |
8 | "github.com/daviddengcn/gcse"
9 | "github.com/daviddengcn/sophie"
10 | "github.com/daviddengcn/sophie/kv"
11 | )
12 |
13 | func main() {
14 | // path := "data/docs"
15 | path := "data/docs-updated"
16 | kvDir := kv.DirInput(sophie.LocalFsPath(path))
17 |
18 | cnt, err := kvDir.PartCount()
19 | if err != nil {
20 | log.Fatalf("kvDir.PartCount failed: %v", err)
21 | }
22 |
23 | totalEntries := 0
24 | for i := 0; i < cnt; i++ {
25 | it, err := kvDir.Iterator(i)
26 | if err != nil {
27 | log.Fatalf("kvDir.Collector(%d) failed: %v", i, err)
28 | }
29 |
30 | var key sophie.RawString
31 | var val gcse.DocInfo
32 | for {
33 | if err := it.Next(&key, &val); err != nil {
34 | if err == sophie.EOF {
35 | break
36 | }
37 | log.Fatalf("it.Next failed %v", err)
38 | }
39 | totalEntries++
40 | }
41 |
42 | it.Close()
43 | }
44 |
45 | fmtp.Printfln("Total %d files, %d entries.", cnt, totalEntries)
46 | }
47 |
--------------------------------------------------------------------------------
/service/web/web/add.html:
--------------------------------------------------------------------------------
1 | {{template "header.html" .UIUtils.Slice "Add Packages" "add"}}
2 | {{if .Message }}
3 |
4 |
5 | × Close
6 | {{.Message}}
7 |
8 |
9 | {{end}}
10 |
20 |
25 | {{template "footer.html"}}
26 |
--------------------------------------------------------------------------------
/tools/fillfound/fillfound_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/golang/protobuf/ptypes"
8 | "github.com/golangplus/testing/assert"
9 | "github.com/golangplus/time"
10 |
11 | "github.com/daviddengcn/gcse"
12 | "github.com/daviddengcn/gcse/configs"
13 | "github.com/daviddengcn/gcse/store"
14 |
15 | sppb "github.com/daviddengcn/gcse/proto/spider"
16 | )
17 |
18 | func init() {
19 | configs.SetTestingDataPath()
20 | }
21 |
22 | func TestDoFill(t *testing.T) {
23 | const (
24 | site = "github.com"
25 | path = "daviddengcn/gcse"
26 | )
27 | tm := time.Now().Add(-20 * timep.Day)
28 | cDB := gcse.LoadCrawlerDB()
29 | cDB.PackageDB.Put(site+"/"+path, gcse.CrawlingEntry{
30 | ScheduleTime: tm.Add(10 * timep.Day),
31 | })
32 | assert.NoError(t, cDB.Sync())
33 |
34 | assert.NoError(t, doFill())
35 |
36 | h, err := store.ReadPackageHistory(site, path)
37 | assert.NoError(t, err)
38 | ts, _ := ptypes.TimestampProto(tm)
39 | assert.Equal(t, "h", h, &sppb.HistoryInfo{
40 | FoundTime: ts,
41 | FoundWay: "unknown",
42 | })
43 | }
44 |
--------------------------------------------------------------------------------
/scripts/backup.gs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env gosl
2 |
3 | import "flag"
4 | import "github.com/daviddengcn/go-villa"
5 | import "github.com/daviddengcn/go-ljson-conf"
6 |
7 | backupFolders := flag.String("folder", "docs:crawler", "Colon-delimited folders to backup.")
8 |
9 | flag.Parse()
10 |
11 | dir := villa.Path(ScriptDir())
12 |
13 | conf, _ := ljconf.Load(dir.Join("backup-conf.json").S())
14 |
15 | fdid := conf.String("gdrive.folder.id", "")
16 | if fdid == "" {
17 | Fatalf("Please set gdrive.folder.id in configuration!")
18 | }
19 |
20 | today := Now().Format("2006-01-02")
21 | Printf("Backup to %s\n", today)
22 |
23 | folders := Split(*backupFolders, ":")
24 |
25 | Println("Compressing files")
26 | for _, folder := range folders {
27 | Printfln("Compressing data/%s into data/%s.%s.tar.gz", folder, folder, today)
28 | MustSucc(Bash("tar czf data/%s.%s.tar.gz data/%s", folder, today, folder))
29 | }
30 |
31 | Println("Uploading to GDrive")
32 | for _, folder := range folders {
33 | MustSucc(Bash("gdrive upload -f data/%s.%s.tar.gz -p %s", folder, today, fdid))
34 | Bash("rm data/%s.%s.tar.gz", folder, today)
35 | }
36 |
37 | Println("Backup finished")
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Yi Deng
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 |
6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7 |
--------------------------------------------------------------------------------
/scripts/install.gs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env gosl
2 |
3 | import "flag"
4 |
5 | goGet := flag.Bool("go_get", true, `Whether do "go get" before installing`)
6 | doTest := flag.Bool("do_test", false, `Whether do "go test" on essential packages`)
7 | compileAll := flag.Bool("a", true, `Whether use -a in go install command`)
8 |
9 | flag.Parse()
10 |
11 | const GCSE = "github.com/daviddengcn/gcse"
12 | APPS := []string {
13 | "pipelines/tocrawl", "pipelines/crawler", "pipelines/mergedocs", "pipelines/indexer", "service/stored", "service/web",
14 | }
15 |
16 | if *goGet {
17 | Printfln("go get -u -v %s", GCSE)
18 | MustSucc(Bash("go get -u -v %s", GCSE))
19 | for _, a := range APPS {
20 | Printfln("go get -u -v %s/%s", GCSE, a)
21 | MustSucc(Bash("go get -u -v %s/%s", GCSE, a))
22 | }
23 | }
24 |
25 | if *doTest {
26 | Println("go test -a")
27 | MustSucc(Bash("go test -a"))
28 | Println("go test store/*.go")
29 | MustSucc(Bash("go test store/*.go"))
30 | Println("go test spider/*.go")
31 | MustSucc(Bash("go test spider/*.go"))
32 | }
33 |
34 | buildFlags := ""
35 | if *compileAll {
36 | buildFlags += " -a"
37 | }
38 |
39 | for _, a := range APPS {
40 | Printfln("go install %s %s/%s", buildFlags, GCSE, a)
41 | MustSucc(Bash("go install %s %s/%s", buildFlags, GCSE, a))
42 | }
43 |
44 |
--------------------------------------------------------------------------------
/service/stored/stored.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "flag"
6 | "net"
7 |
8 | "github.com/daviddengcn/gcse/configs"
9 | "github.com/daviddengcn/gcse/store"
10 | "github.com/daviddengcn/gcse/utils"
11 | "github.com/golang/glog"
12 | "google.golang.org/grpc"
13 |
14 | gpb "github.com/daviddengcn/gcse/shared/proto"
15 | )
16 |
17 | type server struct {
18 | }
19 |
20 | var _ gpb.StoreServiceServer = (*server)(nil)
21 |
22 | func (s *server) PackageCrawlHistory(_ context.Context, req *gpb.PackageCrawlHistoryReq) (*gpb.PackageCrawlHistoryResp, error) {
23 | site, path := utils.SplitPackage(req.Package)
24 | info, err := store.ReadPackageHistory(site, path)
25 | if err != nil {
26 | glog.Errorf("ReadPackageHistoryOf %q %q failed: %v", site, path, err)
27 | return nil, err
28 | }
29 | return &gpb.PackageCrawlHistoryResp{Info: info}, nil
30 | }
31 |
32 | func main() {
33 | addr := flag.String("addr", configs.StoreDAddr, "addr to listen")
34 |
35 | flag.Parse()
36 |
37 | glog.Infof("Listening to %s", *addr)
38 | lis, err := net.Listen("tcp", *addr)
39 | if err != nil {
40 | glog.Fatalf("failed to listen: %v", err)
41 | }
42 | grpcServer := grpc.NewServer()
43 | gpb.RegisterStoreServiceServer(grpcServer, &server{})
44 | grpcServer.Serve(lis)
45 | }
46 |
--------------------------------------------------------------------------------
/shared/proto/store.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto3";
2 |
3 | package gcse;
4 |
5 | option go_package = "gcsepb";
6 |
7 | import "github.com/daviddengcn/gcse/shared/proto/spider.proto";
8 |
9 | message PackageInfo {
10 | string name = 1;
11 | string package = 2;
12 | string author = 3;
13 | int32 stars = 4;
14 | string synopsis = 5;
15 | string description = 6;
16 | string project_url = 7;
17 | string readme_fn = 8;
18 | string readme_data = 9;
19 | repeated string imports = 10;
20 | repeated string test_imports = 11;
21 | repeated string exported = 12;
22 | repeated string references = 18;
23 |
24 | CrawlingInfo crawling_info = 17;
25 |
26 | // Available if the package is not the repo's root.
27 | FolderInfo folder_info = 14;
28 |
29 | // Available if the package is the repo's root.
30 | RepoInfo repo_info = 15;
31 | }
32 |
33 | message PersonInfo {
34 | CrawlingInfo crawling_info = 1;
35 | }
36 |
// Repository describes a repository: its branch, signature, packages,
// README, star count and crawling information.
message Repository {
  string branch = 6;
  string signature = 7;

  // map from relative path, e.g. "proto/store", to Package
  map<string, Package> packages = 8;

  string ReadmeFn = 2;   // No directory info
  string ReadmeData = 3; // Raw content, could be md, txt, etc.
  int32 stars = 4;

  CrawlingInfo crawling_info = 5;
}
50 |
--------------------------------------------------------------------------------
/spider/godocorg/packages.go:
--------------------------------------------------------------------------------
1 | package godocorg
2 |
3 | import (
4 | "encoding/json"
5 | "net/http"
6 |
7 | "github.com/daviddengcn/gddo/doc"
8 | "github.com/golangplus/errors"
9 | )
10 |
11 | const (
12 | godocApiUrl = "http://api.godoc.org/packages"
13 | )
14 |
15 | // FetchAllPackagesInGodoc fetches the list of all packages on godoc.org
16 | func FetchAllPackagesInGodoc(httpClient doc.HttpClient) ([]string, error) {
17 | req, err := http.NewRequest("GET", godocApiUrl, nil)
18 | if err != nil {
19 | return nil, errorsp.WithStacksAndMessage(err, "new request for %v failed", godocApiUrl)
20 | }
21 | resp, err := httpClient.Do(req)
22 | if err != nil {
23 | return nil, errorsp.WithStacksAndMessage(err, "fetching %v failed", godocApiUrl)
24 | }
25 | defer resp.Body.Close()
26 | if resp.StatusCode != 200 {
27 | return nil, errorsp.NewWithStacks("StatusCode: %d", resp.StatusCode)
28 | }
29 | var results struct {
30 | Results []struct {
31 | Path string
32 | }
33 | }
34 | dec := json.NewDecoder(resp.Body)
35 |
36 | if err := dec.Decode(&results); err != nil {
37 | return nil, errorsp.WithStacks(err)
38 | }
39 | list := make([]string, 0, len(results.Results))
40 | for _, res := range results.Results {
41 | list = append(list, res.Path)
42 | }
43 | return list, nil
44 | }
45 |
--------------------------------------------------------------------------------
/db_test.go:
--------------------------------------------------------------------------------
1 | package gcse
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/golangplus/testing/assert"
7 |
8 | "github.com/daviddengcn/go-villa"
9 | )
10 |
11 | func TestMemDB_Bug_Sync(t *testing.T) {
12 | path := villa.Path(".").Join("testmemdb.gob")
13 | if path.Exists() {
14 | path.Remove()
15 | }
16 |
17 | db := NewMemDB(".", "testmemdb")
18 | db.Put("s", 1)
19 | err := db.Sync()
20 | if err != nil {
21 | t.Error(err)
22 | }
23 |
24 | assert.Equal(t, "Exists", path.Exists(), true)
25 | if err := path.Remove(); err != nil {
26 | t.Error(err)
27 | }
28 | assert.Equal(t, "Exists", path.Exists(), false)
29 |
30 | //if err := db.Load(); err != nil {
31 | // t.Error(err)
32 | //}
33 | }
34 |
35 | func TestMemDB_Recover(t *testing.T) {
36 | path := villa.Path(".").Join("testmemdb.gob")
37 | if path.Exists() {
38 | path.Remove()
39 | }
40 |
41 | db := NewMemDB(".", "testmemdb")
42 | db.Put("s", 1)
43 | if err := db.Sync(); err != nil {
44 | t.Error(err)
45 | return
46 | }
47 |
48 | if err := path.Rename(path + ".new"); err != nil {
49 | t.Error(err)
50 | return
51 | }
52 | // Now in the status of fn.new exists, fn not exist
53 |
54 | if err := db.Load(); err != nil {
55 | t.Error(err)
56 | return
57 | }
58 | var vl int
59 | if ok := db.Get("s", &vl); !ok {
60 | t.Error("Recover failed!")
61 | return
62 | }
63 | assert.Equal(t, "vl", vl, 1)
64 | }
65 |
--------------------------------------------------------------------------------
/service/web/add.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "net/http"
7 | "strings"
8 |
9 | "github.com/daviddengcn/gcse"
10 | "github.com/daviddengcn/gddo/doc"
11 | )
12 |
13 | func filterPackages(pkgs []string) (res []string) {
14 | for _, pkg := range pkgs {
15 | pkg = gcse.TrimPackageName(pkg)
16 | if !doc.IsValidRemotePath(pkg) {
17 | continue
18 | }
19 | res = append(res, pkg)
20 | }
21 | return
22 | }
23 |
24 | func pageAdd(w http.ResponseWriter, r *http.Request) {
25 | w.Header().Set("Content-Type", "text/html")
26 |
27 | pkgsStr := r.FormValue("pkg")
28 | pkgMessage := ""
29 | msgCls := "success"
30 | taValue := ""
31 | if pkgsStr != "" {
32 | pkgs := filterPackages(strings.Split(pkgsStr, "\n"))
33 | if len(pkgs) > 0 {
34 | log.Printf("%d packages added!", len(pkgs))
35 | pkgMessage = fmt.Sprintf("Totally %d package(s) added!", len(pkgs))
36 | gcse.AppendPackages(pkgs)
37 | } else {
38 | msgCls = "danger"
39 | pkgMessage = "No package added! Check the format you submitted, please."
40 | taValue = pkgsStr
41 | }
42 | }
43 | err := templates.ExecuteTemplate(w, "add.html", struct {
44 | UIUtils
45 | Message string
46 | MsgCls string
47 | TAValue string
48 | }{
49 | Message: pkgMessage,
50 | MsgCls: msgCls,
51 | TAValue: taValue,
52 | })
53 | if err != nil {
54 | http.Error(w, err.Error(), http.StatusInternalServerError)
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Go Search [](http://go-search.org/view?id=github.com%2Fdaviddengcn%2Fgcse)
2 | =========
3 |
4 | A keyword search engine helping people to find popular and relevant Go packages.
5 |
6 | Online service: [Go Search](http://go-search.org/)
7 |
8 | This is the root package with shared functions.
9 |
10 | Sub packages are commands for running:
11 |
12 | * [HTTP Server](http://github.com/daviddengcn/gcse/server): Searching and web service
13 | * [ToCrawl](http://github.com/daviddengcn/gcse/tocrawl): Find packages to crawl.
14 | * [Crawler](http://github.com/daviddengcn/gcse/crawler): Crawling package files.
15 | * [MergeDocs](http://github.com/daviddengcn/gcse/mergedocs): Merge crawled package files with doc DB.
16 | * [Indexer](http://github.com/daviddengcn/gcse/indexer): Analyzing package information and generating indexed data for searching.
17 |
18 | Development
19 | -----------
20 |
21 | You'll need to perform the following steps to get a basic server running:
22 |
23 | 1. Create a basic `conf.json` file, limiting the crawler to a one-minute run: `{ "crawler": { "due_per_run": "1m" } }`
24 | 1. Run the package finder: `go run tocrawl/*.go`
25 | 1. Run the crawler: `go run crawler/*.go`
26 | 1. Merge the crawled docs: `go run mergedocs/*.go`
27 | 1. Run the indexer: `go run indexer/*.go`
28 | 1. Run the server: `go run server/*.go`
29 | 1. Visit [http://localhost:8080](http://localhost:8080) in your browser
30 |
31 |
32 | LICENSE
33 | -------
34 | BSD license.
35 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, David Deng
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice,
8 | this list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | 3. Neither the name of the PostgreSQL Global Development Group nor the names
13 | of its contributors may be used to endorse or promote products derived
14 | from this software without specific prior written permission.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 | POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/utils/utils.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "runtime"
7 | "strings"
8 | )
9 |
// SplitPackage splits pkg at its first "/" into the part before it (site) and
// the part after it (path). When pkg contains no "/", site is the whole of pkg
// and path is empty.
func SplitPackage(pkg string) (site, path string) {
	slash := strings.IndexByte(pkg, '/')
	if slash < 0 {
		return pkg, ""
	}
	return pkg[:slash], pkg[slash+1:]
}
20 |
// LogError logs err, prefixed by the message built from format and args, and
// otherwise ignores it. It does nothing when err is nil.
func LogError(err error, format string, args ...interface{}) {
	if err != nil {
		log.Printf("%s: %v", fmt.Sprintf(format, args...), err)
	}
}
28 |
// Size is a number of bytes that formats itself in a compact, human-readable
// form using 1024-based units.
type Size int64

// String renders s scaled to the largest unit (none, K, M, G, T, P or E) in
// which its magnitude is at least 1, followed by the unit letter. Values whose
// scaled integer part is below 10 are printed with two decimals, below 100
// with one decimal, and anything else as a truncated integer.
//
// Values of 1024^6 bytes and above use the "E" unit; since int64 cannot hold
// 1024^7, no larger unit is ever needed.
func (s Size) String() string {
	const units = "KMGTPE"

	// Climb one unit at a time while the scaled value still reaches 1024.
	// base never exceeds 1024^6 (2^60), so it cannot overflow int64.
	unit, base := "", int64(1)
	for i := 0; i < len(units) && int64(s)/base >= 1024; i++ {
		unit, base = units[i:i+1], base*1024
	}

	whole := int64(s) / base
	switch {
	case whole < 10:
		return fmt.Sprintf("%.2f%s", float64(s)/float64(base), unit)
	case whole < 100:
		return fmt.Sprintf("%.1f%s", float64(s)/float64(base), unit)
	}
	return fmt.Sprintf("%d%s", whole, unit)
}
59 |
60 | func DumpMemStats() {
61 | var ms runtime.MemStats
62 | runtime.ReadMemStats(&ms)
63 | log.Printf("[MemStats] Alloc: %v, TotalAlloc: %v, Sys: %v, Go: %d",
64 | Size(ms.Alloc), Size(ms.TotalAlloc), Size(ms.Sys),
65 | runtime.NumGoroutine())
66 | }
67 |
--------------------------------------------------------------------------------
/tools/exps/importsents.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "log"
6 |
7 | "github.com/daviddengcn/gcse"
8 | "github.com/daviddengcn/gcse/configs"
9 | "github.com/daviddengcn/go-villa"
10 | )
11 |
12 | const (
13 | fnDocDB = "docdb"
14 | )
15 |
16 | var (
17 | DocDBPath villa.Path
18 |
19 | // CrawlerDBPath villa.Path
20 | )
21 |
22 | func init() {
23 | DocDBPath = configs.DataRoot.Join(fnDocDB)
24 | // CrawlerDBPath = gcse.DataRoot.Join(fnCrawlerDB)
25 | }
26 |
27 | func main() {
28 | docDB := gcse.NewMemDB(DocDBPath, gcse.KindDocDB)
29 | countAll, countReadme, countHasSents := 0, 0, 0
30 | countSents := 0
31 |
32 | f, err := villa.Path("exps/notfound.txt").Create()
33 | if err != nil {
34 | log.Fatal(err)
35 | }
36 | defer f.Close()
37 |
38 | log.Printf("Start processing ...")
39 | if err := docDB.Iterate(func(key string, val interface{}) error {
40 | countAll++
41 |
42 | d := val.(gcse.DocInfo)
43 | if d.ReadmeData != "" {
44 | countReadme++
45 |
46 | readme := gcse.ReadmeToText(d.ReadmeFn, d.ReadmeData)
47 |
48 | sents := gcse.ChooseImportantSentenses(readme, d.Name, d.Package)
49 | if len(sents) > 0 {
50 | countSents += len(sents)
51 | countHasSents++
52 | } else {
53 | fmt.Fprintln(f, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$")
54 | fmt.Fprintf(f, "%s - %s - %s\n", d.Name, d.Package, d.ReadmeFn)
55 | fmt.Fprintf(f, "%s\n", readme)
56 | }
57 | }
58 |
59 | return nil
60 | }); err != nil {
61 | log.Fatalf("docDB.Iterate failed: %v", err)
62 | }
63 |
64 | log.Printf("%d documents processed.", countAll)
65 | log.Printf("%d have readme.", countReadme)
66 | log.Printf("%d found %d important sentenses.", countHasSents, countSents)
67 | }
68 |
--------------------------------------------------------------------------------
/spider/ranking_test.go:
--------------------------------------------------------------------------------
1 | package spider
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | "time"
7 |
8 | "github.com/golang/protobuf/ptypes"
9 | "github.com/golangplus/testing/assert"
10 | "github.com/golangplus/time"
11 |
12 | gpb "github.com/daviddengcn/gcse/shared/proto"
13 | )
14 |
15 | func TestLikeGoSubFolder(t *testing.T) {
16 | pos_cases := []string{
17 | "go", "v8", "v-8",
18 | }
19 | for _, c := range pos_cases {
20 | assert.True(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
21 | }
22 | neg_cases := []string{
23 | "js", "1234", "1234-5678", "1234_5678",
24 | }
25 | for _, c := range neg_cases {
26 | assert.False(t, fmt.Sprintf("LikeGoSubFolder %v", c), LikeGoSubFolder(c))
27 | }
28 | }
29 |
30 | func TestCheckPackageStatus(t *testing.T) {
31 | // No crawling info, new package
32 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{}, nil), OutOfDate)
33 | pkgCrawlTime, _ := ptypes.TimestampProto(time.Now().Add(-5 * timep.Day))
34 |
35 | newRepoInfoCrawlTime, _ := ptypes.TimestampProto(time.Now().Add(-3 * timep.Day))
36 | newPkgUpdateTime, _ := ptypes.TimestampProto(time.Now().Add(-4 * timep.Day))
37 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{
38 | CrawlingInfo: &gpb.CrawlingInfo{
39 | CrawlingTime: pkgCrawlTime,
40 | },
41 | }, &gpb.RepoInfo{
42 | CrawlingTime: newRepoInfoCrawlTime,
43 | LastUpdated: newPkgUpdateTime,
44 | }), OutOfDate)
45 |
46 | newPkgUpdateTime, _ = ptypes.TimestampProto(time.Now().Add(-6 * timep.Day))
47 | assert.Equal(t, "CheckPackageStatus", CheckPackageStatus(&gpb.PackageInfo{
48 | CrawlingInfo: &gpb.CrawlingInfo{
49 | CrawlingTime: pkgCrawlTime,
50 | },
51 | }, &gpb.RepoInfo{
52 | CrawlingTime: newRepoInfoCrawlTime,
53 | LastUpdated: newPkgUpdateTime,
54 | }), UpToDate)
55 | }
56 |
--------------------------------------------------------------------------------
/data_test.go:
--------------------------------------------------------------------------------
1 | package gcse
2 |
3 | import (
4 | "testing"
5 | "time"
6 |
7 | "github.com/golangplus/bytes"
8 | "github.com/golangplus/testing/assert"
9 |
10 | "github.com/daviddengcn/go-index"
11 | )
12 |
13 | func TestDocInfo(t *testing.T) {
14 | src := DocInfo{
15 | Name: "gcse",
16 | Package: "github.com/daviddengcn/gcse",
17 | Author: "github.com/daviddengcn",
18 | LastUpdated: time.Now().Round(0),
19 | StarCount: 10,
20 | Synopsis: "Go Package Search Engine",
21 | Description: "More details about GCSE",
22 | ProjectURL: "http://github.com/daviddengcn/gcse",
23 | ReadmeFn: "readme.txt",
24 | ReadmeData: "Just read me",
25 | Imports: []string{
26 | "github.com/daviddengcn/go-villa",
27 | "github.com/daviddengcn/sophie",
28 | },
29 | TestImports: []string{
30 | "github.com/daviddengcn/go-check",
31 | },
32 | Exported: []string{
33 | "DocInfo", "CheckRuneType",
34 | },
35 | }
36 | var buf bytesp.Slice
37 | assert.NoError(t, src.WriteTo(&buf))
38 |
39 | var dst DocInfo
40 | assert.NoError(t, dst.ReadFrom(&buf, -1))
41 | dst.LastUpdated = dst.LastUpdated.Round(0)
42 |
43 | assert.StringEqual(t, "dst", dst, src)
44 |
45 | // checking the bug introduced by reusing slice
46 | dst2 := dst
47 | assert.StringEqual(t, "dst2.Imports[0]", dst2.Imports[0],
48 | "github.com/daviddengcn/go-villa")
49 |
50 | src.Imports[0] = "github.com/daviddengcn/go-assert"
51 | buf = nil
52 | assert.NoError(t, src.WriteTo(&buf))
53 | assert.NoError(t, dst.ReadFrom(&buf, -1))
54 | assert.StringEqual(t, "dst", dst, src)
55 |
56 | assert.StringEqual(t, "dst2.Imports[0]", dst2.Imports[0],
57 | "github.com/daviddengcn/go-villa")
58 | }
59 |
60 | func TestCheckRuneType_BOM(t *testing.T) {
61 | tp := CheckRuneType('A', 0xfeff)
62 | assert.Equal(t, "CheckRuneType(A, 0xfeff)", tp, index.TokenSep)
63 | }
64 |
--------------------------------------------------------------------------------
/service/web/web/footer.html:
--------------------------------------------------------------------------------
1 |
2 |