├── AUTHORS
├── CONTRIBUTORS
├── LICENSE
├── README
├── cmd
    ├── cgrep
    │   └── cgrep.go
    ├── cindex
    │   └── cindex.go
    └── csearch
    │   └── csearch.go
├── index
    ├── merge.go
    ├── merge_test.go
    ├── mmap_bsd.go
    ├── mmap_linux.go
    ├── mmap_windows.go
    ├── read.go
    ├── read_test.go
    ├── regexp.go
    ├── regexp_test.go
    ├── write.go
    └── write_test.go
├── lib
    ├── README.template
    ├── buildall
    ├── setup
    ├── uploadall
    └── version
├── regexp
    ├── copy.go
    ├── match.go
    ├── regexp.go
    ├── regexp_test.go
    └── utf.go
└── sparse
    └── set.go


/AUTHORS:
--------------------------------------------------------------------------------
1 | # This source code is copyright "The Go Authors",
2 | # as defined by the AUTHORS file in the root of the Go tree.
3 | #
4 | # http://tip.golang.org/AUTHORS.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTORS:
--------------------------------------------------------------------------------
1 | # The official list of people who can contribute code to the repository
2 | # is maintained in the standard Go repository as the CONTRIBUTORS
3 | # file in the root of the Go tree.
4 | #
5 | # http://tip.golang.org/CONTRIBUTORS
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011 The Go Authors. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are
 5 | met:
 6 | 
 7 |    * Redistributions of source code must retain the above copyright
 8 | notice, this list of conditions and the following disclaimer.
 9 |    * Redistributions in binary form must reproduce the above
10 | copyright notice, this list of conditions and the following disclaimer
11 | in the documentation and/or other materials provided with the
12 | distribution.
13 |    * Neither the name of Google Inc. nor the names of its
14 | contributors may be used to endorse or promote products derived from
15 | this software without specific prior written permission.
16 | 
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | Code Search is a tool for indexing and then performing
 2 | regular expression searches over large bodies of source code.
 3 | It is a set of command-line programs written in Go.
 4 | 
 5 | For background and an overview of the commands,
 6 | see http://swtch.com/~rsc/regexp/regexp4.html.
 7 | 
 8 | To install:
 9 | 
10 | 	go get github.com/google/codesearch/cmd/...
11 | 
12 | Use "go get -u" to update an existing installation.
13 | 
14 | Russ Cox
15 | rsc@swtch.com
16 | June 2015
17 | 


--------------------------------------------------------------------------------
/cmd/cgrep/cgrep.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package main
 6 | 
 7 | import (
 8 | 	"flag"
 9 | 	"fmt"
10 | 	"log"
11 | 	"os"
12 | 	"runtime/pprof"
13 | 
14 | 	"github.com/google/codesearch/regexp"
15 | )
16 | 
17 | var usageMessage = `usage: cgrep [-c] [-h] [-i] [-l] [-n] regexp [file...]
18 | 
19 | Cgrep behaves like grep, searching for regexp, an RE2 (nearly PCRE) regular expression.
20 | 
21 | The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's
22 | flag parsing convention, they cannot be combined: the option pair -i -n 
23 | cannot be abbreviated to -in.
24 | `
25 | 
26 | func usage() { // prints the cgrep help text to standard error and exits with status 2
27 | 	fmt.Fprint(os.Stderr, usageMessage) // Fprint, not Fprintf: the text is not a format string
28 | 	os.Exit(2)
29 | }
30 | 
31 | var (
32 | 	iflag      = flag.Bool("i", false, "case-insensitive match")
33 | 	cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file")
34 | )
35 | 
36 | func main() {
37 | 	var g regexp.Grep
38 | 	g.AddFlags()
39 | 	g.Stdout = os.Stdout
40 | 	g.Stderr = os.Stderr
41 | 	flag.Usage = usage
42 | 	flag.Parse()
43 | 	args := flag.Args()
44 | 	if len(args) == 0 {
45 | 		flag.Usage()
46 | 	}
47 | 
48 | 	if *cpuProfile != "" {
49 | 		f, err := os.Create(*cpuProfile)
50 | 		if err != nil {
51 | 			log.Fatal(err)
52 | 		}
53 | 		defer f.Close()
54 | 		pprof.StartCPUProfile(f)
55 | 		defer pprof.StopCPUProfile()
56 | 	}
57 | 
58 | 	pat := "(?m)" + args[0]
59 | 	if *iflag {
60 | 		pat = "(?i)" + pat
61 | 	}
62 | 	re, err := regexp.Compile(pat)
63 | 	if err != nil {
64 | 		log.Fatal(err)
65 | 	}
66 | 	g.Regexp = re
67 | 	if len(args) == 1 {
68 | 		g.Reader(os.Stdin, "<standard input>")
69 | 	} else {
70 | 		for _, arg := range args[1:] {
71 | 			g.File(arg)
72 | 		}
73 | 	}
74 | 	if !g.Match {
75 | 		os.Exit(1)
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/cmd/cindex/cindex.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"log"
 11 | 	"os"
 12 | 	"path/filepath"
 13 | 	"runtime/pprof"
 14 | 	"sort"
 15 | 
 16 | 	"github.com/google/codesearch/index"
 17 | )
 18 | 
 19 | var usageMessage = `usage: cindex [-list] [-reset] [path...]
 20 | 
 21 | Cindex prepares the trigram index for use by csearch.  The index is the
 22 | file named by $CSEARCHINDEX, or else $HOME/.csearchindex.
 23 | 
 24 | The simplest invocation is
 25 | 
 26 | 	cindex path...
 27 | 
 28 | which adds the file or directory tree named by each path to the index.
 29 | For example:
 30 | 
 31 | 	cindex $HOME/src /usr/include
 32 | 
 33 | or, equivalently:
 34 | 
 35 | 	cindex $HOME/src
 36 | 	cindex /usr/include
 37 | 
 38 | If cindex is invoked with no paths, it reindexes the paths that have
 39 | already been added, in case the files have changed.  Thus, 'cindex' by
 40 | itself is a useful command to run in a nightly cron job.
 41 | 
 42 | The -list flag causes cindex to list the paths it has indexed and exit.
 43 | 
 44 | By default cindex adds the named paths to the index but preserves 
 45 | information about other paths that might already be indexed
 46 | (the ones printed by cindex -list).  The -reset flag causes cindex to
 47 | delete the existing index before indexing the new paths.
 48 | With no path arguments, cindex -reset removes the index.
 49 | `
 50 | 
 51 | func usage() { // prints the cindex help text to standard error and exits with status 2
 52 | 	fmt.Fprint(os.Stderr, usageMessage) // Fprint, not Fprintf: the text is not a format string
 53 | 	os.Exit(2)
 54 | }
 55 | 
 56 | var (
 57 | 	listFlag    = flag.Bool("list", false, "list indexed paths and exit")
 58 | 	resetFlag   = flag.Bool("reset", false, "discard existing index")
 59 | 	verboseFlag = flag.Bool("verbose", false, "print extra information")
 60 | 	cpuProfile  = flag.String("cpuprofile", "", "write cpu profile to this file")
 61 | )
 62 | 
 63 | func main() {
 64 | 	// main lists, resets, or updates the index according to the flags and path arguments.
 65 | 	flag.Usage = usage
 66 | 	flag.Parse()
 67 | 	args := flag.Args()
 68 | 
 69 | 	if *listFlag {
 70 | 		ix := index.Open(index.File())
 71 | 		for _, arg := range ix.Paths() {
 72 | 			fmt.Printf("%s\n", arg)
 73 | 		}
 74 | 		return
 75 | 	}
 76 | 
 77 | 	if *cpuProfile != "" {
 78 | 		f, err := os.Create(*cpuProfile)
 79 | 		if err != nil {
 80 | 			log.Fatal(err)
 81 | 		}
 82 | 		defer f.Close()
 83 | 		pprof.StartCPUProfile(f)
 84 | 		defer pprof.StopCPUProfile()
 85 | 	}
 86 | 
 87 | 	if *resetFlag && len(args) == 0 {
 88 | 		os.Remove(index.File())
 89 | 		return
 90 | 	}
 91 | 	if len(args) == 0 {
 92 | 		ix := index.Open(index.File())
 93 | 		for _, arg := range ix.Paths() {
 94 | 			args = append(args, arg)
 95 | 		}
 96 | 	}
 97 | 
 98 | 	// Translate paths to absolute paths so that we can
 99 | 	// generate the file list in sorted order.
100 | 	for i, arg := range args {
101 | 		a, err := filepath.Abs(arg)
102 | 		if err != nil {
103 | 			log.Printf("%s: %s", arg, err)
104 | 			args[i] = ""
105 | 			continue
106 | 		}
107 | 		args[i] = a
108 | 	}
109 | 	sort.Strings(args)
110 | 	for len(args) > 0 && args[0] == "" { // paths that failed above sort first; drop them
111 | 		args = args[1:]
112 | 	}
113 | 
114 | 	master := index.File()
115 | 	if _, err := os.Stat(master); err != nil {
116 | 		// Does not exist.
117 | 		*resetFlag = true
118 | 	}
119 | 	file := master
120 | 	if !*resetFlag {
121 | 		file += "~"
122 | 	}
123 | 
124 | 	ix := index.Create(file)
125 | 	ix.Verbose = *verboseFlag
126 | 	ix.AddPaths(args)
127 | 	for _, arg := range args {
128 | 		log.Printf("index %s", arg)
129 | 		filepath.Walk(arg, func(path string, info os.FileInfo, err error) error {
130 | 			if _, elem := filepath.Split(path); elem != "" {
131 | 				// Skip various temporary or "hidden" files or directories.
132 | 				if elem[0] == '.' || elem[0] == '#' || elem[0] == '~' || elem[len(elem)-1] == '~' {
133 | 					if info != nil && info.IsDir() { // info can be nil when err is non-nil
134 | 						return filepath.SkipDir
135 | 					}
136 | 					return nil
137 | 				}
138 | 			}
139 | 			if err != nil {
140 | 				log.Printf("%s: %s", path, err)
141 | 				return nil
142 | 			}
143 | 			if info != nil && info.Mode()&os.ModeType == 0 {
144 | 				ix.AddFile(path)
145 | 			}
146 | 			return nil
147 | 		})
148 | 	}
149 | 	log.Printf("flush index")
150 | 	ix.Flush()
151 | 	if !*resetFlag {
152 | 		log.Printf("merge %s %s", master, file)
153 | 		index.Merge(file+"~", master, file)
154 | 		os.Remove(file)
155 | 		if err := os.Rename(file+"~", master); err != nil { // otherwise the update is silently lost
156 | 			log.Fatal(err)
157 | 		}
158 | 	}
159 | 	log.Printf("done")
160 | }
161 | 


--------------------------------------------------------------------------------
/cmd/csearch/csearch.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package main
  6 | 
  7 | import (
  8 | 	"flag"
  9 | 	"fmt"
 10 | 	"log"
 11 | 	"os"
 12 | 	"runtime/pprof"
 13 | 
 14 | 	"github.com/google/codesearch/index"
 15 | 	"github.com/google/codesearch/regexp"
 16 | )
 17 | 
 18 | var usageMessage = `usage: csearch [-c] [-f fileregexp] [-h] [-i] [-l] [-n] regexp
 19 | 
 20 | Csearch behaves like grep over all indexed files, searching for regexp,
 21 | an RE2 (nearly PCRE) regular expression.
 22 | 
 23 | The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's
 24 | flag parsing convention, they cannot be combined: the option pair -i -n 
 25 | cannot be abbreviated to -in.
 26 | 
 27 | The -f flag restricts the search to files whose names match the RE2 regular
 28 | expression fileregexp.
 29 | 
 30 | Csearch relies on the existence of an up-to-date index created ahead of time.
 31 | To build or rebuild the index that csearch uses, run:
 32 | 
 33 | 	cindex path...
 34 | 
 35 | where path... is a list of directories or individual files to be included in the index.
 36 | If no index exists, this command creates one.  If an index already exists, cindex
 37 | overwrites it.  Run cindex -help for more.
 38 | 
 39 | Csearch uses the index stored in $CSEARCHINDEX or, if that variable is unset or
 40 | empty, $HOME/.csearchindex.
 41 | `
 42 | 
 43 | func usage() { // prints the csearch help text to standard error and exits with status 2
 44 | 	fmt.Fprint(os.Stderr, usageMessage) // Fprint, not Fprintf: the text is not a format string
 45 | 	os.Exit(2)
 46 | }
 47 | 
 48 | var (
 49 | 	fFlag       = flag.String("f", "", "search only files with names matching this regexp")
 50 | 	iFlag       = flag.Bool("i", false, "case-insensitive search")
 51 | 	verboseFlag = flag.Bool("verbose", false, "print extra information")
 52 | 	bruteFlag   = flag.Bool("brute", false, "brute force - search all files in index")
 53 | 	cpuProfile  = flag.String("cpuprofile", "", "write cpu profile to this file")
 54 | 
 55 | 	matches bool // set by Main from g.Match; main exits with status 1 when it is false
 56 | )
 57 | 
 58 | func Main() {
 59 | 	// Main runs one search: it asks the index for candidate files,
 60 | 	// greps each candidate, and records in matches whether any
 61 | 	// of them matched.
 62 | 	var g regexp.Grep
 63 | 	g.Stdout = os.Stdout
 64 | 	g.Stderr = os.Stderr
 65 | 	g.AddFlags()
 66 | 
 67 | 	flag.Usage = usage
 68 | 	flag.Parse()
 69 | 	args := flag.Args()
 70 | 	if len(args) != 1 {
 71 | 		usage()
 72 | 	}
 73 | 
 74 | 	if *cpuProfile != "" {
 75 | 		f, err := os.Create(*cpuProfile)
 76 | 		if err != nil {
 77 | 			log.Fatal(err)
 78 | 		}
 79 | 		defer f.Close()
 80 | 		pprof.StartCPUProfile(f)
 81 | 		defer pprof.StopCPUProfile()
 82 | 	}
 83 | 
 84 | 	// The pattern is always compiled multi-line; -i also makes it case-insensitive.
 85 | 	flags := "(?m)"
 86 | 	if *iFlag {
 87 | 		flags = "(?i)(?m)"
 88 | 	}
 89 | 	re, err := regexp.Compile(flags + args[0])
 90 | 	if err != nil {
 91 | 		log.Fatal(err)
 92 | 	}
 93 | 	g.Regexp = re
 94 | 	var fre *regexp.Regexp
 95 | 	if *fFlag != "" {
 96 | 		fre, err = regexp.Compile(*fFlag)
 97 | 		if err != nil {
 98 | 			log.Fatal(err)
 99 | 		}
100 | 	}
101 | 	q := index.RegexpQuery(re.Syntax)
102 | 	if *verboseFlag {
103 | 		log.Printf("query: %s\n", q)
104 | 	}
105 | 
106 | 	ix := index.Open(index.File())
107 | 	ix.Verbose = *verboseFlag
108 | 	var post []uint32
109 | 	if !*bruteFlag {
110 | 		post = ix.PostingQuery(q)
111 | 	} else {
112 | 		post = ix.PostingQuery(&index.Query{Op: index.QAll})
113 | 	}
114 | 	if *verboseFlag {
115 | 		log.Printf("post query identified %d possible files\n", len(post))
116 | 	}
117 | 
118 | 	if fre != nil {
119 | 		// Keep only the candidates whose names match the -f regexp.
120 | 		kept := make([]uint32, 0, len(post))
121 | 
122 | 		for _, id := range post {
123 | 			if fre.MatchString(ix.Name(id), true, true) >= 0 {
124 | 				kept = append(kept, id)
125 | 			}
126 | 		}
127 | 
128 | 		if *verboseFlag {
129 | 			log.Printf("filename regexp matched %d files\n", len(kept))
130 | 		}
131 | 		post = kept
132 | 	}
133 | 
134 | 	for _, id := range post {
135 | 		g.File(ix.Name(id))
136 | 	}
137 | 
138 | 	matches = g.Match
139 | }
140 | 
141 | func main() {
142 | 	Main() // records the outcome in matches
143 | 	if matches {
144 | 		os.Exit(0)
145 | 	}
146 | 	os.Exit(1) // nothing matched
147 | }
148 | 


--------------------------------------------------------------------------------
/index/merge.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | // Merging indexes.
  8 | //
  9 | // To merge two indexes A and B (newer) into a combined index C:
 10 | //
 11 | // Load the path list from B and determine for each path the docid ranges
 12 | // that it will replace in A.
 13 | //
 14 | // Read A's and B's name lists together, merging them into C's name list.
 15 | // Discard the identified ranges from A during the merge.  Also during the merge,
 16 | // record the mapping from A's docids to C's docids, and also the mapping from
 17 | // B's docids to C's docids.  Both mappings can be summarized in a table like
 18 | //
 19 | //	10-14 maps to 20-24
 20 | //	15-24 is deleted
 21 | //	25-34 maps to 40-49
 22 | //
 23 | // The number of ranges will be at most the combined number of paths.
 24 | // Also during the merge, write the name index to a temporary file as usual.
 25 | //
 26 | // Now merge the posting lists (this is why they begin with the trigram).
 27 | // During the merge, translate the docid numbers to the new C docid space.
 28 | // Also during the merge, write the posting list index to a temporary file as usual.
 29 | // 
 30 | // Copy the name index and posting list index into C's index and write the trailer.
 31 | // Rename C's index onto the new index.
 32 | 
 33 | import (
 34 | 	"encoding/binary"
 35 | 	"os"
 36 | 	"strings"
 37 | )
 38 | 
 39 | // An idrange records that the half-open docid interval [lo, hi) of a source index maps to [new, new+hi-lo) in the merged index.
 40 | type idrange struct {
 41 | 	lo, hi, new uint32
 42 | }
 43 | 
 44 | type postIndex struct { // NOTE(review): not referenced by the code visible here; Merge shadows the name with a local
 45 | 	tri    uint32 // trigram
 46 | 	count  uint32 // presumably the number of file IDs in the posting list; confirm against the reader
 47 | 	offset uint32 // presumably the posting list's offset within the posting data; confirm
 48 | }
 49 | 
 50 | // Merge creates a new index in the file dst that corresponds to merging
 51 | // the two indices src1 and src2.  If both src1 and src2 claim responsibility
 52 | // for a path, src2 is assumed to be newer and is given preference.
 53 | func Merge(dst, src1, src2 string) {
 54 | 	ix1 := Open(src1)
 55 | 	ix2 := Open(src2)
 56 | 	paths1 := ix1.Paths()
 57 | 	paths2 := ix2.Paths()
 58 | 
 59 | 	// Build docid maps.
 60 | 	var i1, i2, new uint32
 61 | 	var map1, map2 []idrange
 62 | 	for _, path := range paths2 {
 63 | 		// Determine range shadowed by this path: the names in [path, limit).
 64 | 		old := i1
 65 | 		for i1 < uint32(ix1.numName) && ix1.Name(i1) < path {
 66 | 			i1++
 67 | 		}
 68 | 		lo := i1
 69 | 		// limit is path with its last byte incremented.  Build it from a byte
 70 | 		// slice: string(b) of a single byte b yields the UTF-8 encoding of rune b.
 71 | 		limit := path[:len(path)-1] + string([]byte{path[len(path)-1] + 1})
 72 | 		for i1 < uint32(ix1.numName) && ix1.Name(i1) < limit {
 73 | 			i1++
 74 | 		}
 75 | 		// Record range before the shadow.
 76 | 		if old < lo {
 77 | 			map1 = append(map1, idrange{old, lo, new})
 78 | 			new += lo - old
 79 | 		}
 80 | 
 81 | 		// Determine range defined by this path.
 82 | 		// Because we are iterating over the ix2 paths,
 83 | 		// there can't be gaps, so it must start at i2.
 84 | 		if i2 < uint32(ix2.numName) && ix2.Name(i2) < path {
 85 | 			panic("merge: inconsistent index")
 86 | 		}
 87 | 		lo = i2
 88 | 		for i2 < uint32(ix2.numName) && ix2.Name(i2) < limit {
 89 | 			i2++
 90 | 		}
 91 | 		hi := i2
 92 | 		if lo < hi {
 93 | 			map2 = append(map2, idrange{lo, hi, new})
 94 | 			new += hi - lo
 95 | 		}
 96 | 	}
 97 | 
 98 | 	if i1 < uint32(ix1.numName) {
 99 | 		map1 = append(map1, idrange{i1, uint32(ix1.numName), new})
100 | 		new += uint32(ix1.numName) - i1
101 | 	}
102 | 	if i2 < uint32(ix2.numName) {
103 | 		panic("merge: inconsistent index")
104 | 	}
105 | 	numName := new
106 | 
107 | 	ix3 := bufCreate(dst)
108 | 	ix3.writeString(magic)
109 | 
110 | 	// Merged list of paths.
111 | 	pathData := ix3.offset()
112 | 	mi1 := 0
113 | 	mi2 := 0
114 | 	last := "\x00" // not a prefix of anything
115 | 	for mi1 < len(paths1) || mi2 < len(paths2) {
116 | 		var p string
117 | 		if mi2 >= len(paths2) || mi1 < len(paths1) && paths1[mi1] <= paths2[mi2] {
118 | 			p = paths1[mi1]
119 | 			mi1++
120 | 		} else {
121 | 			p = paths2[mi2]
122 | 			mi2++
123 | 		}
124 | 		if strings.HasPrefix(p, last) {
125 | 			continue
126 | 		}
127 | 		last = p
128 | 		ix3.writeString(p)
129 | 		ix3.writeString("\x00")
130 | 	}
131 | 	ix3.writeString("\x00")
132 | 
133 | 	// Merged list of names.
134 | 	nameData := ix3.offset()
135 | 	nameIndexFile := bufCreate("")
136 | 	new = 0
137 | 	mi1 = 0
138 | 	mi2 = 0
139 | 	for new < numName {
140 | 		if mi1 < len(map1) && map1[mi1].new == new {
141 | 			for i := map1[mi1].lo; i < map1[mi1].hi; i++ {
142 | 				name := ix1.Name(i)
143 | 				nameIndexFile.writeUint32(ix3.offset() - nameData)
144 | 				ix3.writeString(name)
145 | 				ix3.writeString("\x00")
146 | 				new++
147 | 			}
148 | 			mi1++
149 | 		} else if mi2 < len(map2) && map2[mi2].new == new {
150 | 			for i := map2[mi2].lo; i < map2[mi2].hi; i++ {
151 | 				name := ix2.Name(i)
152 | 				nameIndexFile.writeUint32(ix3.offset() - nameData)
153 | 				ix3.writeString(name)
154 | 				ix3.writeString("\x00")
155 | 				new++
156 | 			}
157 | 			mi2++
158 | 		} else {
159 | 			panic("merge: inconsistent index")
160 | 		}
161 | 	}
162 | 	if new*4 != nameIndexFile.offset() {
163 | 		panic("merge: inconsistent index")
164 | 	}
165 | 	nameIndexFile.writeUint32(ix3.offset()) // NOTE(review): not relative to nameData, unlike the entries above; confirm against the reader
166 | 
167 | 	// Merged list of posting lists.
168 | 	postData := ix3.offset()
169 | 	var r1 postMapReader
170 | 	var r2 postMapReader
171 | 	var w postDataWriter
172 | 	r1.init(ix1, map1)
173 | 	r2.init(ix2, map2)
174 | 	w.init(ix3)
175 | 	for {
176 | 		if r1.trigram < r2.trigram {
177 | 			w.trigram(r1.trigram)
178 | 			for r1.nextId() {
179 | 				w.fileid(r1.fileid)
180 | 			}
181 | 			r1.nextTrigram()
182 | 			w.endTrigram()
183 | 		} else if r2.trigram < r1.trigram {
184 | 			w.trigram(r2.trigram)
185 | 			for r2.nextId() {
186 | 				w.fileid(r2.fileid)
187 | 			}
188 | 			r2.nextTrigram()
189 | 			w.endTrigram()
190 | 		} else {
191 | 			if r1.trigram == ^uint32(0) {
192 | 				break
193 | 			}
194 | 			w.trigram(r1.trigram)
195 | 			r1.nextId()
196 | 			r2.nextId()
197 | 			for r1.fileid < ^uint32(0) || r2.fileid < ^uint32(0) {
198 | 				if r1.fileid < r2.fileid {
199 | 					w.fileid(r1.fileid)
200 | 					r1.nextId()
201 | 				} else if r2.fileid < r1.fileid {
202 | 					w.fileid(r2.fileid)
203 | 					r2.nextId()
204 | 				} else {
205 | 					panic("merge: inconsistent index")
206 | 				}
207 | 			}
208 | 			r1.nextTrigram()
209 | 			r2.nextTrigram()
210 | 			w.endTrigram()
211 | 		}
212 | 	}
213 | 
214 | 	// Name index
215 | 	nameIndex := ix3.offset()
216 | 	copyFile(ix3, nameIndexFile)
217 | 
218 | 	// Posting list index
219 | 	postIndex := ix3.offset()
220 | 	copyFile(ix3, w.postIndexFile)
221 | 
222 | 	ix3.writeUint32(pathData)
223 | 	ix3.writeUint32(nameData)
224 | 	ix3.writeUint32(postData)
225 | 	ix3.writeUint32(nameIndex)
226 | 	ix3.writeUint32(postIndex)
227 | 	ix3.writeString(trailerMagic)
228 | 	ix3.flush()
229 | 
230 | 	os.Remove(nameIndexFile.name)
231 | 	os.Remove(w.postIndexFile.name)
232 | }
233 | 
234 | type postMapReader struct {
235 | 	ix      *Index    // index being read
236 | 	idmap   []idrange // ranges translating this index's file ids to merged ids
237 | 	triNum  uint32    // number of the current posting list index entry
238 | 	trigram uint32    // current trigram; ^uint32(0) once triNum is past the last entry
239 | 	count   uint32    // ids of the current list not yet decoded
240 | 	offset  uint32    // offset of the current list, as returned by listAt
241 | 	d       []byte    // undecoded remainder of the current delta list
242 | 	oldid   uint32    // last decoded file id, in this index's numbering
243 | 	fileid  uint32    // current file id in merged numbering; ^uint32(0) when the list is exhausted
244 | 	i       int       // position in idmap of the range being considered
245 | }
246 | 
247 | func (r *postMapReader) init(ix *Index, idmap []idrange) {
248 | 	// Start at posting list 0 of ix; load overwrites the sentinel
249 | 	// trigram whenever that list exists.
250 | 	r.ix, r.idmap, r.trigram = ix, idmap, ^uint32(0)
251 | 	r.load()
252 | }
253 | 
254 | func (r *postMapReader) nextTrigram() { // moves the reader to the next posting list index entry
255 | 	r.triNum++
256 | 	r.load()
257 | }
258 | 
259 | func (r *postMapReader) load() {
260 | 	if r.triNum >= uint32(r.ix.numPost) {
261 | 		r.trigram = ^uint32(0)
262 | 		r.count = 0
263 | 		r.fileid = ^uint32(0)
264 | 		return
265 | 	}
266 | 	r.trigram, r.count, r.offset = r.ix.listAt(r.triNum * postEntrySize)
267 | 	if r.count == 0 {
268 | 		r.fileid = ^uint32(0)
269 | 		return
270 | 	}
271 | 	r.d = r.ix.slice(r.ix.postData+r.offset+3, -1)
272 | 	r.oldid = ^uint32(0)
273 | 	r.i = 0
274 | }
275 | 
276 | func (r *postMapReader) nextId() bool { // sets r.fileid to the next id that idmap keeps; false when the list is exhausted
277 | 	for r.count > 0 {
278 | 		r.count--
279 | 		delta64, n := binary.Uvarint(r.d)
280 | 		delta := uint32(delta64)
281 | 		if n <= 0 || delta == 0 { // undecodable varint, or a zero delta while ids are still expected
282 | 			corrupt()
283 | 		}
284 | 		r.d = r.d[n:]
285 | 		r.oldid += delta // load starts oldid at ^uint32(0), so the first sum wraps to delta-1
286 | 		for r.i < len(r.idmap) && r.idmap[r.i].hi <= r.oldid {
287 | 			r.i++
288 | 		}
289 | 		if r.i >= len(r.idmap) { // no range left: the rest of this list is dropped
290 | 			r.count = 0
291 | 			break
292 | 		}
293 | 		if r.oldid < r.idmap[r.i].lo { // oldid lies in a gap between ranges: dropped
294 | 			continue
295 | 		}
296 | 		r.fileid = r.idmap[r.i].new + r.oldid - r.idmap[r.i].lo
297 | 		return true
298 | 	}
299 | 
300 | 	r.fileid = ^uint32(0)
301 | 	return false
302 | }
303 | 
304 | type postDataWriter struct {
305 | 	out           *bufWriter // destination for the posting list data
306 | 	postIndexFile *bufWriter // temporary buffer collecting the posting list index entries
307 | 	buf           [10]byte   // not used by the methods visible here
308 | 	base          uint32     // offset of out when init was called
309 | 	count, offset uint32     // ids written for the current trigram; offset of out where its list starts
310 | 	last          uint32     // last id written; ^uint32(0) before the first
311 | 	t             uint32     // current trigram
312 | }
313 | 
314 | func (w *postDataWriter) init(out *bufWriter) { // prepares w to append posting lists to out
315 | 	w.out = out
316 | 	w.postIndexFile = bufCreate("")
317 | 	w.base = out.offset() // index entries record list offsets relative to this point
318 | }
319 | 
320 | func (w *postDataWriter) trigram(t uint32) {
321 | 	// Begin the list for t.  Nothing is written until the first
322 | 	// fileid call, so a trigram with no ids produces no output.
323 | 	w.t, w.count, w.last = t, 0, ^uint32(0)
324 | 	w.offset = w.out.offset()
325 | }
326 | 
327 | func (w *postDataWriter) fileid(id uint32) { // appends id to the current trigram's list as a delta
328 | 	if w.count == 0 {
329 | 		w.out.writeTrigram(w.t) // the trigram header is written lazily, with the first id
330 | 	}
331 | 	w.out.writeUvarint(id - w.last)
332 | 	w.last = id
333 | 	w.count++
334 | }
335 | 
336 | func (w *postDataWriter) endTrigram() {
337 | 	// An empty list gets neither a terminator nor an index entry.
338 | 	if w.count > 0 {
339 | 		w.out.writeUvarint(0) // a zero delta ends the list
340 | 		w.postIndexFile.writeTrigram(w.t)
341 | 		w.postIndexFile.writeUint32(w.count)
342 | 		w.postIndexFile.writeUint32(w.offset - w.base)
343 | 	}
344 | }
345 | 


--------------------------------------------------------------------------------
/index/merge_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | import (
  8 | 	"io/ioutil"
  9 | 	"os"
 10 | 	"testing"
 11 | )
 12 | 
 13 | var mergePaths1 = []string{
 14 | 	"/a",
 15 | 	"/b",
 16 | 	"/c",
 17 | }
 18 | 
 19 | var mergePaths2 = []string{
 20 | 	"/b",
 21 | 	"/cc",
 22 | }
 23 | 
 24 | var mergeFiles1 = map[string]string{
 25 | 	"/a/x":  "hello world",
 26 | 	"/a/y":  "goodbye world",
 27 | 	"/b/xx": "now is the time",
 28 | 	"/b/xy": "for all good men",
 29 | 	"/c/ab": "give me all the potatoes",
 30 | 	"/c/de": "or give me death now",
 31 | }
 32 | 
 33 | var mergeFiles2 = map[string]string{
 34 | 	"/b/www": "world wide indeed",
 35 | 	"/b/xx":  "no, not now",
 36 | 	"/b/yy":  "first potatoes, now liberty?",
 37 | 	"/cc":    "come to the aid of his potatoes",
 38 | }
 39 | 
 40 | func TestMerge(t *testing.T) {
 41 | 	f1, _ := ioutil.TempFile("", "index-test")
 42 | 	f2, _ := ioutil.TempFile("", "index-test")
 43 | 	f3, _ := ioutil.TempFile("", "index-test")
 44 | 	defer os.Remove(f1.Name())
 45 | 	defer os.Remove(f2.Name())
 46 | 	defer os.Remove(f3.Name())
 47 | 
 48 | 	out1 := f1.Name()
 49 | 	out2 := f2.Name()
 50 | 	out3 := f3.Name()
 51 | 
 52 | 	buildIndex(out1, mergePaths1, mergeFiles1)
 53 | 	buildIndex(out2, mergePaths2, mergeFiles2)
 54 | 
 55 | 	Merge(out3, out1, out2)
 56 | 
 57 | 	ix1 := Open(out1)
 58 | 	ix2 := Open(out2)
 59 | 	ix3 := Open(out3)
 60 | 
 61 | 	nameof := func(ix *Index) string {
 62 | 		switch {
 63 | 		case ix == ix1:
 64 | 			return "ix1"
 65 | 		case ix == ix2:
 66 | 			return "ix2"
 67 | 		case ix == ix3:
 68 | 			return "ix3"
 69 | 		}
 70 | 		return "???"
 71 | 	}
 72 | 
 73 | 	checkFiles := func(ix *Index, l ...string) {
 74 | 		for i, s := range l {
 75 | 			if n := ix.Name(uint32(i)); n != s {
 76 | 				t.Errorf("%s: Name(%d) = %s, want %s", nameof(ix), i, n, s)
 77 | 			}
 78 | 		}
 79 | 	}
 80 | 
 81 | 	checkFiles(ix1, "/a/x", "/a/y", "/b/xx", "/b/xy", "/c/ab", "/c/de")
 82 | 	checkFiles(ix2, "/b/www", "/b/xx", "/b/yy", "/cc")
 83 | 	checkFiles(ix3, "/a/x", "/a/y", "/b/www", "/b/xx", "/b/yy", "/c/ab", "/c/de", "/cc")
 84 | 
 85 | 	check := func(ix *Index, trig string, l ...uint32) {
 86 | 		l1 := ix.PostingList(tri(trig[0], trig[1], trig[2]))
 87 | 		if !equalList(l1, l) {
 88 | 			t.Errorf("PostingList(%s, %s) = %v, want %v", nameof(ix), trig, l1, l)
 89 | 		}
 90 | 	}
 91 | 
 92 | 	check(ix1, "wor", 0, 1)
 93 | 	check(ix1, "now", 2, 5)
 94 | 	check(ix1, "all", 3, 4)
 95 | 
 96 | 	check(ix2, "now", 1, 2)
 97 | 
 98 | 	check(ix3, "all", 5)
 99 | 	check(ix3, "wor", 0, 1, 2)
100 | 	check(ix3, "now", 3, 4, 6)
101 | 	check(ix3, "pot", 4, 5, 7)
102 | }
103 | 


--------------------------------------------------------------------------------
/index/mmap_bsd.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // +build darwin freebsd openbsd netbsd
 6 | 
 7 | package index
 8 | 
 9 | import (
10 | 	"log"
11 | 	"os"
12 | 	"syscall"
13 | )
14 | 
15 | // missing from package syscall on freebsd, openbsd
16 | const (
17 | 	_PROT_READ  = 1
18 | 	_MAP_SHARED = 1
19 | )
20 | 
21 | func mmapFile(f *os.File) mmapData {
22 | 	fi, err := f.Stat()
23 | 	if err != nil {
24 | 		log.Fatal(err)
25 | 	}
26 | 	// Reject sizes whose page-rounded length does not fit in an int.
27 | 	size := fi.Size()
28 | 	if int64(int(size+4095)) != size+4095 {
29 | 		log.Fatalf("%s: too large for mmap", f.Name())
30 | 	}
31 | 	if size == 0 {
32 | 		return mmapData{f, nil} // nothing to map
33 | 	}
34 | 	b, err := syscall.Mmap(int(f.Fd()), 0, (int(size)+4095)&^4095, _PROT_READ, _MAP_SHARED)
35 | 	if err != nil {
36 | 		log.Fatalf("mmap %s: %v", f.Name(), err)
37 | 	}
38 | 	return mmapData{f, b[:size]}
39 | }
40 | 


--------------------------------------------------------------------------------
/index/mmap_linux.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package index
 6 | 
 7 | import (
 8 | 	"log"
 9 | 	"os"
10 | 	"syscall"
11 | )
12 | 
13 | func mmapFile(f *os.File) mmapData {
14 | 	// mmapFile maps all of f into memory read-only; any failure is fatal.
15 | 	info, err := f.Stat()
16 | 	if err != nil {
17 | 		log.Fatal(err)
18 | 	}
19 | 	size := info.Size()
20 | 	if rounded := size + 4095; int64(int(rounded)) != rounded {
21 | 		log.Fatalf("%s: too large for mmap", f.Name())
22 | 	}
23 | 	if size == 0 {
24 | 		return mmapData{f, nil}
25 | 	}
26 | 	mapped, err := syscall.Mmap(int(f.Fd()), 0, (int(size)+4095)&^4095, syscall.PROT_READ, syscall.MAP_SHARED)
27 | 	if err != nil {
28 | 		log.Fatalf("mmap %s: %v", f.Name(), err)
29 | 	}
30 | 	return mmapData{f, mapped[:size]}
31 | }
32 | 


--------------------------------------------------------------------------------
/index/mmap_windows.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package index
 6 | 
 7 | import (
 8 | 	"log"
 9 | 	"os"
10 | 	"syscall"
11 | 	"unsafe"
12 | )
13 | 
14 | func mmapFile(f *os.File) mmapData { // maps all of f into memory read-only; any failure is fatal
15 | 	st, err := f.Stat()
16 | 	if err != nil {
17 | 		log.Fatal(err)
18 | 	}
19 | 	size := st.Size()
20 | 	if int64(int(size+4095)) != size+4095 { // size plus padding must fit in an int
21 | 		log.Fatalf("%s: too large for mmap", f.Name())
22 | 	}
23 | 	if size == 0 {
24 | 		return mmapData{f, nil}
25 | 	}
26 | 	h, err := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, uint32(size>>32), uint32(size), nil)
27 | 	if err != nil {
28 | 		log.Fatalf("CreateFileMapping %s: %v", f.Name(), err)
29 | 	}
30 | 
31 | 	addr, err := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, 0)
32 | 	if err != nil {
33 | 		log.Fatalf("MapViewOfFile %s: %v", f.Name(), err)
34 | 	}
35 | 	data := (*[1 << 30]byte)(unsafe.Pointer(addr)) // NOTE(review): the slice below panics if size exceeds 1<<30
36 | 	return mmapData{f, data[:size]}
37 | }
38 | 


--------------------------------------------------------------------------------
/index/read.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | // Index format.
  8 | //
  9 | // An index stored on disk has the format:
 10 | //
 11 | //	"csearch index 1\n"
 12 | //	list of paths
 13 | //	list of names
 14 | //	list of posting lists
 15 | //	name index
 16 | //	posting list index
 17 | //	trailer
 18 | //
 19 | // The list of paths is a sorted sequence of NUL-terminated file or directory names.
 20 | // The index covers the file trees rooted at those paths.
 21 | // The list ends with an empty name ("\x00").
 22 | //
 23 | // The list of names is a sorted sequence of NUL-terminated file names.
 24 | // The initial entry in the list corresponds to file #0,
 25 | // the next to file #1, and so on.  The list ends with an
 26 | // empty name ("\x00").
 27 | //
 28 | // The list of posting lists is a sequence of posting lists.
 29 | // Each posting list has the form:
 30 | //
 31 | //	trigram [3]
 32 | //	deltas [v]...
 33 | //
 34 | // The trigram gives the 3 byte trigram that this list describes.  The
 35 | // delta list is a sequence of varint-encoded deltas between file
 36 | // IDs, ending with a zero delta.  For example, the delta list [2,5,1,1,0]
 37 | // encodes the file ID list 1, 6, 7, 8.  The delta list [0] would
 38 | // encode the empty file ID list, but empty posting lists are usually
 39 | // not recorded at all.  The list of posting lists ends with an entry
 40 | // with trigram "\xff\xff\xff" and a delta list consisting of a single zero.
 41 | //
 42 | // The indexes enable efficient random access to the lists.  The name
 43 | // index is a sequence of 4-byte big-endian values listing the byte
 44 | // offset in the name list where each name begins.  The posting list
 45 | // index is a sequence of index entries describing each successive
 46 | // posting list.  Each index entry has the form:
 47 | //
 48 | //	trigram [3]
 49 | //	file count [4]
 50 | //	offset [4]
 51 | //
 52 | // Index entries are only written for the non-empty posting lists,
 53 | // so finding the posting list for a specific trigram requires a
 54 | // binary search over the posting list index.  In practice, the majority
 55 | // of the possible trigrams are never seen, so omitting the missing
 56 | // ones represents a significant storage savings.
 57 | //
 58 | // The trailer has the form:
 59 | //
 60 | //	offset of path list [4]
 61 | //	offset of name list [4]
 62 | //	offset of posting lists [4]
 63 | //	offset of name index [4]
 64 | //	offset of posting list index [4]
 65 | //	"\ncsearch trailr\n"
 66 | 
 67 | import (
 68 | 	"bytes"
 69 | 	"encoding/binary"
 70 | 	"log"
 71 | 	"os"
 72 | 	"path/filepath"
 73 | 	"runtime"
 74 | 	"sort"
 75 | )
 76 | 
// Marker strings from the index format described above: magic is the
// header line that begins an index file, and trailerMagic is the string
// that ends it (Open checks for trailerMagic at the end of the data).
const (
	magic        = "csearch index 1\n"
	trailerMagic = "\ncsearch trailr\n"
)
 81 | 
// An Index implements read-only access to a trigram index.
// The offset fields are read from the file trailer by Open; the counts
// are derived from those offsets.
type Index struct {
	Verbose   bool     // NOTE(review): not consulted by the reader code shown here; presumably enables extra logging elsewhere
	data      mmapData // memory-mapped contents of the index file
	pathData  uint32   // offset of the path list
	nameData  uint32   // offset of the name list
	postData  uint32   // offset of the posting lists
	nameIndex uint32   // offset of the name index
	postIndex uint32   // offset of the posting list index
	numName   int      // number of names, computed from the size of the name index
	numPost   int      // number of entries in the posting list index
}
 94 | 
// postEntrySize is the size in bytes of one posting list index entry:
// a 3-byte trigram, a 4-byte file count and a 4-byte offset.
const postEntrySize = 3 + 4 + 4
 96 | 
 97 | func Open(file string) *Index {
 98 | 	mm := mmap(file)
 99 | 	if len(mm.d) < 4*4+len(trailerMagic) || string(mm.d[len(mm.d)-len(trailerMagic):]) != trailerMagic {
100 | 		corrupt()
101 | 	}
102 | 	n := uint32(len(mm.d) - len(trailerMagic) - 5*4)
103 | 	ix := &Index{data: mm}
104 | 	ix.pathData = ix.uint32(n)
105 | 	ix.nameData = ix.uint32(n + 4)
106 | 	ix.postData = ix.uint32(n + 8)
107 | 	ix.nameIndex = ix.uint32(n + 12)
108 | 	ix.postIndex = ix.uint32(n + 16)
109 | 	ix.numName = int((ix.postIndex-ix.nameIndex)/4) - 1
110 | 	ix.numPost = int((n - ix.postIndex) / postEntrySize)
111 | 	return ix
112 | }
113 | 
114 | // slice returns the slice of index data starting at the given byte offset.
115 | // If n >= 0, the slice must have length at least n and is truncated to length n.
116 | func (ix *Index) slice(off uint32, n int) []byte {
117 | 	o := int(off)
118 | 	if uint32(o) != off || n >= 0 && o+n > len(ix.data.d) {
119 | 		corrupt()
120 | 	}
121 | 	if n < 0 {
122 | 		return ix.data.d[o:]
123 | 	}
124 | 	return ix.data.d[o : o+n]
125 | }
126 | 
127 | // uint32 returns the uint32 value at the given offset in the index data.
128 | func (ix *Index) uint32(off uint32) uint32 {
129 | 	return binary.BigEndian.Uint32(ix.slice(off, 4))
130 | }
131 | 
132 | // uvarint returns the varint value at the given offset in the index data.
133 | func (ix *Index) uvarint(off uint32) uint32 {
134 | 	v, n := binary.Uvarint(ix.slice(off, -1))
135 | 	if n <= 0 {
136 | 		corrupt()
137 | 	}
138 | 	return uint32(v)
139 | }
140 | 
141 | // Paths returns the list of indexed paths.
142 | func (ix *Index) Paths() []string {
143 | 	off := ix.pathData
144 | 	var x []string
145 | 	for {
146 | 		s := ix.str(off)
147 | 		if len(s) == 0 {
148 | 			break
149 | 		}
150 | 		x = append(x, string(s))
151 | 		off += uint32(len(s) + 1)
152 | 	}
153 | 	return x
154 | }
155 | 
156 | // NameBytes returns the name corresponding to the given fileid.
157 | func (ix *Index) NameBytes(fileid uint32) []byte {
158 | 	off := ix.uint32(ix.nameIndex + 4*fileid)
159 | 	return ix.str(ix.nameData + off)
160 | }
161 | 
162 | func (ix *Index) str(off uint32) []byte {
163 | 	str := ix.slice(off, -1)
164 | 	i := bytes.IndexByte(str, '\x00')
165 | 	if i < 0 {
166 | 		corrupt()
167 | 	}
168 | 	return str[:i]
169 | }
170 | 
171 | // Name returns the name corresponding to the given fileid.
172 | func (ix *Index) Name(fileid uint32) string {
173 | 	return string(ix.NameBytes(fileid))
174 | }
175 | 
176 | // listAt returns the index list entry at the given offset.
177 | func (ix *Index) listAt(off uint32) (trigram, count, offset uint32) {
178 | 	d := ix.slice(ix.postIndex+off, postEntrySize)
179 | 	trigram = uint32(d[0])<<16 | uint32(d[1])<<8 | uint32(d[2])
180 | 	count = binary.BigEndian.Uint32(d[3:])
181 | 	offset = binary.BigEndian.Uint32(d[3+4:])
182 | 	return
183 | }
184 | 
185 | func (ix *Index) dumpPosting() {
186 | 	d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
187 | 	for i := 0; i < ix.numPost; i++ {
188 | 		j := i * postEntrySize
189 | 		t := uint32(d[j])<<16 | uint32(d[j+1])<<8 | uint32(d[j+2])
190 | 		count := int(binary.BigEndian.Uint32(d[j+3:]))
191 | 		offset := binary.BigEndian.Uint32(d[j+3+4:])
192 | 		log.Printf("%#x: %d at %d", t, count, offset)
193 | 	}
194 | }
195 | 
196 | func (ix *Index) findList(trigram uint32) (count int, offset uint32) {
197 | 	// binary search
198 | 	d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
199 | 	i := sort.Search(ix.numPost, func(i int) bool {
200 | 		i *= postEntrySize
201 | 		t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
202 | 		return t >= trigram
203 | 	})
204 | 	if i >= ix.numPost {
205 | 		return 0, 0
206 | 	}
207 | 	i *= postEntrySize
208 | 	t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
209 | 	if t != trigram {
210 | 		return 0, 0
211 | 	}
212 | 	count = int(binary.BigEndian.Uint32(d[i+3:]))
213 | 	offset = binary.BigEndian.Uint32(d[i+3+4:])
214 | 	return
215 | }
216 | 
// A postReader iterates over the file IDs of one posting list,
// optionally keeping only the IDs that also appear in restrict.
// It is prepared with init and advanced with next.
type postReader struct {
	ix       *Index   // index being read
	count    int      // entries of the list not yet decoded
	offset   uint32   // offset of the list as returned by findList
	fileid   uint32   // most recently decoded file ID; ^uint32(0) before the first and after the last
	d        []byte   // encoded deltas not yet consumed
	restrict []uint32 // if non-nil, the only file IDs next may report
}
225 | 
226 | func (r *postReader) init(ix *Index, trigram uint32, restrict []uint32) {
227 | 	count, offset := ix.findList(trigram)
228 | 	if count == 0 {
229 | 		return
230 | 	}
231 | 	r.ix = ix
232 | 	r.count = count
233 | 	r.offset = offset
234 | 	r.fileid = ^uint32(0)
235 | 	r.d = ix.slice(ix.postData+offset+3, -1)
236 | 	r.restrict = restrict
237 | }
238 | 
239 | func (r *postReader) max() int {
240 | 	return int(r.count)
241 | }
242 | 
243 | func (r *postReader) next() bool {
244 | 	for r.count > 0 {
245 | 		r.count--
246 | 		delta64, n := binary.Uvarint(r.d)
247 | 		delta := uint32(delta64)
248 | 		if n <= 0 || delta == 0 {
249 | 			corrupt()
250 | 		}
251 | 		r.d = r.d[n:]
252 | 		r.fileid += delta
253 | 		if r.restrict != nil {
254 | 			i := 0
255 | 			for i < len(r.restrict) && r.restrict[i] < r.fileid {
256 | 				i++
257 | 			}
258 | 			r.restrict = r.restrict[i:]
259 | 			if len(r.restrict) == 0 || r.restrict[0] != r.fileid {
260 | 				continue
261 | 			}
262 | 		}
263 | 		return true
264 | 	}
265 | 	// list should end with terminating 0 delta
266 | 	if r.d != nil && (len(r.d) == 0 || r.d[0] != 0) {
267 | 		corrupt()
268 | 	}
269 | 	r.fileid = ^uint32(0)
270 | 	return false
271 | }
272 | 
273 | func (ix *Index) PostingList(trigram uint32) []uint32 {
274 | 	return ix.postingList(trigram, nil)
275 | }
276 | 
277 | func (ix *Index) postingList(trigram uint32, restrict []uint32) []uint32 {
278 | 	var r postReader
279 | 	r.init(ix, trigram, restrict)
280 | 	x := make([]uint32, 0, r.max())
281 | 	for r.next() {
282 | 		x = append(x, r.fileid)
283 | 	}
284 | 	return x
285 | }
286 | 
287 | func (ix *Index) PostingAnd(list []uint32, trigram uint32) []uint32 {
288 | 	return ix.postingAnd(list, trigram, nil)
289 | }
290 | 
291 | func (ix *Index) postingAnd(list []uint32, trigram uint32, restrict []uint32) []uint32 {
292 | 	var r postReader
293 | 	r.init(ix, trigram, restrict)
294 | 	x := list[:0]
295 | 	i := 0
296 | 	for r.next() {
297 | 		fileid := r.fileid
298 | 		for i < len(list) && list[i] < fileid {
299 | 			i++
300 | 		}
301 | 		if i < len(list) && list[i] == fileid {
302 | 			x = append(x, fileid)
303 | 			i++
304 | 		}
305 | 	}
306 | 	return x
307 | }
308 | 
309 | func (ix *Index) PostingOr(list []uint32, trigram uint32) []uint32 {
310 | 	return ix.postingOr(list, trigram, nil)
311 | }
312 | 
313 | func (ix *Index) postingOr(list []uint32, trigram uint32, restrict []uint32) []uint32 {
314 | 	var r postReader
315 | 	r.init(ix, trigram, restrict)
316 | 	x := make([]uint32, 0, len(list)+r.max())
317 | 	i := 0
318 | 	for r.next() {
319 | 		fileid := r.fileid
320 | 		for i < len(list) && list[i] < fileid {
321 | 			x = append(x, list[i])
322 | 			i++
323 | 		}
324 | 		x = append(x, fileid)
325 | 		if i < len(list) && list[i] == fileid {
326 | 			i++
327 | 		}
328 | 	}
329 | 	x = append(x, list[i:]...)
330 | 	return x
331 | }
332 | 
333 | func (ix *Index) PostingQuery(q *Query) []uint32 {
334 | 	return ix.postingQuery(q, nil)
335 | }
336 | 
337 | func (ix *Index) postingQuery(q *Query, restrict []uint32) (ret []uint32) {
338 | 	var list []uint32
339 | 	switch q.Op {
340 | 	case QNone:
341 | 		// nothing
342 | 	case QAll:
343 | 		if restrict != nil {
344 | 			return restrict
345 | 		}
346 | 		list = make([]uint32, ix.numName)
347 | 		for i := range list {
348 | 			list[i] = uint32(i)
349 | 		}
350 | 		return list
351 | 	case QAnd:
352 | 		for _, t := range q.Trigram {
353 | 			tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
354 | 			if list == nil {
355 | 				list = ix.postingList(tri, restrict)
356 | 			} else {
357 | 				list = ix.postingAnd(list, tri, restrict)
358 | 			}
359 | 			if len(list) == 0 {
360 | 				return nil
361 | 			}
362 | 		}
363 | 		for _, sub := range q.Sub {
364 | 			if list == nil {
365 | 				list = restrict
366 | 			}
367 | 			list = ix.postingQuery(sub, list)
368 | 			if len(list) == 0 {
369 | 				return nil
370 | 			}
371 | 		}
372 | 	case QOr:
373 | 		for _, t := range q.Trigram {
374 | 			tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
375 | 			if list == nil {
376 | 				list = ix.postingList(tri, restrict)
377 | 			} else {
378 | 				list = ix.postingOr(list, tri, restrict)
379 | 			}
380 | 		}
381 | 		for _, sub := range q.Sub {
382 | 			list1 := ix.postingQuery(sub, restrict)
383 | 			list = mergeOr(list, list1)
384 | 		}
385 | 	}
386 | 	return list
387 | }
388 | 
// mergeOr returns the union of the sorted lists l1 and l2 as a new
// sorted list; a value present in both appears once.  The result is nil
// when both inputs are empty.
func mergeOr(l1, l2 []uint32) []uint32 {
	var out []uint32
	for len(l1) > 0 && len(l2) > 0 {
		a, b := l1[0], l2[0]
		switch {
		case a < b:
			out = append(out, a)
			l1 = l1[1:]
		case a > b:
			out = append(out, b)
			l2 = l2[1:]
		default:
			out = append(out, a)
			l1, l2 = l1[1:], l2[1:]
		}
	}
	// At most one of the lists still has entries.
	out = append(out, l1...)
	out = append(out, l2...)
	return out
}
409 | 
410 | func corrupt() {
411 | 	log.Fatal("corrupt index: remove " + File())
412 | }
413 | 
// An mmapData is mmap'ed read-only data from a file.
type mmapData struct {
	f *os.File // file the data was mapped from
	d []byte   // mapped contents; nil for an empty file
}
419 | 
420 | // mmap maps the given file into memory.
421 | func mmap(file string) mmapData {
422 | 	f, err := os.Open(file)
423 | 	if err != nil {
424 | 		log.Fatal(err)
425 | 	}
426 | 	return mmapFile(f)
427 | }
428 | 
// File returns the name of the index file to use.
// It is either $CSEARCHINDEX, a .csearchindex file in the current
// directory or the nearest ancestor directory that has one, or
// $HOME/.csearchindex as a last resort.  On Windows, %USERPROFILE% stands
// in for $HOME when $HOME is not set.
func File() string {
	if env := os.Getenv("CSEARCHINDEX"); env != "" {
		return env
	}

	home := os.Getenv("HOME")
	if runtime.GOOS == "windows" && home == "" {
		home = os.Getenv("USERPROFILE")
	}
	fallback := filepath.Join(home, ".csearchindex")

	dir, err := filepath.Abs(".")
	if err != nil {
		return fallback
	}

	for {
		candidate := filepath.Join(dir, ".csearchindex")
		// A candidate counts as found if it can be opened.  Close the
		// handle only when the open succeeded.
		if f, err := os.Open(candidate); err == nil {
			f.Close()
			return candidate
		}

		parent := filepath.Dir(dir)
		if parent == dir {
			// hit the root dir
			return fallback
		}
		dir = parent
	}
}
471 | 


--------------------------------------------------------------------------------
/index/read_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package index
 6 | 
 7 | import (
 8 | 	"io/ioutil"
 9 | 	"os"
10 | 	"testing"
11 | )
12 | 
// postFiles maps file names to file contents; TestTrivialPosting passes
// it to buildIndex to create the index it queries.
// NOTE(review): the expected posting lists in that test suggest file IDs
// follow the sorted name order (file0 is 0, file1 is 1, ...) — confirm
// against buildIndex.
var postFiles = map[string]string{
	"file0": "",
	"file1": "Google Code Search",
	"file2": "Google Code Project Hosting",
	"file3": "Google Web Search",
}
19 | 
// tri packs the bytes x, y, z into a 24-bit trigram value with x in the
// most significant position.
func tri(x, y, z byte) uint32 {
	var v uint32
	for _, b := range [...]byte{x, y, z} {
		v = v<<8 | uint32(b)
	}
	return v
}
23 | 
24 | func TestTrivialPosting(t *testing.T) {
25 | 	f, _ := ioutil.TempFile("", "index-test")
26 | 	defer os.Remove(f.Name())
27 | 	out := f.Name()
28 | 	buildIndex(out, nil, postFiles)
29 | 	ix := Open(out)
30 | 	if l := ix.PostingList(tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) {
31 | 		t.Errorf("PostingList(Sea) = %v, want [1 3]", l)
32 | 	}
33 | 	if l := ix.PostingList(tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) {
34 | 		t.Errorf("PostingList(Goo) = %v, want [1 2 3]", l)
35 | 	}
36 | 	if l := ix.PostingAnd(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 3}) {
37 | 		t.Errorf("PostingList(Sea&Goo) = %v, want [1 3]", l)
38 | 	}
39 | 	if l := ix.PostingAnd(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) {
40 | 		t.Errorf("PostingList(Goo&Sea) = %v, want [1 3]", l)
41 | 	}
42 | 	if l := ix.PostingOr(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) {
43 | 		t.Errorf("PostingList(Sea|Goo) = %v, want [1 2 3]", l)
44 | 	}
45 | 	if l := ix.PostingOr(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 2, 3}) {
46 | 		t.Errorf("PostingList(Goo|Sea) = %v, want [1 2 3]", l)
47 | 	}
48 | }
49 | 
// equalList reports whether x and y have the same length and hold the
// same values in the same order.
func equalList(x, y []uint32) bool {
	n := len(x)
	if n != len(y) {
		return false
	}
	for i := 0; i < n; i++ {
		if x[i] != y[i] {
			return false
		}
	}
	return true
}
61 | 


--------------------------------------------------------------------------------
/index/regexp.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | import (
  8 | 	"regexp/syntax"
  9 | 	"sort"
 10 | 	"strconv"
 11 | 	"strings"
 12 | 	"unicode"
 13 | )
 14 | 
// A Query is a matching machine, like a regular expression,
// that matches some text and not other text.  When we compute a
// Query from a regexp, the Query is a conservative version of the
// regexp: it matches everything the regexp would match, and probably
// quite a bit more.  We can then filter target files by whether they match
// the Query (using a trigram index) before running the comparatively
// more expensive regexp machinery.
//
// A Query is a tree node: Op says how the node's trigrams and
// subqueries combine (see the QueryOp constants).
type Query struct {
	Op      QueryOp  // how Trigram and Sub are combined
	Trigram []string // trigrams that take part in Op
	Sub     []*Query // subqueries that take part in Op
}
 27 | 
// A QueryOp identifies the kind of a Query node.
type QueryOp int

// The possible Query operations.
const (
	QAll  QueryOp = iota // Everything matches
	QNone                // Nothing matches
	QAnd                 // All in Sub and Trigram must match
	QOr                  // At least one in Sub or Trigram must match
)
 36 | 
// allQuery and noneQuery are the package-wide queries that match
// everything and nothing, respectively.  They are pointers shared by every
// user, so they must be treated as read-only.
var allQuery = &Query{Op: QAll}
var noneQuery = &Query{Op: QNone}
 39 | 
 40 | // and returns the query q AND r, possibly reusing q's and r's storage.
 41 | func (q *Query) and(r *Query) *Query {
 42 | 	return q.andOr(r, QAnd)
 43 | }
 44 | 
 45 | // or returns the query q OR r, possibly reusing q's and r's storage.
 46 | func (q *Query) or(r *Query) *Query {
 47 | 	return q.andOr(r, QOr)
 48 | }
 49 | 
// andOr returns the query q AND r or q OR r, possibly reusing q's and r's storage.
// It works hard to avoid creating unnecessarily complicated structures.
//
// op must be QAnd or QOr.  Because q and r may be modified in place and
// returned as (part of) the result, callers should not use them afterwards.
func (q *Query) andOr(r *Query, op QueryOp) (out *Query) {
	// opstr is only used by the disabled debugging prints below.
	opstr := "&"
	if op == QOr {
		opstr = "|"
	}
	//println("andOr", q.String(), opstr, r.String())
	//defer func() { println("  ->", out.String()) }()
	_ = opstr

	// A node holding nothing but a single subquery is equivalent to that
	// subquery, so look through the wrapper.
	if len(q.Trigram) == 0 && len(q.Sub) == 1 {
		q = q.Sub[0]
	}
	if len(r.Trigram) == 0 && len(r.Sub) == 1 {
		r = r.Sub[0]
	}

	// Boolean simplification.
	// If q ⇒ r, q AND r ≡ q.
	// If q ⇒ r, q OR r ≡ r.
	if q.implies(r) {
		//println(q.String(), "implies", r.String())
		if op == QAnd {
			return q
		}
		return r
	}
	if r.implies(q) {
		//println(r.String(), "implies", q.String())
		if op == QAnd {
			return r
		}
		return q
	}

	// Both q and r are QAnd or QOr.
	// If they match or can be made to match, merge.
	// An "atom" is a node consisting of exactly one trigram; its Op does
	// not matter, so it can be folded into a node of either kind.
	qAtom := len(q.Trigram) == 1 && len(q.Sub) == 0
	rAtom := len(r.Trigram) == 1 && len(r.Sub) == 0
	if q.Op == op && (r.Op == op || rAtom) {
		q.Trigram = stringSet.union(q.Trigram, r.Trigram, false)
		q.Sub = append(q.Sub, r.Sub...)
		return q
	}
	if r.Op == op && qAtom {
		r.Trigram = stringSet.union(r.Trigram, q.Trigram, false)
		return r
	}
	if qAtom && rAtom {
		q.Op = op
		q.Trigram = append(q.Trigram, r.Trigram...)
		return q
	}

	// If one matches the op, add the other to it.
	if q.Op == op {
		q.Sub = append(q.Sub, r)
		return q
	}
	if r.Op == op {
		r.Sub = append(r.Sub, q)
		return r
	}

	// We are creating an AND of ORs or an OR of ANDs.
	// Factor out common trigrams, if any.
	// The loop walks both trigram lists in step, merge-style, compacting
	// the non-shared trigrams in place (wi and wj are the write positions)
	// and collecting shared ones in common.
	// NOTE(review): this walk presumably relies on both lists being
	// sorted — confirm against stringSet.
	common := stringSet{}
	i, j := 0, 0
	wi, wj := 0, 0
	for i < len(q.Trigram) && j < len(r.Trigram) {
		qt, rt := q.Trigram[i], r.Trigram[j]
		if qt < rt {
			q.Trigram[wi] = qt
			wi++
			i++
		} else if qt > rt {
			r.Trigram[wj] = rt
			wj++
			j++
		} else {
			common = append(common, qt)
			i++
			j++
		}
	}
	// Keep whatever remains of the longer list.
	for ; i < len(q.Trigram); i++ {
		q.Trigram[wi] = q.Trigram[i]
		wi++
	}
	for ; j < len(r.Trigram); j++ {
		r.Trigram[wj] = r.Trigram[j]
		wj++
	}
	q.Trigram = q.Trigram[:wi]
	r.Trigram = r.Trigram[:wj]
	if len(common) > 0 {
		// If there were common trigrams, rewrite
		//
		//	(abc|def|ghi|jkl) AND (abc|def|mno|prs) =>
		//		(abc|def) OR ((ghi|jkl) AND (mno|prs))
		//
		//	(abc&def&ghi&jkl) OR (abc&def&mno&prs) =>
		//		(abc&def) AND ((ghi&jkl) OR (mno&prs))
		//
		// Build up the right one of
		//	(ghi|jkl) AND (mno|prs)
		//	(ghi&jkl) OR (mno&prs)
		// Call andOr recursively in case q and r can now be simplified
		// (we removed some trigrams).
		s := q.andOr(r, op)

		// Add in factored trigrams.
		// QAnd + QOr - op is QOr when op is QAnd and QAnd when op is QOr.
		otherOp := QAnd + QOr - op
		t := &Query{Op: otherOp, Trigram: common}
		return t.andOr(s, t.Op)
	}

	// Otherwise just create the op.
	return &Query{Op: op, Sub: []*Query{q, r}}
}
171 | 
172 | // implies reports whether q implies r.
173 | // It is okay for it to return false negatives.
174 | func (q *Query) implies(r *Query) bool {
175 | 	if q.Op == QNone || r.Op == QAll {
176 | 		// False implies everything.
177 | 		// Everything implies True.
178 | 		return true
179 | 	}
180 | 	if q.Op == QAll || r.Op == QNone {
181 | 		// True implies nothing.
182 | 		// Nothing implies False.
183 | 		return false
184 | 	}
185 | 
186 | 	if q.Op == QAnd || (q.Op == QOr && len(q.Trigram) == 1 && len(q.Sub) == 0) {
187 | 		return trigramsImply(q.Trigram, r)
188 | 	}
189 | 
190 | 	if q.Op == QOr && r.Op == QOr &&
191 | 		len(q.Trigram) > 0 && len(q.Sub) == 0 &&
192 | 		stringSet.isSubsetOf(q.Trigram, r.Trigram) {
193 | 		return true
194 | 	}
195 | 	return false
196 | }
197 | 
198 | func trigramsImply(t []string, q *Query) bool {
199 | 	switch q.Op {
200 | 	case QOr:
201 | 		for _, qq := range q.Sub {
202 | 			if trigramsImply(t, qq) {
203 | 				return true
204 | 			}
205 | 		}
206 | 		for i := range t {
207 | 			if stringSet.isSubsetOf(t[i:i+1], q.Trigram) {
208 | 				return true
209 | 			}
210 | 		}
211 | 		return false
212 | 	case QAnd:
213 | 		for _, qq := range q.Sub {
214 | 			if !trigramsImply(t, qq) {
215 | 				return false
216 | 			}
217 | 		}
218 | 		if !stringSet.isSubsetOf(q.Trigram, t) {
219 | 			return false
220 | 		}
221 | 		return true
222 | 	}
223 | 	return false
224 | }
225 | 
226 | // maybeRewrite rewrites q to use op if it is possible to do so
227 | // without changing the meaning.  It also simplifies if the node
228 | // is an empty OR or AND.
229 | func (q *Query) maybeRewrite(op QueryOp) {
230 | 	if q.Op != QAnd && q.Op != QOr {
231 | 		return
232 | 	}
233 | 
234 | 	// AND/OR doing real work?  Can't rewrite.
235 | 	n := len(q.Sub) + len(q.Trigram)
236 | 	if n > 1 {
237 | 		return
238 | 	}
239 | 
240 | 	// Nothing left in the AND/OR?
241 | 	if n == 0 {
242 | 		if q.Op == QAnd {
243 | 			q.Op = QAll
244 | 		} else {
245 | 			q.Op = QNone
246 | 		}
247 | 		return
248 | 	}
249 | 
250 | 	// Just a sub-node: throw away wrapper.
251 | 	if len(q.Sub) == 1 {
252 | 		*q = *q.Sub[0]
253 | 	}
254 | 
255 | 	// Just a trigram: can use either op.
256 | 	q.Op = op
257 | }
258 | 
259 | // andTrigrams returns q AND the OR of the AND of the trigrams present in each string.
260 | func (q *Query) andTrigrams(t stringSet) *Query {
261 | 	if t.minLen() < 3 {
262 | 		// If there is a short string, we can't guarantee
263 | 		// that any trigrams must be present, so use ALL.
264 | 		// q AND ALL = q.
265 | 		return q
266 | 	}
267 | 
268 | 	//println("andtrigrams", strings.Join(t, ","))
269 | 	or := noneQuery
270 | 	for _, tt := range t {
271 | 		var trig stringSet
272 | 		for i := 0; i+3 <= len(tt); i++ {
273 | 			trig.add(tt[i : i+3])
274 | 		}
275 | 		trig.clean(false)
276 | 		//println(tt, "trig", strings.Join(trig, ","))
277 | 		or = or.or(&Query{Op: QAnd, Trigram: trig})
278 | 	}
279 | 	q = q.and(or)
280 | 	return q
281 | }
282 | 
283 | func (q *Query) String() string {
284 | 	if q == nil {
285 | 		return "?"
286 | 	}
287 | 	if q.Op == QNone {
288 | 		return "-"
289 | 	}
290 | 	if q.Op == QAll {
291 | 		return "+"
292 | 	}
293 | 
294 | 	if len(q.Sub) == 0 && len(q.Trigram) == 1 {
295 | 		return strconv.Quote(q.Trigram[0])
296 | 	}
297 | 
298 | 	var (
299 | 		s     string
300 | 		sjoin string
301 | 		end   string
302 | 		tjoin string
303 | 	)
304 | 	if q.Op == QAnd {
305 | 		sjoin = " "
306 | 		tjoin = " "
307 | 	} else {
308 | 		s = "("
309 | 		sjoin = ")|("
310 | 		end = ")"
311 | 		tjoin = "|"
312 | 	}
313 | 	for i, t := range q.Trigram {
314 | 		if i > 0 {
315 | 			s += tjoin
316 | 		}
317 | 		s += strconv.Quote(t)
318 | 	}
319 | 	if len(q.Sub) > 0 {
320 | 		if len(q.Trigram) > 0 {
321 | 			s += sjoin
322 | 		}
323 | 		s += q.Sub[0].String()
324 | 		for i := 1; i < len(q.Sub); i++ {
325 | 			s += sjoin + q.Sub[i].String()
326 | 		}
327 | 	}
328 | 	s += end
329 | 	return s
330 | }
331 | 
332 | // RegexpQuery returns a Query for the given regexp.
333 | func RegexpQuery(re *syntax.Regexp) *Query {
334 | 	info := analyze(re)
335 | 	info.simplify(true)
336 | 	info.addExact()
337 | 	return info.match
338 | }
339 | 
// A regexpInfo summarizes the results of analyzing a regexp.
type regexpInfo struct {
	// canEmpty records whether the regexp matches the empty string.
	canEmpty bool

	// exact is the exact set of strings matching the regexp.
	// It is left nil when that set is not tracked (for example for the
	// "any string" and "any character" summaries), in which case prefix
	// and suffix are used instead.
	exact stringSet

	// if exact is nil, prefix is the set of possible match prefixes,
	// and suffix is the set of possible match suffixes.
	prefix stringSet // otherwise: the exact set of matching prefixes ...
	suffix stringSet // ... and suffixes

	// match records a query that must be satisfied by any
	// match for the regexp, in addition to the information
	// recorded above.
	match *Query
}
358 | 
// Size limits applied to the string sets kept in a regexpInfo.
const (
	// Exact sets are limited to maxExact strings.
	// If they get too big, simplify will rewrite the regexpInfo
	// to use prefix and suffix instead.  It's not worthwhile for
	// this to be bigger than maxSet.
	// Because we allow the maximum length of an exact string
	// to grow to 5 below (see simplify), it helps to avoid ridiculous
	// alternations if maxExact is sized so that 3 case-insensitive letters
	// trigger a flush.
	maxExact = 7

	// Prefix and suffix sets are limited to maxSet strings.
	// If they get too big, simplify will replace groups of strings
	// sharing a common leading prefix (or trailing suffix) with
	// that common prefix (or suffix).  It is useful for maxSet
	// to be at least 2³ = 8 so that we can exactly
	// represent a case-insensitive abc by the set
	// {abc, abC, aBc, aBC, Abc, AbC, ABc, ABC}.
	maxSet = 20
)
379 | 
380 | // anyMatch returns the regexpInfo describing a regexp that
381 | // matches any string.
382 | func anyMatch() regexpInfo {
383 | 	return regexpInfo{
384 | 		canEmpty: true,
385 | 		prefix:   []string{""},
386 | 		suffix:   []string{""},
387 | 		match:    allQuery,
388 | 	}
389 | }
390 | 
391 | // anyChar returns the regexpInfo describing a regexp that
392 | // matches any single character.
393 | func anyChar() regexpInfo {
394 | 	return regexpInfo{
395 | 		prefix: []string{""},
396 | 		suffix: []string{""},
397 | 		match:  allQuery,
398 | 	}
399 | }
400 | 
401 | // noMatch returns the regexpInfo describing a regexp that
402 | // matches no strings at all.
403 | func noMatch() regexpInfo {
404 | 	return regexpInfo{
405 | 		match: noneQuery,
406 | 	}
407 | }
408 | 
409 | // emptyString returns the regexpInfo describing a regexp that
410 | // matches only the empty string.
411 | func emptyString() regexpInfo {
412 | 	return regexpInfo{
413 | 		canEmpty: true,
414 | 		exact:    []string{""},
415 | 		match:    allQuery,
416 | 	}
417 | }
418 | 
// analyze returns the regexpInfo for the regexp re.
// It dispatches on re.Op, recursing into subexpressions and combining
// their summaries with concat and alternate.  Cases that fill in info
// rather than returning directly fall out of the switch and have
// simplify(false) applied before the result is returned.
func analyze(re *syntax.Regexp) (ret regexpInfo) {
	//println("analyze", re.String())
	//defer func() { println("->", ret.String()) }()
	var info regexpInfo
	switch re.Op {
	case syntax.OpNoMatch:
		return noMatch()

	// Zero-width assertions consume no text, so they are summarized
	// like the empty string.
	case syntax.OpEmptyMatch,
		syntax.OpBeginLine, syntax.OpEndLine,
		syntax.OpBeginText, syntax.OpEndText,
		syntax.OpWordBoundary, syntax.OpNoWordBoundary:
		return emptyString()

	case syntax.OpLiteral:
		if re.Flags&syntax.FoldCase != 0 {
			switch len(re.Rune) {
			case 0:
				return emptyString()
			case 1:
				// Single-letter case-folded string:
				// rewrite into char class and analyze.
				re1 := &syntax.Regexp{
					Op: syntax.OpCharClass,
				}
				re1.Rune = re1.Rune0[:0]
				r0 := re.Rune[0]
				re1.Rune = append(re1.Rune, r0, r0)
				// Add a one-rune range for every other rune in r0's
				// case-folding orbit.
				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
					re1.Rune = append(re1.Rune, r1, r1)
				}
				info = analyze(re1)
				return info
			}
			// Multi-letter case-folded string:
			// treat as concatenation of single-letter case-folded strings.
			re1 := &syntax.Regexp{
				Op:    syntax.OpLiteral,
				Flags: syntax.FoldCase,
			}
			info = emptyString()
			for i := range re.Rune {
				re1.Rune = re.Rune[i : i+1]
				info = concat(info, analyze(re1))
			}
			return info
		}
		// Case-sensitive literal: the only match is the literal itself.
		info.exact = stringSet{string(re.Rune)}
		info.match = allQuery

	case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
		return anyChar()

	case syntax.OpCapture:
		return analyze(re.Sub[0])

	case syntax.OpConcat:
		return fold(concat, re.Sub, emptyString())

	case syntax.OpAlternate:
		return fold(alternate, re.Sub, noMatch())

	case syntax.OpQuest:
		// x? is x or the empty string.
		return alternate(analyze(re.Sub[0]), emptyString())

	case syntax.OpStar:
		// We don't know anything, so assume the worst.
		return anyMatch()

	case syntax.OpRepeat:
		if re.Min == 0 {
			// Like OpStar
			return anyMatch()
		}
		// With at least one required repetition, treat like OpPlus.
		fallthrough
	case syntax.OpPlus:
		// x+
		// Since there has to be at least one x, the prefixes and suffixes
		// stay the same.  If x was exact, it isn't anymore.
		info = analyze(re.Sub[0])
		if info.exact.have() {
			info.prefix = info.exact
			info.suffix = info.exact.copy()
			info.exact = nil
		}

	case syntax.OpCharClass:
		info.match = allQuery

		// Special case.
		if len(re.Rune) == 0 {
			return noMatch()
		}

		// Special case.
		if len(re.Rune) == 1 {
			info.exact = stringSet{string(re.Rune[0])}
			break
		}

		// re.Rune is read as consecutive lo, hi pairs; n sums hi-lo over
		// the ranges as a measure of the size of the class.
		n := 0
		for i := 0; i < len(re.Rune); i += 2 {
			n += int(re.Rune[i+1] - re.Rune[i])
		}
		// If the class is too large, it's okay to overestimate.
		if n > 100 {
			return anyChar()
		}

		// Otherwise list every rune of the class as an exact
		// single-character string.
		info.exact = []string{}
		for i := 0; i < len(re.Rune); i += 2 {
			lo, hi := re.Rune[i], re.Rune[i+1]
			for rr := lo; rr <= hi; rr++ {
				info.exact.add(string(rr))
			}
		}
	}

	info.simplify(false)
	return info
}
541 | 
542 | // fold is the usual higher-order function.
543 | func fold(f func(x, y regexpInfo) regexpInfo, sub []*syntax.Regexp, zero regexpInfo) regexpInfo {
544 | 	if len(sub) == 0 {
545 | 		return zero
546 | 	}
547 | 	if len(sub) == 1 {
548 | 		return analyze(sub[0])
549 | 	}
550 | 	info := f(analyze(sub[0]), analyze(sub[1]))
551 | 	for i := 2; i < len(sub); i++ {
552 | 		info = f(info, analyze(sub[i]))
553 | 	}
554 | 	return info
555 | }
556 | 
557 | // concat returns the regexp info for xy given x and y.
558 | func concat(x, y regexpInfo) (out regexpInfo) {
559 | 	//println("concat", x.String(), "...", y.String())
560 | 	//defer func() { println("->", out.String()) }()
561 | 	var xy regexpInfo
562 | 	xy.match = x.match.and(y.match)
563 | 	if x.exact.have() && y.exact.have() {
564 | 		xy.exact = x.exact.cross(y.exact, false)
565 | 	} else {
566 | 		if x.exact.have() {
567 | 			xy.prefix = x.exact.cross(y.prefix, false)
568 | 		} else {
569 | 			xy.prefix = x.prefix
570 | 			if x.canEmpty {
571 | 				xy.prefix = xy.prefix.union(y.prefix, false)
572 | 			}
573 | 		}
574 | 		if y.exact.have() {
575 | 			xy.suffix = x.suffix.cross(y.exact, true)
576 | 		} else {
577 | 			xy.suffix = y.suffix
578 | 			if y.canEmpty {
579 | 				xy.suffix = xy.suffix.union(x.suffix, true)
580 | 			}
581 | 		}
582 | 	}
583 | 
584 | 	// If all the possible strings in the cross product of x.suffix
585 | 	// and y.prefix are long enough, then the trigram for one
586 | 	// of them must be present and would not necessarily be
587 | 	// accounted for in xy.prefix or xy.suffix yet.  Cut things off
588 | 	// at maxSet just to keep the sets manageable.
589 | 	if !x.exact.have() && !y.exact.have() &&
590 | 		x.suffix.size() <= maxSet && y.prefix.size() <= maxSet &&
591 | 		x.suffix.minLen()+y.prefix.minLen() >= 3 {
592 | 		xy.match = xy.match.andTrigrams(x.suffix.cross(y.prefix, false))
593 | 	}
594 | 
595 | 	xy.simplify(false)
596 | 	return xy
597 | }
598 | 
599 | // alternate returns the regexpInfo for x|y given x and y.
600 | func alternate(x, y regexpInfo) (out regexpInfo) {
601 | 	//println("alternate", x.String(), "...", y.String())
602 | 	//defer func() { println("->", out.String()) }()
603 | 	var xy regexpInfo
604 | 	if x.exact.have() && y.exact.have() {
605 | 		xy.exact = x.exact.union(y.exact, false)
606 | 	} else if x.exact.have() {
607 | 		xy.prefix = x.exact.union(y.prefix, false)
608 | 		xy.suffix = x.exact.union(y.suffix, true)
609 | 		x.addExact()
610 | 	} else if y.exact.have() {
611 | 		xy.prefix = x.prefix.union(y.exact, false)
612 | 		xy.suffix = x.suffix.union(y.exact.copy(), true)
613 | 		y.addExact()
614 | 	} else {
615 | 		xy.prefix = x.prefix.union(y.prefix, false)
616 | 		xy.suffix = x.suffix.union(y.suffix, true)
617 | 	}
618 | 	xy.canEmpty = x.canEmpty || y.canEmpty
619 | 	xy.match = x.match.or(y.match)
620 | 
621 | 	xy.simplify(false)
622 | 	return xy
623 | }
624 | 
625 | // addExact adds to the match query the trigrams for matching info.exact.
626 | func (info *regexpInfo) addExact() {
627 | 	if info.exact.have() {
628 | 		info.match = info.match.andTrigrams(info.exact)
629 | 	}
630 | }
631 | 
632 | // simplify simplifies the regexpInfo when the exact set gets too large.
633 | func (info *regexpInfo) simplify(force bool) {
634 | 	//println("  simplify", info.String(), " force=", force)
635 | 	//defer func() { println("  ->", info.String()) }()
636 | 	// If there are now too many exact strings,
637 | 	// loop over them, adding trigrams and moving
638 | 	// the relevant pieces into prefix and suffix.
639 | 	info.exact.clean(false)
640 | 	if len(info.exact) > maxExact || (info.exact.minLen() >= 3 && force) || info.exact.minLen() >= 4 {
641 | 		info.addExact()
642 | 		for _, s := range info.exact {
643 | 			n := len(s)
644 | 			if n < 3 {
645 | 				info.prefix.add(s)
646 | 				info.suffix.add(s)
647 | 			} else {
648 | 				info.prefix.add(s[:2])
649 | 				info.suffix.add(s[n-2:])
650 | 			}
651 | 		}
652 | 		info.exact = nil
653 | 	}
654 | 
655 | 	if !info.exact.have() {
656 | 		info.simplifySet(&info.prefix)
657 | 		info.simplifySet(&info.suffix)
658 | 	}
659 | }
660 | 
// simplifySet reduces the size of the given set (either prefix or suffix).
// There is no need to pass around enormous prefix or suffix sets, since
// they will only be used to create trigrams.  As they get too big, simplifySet
// moves the information they contain into the match query, which is
// more efficient to pass around.
//
// s must point at info.prefix or info.suffix: the pointer is compared
// against &info.prefix and &info.suffix to decide whether strings are
// trimmed (and sorted) from the front or from the back.
func (info *regexpInfo) simplifySet(s *stringSet) {
	t := *s
	t.clean(s == &info.suffix)

	// Add the OR of the current prefix/suffix set to the query.
	info.match = info.match.andTrigrams(t)

	// The first pass (n == 3) always runs and cuts every string down to at
	// most 2 bytes; further passes shorten the strings by one more byte
	// each for as long as the set still holds more than maxSet strings.
	for n := 3; n == 3 || t.size() > maxSet; n-- {
		// Replace set by strings of length n-1.
		w := 0
		for _, str := range t {
			if len(str) >= n {
				if s == &info.prefix {
					// Keep the leading n-1 bytes of a prefix.
					str = str[:n-1]
				} else {
					// Keep the trailing n-1 bytes of a suffix.
					str = str[len(str)-n+1:]
				}
			}
			// Compact in place, dropping adjacent duplicates.
			if w == 0 || t[w-1] != str {
				t[w] = str
				w++
			}
		}
		t = t[:w]
		t.clean(s == &info.suffix)
	}

	// Now make sure that the prefix/suffix sets aren't redundant.
	// For example, if we know "ab" is a possible prefix, then it
	// doesn't help at all to know that "abc" is also a possible
	// prefix, so delete "abc".
	w := 0
	f := strings.HasPrefix
	if s == &info.suffix {
		f = strings.HasSuffix
	}
	for _, str := range t {
		// Each string is compared only with the most recently kept one.
		if w == 0 || !f(str, t[w-1]) {
			t[w] = str
			w++
		}
	}
	t = t[:w]

	*s = t
}
712 | 
713 | func (info regexpInfo) String() string {
714 | 	s := ""
715 | 	if info.canEmpty {
716 | 		s += "canempty "
717 | 	}
718 | 	if info.exact.have() {
719 | 		s += "exact:" + strings.Join(info.exact, ",")
720 | 	} else {
721 | 		s += "prefix:" + strings.Join(info.prefix, ",")
722 | 		s += " suffix:" + strings.Join(info.suffix, ",")
723 | 	}
724 | 	s += " match: " + info.match.String()
725 | 	return s
726 | }
727 | 
// A stringSet is a set of strings stored in a slice.
// A nil stringSet means there is no set at all, while a non-nil
// stringSet of length zero is the empty set.
type stringSet []string

// have reports whether s is an actual set (possibly empty) rather than
// the nil "no set" value.
func (s stringSet) have() bool {
	return s != nil
}

// contains reports whether str is one of the strings in s.
func (s stringSet) contains(str string) bool {
	for i := range s {
		if s[i] == str {
			return true
		}
	}
	return false
}

// byPrefix implements sort.Interface, ordering strings lexicographically.
type byPrefix []string

func (x *byPrefix) Len() int {
	return len(*x)
}

func (x *byPrefix) Swap(i, j int) {
	v := *x
	v[i], v[j] = v[j], v[i]
}

func (x *byPrefix) Less(i, j int) bool {
	v := *x
	return v[i] < v[j]
}

// bySuffix implements sort.Interface, ordering strings by comparing
// their bytes from the last byte backward; when one string is a suffix
// of the other, the shorter one sorts first.
type bySuffix []string

func (x *bySuffix) Len() int {
	return len(*x)
}

func (x *bySuffix) Swap(i, j int) {
	v := *x
	v[i], v[j] = v[j], v[i]
}

func (x *bySuffix) Less(i, j int) bool {
	a, b := (*x)[i], (*x)[j]
	for p, q := len(a)-1, len(b)-1; p >= 0 && q >= 0; p, q = p-1, q-1 {
		if a[p] != b[q] {
			return a[p] < b[q]
		}
	}
	return len(a) < len(b)
}

// add appends str to the set.  It does not check for duplicates;
// clean removes them.
func (s *stringSet) add(str string) {
	set := *s
	*s = append(set, str)
}

// clean sorts the set, in suffix order if isSuffix is true and in
// ordinary string order otherwise, and removes duplicates in place.
func (s *stringSet) clean(isSuffix bool) {
	if isSuffix {
		sort.Sort((*bySuffix)(s))
	} else {
		sort.Sort((*byPrefix)(s))
	}

	set := *s
	kept := 0
	for i := range set {
		if kept > 0 && set[kept-1] == set[i] {
			continue
		}
		set[kept] = set[i]
		kept++
	}
	*s = set[:kept]
}

// size returns the number of strings in s.
func (s stringSet) size() int {
	return len(s)
}

// minLen returns the length of the shortest string in s,
// or 0 if s holds no strings.
func (s stringSet) minLen() int {
	if len(s) == 0 {
		return 0
	}
	shortest := len(s[0])
	for _, str := range s[1:] {
		if n := len(str); n < shortest {
			shortest = n
		}
	}
	return shortest
}

// maxLen returns the length of the longest string in s,
// or 0 if s holds no strings.
func (s stringSet) maxLen() int {
	if len(s) == 0 {
		return 0
	}
	longest := len(s[0])
	for _, str := range s[1:] {
		if n := len(str); n > longest {
			longest = n
		}
	}
	return longest
}

// union returns the union of s and t, sorted as by clean(isSuffix).
// It reuses s's storage: the strings of t are appended to s and the
// result is sorted in place, so s must not be relied on afterward.
func (s stringSet) union(t stringSet, isSuffix bool) stringSet {
	merged := append(s, t...)
	merged.clean(isSuffix)
	return merged
}

// cross returns the cross product of s and t: every string of s
// followed by every string of t, sorted as by clean(isSuffix).
func (s stringSet) cross(t stringSet, isSuffix bool) stringSet {
	p := stringSet{}
	for i := range s {
		for j := range t {
			p = append(p, s[i]+t[j])
		}
	}
	p.clean(isSuffix)
	return p
}

// clear empties the set but preserves the storage.
func (s *stringSet) clear() {
	set := *s
	*s = set[:0]
}

// copy returns a copy of the set that does not share storage with the
// original.  The result is never nil.
func (s stringSet) copy() stringSet {
	dup := stringSet{}
	return append(dup, s...)
}

// isSubsetOf reports whether every string in s is also in t.
// It assumes both sets are sorted in ordinary string order.
func (s stringSet) isSubsetOf(t stringSet) bool {
	j := 0
	for _, want := range s {
		// Skip the strings of t that sort before want.
		for ; j < len(t); j++ {
			if t[j] >= want {
				break
			}
		}
		if j == len(t) || t[j] != want {
			return false
		}
	}
	return true
}
873 | 


--------------------------------------------------------------------------------
/index/regexp_test.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | package index
 6 | 
 7 | import (
 8 | 	"regexp/syntax"
 9 | 	"testing"
10 | )
11 | 
// queryTests is the table driving TestQuery.  Each entry pairs a regular
// expression (re) with the String form of the query (q) that RegexpQuery
// is expected to produce for it.
var queryTests = []struct {
	re string
	q  string
}{
	{`Abcdef`, `"Abc" "bcd" "cde" "def"`},
	{`(abc)(def)`, `"abc" "bcd" "cde" "def"`},
	{`abc.*(def|ghi)`, `"abc" ("def"|"ghi")`},
	{`abc(def|ghi)`, `"abc" ("bcd" "cde" "def")|("bcg" "cgh" "ghi")`},
	{`a+hello`, `"ahe" "ell" "hel" "llo"`},
	{`(a+hello|b+world)`, `("ahe" "ell" "hel" "llo")|("bwo" "orl" "rld" "wor")`},
	{`a*bbb`, `"bbb"`},
	{`a?bbb`, `"bbb"`},
	{`(bbb)a?`, `"bbb"`},
	{`(bbb)a*`, `"bbb"`},
	{`^abc`, `"abc"`},
	{`abc$`, `"abc"`},
	{`ab[cde]f`, `("abc" "bcf")|("abd" "bdf")|("abe" "bef")`},
	{`(abc|bac)de`, `"cde" ("abc" "bcd")|("acd" "bac")`},

	// These don't have enough letters for a trigram, so they return the
	// always matching query "+".
	{`ab[^cde]f`, `+`},
	{`ab.f`, `+`},
	{`.`, `+`},
	{`()`, `+`},

	// No matches.
	{`[^\s\S]`, `-`},

	// Factoring works.
	{`(abc|abc)`, `"abc"`},
	{`(ab|ab)c`, `"abc"`},
	{`ab(cab|cat)`, `"abc" "bca" ("cab"|"cat")`},
	{`(z*(abc|def)z*)(z*(abc|def)z*)`, `("abc"|"def")`},
	{`(z*abcz*defz*)|(z*abcz*defz*)`, `"abc" "def"`},
	{`(z*abcz*defz*(ghi|jkl)z*)|(z*abcz*defz*(mno|prs)z*)`,
		`"abc" "def" ("ghi"|"jkl"|"mno"|"prs")`},
	{`(z*(abcz*def)|(ghiz*jkl)z*)|(z*(mnoz*prs)|(tuvz*wxy)z*)`,
		`("abc" "def")|("ghi" "jkl")|("mno" "prs")|("tuv" "wxy")`},
	{`(z*abcz*defz*)(z*(ghi|jkl)z*)`, `"abc" "def" ("ghi"|"jkl")`},
	{`(z*abcz*defz*)|(z*(ghi|jkl)z*)`, `("ghi"|"jkl")|("abc" "def")`},

	// analyze keeps track of multiple possible prefix/suffixes.
	{`[ab][cd][ef]`, `("ace"|"acf"|"ade"|"adf"|"bce"|"bcf"|"bde"|"bdf")`},
	{`ab[cd]e`, `("abc" "bce")|("abd" "bde")`},

	// Different sized suffixes.
	{`(a|ab)cde`, `"cde" ("abc" "bcd")|("acd")`},
	{`(a|b|c|d)(ef|g|hi|j)`, `+`},

	{`(?s).`, `+`},

	// Expanding case.
	{`(?i)a~~`, `("A~~"|"a~~")`},
	{`(?i)ab~`, `("AB~"|"Ab~"|"aB~"|"ab~")`},
	{`(?i)abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`},
	{`(?i)abc|def`, `("ABC"|"ABc"|"AbC"|"Abc"|"DEF"|"DEf"|"DeF"|"Def"|"aBC"|"aBc"|"abC"|"abc"|"dEF"|"dEf"|"deF"|"def")`},
	{`(?i)abcd`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc") ("BCD"|"BCd"|"BcD"|"Bcd"|"bCD"|"bCd"|"bcD"|"bcd")`},
	{`(?i)abc|abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`},

	// Word boundary.
	{`\b`, `+`},
	{`\B`, `+`},
	{`\babc`, `"abc"`},
	{`\Babc`, `"abc"`},
	{`abc\b`, `"abc"`},
	{`abc\B`, `"abc"`},
	{`ab\bc`, `"abc"`},
	{`ab\Bc`, `"abc"`},
}
82 | 
83 | func TestQuery(t *testing.T) {
84 | 	for _, tt := range queryTests {
85 | 		re, err := syntax.Parse(tt.re, syntax.Perl)
86 | 		if err != nil {
87 | 			t.Fatal(err)
88 | 		}
89 | 		q := RegexpQuery(re).String()
90 | 		if q != tt.q {
91 | 			t.Errorf("RegexpQuery(%#q) = %#q, want %#q", tt.re, q, tt.q)
92 | 		}
93 | 	}
94 | }
95 | 


--------------------------------------------------------------------------------
/index/write.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | import (
  8 | 	"io"
  9 | 	"io/ioutil"
 10 | 	"log"
 11 | 	"os"
 12 | 	"strings"
 13 | 	"unsafe"
 14 | 
 15 | 	"github.com/google/codesearch/sparse"
 16 | )
 17 | 
 18 | // Index writing.  See read.go for details of on-disk format.
 19 | //
 20 | // It would suffice to make a single large list of (trigram, file#) pairs
 21 | // while processing the files one at a time, sort that list by trigram,
 22 | // and then create the posting lists from subsequences of the list.
 23 | // However, we do not assume that the entire index fits in memory.
 24 | // Instead, we sort and flush the list to a new temporary file each time
 25 | // it reaches its maximum in-memory size, and then at the end we
 26 | // create the final posting lists by merging the temporary files as we
 27 | // read them back in.
 28 | //
 29 | // It would also be useful to be able to create an index for a subset
 30 | // of the files and then merge that index into an existing one.  This would
 31 | // allow incremental updating of an existing index when a directory changes.
 32 | // But we have not implemented that.
 33 | 
// An IndexWriter creates an on-disk index corresponding to a set of files.
// Obtain one with Create, populate it with AddPaths, AddFile and Add,
// and call Flush to write the finished index.
type IndexWriter struct {
	LogSkip bool // log information about skipped files
	Verbose bool // log status using package log

	trigram *sparse.Set // trigrams for the current file
	buf     [8]byte     // scratch buffer

	paths []string // paths recorded by AddPaths; written to the index by Flush

	nameData   *bufWriter // temp file holding list of names
	nameLen    uint32     // number of bytes written to nameData
	nameIndex  *bufWriter // temp file holding name index
	numName    int        // number of names written
	totalBytes int64      // total number of bytes in the files indexed so far

	post      []postEntry // list of (trigram, file#) pairs
	postFile  []*os.File  // flushed post entries
	postIndex *bufWriter  // temp file holding posting list index

	inbuf []byte     // input buffer
	main  *bufWriter // main index file
}
 57 | 
// npost is the capacity of the in-memory post entry list.
// Each postEntry is a uint64 (8 bytes), hence the division by 8.
const npost = 64 << 20 / 8 // 64 MB worth of post entries
 59 | 
 60 | // Create returns a new IndexWriter that will write the index to file.
 61 | func Create(file string) *IndexWriter {
 62 | 	return &IndexWriter{
 63 | 		trigram:   sparse.NewSet(1 << 24),
 64 | 		nameData:  bufCreate(""),
 65 | 		nameIndex: bufCreate(""),
 66 | 		postIndex: bufCreate(""),
 67 | 		main:      bufCreate(file),
 68 | 		post:      make([]postEntry, 0, npost),
 69 | 		inbuf:     make([]byte, 16384),
 70 | 	}
 71 | }
 72 | 
 73 | // A postEntry is an in-memory (trigram, file#) pair.
 74 | type postEntry uint64
 75 | 
 76 | func (p postEntry) trigram() uint32 {
 77 | 	return uint32(p >> 32)
 78 | }
 79 | 
 80 | func (p postEntry) fileid() uint32 {
 81 | 	return uint32(p)
 82 | }
 83 | 
 84 | func makePostEntry(trigram, fileid uint32) postEntry {
 85 | 	return postEntry(trigram)<<32 | postEntry(fileid)
 86 | }
 87 | 
// Tuning constants for detecting text files.
// A file is assumed not to be a text file (and thus is not indexed)
// if it contains an invalid UTF-8 sequence, if it is longer than maxFileLen
// bytes, if it contains a line longer than maxLineLen bytes,
// or if it contains more than maxTextTrigrams distinct trigrams.
const (
	maxFileLen      = 1 << 30
	maxLineLen      = 2000
	maxTextTrigrams = 20000
)
 98 | 
 99 | // AddPaths adds the given paths to the index's list of paths.
100 | func (ix *IndexWriter) AddPaths(paths []string) {
101 | 	ix.paths = append(ix.paths, paths...)
102 | }
103 | 
104 | // AddFile adds the file with the given name (opened using os.Open)
105 | // to the index.  It logs errors using package log.
106 | func (ix *IndexWriter) AddFile(name string) {
107 | 	f, err := os.Open(name)
108 | 	if err != nil {
109 | 		log.Print(err)
110 | 		return
111 | 	}
112 | 	defer f.Close()
113 | 	ix.Add(name, f)
114 | }
115 | 
// Add adds the file f to the index under the given name.
// It logs errors using package log.
//
// The input is scanned one byte at a time and every 3-byte window is
// recorded as a trigram.  The file is skipped (nothing is added to the
// index) if reading fails or if it trips one of the text-file heuristics:
// an invalid UTF-8 byte pair, more than maxFileLen bytes, a line longer
// than maxLineLen bytes, or more than maxTextTrigrams distinct trigrams.
// Skips caused by the heuristics are logged only when ix.LogSkip is set.
func (ix *IndexWriter) Add(name string, f io.Reader) {
	ix.trigram.Reset()
	var (
		c       = byte(0)      // current byte
		i       = 0            // read position within buf
		buf     = ix.inbuf[:0] // unread input; refilled when i reaches len(buf)
		tv      = uint32(0)    // sliding window holding the last three bytes
		n       = int64(0)     // number of bytes consumed so far
		linelen = 0            // number of bytes seen on the current line
	)
	for {
		// Make room for the next byte, keeping only the low 24 bits.
		tv = (tv << 8) & (1<<24 - 1)
		if i >= len(buf) {
			// Refill the buffer.  Note that n and err are local to this
			// if block: this n is the count returned by Read and shadows
			// the running byte total declared above.
			n, err := f.Read(buf[:cap(buf)])
			if n == 0 {
				if err != nil {
					if err == io.EOF {
						break
					}
					log.Printf("%s: %v\n", name, err)
					return
				}
				log.Printf("%s: 0-length read\n", name)
				return
			}
			buf = buf[:n]
			i = 0
		}
		c = buf[i]
		i++
		tv |= uint32(c)
		// A full trigram is available once three bytes have been read.
		if n++; n >= 3 {
			ix.trigram.Add(tv)
		}
		// Check the previous byte against the current one.  For the first
		// byte of the file the previous byte is taken to be 0.
		if !validUTF8((tv>>8)&0xFF, tv&0xFF) {
			if ix.LogSkip {
				log.Printf("%s: invalid UTF-8, ignoring\n", name)
			}
			return
		}
		if n > maxFileLen {
			if ix.LogSkip {
				log.Printf("%s: too long, ignoring\n", name)
			}
			return
		}
		if linelen++; linelen > maxLineLen {
			if ix.LogSkip {
				log.Printf("%s: very long lines, ignoring\n", name)
			}
			return
		}
		if c == '\n' {
			linelen = 0
		}
	}
	if ix.trigram.Len() > maxTextTrigrams {
		if ix.LogSkip {
			log.Printf("%s: too many trigrams, probably not text, ignoring\n", name)
		}
		return
	}
	ix.totalBytes += n

	if ix.Verbose {
		log.Printf("%d %d %s\n", n, ix.trigram.Len(), name)
	}

	// The file is accepted: assign it an ID and record one post entry per
	// distinct trigram, spilling the list to disk whenever it is full.
	fileid := ix.addName(name)
	for _, trigram := range ix.trigram.Dense() {
		if len(ix.post) >= cap(ix.post) {
			ix.flushPost()
		}
		ix.post = append(ix.post, makePostEntry(trigram, fileid))
	}
}
194 | 
// Flush flushes the index entry to the target file.
//
// It writes, in order: the magic header, the NUL-terminated list of paths,
// the name data, the merged posting lists, the name index, the posting
// list index, the five section offsets, and the trailer magic.  The
// temporary files used while building the index are removed afterward.
func (ix *IndexWriter) Flush() {
	// Add an empty name to mark the end of the name list.
	ix.addName("")

	// off records where each of the five sections begins in the main file.
	var off [5]uint32
	ix.main.writeString(magic)
	off[0] = ix.main.offset() // list of paths
	for _, p := range ix.paths {
		ix.main.writeString(p)
		ix.main.writeString("\x00")
	}
	ix.main.writeString("\x00")
	off[1] = ix.main.offset() // list of names
	copyFile(ix.main, ix.nameData)
	off[2] = ix.main.offset() // posting lists
	ix.mergePost(ix.main)
	off[3] = ix.main.offset() // name index
	copyFile(ix.main, ix.nameIndex)
	off[4] = ix.main.offset() // posting list index
	// mergePost fills ix.postIndex, so this copy has to come after it.
	copyFile(ix.main, ix.postIndex)
	for _, v := range off {
		ix.main.writeUint32(v)
	}
	ix.main.writeString(trailerMagic)

	// Remove the temporary files.  Errors from os.Remove are ignored.
	// NOTE(review): the files are removed by name but not closed here.
	os.Remove(ix.nameData.name)
	for _, f := range ix.postFile {
		os.Remove(f.Name())
	}
	os.Remove(ix.nameIndex.name)
	os.Remove(ix.postIndex.name)

	log.Printf("%d data bytes, %d index bytes", ix.totalBytes, ix.main.offset())

	ix.main.flush()
}
231 | 
232 | func copyFile(dst, src *bufWriter) {
233 | 	dst.flush()
234 | 	_, err := io.Copy(dst.file, src.finish())
235 | 	if err != nil {
236 | 		log.Fatalf("copying %s to %s: %v", src.name, dst.name, err)
237 | 	}
238 | }
239 | 
240 | // addName adds the file with the given name to the index.
241 | // It returns the assigned file ID number.
242 | func (ix *IndexWriter) addName(name string) uint32 {
243 | 	if strings.Contains(name, "\x00") {
244 | 		log.Fatalf("%q: file has NUL byte in name", name)
245 | 	}
246 | 
247 | 	ix.nameIndex.writeUint32(ix.nameData.offset())
248 | 	ix.nameData.writeString(name)
249 | 	ix.nameData.writeByte(0)
250 | 	id := ix.numName
251 | 	ix.numName++
252 | 	return uint32(id)
253 | }
254 | 
// flushPost writes ix.post to a new temporary file and
// clears the slice.  The entries are sorted before being written, and
// the file, rewound to its start, is appended to ix.postFile so that
// mergePost can read it back.  Failures to create or write the file
// are fatal.
func (ix *IndexWriter) flushPost() {
	w, err := ioutil.TempFile("", "csearch-index")
	if err != nil {
		log.Fatal(err)
	}
	if ix.Verbose {
		log.Printf("flush %d entries to %s", len(ix.post), w.Name())
	}
	sortPost(ix.post)

	// Write the raw ix.post array to disk as is.
	// This process is the one reading it back in, so byte order is not a concern.
	// The cast reinterprets the entries as bytes without copying them;
	// taking &ix.post[0] requires ix.post to be non-empty.
	data := (*[npost * 8]byte)(unsafe.Pointer(&ix.post[0]))[:len(ix.post)*8]
	if n, err := w.Write(data); err != nil || n < len(data) {
		if err != nil {
			log.Fatal(err)
		}
		log.Fatalf("short write writing %s", w.Name())
	}

	// Keep the storage of ix.post for the next batch of entries.
	ix.post = ix.post[:0]
	// Rewind so the file can be read from the beginning.
	// The result of Seek is not checked.
	w.Seek(0, 0)
	ix.postFile = append(ix.postFile, w)
}
281 | 
// mergePost reads the flushed index entries and merges them
// into posting lists, writing the resulting lists to out.
// For every posting list it also appends an entry (trigram, file count,
// offset of the list relative to the first list) to ix.postIndex.
func (ix *IndexWriter) mergePost(out *bufWriter) {
	var h postHeap

	log.Printf("merge %d files + mem", len(ix.postFile))
	for _, f := range ix.postFile {
		h.addFile(f)
	}
	// The entries still in memory are merged along with the flushed files.
	sortPost(ix.post)
	h.addMem(ix.post)

	// This local npost shadows the package-level constant of the same name.
	npost := 0
	e := h.next()
	offset0 := out.offset()
	for {
		npost++
		offset := out.offset() - offset0
		trigram := e.trigram()
		ix.buf[0] = byte(trigram >> 16)
		ix.buf[1] = byte(trigram >> 8)
		ix.buf[2] = byte(trigram)

		// posting list
		// The list is the 3-byte trigram followed by the file IDs, each
		// written as the varint-encoded difference from the previous ID.
		// fileid starts at ^uint32(0), i.e. -1, so the first value written
		// is the first file ID plus one and no real delta is ever 0;
		// a 0 therefore terminates the list.
		fileid := ^uint32(0)
		nfile := uint32(0)
		out.write(ix.buf[:3])
		for ; e.trigram() == trigram && trigram != 1<<24-1; e = h.next() {
			out.writeUvarint(e.fileid() - fileid)
			fileid = e.fileid()
			nfile++
		}
		out.writeUvarint(0)

		// index entry
		ix.postIndex.write(ix.buf[:3])
		ix.postIndex.writeUint32(nfile)
		ix.postIndex.writeUint32(offset)

		// h.next returns trigram 1<<24-1 once the heap is exhausted.
		// That sentinel gets an empty posting list and ends the loop.
		if trigram == 1<<24-1 {
			break
		}
	}
}
326 | 
327 | // A postChunk represents a chunk of post entries flushed to disk or
328 | // still in memory.
329 | type postChunk struct {
330 | 	e postEntry   // next entry
331 | 	m []postEntry // remaining entries after e
332 | }
333 | 
334 | const postBuf = 4096
335 | 
336 | // A postHeap is a heap (priority queue) of postChunks.
337 | type postHeap struct {
338 | 	ch []*postChunk
339 | }
340 | 
341 | func (h *postHeap) addFile(f *os.File) {
342 | 	data := mmapFile(f).d
343 | 	m := (*[npost]postEntry)(unsafe.Pointer(&data[0]))[:len(data)/8]
344 | 	h.addMem(m)
345 | }
346 | 
347 | func (h *postHeap) addMem(x []postEntry) {
348 | 	h.add(&postChunk{m: x})
349 | }
350 | 
351 | // step reads the next entry from ch and saves it in ch.e.
352 | // It returns false if ch is over.
353 | func (h *postHeap) step(ch *postChunk) bool {
354 | 	old := ch.e
355 | 	m := ch.m
356 | 	if len(m) == 0 {
357 | 		return false
358 | 	}
359 | 	ch.e = postEntry(m[0])
360 | 	m = m[1:]
361 | 	ch.m = m
362 | 	if old >= ch.e {
363 | 		panic("bad sort")
364 | 	}
365 | 	return true
366 | }
367 | 
368 | // add adds the chunk to the postHeap.
369 | // All adds must be called before the first call to next.
370 | func (h *postHeap) add(ch *postChunk) {
371 | 	if len(ch.m) > 0 {
372 | 		ch.e = ch.m[0]
373 | 		ch.m = ch.m[1:]
374 | 		h.push(ch)
375 | 	}
376 | }
377 | 
378 | // empty reports whether the postHeap is empty.
379 | func (h *postHeap) empty() bool {
380 | 	return len(h.ch) == 0
381 | }
382 | 
383 | // next returns the next entry from the postHeap.
384 | // It returns a postEntry with trigram == 1<<24 - 1 if h is empty.
385 | func (h *postHeap) next() postEntry {
386 | 	if len(h.ch) == 0 {
387 | 		return makePostEntry(1<<24-1, 0)
388 | 	}
389 | 	ch := h.ch[0]
390 | 	e := ch.e
391 | 	m := ch.m
392 | 	if len(m) == 0 {
393 | 		h.pop()
394 | 	} else {
395 | 		ch.e = m[0]
396 | 		ch.m = m[1:]
397 | 		h.siftDown(0)
398 | 	}
399 | 	return e
400 | }
401 | 
402 | func (h *postHeap) pop() *postChunk {
403 | 	ch := h.ch[0]
404 | 	n := len(h.ch) - 1
405 | 	h.ch[0] = h.ch[n]
406 | 	h.ch = h.ch[:n]
407 | 	if n > 1 {
408 | 		h.siftDown(0)
409 | 	}
410 | 	return ch
411 | }
412 | 
413 | func (h *postHeap) push(ch *postChunk) {
414 | 	n := len(h.ch)
415 | 	h.ch = append(h.ch, ch)
416 | 	if len(h.ch) >= 2 {
417 | 		h.siftUp(n)
418 | 	}
419 | }
420 | 
421 | func (h *postHeap) siftDown(i int) {
422 | 	ch := h.ch
423 | 	for {
424 | 		j1 := 2*i + 1
425 | 		if j1 >= len(ch) {
426 | 			break
427 | 		}
428 | 		j := j1
429 | 		if j2 := j1 + 1; j2 < len(ch) && ch[j1].e >= ch[j2].e {
430 | 			j = j2
431 | 		}
432 | 		if ch[i].e < ch[j].e {
433 | 			break
434 | 		}
435 | 		ch[i], ch[j] = ch[j], ch[i]
436 | 		i = j
437 | 	}
438 | }
439 | 
440 | func (h *postHeap) siftUp(j int) {
441 | 	ch := h.ch
442 | 	for {
443 | 		i := (j - 1) / 2
444 | 		if i == j || ch[i].e < ch[j].e {
445 | 			break
446 | 		}
447 | 		ch[i], ch[j] = ch[j], ch[i]
448 | 		j = i
449 | 	}
450 | }
451 | 
// A bufWriter is a convenience wrapper: a buffered writer on top of an
// *os.File that remembers the file's name for error messages and can
// hand the file back for reading (see finish).
// I/O errors are fatal: they are reported with log.Fatalf.
type bufWriter struct {
	name string   // name of the underlying file
	file *os.File // underlying file
	buf  []byte   // pending output not yet written to file
	tmp  [8]byte
}

// bufCreate creates a new file with the given name and returns a
// corresponding bufWriter.  If name is empty, bufCreate uses a
// temporary file.
func bufCreate(name string) *bufWriter {
	var (
		f   *os.File
		err error
	)
	if name != "" {
		f, err = os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
	} else {
		f, err = ioutil.TempFile("", "csearch")
	}
	if err != nil {
		log.Fatal(err)
	}
	return &bufWriter{
		name: f.Name(),
		buf:  make([]byte, 0, 256<<10),
		file: f,
	}
}

// write appends x to the output.
func (b *bufWriter) write(x []byte) {
	n := cap(b.buf) - len(b.buf)
	if len(x) > n {
		b.flush()
		// Data at least as large as the whole buffer bypasses it.
		if len(x) >= cap(b.buf) {
			if _, err := b.file.Write(x); err != nil {
				log.Fatalf("writing %s: %v", b.name, err)
			}
			return
		}
	}
	b.buf = append(b.buf, x...)
}

// writeByte appends the single byte x to the output.
func (b *bufWriter) writeByte(x byte) {
	if len(b.buf) >= cap(b.buf) {
		b.flush()
	}
	b.buf = append(b.buf, x)
}

// writeString appends s to the output.
func (b *bufWriter) writeString(s string) {
	n := cap(b.buf) - len(b.buf)
	if len(s) > n {
		b.flush()
		// Data at least as large as the whole buffer bypasses it.
		if len(s) >= cap(b.buf) {
			if _, err := b.file.WriteString(s); err != nil {
				log.Fatalf("writing %s: %v", b.name, err)
			}
			return
		}
	}
	b.buf = append(b.buf, s...)
}

// offset returns the current write offset: the file position plus the
// number of bytes still buffered.  It is a fatal error if the file
// position cannot be determined or the offset does not fit in 32 bits.
func (b *bufWriter) offset() uint32 {
	// Seek(0, 1) moves zero bytes from the current position,
	// which reports the position without changing it.
	off, err := b.file.Seek(0, 1)
	if err != nil {
		// A bogus position would silently corrupt the offsets recorded
		// in the index, so treat it like any other I/O failure.
		log.Fatalf("seeking %s: %v", b.name, err)
	}
	off += int64(len(b.buf))
	if int64(uint32(off)) != off {
		log.Fatalf("index is larger than 4GB")
	}
	return uint32(off)
}

// flush writes any buffered output to the file.
func (b *bufWriter) flush() {
	if len(b.buf) == 0 {
		return
	}
	_, err := b.file.Write(b.buf)
	if err != nil {
		log.Fatalf("writing %s: %v", b.name, err)
	}
	b.buf = b.buf[:0]
}

// finish flushes the file to disk and returns an open file ready for reading,
// positioned at the start of the file.  A failure to rewind is fatal.
func (b *bufWriter) finish() *os.File {
	b.flush()
	f := b.file
	if _, err := f.Seek(0, 0); err != nil {
		log.Fatalf("seeking %s: %v", b.name, err)
	}
	return f
}

// writeTrigram appends the low 24 bits of t, most significant byte first.
func (b *bufWriter) writeTrigram(t uint32) {
	if cap(b.buf)-len(b.buf) < 3 {
		b.flush()
	}
	b.buf = append(b.buf, byte(t>>16), byte(t>>8), byte(t))
}

// writeUint32 appends x as four bytes, most significant byte first.
func (b *bufWriter) writeUint32(x uint32) {
	if cap(b.buf)-len(b.buf) < 4 {
		b.flush()
	}
	b.buf = append(b.buf, byte(x>>24), byte(x>>16), byte(x>>8), byte(x))
}

// writeUvarint appends x in a variable-length encoding: seven bits per
// byte, least significant group first, with the high bit set on every
// byte except the last.  A uint32 takes at most five bytes.
func (b *bufWriter) writeUvarint(x uint32) {
	if cap(b.buf)-len(b.buf) < 5 {
		b.flush()
	}
	switch {
	case x < 1<<7:
		b.buf = append(b.buf, byte(x))
	case x < 1<<14:
		b.buf = append(b.buf, byte(x|0x80), byte(x>>7))
	case x < 1<<21:
		b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14))
	case x < 1<<28:
		b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21))
	default:
		b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21|0x80), byte(x>>28))
	}
}
578 | 
// validUTF8 reports whether the byte c1 followed by the byte c2 can
// appear in a valid sequence of UTF-8-encoded code points.
func validUTF8(c1, c2 uint32) bool {
	if c1 < 0x80 {
		// c1 is a 1-byte sequence: c2 must be another 1-byte sequence
		// or the first byte of a multi-byte one.
		return c2 < 0x80 || (0xc0 <= c2 && c2 < 0xf8)
	}
	if c1 < 0xc0 {
		// c1 is a continuation byte: nearly anything may follow.
		return c2 < 0xf8
	}
	if c1 < 0xf8 {
		// c1 starts a multi-byte sequence: c2 must be a continuation byte.
		return 0x80 <= c2 && c2 < 0xc0
	}
	return false
}
595 | 
596 | // sortPost sorts the postentry list.
597 | // The list is already sorted by fileid (bottom 32 bits)
598 | // and the top 8 bits are always zero, so there are only
599 | // 24 bits to sort.  Run two rounds of 12-bit radix sort.
600 | const sortK = 12
601 | 
602 | var sortTmp []postEntry
603 | var sortN [1 << sortK]int
604 | 
605 | func sortPost(post []postEntry) {
606 | 	if len(post) > len(sortTmp) {
607 | 		sortTmp = make([]postEntry, len(post))
608 | 	}
609 | 	tmp := sortTmp[:len(post)]
610 | 
611 | 	const k = sortK
612 | 	for i := range sortN {
613 | 		sortN[i] = 0
614 | 	}
615 | 	for _, p := range post {
616 | 		r := uintptr(p>>32) & (1<<k - 1)
617 | 		sortN[r]++
618 | 	}
619 | 	tot := 0
620 | 	for i, count := range sortN {
621 | 		sortN[i] = tot
622 | 		tot += count
623 | 	}
624 | 	for _, p := range post {
625 | 		r := uintptr(p>>32) & (1<<k - 1)
626 | 		o := sortN[r]
627 | 		sortN[r]++
628 | 		tmp[o] = p
629 | 	}
630 | 	tmp, post = post, tmp
631 | 
632 | 	for i := range sortN {
633 | 		sortN[i] = 0
634 | 	}
635 | 	for _, p := range post {
636 | 		r := uintptr(p>>(32+k)) & (1<<k - 1)
637 | 		sortN[r]++
638 | 	}
639 | 	tot = 0
640 | 	for i, count := range sortN {
641 | 		sortN[i] = tot
642 | 		tot += count
643 | 	}
644 | 	for _, p := range post {
645 | 		r := uintptr(p>>(32+k)) & (1<<k - 1)
646 | 		o := sortN[r]
647 | 		sortN[r]++
648 | 		tmp[o] = p
649 | 	}
650 | }
651 | 


--------------------------------------------------------------------------------
/index/write_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package index
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"io/ioutil"
 10 | 	"os"
 11 | 	"sort"
 12 | 	"strings"
 13 | 	"testing"
 14 | )
 15 | 
// trivialFiles maps file names to file contents for the small corpus
// that testTrivialWrite indexes and compares against trivialIndex.
var trivialFiles = map[string]string{
	"f0":       "\n\n",
	"file1":    "\na\n",
	"thefile2": "\nab\n",
	"file3":    "\nabc\n",
	"afile4":   "\ndabc\n",
	"file5":    "\nxyzw\n",
}
 24 | 
// trivialIndex is the exact byte sequence testTrivialWrite expects to find
// in the index file built from trivialFiles. The sections appear in file
// order; u32 values are 4-byte big-endian integers and fileList values are
// delta-encoded posting lists. The file ids passed to fileList are
// positions in the sorted list of names.
var trivialIndex = join(
	// header
	"csearch index 1\n",

	// list of paths
	"\x00",

	// list of names
	"afile4\x00",
	"f0\x00",
	"file1\x00",
	"file3\x00",
	"file5\x00",
	"thefile2\x00",
	"\x00",

	// list of posting lists
	"\na\n", fileList(2), // file1
	"\nab", fileList(3, 5), // file3, thefile2
	"\nda", fileList(0), // afile4
	"\nxy", fileList(4), // file5
	"ab\n", fileList(5), // thefile2
	"abc", fileList(0, 3), // afile4, file3
	"bc\n", fileList(0, 3), // afile4, file3
	"dab", fileList(0), // afile4
	"xyz", fileList(4), // file5
	"yzw", fileList(4), // file5
	"zw\n", fileList(4), // file5
	"\xff\xff\xff", fileList(),

	// name index: offset of each name within the list of names
	u32(0),
	u32(6+1),
	u32(6+1+2+1),
	u32(6+1+2+1+5+1),
	u32(6+1+2+1+5+1+5+1),
	u32(6+1+2+1+5+1+5+1+5+1),
	u32(6+1+2+1+5+1+5+1+5+1+8+1),

	// posting list index: trigram, file count, offset of its posting list
	"\na\n", u32(1), u32(0),
	"\nab", u32(2), u32(5),
	"\nda", u32(1), u32(5+6),
	"\nxy", u32(1), u32(5+6+5),
	"ab\n", u32(1), u32(5+6+5+5),
	"abc", u32(2), u32(5+6+5+5+5),
	"bc\n", u32(2), u32(5+6+5+5+5+6),
	"dab", u32(1), u32(5+6+5+5+5+6+6),
	"xyz", u32(1), u32(5+6+5+5+5+6+6+5),
	"yzw", u32(1), u32(5+6+5+5+5+6+6+5+5),
	"zw\n", u32(1), u32(5+6+5+5+5+6+6+5+5+5),
	"\xff\xff\xff", u32(0), u32(5+6+5+5+5+6+6+5+5+5+5),

	// trailer: running offsets of the sections above
	u32(16),
	u32(16+1),
	u32(16+1+38),
	u32(16+1+38+62),
	u32(16+1+38+62+28),

	"\ncsearch trailr\n",
)
 87 | 
 88 | func join(s ...string) string {
 89 | 	return strings.Join(s, "")
 90 | }
 91 | 
 92 | func u32(x uint32) string {
 93 | 	var buf [4]byte
 94 | 	buf[0] = byte(x >> 24)
 95 | 	buf[1] = byte(x >> 16)
 96 | 	buf[2] = byte(x >> 8)
 97 | 	buf[3] = byte(x)
 98 | 	return string(buf[:])
 99 | }
100 | 
// fileList returns the encoded posting list for the given file ids.
// Each id is written as the gap from the previous id (the first gap is
// id+1, because the previous id starts at -1), using a base-128 varint
// with the low 7 bits first and the high bit set on every byte but the
// last. A single zero byte terminates the list.
func fileList(list ...uint32) string {
	out := make([]byte, 0, len(list)+1)
	prev := ^uint32(0)
	for _, id := range list {
		for gap := id - prev; ; gap >>= 7 {
			if gap < 0x80 {
				out = append(out, byte(gap))
				break
			}
			out = append(out, byte(gap)|0x80)
		}
		prev = id
	}
	return string(append(out, 0))
}
117 | 
118 | func buildFlushIndex(out string, paths []string, doFlush bool, fileData map[string]string) {
119 | 	ix := Create(out)
120 | 	ix.AddPaths(paths)
121 | 	var files []string
122 | 	for name := range fileData {
123 | 		files = append(files, name)
124 | 	}
125 | 	sort.Strings(files)
126 | 	for _, name := range files {
127 | 		ix.Add(name, strings.NewReader(fileData[name]))
128 | 	}
129 | 	if doFlush {
130 | 		ix.flushPost()
131 | 	}
132 | 	ix.Flush()
133 | }
134 | 
// buildIndex writes an index for fileData to the file called name,
// without the extra flushPost call (buildFlushIndex with doFlush=false).
func buildIndex(name string, paths []string, fileData map[string]string) {
	buildFlushIndex(name, paths, false, fileData)
}
138 | 
139 | func testTrivialWrite(t *testing.T, doFlush bool) {
140 | 	f, _ := ioutil.TempFile("", "index-test")
141 | 	defer os.Remove(f.Name())
142 | 	out := f.Name()
143 | 	buildFlushIndex(out, nil, doFlush, trivialFiles)
144 | 
145 | 	data, err := ioutil.ReadFile(out)
146 | 	if err != nil {
147 | 		t.Fatalf("reading _test/index.triv: %v", err)
148 | 	}
149 | 	want := []byte(trivialIndex)
150 | 	if !bytes.Equal(data, want) {
151 | 		i := 0
152 | 		for i < len(data) && i < len(want) && data[i] == want[i] {
153 | 			i++
154 | 		}
155 | 		t.Fatalf("wrong index:\nhave: %q %q\nwant: %q %q", data[:i], data[i:], want[:i], want[i:])
156 | 	}
157 | }
158 | 
// TestTrivialWrite checks the index bytes produced without the extra
// flushPost call.
func TestTrivialWrite(t *testing.T) {
	testTrivialWrite(t, false)
}

// TestTrivialWriteDisk checks that the same bytes are produced when
// flushPost is called before the final Flush.
// NOTE(review): presumably flushPost spills the in-memory posting data to
// a temporary file on disk; confirm against write.go.
func TestTrivialWriteDisk(t *testing.T) {
	testTrivialWrite(t, true)
}
166 | 
167 | func TestHeap(t *testing.T) {
168 | 	h := &postHeap{}
169 | 	es := []postEntry{7, 4, 3, 2, 4}
170 | 	for _, e := range es {
171 | 		h.addMem([]postEntry{e})
172 | 	}
173 | 	if len(h.ch) != len(es) {
174 | 		t.Fatalf("wrong heap size: %d, want %d", len(h.ch), len(es))
175 | 	}
176 | 	for a, b := h.next(), h.next(); b.trigram() != (1<<24 - 1); a, b = b, h.next() {
177 | 		if a > b {
178 | 			t.Fatalf("%d should <= %d", a, b)
179 | 		}
180 | 	}
181 | }
182 | 


--------------------------------------------------------------------------------
/lib/README.template:
--------------------------------------------------------------------------------
 1 | These are the command-line Code Search tools from
 2 | https://github.com/google/codesearch.
 3 | 
 4 | These binaries are for ARCH systems running OPERSYS.
 5 | 
 6 | To get started, run cindex with a list of directories to index:
 7 | 
 8 | 	cindex /usr/include $HOME/src
 9 | 
10 | Then run csearch to run grep over all the indexed sources:
11 | 
12 | 	csearch DATAKIT
13 | 
14 | For details, run either command with the -help option, and
15 | read http://swtch.com/~rsc/regexp/regexp4.html.
16 | 


--------------------------------------------------------------------------------
/lib/buildall:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This script builds the code search binaries for a variety of OS/architecture combinations.
 4 | 
# Load the os and arch helpers and $version; setup also turns on set -e.
. ./setup

# Install the per-architecture toolchain commands 5c, 5g, 5a, 5l, 6c, ... 8l
# that the cross-compiles below rely on.
for i in {5,6,8}{c,g,a,l}
do
	go tool dist install cmd/$i
done
11 | 
# build cross-compiles cgrep, cindex and csearch for one target and packages
# them with a README as codesearch-$version-$goos-$goarch.zip in the
# current directory.
#   $1  target in goos/goarch form, e.g. linux/amd64
build() {
	echo "# $1"
	goos=$(echo "$1" | sed 's;/.*;;')
	goarch=$(echo "$1" | sed 's;.*/;;')
	# The import path matches the one the sources themselves import.
	GOOS=$goos GOARCH=$goarch CGO_ENABLED=0 \
		go install -a github.com/google/codesearch/cmd/{cgrep,cindex,csearch}
	rm -rf "codesearch-$version"
	mkdir "codesearch-$version"
	mv ~/g/bin/{cgrep,cindex,csearch}* "codesearch-$version"
	chmod +x "codesearch-$version"/*
	sed "s/ARCH/$(arch "$goarch")/; s/OPERSYS/$(os "$goos")/" README.template >"codesearch-$version/README.txt"
	rm -f "codesearch-$version-$goos-$goarch.zip"
	# zip -z reads the archive comment from stdin; the README is used for it.
	zip -z -r "codesearch-$version-$goos-$goarch.zip" "codesearch-$version" <"codesearch-$version/README.txt"
	# Remove the staging directory for whatever version was just built.
	rm -rf "codesearch-$version"
}
27 | 
# Build one zip per OS/architecture pair (8 targets in total).
for i in {linux,darwin,freebsd,windows}/{amd64,386}
do
	build $i
done
32 | 


--------------------------------------------------------------------------------
/lib/setup:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | os() {
 4 | 	case "$1" in
 5 | 	freebsd) echo FreeBSD;;
 6 | 	linux) echo Linux;;
 7 | 	darwin) echo Mac OS X;;
 8 | 	openbsd) echo OpenBSD;;
 9 | 	netbsd) echo NetBSD;;
10 | 	windows) echo Windows;;
11 | 	*) echo $1;;
12 | 	esac
13 | }
14 | 
# arch prints the human-readable name of the architecture whose GOARCH
# value is $1. Unrecognized values are echoed back as given.
arch() {
	case "$1" in
	386)
		echo 32-bit x86
		;;
	amd64)
		echo 64-bit x86
		;;
	*)
		echo $1
		;;
	esac
}
22 | 
23 | version=$(cat version)
24 | 


--------------------------------------------------------------------------------
/lib/uploadall:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # gcodeup is a copy of $GOROOT/misc/dashboard/googlecode_upload.py.
 4 | 
 5 | . ./setup
 6 | user=$(sed -n 's/^re2.username = //' ~/.hgrc)
 7 | password=$(sed -n 's/^re2\.password = //' ~/.hgrc)
 8 | 
 9 | upload() {
10 | 	goos=$(echo $1 | sed "s/codesearch-$version-//; s/-.*//")
11 | 	goarch=$(echo $1 | sed "s/codesearch-$version-//; s/[a-z0-9]*-//; s/-.*//")
12 | 	gcodeup -s "binaries for $(os $goos) $(arch $goarch)" -p codesearch -u "$user" -w "$password" codesearch-$version-$1-$2.zip
13 | }
14 | 
# Upload every file in the current directory whose name starts with
# codesearch-$version- .
for i in codesearch-$version-*
do
	upload $i
done
19 | 


--------------------------------------------------------------------------------
/lib/version:
--------------------------------------------------------------------------------
1 | 0.01
2 | 


--------------------------------------------------------------------------------
/regexp/copy.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | // Copied from Go's regexp/syntax.
  6 | // Formatters edited to handle instByteRange.
  7 | 
  8 | package regexp
  9 | 
 10 | import (
 11 | 	"bytes"
 12 | 	"fmt"
 13 | 	"regexp/syntax"
 14 | 	"sort"
 15 | 	"strconv"
 16 | 	"unicode"
 17 | )
 18 | 
 19 | // cleanClass sorts the ranges (pairs of elements of r),
 20 | // merges them, and eliminates duplicates.
 21 | func cleanClass(rp *[]rune) []rune {
 22 | 
 23 | 	// Sort by lo increasing, hi decreasing to break ties.
 24 | 	sort.Sort(ranges{rp})
 25 | 
 26 | 	r := *rp
 27 | 	if len(r) < 2 {
 28 | 		return r
 29 | 	}
 30 | 
 31 | 	// Merge abutting, overlapping.
 32 | 	w := 2 // write index
 33 | 	for i := 2; i < len(r); i += 2 {
 34 | 		lo, hi := r[i], r[i+1]
 35 | 		if lo <= r[w-1]+1 {
 36 | 			// merge with previous range
 37 | 			if hi > r[w-1] {
 38 | 				r[w-1] = hi
 39 | 			}
 40 | 			continue
 41 | 		}
 42 | 		// new disjoint range
 43 | 		r[w] = lo
 44 | 		r[w+1] = hi
 45 | 		w += 2
 46 | 	}
 47 | 
 48 | 	return r[:w]
 49 | }
 50 | 
 51 | // appendRange returns the result of appending the range lo-hi to the class r.
 52 | func appendRange(r []rune, lo, hi rune) []rune {
 53 | 	// Expand last range or next to last range if it overlaps or abuts.
 54 | 	// Checking two ranges helps when appending case-folded
 55 | 	// alphabets, so that one range can be expanding A-Z and the
 56 | 	// other expanding a-z.
 57 | 	n := len(r)
 58 | 	for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
 59 | 		if n >= i {
 60 | 			rlo, rhi := r[n-i], r[n-i+1]
 61 | 			if lo <= rhi+1 && rlo <= hi+1 {
 62 | 				if lo < rlo {
 63 | 					r[n-i] = lo
 64 | 				}
 65 | 				if hi > rhi {
 66 | 					r[n-i+1] = hi
 67 | 				}
 68 | 				return r
 69 | 			}
 70 | 		}
 71 | 	}
 72 | 
 73 | 	return append(r, lo, hi)
 74 | }
 75 | 
 76 | const (
 77 | 	// minimum and maximum runes involved in folding.
 78 | 	// checked during test.
 79 | 	minFold = 0x0041
 80 | 	maxFold = 0x1044f
 81 | )
 82 | 
 83 | // appendFoldedRange returns the result of appending the range lo-hi
 84 | // and its case folding-equivalent runes to the class r.
 85 | func appendFoldedRange(r []rune, lo, hi rune) []rune {
 86 | 	// Optimizations.
 87 | 	if lo <= minFold && hi >= maxFold {
 88 | 		// Range is full: folding can't add more.
 89 | 		return appendRange(r, lo, hi)
 90 | 	}
 91 | 	if hi < minFold || lo > maxFold {
 92 | 		// Range is outside folding possibilities.
 93 | 		return appendRange(r, lo, hi)
 94 | 	}
 95 | 	if lo < minFold {
 96 | 		// [lo, minFold-1] needs no folding.
 97 | 		r = appendRange(r, lo, minFold-1)
 98 | 		lo = minFold
 99 | 	}
100 | 	if hi > maxFold {
101 | 		// [maxFold+1, hi] needs no folding.
102 | 		r = appendRange(r, maxFold+1, hi)
103 | 		hi = maxFold
104 | 	}
105 | 
106 | 	// Brute force.  Depend on appendRange to coalesce ranges on the fly.
107 | 	for c := lo; c <= hi; c++ {
108 | 		r = appendRange(r, c, c)
109 | 		f := unicode.SimpleFold(c)
110 | 		for f != c {
111 | 			r = appendRange(r, f, f)
112 | 			f = unicode.SimpleFold(f)
113 | 		}
114 | 	}
115 | 	return r
116 | }
117 | 
// ranges adapts a character class, stored as consecutive lo,hi pairs in
// a []rune, to sort.Interface, with each pair treated as one element.
// Wrapping the *[]rune looks odd, but the caller already holds one,
// so no allocation is needed to sort.
type ranges struct {
	p *[]rune
}

// Len returns the number of lo,hi pairs.
func (ra ranges) Len() int {
	return len(*ra.p) / 2
}

// Less orders pairs by lo ascending and, for equal lo, by hi descending.
func (ra ranges) Less(i, j int) bool {
	p := *ra.p
	if p[2*i] != p[2*j] {
		return p[2*i] < p[2*j]
	}
	return p[2*i+1] > p[2*j+1]
}

// Swap exchanges pairs i and j.
func (ra ranges) Swap(i, j int) {
	p := *ra.p
	a, b := p[2*i:2*i+2], p[2*j:2*j+2]
	a[0], b[0] = b[0], a[0]
	a[1], b[1] = b[1], a[1]
}
143 | 
// progString returns the text listing of p produced by dumpProg.
func progString(p *syntax.Prog) string {
	var b bytes.Buffer
	dumpProg(&b, p)
	return b.String()
}

// instString returns the text form of the single instruction i produced
// by dumpInst.
func instString(i *syntax.Inst) string {
	var b bytes.Buffer
	dumpInst(&b, i)
	return b.String()
}
155 | 
// bw appends each of args to b, in order.
func bw(b *bytes.Buffer, args ...string) {
	for i := 0; i < len(args); i++ {
		b.WriteString(args[i])
	}
}
161 | 
162 | func dumpProg(b *bytes.Buffer, p *syntax.Prog) {
163 | 	for j := range p.Inst {
164 | 		i := &p.Inst[j]
165 | 		pc := strconv.Itoa(j)
166 | 		if len(pc) < 3 {
167 | 			b.WriteString("   "[len(pc):])
168 | 		}
169 | 		if j == p.Start {
170 | 			pc += "*"
171 | 		}
172 | 		bw(b, pc, "\t")
173 | 		dumpInst(b, i)
174 | 		bw(b, "\n")
175 | 	}
176 | }
177 | 
// u32 formats i as a decimal string.
func u32(i uint32) string {
	return strconv.FormatUint(uint64(i), 10)
}
181 | 
// dumpInst writes a one-line text form of instruction i to b, without a
// trailing newline. Besides the standard syntax opcodes it handles
// instByteRange, printed as "byte lo-hi" in hex with "/i" appended when
// the argFold bit is set in i.Arg. Opcodes not listed write nothing.
func dumpInst(b *bytes.Buffer, i *syntax.Inst) {
	switch i.Op {
	case syntax.InstAlt:
		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
	case syntax.InstAltMatch:
		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
	case syntax.InstCapture:
		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
	case syntax.InstEmptyWidth:
		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
	case syntax.InstMatch:
		bw(b, "match")
	case syntax.InstFail:
		bw(b, "fail")
	case syntax.InstNop:
		bw(b, "nop -> ", u32(i.Out))
	case instByteRange:
		// The low byte of Arg is the upper bound and the next byte the lower bound.
		fmt.Fprintf(b, "byte %02x-%02x", (i.Arg>>8)&0xFF, i.Arg&0xFF)
		if i.Arg&argFold != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))

	// Should not happen
	case syntax.InstRune:
		if i.Rune == nil {
			// shouldn't happen
			// NOTE(review): there is no return here, so the regular
			// "rune ..." form below is written after "rune <nil>" as well.
			bw(b, "rune <nil>")
		}
		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
		if syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
			bw(b, "/i")
		}
		bw(b, " -> ", u32(i.Out))
	case syntax.InstRune1:
		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
	case syntax.InstRuneAny:
		bw(b, "any -> ", u32(i.Out))
	case syntax.InstRuneAnyNotNL:
		bw(b, "anynotnl -> ", u32(i.Out))
	}
}
224 | 


--------------------------------------------------------------------------------
/regexp/match.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"encoding/binary"
 10 | 	"flag"
 11 | 	"fmt"
 12 | 	"io"
 13 | 	"os"
 14 | 	"regexp/syntax"
 15 | 	"sort"
 16 | 
 17 | 	"github.com/google/codesearch/sparse"
 18 | )
 19 | 
// A matcher holds the state for running regular expression search.
// The two start states are built by init; every other DFA state is
// created on demand by computeNext and memoized by cache.
type matcher struct {
	prog      *syntax.Prog       // compiled program
	dstate    map[string]*dstate // dstate cache, keyed by encoded nstate
	start     *dstate            // start state for beginning of text
	startLine *dstate            // start state for beginning of line
	z1, z2    nstate             // two temporary nstates, used as scratch by computeNext
}
 28 | 
// An nstate corresponds to an NFA state: the set of program instructions
// the search is currently at, plus the flags describing the text position.
type nstate struct {
	q       sparse.Set // queue of program instructions
	partial rune       // partially decoded rune (TODO)
	flag    flags      // flags (TODO)
}
 35 | 
// The flags record state about a position between bytes in the text.
// Each constant is a single bit, so the values can be or'ed together.
type flags uint32

const (
	flagBOL  flags = 1 << iota // beginning of line
	flagEOL                    // end of line
	flagBOT                    // beginning of text
	flagEOT                    // end of text
	flagWord                   // last byte was word byte
)
 46 | 
// A dstate corresponds to a DFA state.
// Transitions are filled in lazily by the match loops: a nil entry in
// next means the transition for that byte has not been computed yet.
type dstate struct {
	next     [256]*dstate // next state, per byte
	enc      string       // encoded nstate
	matchNL  bool         // match when next byte is \n
	matchEOT bool         // match in this state at end of text
}
 54 | 
// String formats z for debugging as queue/flag+partial, with flag and
// partial printed in hex.
func (z *nstate) String() string {
	return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.partial)
}
 58 | 
 59 | // enc encodes z as a string.
 60 | func (z *nstate) enc() string {
 61 | 	var buf []byte
 62 | 	var v [10]byte
 63 | 	last := ^uint32(0)
 64 | 	n := binary.PutUvarint(v[:], uint64(z.partial))
 65 | 	buf = append(buf, v[:n]...)
 66 | 	n = binary.PutUvarint(v[:], uint64(z.flag))
 67 | 	buf = append(buf, v[:n]...)
 68 | 	dense := z.q.Dense()
 69 | 	ids := make([]int, 0, len(dense))
 70 | 	for _, id := range z.q.Dense() {
 71 | 		ids = append(ids, int(id))
 72 | 	}
 73 | 	sort.Ints(ids)
 74 | 	for _, id := range ids {
 75 | 		n := binary.PutUvarint(v[:], uint64(uint32(id)-last))
 76 | 		buf = append(buf, v[:n]...)
 77 | 		last = uint32(id)
 78 | 	}
 79 | 	return string(buf)
 80 | }
 81 | 
 82 | // dec decodes the encoding s into z.
 83 | func (z *nstate) dec(s string) {
 84 | 	b := []byte(s)
 85 | 	i, n := binary.Uvarint(b)
 86 | 	if n <= 0 {
 87 | 		bug()
 88 | 	}
 89 | 	b = b[n:]
 90 | 	z.partial = rune(i)
 91 | 	i, n = binary.Uvarint(b)
 92 | 	if n <= 0 {
 93 | 		bug()
 94 | 	}
 95 | 	b = b[n:]
 96 | 	z.flag = flags(i)
 97 | 	z.q.Reset()
 98 | 	last := ^uint32(0)
 99 | 	for len(b) > 0 {
100 | 		i, n = binary.Uvarint(b)
101 | 		if n <= 0 {
102 | 			bug()
103 | 		}
104 | 		b = b[n:]
105 | 		last += uint32(i)
106 | 		z.q.Add(last)
107 | 	}
108 | }
109 | 
// dmatch is the state we're in when we've seen a match and are just
// waiting for the end of the line.
var dmatch = dstate{
	matchNL:  true,
	matchEOT: true,
}

// init finishes dmatch: its encoding is that of an empty nstate, and every
// byte except '\n' loops back to dmatch. The '\n' entry stays nil, so the
// match loops take their newline branch and report the match there.
func init() {
	var z nstate
	dmatch.enc = z.enc()
	for i := range dmatch.next {
		if i != '\n' {
			dmatch.next[i] = &dmatch
		}
	}
}
126 | 
// init initializes the matcher for prog. It sizes the two scratch queues
// to the number of instructions and builds the two start states: m.start
// for the beginning of the text and m.startLine for the beginning of a
// later line. It always returns nil.
func (m *matcher) init(prog *syntax.Prog) error {
	m.prog = prog
	m.dstate = make(map[string]*dstate)

	m.z1.q.Init(uint32(len(prog.Inst)))
	m.z2.q.Init(uint32(len(prog.Inst)))

	// Start of text is also the start of a line.
	m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText)
	m.z1.flag = flagBOL | flagBOT
	m.start = m.cache(&m.z1)

	// cache uses z1 as scratch, so clear it before building the next state.
	m.z1.q.Reset()
	m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine)
	m.z1.flag = flagBOL
	m.startLine = m.cache(&m.z1)

	return nil
}
146 | 
147 | // stepEmpty steps runq to nextq expanding according to flag.
148 | func (m *matcher) stepEmpty(runq, nextq *sparse.Set, flag syntax.EmptyOp) {
149 | 	nextq.Reset()
150 | 	for _, id := range runq.Dense() {
151 | 		m.addq(nextq, id, flag)
152 | 	}
153 | }
154 | 
155 | // stepByte steps runq to nextq consuming c and then expanding according to flag.
156 | // It returns true if a match ends immediately before c.
157 | // c is either an input byte or endText.
158 | func (m *matcher) stepByte(runq, nextq *sparse.Set, c int, flag syntax.EmptyOp) (match bool) {
159 | 	nextq.Reset()
160 | 	m.addq(nextq, uint32(m.prog.Start), flag)
161 | 	for _, id := range runq.Dense() {
162 | 		i := &m.prog.Inst[id]
163 | 		switch i.Op {
164 | 		default:
165 | 			continue
166 | 		case syntax.InstMatch:
167 | 			match = true
168 | 			continue
169 | 		case instByteRange:
170 | 			if c == endText {
171 | 				break
172 | 			}
173 | 			lo := int((i.Arg >> 8) & 0xFF)
174 | 			hi := int(i.Arg & 0xFF)
175 | 			ch := c
176 | 			if i.Arg&argFold != 0 && 'a' <= ch && ch <= 'z' {
177 | 				ch += 'A' - 'a'
178 | 			}
179 | 			if lo <= ch && ch <= hi {
180 | 				m.addq(nextq, i.Out, flag)
181 | 			}
182 | 		}
183 | 	}
184 | 	return
185 | }
186 | 
// addq adds id to the queue, expanding according to flag.
// After adding id it follows the instructions that consume no input:
// captures and nops lead to Out, alternations to both Out and Arg, and an
// empty-width instruction to Out only when every condition in its Arg is
// present in flag. An id already in q is not visited again, which also
// ends the recursion on loops in the program.
func (m *matcher) addq(q *sparse.Set, id uint32, flag syntax.EmptyOp) {
	if q.Has(id) {
		return
	}
	q.Add(id)
	i := &m.prog.Inst[id]
	switch i.Op {
	case syntax.InstCapture, syntax.InstNop:
		m.addq(q, i.Out, flag)
	case syntax.InstAlt, syntax.InstAltMatch:
		m.addq(q, i.Out, flag)
		m.addq(q, i.Arg, flag)
	case syntax.InstEmptyWidth:
		// All required conditions must be among those in effect.
		if syntax.EmptyOp(i.Arg)&^flag == 0 {
			m.addq(q, i.Out, flag)
		}
	}
}
206 | 
207 | const endText = -1
208 | 
209 | // computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
210 | func (m *matcher) computeNext(d *dstate, c int) *dstate {
211 | 	this, next := &m.z1, &m.z2
212 | 	this.dec(d.enc)
213 | 
214 | 	// compute flags in effect before c
215 | 	flag := syntax.EmptyOp(0)
216 | 	if this.flag&flagBOL != 0 {
217 | 		flag |= syntax.EmptyBeginLine
218 | 	}
219 | 	if this.flag&flagBOT != 0 {
220 | 		flag |= syntax.EmptyBeginText
221 | 	}
222 | 	if this.flag&flagWord != 0 {
223 | 		if !isWordByte(c) {
224 | 			flag |= syntax.EmptyWordBoundary
225 | 		} else {
226 | 			flag |= syntax.EmptyNoWordBoundary
227 | 		}
228 | 	} else {
229 | 		if isWordByte(c) {
230 | 			flag |= syntax.EmptyWordBoundary
231 | 		} else {
232 | 			flag |= syntax.EmptyNoWordBoundary
233 | 		}
234 | 	}
235 | 	if c == '\n' {
236 | 		flag |= syntax.EmptyEndLine
237 | 	}
238 | 	if c == endText {
239 | 		flag |= syntax.EmptyEndLine | syntax.EmptyEndText
240 | 	}
241 | 
242 | 	// re-expand queue using new flags.
243 | 	// TODO: only do this when it matters
244 | 	// (something is gating on word boundaries).
245 | 	m.stepEmpty(&this.q, &next.q, flag)
246 | 	this, next = next, this
247 | 
248 | 	// now compute flags after c.
249 | 	flag = 0
250 | 	next.flag = 0
251 | 	if c == '\n' {
252 | 		flag |= syntax.EmptyBeginLine
253 | 		next.flag |= flagBOL
254 | 	}
255 | 	if isWordByte(c) {
256 | 		next.flag |= flagWord
257 | 	}
258 | 
259 | 	// re-add start, process rune + expand according to flags.
260 | 	if m.stepByte(&this.q, &next.q, c, flag) {
261 | 		return &dmatch
262 | 	}
263 | 	return m.cache(next)
264 | }
265 | 
// cache returns the dstate for the NFA state z, creating and memoizing it
// under z's encoding if it does not exist yet. z is encoded before
// anything else because computeNext overwrites m.z1 and m.z2, and z
// usually points at one of them.
func (m *matcher) cache(z *nstate) *dstate {
	enc := z.enc()
	d := m.dstate[enc]
	if d != nil {
		return d
	}

	d = &dstate{enc: enc}
	// Store d before computing its flags: computeNext can call cache
	// again, and may ask for this very state.
	m.dstate[enc] = d
	d.matchNL = m.computeNext(d, '\n') == &dmatch
	d.matchEOT = m.computeNext(d, endText) == &dmatch
	return d
}
279 | 
280 | func (m *matcher) match(b []byte, beginText, endText bool) (end int) {
281 | 	//	fmt.Printf("%v\n", m.prog)
282 | 
283 | 	d := m.startLine
284 | 	if beginText {
285 | 		d = m.start
286 | 	}
287 | 	//	m.z1.dec(d.enc)
288 | 	//	fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch)
289 | 	for i, c := range b {
290 | 		d1 := d.next[c]
291 | 		if d1 == nil {
292 | 			if c == '\n' {
293 | 				if d.matchNL {
294 | 					return i
295 | 				}
296 | 				d1 = m.startLine
297 | 			} else {
298 | 				d1 = m.computeNext(d, int(c))
299 | 			}
300 | 			d.next[c] = d1
301 | 		}
302 | 		d = d1
303 | 		//		m.z1.dec(d.enc)
304 | 		//		fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT)
305 | 	}
306 | 	if d.matchNL || endText && d.matchEOT {
307 | 		return len(b)
308 | 	}
309 | 	return -1
310 | }
311 | 
312 | func (m *matcher) matchString(b string, beginText, endText bool) (end int) {
313 | 	d := m.startLine
314 | 	if beginText {
315 | 		d = m.start
316 | 	}
317 | 	for i := 0; i < len(b); i++ {
318 | 		c := b[i]
319 | 		d1 := d.next[c]
320 | 		if d1 == nil {
321 | 			if c == '\n' {
322 | 				if d.matchNL {
323 | 					return i
324 | 				}
325 | 				d1 = m.startLine
326 | 			} else {
327 | 				d1 = m.computeNext(d, int(c))
328 | 			}
329 | 			d.next[c] = d1
330 | 		}
331 | 		d = d1
332 | 	}
333 | 	if d.matchNL || endText && d.matchEOT {
334 | 		return len(b)
335 | 	}
336 | 	return -1
337 | }
338 | 
// isWordByte reports whether the byte c is a word character: ASCII only.
// This is used to implement \b and \B.  This is not right for Unicode, but:
//	- it's hard to get right in a byte-at-a-time matching world
//	  (the DFA has only one-byte lookahead)
//	- this crude approximation is the same one PCRE uses
// Values outside the byte range, such as endText, are not word characters.
func isWordByte(c int) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return true
	}
	return c == '_'
}
350 | 
// Grep holds the configuration and working state for a grep-style search
// of files and readers using Regexp.
// TODO: document the intended usage.
type Grep struct {
	Regexp *Regexp   // regexp to search for
	Stdout io.Writer // output target
	Stderr io.Writer // error target

	L bool // L flag - print file names only
	C bool // C flag - print count of matches
	N bool // N flag - print line numbers
	H bool // H flag - do not print file names

	Match bool // set to true by Reader once any line has matched

	buf []byte // read buffer, allocated by Reader on first use and then reused
}
366 | 
// AddFlags registers the -l, -c, -n and -h command-line flags on the
// default flag set of package flag, bound to g.L, g.C, g.N and g.H.
func (g *Grep) AddFlags() {
	flag.BoolVar(&g.L, "l", false, "list matching files only")
	flag.BoolVar(&g.C, "c", false, "print match counts only")
	flag.BoolVar(&g.N, "n", false, "show line numbers")
	flag.BoolVar(&g.H, "h", false, "omit file names")
}
373 | 
// File searches the named file with Reader. If the file cannot be opened,
// the error is written to g.Stderr and the file is skipped.
func (g *Grep) File(name string) {
	f, err := os.Open(name)
	if err != nil {
		fmt.Fprintf(g.Stderr, "%s\n", err)
		return
	}
	defer f.Close()
	g.Reader(f, name)
}
383 | 
// nl is a single newline, in the []byte form the bytes package expects.
var nl = []byte{'\n'}

// countNL returns the number of newline bytes in b.
func countNL(b []byte) int {
	return bytes.Count(b, nl)
}
398 | 
// Reader searches the data read from r for lines matching g.Regexp and
// writes the results to g.Stdout. name labels the output and any error.
//
// Input is read in chunks into a reusable 1 MiB buffer. While more input
// may follow, only the part of the buffer up to and including its last
// newline is searched; the rest is carried over to the next read.
//
// Output depends on the flags. With L, name is printed once and Reader
// returns at the first match. With C, only a final "name: count" line is
// printed, and only if the count is positive. With N, each matching line
// is prefixed with its line number. Unless H is set, each matching line is
// prefixed with "name:". g.Match is set to true if any line matches.
// Read errors other than io.EOF and io.ErrUnexpectedEOF are reported on
// g.Stderr.
func (g *Grep) Reader(r io.Reader, name string) {
	if g.buf == nil {
		g.buf = make([]byte, 1<<20)
	}
	var (
		buf        = g.buf[:0]
		needLineno = g.N
		lineno     = 1
		count      = 0
		prefix     = ""
		beginText  = true
		endText    = false
	)
	if !g.H {
		prefix = name + ":"
	}
	for {
		// Fill the free tail of the buffer, after any carried-over data.
		n, err := io.ReadFull(r, buf[len(buf):cap(buf)])
		buf = buf[:len(buf)+n]
		end := len(buf)
		if err == nil {
			// The buffer filled up, so more input may follow: search only
			// up to the last complete line, if the buffer holds one.
			i := bytes.LastIndex(buf, nl)
			if i >= 0 {
				end = i + 1
			}
		} else {
			// Short read or error: this is the final chunk.
			endText = true
		}
		chunkStart := 0
		for chunkStart < end {
			// Match returns an offset into the slice it was given, or -1.
			m1 := g.Regexp.Match(buf[chunkStart:end], beginText, endText) + chunkStart
			beginText = false
			if m1 < chunkStart {
				// No further match in this chunk.
				break
			}
			g.Match = true
			if g.L {
				fmt.Fprintf(g.Stdout, "%s\n", name)
				return
			}
			// m1 is where the matching line ends; find where it starts.
			lineStart := bytes.LastIndex(buf[chunkStart:m1], nl) + 1 + chunkStart
			lineEnd := m1 + 1
			if lineEnd > end {
				lineEnd = end
			}
			if needLineno {
				// Count the lines skipped since the previous match.
				lineno += countNL(buf[chunkStart:lineStart])
			}
			line := buf[lineStart:lineEnd]
			// This local nl shadows the package-level nl from here on. It
			// supplies the newline when the matched line has none.
			nl := ""
			if len(line) == 0 || line[len(line)-1] != '\n' {
				nl = "\n"
			}
			switch {
			case g.C:
				count++
			case g.N:
				fmt.Fprintf(g.Stdout, "%s%d:%s%s", prefix, lineno, line, nl)
			default:
				fmt.Fprintf(g.Stdout, "%s%s%s", prefix, line, nl)
			}
			if needLineno {
				// Account for the matched line itself.
				lineno++
			}
			chunkStart = lineEnd
		}
		if needLineno && err == nil {
			// Count the lines after the last match, so numbering
			// continues correctly in the next chunk.
			lineno += countNL(buf[chunkStart:end])
		}
		// Move the unsearched tail to the front of the buffer.
		n = copy(buf, buf[end:])
		buf = buf[:n]
		if len(buf) == 0 && err != nil {
			if err != io.EOF && err != io.ErrUnexpectedEOF {
				fmt.Fprintf(g.Stderr, "%s: %v\n", name, err)
			}
			break
		}
	}
	if g.C && count > 0 {
		fmt.Fprintf(g.Stdout, "%s: %d\n", name, count)
	}
}
481 | 


--------------------------------------------------------------------------------
/regexp/regexp.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Package regexp implements regular expression search tuned for
 6 | // use in grep-like programs.
 7 | package regexp
 8 | 
 9 | import "regexp/syntax"
10 | 
// bug panics to report an internal inconsistency in this package.
func bug() {
	panic("codesearch/regexp: internal error")
}
14 | 
// Regexp is the representation of a compiled regular expression.
// A Regexp is NOT SAFE for concurrent use by multiple goroutines.
type Regexp struct {
	Syntax *syntax.Regexp // parsed expression, as returned by syntax.Parse (not simplified)
	expr   string         // original expression
	m      matcher        // matching state; it is modified while matching
}

// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
	return re.expr
}
// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against lines of text.
// The expression is parsed with Perl syntax flags. Any error from parsing,
// compiling, converting to a byte program or initializing the matcher is
// returned as is.
func Compile(expr string) (*Regexp, error) {
	re, err := syntax.Parse(expr, syntax.Perl)
	if err != nil {
		return nil, err
	}
	// The simplified form is used only to build the program;
	// the Syntax field keeps the expression as parsed.
	sre := re.Simplify()
	prog, err := syntax.Compile(sre)
	if err != nil {
		return nil, err
	}
	// Rewrite prog in place before handing it to the matcher.
	if err := toByteProg(prog); err != nil {
		return nil, err
	}
	r := &Regexp{
		Syntax: re,
		expr:   expr,
	}
	if err := r.m.init(prog); err != nil {
		return nil, err
	}
	return r, nil
}
52 | 
// Match searches b for a line matching the regular expression. It returns
// the offset at which the first matching line ends (the index of its
// terminating newline, or len(b) if the match runs to the end of b), or
// -1 if no line matches. beginText reports whether b starts at the
// beginning of the text; endText reports whether b ends at the end of it.
func (r *Regexp) Match(b []byte, beginText, endText bool) (end int) {
	return r.m.match(b, beginText, endText)
}

// MatchString is like Match but takes a string.
func (r *Regexp) MatchString(s string, beginText, endText bool) (end int) {
	return r.m.matchString(s, beginText, endText)
}
60 | 


--------------------------------------------------------------------------------
/regexp/regexp_test.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"bytes"
  9 | 	"reflect"
 10 | 	"strings"
 11 | 	"testing"
 12 | )
 13 | 
// nstateTests lists the nstate contents (queued instruction ids and
// partial rune) that TestNstateEnc round-trips through enc and dec.
var nstateTests = []struct {
	q       []uint32
	partial rune
}{
	{[]uint32{1, 2, 3}, 1},
	{[]uint32{1}, 1},
	{[]uint32{}, 0},
	{[]uint32{1, 2, 8}, 0x10FFF},
}
 23 | 
// TestNstateEnc checks that decoding an encoded nstate restores the same
// partial rune and the same queue contents. The flag field is left at
// zero and is not compared.
func TestNstateEnc(t *testing.T) {
	var n1, n2 nstate
	n1.q.Init(10)
	n2.q.Init(10)
	for _, tt := range nstateTests {
		n1.q.Reset()
		n1.partial = tt.partial
		for _, id := range tt.q {
			n1.q.Add(id)
		}
		enc := n1.enc()
		n2.dec(enc)
		if n2.partial != n1.partial || !reflect.DeepEqual(n1.q.Dense(), n2.q.Dense()) {
			t.Errorf("%v.enc.dec = %v", &n1, &n2)
		}
	}
}
 41 | 
// matchTests is the table driving TestMatch.  Each entry holds a regular
// expression re (TestMatch compiles it with a "(?m)" prefix), an input s,
// and m, the slice of line numbers that grep is expected to return for s
// (nil when no match is expected).
var matchTests = []struct {
	re string
	s  string
	m  []int
}{
	// Adapted from go/src/pkg/regexp/find_test.go.
	{`a+`, "abc\ndef\nghi\n", []int{1}},
	{``, ``, []int{1}},
	{`^abcdefg`, "abcdefg", []int{1}},
	{`a+`, "baaab", []int{1}},
	{"abcd..", "abcdef", []int{1}},
	{`a`, "a", []int{1}},
	{`x`, "y", nil},
	{`b`, "abc", []int{1}},
	{`.`, "a", []int{1}},
	{`.*`, "abcdef", []int{1}},
	{`^`, "abcde", []int{1}},
	{`$`, "abcde", []int{1}},
	{`^abcd$`, "abcd", []int{1}},
	{`^bcd'`, "abcdef", nil},
	{`^abcd$`, "abcde", nil},
	{`a+`, "baaab", []int{1}},
	{`a*`, "baaab", []int{1}},
	{`[a-z]+`, "abcd", []int{1}},
	{`[^a-z]+`, "ab1234cd", []int{1}},
	{`[a\-\]z]+`, "az]-bcz", []int{1}},
	{`[^\n]+`, "abcd\n", []int{1}},
	{`[日本語]+`, "日本語日本語", []int{1}},
	{`日本語+`, "日本語", []int{1}},
	{`日本語+`, "日本語語語語", []int{1}},
	{`()`, "", []int{1}},
	{`(a)`, "a", []int{1}},
	{`(.)(.)`, "日a", []int{1}},
	{`(.*)`, "", []int{1}},
	{`(.*)`, "abcd", []int{1}},
	{`(..)(..)`, "abcd", []int{1}},
	{`(([^xyz]*)(d))`, "abcd", []int{1}},
	{`((a|b|c)*(d))`, "abcd", []int{1}},
	{`(((a|b|c)*)(d))`, "abcd", []int{1}},
	{`\a\f\r\t\v`, "\a\f\r\t\v", []int{1}},
	{`[\a\f\n\r\t\v]+`, "\a\f\r\t\v", []int{1}},

	{`a*(|(b))c*`, "aacc", []int{1}},
	{`(.*).*`, "ab", []int{1}},
	{`[.]`, ".", []int{1}},
	{`/$`, "/abc/", []int{1}},
	{`/$`, "/abc", nil},

	// multiple matches
	{`.`, "abc", []int{1}},
	{`(.)`, "abc", []int{1}},
	{`.(.)`, "abcd", []int{1}},
	{`ab*`, "abbaab", []int{1}},
	{`a(b*)`, "abbaab", []int{1}},

	// fixed bugs
	{`ab$`, "cab", []int{1}},
	{`axxb$`, "axxcb", nil},
	{`data`, "daXY data", []int{1}},
	{`da(.)a$`, "daXY data", []int{1}},
	{`zx+`, "zzx", []int{1}},
	{`ab$`, "abcab", []int{1}},
	{`(aa)*$`, "a", []int{1}},
	{`(?:.|(?:.a))`, "", nil},
	{`(?:A(?:A|a))`, "Aa", []int{1}},
	{`(?:A|(?:A|a))`, "a", []int{1}},
	{`(a){0}`, "", []int{1}},
	//	{`(?-s)(?:(?:^).)`, "\n", nil},
	//	{`(?s)(?:(?:^).)`, "\n", []int{1}},
	//	{`(?:(?:^).)`, "\n", nil},
	{`\b`, "x", []int{1}},
	{`\b`, "xx", []int{1}},
	{`\b`, "x y", []int{1}},
	{`\b`, "xx yy", []int{1}},
	{`\B`, "x", nil},
	{`\B`, "xx", []int{1}},
	{`\B`, "x y", nil},
	{`\B`, "xx yy", []int{1}},
	{`(?im)^[abc]+$`, "abcABC", []int{1}},
	{`(?im)^[α]+$`, "αΑ", []int{1}},
	{`[Aa]BC`, "abc", nil},
	{`[Aa]bc`, "abc", []int{1}},

	// RE2 tests
	{`[^\S\s]`, "abcd", nil},
	{`[^\S[:space:]]`, "abcd", nil},
	{`[^\D\d]`, "abcd", nil},
	{`[^\D[:digit:]]`, "abcd", nil},
	{`(?i)\W`, "x", nil},
	{`(?i)\W`, "k", nil},
	{`(?i)\W`, "s", nil},

	// can backslash-escape any punctuation
	{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}},
	{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
		`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}},
	{"\\`", "`", []int{1}},
	{"[\\`]+", "`", []int{1}},

	// long set of matches (longer than startSize)
	{
		".",
		"qwertyuiopasdfghjklzxcvbnm1234567890",
		[]int{1},
	},
}
149 | 
150 | func TestMatch(t *testing.T) {
151 | 	for _, tt := range matchTests {
152 | 		re, err := Compile("(?m)" + tt.re)
153 | 		if err != nil {
154 | 			t.Errorf("Compile(%#q): %v", tt.re, err)
155 | 			continue
156 | 		}
157 | 		b := []byte(tt.s)
158 | 		lines := grep(re, b)
159 | 		if !reflect.DeepEqual(lines, tt.m) {
160 | 			t.Errorf("grep(%#q, %q) = %v, want %v", tt.re, tt.s, lines, tt.m)
161 | 		}
162 | 	}
163 | }
164 | 
165 | func grep(re *Regexp, b []byte) []int {
166 | 	var m []int
167 | 	lineno := 1
168 | 	for {
169 | 		i := re.Match(b, true, true)
170 | 		if i < 0 {
171 | 			break
172 | 		}
173 | 		start := bytes.LastIndex(b[:i], nl) + 1
174 | 		end := i + 1
175 | 		if end > len(b) {
176 | 			end = len(b)
177 | 		}
178 | 		lineno += bytes.Count(b[:start], nl)
179 | 		m = append(m, lineno)
180 | 		if start < end && b[end-1] == '\n' {
181 | 			lineno++
182 | 		}
183 | 		b = b[end:]
184 | 		if len(b) == 0 {
185 | 			break
186 | 		}
187 | 	}
188 | 	return m
189 | }
190 | 
191 | var grepTests = []struct {
192 | 	re  string
193 | 	s   string
194 | 	out string
195 | 	err string
196 | 	g   Grep
197 | }{
198 | 	{re: `a+`, s: "abc\ndef\nghalloo\n", out: "input:abc\ninput:ghalloo\n"},
199 | 	{re: `x.*y`, s: "xay\nxa\ny\n", out: "input:xay\n"},
200 | }
201 | 
202 | func TestGrep(t *testing.T) {
203 | 	for i, tt := range grepTests {
204 | 		re, err := Compile("(?m)" + tt.re)
205 | 		if err != nil {
206 | 			t.Errorf("Compile(%#q): %v", tt.re, err)
207 | 			continue
208 | 		}
209 | 		g := tt.g
210 | 		g.Regexp = re
211 | 		var out, errb bytes.Buffer
212 | 		g.Stdout = &out
213 | 		g.Stderr = &errb
214 | 		g.Reader(strings.NewReader(tt.s), "input")
215 | 		if out.String() != tt.out || errb.String() != tt.err {
216 | 			t.Errorf("#%d: grep(%#q, %q) = %q, %q, want %q, %q", i, tt.re, tt.s, out.String(), errb.String(), tt.out, tt.err)
217 | 		}
218 | 	}
219 | }
220 | 


--------------------------------------------------------------------------------
/regexp/utf.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2011 The Go Authors.  All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package regexp
  6 | 
  7 | import (
  8 | 	"regexp/syntax"
  9 | 	"unicode"
 10 | 	"unicode/utf8"
 11 | )
 12 | 
const (
	// instFail and instAlt are local names for the regexp/syntax opcodes.
	instFail      = syntax.InstFail
	instAlt       = syntax.InstAlt
	// instByteRange is an opcode local to this package, formed by setting
	// bit 0x80 on syntax.InstRune.  The code in this file stores
	// lo<<8 | hi in its Arg, optionally OR'ed with argFold.
	instByteRange = syntax.InstRune | 0x80 // local opcode

	// argFold is the bit set in a byte-range instruction's Arg when the
	// range was created with the fold flag.
	argFold = 1 << 16
)
 20 | 
 21 | func toByteProg(prog *syntax.Prog) error {
 22 | 	var b runeBuilder
 23 | 	for pc := range prog.Inst {
 24 | 		i := &prog.Inst[pc]
 25 | 		switch i.Op {
 26 | 		case syntax.InstRune, syntax.InstRune1:
 27 | 			// General rune range.  PIA.
 28 | 			// TODO: Pick off single-byte case.
 29 | 			if lo, hi, fold, ok := oneByteRange(i); ok {
 30 | 				i.Op = instByteRange
 31 | 				i.Arg = uint32(lo)<<8 | uint32(hi)
 32 | 				if fold {
 33 | 					i.Arg |= argFold
 34 | 				}
 35 | 				break
 36 | 			}
 37 | 
 38 | 			r := i.Rune
 39 | 			if syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
 40 | 				// Build folded list.
 41 | 				var rr []rune
 42 | 				if len(r) == 1 {
 43 | 					rr = appendFoldedRange(rr, r[0], r[0])
 44 | 				} else {
 45 | 					for j := 0; j < len(r); j += 2 {
 46 | 						rr = appendFoldedRange(rr, r[j], r[j+1])
 47 | 					}
 48 | 				}
 49 | 				r = rr
 50 | 			}
 51 | 
 52 | 			b.init(prog, uint32(pc), i.Out)
 53 | 			if len(r) == 1 {
 54 | 				b.addRange(r[0], r[0], false)
 55 | 			} else {
 56 | 				for j := 0; j < len(r); j += 2 {
 57 | 					b.addRange(r[j], r[j+1], false)
 58 | 				}
 59 | 			}
 60 | 
 61 | 		case syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 62 | 			// All runes.
 63 | 			// AnyNotNL should exclude \n but the line-at-a-time
 64 | 			// execution takes care of that for us.
 65 | 			b.init(prog, uint32(pc), i.Out)
 66 | 			b.addRange(0, unicode.MaxRune, false)
 67 | 		}
 68 | 	}
 69 | 	return nil
 70 | }
 71 | 
 72 | func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) {
 73 | 	if i.Op == syntax.InstRune1 {
 74 | 		r := i.Rune[0]
 75 | 		if r < utf8.RuneSelf {
 76 | 			return byte(r), byte(r), false, true
 77 | 		}
 78 | 	}
 79 | 	if i.Op != syntax.InstRune {
 80 | 		return
 81 | 	}
 82 | 	fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0
 83 | 	if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] {
 84 | 		r := i.Rune[0]
 85 | 		if r >= utf8.RuneSelf {
 86 | 			return
 87 | 		}
 88 | 		if fold && !asciiFold(r) {
 89 | 			return
 90 | 		}
 91 | 		return byte(r), byte(r), fold, true
 92 | 	}
 93 | 	if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf {
 94 | 		if fold {
 95 | 			for r := i.Rune[0]; r <= i.Rune[1]; r++ {
 96 | 				if asciiFold(r) {
 97 | 					return
 98 | 				}
 99 | 			}
100 | 		}
101 | 		return byte(i.Rune[0]), byte(i.Rune[1]), fold, true
102 | 	}
103 | 	if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] {
104 | 		return byte(i.Rune[0]), byte(i.Rune[0]), true, true
105 | 	}
106 | 
107 | 	return
108 | }
109 | 
110 | func asciiFold(r rune) bool {
111 | 	if r >= utf8.RuneSelf {
112 | 		return false
113 | 	}
114 | 	r1 := unicode.SimpleFold(r)
115 | 	if r1 >= utf8.RuneSelf {
116 | 		return false
117 | 	}
118 | 	if r1 == r {
119 | 		return true
120 | 	}
121 | 	return unicode.SimpleFold(r1) == r
122 | }
123 | 
// maxRune returns 1<<bits - 1, where bits is 7 when n is 1 and
// 8-(n+1)+6*(n-1) otherwise: the largest value whose UTF-8 encoding fits in
// n bytes by bit count (0x7F, 0x7FF, 0xFFFF and 0x1FFFFF for n = 1..4).
func maxRune(n int) rune {
	bits := 7
	if n != 1 {
		// 8-(n+1) payload bits in the leading byte, 6 in each of the
		// n-1 bytes that follow.
		bits = 8 - (n + 1) + 6*(n-1)
	}
	return rune(1)<<uint(bits) - 1
}
133 | 
// cacheKey identifies a byte-range suffix instruction in a runeBuilder's
// cache: the byte bounds, the fold flag, and the instruction that follows.
type cacheKey struct {
	lo   byte   // low end of the byte range
	hi   byte   // high end of the byte range
	fold bool   // whether the range was requested with folding
	next uint32 // instruction the range continues to
}
139 | 
// runeBuilder accumulates, inside a syntax.Prog, the byte-range
// instructions that replace one rune-matching instruction.
type runeBuilder struct {
	begin uint32              // instruction to which addBranch attaches the next branch
	out   uint32              // instruction used as the continuation when a suffix is given next == 0
	cache map[cacheKey]uint32 // suffix instructions already emitted, keyed by range, fold and next
	p     *syntax.Prog        // program being rewritten
}
146 | 
147 | func (b *runeBuilder) init(p *syntax.Prog, begin, out uint32) {
148 | 	// We will rewrite p.Inst[begin] to hold the accumulated
149 | 	// machine.  For now, there is no match.
150 | 	p.Inst[begin].Op = instFail
151 | 
152 | 	b.begin = begin
153 | 	b.out = out
154 | 	if b.cache == nil {
155 | 		b.cache = make(map[cacheKey]uint32)
156 | 	}
157 | 	for k := range b.cache {
158 | 		delete(b.cache, k)
159 | 	}
160 | 	b.p = p
161 | }
162 | 
163 | func (b *runeBuilder) uncachedSuffix(lo, hi byte, fold bool, next uint32) uint32 {
164 | 	if next == 0 {
165 | 		next = b.out
166 | 	}
167 | 	pc := len(b.p.Inst)
168 | 	i := syntax.Inst{Op: instByteRange, Arg: uint32(lo)<<8 | uint32(hi), Out: next}
169 | 	if fold {
170 | 		i.Arg |= argFold
171 | 	}
172 | 	b.p.Inst = append(b.p.Inst, i)
173 | 	return uint32(pc)
174 | }
175 | 
176 | func (b *runeBuilder) suffix(lo, hi byte, fold bool, next uint32) uint32 {
177 | 	if lo < 0x80 || hi > 0xbf {
178 | 		// Not a continuation byte, no need to cache.
179 | 		return b.uncachedSuffix(lo, hi, fold, next)
180 | 	}
181 | 
182 | 	key := cacheKey{lo, hi, fold, next}
183 | 	if pc, ok := b.cache[key]; ok {
184 | 		return pc
185 | 	}
186 | 
187 | 	pc := b.uncachedSuffix(lo, hi, fold, next)
188 | 	b.cache[key] = pc
189 | 	return pc
190 | }
191 | 
192 | func (b *runeBuilder) addBranch(pc uint32) {
193 | 	// Add pc to the branch at the beginning.
194 | 	i := &b.p.Inst[b.begin]
195 | 	switch i.Op {
196 | 	case syntax.InstFail:
197 | 		i.Op = syntax.InstNop
198 | 		i.Out = pc
199 | 		return
200 | 	case syntax.InstNop:
201 | 		i.Op = syntax.InstAlt
202 | 		i.Arg = pc
203 | 		return
204 | 	case syntax.InstAlt:
205 | 		apc := uint32(len(b.p.Inst))
206 | 		b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc})
207 | 		i = &b.p.Inst[b.begin]
208 | 		i.Arg = apc
209 | 		b.begin = apc
210 | 	}
211 | }
212 | 
213 | func (b *runeBuilder) addRange(lo, hi rune, fold bool) {
214 | 	if lo > hi {
215 | 		return
216 | 	}
217 | 
218 | 	// TODO: Pick off 80-10FFFF for special handling?
219 | 	if lo == 0x80 && hi == 0x10FFFF {
220 | 	}
221 | 
222 | 	// Split range into same-length sized ranges.
223 | 	for i := 1; i < utf8.UTFMax; i++ {
224 | 		max := maxRune(i)
225 | 		if lo <= max && max < hi {
226 | 			b.addRange(lo, max, fold)
227 | 			b.addRange(max+1, hi, fold)
228 | 			return
229 | 		}
230 | 	}
231 | 
232 | 	// ASCII range is special.
233 | 	if hi < utf8.RuneSelf {
234 | 		b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0))
235 | 		return
236 | 	}
237 | 
238 | 	// Split range into sections that agree on leading bytes.
239 | 	for i := 1; i < utf8.UTFMax; i++ {
240 | 		m := rune(1)<<uint(6*i) - 1 // last i bytes of UTF-8 sequence
241 | 		if lo&^m != hi&^m {
242 | 			if lo&m != 0 {
243 | 				b.addRange(lo, lo|m, fold)
244 | 				b.addRange((lo|m)+1, hi, fold)
245 | 				return
246 | 			}
247 | 			if hi&m != m {
248 | 				b.addRange(lo, hi&^m-1, fold)
249 | 				b.addRange(hi&^m, hi, fold)
250 | 				return
251 | 			}
252 | 		}
253 | 	}
254 | 
255 | 	// Finally.  Generate byte matching equivalent for lo-hi.
256 | 	var ulo, uhi [utf8.UTFMax]byte
257 | 	n := utf8.EncodeRune(ulo[:], lo)
258 | 	m := utf8.EncodeRune(uhi[:], hi)
259 | 	if n != m {
260 | 		panic("codesearch/regexp: bad utf-8 math")
261 | 	}
262 | 
263 | 	pc := uint32(0)
264 | 	for i := n - 1; i >= 0; i-- {
265 | 		pc = b.suffix(ulo[i], uhi[i], false, pc)
266 | 	}
267 | 	b.addBranch(pc)
268 | }
269 | 


--------------------------------------------------------------------------------
/sparse/set.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2011 The Go Authors.  All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | 
 5 | // Package sparse implements sparse sets.
 6 | package sparse
 7 | 
 8 | // For comparison: running cindex over the Linux 2.6 kernel with this
 9 | // implementation of trigram sets takes 11 seconds.  If I change it to
10 | // a bitmap (which must be cleared between files) it takes 25 seconds.
11 | 
// A Set is a sparse set of uint32 values.
// http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
//
// dense lists the members in insertion order; sparse[x] records the index
// in dense at which x was stored.  x is a member only when sparse[x] is a
// valid index into dense and dense[sparse[x]] == x.
type Set struct {
	dense  []uint32
	sparse []uint32
}

// NewSet returns a new Set with a given maximum size.
// The set can contain numbers in [0, max-1].
func NewSet(max uint32) *Set {
	return &Set{
		sparse: make([]uint32, max),
	}
}

// Init initializes a Set to have a given maximum size.
// The set can contain numbers in [0, max-1].
// Init leaves the set empty: values added before the call are discarded,
// so it is safe to call Init on a Set that has already been used.
func (s *Set) Init(max uint32) {
	s.sparse = make([]uint32, max)
	// Without this, a reused Set would keep its old dense entries while
	// sparse is all zeros, leaving Len, Dense and Has inconsistent.
	s.dense = s.dense[:0]
}
32 | 
// Reset clears (empties) the set.
// Only dense is truncated; sparse is left as it is, because membership is
// always validated against dense.
func (s *Set) Reset() {
	s.dense = s.dense[:0]
}
37 | 
38 | // Add adds x to the set if it is not already there.
39 | func (s *Set) Add(x uint32) {
40 | 	v := s.sparse[x]
41 | 	if v < uint32(len(s.dense)) && s.dense[v] == x {
42 | 		return
43 | 	}
44 | 	n := len(s.dense)
45 | 	s.sparse[x] = uint32(n)
46 | 	s.dense = append(s.dense, x)
47 | }
48 | 
49 | // Has reports whether x is in the set.
50 | func (s *Set) Has(x uint32) bool {
51 | 	v := s.sparse[x]
52 | 	return v < uint32(len(s.dense)) && s.dense[v] == x
53 | }
54 | 
// Dense returns the values in the set.
// The values are listed in the order in which they
// were inserted.
// The returned slice is the set's own storage, not a copy.
func (s *Set) Dense() []uint32 {
	return s.dense
}
61 | 
// Len returns the number of values in the set,
// that is, the length of the dense slice.
func (s *Set) Len() int {
	return len(s.dense)
}
66 | 


--------------------------------------------------------------------------------