├── AUTHORS ├── CONTRIBUTORS ├── LICENSE ├── README ├── cmd ├── cgrep │ └── cgrep.go ├── cindex │ └── cindex.go └── csearch │ └── csearch.go ├── index ├── merge.go ├── merge_test.go ├── mmap_bsd.go ├── mmap_linux.go ├── mmap_windows.go ├── read.go ├── read_test.go ├── regexp.go ├── regexp_test.go ├── write.go └── write_test.go ├── lib ├── README.template ├── buildall ├── setup ├── uploadall └── version ├── regexp ├── copy.go ├── match.go ├── regexp.go ├── regexp_test.go └── utf.go └── sparse └── set.go /AUTHORS: -------------------------------------------------------------------------------- 1 | # This source code is copyright "The Go Authors", 2 | # as defined by the AUTHORS file in the root of the Go tree. 3 | # 4 | # http://tip.golang.org/AUTHORS. 5 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | # The official list of people who can contribute code to the repository 2 | # is maintained in the standard Go repository as the CONTRIBUTORS 3 | # file in the root of the Go tree. 4 | # 5 | # http://tip.golang.org/CONTRIBUTORS 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Code Search is a tool for indexing and then performing 2 | regular expression searches over large bodies of source code. 3 | It is a set of command-line programs written in Go. 4 | 5 | For background and an overview of the commands, 6 | see http://swtch.com/~rsc/regexp/regexp4.html. 7 | 8 | To install: 9 | 10 | go get github.com/google/codesearch/cmd/... 11 | 12 | Use "go get -u" to update an existing installation. 13 | 14 | Russ Cox 15 | rsc@swtch.com 16 | June 2015 17 | -------------------------------------------------------------------------------- /cmd/cgrep/cgrep.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "runtime/pprof" 13 | 14 | "github.com/google/codesearch/regexp" 15 | ) 16 | 17 | var usageMessage = `usage: cgrep [-c] [-h] [-i] [-l] [-n] regexp [file...] 18 | 19 | Cgrep behaves like grep, searching for regexp, an RE2 (nearly PCRE) regular expression. 20 | 21 | The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's 22 | flag parsing convention, they cannot be combined: the option pair -i -n 23 | cannot be abbreviated to -in. 24 | ` 25 | 26 | func usage() { 27 | fmt.Fprintf(os.Stderr, usageMessage) 28 | os.Exit(2) 29 | } 30 | 31 | var ( 32 | iflag = flag.Bool("i", false, "case-insensitive match") 33 | cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") 34 | ) 35 | 36 | func main() { 37 | var g regexp.Grep 38 | g.AddFlags() 39 | g.Stdout = os.Stdout 40 | g.Stderr = os.Stderr 41 | flag.Usage = usage 42 | flag.Parse() 43 | args := flag.Args() 44 | if len(args) == 0 { 45 | flag.Usage() 46 | } 47 | 48 | if *cpuProfile != "" { 49 | f, err := os.Create(*cpuProfile) 50 | if err != nil { 51 | log.Fatal(err) 52 | } 53 | defer f.Close() 54 | pprof.StartCPUProfile(f) 55 | defer pprof.StopCPUProfile() 56 | } 57 | 58 | pat := "(?m)" + args[0] 59 | if *iflag { 60 | pat = "(?i)" + pat 61 | } 62 | re, err := regexp.Compile(pat) 63 | if err != nil { 64 | log.Fatal(err) 65 | } 66 | g.Regexp = re 67 | if len(args) == 1 { 68 | g.Reader(os.Stdin, "") 69 | } else { 70 | for _, arg := range args[1:] { 71 | g.File(arg) 72 | } 73 | } 74 | if !g.Match { 75 | os.Exit(1) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /cmd/cindex/cindex.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "path/filepath" 13 | "runtime/pprof" 14 | "sort" 15 | 16 | "github.com/google/codesearch/index" 17 | ) 18 | 19 | var usageMessage = `usage: cindex [-list] [-reset] [path...] 20 | 21 | Cindex prepares the trigram index for use by csearch. The index is the 22 | file named by $CSEARCHINDEX, or else $HOME/.csearchindex. 23 | 24 | The simplest invocation is 25 | 26 | cindex path... 27 | 28 | which adds the file or directory tree named by each path to the index. 29 | For example: 30 | 31 | cindex $HOME/src /usr/include 32 | 33 | or, equivalently: 34 | 35 | cindex $HOME/src 36 | cindex /usr/include 37 | 38 | If cindex is invoked with no paths, it reindexes the paths that have 39 | already been added, in case the files have changed. Thus, 'cindex' by 40 | itself is a useful command to run in a nightly cron job. 41 | 42 | The -list flag causes cindex to list the paths it has indexed and exit. 43 | 44 | By default cindex adds the named paths to the index but preserves 45 | information about other paths that might already be indexed 46 | (the ones printed by cindex -list). The -reset flag causes cindex to 47 | delete the existing index before indexing the new paths. 48 | With no path arguments, cindex -reset removes the index. 49 | ` 50 | 51 | func usage() { 52 | fmt.Fprintf(os.Stderr, usageMessage) 53 | os.Exit(2) 54 | } 55 | 56 | var ( 57 | listFlag = flag.Bool("list", false, "list indexed paths and exit") 58 | resetFlag = flag.Bool("reset", false, "discard existing index") 59 | verboseFlag = flag.Bool("verbose", false, "print extra information") 60 | cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") 61 | ) 62 | 63 | func main() { 64 | flag.Usage = usage 65 | flag.Parse() 66 | args := flag.Args() 67 | 68 | if *listFlag { 69 | ix := index.Open(index.File()) 70 | for _, arg := range ix.Paths() { 71 | fmt.Printf("%s\n", arg) 72 | } 73 | return 74 | } 75 | 76 | if *cpuProfile != "" { 77 | f, err := os.Create(*cpuProfile) 78 | if err != nil { 79 | log.Fatal(err) 80 | } 81 | defer f.Close() 82 | pprof.StartCPUProfile(f) 83 | defer pprof.StopCPUProfile() 84 | } 85 | 86 | if *resetFlag && len(args) == 0 { 87 | os.Remove(index.File()) 88 | return 89 | } 90 | if len(args) == 0 { 91 | ix := index.Open(index.File()) 92 | for _, arg := range ix.Paths() { 93 | args = append(args, arg) 94 | } 95 | } 96 | 97 | // Translate paths to absolute paths so that we can 98 | // generate the file list in sorted order. 99 | for i, arg := range args { 100 | a, err := filepath.Abs(arg) 101 | if err != nil { 102 | log.Printf("%s: %s", arg, err) 103 | args[i] = "" 104 | continue 105 | } 106 | args[i] = a 107 | } 108 | sort.Strings(args) 109 | 110 | for len(args) > 0 && args[0] == "" { 111 | args = args[1:] 112 | } 113 | 114 | master := index.File() 115 | if _, err := os.Stat(master); err != nil { 116 | // Does not exist. 117 | *resetFlag = true 118 | } 119 | file := master 120 | if !*resetFlag { 121 | file += "~" 122 | } 123 | 124 | ix := index.Create(file) 125 | ix.Verbose = *verboseFlag 126 | ix.AddPaths(args) 127 | for _, arg := range args { 128 | log.Printf("index %s", arg) 129 | filepath.Walk(arg, func(path string, info os.FileInfo, err error) error { 130 | if _, elem := filepath.Split(path); elem != "" { 131 | // Skip various temporary or "hidden" files or directories. 132 | if elem[0] == '.' || elem[0] == '#' || elem[0] == '~' || elem[len(elem)-1] == '~' { 133 | if info.IsDir() { 134 | return filepath.SkipDir 135 | } 136 | return nil 137 | } 138 | } 139 | if err != nil { 140 | log.Printf("%s: %s", path, err) 141 | return nil 142 | } 143 | if info != nil && info.Mode()&os.ModeType == 0 { 144 | ix.AddFile(path) 145 | } 146 | return nil 147 | }) 148 | } 149 | log.Printf("flush index") 150 | ix.Flush() 151 | 152 | if !*resetFlag { 153 | log.Printf("merge %s %s", master, file) 154 | index.Merge(file+"~", master, file) 155 | os.Remove(file) 156 | os.Rename(file+"~", master) 157 | } 158 | log.Printf("done") 159 | return 160 | } 161 | -------------------------------------------------------------------------------- /cmd/csearch/csearch.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package main 6 | 7 | import ( 8 | "flag" 9 | "fmt" 10 | "log" 11 | "os" 12 | "runtime/pprof" 13 | 14 | "github.com/google/codesearch/index" 15 | "github.com/google/codesearch/regexp" 16 | ) 17 | 18 | var usageMessage = `usage: csearch [-c] [-f fileregexp] [-h] [-i] [-l] [-n] regexp 19 | 20 | Csearch behaves like grep over all indexed files, searching for regexp, 21 | an RE2 (nearly PCRE) regular expression. 22 | 23 | The -c, -h, -i, -l, and -n flags are as in grep, although note that as per Go's 24 | flag parsing convention, they cannot be combined: the option pair -i -n 25 | cannot be abbreviated to -in. 26 | 27 | The -f flag restricts the search to files whose names match the RE2 regular 28 | expression fileregexp. 29 | 30 | Csearch relies on the existence of an up-to-date index created ahead of time. 31 | To build or rebuild the index that csearch uses, run: 32 | 33 | cindex path... 34 | 35 | where path... is a list of directories or individual files to be included in the index. 36 | If no index exists, this command creates one. If an index already exists, cindex 37 | overwrites it. Run cindex -help for more. 38 | 39 | Csearch uses the index stored in $CSEARCHINDEX or, if that variable is unset or 40 | empty, $HOME/.csearchindex. 41 | ` 42 | 43 | func usage() { 44 | fmt.Fprintf(os.Stderr, usageMessage) 45 | os.Exit(2) 46 | } 47 | 48 | var ( 49 | fFlag = flag.String("f", "", "search only files with names matching this regexp") 50 | iFlag = flag.Bool("i", false, "case-insensitive search") 51 | verboseFlag = flag.Bool("verbose", false, "print extra information") 52 | bruteFlag = flag.Bool("brute", false, "brute force - search all files in index") 53 | cpuProfile = flag.String("cpuprofile", "", "write cpu profile to this file") 54 | 55 | matches bool 56 | ) 57 | 58 | func Main() { 59 | g := regexp.Grep{ 60 | Stdout: os.Stdout, 61 | Stderr: os.Stderr, 62 | } 63 | g.AddFlags() 64 | 65 | flag.Usage = usage 66 | flag.Parse() 67 | args := flag.Args() 68 | 69 | if len(args) != 1 { 70 | usage() 71 | } 72 | 73 | if *cpuProfile != "" { 74 | f, err := os.Create(*cpuProfile) 75 | if err != nil { 76 | log.Fatal(err) 77 | } 78 | defer f.Close() 79 | pprof.StartCPUProfile(f) 80 | defer pprof.StopCPUProfile() 81 | } 82 | 83 | pat := "(?m)" + args[0] 84 | if *iFlag { 85 | pat = "(?i)" + pat 86 | } 87 | re, err := regexp.Compile(pat) 88 | if err != nil { 89 | log.Fatal(err) 90 | } 91 | g.Regexp = re 92 | var fre *regexp.Regexp 93 | if *fFlag != "" { 94 | fre, err = regexp.Compile(*fFlag) 95 | if err != nil { 96 | log.Fatal(err) 97 | } 98 | } 99 | q := index.RegexpQuery(re.Syntax) 100 | if *verboseFlag { 101 | log.Printf("query: %s\n", q) 102 | } 103 | 104 | ix := index.Open(index.File()) 105 | ix.Verbose = *verboseFlag 106 | var post []uint32 107 | if *bruteFlag { 108 | post = ix.PostingQuery(&index.Query{Op: index.QAll}) 109 | } else { 110 | post = ix.PostingQuery(q) 111 | } 112 | if *verboseFlag { 113 | log.Printf("post query identified %d possible files\n", len(post)) 114 | } 115 | 116 | if fre != nil { 117 | fnames := make([]uint32, 0, len(post)) 118 | 119 | for _, fileid := range post { 120 | name := ix.Name(fileid) 121 | if fre.MatchString(name, true, true) < 0 { 122 | continue 123 | } 124 | fnames = append(fnames, fileid) 125 | } 126 | 127 | if *verboseFlag { 128 | log.Printf("filename regexp matched %d files\n", len(fnames)) 129 | } 130 | post = fnames 131 | } 132 | 133 | for _, fileid := range post { 134 | name := ix.Name(fileid) 135 | g.File(name) 136 | } 137 | 138 | matches = g.Match 139 | } 140 | 141 | func main() { 142 | Main() 143 | if !matches { 144 | os.Exit(1) 145 | } 146 | os.Exit(0) 147 | } 148 | -------------------------------------------------------------------------------- /index/merge.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | // Merging indexes. 8 | // 9 | // To merge two indexes A and B (newer) into a combined index C: 10 | // 11 | // Load the path list from B and determine for each path the docid ranges 12 | // that it will replace in A. 13 | // 14 | // Read A's and B's name lists together, merging them into C's name list. 15 | // Discard the identified ranges from A during the merge. Also during the merge, 16 | // record the mapping from A's docids to C's docids, and also the mapping from 17 | // B's docids to C's docids. Both mappings can be summarized in a table like 18 | // 19 | // 10-14 map to 20-24 20 | // 15-24 is deleted 21 | // 25-34 maps to 40-49 22 | // 23 | // The number of ranges will be at most the combined number of paths. 24 | // Also during the merge, write the name index to a temporary file as usual. 25 | // 26 | // Now merge the posting lists (this is why they begin with the trigram). 27 | // During the merge, translate the docid numbers to the new C docid space. 28 | // Also during the merge, write the posting list index to a temporary file as usual. 29 | // 30 | // Copy the name index and posting list index into C's index and write the trailer. 31 | // Rename C's index onto the new index. 32 | 33 | import ( 34 | "encoding/binary" 35 | "os" 36 | "strings" 37 | ) 38 | 39 | // An idrange records that the half-open interval [lo, hi) maps to [new, new+hi-lo). 40 | type idrange struct { 41 | lo, hi, new uint32 42 | } 43 | 44 | type postIndex struct { 45 | tri uint32 46 | count uint32 47 | offset uint32 48 | } 49 | 50 | // Merge creates a new index in the file dst that corresponds to merging 51 | // the two indices src1 and src2. If both src1 and src2 claim responsibility 52 | // for a path, src2 is assumed to be newer and is given preference. 53 | func Merge(dst, src1, src2 string) { 54 | ix1 := Open(src1) 55 | ix2 := Open(src2) 56 | paths1 := ix1.Paths() 57 | paths2 := ix2.Paths() 58 | 59 | // Build docid maps. 60 | var i1, i2, new uint32 61 | var map1, map2 []idrange 62 | for _, path := range paths2 { 63 | // Determine range shadowed by this path. 64 | old := i1 65 | for i1 < uint32(ix1.numName) && ix1.Name(i1) < path { 66 | i1++ 67 | } 68 | lo := i1 69 | limit := path[:len(path)-1] + string(path[len(path)-1]+1) 70 | for i1 < uint32(ix1.numName) && ix1.Name(i1) < limit { 71 | i1++ 72 | } 73 | hi := i1 74 | 75 | // Record range before the shadow. 76 | if old < lo { 77 | map1 = append(map1, idrange{old, lo, new}) 78 | new += lo - old 79 | } 80 | 81 | // Determine range defined by this path. 82 | // Because we are iterating over the ix2 paths, 83 | // there can't be gaps, so it must start at i2. 84 | if i2 < uint32(ix2.numName) && ix2.Name(i2) < path { 85 | panic("merge: inconsistent index") 86 | } 87 | lo = i2 88 | for i2 < uint32(ix2.numName) && ix2.Name(i2) < limit { 89 | i2++ 90 | } 91 | hi = i2 92 | if lo < hi { 93 | map2 = append(map2, idrange{lo, hi, new}) 94 | new += hi - lo 95 | } 96 | } 97 | 98 | if i1 < uint32(ix1.numName) { 99 | map1 = append(map1, idrange{i1, uint32(ix1.numName), new}) 100 | new += uint32(ix1.numName) - i1 101 | } 102 | if i2 < uint32(ix2.numName) { 103 | panic("merge: inconsistent index") 104 | } 105 | numName := new 106 | 107 | ix3 := bufCreate(dst) 108 | ix3.writeString(magic) 109 | 110 | // Merged list of paths. 111 | pathData := ix3.offset() 112 | mi1 := 0 113 | mi2 := 0 114 | last := "\x00" // not a prefix of anything 115 | for mi1 < len(paths1) || mi2 < len(paths2) { 116 | var p string 117 | if mi2 >= len(paths2) || mi1 < len(paths1) && paths1[mi1] <= paths2[mi2] { 118 | p = paths1[mi1] 119 | mi1++ 120 | } else { 121 | p = paths2[mi2] 122 | mi2++ 123 | } 124 | if strings.HasPrefix(p, last) { 125 | continue 126 | } 127 | last = p 128 | ix3.writeString(p) 129 | ix3.writeString("\x00") 130 | } 131 | ix3.writeString("\x00") 132 | 133 | // Merged list of names. 134 | nameData := ix3.offset() 135 | nameIndexFile := bufCreate("") 136 | new = 0 137 | mi1 = 0 138 | mi2 = 0 139 | for new < numName { 140 | if mi1 < len(map1) && map1[mi1].new == new { 141 | for i := map1[mi1].lo; i < map1[mi1].hi; i++ { 142 | name := ix1.Name(i) 143 | nameIndexFile.writeUint32(ix3.offset() - nameData) 144 | ix3.writeString(name) 145 | ix3.writeString("\x00") 146 | new++ 147 | } 148 | mi1++ 149 | } else if mi2 < len(map2) && map2[mi2].new == new { 150 | for i := map2[mi2].lo; i < map2[mi2].hi; i++ { 151 | name := ix2.Name(i) 152 | nameIndexFile.writeUint32(ix3.offset() - nameData) 153 | ix3.writeString(name) 154 | ix3.writeString("\x00") 155 | new++ 156 | } 157 | mi2++ 158 | } else { 159 | panic("merge: inconsistent index") 160 | } 161 | } 162 | if new*4 != nameIndexFile.offset() { 163 | panic("merge: inconsistent index") 164 | } 165 | nameIndexFile.writeUint32(ix3.offset()) 166 | 167 | // Merged list of posting lists. 168 | postData := ix3.offset() 169 | var r1 postMapReader 170 | var r2 postMapReader 171 | var w postDataWriter 172 | r1.init(ix1, map1) 173 | r2.init(ix2, map2) 174 | w.init(ix3) 175 | for { 176 | if r1.trigram < r2.trigram { 177 | w.trigram(r1.trigram) 178 | for r1.nextId() { 179 | w.fileid(r1.fileid) 180 | } 181 | r1.nextTrigram() 182 | w.endTrigram() 183 | } else if r2.trigram < r1.trigram { 184 | w.trigram(r2.trigram) 185 | for r2.nextId() { 186 | w.fileid(r2.fileid) 187 | } 188 | r2.nextTrigram() 189 | w.endTrigram() 190 | } else { 191 | if r1.trigram == ^uint32(0) { 192 | break 193 | } 194 | w.trigram(r1.trigram) 195 | r1.nextId() 196 | r2.nextId() 197 | for r1.fileid < ^uint32(0) || r2.fileid < ^uint32(0) { 198 | if r1.fileid < r2.fileid { 199 | w.fileid(r1.fileid) 200 | r1.nextId() 201 | } else if r2.fileid < r1.fileid { 202 | w.fileid(r2.fileid) 203 | r2.nextId() 204 | } else { 205 | panic("merge: inconsistent index") 206 | } 207 | } 208 | r1.nextTrigram() 209 | r2.nextTrigram() 210 | w.endTrigram() 211 | } 212 | } 213 | 214 | // Name index 215 | nameIndex := ix3.offset() 216 | copyFile(ix3, nameIndexFile) 217 | 218 | // Posting list index 219 | postIndex := ix3.offset() 220 | copyFile(ix3, w.postIndexFile) 221 | 222 | ix3.writeUint32(pathData) 223 | ix3.writeUint32(nameData) 224 | ix3.writeUint32(postData) 225 | ix3.writeUint32(nameIndex) 226 | ix3.writeUint32(postIndex) 227 | ix3.writeString(trailerMagic) 228 | ix3.flush() 229 | 230 | os.Remove(nameIndexFile.name) 231 | os.Remove(w.postIndexFile.name) 232 | } 233 | 234 | type postMapReader struct { 235 | ix *Index 236 | idmap []idrange 237 | triNum uint32 238 | trigram uint32 239 | count uint32 240 | offset uint32 241 | d []byte 242 | oldid uint32 243 | fileid uint32 244 | i int 245 | } 246 | 247 | func (r *postMapReader) init(ix *Index, idmap []idrange) { 248 | r.ix = ix 249 | r.idmap = idmap 250 | r.trigram = ^uint32(0) 251 | r.load() 252 | } 253 | 254 | func (r *postMapReader) nextTrigram() { 255 | r.triNum++ 256 | r.load() 257 | } 258 | 259 | func (r *postMapReader) load() { 260 | if r.triNum >= uint32(r.ix.numPost) { 261 | r.trigram = ^uint32(0) 262 | r.count = 0 263 | r.fileid = ^uint32(0) 264 | return 265 | } 266 | r.trigram, r.count, r.offset = r.ix.listAt(r.triNum * postEntrySize) 267 | if r.count == 0 { 268 | r.fileid = ^uint32(0) 269 | return 270 | } 271 | r.d = r.ix.slice(r.ix.postData+r.offset+3, -1) 272 | r.oldid = ^uint32(0) 273 | r.i = 0 274 | } 275 | 276 | func (r *postMapReader) nextId() bool { 277 | for r.count > 0 { 278 | r.count-- 279 | delta64, n := binary.Uvarint(r.d) 280 | delta := uint32(delta64) 281 | if n <= 0 || delta == 0 { 282 | corrupt() 283 | } 284 | r.d = r.d[n:] 285 | r.oldid += delta 286 | for r.i < len(r.idmap) && r.idmap[r.i].hi <= r.oldid { 287 | r.i++ 288 | } 289 | if r.i >= len(r.idmap) { 290 | r.count = 0 291 | break 292 | } 293 | if r.oldid < r.idmap[r.i].lo { 294 | continue 295 | } 296 | r.fileid = r.idmap[r.i].new + r.oldid - r.idmap[r.i].lo 297 | return true 298 | } 299 | 300 | r.fileid = ^uint32(0) 301 | return false 302 | } 303 | 304 | type postDataWriter struct { 305 | out *bufWriter 306 | postIndexFile *bufWriter 307 | buf [10]byte 308 | base uint32 309 | count, offset uint32 310 | last uint32 311 | t uint32 312 | } 313 | 314 | func (w *postDataWriter) init(out *bufWriter) { 315 | w.out = out 316 | w.postIndexFile = bufCreate("") 317 | w.base = out.offset() 318 | } 319 | 320 | func (w *postDataWriter) trigram(t uint32) { 321 | w.offset = w.out.offset() 322 | w.count = 0 323 | w.t = t 324 | w.last = ^uint32(0) 325 | } 326 | 327 | func (w *postDataWriter) fileid(id uint32) { 328 | if w.count == 0 { 329 | w.out.writeTrigram(w.t) 330 | } 331 | w.out.writeUvarint(id - w.last) 332 | w.last = id 333 | w.count++ 334 | } 335 | 336 | func (w *postDataWriter) endTrigram() { 337 | if w.count == 0 { 338 | return 339 | } 340 | w.out.writeUvarint(0) 341 | w.postIndexFile.writeTrigram(w.t) 342 | w.postIndexFile.writeUint32(w.count) 343 | w.postIndexFile.writeUint32(w.offset - w.base) 344 | } 345 | -------------------------------------------------------------------------------- /index/merge_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "io/ioutil" 9 | "os" 10 | "testing" 11 | ) 12 | 13 | var mergePaths1 = []string{ 14 | "/a", 15 | "/b", 16 | "/c", 17 | } 18 | 19 | var mergePaths2 = []string{ 20 | "/b", 21 | "/cc", 22 | } 23 | 24 | var mergeFiles1 = map[string]string{ 25 | "/a/x": "hello world", 26 | "/a/y": "goodbye world", 27 | "/b/xx": "now is the time", 28 | "/b/xy": "for all good men", 29 | "/c/ab": "give me all the potatoes", 30 | "/c/de": "or give me death now", 31 | } 32 | 33 | var mergeFiles2 = map[string]string{ 34 | "/b/www": "world wide indeed", 35 | "/b/xx": "no, not now", 36 | "/b/yy": "first potatoes, now liberty?", 37 | "/cc": "come to the aid of his potatoes", 38 | } 39 | 40 | func TestMerge(t *testing.T) { 41 | f1, _ := ioutil.TempFile("", "index-test") 42 | f2, _ := ioutil.TempFile("", "index-test") 43 | f3, _ := ioutil.TempFile("", "index-test") 44 | defer os.Remove(f1.Name()) 45 | defer os.Remove(f2.Name()) 46 | defer os.Remove(f3.Name()) 47 | 48 | out1 := f1.Name() 49 | out2 := f2.Name() 50 | out3 := f3.Name() 51 | 52 | buildIndex(out1, mergePaths1, mergeFiles1) 53 | buildIndex(out2, mergePaths2, mergeFiles2) 54 | 55 | Merge(out3, out1, out2) 56 | 57 | ix1 := Open(out1) 58 | ix2 := Open(out2) 59 | ix3 := Open(out3) 60 | 61 | nameof := func(ix *Index) string { 62 | switch { 63 | case ix == ix1: 64 | return "ix1" 65 | case ix == ix2: 66 | return "ix2" 67 | case ix == ix3: 68 | return "ix3" 69 | } 70 | return "???" 71 | } 72 | 73 | checkFiles := func(ix *Index, l ...string) { 74 | for i, s := range l { 75 | if n := ix.Name(uint32(i)); n != s { 76 | t.Errorf("%s: Name(%d) = %s, want %s", nameof(ix), i, n, s) 77 | } 78 | } 79 | } 80 | 81 | checkFiles(ix1, "/a/x", "/a/y", "/b/xx", "/b/xy", "/c/ab", "/c/de") 82 | checkFiles(ix2, "/b/www", "/b/xx", "/b/yy", "/cc") 83 | checkFiles(ix3, "/a/x", "/a/y", "/b/www", "/b/xx", "/b/yy", "/c/ab", "/c/de", "/cc") 84 | 85 | check := func(ix *Index, trig string, l ...uint32) { 86 | l1 := ix.PostingList(tri(trig[0], trig[1], trig[2])) 87 | if !equalList(l1, l) { 88 | t.Errorf("PostingList(%s, %s) = %v, want %v", nameof(ix), trig, l1, l) 89 | } 90 | } 91 | 92 | check(ix1, "wor", 0, 1) 93 | check(ix1, "now", 2, 5) 94 | check(ix1, "all", 3, 4) 95 | 96 | check(ix2, "now", 1, 2) 97 | 98 | check(ix3, "all", 5) 99 | check(ix3, "wor", 0, 1, 2) 100 | check(ix3, "now", 3, 4, 6) 101 | check(ix3, "pot", 4, 5, 7) 102 | } 103 | -------------------------------------------------------------------------------- /index/mmap_bsd.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build darwin freebsd openbsd netbsd 6 | 7 | package index 8 | 9 | import ( 10 | "log" 11 | "os" 12 | "syscall" 13 | ) 14 | 15 | // missing from package syscall on freebsd, openbsd 16 | const ( 17 | _PROT_READ = 1 18 | _MAP_SHARED = 1 19 | ) 20 | 21 | func mmapFile(f *os.File) mmapData { 22 | st, err := f.Stat() 23 | if err != nil { 24 | log.Fatal(err) 25 | } 26 | size := st.Size() 27 | if int64(int(size+4095)) != size+4095 { 28 | log.Fatalf("%s: too large for mmap", f.Name()) 29 | } 30 | n := int(size) 31 | if n == 0 { 32 | return mmapData{f, nil} 33 | } 34 | data, err := syscall.Mmap(int(f.Fd()), 0, (n+4095)&^4095, _PROT_READ, _MAP_SHARED) 35 | if err != nil { 36 | log.Fatalf("mmap %s: %v", f.Name(), err) 37 | } 38 | return mmapData{f, data[:n]} 39 | } 40 | -------------------------------------------------------------------------------- /index/mmap_linux.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "log" 9 | "os" 10 | "syscall" 11 | ) 12 | 13 | func mmapFile(f *os.File) mmapData { 14 | st, err := f.Stat() 15 | if err != nil { 16 | log.Fatal(err) 17 | } 18 | size := st.Size() 19 | if int64(int(size+4095)) != size+4095 { 20 | log.Fatalf("%s: too large for mmap", f.Name()) 21 | } 22 | n := int(size) 23 | if n == 0 { 24 | return mmapData{f, nil} 25 | } 26 | data, err := syscall.Mmap(int(f.Fd()), 0, (n+4095)&^4095, syscall.PROT_READ, syscall.MAP_SHARED) 27 | if err != nil { 28 | log.Fatalf("mmap %s: %v", f.Name(), err) 29 | } 30 | return mmapData{f, data[:n]} 31 | } 32 | -------------------------------------------------------------------------------- /index/mmap_windows.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "log" 9 | "os" 10 | "syscall" 11 | "unsafe" 12 | ) 13 | 14 | func mmapFile(f *os.File) mmapData { 15 | st, err := f.Stat() 16 | if err != nil { 17 | log.Fatal(err) 18 | } 19 | size := st.Size() 20 | if int64(int(size+4095)) != size+4095 { 21 | log.Fatalf("%s: too large for mmap", f.Name()) 22 | } 23 | if size == 0 { 24 | return mmapData{f, nil} 25 | } 26 | h, err := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, uint32(size>>32), uint32(size), nil) 27 | if err != nil { 28 | log.Fatalf("CreateFileMapping %s: %v", f.Name(), err) 29 | } 30 | 31 | addr, err := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, 0) 32 | if err != nil { 33 | log.Fatalf("MapViewOfFile %s: %v", f.Name(), err) 34 | } 35 | data := (*[1 << 30]byte)(unsafe.Pointer(addr)) 36 | return mmapData{f, data[:size]} 37 | } 38 | -------------------------------------------------------------------------------- /index/read.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | // Index format. 8 | // 9 | // An index stored on disk has the format: 10 | // 11 | // "csearch index 1\n" 12 | // list of paths 13 | // list of names 14 | // list of posting lists 15 | // name index 16 | // posting list index 17 | // trailer 18 | // 19 | // The list of paths is a sorted sequence of NUL-terminated file or directory names. 20 | // The index covers the file trees rooted at those paths. 21 | // The list ends with an empty name ("\x00"). 22 | // 23 | // The list of names is a sorted sequence of NUL-terminated file names. 24 | // The initial entry in the list corresponds to file #0, 25 | // the next to file #1, and so on. The list ends with an 26 | // empty name ("\x00"). 27 | // 28 | // The list of posting lists are a sequence of posting lists. 29 | // Each posting list has the form: 30 | // 31 | // trigram [3] 32 | // deltas [v]... 33 | // 34 | // The trigram gives the 3 byte trigram that this list describes. The 35 | // delta list is a sequence of varint-encoded deltas between file 36 | // IDs, ending with a zero delta. For example, the delta list [2,5,1,1,0] 37 | // encodes the file ID list 1, 6, 7, 8. The delta list [0] would 38 | // encode the empty file ID list, but empty posting lists are usually 39 | // not recorded at all. The list of posting lists ends with an entry 40 | // with trigram "\xff\xff\xff" and a delta list consisting a single zero. 41 | // 42 | // The indexes enable efficient random access to the lists. The name 43 | // index is a sequence of 4-byte big-endian values listing the byte 44 | // offset in the name list where each name begins. The posting list 45 | // index is a sequence of index entries describing each successive 46 | // posting list. Each index entry has the form: 47 | // 48 | // trigram [3] 49 | // file count [4] 50 | // offset [4] 51 | // 52 | // Index entries are only written for the non-empty posting lists, 53 | // so finding the posting list for a specific trigram requires a 54 | // binary search over the posting list index. In practice, the majority 55 | // of the possible trigrams are never seen, so omitting the missing 56 | // ones represents a significant storage savings. 57 | // 58 | // The trailer has the form: 59 | // 60 | // offset of path list [4] 61 | // offset of name list [4] 62 | // offset of posting lists [4] 63 | // offset of name index [4] 64 | // offset of posting list index [4] 65 | // "\ncsearch trailr\n" 66 | 67 | import ( 68 | "bytes" 69 | "encoding/binary" 70 | "log" 71 | "os" 72 | "path/filepath" 73 | "runtime" 74 | "sort" 75 | ) 76 | 77 | const ( 78 | magic = "csearch index 1\n" 79 | trailerMagic = "\ncsearch trailr\n" 80 | ) 81 | 82 | // An Index implements read-only access to a trigram index. 83 | type Index struct { 84 | Verbose bool 85 | data mmapData 86 | pathData uint32 87 | nameData uint32 88 | postData uint32 89 | nameIndex uint32 90 | postIndex uint32 91 | numName int 92 | numPost int 93 | } 94 | 95 | const postEntrySize = 3 + 4 + 4 96 | 97 | func Open(file string) *Index { 98 | mm := mmap(file) 99 | if len(mm.d) < 4*4+len(trailerMagic) || string(mm.d[len(mm.d)-len(trailerMagic):]) != trailerMagic { 100 | corrupt() 101 | } 102 | n := uint32(len(mm.d) - len(trailerMagic) - 5*4) 103 | ix := &Index{data: mm} 104 | ix.pathData = ix.uint32(n) 105 | ix.nameData = ix.uint32(n + 4) 106 | ix.postData = ix.uint32(n + 8) 107 | ix.nameIndex = ix.uint32(n + 12) 108 | ix.postIndex = ix.uint32(n + 16) 109 | ix.numName = int((ix.postIndex-ix.nameIndex)/4) - 1 110 | ix.numPost = int((n - ix.postIndex) / postEntrySize) 111 | return ix 112 | } 113 | 114 | // slice returns the slice of index data starting at the given byte offset. 115 | // If n >= 0, the slice must have length at least n and is truncated to length n. 116 | func (ix *Index) slice(off uint32, n int) []byte { 117 | o := int(off) 118 | if uint32(o) != off || n >= 0 && o+n > len(ix.data.d) { 119 | corrupt() 120 | } 121 | if n < 0 { 122 | return ix.data.d[o:] 123 | } 124 | return ix.data.d[o : o+n] 125 | } 126 | 127 | // uint32 returns the uint32 value at the given offset in the index data. 128 | func (ix *Index) uint32(off uint32) uint32 { 129 | return binary.BigEndian.Uint32(ix.slice(off, 4)) 130 | } 131 | 132 | // uvarint returns the varint value at the given offset in the index data. 133 | func (ix *Index) uvarint(off uint32) uint32 { 134 | v, n := binary.Uvarint(ix.slice(off, -1)) 135 | if n <= 0 { 136 | corrupt() 137 | } 138 | return uint32(v) 139 | } 140 | 141 | // Paths returns the list of indexed paths. 142 | func (ix *Index) Paths() []string { 143 | off := ix.pathData 144 | var x []string 145 | for { 146 | s := ix.str(off) 147 | if len(s) == 0 { 148 | break 149 | } 150 | x = append(x, string(s)) 151 | off += uint32(len(s) + 1) 152 | } 153 | return x 154 | } 155 | 156 | // NameBytes returns the name corresponding to the given fileid. 157 | func (ix *Index) NameBytes(fileid uint32) []byte { 158 | off := ix.uint32(ix.nameIndex + 4*fileid) 159 | return ix.str(ix.nameData + off) 160 | } 161 | 162 | func (ix *Index) str(off uint32) []byte { 163 | str := ix.slice(off, -1) 164 | i := bytes.IndexByte(str, '\x00') 165 | if i < 0 { 166 | corrupt() 167 | } 168 | return str[:i] 169 | } 170 | 171 | // Name returns the name corresponding to the given fileid. 172 | func (ix *Index) Name(fileid uint32) string { 173 | return string(ix.NameBytes(fileid)) 174 | } 175 | 176 | // listAt returns the index list entry at the given offset. 177 | func (ix *Index) listAt(off uint32) (trigram, count, offset uint32) { 178 | d := ix.slice(ix.postIndex+off, postEntrySize) 179 | trigram = uint32(d[0])<<16 | uint32(d[1])<<8 | uint32(d[2]) 180 | count = binary.BigEndian.Uint32(d[3:]) 181 | offset = binary.BigEndian.Uint32(d[3+4:]) 182 | return 183 | } 184 | 185 | func (ix *Index) dumpPosting() { 186 | d := ix.slice(ix.postIndex, postEntrySize*ix.numPost) 187 | for i := 0; i < ix.numPost; i++ { 188 | j := i * postEntrySize 189 | t := uint32(d[j])<<16 | uint32(d[j+1])<<8 | uint32(d[j+2]) 190 | count := int(binary.BigEndian.Uint32(d[j+3:])) 191 | offset := binary.BigEndian.Uint32(d[j+3+4:]) 192 | log.Printf("%#x: %d at %d", t, count, offset) 193 | } 194 | } 195 | 196 | func (ix *Index) findList(trigram uint32) (count int, offset uint32) { 197 | // binary search 198 | d := ix.slice(ix.postIndex, postEntrySize*ix.numPost) 199 | i := sort.Search(ix.numPost, func(i int) bool { 200 | i *= postEntrySize 201 | t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2]) 202 | return t >= trigram 203 | }) 204 | if i >= ix.numPost { 205 | return 0, 0 206 | } 207 | i *= postEntrySize 208 | t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2]) 209 | if t != trigram { 210 | return 0, 0 211 | } 212 | count = int(binary.BigEndian.Uint32(d[i+3:])) 213 | offset = binary.BigEndian.Uint32(d[i+3+4:]) 214 | return 215 | } 216 | 217 | type postReader struct { 218 | ix *Index 219 | count int 220 | offset uint32 221 | fileid uint32 222 | d []byte 223 | restrict []uint32 224 | } 225 | 226 | func (r *postReader) init(ix *Index, trigram uint32, restrict []uint32) { 227 | count, offset := ix.findList(trigram) 228 | if count == 0 { 229 | return 230 | } 231 | r.ix = ix 232 | r.count = count 233 | r.offset = offset 234 | r.fileid = ^uint32(0) 235 | r.d = ix.slice(ix.postData+offset+3, -1) 236 | r.restrict = restrict 237 | } 238 | 239 | func (r *postReader) max() int { 240 | return int(r.count) 241 | } 242 | 243 | func (r *postReader) next() bool { 244 | for r.count > 0 { 245 | r.count-- 246 | delta64, n := binary.Uvarint(r.d) 247 | delta := uint32(delta64) 248 | if n <= 0 || delta == 0 { 249 | corrupt() 250 | } 251 | r.d = r.d[n:] 252 | r.fileid += delta 253 | if r.restrict != nil { 254 | i := 0 255 | for i < len(r.restrict) && r.restrict[i] < r.fileid { 256 | i++ 257 | } 258 | r.restrict = r.restrict[i:] 259 | if len(r.restrict) == 0 || r.restrict[0] != r.fileid { 260 | continue 261 | } 262 | } 263 | return true 264 | } 265 | // list should end with terminating 0 delta 266 | if r.d != nil && (len(r.d) == 0 || r.d[0] != 0) { 267 | corrupt() 268 | } 269 | r.fileid = ^uint32(0) 270 | return false 271 | } 272 | 273 | func (ix *Index) PostingList(trigram uint32) []uint32 { 274 | return ix.postingList(trigram, nil) 275 | } 276 | 277 | func (ix *Index) postingList(trigram uint32, restrict []uint32) []uint32 { 278 | var r postReader 279 | r.init(ix, trigram, restrict) 280 | x := make([]uint32, 0, r.max()) 281 | for r.next() { 282 | x = append(x, r.fileid) 283 | } 284 | return x 285 | } 286 | 287 | func (ix *Index) PostingAnd(list []uint32, trigram uint32) []uint32 { 288 | return ix.postingAnd(list, trigram, nil) 289 | } 290 | 291 | func (ix *Index) postingAnd(list []uint32, trigram uint32, restrict []uint32) []uint32 { 292 | var r postReader 293 | r.init(ix, trigram, restrict) 294 | x := list[:0] 295 | i := 0 296 | for r.next() { 297 | fileid := r.fileid 298 | for i < len(list) && list[i] < fileid { 299 | i++ 300 | } 301 | if i < len(list) && list[i] == fileid { 302 | x = append(x, fileid) 303 | i++ 304 | } 305 | } 306 | return x 307 | } 308 | 309 | func (ix *Index) PostingOr(list []uint32, trigram uint32) []uint32 { 310 | return ix.postingOr(list, trigram, nil) 311 | } 312 | 313 | func (ix *Index) postingOr(list []uint32, trigram uint32, restrict []uint32) []uint32 { 314 | var r postReader 315 | r.init(ix, trigram, restrict) 316 | x := make([]uint32, 0, len(list)+r.max()) 317 | i := 0 318 | for r.next() { 319 | fileid := r.fileid 320 | for i < len(list) && list[i] < fileid { 321 | x = append(x, list[i]) 322 | i++ 323 | } 324 | x = append(x, fileid) 325 | if i < len(list) && list[i] == fileid { 326 | i++ 327 | } 328 | } 329 | x = append(x, list[i:]...) 330 | return x 331 | } 332 | 333 | func (ix *Index) PostingQuery(q *Query) []uint32 { 334 | return ix.postingQuery(q, nil) 335 | } 336 | 337 | func (ix *Index) postingQuery(q *Query, restrict []uint32) (ret []uint32) { 338 | var list []uint32 339 | switch q.Op { 340 | case QNone: 341 | // nothing 342 | case QAll: 343 | if restrict != nil { 344 | return restrict 345 | } 346 | list = make([]uint32, ix.numName) 347 | for i := range list { 348 | list[i] = uint32(i) 349 | } 350 | return list 351 | case QAnd: 352 | for _, t := range q.Trigram { 353 | tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]) 354 | if list == nil { 355 | list = ix.postingList(tri, restrict) 356 | } else { 357 | list = ix.postingAnd(list, tri, restrict) 358 | } 359 | if len(list) == 0 { 360 | return nil 361 | } 362 | } 363 | for _, sub := range q.Sub { 364 | if list == nil { 365 | list = restrict 366 | } 367 | list = ix.postingQuery(sub, list) 368 | if len(list) == 0 { 369 | return nil 370 | } 371 | } 372 | case QOr: 373 | for _, t := range q.Trigram { 374 | tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]) 375 | if list == nil { 376 | list = ix.postingList(tri, restrict) 377 | } else { 378 | list = ix.postingOr(list, tri, restrict) 379 | } 380 | } 381 | for _, sub := range q.Sub { 382 | list1 := ix.postingQuery(sub, restrict) 383 | list = mergeOr(list, list1) 384 | } 385 | } 386 | return list 387 | } 388 | 389 | func mergeOr(l1, l2 []uint32) []uint32 { 390 | var l []uint32 391 | i := 0 392 | j := 0 393 | for i < len(l1) || j < len(l2) { 394 | switch { 395 | case j == len(l2) || (i < len(l1) && l1[i] < l2[j]): 396 | l = append(l, l1[i]) 397 | i++ 398 | case i == len(l1) || (j < len(l2) && l1[i] > l2[j]): 399 | l = append(l, l2[j]) 400 | j++ 401 | case l1[i] == l2[j]: 402 | l = append(l, l1[i]) 403 | i++ 404 | j++ 405 | } 406 | } 407 | return l 408 | } 409 | 410 | func corrupt() { 411 | log.Fatal("corrupt index: remove " + File()) 412 | } 413 | 414 | // An mmapData is mmap'ed read-only data from a file. 415 | type mmapData struct { 416 | f *os.File 417 | d []byte 418 | } 419 | 420 | // mmap maps the given file into memory. 421 | func mmap(file string) mmapData { 422 | f, err := os.Open(file) 423 | if err != nil { 424 | log.Fatal(err) 425 | } 426 | return mmapFile(f) 427 | } 428 | 429 | // File returns the name of the index file to use. 430 | // It is either $CSEARCHINDEX, a .csearchindex file in the PWD or an 431 | // ancestor dir, or $HOME/.csearchindex as a last resort 432 | func File() string { 433 | f := os.Getenv("CSEARCHINDEX") 434 | if f != "" { 435 | return f 436 | } 437 | 438 | var home string 439 | home = os.Getenv("HOME") 440 | if runtime.GOOS == "windows" && home == "" { 441 | home = os.Getenv("USERPROFILE") 442 | } 443 | home = filepath.Join(home, ".csearchindex") 444 | 445 | pwd, err := filepath.Abs(".") 446 | if err != nil { 447 | return home 448 | } 449 | 450 | for { 451 | candidate := filepath.Join(pwd, ".csearchindex") 452 | f, err := os.Open(candidate) 453 | f.Close() 454 | 455 | // found one! 456 | if err == nil { 457 | return candidate 458 | } 459 | 460 | newPwd := filepath.Dir(pwd) 461 | if newPwd == pwd { 462 | // hit the root dir 463 | break 464 | } 465 | pwd = newPwd 466 | } 467 | 468 | return home 469 | 470 | } 471 | -------------------------------------------------------------------------------- /index/read_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "io/ioutil" 9 | "os" 10 | "testing" 11 | ) 12 | 13 | var postFiles = map[string]string{ 14 | "file0": "", 15 | "file1": "Google Code Search", 16 | "file2": "Google Code Project Hosting", 17 | "file3": "Google Web Search", 18 | } 19 | 20 | func tri(x, y, z byte) uint32 { 21 | return uint32(x)<<16 | uint32(y)<<8 | uint32(z) 22 | } 23 | 24 | func TestTrivialPosting(t *testing.T) { 25 | f, _ := ioutil.TempFile("", "index-test") 26 | defer os.Remove(f.Name()) 27 | out := f.Name() 28 | buildIndex(out, nil, postFiles) 29 | ix := Open(out) 30 | if l := ix.PostingList(tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) { 31 | t.Errorf("PostingList(Sea) = %v, want [1 3]", l) 32 | } 33 | if l := ix.PostingList(tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) { 34 | t.Errorf("PostingList(Goo) = %v, want [1 2 3]", l) 35 | } 36 | if l := ix.PostingAnd(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 3}) { 37 | t.Errorf("PostingList(Sea&Goo) = %v, want [1 3]", l) 38 | } 39 | if l := ix.PostingAnd(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 3}) { 40 | t.Errorf("PostingList(Goo&Sea) = %v, want [1 3]", l) 41 | } 42 | if l := ix.PostingOr(ix.PostingList(tri('S', 'e', 'a')), tri('G', 'o', 'o')); !equalList(l, []uint32{1, 2, 3}) { 43 | t.Errorf("PostingList(Sea|Goo) = %v, want [1 2 3]", l) 44 | } 45 | if l := ix.PostingOr(ix.PostingList(tri('G', 'o', 'o')), tri('S', 'e', 'a')); !equalList(l, []uint32{1, 2, 3}) { 46 | t.Errorf("PostingList(Goo|Sea) = %v, want [1 2 3]", l) 47 | } 48 | } 49 | 50 | func equalList(x, y []uint32) bool { 51 | if len(x) != len(y) { 52 | return false 53 | } 54 | for i, xi := range x { 55 | if xi != y[i] { 56 | return false 57 | } 58 | } 59 | return true 60 | } 61 | -------------------------------------------------------------------------------- /index/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "regexp/syntax" 9 | "sort" 10 | "strconv" 11 | "strings" 12 | "unicode" 13 | ) 14 | 15 | // A Query is a matching machine, like a regular expression, 16 | // that matches some text and not other text. When we compute a 17 | // Query from a regexp, the Query is a conservative version of the 18 | // regexp: it matches everything the regexp would match, and probably 19 | // quite a bit more. We can then filter target files by whether they match 20 | // the Query (using a trigram index) before running the comparatively 21 | // more expensive regexp machinery. 22 | type Query struct { 23 | Op QueryOp 24 | Trigram []string 25 | Sub []*Query 26 | } 27 | 28 | type QueryOp int 29 | 30 | const ( 31 | QAll QueryOp = iota // Everything matches 32 | QNone // Nothing matches 33 | QAnd // All in Sub and Trigram must match 34 | QOr // At least one in Sub or Trigram must match 35 | ) 36 | 37 | var allQuery = &Query{Op: QAll} 38 | var noneQuery = &Query{Op: QNone} 39 | 40 | // and returns the query q AND r, possibly reusing q's and r's storage. 41 | func (q *Query) and(r *Query) *Query { 42 | return q.andOr(r, QAnd) 43 | } 44 | 45 | // or returns the query q OR r, possibly reusing q's and r's storage. 46 | func (q *Query) or(r *Query) *Query { 47 | return q.andOr(r, QOr) 48 | } 49 | 50 | // andOr returns the query q AND r or q OR r, possibly reusing q's and r's storage. 51 | // It works hard to avoid creating unnecessarily complicated structures. 52 | func (q *Query) andOr(r *Query, op QueryOp) (out *Query) { 53 | opstr := "&" 54 | if op == QOr { 55 | opstr = "|" 56 | } 57 | //println("andOr", q.String(), opstr, r.String()) 58 | //defer func() { println(" ->", out.String()) }() 59 | _ = opstr 60 | 61 | if len(q.Trigram) == 0 && len(q.Sub) == 1 { 62 | q = q.Sub[0] 63 | } 64 | if len(r.Trigram) == 0 && len(r.Sub) == 1 { 65 | r = r.Sub[0] 66 | } 67 | 68 | // Boolean simplification. 69 | // If q ⇒ r, q AND r ≡ q. 70 | // If q ⇒ r, q OR r ≡ r. 71 | if q.implies(r) { 72 | //println(q.String(), "implies", r.String()) 73 | if op == QAnd { 74 | return q 75 | } 76 | return r 77 | } 78 | if r.implies(q) { 79 | //println(r.String(), "implies", q.String()) 80 | if op == QAnd { 81 | return r 82 | } 83 | return q 84 | } 85 | 86 | // Both q and r are QAnd or QOr. 87 | // If they match or can be made to match, merge. 88 | qAtom := len(q.Trigram) == 1 && len(q.Sub) == 0 89 | rAtom := len(r.Trigram) == 1 && len(r.Sub) == 0 90 | if q.Op == op && (r.Op == op || rAtom) { 91 | q.Trigram = stringSet.union(q.Trigram, r.Trigram, false) 92 | q.Sub = append(q.Sub, r.Sub...) 93 | return q 94 | } 95 | if r.Op == op && qAtom { 96 | r.Trigram = stringSet.union(r.Trigram, q.Trigram, false) 97 | return r 98 | } 99 | if qAtom && rAtom { 100 | q.Op = op 101 | q.Trigram = append(q.Trigram, r.Trigram...) 102 | return q 103 | } 104 | 105 | // If one matches the op, add the other to it. 106 | if q.Op == op { 107 | q.Sub = append(q.Sub, r) 108 | return q 109 | } 110 | if r.Op == op { 111 | r.Sub = append(r.Sub, q) 112 | return r 113 | } 114 | 115 | // We are creating an AND of ORs or an OR of ANDs. 116 | // Factor out common trigrams, if any. 117 | common := stringSet{} 118 | i, j := 0, 0 119 | wi, wj := 0, 0 120 | for i < len(q.Trigram) && j < len(r.Trigram) { 121 | qt, rt := q.Trigram[i], r.Trigram[j] 122 | if qt < rt { 123 | q.Trigram[wi] = qt 124 | wi++ 125 | i++ 126 | } else if qt > rt { 127 | r.Trigram[wj] = rt 128 | wj++ 129 | j++ 130 | } else { 131 | common = append(common, qt) 132 | i++ 133 | j++ 134 | } 135 | } 136 | for ; i < len(q.Trigram); i++ { 137 | q.Trigram[wi] = q.Trigram[i] 138 | wi++ 139 | } 140 | for ; j < len(r.Trigram); j++ { 141 | r.Trigram[wj] = r.Trigram[j] 142 | wj++ 143 | } 144 | q.Trigram = q.Trigram[:wi] 145 | r.Trigram = r.Trigram[:wj] 146 | if len(common) > 0 { 147 | // If there were common trigrams, rewrite 148 | // 149 | // (abc|def|ghi|jkl) AND (abc|def|mno|prs) => 150 | // (abc|def) OR ((ghi|jkl) AND (mno|prs)) 151 | // 152 | // (abc&def&ghi&jkl) OR (abc&def&mno&prs) => 153 | // (abc&def) AND ((ghi&jkl) OR (mno&prs)) 154 | // 155 | // Build up the right one of 156 | // (ghi|jkl) AND (mno|prs) 157 | // (ghi&jkl) OR (mno&prs) 158 | // Call andOr recursively in case q and r can now be simplified 159 | // (we removed some trigrams). 160 | s := q.andOr(r, op) 161 | 162 | // Add in factored trigrams. 163 | otherOp := QAnd + QOr - op 164 | t := &Query{Op: otherOp, Trigram: common} 165 | return t.andOr(s, t.Op) 166 | } 167 | 168 | // Otherwise just create the op. 169 | return &Query{Op: op, Sub: []*Query{q, r}} 170 | } 171 | 172 | // implies reports whether q implies r. 173 | // It is okay for it to return false negatives. 174 | func (q *Query) implies(r *Query) bool { 175 | if q.Op == QNone || r.Op == QAll { 176 | // False implies everything. 177 | // Everything implies True. 178 | return true 179 | } 180 | if q.Op == QAll || r.Op == QNone { 181 | // True implies nothing. 182 | // Nothing implies False. 183 | return false 184 | } 185 | 186 | if q.Op == QAnd || (q.Op == QOr && len(q.Trigram) == 1 && len(q.Sub) == 0) { 187 | return trigramsImply(q.Trigram, r) 188 | } 189 | 190 | if q.Op == QOr && r.Op == QOr && 191 | len(q.Trigram) > 0 && len(q.Sub) == 0 && 192 | stringSet.isSubsetOf(q.Trigram, r.Trigram) { 193 | return true 194 | } 195 | return false 196 | } 197 | 198 | func trigramsImply(t []string, q *Query) bool { 199 | switch q.Op { 200 | case QOr: 201 | for _, qq := range q.Sub { 202 | if trigramsImply(t, qq) { 203 | return true 204 | } 205 | } 206 | for i := range t { 207 | if stringSet.isSubsetOf(t[i:i+1], q.Trigram) { 208 | return true 209 | } 210 | } 211 | return false 212 | case QAnd: 213 | for _, qq := range q.Sub { 214 | if !trigramsImply(t, qq) { 215 | return false 216 | } 217 | } 218 | if !stringSet.isSubsetOf(q.Trigram, t) { 219 | return false 220 | } 221 | return true 222 | } 223 | return false 224 | } 225 | 226 | // maybeRewrite rewrites q to use op if it is possible to do so 227 | // without changing the meaning. It also simplifies if the node 228 | // is an empty OR or AND. 229 | func (q *Query) maybeRewrite(op QueryOp) { 230 | if q.Op != QAnd && q.Op != QOr { 231 | return 232 | } 233 | 234 | // AND/OR doing real work? Can't rewrite. 235 | n := len(q.Sub) + len(q.Trigram) 236 | if n > 1 { 237 | return 238 | } 239 | 240 | // Nothing left in the AND/OR? 241 | if n == 0 { 242 | if q.Op == QAnd { 243 | q.Op = QAll 244 | } else { 245 | q.Op = QNone 246 | } 247 | return 248 | } 249 | 250 | // Just a sub-node: throw away wrapper. 251 | if len(q.Sub) == 1 { 252 | *q = *q.Sub[0] 253 | } 254 | 255 | // Just a trigram: can use either op. 256 | q.Op = op 257 | } 258 | 259 | // andTrigrams returns q AND the OR of the AND of the trigrams present in each string. 260 | func (q *Query) andTrigrams(t stringSet) *Query { 261 | if t.minLen() < 3 { 262 | // If there is a short string, we can't guarantee 263 | // that any trigrams must be present, so use ALL. 264 | // q AND ALL = q. 265 | return q 266 | } 267 | 268 | //println("andtrigrams", strings.Join(t, ",")) 269 | or := noneQuery 270 | for _, tt := range t { 271 | var trig stringSet 272 | for i := 0; i+3 <= len(tt); i++ { 273 | trig.add(tt[i : i+3]) 274 | } 275 | trig.clean(false) 276 | //println(tt, "trig", strings.Join(trig, ",")) 277 | or = or.or(&Query{Op: QAnd, Trigram: trig}) 278 | } 279 | q = q.and(or) 280 | return q 281 | } 282 | 283 | func (q *Query) String() string { 284 | if q == nil { 285 | return "?" 286 | } 287 | if q.Op == QNone { 288 | return "-" 289 | } 290 | if q.Op == QAll { 291 | return "+" 292 | } 293 | 294 | if len(q.Sub) == 0 && len(q.Trigram) == 1 { 295 | return strconv.Quote(q.Trigram[0]) 296 | } 297 | 298 | var ( 299 | s string 300 | sjoin string 301 | end string 302 | tjoin string 303 | ) 304 | if q.Op == QAnd { 305 | sjoin = " " 306 | tjoin = " " 307 | } else { 308 | s = "(" 309 | sjoin = ")|(" 310 | end = ")" 311 | tjoin = "|" 312 | } 313 | for i, t := range q.Trigram { 314 | if i > 0 { 315 | s += tjoin 316 | } 317 | s += strconv.Quote(t) 318 | } 319 | if len(q.Sub) > 0 { 320 | if len(q.Trigram) > 0 { 321 | s += sjoin 322 | } 323 | s += q.Sub[0].String() 324 | for i := 1; i < len(q.Sub); i++ { 325 | s += sjoin + q.Sub[i].String() 326 | } 327 | } 328 | s += end 329 | return s 330 | } 331 | 332 | // RegexpQuery returns a Query for the given regexp. 333 | func RegexpQuery(re *syntax.Regexp) *Query { 334 | info := analyze(re) 335 | info.simplify(true) 336 | info.addExact() 337 | return info.match 338 | } 339 | 340 | // A regexpInfo summarizes the results of analyzing a regexp. 341 | type regexpInfo struct { 342 | // canEmpty records whether the regexp matches the empty string 343 | canEmpty bool 344 | 345 | // exact is the exact set of strings matching the regexp. 346 | exact stringSet 347 | 348 | // if exact is nil, prefix is the set of possible match prefixes, 349 | // and suffix is the set of possible match suffixes. 350 | prefix stringSet // otherwise: the exact set of matching prefixes ... 351 | suffix stringSet // ... and suffixes 352 | 353 | // match records a query that must be satisfied by any 354 | // match for the regexp, in addition to the information 355 | // recorded above. 356 | match *Query 357 | } 358 | 359 | const ( 360 | // Exact sets are limited to maxExact strings. 361 | // If they get too big, simplify will rewrite the regexpInfo 362 | // to use prefix and suffix instead. It's not worthwhile for 363 | // this to be bigger than maxSet. 364 | // Because we allow the maximum length of an exact string 365 | // to grow to 5 below (see simplify), it helps to avoid ridiculous 366 | // alternations if maxExact is sized so that 3 case-insensitive letters 367 | // triggers a flush. 368 | maxExact = 7 369 | 370 | // Prefix and suffix sets are limited to maxSet strings. 371 | // If they get too big, simplify will replace groups of strings 372 | // sharing a common leading prefix (or trailing suffix) with 373 | // that common prefix (or suffix). It is useful for maxSet 374 | // to be at least 2³ = 8 so that we can exactly 375 | // represent a case-insensitive abc by the set 376 | // {abc, abC, aBc, aBC, Abc, AbC, ABc, ABC}. 377 | maxSet = 20 378 | ) 379 | 380 | // anyMatch returns the regexpInfo describing a regexp that 381 | // matches any string. 382 | func anyMatch() regexpInfo { 383 | return regexpInfo{ 384 | canEmpty: true, 385 | prefix: []string{""}, 386 | suffix: []string{""}, 387 | match: allQuery, 388 | } 389 | } 390 | 391 | // anyChar returns the regexpInfo describing a regexp that 392 | // matches any single character. 393 | func anyChar() regexpInfo { 394 | return regexpInfo{ 395 | prefix: []string{""}, 396 | suffix: []string{""}, 397 | match: allQuery, 398 | } 399 | } 400 | 401 | // noMatch returns the regexpInfo describing a regexp that 402 | // matches no strings at all. 403 | func noMatch() regexpInfo { 404 | return regexpInfo{ 405 | match: noneQuery, 406 | } 407 | } 408 | 409 | // emptyString returns the regexpInfo describing a regexp that 410 | // matches only the empty string. 411 | func emptyString() regexpInfo { 412 | return regexpInfo{ 413 | canEmpty: true, 414 | exact: []string{""}, 415 | match: allQuery, 416 | } 417 | } 418 | 419 | // analyze returns the regexpInfo for the regexp re. 420 | func analyze(re *syntax.Regexp) (ret regexpInfo) { 421 | //println("analyze", re.String()) 422 | //defer func() { println("->", ret.String()) }() 423 | var info regexpInfo 424 | switch re.Op { 425 | case syntax.OpNoMatch: 426 | return noMatch() 427 | 428 | case syntax.OpEmptyMatch, 429 | syntax.OpBeginLine, syntax.OpEndLine, 430 | syntax.OpBeginText, syntax.OpEndText, 431 | syntax.OpWordBoundary, syntax.OpNoWordBoundary: 432 | return emptyString() 433 | 434 | case syntax.OpLiteral: 435 | if re.Flags&syntax.FoldCase != 0 { 436 | switch len(re.Rune) { 437 | case 0: 438 | return emptyString() 439 | case 1: 440 | // Single-letter case-folded string: 441 | // rewrite into char class and analyze. 442 | re1 := &syntax.Regexp{ 443 | Op: syntax.OpCharClass, 444 | } 445 | re1.Rune = re1.Rune0[:0] 446 | r0 := re.Rune[0] 447 | re1.Rune = append(re1.Rune, r0, r0) 448 | for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 449 | re1.Rune = append(re1.Rune, r1, r1) 450 | } 451 | info = analyze(re1) 452 | return info 453 | } 454 | // Multi-letter case-folded string: 455 | // treat as concatenation of single-letter case-folded strings. 456 | re1 := &syntax.Regexp{ 457 | Op: syntax.OpLiteral, 458 | Flags: syntax.FoldCase, 459 | } 460 | info = emptyString() 461 | for i := range re.Rune { 462 | re1.Rune = re.Rune[i : i+1] 463 | info = concat(info, analyze(re1)) 464 | } 465 | return info 466 | } 467 | info.exact = stringSet{string(re.Rune)} 468 | info.match = allQuery 469 | 470 | case syntax.OpAnyCharNotNL, syntax.OpAnyChar: 471 | return anyChar() 472 | 473 | case syntax.OpCapture: 474 | return analyze(re.Sub[0]) 475 | 476 | case syntax.OpConcat: 477 | return fold(concat, re.Sub, emptyString()) 478 | 479 | case syntax.OpAlternate: 480 | return fold(alternate, re.Sub, noMatch()) 481 | 482 | case syntax.OpQuest: 483 | return alternate(analyze(re.Sub[0]), emptyString()) 484 | 485 | case syntax.OpStar: 486 | // We don't know anything, so assume the worst. 487 | return anyMatch() 488 | 489 | case syntax.OpRepeat: 490 | if re.Min == 0 { 491 | // Like OpStar 492 | return anyMatch() 493 | } 494 | fallthrough 495 | case syntax.OpPlus: 496 | // x+ 497 | // Since there has to be at least one x, the prefixes and suffixes 498 | // stay the same. If x was exact, it isn't anymore. 499 | info = analyze(re.Sub[0]) 500 | if info.exact.have() { 501 | info.prefix = info.exact 502 | info.suffix = info.exact.copy() 503 | info.exact = nil 504 | } 505 | 506 | case syntax.OpCharClass: 507 | info.match = allQuery 508 | 509 | // Special case. 510 | if len(re.Rune) == 0 { 511 | return noMatch() 512 | } 513 | 514 | // Special case. 515 | if len(re.Rune) == 1 { 516 | info.exact = stringSet{string(re.Rune[0])} 517 | break 518 | } 519 | 520 | n := 0 521 | for i := 0; i < len(re.Rune); i += 2 { 522 | n += int(re.Rune[i+1] - re.Rune[i]) 523 | } 524 | // If the class is too large, it's okay to overestimate. 525 | if n > 100 { 526 | return anyChar() 527 | } 528 | 529 | info.exact = []string{} 530 | for i := 0; i < len(re.Rune); i += 2 { 531 | lo, hi := re.Rune[i], re.Rune[i+1] 532 | for rr := lo; rr <= hi; rr++ { 533 | info.exact.add(string(rr)) 534 | } 535 | } 536 | } 537 | 538 | info.simplify(false) 539 | return info 540 | } 541 | 542 | // fold is the usual higher-order function. 543 | func fold(f func(x, y regexpInfo) regexpInfo, sub []*syntax.Regexp, zero regexpInfo) regexpInfo { 544 | if len(sub) == 0 { 545 | return zero 546 | } 547 | if len(sub) == 1 { 548 | return analyze(sub[0]) 549 | } 550 | info := f(analyze(sub[0]), analyze(sub[1])) 551 | for i := 2; i < len(sub); i++ { 552 | info = f(info, analyze(sub[i])) 553 | } 554 | return info 555 | } 556 | 557 | // concat returns the regexp info for xy given x and y. 558 | func concat(x, y regexpInfo) (out regexpInfo) { 559 | //println("concat", x.String(), "...", y.String()) 560 | //defer func() { println("->", out.String()) }() 561 | var xy regexpInfo 562 | xy.match = x.match.and(y.match) 563 | if x.exact.have() && y.exact.have() { 564 | xy.exact = x.exact.cross(y.exact, false) 565 | } else { 566 | if x.exact.have() { 567 | xy.prefix = x.exact.cross(y.prefix, false) 568 | } else { 569 | xy.prefix = x.prefix 570 | if x.canEmpty { 571 | xy.prefix = xy.prefix.union(y.prefix, false) 572 | } 573 | } 574 | if y.exact.have() { 575 | xy.suffix = x.suffix.cross(y.exact, true) 576 | } else { 577 | xy.suffix = y.suffix 578 | if y.canEmpty { 579 | xy.suffix = xy.suffix.union(x.suffix, true) 580 | } 581 | } 582 | } 583 | 584 | // If all the possible strings in the cross product of x.suffix 585 | // and y.prefix are long enough, then the trigram for one 586 | // of them must be present and would not necessarily be 587 | // accounted for in xy.prefix or xy.suffix yet. Cut things off 588 | // at maxSet just to keep the sets manageable. 589 | if !x.exact.have() && !y.exact.have() && 590 | x.suffix.size() <= maxSet && y.prefix.size() <= maxSet && 591 | x.suffix.minLen()+y.prefix.minLen() >= 3 { 592 | xy.match = xy.match.andTrigrams(x.suffix.cross(y.prefix, false)) 593 | } 594 | 595 | xy.simplify(false) 596 | return xy 597 | } 598 | 599 | // alternate returns the regexpInfo for x|y given x and y. 600 | func alternate(x, y regexpInfo) (out regexpInfo) { 601 | //println("alternate", x.String(), "...", y.String()) 602 | //defer func() { println("->", out.String()) }() 603 | var xy regexpInfo 604 | if x.exact.have() && y.exact.have() { 605 | xy.exact = x.exact.union(y.exact, false) 606 | } else if x.exact.have() { 607 | xy.prefix = x.exact.union(y.prefix, false) 608 | xy.suffix = x.exact.union(y.suffix, true) 609 | x.addExact() 610 | } else if y.exact.have() { 611 | xy.prefix = x.prefix.union(y.exact, false) 612 | xy.suffix = x.suffix.union(y.exact.copy(), true) 613 | y.addExact() 614 | } else { 615 | xy.prefix = x.prefix.union(y.prefix, false) 616 | xy.suffix = x.suffix.union(y.suffix, true) 617 | } 618 | xy.canEmpty = x.canEmpty || y.canEmpty 619 | xy.match = x.match.or(y.match) 620 | 621 | xy.simplify(false) 622 | return xy 623 | } 624 | 625 | // addExact adds to the match query the trigrams for matching info.exact. 626 | func (info *regexpInfo) addExact() { 627 | if info.exact.have() { 628 | info.match = info.match.andTrigrams(info.exact) 629 | } 630 | } 631 | 632 | // simplify simplifies the regexpInfo when the exact set gets too large. 633 | func (info *regexpInfo) simplify(force bool) { 634 | //println(" simplify", info.String(), " force=", force) 635 | //defer func() { println(" ->", info.String()) }() 636 | // If there are now too many exact strings, 637 | // loop over them, adding trigrams and moving 638 | // the relevant pieces into prefix and suffix. 639 | info.exact.clean(false) 640 | if len(info.exact) > maxExact || (info.exact.minLen() >= 3 && force) || info.exact.minLen() >= 4 { 641 | info.addExact() 642 | for _, s := range info.exact { 643 | n := len(s) 644 | if n < 3 { 645 | info.prefix.add(s) 646 | info.suffix.add(s) 647 | } else { 648 | info.prefix.add(s[:2]) 649 | info.suffix.add(s[n-2:]) 650 | } 651 | } 652 | info.exact = nil 653 | } 654 | 655 | if !info.exact.have() { 656 | info.simplifySet(&info.prefix) 657 | info.simplifySet(&info.suffix) 658 | } 659 | } 660 | 661 | // simplifySet reduces the size of the given set (either prefix or suffix). 662 | // There is no need to pass around enormous prefix or suffix sets, since 663 | // they will only be used to create trigrams. As they get too big, simplifySet 664 | // moves the information they contain into the match query, which is 665 | // more efficient to pass around. 666 | func (info *regexpInfo) simplifySet(s *stringSet) { 667 | t := *s 668 | t.clean(s == &info.suffix) 669 | 670 | // Add the OR of the current prefix/suffix set to the query. 671 | info.match = info.match.andTrigrams(t) 672 | 673 | for n := 3; n == 3 || t.size() > maxSet; n-- { 674 | // Replace set by strings of length n-1. 675 | w := 0 676 | for _, str := range t { 677 | if len(str) >= n { 678 | if s == &info.prefix { 679 | str = str[:n-1] 680 | } else { 681 | str = str[len(str)-n+1:] 682 | } 683 | } 684 | if w == 0 || t[w-1] != str { 685 | t[w] = str 686 | w++ 687 | } 688 | } 689 | t = t[:w] 690 | t.clean(s == &info.suffix) 691 | } 692 | 693 | // Now make sure that the prefix/suffix sets aren't redundant. 694 | // For example, if we know "ab" is a possible prefix, then it 695 | // doesn't help at all to know that "abc" is also a possible 696 | // prefix, so delete "abc". 697 | w := 0 698 | f := strings.HasPrefix 699 | if s == &info.suffix { 700 | f = strings.HasSuffix 701 | } 702 | for _, str := range t { 703 | if w == 0 || !f(str, t[w-1]) { 704 | t[w] = str 705 | w++ 706 | } 707 | } 708 | t = t[:w] 709 | 710 | *s = t 711 | } 712 | 713 | func (info regexpInfo) String() string { 714 | s := "" 715 | if info.canEmpty { 716 | s += "canempty " 717 | } 718 | if info.exact.have() { 719 | s += "exact:" + strings.Join(info.exact, ",") 720 | } else { 721 | s += "prefix:" + strings.Join(info.prefix, ",") 722 | s += " suffix:" + strings.Join(info.suffix, ",") 723 | } 724 | s += " match: " + info.match.String() 725 | return s 726 | } 727 | 728 | // A stringSet is a set of strings. 729 | // The nil stringSet indicates not having a set. 730 | // The non-nil but empty stringSet is the empty set. 731 | type stringSet []string 732 | 733 | // have reports whether we have a stringSet. 734 | func (s stringSet) have() bool { 735 | return s != nil 736 | } 737 | 738 | // contains reports whether s contains str. 739 | func (s stringSet) contains(str string) bool { 740 | for _, ss := range s { 741 | if ss == str { 742 | return true 743 | } 744 | } 745 | return false 746 | } 747 | 748 | type byPrefix []string 749 | 750 | func (x *byPrefix) Len() int { return len(*x) } 751 | func (x *byPrefix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] } 752 | func (x *byPrefix) Less(i, j int) bool { return (*x)[i] < (*x)[j] } 753 | 754 | type bySuffix []string 755 | 756 | func (x *bySuffix) Len() int { return len(*x) } 757 | func (x *bySuffix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] } 758 | func (x *bySuffix) Less(i, j int) bool { 759 | s := (*x)[i] 760 | t := (*x)[j] 761 | for i := 1; i <= len(s) && i <= len(t); i++ { 762 | si := s[len(s)-i] 763 | ti := t[len(t)-i] 764 | if si < ti { 765 | return true 766 | } 767 | if si > ti { 768 | return false 769 | } 770 | } 771 | return len(s) < len(t) 772 | } 773 | 774 | // add adds str to the set. 775 | func (s *stringSet) add(str string) { 776 | *s = append(*s, str) 777 | } 778 | 779 | // clean removes duplicates from the stringSet. 780 | func (s *stringSet) clean(isSuffix bool) { 781 | t := *s 782 | if isSuffix { 783 | sort.Sort((*bySuffix)(s)) 784 | } else { 785 | sort.Sort((*byPrefix)(s)) 786 | } 787 | w := 0 788 | for _, str := range t { 789 | if w == 0 || t[w-1] != str { 790 | t[w] = str 791 | w++ 792 | } 793 | } 794 | *s = t[:w] 795 | } 796 | 797 | // size returns the number of strings in s. 798 | func (s stringSet) size() int { 799 | return len(s) 800 | } 801 | 802 | // minLen returns the length of the shortest string in s. 803 | func (s stringSet) minLen() int { 804 | if len(s) == 0 { 805 | return 0 806 | } 807 | m := len(s[0]) 808 | for _, str := range s { 809 | if m > len(str) { 810 | m = len(str) 811 | } 812 | } 813 | return m 814 | } 815 | 816 | // maxLen returns the length of the longest string in s. 817 | func (s stringSet) maxLen() int { 818 | if len(s) == 0 { 819 | return 0 820 | } 821 | m := len(s[0]) 822 | for _, str := range s { 823 | if m < len(str) { 824 | m = len(str) 825 | } 826 | } 827 | return m 828 | } 829 | 830 | // union returns the union of s and t, reusing s's storage. 831 | func (s stringSet) union(t stringSet, isSuffix bool) stringSet { 832 | s = append(s, t...) 833 | s.clean(isSuffix) 834 | return s 835 | } 836 | 837 | // cross returns the cross product of s and t. 838 | func (s stringSet) cross(t stringSet, isSuffix bool) stringSet { 839 | p := stringSet{} 840 | for _, ss := range s { 841 | for _, tt := range t { 842 | p.add(ss + tt) 843 | } 844 | } 845 | p.clean(isSuffix) 846 | return p 847 | } 848 | 849 | // clear empties the set but preserves the storage. 850 | func (s *stringSet) clear() { 851 | *s = (*s)[:0] 852 | } 853 | 854 | // copy returns a copy of the set that does not share storage with the original. 855 | func (s stringSet) copy() stringSet { 856 | return append(stringSet{}, s...) 857 | } 858 | 859 | // isSubsetOf returns true if all strings in s are also in t. 860 | // It assumes both sets are sorted. 861 | func (s stringSet) isSubsetOf(t stringSet) bool { 862 | j := 0 863 | for _, ss := range s { 864 | for j < len(t) && t[j] < ss { 865 | j++ 866 | } 867 | if j >= len(t) || t[j] != ss { 868 | return false 869 | } 870 | } 871 | return true 872 | } 873 | -------------------------------------------------------------------------------- /index/regexp_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "regexp/syntax" 9 | "testing" 10 | ) 11 | 12 | var queryTests = []struct { 13 | re string 14 | q string 15 | }{ 16 | {`Abcdef`, `"Abc" "bcd" "cde" "def"`}, 17 | {`(abc)(def)`, `"abc" "bcd" "cde" "def"`}, 18 | {`abc.*(def|ghi)`, `"abc" ("def"|"ghi")`}, 19 | {`abc(def|ghi)`, `"abc" ("bcd" "cde" "def")|("bcg" "cgh" "ghi")`}, 20 | {`a+hello`, `"ahe" "ell" "hel" "llo"`}, 21 | {`(a+hello|b+world)`, `("ahe" "ell" "hel" "llo")|("bwo" "orl" "rld" "wor")`}, 22 | {`a*bbb`, `"bbb"`}, 23 | {`a?bbb`, `"bbb"`}, 24 | {`(bbb)a?`, `"bbb"`}, 25 | {`(bbb)a*`, `"bbb"`}, 26 | {`^abc`, `"abc"`}, 27 | {`abc$`, `"abc"`}, 28 | {`ab[cde]f`, `("abc" "bcf")|("abd" "bdf")|("abe" "bef")`}, 29 | {`(abc|bac)de`, `"cde" ("abc" "bcd")|("acd" "bac")`}, 30 | 31 | // These don't have enough letters for a trigram, so they return the 32 | // always matching query "+". 33 | {`ab[^cde]f`, `+`}, 34 | {`ab.f`, `+`}, 35 | {`.`, `+`}, 36 | {`()`, `+`}, 37 | 38 | // No matches. 39 | {`[^\s\S]`, `-`}, 40 | 41 | // Factoring works. 42 | {`(abc|abc)`, `"abc"`}, 43 | {`(ab|ab)c`, `"abc"`}, 44 | {`ab(cab|cat)`, `"abc" "bca" ("cab"|"cat")`}, 45 | {`(z*(abc|def)z*)(z*(abc|def)z*)`, `("abc"|"def")`}, 46 | {`(z*abcz*defz*)|(z*abcz*defz*)`, `"abc" "def"`}, 47 | {`(z*abcz*defz*(ghi|jkl)z*)|(z*abcz*defz*(mno|prs)z*)`, 48 | `"abc" "def" ("ghi"|"jkl"|"mno"|"prs")`}, 49 | {`(z*(abcz*def)|(ghiz*jkl)z*)|(z*(mnoz*prs)|(tuvz*wxy)z*)`, 50 | `("abc" "def")|("ghi" "jkl")|("mno" "prs")|("tuv" "wxy")`}, 51 | {`(z*abcz*defz*)(z*(ghi|jkl)z*)`, `"abc" "def" ("ghi"|"jkl")`}, 52 | {`(z*abcz*defz*)|(z*(ghi|jkl)z*)`, `("ghi"|"jkl")|("abc" "def")`}, 53 | 54 | // analyze keeps track of multiple possible prefix/suffixes. 55 | {`[ab][cd][ef]`, `("ace"|"acf"|"ade"|"adf"|"bce"|"bcf"|"bde"|"bdf")`}, 56 | {`ab[cd]e`, `("abc" "bce")|("abd" "bde")`}, 57 | 58 | // Different sized suffixes. 59 | {`(a|ab)cde`, `"cde" ("abc" "bcd")|("acd")`}, 60 | {`(a|b|c|d)(ef|g|hi|j)`, `+`}, 61 | 62 | {`(?s).`, `+`}, 63 | 64 | // Expanding case. 65 | {`(?i)a~~`, `("A~~"|"a~~")`}, 66 | {`(?i)ab~`, `("AB~"|"Ab~"|"aB~"|"ab~")`}, 67 | {`(?i)abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`}, 68 | {`(?i)abc|def`, `("ABC"|"ABc"|"AbC"|"Abc"|"DEF"|"DEf"|"DeF"|"Def"|"aBC"|"aBc"|"abC"|"abc"|"dEF"|"dEf"|"deF"|"def")`}, 69 | {`(?i)abcd`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc") ("BCD"|"BCd"|"BcD"|"Bcd"|"bCD"|"bCd"|"bcD"|"bcd")`}, 70 | {`(?i)abc|abc`, `("ABC"|"ABc"|"AbC"|"Abc"|"aBC"|"aBc"|"abC"|"abc")`}, 71 | 72 | // Word boundary. 73 | {`\b`, `+`}, 74 | {`\B`, `+`}, 75 | {`\babc`, `"abc"`}, 76 | {`\Babc`, `"abc"`}, 77 | {`abc\b`, `"abc"`}, 78 | {`abc\B`, `"abc"`}, 79 | {`ab\bc`, `"abc"`}, 80 | {`ab\Bc`, `"abc"`}, 81 | } 82 | 83 | func TestQuery(t *testing.T) { 84 | for _, tt := range queryTests { 85 | re, err := syntax.Parse(tt.re, syntax.Perl) 86 | if err != nil { 87 | t.Fatal(err) 88 | } 89 | q := RegexpQuery(re).String() 90 | if q != tt.q { 91 | t.Errorf("RegexpQuery(%#q) = %#q, want %#q", tt.re, q, tt.q) 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /index/write.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package index 6 | 7 | import ( 8 | "io" 9 | "io/ioutil" 10 | "log" 11 | "os" 12 | "strings" 13 | "unsafe" 14 | 15 | "github.com/google/codesearch/sparse" 16 | ) 17 | 18 | // Index writing. See read.go for details of on-disk format. 19 | // 20 | // It would suffice to make a single large list of (trigram, file#) pairs 21 | // while processing the files one at a time, sort that list by trigram, 22 | // and then create the posting lists from subsequences of the list. 23 | // However, we do not assume that the entire index fits in memory. 24 | // Instead, we sort and flush the list to a new temporary file each time 25 | // it reaches its maximum in-memory size, and then at the end we 26 | // create the final posting lists by merging the temporary files as we 27 | // read them back in. 28 | // 29 | // It would also be useful to be able to create an index for a subset 30 | // of the files and then merge that index into an existing one. This would 31 | // allow incremental updating of an existing index when a directory changes. 32 | // But we have not implemented that. 33 | 34 | // An IndexWriter creates an on-disk index corresponding to a set of files. 35 | type IndexWriter struct { 36 | LogSkip bool // log information about skipped files 37 | Verbose bool // log status using package log 38 | 39 | trigram *sparse.Set // trigrams for the current file 40 | buf [8]byte // scratch buffer 41 | 42 | paths []string 43 | 44 | nameData *bufWriter // temp file holding list of names 45 | nameLen uint32 // number of bytes written to nameData 46 | nameIndex *bufWriter // temp file holding name index 47 | numName int // number of names written 48 | totalBytes int64 49 | 50 | post []postEntry // list of (trigram, file#) pairs 51 | postFile []*os.File // flushed post entries 52 | postIndex *bufWriter // temp file holding posting list index 53 | 54 | inbuf []byte // input buffer 55 | main *bufWriter // main index file 56 | } 57 | 58 | const npost = 64 << 20 / 8 // 64 MB worth of post entries 59 | 60 | // Create returns a new IndexWriter that will write the index to file. 61 | func Create(file string) *IndexWriter { 62 | return &IndexWriter{ 63 | trigram: sparse.NewSet(1 << 24), 64 | nameData: bufCreate(""), 65 | nameIndex: bufCreate(""), 66 | postIndex: bufCreate(""), 67 | main: bufCreate(file), 68 | post: make([]postEntry, 0, npost), 69 | inbuf: make([]byte, 16384), 70 | } 71 | } 72 | 73 | // A postEntry is an in-memory (trigram, file#) pair. 74 | type postEntry uint64 75 | 76 | func (p postEntry) trigram() uint32 { 77 | return uint32(p >> 32) 78 | } 79 | 80 | func (p postEntry) fileid() uint32 { 81 | return uint32(p) 82 | } 83 | 84 | func makePostEntry(trigram, fileid uint32) postEntry { 85 | return postEntry(trigram)<<32 | postEntry(fileid) 86 | } 87 | 88 | // Tuning constants for detecting text files. 89 | // A file is assumed not to be text files (and thus not indexed) 90 | // if it contains an invalid UTF-8 sequences, if it is longer than maxFileLength 91 | // bytes, if it contains a line longer than maxLineLen bytes, 92 | // or if it contains more than maxTextTrigrams distinct trigrams. 93 | const ( 94 | maxFileLen = 1 << 30 95 | maxLineLen = 2000 96 | maxTextTrigrams = 20000 97 | ) 98 | 99 | // AddPaths adds the given paths to the index's list of paths. 100 | func (ix *IndexWriter) AddPaths(paths []string) { 101 | ix.paths = append(ix.paths, paths...) 102 | } 103 | 104 | // AddFile adds the file with the given name (opened using os.Open) 105 | // to the index. It logs errors using package log. 106 | func (ix *IndexWriter) AddFile(name string) { 107 | f, err := os.Open(name) 108 | if err != nil { 109 | log.Print(err) 110 | return 111 | } 112 | defer f.Close() 113 | ix.Add(name, f) 114 | } 115 | 116 | // Add adds the file f to the index under the given name. 117 | // It logs errors using package log. 118 | func (ix *IndexWriter) Add(name string, f io.Reader) { 119 | ix.trigram.Reset() 120 | var ( 121 | c = byte(0) 122 | i = 0 123 | buf = ix.inbuf[:0] 124 | tv = uint32(0) 125 | n = int64(0) 126 | linelen = 0 127 | ) 128 | for { 129 | tv = (tv << 8) & (1<<24 - 1) 130 | if i >= len(buf) { 131 | n, err := f.Read(buf[:cap(buf)]) 132 | if n == 0 { 133 | if err != nil { 134 | if err == io.EOF { 135 | break 136 | } 137 | log.Printf("%s: %v\n", name, err) 138 | return 139 | } 140 | log.Printf("%s: 0-length read\n", name) 141 | return 142 | } 143 | buf = buf[:n] 144 | i = 0 145 | } 146 | c = buf[i] 147 | i++ 148 | tv |= uint32(c) 149 | if n++; n >= 3 { 150 | ix.trigram.Add(tv) 151 | } 152 | if !validUTF8((tv>>8)&0xFF, tv&0xFF) { 153 | if ix.LogSkip { 154 | log.Printf("%s: invalid UTF-8, ignoring\n", name) 155 | } 156 | return 157 | } 158 | if n > maxFileLen { 159 | if ix.LogSkip { 160 | log.Printf("%s: too long, ignoring\n", name) 161 | } 162 | return 163 | } 164 | if linelen++; linelen > maxLineLen { 165 | if ix.LogSkip { 166 | log.Printf("%s: very long lines, ignoring\n", name) 167 | } 168 | return 169 | } 170 | if c == '\n' { 171 | linelen = 0 172 | } 173 | } 174 | if ix.trigram.Len() > maxTextTrigrams { 175 | if ix.LogSkip { 176 | log.Printf("%s: too many trigrams, probably not text, ignoring\n", name) 177 | } 178 | return 179 | } 180 | ix.totalBytes += n 181 | 182 | if ix.Verbose { 183 | log.Printf("%d %d %s\n", n, ix.trigram.Len(), name) 184 | } 185 | 186 | fileid := ix.addName(name) 187 | for _, trigram := range ix.trigram.Dense() { 188 | if len(ix.post) >= cap(ix.post) { 189 | ix.flushPost() 190 | } 191 | ix.post = append(ix.post, makePostEntry(trigram, fileid)) 192 | } 193 | } 194 | 195 | // Flush flushes the index entry to the target file. 196 | func (ix *IndexWriter) Flush() { 197 | ix.addName("") 198 | 199 | var off [5]uint32 200 | ix.main.writeString(magic) 201 | off[0] = ix.main.offset() 202 | for _, p := range ix.paths { 203 | ix.main.writeString(p) 204 | ix.main.writeString("\x00") 205 | } 206 | ix.main.writeString("\x00") 207 | off[1] = ix.main.offset() 208 | copyFile(ix.main, ix.nameData) 209 | off[2] = ix.main.offset() 210 | ix.mergePost(ix.main) 211 | off[3] = ix.main.offset() 212 | copyFile(ix.main, ix.nameIndex) 213 | off[4] = ix.main.offset() 214 | copyFile(ix.main, ix.postIndex) 215 | for _, v := range off { 216 | ix.main.writeUint32(v) 217 | } 218 | ix.main.writeString(trailerMagic) 219 | 220 | os.Remove(ix.nameData.name) 221 | for _, f := range ix.postFile { 222 | os.Remove(f.Name()) 223 | } 224 | os.Remove(ix.nameIndex.name) 225 | os.Remove(ix.postIndex.name) 226 | 227 | log.Printf("%d data bytes, %d index bytes", ix.totalBytes, ix.main.offset()) 228 | 229 | ix.main.flush() 230 | } 231 | 232 | func copyFile(dst, src *bufWriter) { 233 | dst.flush() 234 | _, err := io.Copy(dst.file, src.finish()) 235 | if err != nil { 236 | log.Fatalf("copying %s to %s: %v", src.name, dst.name, err) 237 | } 238 | } 239 | 240 | // addName adds the file with the given name to the index. 241 | // It returns the assigned file ID number. 242 | func (ix *IndexWriter) addName(name string) uint32 { 243 | if strings.Contains(name, "\x00") { 244 | log.Fatalf("%q: file has NUL byte in name", name) 245 | } 246 | 247 | ix.nameIndex.writeUint32(ix.nameData.offset()) 248 | ix.nameData.writeString(name) 249 | ix.nameData.writeByte(0) 250 | id := ix.numName 251 | ix.numName++ 252 | return uint32(id) 253 | } 254 | 255 | // flushPost writes ix.post to a new temporary file and 256 | // clears the slice. 257 | func (ix *IndexWriter) flushPost() { 258 | w, err := ioutil.TempFile("", "csearch-index") 259 | if err != nil { 260 | log.Fatal(err) 261 | } 262 | if ix.Verbose { 263 | log.Printf("flush %d entries to %s", len(ix.post), w.Name()) 264 | } 265 | sortPost(ix.post) 266 | 267 | // Write the raw ix.post array to disk as is. 268 | // This process is the one reading it back in, so byte order is not a concern. 269 | data := (*[npost * 8]byte)(unsafe.Pointer(&ix.post[0]))[:len(ix.post)*8] 270 | if n, err := w.Write(data); err != nil || n < len(data) { 271 | if err != nil { 272 | log.Fatal(err) 273 | } 274 | log.Fatalf("short write writing %s", w.Name()) 275 | } 276 | 277 | ix.post = ix.post[:0] 278 | w.Seek(0, 0) 279 | ix.postFile = append(ix.postFile, w) 280 | } 281 | 282 | // mergePost reads the flushed index entries and merges them 283 | // into posting lists, writing the resulting lists to out. 284 | func (ix *IndexWriter) mergePost(out *bufWriter) { 285 | var h postHeap 286 | 287 | log.Printf("merge %d files + mem", len(ix.postFile)) 288 | for _, f := range ix.postFile { 289 | h.addFile(f) 290 | } 291 | sortPost(ix.post) 292 | h.addMem(ix.post) 293 | 294 | npost := 0 295 | e := h.next() 296 | offset0 := out.offset() 297 | for { 298 | npost++ 299 | offset := out.offset() - offset0 300 | trigram := e.trigram() 301 | ix.buf[0] = byte(trigram >> 16) 302 | ix.buf[1] = byte(trigram >> 8) 303 | ix.buf[2] = byte(trigram) 304 | 305 | // posting list 306 | fileid := ^uint32(0) 307 | nfile := uint32(0) 308 | out.write(ix.buf[:3]) 309 | for ; e.trigram() == trigram && trigram != 1<<24-1; e = h.next() { 310 | out.writeUvarint(e.fileid() - fileid) 311 | fileid = e.fileid() 312 | nfile++ 313 | } 314 | out.writeUvarint(0) 315 | 316 | // index entry 317 | ix.postIndex.write(ix.buf[:3]) 318 | ix.postIndex.writeUint32(nfile) 319 | ix.postIndex.writeUint32(offset) 320 | 321 | if trigram == 1<<24-1 { 322 | break 323 | } 324 | } 325 | } 326 | 327 | // A postChunk represents a chunk of post entries flushed to disk or 328 | // still in memory. 329 | type postChunk struct { 330 | e postEntry // next entry 331 | m []postEntry // remaining entries after e 332 | } 333 | 334 | const postBuf = 4096 335 | 336 | // A postHeap is a heap (priority queue) of postChunks. 337 | type postHeap struct { 338 | ch []*postChunk 339 | } 340 | 341 | func (h *postHeap) addFile(f *os.File) { 342 | data := mmapFile(f).d 343 | m := (*[npost]postEntry)(unsafe.Pointer(&data[0]))[:len(data)/8] 344 | h.addMem(m) 345 | } 346 | 347 | func (h *postHeap) addMem(x []postEntry) { 348 | h.add(&postChunk{m: x}) 349 | } 350 | 351 | // step reads the next entry from ch and saves it in ch.e. 352 | // It returns false if ch is over. 353 | func (h *postHeap) step(ch *postChunk) bool { 354 | old := ch.e 355 | m := ch.m 356 | if len(m) == 0 { 357 | return false 358 | } 359 | ch.e = postEntry(m[0]) 360 | m = m[1:] 361 | ch.m = m 362 | if old >= ch.e { 363 | panic("bad sort") 364 | } 365 | return true 366 | } 367 | 368 | // add adds the chunk to the postHeap. 369 | // All adds must be called before the first call to next. 370 | func (h *postHeap) add(ch *postChunk) { 371 | if len(ch.m) > 0 { 372 | ch.e = ch.m[0] 373 | ch.m = ch.m[1:] 374 | h.push(ch) 375 | } 376 | } 377 | 378 | // empty reports whether the postHeap is empty. 379 | func (h *postHeap) empty() bool { 380 | return len(h.ch) == 0 381 | } 382 | 383 | // next returns the next entry from the postHeap. 384 | // It returns a postEntry with trigram == 1<<24 - 1 if h is empty. 385 | func (h *postHeap) next() postEntry { 386 | if len(h.ch) == 0 { 387 | return makePostEntry(1<<24-1, 0) 388 | } 389 | ch := h.ch[0] 390 | e := ch.e 391 | m := ch.m 392 | if len(m) == 0 { 393 | h.pop() 394 | } else { 395 | ch.e = m[0] 396 | ch.m = m[1:] 397 | h.siftDown(0) 398 | } 399 | return e 400 | } 401 | 402 | func (h *postHeap) pop() *postChunk { 403 | ch := h.ch[0] 404 | n := len(h.ch) - 1 405 | h.ch[0] = h.ch[n] 406 | h.ch = h.ch[:n] 407 | if n > 1 { 408 | h.siftDown(0) 409 | } 410 | return ch 411 | } 412 | 413 | func (h *postHeap) push(ch *postChunk) { 414 | n := len(h.ch) 415 | h.ch = append(h.ch, ch) 416 | if len(h.ch) >= 2 { 417 | h.siftUp(n) 418 | } 419 | } 420 | 421 | func (h *postHeap) siftDown(i int) { 422 | ch := h.ch 423 | for { 424 | j1 := 2*i + 1 425 | if j1 >= len(ch) { 426 | break 427 | } 428 | j := j1 429 | if j2 := j1 + 1; j2 < len(ch) && ch[j1].e >= ch[j2].e { 430 | j = j2 431 | } 432 | if ch[i].e < ch[j].e { 433 | break 434 | } 435 | ch[i], ch[j] = ch[j], ch[i] 436 | i = j 437 | } 438 | } 439 | 440 | func (h *postHeap) siftUp(j int) { 441 | ch := h.ch 442 | for { 443 | i := (j - 1) / 2 444 | if i == j || ch[i].e < ch[j].e { 445 | break 446 | } 447 | ch[i], ch[j] = ch[j], ch[i] 448 | j = i 449 | } 450 | } 451 | 452 | // A bufWriter is a convenience wrapper: a closeable bufio.Writer. 453 | type bufWriter struct { 454 | name string 455 | file *os.File 456 | buf []byte 457 | tmp [8]byte 458 | } 459 | 460 | // bufCreate creates a new file with the given name and returns a 461 | // corresponding bufWriter. If name is empty, bufCreate uses a 462 | // temporary file. 463 | func bufCreate(name string) *bufWriter { 464 | var ( 465 | f *os.File 466 | err error 467 | ) 468 | if name != "" { 469 | f, err = os.OpenFile(name, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600) 470 | } else { 471 | f, err = ioutil.TempFile("", "csearch") 472 | } 473 | if err != nil { 474 | log.Fatal(err) 475 | } 476 | return &bufWriter{ 477 | name: f.Name(), 478 | buf: make([]byte, 0, 256<<10), 479 | file: f, 480 | } 481 | } 482 | 483 | func (b *bufWriter) write(x []byte) { 484 | n := cap(b.buf) - len(b.buf) 485 | if len(x) > n { 486 | b.flush() 487 | if len(x) >= cap(b.buf) { 488 | if _, err := b.file.Write(x); err != nil { 489 | log.Fatalf("writing %s: %v", b.name, err) 490 | } 491 | return 492 | } 493 | } 494 | b.buf = append(b.buf, x...) 495 | } 496 | 497 | func (b *bufWriter) writeByte(x byte) { 498 | if len(b.buf) >= cap(b.buf) { 499 | b.flush() 500 | } 501 | b.buf = append(b.buf, x) 502 | } 503 | 504 | func (b *bufWriter) writeString(s string) { 505 | n := cap(b.buf) - len(b.buf) 506 | if len(s) > n { 507 | b.flush() 508 | if len(s) >= cap(b.buf) { 509 | if _, err := b.file.WriteString(s); err != nil { 510 | log.Fatalf("writing %s: %v", b.name, err) 511 | } 512 | return 513 | } 514 | } 515 | b.buf = append(b.buf, s...) 516 | } 517 | 518 | // offset returns the current write offset. 519 | func (b *bufWriter) offset() uint32 { 520 | off, _ := b.file.Seek(0, 1) 521 | off += int64(len(b.buf)) 522 | if int64(uint32(off)) != off { 523 | log.Fatalf("index is larger than 4GB") 524 | } 525 | return uint32(off) 526 | } 527 | 528 | func (b *bufWriter) flush() { 529 | if len(b.buf) == 0 { 530 | return 531 | } 532 | _, err := b.file.Write(b.buf) 533 | if err != nil { 534 | log.Fatalf("writing %s: %v", b.name, err) 535 | } 536 | b.buf = b.buf[:0] 537 | } 538 | 539 | // finish flushes the file to disk and returns an open file ready for reading. 540 | func (b *bufWriter) finish() *os.File { 541 | b.flush() 542 | f := b.file 543 | f.Seek(0, 0) 544 | return f 545 | } 546 | 547 | func (b *bufWriter) writeTrigram(t uint32) { 548 | if cap(b.buf)-len(b.buf) < 3 { 549 | b.flush() 550 | } 551 | b.buf = append(b.buf, byte(t>>16), byte(t>>8), byte(t)) 552 | } 553 | 554 | func (b *bufWriter) writeUint32(x uint32) { 555 | if cap(b.buf)-len(b.buf) < 4 { 556 | b.flush() 557 | } 558 | b.buf = append(b.buf, byte(x>>24), byte(x>>16), byte(x>>8), byte(x)) 559 | } 560 | 561 | func (b *bufWriter) writeUvarint(x uint32) { 562 | if cap(b.buf)-len(b.buf) < 5 { 563 | b.flush() 564 | } 565 | switch { 566 | case x < 1<<7: 567 | b.buf = append(b.buf, byte(x)) 568 | case x < 1<<14: 569 | b.buf = append(b.buf, byte(x|0x80), byte(x>>7)) 570 | case x < 1<<21: 571 | b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14)) 572 | case x < 1<<28: 573 | b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21)) 574 | default: 575 | b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21|0x80), byte(x>>28)) 576 | } 577 | } 578 | 579 | // validUTF8 reports whether the byte pair can appear in a 580 | // valid sequence of UTF-8-encoded code points. 581 | func validUTF8(c1, c2 uint32) bool { 582 | switch { 583 | case c1 < 0x80: 584 | // 1-byte, must be followed by 1-byte or first of multi-byte 585 | return c2 < 0x80 || 0xc0 <= c2 && c2 < 0xf8 586 | case c1 < 0xc0: 587 | // continuation byte, can be followed by nearly anything 588 | return c2 < 0xf8 589 | case c1 < 0xf8: 590 | // first of multi-byte, must be followed by continuation byte 591 | return 0x80 <= c2 && c2 < 0xc0 592 | } 593 | return false 594 | } 595 | 596 | // sortPost sorts the postentry list. 597 | // The list is already sorted by fileid (bottom 32 bits) 598 | // and the top 8 bits are always zero, so there are only 599 | // 24 bits to sort. Run two rounds of 12-bit radix sort. 600 | const sortK = 12 601 | 602 | var sortTmp []postEntry 603 | var sortN [1 << sortK]int 604 | 605 | func sortPost(post []postEntry) { 606 | if len(post) > len(sortTmp) { 607 | sortTmp = make([]postEntry, len(post)) 608 | } 609 | tmp := sortTmp[:len(post)] 610 | 611 | const k = sortK 612 | for i := range sortN { 613 | sortN[i] = 0 614 | } 615 | for _, p := range post { 616 | r := uintptr(p>>32) & (1<>32) & (1<>(32+k)) & (1<>(32+k)) & (1<> 24) 95 | buf[1] = byte(x >> 16) 96 | buf[2] = byte(x >> 8) 97 | buf[3] = byte(x) 98 | return string(buf[:]) 99 | } 100 | 101 | func fileList(list ...uint32) string { 102 | var buf []byte 103 | 104 | last := ^uint32(0) 105 | for _, x := range list { 106 | delta := x - last 107 | for delta >= 0x80 { 108 | buf = append(buf, byte(delta)|0x80) 109 | delta >>= 7 110 | } 111 | buf = append(buf, byte(delta)) 112 | last = x 113 | } 114 | buf = append(buf, 0) 115 | return string(buf) 116 | } 117 | 118 | func buildFlushIndex(out string, paths []string, doFlush bool, fileData map[string]string) { 119 | ix := Create(out) 120 | ix.AddPaths(paths) 121 | var files []string 122 | for name := range fileData { 123 | files = append(files, name) 124 | } 125 | sort.Strings(files) 126 | for _, name := range files { 127 | ix.Add(name, strings.NewReader(fileData[name])) 128 | } 129 | if doFlush { 130 | ix.flushPost() 131 | } 132 | ix.Flush() 133 | } 134 | 135 | func buildIndex(name string, paths []string, fileData map[string]string) { 136 | buildFlushIndex(name, paths, false, fileData) 137 | } 138 | 139 | func testTrivialWrite(t *testing.T, doFlush bool) { 140 | f, _ := ioutil.TempFile("", "index-test") 141 | defer os.Remove(f.Name()) 142 | out := f.Name() 143 | buildFlushIndex(out, nil, doFlush, trivialFiles) 144 | 145 | data, err := ioutil.ReadFile(out) 146 | if err != nil { 147 | t.Fatalf("reading _test/index.triv: %v", err) 148 | } 149 | want := []byte(trivialIndex) 150 | if !bytes.Equal(data, want) { 151 | i := 0 152 | for i < len(data) && i < len(want) && data[i] == want[i] { 153 | i++ 154 | } 155 | t.Fatalf("wrong index:\nhave: %q %q\nwant: %q %q", data[:i], data[i:], want[:i], want[i:]) 156 | } 157 | } 158 | 159 | func TestTrivialWrite(t *testing.T) { 160 | testTrivialWrite(t, false) 161 | } 162 | 163 | func TestTrivialWriteDisk(t *testing.T) { 164 | testTrivialWrite(t, true) 165 | } 166 | 167 | func TestHeap(t *testing.T) { 168 | h := &postHeap{} 169 | es := []postEntry{7, 4, 3, 2, 4} 170 | for _, e := range es { 171 | h.addMem([]postEntry{e}) 172 | } 173 | if len(h.ch) != len(es) { 174 | t.Fatalf("wrong heap size: %d, want %d", len(h.ch), len(es)) 175 | } 176 | for a, b := h.next(), h.next(); b.trigram() != (1<<24 - 1); a, b = b, h.next() { 177 | if a > b { 178 | t.Fatalf("%d should <= %d", a, b) 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /lib/README.template: -------------------------------------------------------------------------------- 1 | These are the command-line Code Search tools from 2 | https://github.com/google/codesearch. 3 | 4 | These binaries are for ARCH systems running OPERSYS. 5 | 6 | To get started, run cindex with a list of directories to index: 7 | 8 | cindex /usr/include $HOME/src 9 | 10 | Then run csearch to run grep over all the indexed sources: 11 | 12 | csearch DATAKIT 13 | 14 | For details, run either command with the -help option, and 15 | read http://swtch.com/~rsc/regexp/regexp4.html. 16 | -------------------------------------------------------------------------------- /lib/buildall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the code search binaries for a variety of OS/architecture combinations. 4 | 5 | . ./setup 6 | 7 | for i in {5,6,8}{c,g,a,l} 8 | do 9 | go tool dist install cmd/$i 10 | done 11 | 12 | build() { 13 | echo "# $1" 14 | goos=$(echo $1 | sed 's;/.*;;') 15 | goarch=$(echo $1 | sed 's;.*/;;') 16 | GOOS=$goos GOARCH=$goarch CGO_ENABLED=0 \ 17 | go install -a code.google.com/p/codesearch/cmd/{cgrep,cindex,csearch} 18 | rm -rf codesearch-$version 19 | mkdir codesearch-$version 20 | mv ~/g/bin/{cgrep,cindex,csearch}* codesearch-$version 21 | chmod +x codesearch-$version/* 22 | cat README.template | sed "s/ARCH/$(arch $goarch)/; s/OPERSYS/$(os $goos)/" >codesearch-$version/README.txt 23 | rm -f codesearch-$version-$goos-$goarch.zip 24 | zip -z -r codesearch-$version-$goos-$goarch.zip codesearch-$version < codesearch-$version/README.txt 25 | rm -rf codesearch-0.01 26 | } 27 | 28 | for i in {linux,darwin,freebsd,windows}/{amd64,386} 29 | do 30 | build $i 31 | done 32 | -------------------------------------------------------------------------------- /lib/setup: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | os() { 4 | case "$1" in 5 | freebsd) echo FreeBSD;; 6 | linux) echo Linux;; 7 | darwin) echo Mac OS X;; 8 | openbsd) echo OpenBSD;; 9 | netbsd) echo NetBSD;; 10 | windows) echo Windows;; 11 | *) echo $1;; 12 | esac 13 | } 14 | 15 | arch() { 16 | case "$1" in 17 | 386) echo 32-bit x86;; 18 | amd64) echo 64-bit x86;; 19 | *) echo $1;; 20 | esac 21 | } 22 | 23 | version=$(cat version) 24 | -------------------------------------------------------------------------------- /lib/uploadall: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # gcodeup is a copy of $GOROOT/misc/dashboard/googlecode_upload.py. 4 | 5 | . ./setup 6 | user=$(sed -n 's/^re2.username = //' ~/.hgrc) 7 | password=$(sed -n 's/^re2\.password = //' ~/.hgrc) 8 | 9 | upload() { 10 | goos=$(echo $1 | sed "s/codesearch-$version-//; s/-.*//") 11 | goarch=$(echo $1 | sed "s/codesearch-$version-//; s/[a-z0-9]*-//; s/-.*//") 12 | gcodeup -s "binaries for $(os $goos) $(arch $goarch)" -p codesearch -u "$user" -w "$password" codesearch-$version-$1-$2.zip 13 | } 14 | 15 | for i in codesearch-$version-* 16 | do 17 | upload $i 18 | done 19 | -------------------------------------------------------------------------------- /lib/version: -------------------------------------------------------------------------------- 1 | 0.01 2 | -------------------------------------------------------------------------------- /regexp/copy.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Copied from Go's regexp/syntax. 6 | // Formatters edited to handle instByteRange. 7 | 8 | package regexp 9 | 10 | import ( 11 | "bytes" 12 | "fmt" 13 | "regexp/syntax" 14 | "sort" 15 | "strconv" 16 | "unicode" 17 | ) 18 | 19 | // cleanClass sorts the ranges (pairs of elements of r), 20 | // merges them, and eliminates duplicates. 21 | func cleanClass(rp *[]rune) []rune { 22 | 23 | // Sort by lo increasing, hi decreasing to break ties. 24 | sort.Sort(ranges{rp}) 25 | 26 | r := *rp 27 | if len(r) < 2 { 28 | return r 29 | } 30 | 31 | // Merge abutting, overlapping. 32 | w := 2 // write index 33 | for i := 2; i < len(r); i += 2 { 34 | lo, hi := r[i], r[i+1] 35 | if lo <= r[w-1]+1 { 36 | // merge with previous range 37 | if hi > r[w-1] { 38 | r[w-1] = hi 39 | } 40 | continue 41 | } 42 | // new disjoint range 43 | r[w] = lo 44 | r[w+1] = hi 45 | w += 2 46 | } 47 | 48 | return r[:w] 49 | } 50 | 51 | // appendRange returns the result of appending the range lo-hi to the class r. 52 | func appendRange(r []rune, lo, hi rune) []rune { 53 | // Expand last range or next to last range if it overlaps or abuts. 54 | // Checking two ranges helps when appending case-folded 55 | // alphabets, so that one range can be expanding A-Z and the 56 | // other expanding a-z. 57 | n := len(r) 58 | for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 59 | if n >= i { 60 | rlo, rhi := r[n-i], r[n-i+1] 61 | if lo <= rhi+1 && rlo <= hi+1 { 62 | if lo < rlo { 63 | r[n-i] = lo 64 | } 65 | if hi > rhi { 66 | r[n-i+1] = hi 67 | } 68 | return r 69 | } 70 | } 71 | } 72 | 73 | return append(r, lo, hi) 74 | } 75 | 76 | const ( 77 | // minimum and maximum runes involved in folding. 78 | // checked during test. 79 | minFold = 0x0041 80 | maxFold = 0x1044f 81 | ) 82 | 83 | // appendFoldedRange returns the result of appending the range lo-hi 84 | // and its case folding-equivalent runes to the class r. 85 | func appendFoldedRange(r []rune, lo, hi rune) []rune { 86 | // Optimizations. 87 | if lo <= minFold && hi >= maxFold { 88 | // Range is full: folding can't add more. 89 | return appendRange(r, lo, hi) 90 | } 91 | if hi < minFold || lo > maxFold { 92 | // Range is outside folding possibilities. 93 | return appendRange(r, lo, hi) 94 | } 95 | if lo < minFold { 96 | // [lo, minFold-1] needs no folding. 97 | r = appendRange(r, lo, minFold-1) 98 | lo = minFold 99 | } 100 | if hi > maxFold { 101 | // [maxFold+1, hi] needs no folding. 102 | r = appendRange(r, maxFold+1, hi) 103 | hi = maxFold 104 | } 105 | 106 | // Brute force. Depend on appendRange to coalesce ranges on the fly. 107 | for c := lo; c <= hi; c++ { 108 | r = appendRange(r, c, c) 109 | f := unicode.SimpleFold(c) 110 | for f != c { 111 | r = appendRange(r, f, f) 112 | f = unicode.SimpleFold(f) 113 | } 114 | } 115 | return r 116 | } 117 | 118 | // ranges implements sort.Interface on a []rune. 119 | // The choice of receiver type definition is strange 120 | // but avoids an allocation since we already have 121 | // a *[]rune. 122 | type ranges struct { 123 | p *[]rune 124 | } 125 | 126 | func (ra ranges) Less(i, j int) bool { 127 | p := *ra.p 128 | i *= 2 129 | j *= 2 130 | return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 131 | } 132 | 133 | func (ra ranges) Len() int { 134 | return len(*ra.p) / 2 135 | } 136 | 137 | func (ra ranges) Swap(i, j int) { 138 | p := *ra.p 139 | i *= 2 140 | j *= 2 141 | p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 142 | } 143 | 144 | func progString(p *syntax.Prog) string { 145 | var b bytes.Buffer 146 | dumpProg(&b, p) 147 | return b.String() 148 | } 149 | 150 | func instString(i *syntax.Inst) string { 151 | var b bytes.Buffer 152 | dumpInst(&b, i) 153 | return b.String() 154 | } 155 | 156 | func bw(b *bytes.Buffer, args ...string) { 157 | for _, s := range args { 158 | b.WriteString(s) 159 | } 160 | } 161 | 162 | func dumpProg(b *bytes.Buffer, p *syntax.Prog) { 163 | for j := range p.Inst { 164 | i := &p.Inst[j] 165 | pc := strconv.Itoa(j) 166 | if len(pc) < 3 { 167 | b.WriteString(" "[len(pc):]) 168 | } 169 | if j == p.Start { 170 | pc += "*" 171 | } 172 | bw(b, pc, "\t") 173 | dumpInst(b, i) 174 | bw(b, "\n") 175 | } 176 | } 177 | 178 | func u32(i uint32) string { 179 | return strconv.FormatUint(uint64(i), 10) 180 | } 181 | 182 | func dumpInst(b *bytes.Buffer, i *syntax.Inst) { 183 | switch i.Op { 184 | case syntax.InstAlt: 185 | bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 186 | case syntax.InstAltMatch: 187 | bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 188 | case syntax.InstCapture: 189 | bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 190 | case syntax.InstEmptyWidth: 191 | bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 192 | case syntax.InstMatch: 193 | bw(b, "match") 194 | case syntax.InstFail: 195 | bw(b, "fail") 196 | case syntax.InstNop: 197 | bw(b, "nop -> ", u32(i.Out)) 198 | case instByteRange: 199 | fmt.Fprintf(b, "byte %02x-%02x", (i.Arg>>8)&0xFF, i.Arg&0xFF) 200 | if i.Arg&argFold != 0 { 201 | bw(b, "/i") 202 | } 203 | bw(b, " -> ", u32(i.Out)) 204 | 205 | // Should not happen 206 | case syntax.InstRune: 207 | if i.Rune == nil { 208 | // shouldn't happen 209 | bw(b, "rune ") 210 | } 211 | bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 212 | if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { 213 | bw(b, "/i") 214 | } 215 | bw(b, " -> ", u32(i.Out)) 216 | case syntax.InstRune1: 217 | bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 218 | case syntax.InstRuneAny: 219 | bw(b, "any -> ", u32(i.Out)) 220 | case syntax.InstRuneAnyNotNL: 221 | bw(b, "anynotnl -> ", u32(i.Out)) 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /regexp/match.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp 6 | 7 | import ( 8 | "bytes" 9 | "encoding/binary" 10 | "flag" 11 | "fmt" 12 | "io" 13 | "os" 14 | "regexp/syntax" 15 | "sort" 16 | 17 | "github.com/google/codesearch/sparse" 18 | ) 19 | 20 | // A matcher holds the state for running regular expression search. 21 | type matcher struct { 22 | prog *syntax.Prog // compiled program 23 | dstate map[string]*dstate // dstate cache 24 | start *dstate // start state 25 | startLine *dstate // start state for beginning of line 26 | z1, z2 nstate // two temporary nstates 27 | } 28 | 29 | // An nstate corresponds to an NFA state. 30 | type nstate struct { 31 | q sparse.Set // queue of program instructions 32 | partial rune // partially decoded rune (TODO) 33 | flag flags // flags (TODO) 34 | } 35 | 36 | // The flags record state about a position between bytes in the text. 37 | type flags uint32 38 | 39 | const ( 40 | flagBOL flags = 1 << iota // beginning of line 41 | flagEOL // end of line 42 | flagBOT // beginning of text 43 | flagEOT // end of text 44 | flagWord // last byte was word byte 45 | ) 46 | 47 | // A dstate corresponds to a DFA state. 48 | type dstate struct { 49 | next [256]*dstate // next state, per byte 50 | enc string // encoded nstate 51 | matchNL bool // match when next byte is \n 52 | matchEOT bool // match in this state at end of text 53 | } 54 | 55 | func (z *nstate) String() string { 56 | return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.partial) 57 | } 58 | 59 | // enc encodes z as a string. 60 | func (z *nstate) enc() string { 61 | var buf []byte 62 | var v [10]byte 63 | last := ^uint32(0) 64 | n := binary.PutUvarint(v[:], uint64(z.partial)) 65 | buf = append(buf, v[:n]...) 66 | n = binary.PutUvarint(v[:], uint64(z.flag)) 67 | buf = append(buf, v[:n]...) 68 | dense := z.q.Dense() 69 | ids := make([]int, 0, len(dense)) 70 | for _, id := range z.q.Dense() { 71 | ids = append(ids, int(id)) 72 | } 73 | sort.Ints(ids) 74 | for _, id := range ids { 75 | n := binary.PutUvarint(v[:], uint64(uint32(id)-last)) 76 | buf = append(buf, v[:n]...) 77 | last = uint32(id) 78 | } 79 | return string(buf) 80 | } 81 | 82 | // dec decodes the encoding s into z. 83 | func (z *nstate) dec(s string) { 84 | b := []byte(s) 85 | i, n := binary.Uvarint(b) 86 | if n <= 0 { 87 | bug() 88 | } 89 | b = b[n:] 90 | z.partial = rune(i) 91 | i, n = binary.Uvarint(b) 92 | if n <= 0 { 93 | bug() 94 | } 95 | b = b[n:] 96 | z.flag = flags(i) 97 | z.q.Reset() 98 | last := ^uint32(0) 99 | for len(b) > 0 { 100 | i, n = binary.Uvarint(b) 101 | if n <= 0 { 102 | bug() 103 | } 104 | b = b[n:] 105 | last += uint32(i) 106 | z.q.Add(last) 107 | } 108 | } 109 | 110 | // dmatch is the state we're in when we've seen a match and are just 111 | // waiting for the end of the line. 112 | var dmatch = dstate{ 113 | matchNL: true, 114 | matchEOT: true, 115 | } 116 | 117 | func init() { 118 | var z nstate 119 | dmatch.enc = z.enc() 120 | for i := range dmatch.next { 121 | if i != '\n' { 122 | dmatch.next[i] = &dmatch 123 | } 124 | } 125 | } 126 | 127 | // init initializes the matcher. 128 | func (m *matcher) init(prog *syntax.Prog) error { 129 | m.prog = prog 130 | m.dstate = make(map[string]*dstate) 131 | 132 | m.z1.q.Init(uint32(len(prog.Inst))) 133 | m.z2.q.Init(uint32(len(prog.Inst))) 134 | 135 | m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText) 136 | m.z1.flag = flagBOL | flagBOT 137 | m.start = m.cache(&m.z1) 138 | 139 | m.z1.q.Reset() 140 | m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine) 141 | m.z1.flag = flagBOL 142 | m.startLine = m.cache(&m.z1) 143 | 144 | return nil 145 | } 146 | 147 | // stepEmpty steps runq to nextq expanding according to flag. 148 | func (m *matcher) stepEmpty(runq, nextq *sparse.Set, flag syntax.EmptyOp) { 149 | nextq.Reset() 150 | for _, id := range runq.Dense() { 151 | m.addq(nextq, id, flag) 152 | } 153 | } 154 | 155 | // stepByte steps runq to nextq consuming c and then expanding according to flag. 156 | // It returns true if a match ends immediately before c. 157 | // c is either an input byte or endText. 158 | func (m *matcher) stepByte(runq, nextq *sparse.Set, c int, flag syntax.EmptyOp) (match bool) { 159 | nextq.Reset() 160 | m.addq(nextq, uint32(m.prog.Start), flag) 161 | for _, id := range runq.Dense() { 162 | i := &m.prog.Inst[id] 163 | switch i.Op { 164 | default: 165 | continue 166 | case syntax.InstMatch: 167 | match = true 168 | continue 169 | case instByteRange: 170 | if c == endText { 171 | break 172 | } 173 | lo := int((i.Arg >> 8) & 0xFF) 174 | hi := int(i.Arg & 0xFF) 175 | ch := c 176 | if i.Arg&argFold != 0 && 'a' <= ch && ch <= 'z' { 177 | ch += 'A' - 'a' 178 | } 179 | if lo <= ch && ch <= hi { 180 | m.addq(nextq, i.Out, flag) 181 | } 182 | } 183 | } 184 | return 185 | } 186 | 187 | // addq adds id to the queue, expanding according to flag. 188 | func (m *matcher) addq(q *sparse.Set, id uint32, flag syntax.EmptyOp) { 189 | if q.Has(id) { 190 | return 191 | } 192 | q.Add(id) 193 | i := &m.prog.Inst[id] 194 | switch i.Op { 195 | case syntax.InstCapture, syntax.InstNop: 196 | m.addq(q, i.Out, flag) 197 | case syntax.InstAlt, syntax.InstAltMatch: 198 | m.addq(q, i.Out, flag) 199 | m.addq(q, i.Arg, flag) 200 | case syntax.InstEmptyWidth: 201 | if syntax.EmptyOp(i.Arg)&^flag == 0 { 202 | m.addq(q, i.Out, flag) 203 | } 204 | } 205 | } 206 | 207 | const endText = -1 208 | 209 | // computeNext computes the next DFA state if we're in d reading c (an input byte or endText). 210 | func (m *matcher) computeNext(d *dstate, c int) *dstate { 211 | this, next := &m.z1, &m.z2 212 | this.dec(d.enc) 213 | 214 | // compute flags in effect before c 215 | flag := syntax.EmptyOp(0) 216 | if this.flag&flagBOL != 0 { 217 | flag |= syntax.EmptyBeginLine 218 | } 219 | if this.flag&flagBOT != 0 { 220 | flag |= syntax.EmptyBeginText 221 | } 222 | if this.flag&flagWord != 0 { 223 | if !isWordByte(c) { 224 | flag |= syntax.EmptyWordBoundary 225 | } else { 226 | flag |= syntax.EmptyNoWordBoundary 227 | } 228 | } else { 229 | if isWordByte(c) { 230 | flag |= syntax.EmptyWordBoundary 231 | } else { 232 | flag |= syntax.EmptyNoWordBoundary 233 | } 234 | } 235 | if c == '\n' { 236 | flag |= syntax.EmptyEndLine 237 | } 238 | if c == endText { 239 | flag |= syntax.EmptyEndLine | syntax.EmptyEndText 240 | } 241 | 242 | // re-expand queue using new flags. 243 | // TODO: only do this when it matters 244 | // (something is gating on word boundaries). 245 | m.stepEmpty(&this.q, &next.q, flag) 246 | this, next = next, this 247 | 248 | // now compute flags after c. 249 | flag = 0 250 | next.flag = 0 251 | if c == '\n' { 252 | flag |= syntax.EmptyBeginLine 253 | next.flag |= flagBOL 254 | } 255 | if isWordByte(c) { 256 | next.flag |= flagWord 257 | } 258 | 259 | // re-add start, process rune + expand according to flags. 260 | if m.stepByte(&this.q, &next.q, c, flag) { 261 | return &dmatch 262 | } 263 | return m.cache(next) 264 | } 265 | 266 | func (m *matcher) cache(z *nstate) *dstate { 267 | enc := z.enc() 268 | d := m.dstate[enc] 269 | if d != nil { 270 | return d 271 | } 272 | 273 | d = &dstate{enc: enc} 274 | m.dstate[enc] = d 275 | d.matchNL = m.computeNext(d, '\n') == &dmatch 276 | d.matchEOT = m.computeNext(d, endText) == &dmatch 277 | return d 278 | } 279 | 280 | func (m *matcher) match(b []byte, beginText, endText bool) (end int) { 281 | // fmt.Printf("%v\n", m.prog) 282 | 283 | d := m.startLine 284 | if beginText { 285 | d = m.start 286 | } 287 | // m.z1.dec(d.enc) 288 | // fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch) 289 | for i, c := range b { 290 | d1 := d.next[c] 291 | if d1 == nil { 292 | if c == '\n' { 293 | if d.matchNL { 294 | return i 295 | } 296 | d1 = m.startLine 297 | } else { 298 | d1 = m.computeNext(d, int(c)) 299 | } 300 | d.next[c] = d1 301 | } 302 | d = d1 303 | // m.z1.dec(d.enc) 304 | // fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT) 305 | } 306 | if d.matchNL || endText && d.matchEOT { 307 | return len(b) 308 | } 309 | return -1 310 | } 311 | 312 | func (m *matcher) matchString(b string, beginText, endText bool) (end int) { 313 | d := m.startLine 314 | if beginText { 315 | d = m.start 316 | } 317 | for i := 0; i < len(b); i++ { 318 | c := b[i] 319 | d1 := d.next[c] 320 | if d1 == nil { 321 | if c == '\n' { 322 | if d.matchNL { 323 | return i 324 | } 325 | d1 = m.startLine 326 | } else { 327 | d1 = m.computeNext(d, int(c)) 328 | } 329 | d.next[c] = d1 330 | } 331 | d = d1 332 | } 333 | if d.matchNL || endText && d.matchEOT { 334 | return len(b) 335 | } 336 | return -1 337 | } 338 | 339 | // isWordByte reports whether the byte c is a word character: ASCII only. 340 | // This is used to implement \b and \B. This is not right for Unicode, but: 341 | // - it's hard to get right in a byte-at-a-time matching world 342 | // (the DFA has only one-byte lookahead) 343 | // - this crude approximation is the same one PCRE uses 344 | func isWordByte(c int) bool { 345 | return 'A' <= c && c <= 'Z' || 346 | 'a' <= c && c <= 'z' || 347 | '0' <= c && c <= '9' || 348 | c == '_' 349 | } 350 | 351 | // TODO: 352 | type Grep struct { 353 | Regexp *Regexp // regexp to search for 354 | Stdout io.Writer // output target 355 | Stderr io.Writer // error target 356 | 357 | L bool // L flag - print file names only 358 | C bool // C flag - print count of matches 359 | N bool // N flag - print line numbers 360 | H bool // H flag - do not print file names 361 | 362 | Match bool 363 | 364 | buf []byte 365 | } 366 | 367 | func (g *Grep) AddFlags() { 368 | flag.BoolVar(&g.L, "l", false, "list matching files only") 369 | flag.BoolVar(&g.C, "c", false, "print match counts only") 370 | flag.BoolVar(&g.N, "n", false, "show line numbers") 371 | flag.BoolVar(&g.H, "h", false, "omit file names") 372 | } 373 | 374 | func (g *Grep) File(name string) { 375 | f, err := os.Open(name) 376 | if err != nil { 377 | fmt.Fprintf(g.Stderr, "%s\n", err) 378 | return 379 | } 380 | defer f.Close() 381 | g.Reader(f, name) 382 | } 383 | 384 | var nl = []byte{'\n'} 385 | 386 | func countNL(b []byte) int { 387 | n := 0 388 | for { 389 | i := bytes.IndexByte(b, '\n') 390 | if i < 0 { 391 | break 392 | } 393 | n++ 394 | b = b[i+1:] 395 | } 396 | return n 397 | } 398 | 399 | func (g *Grep) Reader(r io.Reader, name string) { 400 | if g.buf == nil { 401 | g.buf = make([]byte, 1<<20) 402 | } 403 | var ( 404 | buf = g.buf[:0] 405 | needLineno = g.N 406 | lineno = 1 407 | count = 0 408 | prefix = "" 409 | beginText = true 410 | endText = false 411 | ) 412 | if !g.H { 413 | prefix = name + ":" 414 | } 415 | for { 416 | n, err := io.ReadFull(r, buf[len(buf):cap(buf)]) 417 | buf = buf[:len(buf)+n] 418 | end := len(buf) 419 | if err == nil { 420 | i := bytes.LastIndex(buf, nl) 421 | if i >= 0 { 422 | end = i + 1 423 | } 424 | } else { 425 | endText = true 426 | } 427 | chunkStart := 0 428 | for chunkStart < end { 429 | m1 := g.Regexp.Match(buf[chunkStart:end], beginText, endText) + chunkStart 430 | beginText = false 431 | if m1 < chunkStart { 432 | break 433 | } 434 | g.Match = true 435 | if g.L { 436 | fmt.Fprintf(g.Stdout, "%s\n", name) 437 | return 438 | } 439 | lineStart := bytes.LastIndex(buf[chunkStart:m1], nl) + 1 + chunkStart 440 | lineEnd := m1 + 1 441 | if lineEnd > end { 442 | lineEnd = end 443 | } 444 | if needLineno { 445 | lineno += countNL(buf[chunkStart:lineStart]) 446 | } 447 | line := buf[lineStart:lineEnd] 448 | nl := "" 449 | if len(line) == 0 || line[len(line)-1] != '\n' { 450 | nl = "\n" 451 | } 452 | switch { 453 | case g.C: 454 | count++ 455 | case g.N: 456 | fmt.Fprintf(g.Stdout, "%s%d:%s%s", prefix, lineno, line, nl) 457 | default: 458 | fmt.Fprintf(g.Stdout, "%s%s%s", prefix, line, nl) 459 | } 460 | if needLineno { 461 | lineno++ 462 | } 463 | chunkStart = lineEnd 464 | } 465 | if needLineno && err == nil { 466 | lineno += countNL(buf[chunkStart:end]) 467 | } 468 | n = copy(buf, buf[end:]) 469 | buf = buf[:n] 470 | if len(buf) == 0 && err != nil { 471 | if err != io.EOF && err != io.ErrUnexpectedEOF { 472 | fmt.Fprintf(g.Stderr, "%s: %v\n", name, err) 473 | } 474 | break 475 | } 476 | } 477 | if g.C && count > 0 { 478 | fmt.Fprintf(g.Stdout, "%s: %d\n", name, count) 479 | } 480 | } 481 | -------------------------------------------------------------------------------- /regexp/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package regexp implements regular expression search tuned for 6 | // use in grep-like programs. 7 | package regexp 8 | 9 | import "regexp/syntax" 10 | 11 | func bug() { 12 | panic("codesearch/regexp: internal error") 13 | } 14 | 15 | // Regexp is the representation of a compiled regular expression. 16 | // A Regexp is NOT SAFE for concurrent use by multiple goroutines. 17 | type Regexp struct { 18 | Syntax *syntax.Regexp 19 | expr string // original expression 20 | m matcher 21 | } 22 | 23 | // String returns the source text used to compile the regular expression. 24 | func (re *Regexp) String() string { 25 | return re.expr 26 | } 27 | 28 | // Compile parses a regular expression and returns, if successful, 29 | // a Regexp object that can be used to match against lines of text. 30 | func Compile(expr string) (*Regexp, error) { 31 | re, err := syntax.Parse(expr, syntax.Perl) 32 | if err != nil { 33 | return nil, err 34 | } 35 | sre := re.Simplify() 36 | prog, err := syntax.Compile(sre) 37 | if err != nil { 38 | return nil, err 39 | } 40 | if err := toByteProg(prog); err != nil { 41 | return nil, err 42 | } 43 | r := &Regexp{ 44 | Syntax: re, 45 | expr: expr, 46 | } 47 | if err := r.m.init(prog); err != nil { 48 | return nil, err 49 | } 50 | return r, nil 51 | } 52 | 53 | func (r *Regexp) Match(b []byte, beginText, endText bool) (end int) { 54 | return r.m.match(b, beginText, endText) 55 | } 56 | 57 | func (r *Regexp) MatchString(s string, beginText, endText bool) (end int) { 58 | return r.m.matchString(s, beginText, endText) 59 | } 60 | -------------------------------------------------------------------------------- /regexp/regexp_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp 6 | 7 | import ( 8 | "bytes" 9 | "reflect" 10 | "strings" 11 | "testing" 12 | ) 13 | 14 | var nstateTests = []struct { 15 | q []uint32 16 | partial rune 17 | }{ 18 | {[]uint32{1, 2, 3}, 1}, 19 | {[]uint32{1}, 1}, 20 | {[]uint32{}, 0}, 21 | {[]uint32{1, 2, 8}, 0x10FFF}, 22 | } 23 | 24 | func TestNstateEnc(t *testing.T) { 25 | var n1, n2 nstate 26 | n1.q.Init(10) 27 | n2.q.Init(10) 28 | for _, tt := range nstateTests { 29 | n1.q.Reset() 30 | n1.partial = tt.partial 31 | for _, id := range tt.q { 32 | n1.q.Add(id) 33 | } 34 | enc := n1.enc() 35 | n2.dec(enc) 36 | if n2.partial != n1.partial || !reflect.DeepEqual(n1.q.Dense(), n2.q.Dense()) { 37 | t.Errorf("%v.enc.dec = %v", &n1, &n2) 38 | } 39 | } 40 | } 41 | 42 | var matchTests = []struct { 43 | re string 44 | s string 45 | m []int 46 | }{ 47 | // Adapted from go/src/pkg/regexp/find_test.go. 48 | {`a+`, "abc\ndef\nghi\n", []int{1}}, 49 | {``, ``, []int{1}}, 50 | {`^abcdefg`, "abcdefg", []int{1}}, 51 | {`a+`, "baaab", []int{1}}, 52 | {"abcd..", "abcdef", []int{1}}, 53 | {`a`, "a", []int{1}}, 54 | {`x`, "y", nil}, 55 | {`b`, "abc", []int{1}}, 56 | {`.`, "a", []int{1}}, 57 | {`.*`, "abcdef", []int{1}}, 58 | {`^`, "abcde", []int{1}}, 59 | {`$`, "abcde", []int{1}}, 60 | {`^abcd$`, "abcd", []int{1}}, 61 | {`^bcd'`, "abcdef", nil}, 62 | {`^abcd$`, "abcde", nil}, 63 | {`a+`, "baaab", []int{1}}, 64 | {`a*`, "baaab", []int{1}}, 65 | {`[a-z]+`, "abcd", []int{1}}, 66 | {`[^a-z]+`, "ab1234cd", []int{1}}, 67 | {`[a\-\]z]+`, "az]-bcz", []int{1}}, 68 | {`[^\n]+`, "abcd\n", []int{1}}, 69 | {`[日本語]+`, "日本語日本語", []int{1}}, 70 | {`日本語+`, "日本語", []int{1}}, 71 | {`日本語+`, "日本語語語語", []int{1}}, 72 | {`()`, "", []int{1}}, 73 | {`(a)`, "a", []int{1}}, 74 | {`(.)(.)`, "日a", []int{1}}, 75 | {`(.*)`, "", []int{1}}, 76 | {`(.*)`, "abcd", []int{1}}, 77 | {`(..)(..)`, "abcd", []int{1}}, 78 | {`(([^xyz]*)(d))`, "abcd", []int{1}}, 79 | {`((a|b|c)*(d))`, "abcd", []int{1}}, 80 | {`(((a|b|c)*)(d))`, "abcd", []int{1}}, 81 | {`\a\f\r\t\v`, "\a\f\r\t\v", []int{1}}, 82 | {`[\a\f\n\r\t\v]+`, "\a\f\r\t\v", []int{1}}, 83 | 84 | {`a*(|(b))c*`, "aacc", []int{1}}, 85 | {`(.*).*`, "ab", []int{1}}, 86 | {`[.]`, ".", []int{1}}, 87 | {`/$`, "/abc/", []int{1}}, 88 | {`/$`, "/abc", nil}, 89 | 90 | // multiple matches 91 | {`.`, "abc", []int{1}}, 92 | {`(.)`, "abc", []int{1}}, 93 | {`.(.)`, "abcd", []int{1}}, 94 | {`ab*`, "abbaab", []int{1}}, 95 | {`a(b*)`, "abbaab", []int{1}}, 96 | 97 | // fixed bugs 98 | {`ab$`, "cab", []int{1}}, 99 | {`axxb$`, "axxcb", nil}, 100 | {`data`, "daXY data", []int{1}}, 101 | {`da(.)a$`, "daXY data", []int{1}}, 102 | {`zx+`, "zzx", []int{1}}, 103 | {`ab$`, "abcab", []int{1}}, 104 | {`(aa)*$`, "a", []int{1}}, 105 | {`(?:.|(?:.a))`, "", nil}, 106 | {`(?:A(?:A|a))`, "Aa", []int{1}}, 107 | {`(?:A|(?:A|a))`, "a", []int{1}}, 108 | {`(a){0}`, "", []int{1}}, 109 | // {`(?-s)(?:(?:^).)`, "\n", nil}, 110 | // {`(?s)(?:(?:^).)`, "\n", []int{1}}, 111 | // {`(?:(?:^).)`, "\n", nil}, 112 | {`\b`, "x", []int{1}}, 113 | {`\b`, "xx", []int{1}}, 114 | {`\b`, "x y", []int{1}}, 115 | {`\b`, "xx yy", []int{1}}, 116 | {`\B`, "x", nil}, 117 | {`\B`, "xx", []int{1}}, 118 | {`\B`, "x y", nil}, 119 | {`\B`, "xx yy", []int{1}}, 120 | {`(?im)^[abc]+$`, "abcABC", []int{1}}, 121 | {`(?im)^[α]+$`, "αΑ", []int{1}}, 122 | {`[Aa]BC`, "abc", nil}, 123 | {`[Aa]bc`, "abc", []int{1}}, 124 | 125 | // RE2 tests 126 | {`[^\S\s]`, "abcd", nil}, 127 | {`[^\S[:space:]]`, "abcd", nil}, 128 | {`[^\D\d]`, "abcd", nil}, 129 | {`[^\D[:digit:]]`, "abcd", nil}, 130 | {`(?i)\W`, "x", nil}, 131 | {`(?i)\W`, "k", nil}, 132 | {`(?i)\W`, "s", nil}, 133 | 134 | // can backslash-escape any punctuation 135 | {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, 136 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}}, 137 | {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, 138 | `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, []int{1}}, 139 | {"\\`", "`", []int{1}}, 140 | {"[\\`]+", "`", []int{1}}, 141 | 142 | // long set of matches (longer than startSize) 143 | { 144 | ".", 145 | "qwertyuiopasdfghjklzxcvbnm1234567890", 146 | []int{1}, 147 | }, 148 | } 149 | 150 | func TestMatch(t *testing.T) { 151 | for _, tt := range matchTests { 152 | re, err := Compile("(?m)" + tt.re) 153 | if err != nil { 154 | t.Errorf("Compile(%#q): %v", tt.re, err) 155 | continue 156 | } 157 | b := []byte(tt.s) 158 | lines := grep(re, b) 159 | if !reflect.DeepEqual(lines, tt.m) { 160 | t.Errorf("grep(%#q, %q) = %v, want %v", tt.re, tt.s, lines, tt.m) 161 | } 162 | } 163 | } 164 | 165 | func grep(re *Regexp, b []byte) []int { 166 | var m []int 167 | lineno := 1 168 | for { 169 | i := re.Match(b, true, true) 170 | if i < 0 { 171 | break 172 | } 173 | start := bytes.LastIndex(b[:i], nl) + 1 174 | end := i + 1 175 | if end > len(b) { 176 | end = len(b) 177 | } 178 | lineno += bytes.Count(b[:start], nl) 179 | m = append(m, lineno) 180 | if start < end && b[end-1] == '\n' { 181 | lineno++ 182 | } 183 | b = b[end:] 184 | if len(b) == 0 { 185 | break 186 | } 187 | } 188 | return m 189 | } 190 | 191 | var grepTests = []struct { 192 | re string 193 | s string 194 | out string 195 | err string 196 | g Grep 197 | }{ 198 | {re: `a+`, s: "abc\ndef\nghalloo\n", out: "input:abc\ninput:ghalloo\n"}, 199 | {re: `x.*y`, s: "xay\nxa\ny\n", out: "input:xay\n"}, 200 | } 201 | 202 | func TestGrep(t *testing.T) { 203 | for i, tt := range grepTests { 204 | re, err := Compile("(?m)" + tt.re) 205 | if err != nil { 206 | t.Errorf("Compile(%#q): %v", tt.re, err) 207 | continue 208 | } 209 | g := tt.g 210 | g.Regexp = re 211 | var out, errb bytes.Buffer 212 | g.Stdout = &out 213 | g.Stderr = &errb 214 | g.Reader(strings.NewReader(tt.s), "input") 215 | if out.String() != tt.out || errb.String() != tt.err { 216 | t.Errorf("#%d: grep(%#q, %q) = %q, %q, want %q, %q", i, tt.re, tt.s, out.String(), errb.String(), tt.out, tt.err) 217 | } 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /regexp/utf.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package regexp 6 | 7 | import ( 8 | "regexp/syntax" 9 | "unicode" 10 | "unicode/utf8" 11 | ) 12 | 13 | const ( 14 | instFail = syntax.InstFail 15 | instAlt = syntax.InstAlt 16 | instByteRange = syntax.InstRune | 0x80 // local opcode 17 | 18 | argFold = 1 << 16 19 | ) 20 | 21 | func toByteProg(prog *syntax.Prog) error { 22 | var b runeBuilder 23 | for pc := range prog.Inst { 24 | i := &prog.Inst[pc] 25 | switch i.Op { 26 | case syntax.InstRune, syntax.InstRune1: 27 | // General rune range. PIA. 28 | // TODO: Pick off single-byte case. 29 | if lo, hi, fold, ok := oneByteRange(i); ok { 30 | i.Op = instByteRange 31 | i.Arg = uint32(lo)<<8 | uint32(hi) 32 | if fold { 33 | i.Arg |= argFold 34 | } 35 | break 36 | } 37 | 38 | r := i.Rune 39 | if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { 40 | // Build folded list. 41 | var rr []rune 42 | if len(r) == 1 { 43 | rr = appendFoldedRange(rr, r[0], r[0]) 44 | } else { 45 | for j := 0; j < len(r); j += 2 { 46 | rr = appendFoldedRange(rr, r[j], r[j+1]) 47 | } 48 | } 49 | r = rr 50 | } 51 | 52 | b.init(prog, uint32(pc), i.Out) 53 | if len(r) == 1 { 54 | b.addRange(r[0], r[0], false) 55 | } else { 56 | for j := 0; j < len(r); j += 2 { 57 | b.addRange(r[j], r[j+1], false) 58 | } 59 | } 60 | 61 | case syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 62 | // All runes. 63 | // AnyNotNL should exclude \n but the line-at-a-time 64 | // execution takes care of that for us. 65 | b.init(prog, uint32(pc), i.Out) 66 | b.addRange(0, unicode.MaxRune, false) 67 | } 68 | } 69 | return nil 70 | } 71 | 72 | func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) { 73 | if i.Op == syntax.InstRune1 { 74 | r := i.Rune[0] 75 | if r < utf8.RuneSelf { 76 | return byte(r), byte(r), false, true 77 | } 78 | } 79 | if i.Op != syntax.InstRune { 80 | return 81 | } 82 | fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0 83 | if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] { 84 | r := i.Rune[0] 85 | if r >= utf8.RuneSelf { 86 | return 87 | } 88 | if fold && !asciiFold(r) { 89 | return 90 | } 91 | return byte(r), byte(r), fold, true 92 | } 93 | if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf { 94 | if fold { 95 | for r := i.Rune[0]; r <= i.Rune[1]; r++ { 96 | if asciiFold(r) { 97 | return 98 | } 99 | } 100 | } 101 | return byte(i.Rune[0]), byte(i.Rune[1]), fold, true 102 | } 103 | if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] { 104 | return byte(i.Rune[0]), byte(i.Rune[0]), true, true 105 | } 106 | 107 | return 108 | } 109 | 110 | func asciiFold(r rune) bool { 111 | if r >= utf8.RuneSelf { 112 | return false 113 | } 114 | r1 := unicode.SimpleFold(r) 115 | if r1 >= utf8.RuneSelf { 116 | return false 117 | } 118 | if r1 == r { 119 | return true 120 | } 121 | return unicode.SimpleFold(r1) == r 122 | } 123 | 124 | func maxRune(n int) rune { 125 | b := 0 126 | if n == 1 { 127 | b = 7 128 | } else { 129 | b = 8 - (n + 1) + 6*(n-1) 130 | } 131 | return 1< 0xbf { 178 | // Not a continuation byte, no need to cache. 179 | return b.uncachedSuffix(lo, hi, fold, next) 180 | } 181 | 182 | key := cacheKey{lo, hi, fold, next} 183 | if pc, ok := b.cache[key]; ok { 184 | return pc 185 | } 186 | 187 | pc := b.uncachedSuffix(lo, hi, fold, next) 188 | b.cache[key] = pc 189 | return pc 190 | } 191 | 192 | func (b *runeBuilder) addBranch(pc uint32) { 193 | // Add pc to the branch at the beginning. 194 | i := &b.p.Inst[b.begin] 195 | switch i.Op { 196 | case syntax.InstFail: 197 | i.Op = syntax.InstNop 198 | i.Out = pc 199 | return 200 | case syntax.InstNop: 201 | i.Op = syntax.InstAlt 202 | i.Arg = pc 203 | return 204 | case syntax.InstAlt: 205 | apc := uint32(len(b.p.Inst)) 206 | b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc}) 207 | i = &b.p.Inst[b.begin] 208 | i.Arg = apc 209 | b.begin = apc 210 | } 211 | } 212 | 213 | func (b *runeBuilder) addRange(lo, hi rune, fold bool) { 214 | if lo > hi { 215 | return 216 | } 217 | 218 | // TODO: Pick off 80-10FFFF for special handling? 219 | if lo == 0x80 && hi == 0x10FFFF { 220 | } 221 | 222 | // Split range into same-length sized ranges. 223 | for i := 1; i < utf8.UTFMax; i++ { 224 | max := maxRune(i) 225 | if lo <= max && max < hi { 226 | b.addRange(lo, max, fold) 227 | b.addRange(max+1, hi, fold) 228 | return 229 | } 230 | } 231 | 232 | // ASCII range is special. 233 | if hi < utf8.RuneSelf { 234 | b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0)) 235 | return 236 | } 237 | 238 | // Split range into sections that agree on leading bytes. 239 | for i := 1; i < utf8.UTFMax; i++ { 240 | m := rune(1)<= 0; i-- { 265 | pc = b.suffix(ulo[i], uhi[i], false, pc) 266 | } 267 | b.addBranch(pc) 268 | } 269 | -------------------------------------------------------------------------------- /sparse/set.go: -------------------------------------------------------------------------------- 1 | // Copyright 2011 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package sparse implements sparse sets. 6 | package sparse 7 | 8 | // For comparison: running cindex over the Linux 2.6 kernel with this 9 | // implementation of trigram sets takes 11 seconds. If I change it to 10 | // a bitmap (which must be cleared between files) it takes 25 seconds. 11 | 12 | // A Set is a sparse set of uint32 values. 13 | // http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 14 | type Set struct { 15 | dense []uint32 16 | sparse []uint32 17 | } 18 | 19 | // NewSet returns a new Set with a given maximum size. 20 | // The set can contain numbers in [0, max-1]. 21 | func NewSet(max uint32) *Set { 22 | return &Set{ 23 | sparse: make([]uint32, max), 24 | } 25 | } 26 | 27 | // Init initializes a Set to have a given maximum size. 28 | // The set can contain numbers in [0, max-1]. 29 | func (s *Set) Init(max uint32) { 30 | s.sparse = make([]uint32, max) 31 | } 32 | 33 | // Reset clears (empties) the set. 34 | func (s *Set) Reset() { 35 | s.dense = s.dense[:0] 36 | } 37 | 38 | // Add adds x to the set if it is not already there. 39 | func (s *Set) Add(x uint32) { 40 | v := s.sparse[x] 41 | if v < uint32(len(s.dense)) && s.dense[v] == x { 42 | return 43 | } 44 | n := len(s.dense) 45 | s.sparse[x] = uint32(n) 46 | s.dense = append(s.dense, x) 47 | } 48 | 49 | // Has reports whether x is in the set. 50 | func (s *Set) Has(x uint32) bool { 51 | v := s.sparse[x] 52 | return v < uint32(len(s.dense)) && s.dense[v] == x 53 | } 54 | 55 | // Dense returns the values in the set. 56 | // The values are listed in the order in which they 57 | // were inserted. 58 | func (s *Set) Dense() []uint32 { 59 | return s.dense 60 | } 61 | 62 | // Len returns the number of values in the set. 63 | func (s *Set) Len() int { 64 | return len(s.dense) 65 | } 66 | --------------------------------------------------------------------------------