├── .gitignore ├── README.md ├── internal ├── diff │ ├── testdata │ │ ├── same.txt │ │ ├── eof.txt │ │ ├── allnew.txt │ │ ├── allold.txt │ │ ├── eof1.txt │ │ ├── eof2.txt │ │ ├── start.txt │ │ ├── dups.txt │ │ ├── end.txt │ │ ├── basic.txt │ │ ├── triv.txt │ │ └── long.txt │ ├── diff_test.go │ └── diff.go ├── commentfix │ ├── testdata │ │ ├── nop.txt │ │ ├── autolink.txt │ │ ├── order.txt │ │ ├── replacetext.txt │ │ └── replaceurl.txt │ ├── fix_test.go │ └── fix.go ├── github │ ├── testing_test.go │ ├── edit_test.go │ ├── edit.go │ ├── data.go │ └── sync_test.go ├── llm │ ├── llm_test.go │ ├── embed_test.go │ ├── embed.go │ └── llm.go ├── storage │ ├── vectordb_test.go │ ├── db_test.go │ ├── mem_test.go │ ├── vtest.go │ ├── vectordb.go │ ├── test.go │ ├── db.go │ ├── mem.go │ └── timed │ │ └── timed_test.go ├── keycheck │ └── key_test.go ├── secret │ ├── secret_test.go │ └── secret.go ├── testutil │ └── testutil.go ├── pebble │ ├── pebble_test.go │ └── pebble.go ├── embeddocs │ ├── sync.go │ └── sync_test.go ├── githubdocs │ ├── sync.go │ └── sync_test.go ├── gemini │ ├── gemini_test.go │ └── gemini.go ├── docs │ ├── docs_test.go │ └── docs.go ├── testdata │ ├── omap.httprr │ └── markdown3.httprr ├── related │ ├── related_test.go │ └── related.go └── httprr │ ├── rr_test.go │ └── rr.go ├── LICENSE └── go.mod /.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Moved to . 
2 | -------------------------------------------------------------------------------- /internal/diff/testdata/same.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | hello world 3 | -- new -- 4 | hello world 5 | -- diff -- 6 | -------------------------------------------------------------------------------- /internal/diff/testdata/eof.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | a 3 | b 4 | c^D 5 | -- new -- 6 | a 7 | b 8 | c^D 9 | -- diff -- 10 | -------------------------------------------------------------------------------- /internal/diff/testdata/allnew.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | -- new -- 3 | a 4 | b 5 | c 6 | -- diff -- 7 | diff old new 8 | --- old 9 | +++ new 10 | @@ -0,0 +1,3 @@ 11 | +a 12 | +b 13 | +c 14 | -------------------------------------------------------------------------------- /internal/diff/testdata/allold.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | a 3 | b 4 | c 5 | -- new -- 6 | -- diff -- 7 | diff old new 8 | --- old 9 | +++ new 10 | @@ -1,3 +0,0 @@ 11 | -a 12 | -b 13 | -c 14 | -------------------------------------------------------------------------------- /internal/diff/testdata/eof1.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | a 3 | b 4 | c 5 | -- new -- 6 | a 7 | b 8 | c^D 9 | -- diff -- 10 | diff old new 11 | --- old 12 | +++ new 13 | @@ -1,3 +1,3 @@ 14 | a 15 | b 16 | -c 17 | +c 18 | \ No newline at end of file 19 | -------------------------------------------------------------------------------- /internal/diff/testdata/eof2.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | a 3 | b 4 | c^D 5 | -- new -- 6 | a 7 | b 8 | c 9 | -- diff -- 10 | diff old new 11 | --- old 12 | +++ new 13 | 
@@ -1,3 +1,3 @@ 14 | a 15 | b 16 | -c 17 | \ No newline at end of file 18 | +c 19 | -------------------------------------------------------------------------------- /internal/diff/testdata/start.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | e 3 | pi 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | 10 11 | -- new -- 12 | 1 13 | 2 14 | 3 15 | 4 16 | 5 17 | 6 18 | 7 19 | 8 20 | 9 21 | 10 22 | -- diff -- 23 | diff old new 24 | --- old 25 | +++ new 26 | @@ -1,5 +1,6 @@ 27 | -e 28 | -pi 29 | +1 30 | +2 31 | +3 32 | 4 33 | 5 34 | 6 35 | -------------------------------------------------------------------------------- /internal/diff/testdata/dups.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | a 3 | 4 | b 5 | 6 | c 7 | 8 | d 9 | 10 | e 11 | 12 | f 13 | -- new -- 14 | a 15 | 16 | B 17 | 18 | C 19 | 20 | d 21 | 22 | e 23 | 24 | f 25 | -- diff -- 26 | diff old new 27 | --- old 28 | +++ new 29 | @@ -1,8 +1,8 @@ 30 | a 31 | $ 32 | -b 33 | - 34 | -c 35 | +B 36 | + 37 | +C 38 | $ 39 | d 40 | $ 41 | -------------------------------------------------------------------------------- /internal/diff/testdata/end.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | eight 10 | nine 11 | ten 12 | eleven 13 | -- new -- 14 | 1 15 | 2 16 | 3 17 | 4 18 | 5 19 | 6 20 | 7 21 | 8 22 | 9 23 | 10 24 | -- diff -- 25 | diff old new 26 | --- old 27 | +++ new 28 | @@ -5,7 +5,6 @@ 29 | 5 30 | 6 31 | 7 32 | -eight 33 | -nine 34 | -ten 35 | -eleven 36 | +8 37 | +9 38 | +10 39 | -------------------------------------------------------------------------------- /internal/commentfix/testdata/nop.txt: -------------------------------------------------------------------------------- 1 | {{/* 2 | make sure this does not loop; 3 | it claims to have edited (and did edit) the text, 4 | so the result is non-empty, 5 | but 
no actual change is made. 6 | */}} 7 | {{.ReplaceText `cancelled` "canceled"}} 8 | {{.ReplaceText `canceled` "cancelled"}} 9 | -- 1.in -- 10 | The context is cancelled. 11 | -- 1.out -- 12 | The context is cancelled. 13 | -------------------------------------------------------------------------------- /internal/commentfix/testdata/autolink.txt: -------------------------------------------------------------------------------- 1 | {{.AutoLink `\bCL (\d+)\b` "https://go.dev/cl/$1"}} 2 | -- 1.in -- 3 | This is in CL 12345. 4 | -- 1.out -- 5 | This is in [CL 12345](https://go.dev/cl/12345). 6 | -- 2.in -- 7 | This is in **CL 12345**. 8 | -- 2.out -- 9 | This is in **[CL 12345](https://go.dev/cl/12345)**. 10 | -- 3.in -- 11 | This is in [the CL 12345 page](https://go.dev/cl/12345). 12 | -- 3.out -- 13 | -------------------------------------------------------------------------------- /internal/diff/testdata/basic.txt: -------------------------------------------------------------------------------- 1 | Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.” 2 | https://www.cs.dartmouth.edu/~doug/diff.pdf 3 | 4 | -- old -- 5 | a 6 | b 7 | c 8 | d 9 | e 10 | f 11 | g 12 | -- new -- 13 | w 14 | a 15 | b 16 | x 17 | y 18 | z 19 | e 20 | -- diff -- 21 | diff old new 22 | --- old 23 | +++ new 24 | @@ -1,7 +1,7 @@ 25 | +w 26 | a 27 | b 28 | -c 29 | -d 30 | +x 31 | +y 32 | +z 33 | e 34 | -f 35 | -g 36 | -------------------------------------------------------------------------------- /internal/commentfix/testdata/order.txt: -------------------------------------------------------------------------------- 1 | {{/* 2 | rules apply in order. 3 | make sure this does not loop; 4 | it claims to have edited (and did edit) the text, 5 | so the result is non-empty, 6 | but no actual change is made. 7 | */}} 8 | {{.ReplaceText `cancelled` "canceled"}} 9 | {{.ReplaceText `canceled` "cancelled"}} 10 | -- 1.in -- 11 | The context is cancelled. 
12 | -- 1.out -- 13 | The context is cancelled. 14 | -- 2.in -- 15 | The context is canceled. 16 | -- 2.out -- 17 | The context is cancelled. 18 | -------------------------------------------------------------------------------- /internal/github/testing_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package github 6 | 7 | import ( 8 | "testing" 9 | 10 | "rsc.io/gaby/internal/storage" 11 | "rsc.io/gaby/internal/testutil" 12 | ) 13 | 14 | func TestLoadTxtar(t *testing.T) { 15 | gh := New(testutil.Slogger(t), storage.MemDB(), nil, nil) 16 | testutil.Check(t, gh.Testing().LoadTxtar("../testdata/rsctmp.txt")) 17 | } 18 | -------------------------------------------------------------------------------- /internal/diff/testdata/triv.txt: -------------------------------------------------------------------------------- 1 | Another example from Hunt and McIlroy, 2 | “An Algorithm for Differential File Comparison.” 3 | https://www.cs.dartmouth.edu/~doug/diff.pdf 4 | 5 | Anchored diff gives up on finding anything, 6 | since there are no unique lines. 
7 | 8 | -- old -- 9 | a 10 | b 11 | c 12 | a 13 | b 14 | b 15 | a 16 | -- new -- 17 | c 18 | a 19 | b 20 | a 21 | b 22 | c 23 | -- diff -- 24 | diff old new 25 | --- old 26 | +++ new 27 | @@ -1,7 +1,6 @@ 28 | -a 29 | -b 30 | -c 31 | -a 32 | -b 33 | -b 34 | -a 35 | +c 36 | +a 37 | +b 38 | +a 39 | +b 40 | +c 41 | -------------------------------------------------------------------------------- /internal/diff/testdata/long.txt: -------------------------------------------------------------------------------- 1 | -- old -- 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | 10 12 | 11 13 | 12 14 | 13 15 | 14 16 | 14½ 17 | 15 18 | 16 19 | 17 20 | 18 21 | 19 22 | 20 23 | -- new -- 24 | 1 25 | 2 26 | 3 27 | 4 28 | 5 29 | 6 30 | 8 31 | 9 32 | 10 33 | 11 34 | 12 35 | 13 36 | 14 37 | 17 38 | 18 39 | 19 40 | 20 41 | -- diff -- 42 | diff old new 43 | --- old 44 | +++ new 45 | @@ -4,7 +4,6 @@ 46 | 4 47 | 5 48 | 6 49 | -7 50 | 8 51 | 9 52 | 10 53 | @@ -12,9 +11,6 @@ 54 | 12 55 | 13 56 | 14 57 | -14½ 58 | -15 59 | -16 60 | 17 61 | 18 62 | 19 63 | -------------------------------------------------------------------------------- /internal/llm/llm_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package llm 6 | 7 | import ( 8 | "slices" 9 | "testing" 10 | ) 11 | 12 | func TestVector(t *testing.T) { 13 | v1 := Vector{1, 2, 3, 4} 14 | v2 := Vector{-200, -3000, 0, -10000} 15 | dot := v1.Dot(v2) 16 | if dot != -46200 { 17 | t.Errorf("%v.Dot(%v) = %v, want -46200", v1, v2, dot) 18 | } 19 | 20 | enc := v1.Encode() 21 | var v3 Vector 22 | v3.Decode(enc) 23 | if !slices.Equal(v3, v1) { 24 | t.Errorf("Decode(Encode(%v)) = %v, want %v", v1, v3, v1) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /internal/storage/vectordb_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package storage 6 | 7 | import "testing" 8 | 9 | func TestVectorResultCompare(t *testing.T) { 10 | type R = VectorResult 11 | var tests = []struct { 12 | x, y VectorResult 13 | cmp int 14 | }{ 15 | {R{"b", 0.5}, R{"c", 0.5}, -1}, 16 | {R{"b", 0.4}, R{"a", 0.5}, -1}, 17 | } 18 | 19 | try := func(x, y VectorResult, cmp int) { 20 | if c := x.cmp(y); c != cmp { 21 | t.Errorf("Compare(%v, %v) = %d, want %d", x, y, c, cmp) 22 | } 23 | } 24 | for _, tt := range tests { 25 | try(tt.x, tt.x, 0) 26 | try(tt.y, tt.y, 0) 27 | try(tt.x, tt.y, tt.cmp) 28 | try(tt.y, tt.x, -tt.cmp) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /internal/commentfix/testdata/replacetext.txt: -------------------------------------------------------------------------------- 1 | {{.ReplaceText `cancelled` "canceled"}} 2 | -- 1.in -- 3 | The context is cancelled. 4 | -- 1.out -- 5 | The context is canceled. 
6 | -- 2.in -- 7 | fmt.Printf("cancelled\n") 8 | -- 2.out -- 9 | -- 3.in -- 10 | The context **is cancelled.** 11 | -- 3.out -- 12 | The context **is canceled.** 13 | -- 4.in -- 14 | The context *is cancelled.* 15 | -- 4.out -- 16 | The context *is canceled.* 17 | -- 4.in -- 18 | The context ~~is cancelled.~~ 19 | -- 4.out -- 20 | The context ~~is canceled.~~ 21 | -- 5.in -- 22 | # Contexts that are cancelled 23 | -- 5.out -- 24 | # Contexts that are canceled 25 | -- 6.in -- 26 | Here is a list of misspelled words: 27 | - cancelled 28 | -- 6.out -- 29 | Here is a list of misspelled words: 30 | - canceled 31 | -- 7.in -- 32 | > The context is cancelled. 33 | -- 7.out -- 34 | > The context is canceled. 35 | -------------------------------------------------------------------------------- /internal/commentfix/testdata/replaceurl.txt: -------------------------------------------------------------------------------- 1 | {{.ReplaceURL `https://golang.org/(.*)` "https://go.dev/$1#"}} 2 | {{.ReplaceURL `(?i)https://lowercase.com/(.*)` "https://lowercase.com/$1"}} 3 | -- 1.in -- 4 | Visit https://golang.org/doc for more docs. 5 | -- 1.out -- 6 | Visit [https://go.dev/doc#](https://go.dev/doc#) for more docs. 7 | -- 2.in -- 8 | Visit for more docs. 9 | -- 2.out -- 10 | Visit for more docs. 11 | -- 3.in -- 12 | Visit [this page](https://golang.org/doc) for more docs. 13 | -- 3.out -- 14 | Visit [this page](https://go.dev/doc#) for more docs. 15 | -- 4.in -- 16 | Visit [https://golang.org/doc](https://golang.org/doc) for more docs. 17 | -- 4.out -- 18 | Visit [https://go.dev/doc#](https://go.dev/doc#) for more docs. 19 | -- 5.in -- 20 | Visit for more docs. 21 | -- 5.out -- 22 | Visit for more docs. 23 | -- 6.in -- 24 | Visit for more docs. 
25 | -- 6.out -- 26 | -------------------------------------------------------------------------------- /internal/keycheck/key_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Test that API keys do not appear in any httprr logs in this repo. 6 | 7 | package keycheck 8 | 9 | import ( 10 | "bytes" 11 | "io/fs" 12 | "os" 13 | "path/filepath" 14 | "strings" 15 | "testing" 16 | 17 | "rsc.io/gaby/internal/testutil" 18 | ) 19 | 20 | var bads = []string{ 21 | "\nAuthorization:", 22 | "\nx-goog-api-key:", 23 | "\nX-Goog-Api-Key:", 24 | } 25 | 26 | func TestTestdata(t *testing.T) { 27 | check := testutil.Checker(t) 28 | err := filepath.WalkDir("../..", func(file string, d fs.DirEntry, err error) error { 29 | if strings.HasSuffix(file, ".httprr") { 30 | data, err := os.ReadFile(file) 31 | check(err) 32 | for _, bad := range bads { 33 | if bytes.Contains(data, []byte(bad)) { 34 | t.Errorf("%s contains %q", file, bad) 35 | } 36 | } 37 | } 38 | return nil 39 | }) 40 | check(err) 41 | } 42 | -------------------------------------------------------------------------------- /internal/llm/embed_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package llm 6 | 7 | import ( 8 | "bytes" 9 | "fmt" 10 | "testing" 11 | ) 12 | 13 | func TestQuote(t *testing.T) { 14 | docs := []EmbedDoc{{Text: "abc"}, {Text: "alphabetical order"}} 15 | vecs, err := QuoteEmbedder().EmbedDocs(docs) 16 | if err != nil { 17 | t.Fatal(err) 18 | } 19 | if len(vecs) != len(docs) { 20 | t.Fatalf("len(docs) = %v, but len(vecs) = %d", len(docs), len(vecs)) 21 | } 22 | for i, v := range vecs { 23 | u := UnquoteVector(v) 24 | if u != docs[i].Text { 25 | var buf bytes.Buffer 26 | for i, f := range v { 27 | fmt.Fprintf(&buf, " %f", f) 28 | if f < 0 { 29 | if i < len(v)-1 { 30 | fmt.Fprintf(&buf, " ... %f", v[len(v)-1]) 31 | } 32 | break 33 | } 34 | } 35 | t.Logf("Embed(%q) = %v", docs[i].Text, buf.String()) 36 | t.Errorf("Unquote() = %q, want %q", u, docs[i].Text) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /internal/diff/diff_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2022 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package diff 6 | 7 | import ( 8 | "bytes" 9 | "path/filepath" 10 | "testing" 11 | 12 | "golang.org/x/tools/txtar" 13 | ) 14 | 15 | func clean(text []byte) []byte { 16 | text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n")) 17 | text = bytes.TrimSuffix(text, []byte("^D\n")) 18 | return text 19 | } 20 | 21 | func Test(t *testing.T) { 22 | files, _ := filepath.Glob("testdata/*.txt") 23 | if len(files) == 0 { 24 | t.Fatalf("no testdata") 25 | } 26 | 27 | for _, file := range files { 28 | t.Run(filepath.Base(file), func(t *testing.T) { 29 | a, err := txtar.ParseFile(file) 30 | if err != nil { 31 | t.Fatal(err) 32 | } 33 | if len(a.Files) != 3 || a.Files[2].Name != "diff" { 34 | t.Fatalf("%s: want three files, third named \"diff\"", file) 35 | } 36 | diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data)) 37 | want := clean(a.Files[2].Data) 38 | if !bytes.Equal(diffs, want) { 39 | t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file, 40 | diffs, want, Diff("have", diffs, "want", want)) 41 | } 42 | }) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /internal/storage/db_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package storage 6 | 7 | import ( 8 | "math" 9 | "testing" 10 | 11 | "rsc.io/ordered" 12 | ) 13 | 14 | func TestPanic(t *testing.T) { 15 | func() { 16 | defer func() { 17 | r := recover() 18 | if r.(string) != "msg key=val" { 19 | t.Errorf("panic value is not msg key=val:\n%s", r) 20 | } 21 | }() 22 | Panic("msg", "key", "val") 23 | t.Fatalf("did not panic") 24 | }() 25 | 26 | } 27 | 28 | func TestJSON(t *testing.T) { 29 | x := map[string]string{"a": "b"} 30 | js := JSON(x) 31 | want := `{"a":"b"}` 32 | if string(js) != want { 33 | t.Errorf("JSON(%v) = %#q, want %#q", x, js, want) 34 | } 35 | 36 | func() { 37 | defer func() { 38 | recover() 39 | }() 40 | JSON(math.NaN()) 41 | t.Errorf("JSON(NaN) did not panic") 42 | }() 43 | } 44 | 45 | var fmtTests = []struct { 46 | data []byte 47 | out string 48 | }{ 49 | {ordered.Encode(1, 2, 3), "(1, 2, 3)"}, 50 | {[]byte(`"hello"`), "`\"hello\"`"}, 51 | {[]byte("`hello`"), "\"`hello`\""}, 52 | } 53 | 54 | func TestFmt(t *testing.T) { 55 | for _, tt := range fmtTests { 56 | out := Fmt(tt.data) 57 | if out != tt.out { 58 | t.Errorf("Fmt(%q) = %q, want %q", tt.data, out, tt.out) 59 | } 60 | } 61 | } 62 | 63 | func TestMemLocker(t *testing.T) { 64 | m := new(MemLocker) 65 | 66 | testDBLock(t, m) 67 | } 68 | -------------------------------------------------------------------------------- /internal/secret/secret_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package secret 6 | 7 | import ( 8 | "os" 9 | "path/filepath" 10 | "testing" 11 | ) 12 | 13 | func TestNetrc(t *testing.T) { 14 | file := filepath.Join(t.TempDir(), "netrc") 15 | if err := os.WriteFile(file, []byte(testNetrc), 0666); err != nil { 16 | t.Fatal(err) 17 | } 18 | t.Setenv("NETRC", file) 19 | 20 | db := Netrc() 21 | if secret, ok := db.Get("missing"); secret != "" || ok != false { 22 | t.Errorf("Get(missing) = %q, %v, want %q, %v", secret, ok, "", false) 23 | } 24 | 25 | if secret, ok := db.Get("example.com"); secret != "u2:p2" || ok != true { 26 | t.Errorf("Get(example.com) = %q, %v, want %q, %v", secret, ok, "u2:p2", true) 27 | } 28 | 29 | func() { 30 | defer func() { 31 | recover() 32 | }() 33 | db.Set("name", "value") 34 | t.Errorf("Set did not panic") 35 | }() 36 | } 37 | 38 | var testNetrc = ` 39 | machine example.com login u1 password p1 40 | machine missing login u password p and more 41 | machine example.com login u2 password p2 42 | ` 43 | 44 | func TestEmpty(t *testing.T) { 45 | db := Empty() 46 | if secret, ok := db.Get("missing"); secret != "" || ok != false { 47 | t.Errorf("Get(missing) = %q, %v, want %q, %v", secret, ok, "", false) 48 | } 49 | 50 | func() { 51 | defer func() { 52 | recover() 53 | }() 54 | db.Set("name", "value") 55 | t.Errorf("Set did not panic") 56 | }() 57 | } 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009 The Go Authors. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above 10 | copyright notice, this list of conditions and the following disclaimer 11 | in the documentation and/or other materials provided with the 12 | distribution. 13 | * Neither the name of Google Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /internal/testutil/testutil.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package testutil implements various testing utilities. 6 | package testutil 7 | 8 | import ( 9 | "bytes" 10 | "io" 11 | "log/slog" 12 | "testing" 13 | ) 14 | 15 | // LogWriter returns an [io.Writer] that handles logs 16 | // each Write using t.Log. 
17 | func LogWriter(t *testing.T) io.Writer { 18 | return testWriter{t} 19 | } 20 | 21 | type testWriter struct{ t *testing.T } 22 | 23 | func (w testWriter) Write(b []byte) (int, error) { 24 | w.t.Logf("%s", b) 25 | return len(b), nil 26 | } 27 | 28 | // Slogger returns a [*slog.Logger] that writes each message 29 | // using t.Log. 30 | func Slogger(t *testing.T) *slog.Logger { 31 | return slog.New(slog.NewTextHandler(LogWriter(t), nil)) 32 | } 33 | 34 | // SlogBuffer returns a [*slog.Logger] that writes each message to out. 35 | func SlogBuffer() (lg *slog.Logger, out *bytes.Buffer) { 36 | var buf bytes.Buffer 37 | lg = slog.New(slog.NewTextHandler(&buf, nil)) 38 | return lg, &buf 39 | } 40 | 41 | // Check calls t.Fatal(err) if err is not nil. 42 | func Check(t *testing.T, err error) { 43 | if err != nil { 44 | t.Helper() 45 | t.Fatal(err) 46 | } 47 | } 48 | 49 | // Checker returns a check function that 50 | // calls t.Fatal if err is not nil. 51 | func Checker(t *testing.T) (check func(err error)) { 52 | return func(err error) { 53 | if err != nil { 54 | t.Helper() 55 | t.Fatal(err) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /internal/storage/mem_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package storage 6 | 7 | import ( 8 | "testing" 9 | 10 | "rsc.io/gaby/internal/testutil" 11 | ) 12 | 13 | func TestMemDB(t *testing.T) { 14 | TestDB(t, MemDB()) 15 | } 16 | 17 | func TestMemVectorDB(t *testing.T) { 18 | db := MemDB() 19 | TestVectorDB(t, func() VectorDB { return MemVectorDB(db, testutil.Slogger(t), "") }) 20 | } 21 | 22 | type maybeDB struct { 23 | DB 24 | maybe bool 25 | } 26 | 27 | type maybeBatch struct { 28 | db *maybeDB 29 | Batch 30 | } 31 | 32 | func (db *maybeDB) Batch() Batch { 33 | return &maybeBatch{db: db, Batch: db.DB.Batch()} 34 | } 35 | 36 | func (b *maybeBatch) MaybeApply() bool { 37 | return b.db.maybe 38 | } 39 | 40 | // Test that when db.Batch.MaybeApply returns true, 41 | // the memvector Batch MaybeApply applies the memvector ops. 42 | func TestMemVectorBatchMaybeApply(t *testing.T) { 43 | db := &maybeDB{DB: MemDB()} 44 | vdb := MemVectorDB(db, testutil.Slogger(t), "") 45 | b := vdb.Batch() 46 | b.Set("apple3", embed("apple3")) 47 | if _, ok := vdb.Get("apple3"); ok { 48 | t.Errorf("Get(apple3) succeeded before batch apply") 49 | } 50 | b.MaybeApply() // should not apply because the db Batch does not apply 51 | if _, ok := vdb.Get("apple3"); ok { 52 | t.Errorf("Get(apple3) succeeded after MaybeApply that didn't apply") 53 | } 54 | db.maybe = true 55 | b.MaybeApply() // now should apply 56 | if _, ok := vdb.Get("apple3"); !ok { 57 | t.Errorf("Get(apple3) failed after MaybeApply that did apply") 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /internal/storage/vtest.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package storage 6 | 7 | import ( 8 | "math" 9 | "reflect" 10 | "slices" 11 | "testing" 12 | 13 | "rsc.io/gaby/internal/llm" 14 | ) 15 | 16 | func TestVectorDB(t *testing.T, newdb func() VectorDB) { 17 | vdb := newdb() 18 | 19 | vdb.Set("orange2", embed("orange2")) 20 | vdb.Set("orange1", embed("orange1")) 21 | b := vdb.Batch() 22 | b.Set("apple3", embed("apple3")) 23 | b.Set("apple4", embed("apple4")) 24 | b.Set("ignore", embed("bad")[:4]) 25 | b.Apply() 26 | 27 | v, ok := vdb.Get("apple3") 28 | if !ok || !slices.Equal(v, embed("apple3")) { 29 | // unreachable except bad vectordb 30 | t.Errorf("Get(apple3) = %v, %v, want %v, true", v, ok, embed("apple3")) 31 | } 32 | 33 | want := []VectorResult{ 34 | {"apple4", 0.9999961187341375}, 35 | {"apple3", 0.9999843342970269}, 36 | {"orange1", 0.38062230442542155}, 37 | {"orange2", 0.3785152783773009}, 38 | } 39 | have := vdb.Search(embed("apple5"), 5) 40 | if !reflect.DeepEqual(have, want) { 41 | // unreachable except bad vectordb 42 | t.Fatalf("Search(apple5, 5):\nhave %v\nwant %v", have, want) 43 | } 44 | 45 | vdb.Flush() 46 | 47 | vdb = newdb() 48 | have = vdb.Search(embed("apple5"), 3) 49 | want = want[:3] 50 | if !reflect.DeepEqual(have, want) { 51 | // unreachable except bad vectordb 52 | t.Errorf("Search(apple5, 3) in fresh database:\nhave %v\nwant %v", have, want) 53 | } 54 | 55 | } 56 | 57 | func embed(text string) llm.Vector { 58 | const vectorLen = 16 59 | v := make(llm.Vector, vectorLen) 60 | d := float32(0) 61 | for i := range len(text) { 62 | v[i] = float32(byte(text[i])) / 256 63 | d += v[i] * v[i] 64 | } 65 | if len(text) < len(v) { 66 | v[len(text)] = -1 67 | d += 1 68 | } 69 | d = float32(1 / math.Sqrt(float64(d))) 70 | for i, x := range v { 71 | v[i] = x * d 72 | } 73 | return v 74 | } 75 | -------------------------------------------------------------------------------- /internal/pebble/pebble_test.go: -------------------------------------------------------------------------------- 1 | // 
Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package pebble 6 | 7 | import ( 8 | "encoding/binary" 9 | "fmt" 10 | "math/rand/v2" 11 | "testing" 12 | 13 | "rsc.io/gaby/internal/storage" 14 | "rsc.io/gaby/internal/testutil" 15 | ) 16 | 17 | type testWriter struct{ t *testing.T } 18 | 19 | func (w testWriter) Write(b []byte) (int, error) { 20 | w.t.Logf("%s", b) 21 | return len(b), nil 22 | } 23 | 24 | func TestDB(t *testing.T) { 25 | lg := testutil.Slogger(t) 26 | dir := t.TempDir() 27 | 28 | db, err := Open(lg, dir+"/db1") 29 | if err == nil { 30 | t.Fatal("Open nonexistent succeeded") 31 | } 32 | 33 | db, err = Create(lg, dir+"/db1") 34 | if err != nil { 35 | t.Fatal(err) 36 | } 37 | db.Close() 38 | 39 | db, err = Create(lg, dir+"/db1") 40 | if err == nil { 41 | t.Fatal("Create already-existing succeeded") 42 | } 43 | 44 | db, err = Open(lg, dir+"/db1") 45 | if err != nil { 46 | t.Fatal(err) 47 | } 48 | defer db.Close() 49 | 50 | storage.TestDB(t, db) 51 | 52 | if testing.Short() { 53 | return 54 | } 55 | 56 | // Test that MaybeApply handles very large batch. 
57 | b := db.Batch() 58 | val := make([]byte, 1e6) 59 | pcg := rand.NewPCG(1, 2) 60 | applied := 0 61 | for key := range 500 { 62 | for i := 0; i < len(val); i += 8 { 63 | binary.BigEndian.PutUint64(val[i:], pcg.Uint64()) 64 | } 65 | binary.BigEndian.PutUint64(val, uint64(key)) 66 | b.Set([]byte(fmt.Sprint(key)), val) 67 | if b.MaybeApply() { 68 | if applied++; applied == 2 { 69 | break 70 | } 71 | } 72 | } 73 | b.Apply() 74 | 75 | for key := range 200 { 76 | val, ok := db.Get([]byte(fmt.Sprint(key))) 77 | if !ok { 78 | t.Fatalf("after batch, missing key %d", key) 79 | } 80 | if x := binary.BigEndian.Uint64(val); x != uint64(key) { 81 | t.Fatalf("Get(%d) = value for %d, want %d", key, x, key) 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /internal/llm/embed.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package llm 6 | 7 | import "math" 8 | 9 | const quoteLen = 123 10 | 11 | // QuoteEmbedder returns an implementation 12 | // of Embedder that can be useful for testing but 13 | // is completely pointless for real use. 14 | // It encodes up to the first 122 bytes of each document 15 | // directly into the first 122 elements of a 123-element unit vector. 16 | func QuoteEmbedder() Embedder { 17 | return quoter{} 18 | } 19 | 20 | // quote quotes text into a vector. 21 | // The text ends at the first negative entry in the vector. 22 | // The final entry of the vector is hard-coded to -1 23 | // before normalization, so that the final entry of a 24 | // normalized vector lets us know scaling to reverse 25 | // to obtain the original bytes. 
26 | func quote(text string) Vector { 27 | v := make(Vector, quoteLen) 28 | var d float64 29 | for i := range len(text) { 30 | if i >= len(v)-1 { 31 | break 32 | } 33 | v[i] = float32(byte(text[i])) / 256 34 | d += float64(v[i]) * float64(v[i]) 35 | } 36 | if len(text)+1 < len(v) { 37 | v[len(text)] = -1 38 | d += 1 39 | } 40 | v[len(v)-1] = -1 41 | d += 1 42 | 43 | d = 1 / math.Sqrt(d) 44 | for i := range v { 45 | v[i] *= float32(d) 46 | } 47 | return v 48 | } 49 | 50 | // quoter is a quoting Embedder, returned by QuoteEmbedder 51 | type quoter struct{} 52 | 53 | // EmbedDocs implements Embedder by quoting. 54 | func (quoter) EmbedDocs(docs []EmbedDoc) ([]Vector, error) { 55 | var vecs []Vector 56 | for _, d := range docs { 57 | vecs = append(vecs, quote(d.Text)) 58 | } 59 | return vecs, nil 60 | } 61 | 62 | // UnquoteVector recovers the original text prefix 63 | // passed to a [QuoteEmbedder]'s EmbedDocs method. 64 | // Like QuoteEmbedder, UnquoteVector is only useful in tests. 65 | func UnquoteVector(v Vector) string { 66 | if len(v) != quoteLen { 67 | panic("UnquoteVector of non-quotation vector") 68 | } 69 | d := -1 / v[len(v)-1] 70 | var b []byte 71 | for _, f := range v { 72 | if f < 0 { 73 | break 74 | } 75 | b = append(b, byte(256*f*d+0.5)) 76 | } 77 | return string(b) 78 | } 79 | -------------------------------------------------------------------------------- /internal/embeddocs/sync.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package embeddocs implements embedding text docs into a vector database. 
6 | package embeddocs 7 | 8 | import ( 9 | "log/slog" 10 | 11 | "rsc.io/gaby/internal/docs" 12 | "rsc.io/gaby/internal/llm" 13 | "rsc.io/gaby/internal/storage" 14 | "rsc.io/gaby/internal/storage/timed" 15 | ) 16 | 17 | // Sync reads new documents from dc, embeds them using embed, 18 | // and then writes the (docid, vector) pairs to vdb. 19 | // 20 | // Sync uses [docs.DocWatcher] with the the name “embeddocs” to 21 | // save its position across multiple calls. 22 | // 23 | // Sync logs status and unexpected problems to lg. 24 | func Sync(lg *slog.Logger, vdb storage.VectorDB, embed llm.Embedder, dc *docs.Corpus) { 25 | lg.Info("embeddocs sync") 26 | 27 | const batchSize = 100 28 | var ( 29 | batch []llm.EmbedDoc 30 | ids []string 31 | batchLast timed.DBTime 32 | ) 33 | w := dc.DocWatcher("embeddocs") 34 | 35 | flush := func() bool { 36 | vecs, err := embed.EmbedDocs(batch) 37 | if len(vecs) > len(ids) { 38 | lg.Error("embeddocs length mismatch", "batch", len(batch), "vecs", len(vecs), "ids", len(ids)) 39 | return false 40 | } 41 | for i, v := range vecs { 42 | vdb.Set(ids[i], v) 43 | } 44 | if err != nil { 45 | lg.Error("embeddocs EmbedDocs error", "err", err) 46 | return false 47 | } 48 | if len(vecs) != len(ids) { 49 | lg.Error("embeddocs length mismatch", "batch", len(batch), "vecs", len(vecs), "ids", len(ids)) 50 | return false 51 | } 52 | vdb.Flush() // todo vdb 53 | w.MarkOld(batchLast) 54 | w.Flush() 55 | batch = nil 56 | ids = nil 57 | return true 58 | } 59 | 60 | for d := range w.Recent() { 61 | lg.Debug("embeddocs sync start", "doc", d.ID) 62 | batch = append(batch, llm.EmbedDoc{Title: d.Title, Text: d.Text}) 63 | ids = append(ids, d.ID) 64 | batchLast = d.DBTime 65 | if len(batch) >= batchSize { 66 | if !flush() { 67 | break 68 | } 69 | } 70 | } 71 | if len(batch) > 0 { 72 | // More to flush, but flush uses w.MarkOld, 73 | // which has to be called during an iteration over w.Recent. 74 | // Start a new iteration just to call flush and then break out. 
75 | for _ = range w.Recent() { 76 | if !flush() { 77 | return 78 | } 79 | break 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /internal/githubdocs/sync.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package githubdocs implements converting GitHub issues into text docs 6 | // for [rsc.io/gaby/internal/docs]. 7 | package githubdocs 8 | 9 | import ( 10 | "fmt" 11 | "log/slog" 12 | 13 | "rsc.io/gaby/internal/docs" 14 | "rsc.io/gaby/internal/github" 15 | ) 16 | 17 | // Sync writes to dc docs corresponding to each issue in gh that is 18 | // new since the last call to Sync. 19 | // 20 | // If an issue is edited on GitHub, it will appear new in gh and 21 | // the new text will be written to dc, replacing the old issue text. 22 | // Only the issue body (what looks like the top comment in the UI) 23 | // is saved as a document. 24 | // The document ID for each issue is its GitHub URL: "https://github.com///issues/". 25 | func Sync(lg *slog.Logger, dc *docs.Corpus, gh *github.Client) { 26 | w := gh.EventWatcher("githubdocs") 27 | for e := range w.Recent() { 28 | if e.API != "/issues" { 29 | continue 30 | } 31 | lg.Debug("githubdocs sync", "issue", e.Issue, "dbtime", e.DBTime) 32 | issue := e.Typed.(*github.Issue) 33 | title := cleanTitle(issue.Title) 34 | text := cleanBody(issue.Body) 35 | dc.Add(fmt.Sprintf("https://github.com/%s/issues/%d", e.Project, e.Issue), title, text) 36 | w.MarkOld(e.DBTime) 37 | } 38 | } 39 | 40 | // Restart causes the next call to Sync to behave as if 41 | // it has never sync'ed any issues before. 42 | // The result is that all issues will be reconverted to doc form 43 | // and re-added. 
// Docs that have not changed since the last addition to the corpus
// will appear unmodified; others will be marked new in the corpus.
func Restart(lg *slog.Logger, gh *github.Client) {
	// lg is accepted for symmetry with Sync; it is currently unused.
	gh.EventWatcher("githubdocs").Restart()
}

// cleanTitle should clean the title for indexing.
// For now we assume the LLM is good enough at Markdown not to bother.
func cleanTitle(title string) string {
	// TODO
	return title
}

// cleanBody should clean the body for indexing.
// For now we assume the LLM is good enough at Markdown not to bother.
// In the future we may want to make various changes like inlining
// the programs associated with playground URLs,
// and we may also want to remove any HTML tags from the Markdown.
func cleanBody(body string) string {
	// TODO
	return body
}
-------------------------------------------------------------------------------- /internal/secret/secret.go: --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package secret defines an interface to a database storing secrets, such as passwords and API keys.
//
// TODO(rsc): Consider adding a basic key: value text file format besides netrc.
package secret

import (
	"os"
	"path/filepath"
	"strings"
)

// A DB is a secret database, which is a persistent map from names to secret values.
type DB interface {
	// Get returns the secret with the given name,
	// reporting whether it was found.
	Get(name string) (secret string, ok bool)
	// Set adds or replaces the named secret.
	Set(name, secret string)
}

// Empty returns a read-only, empty secret database.
func Empty() DB {
	return ReadOnlyMap(nil)
}

// A Map is a read-write, in-memory [DB].
type Map map[string]string

// Get returns the named secret.
31 | func (m Map) Get(name string) (secret string, ok bool) { 32 | secret, ok = m[name] 33 | return 34 | } 35 | 36 | // Set adds a secret with the given name. 37 | func (m Map) Set(name, secret string) { 38 | m[name] = secret 39 | } 40 | 41 | // A ReadOnlyMap is a read-only [DB]. Calling [Set] panics. 42 | type ReadOnlyMap map[string]string 43 | 44 | // Get returns the named secret. 45 | func (m ReadOnlyMap) Get(name string) (secret string, ok bool) { 46 | secret, ok = m[name] 47 | return 48 | } 49 | 50 | // Set panics. 51 | func (m ReadOnlyMap) Set(name, secret string) { 52 | panic("read-only secrets") 53 | } 54 | 55 | // Netrc returns a read-only secret database initialized by the content of $HOME/.netrc, if it exists. 56 | // A line in .netrc of the form 57 | // 58 | // machine name login user password pass 59 | // 60 | // causes Get("name") to return "user:pass". 61 | // Lines later in .netrc take priority over lines earlier in .netrc. 62 | // 63 | // If the environment $NETRC is set and non-empty, the file it names is used 64 | // instead of $HOME/.netrc. 65 | func Netrc() ReadOnlyMap { 66 | file := filepath.Join(os.Getenv("HOME"), ".netrc") 67 | if env := os.Getenv("NETRC"); env != "" { 68 | file = env 69 | } 70 | return openNetrc(file) 71 | } 72 | 73 | func openNetrc(file string) ReadOnlyMap { 74 | m := make(ReadOnlyMap) 75 | if data, err := os.ReadFile(file); err == nil { 76 | for _, line := range strings.Split(string(data), "\n") { 77 | f := strings.Fields(line) 78 | if len(f) == 6 && f[0] == "machine" && f[2] == "login" && f[4] == "password" { 79 | m[f[1]] = f[3] + ":" + f[5] 80 | } 81 | } 82 | } 83 | return m 84 | } 85 | -------------------------------------------------------------------------------- /internal/llm/llm.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package llm defines interfaces implemented by LLMs (or LLM-related services). 6 | package llm 7 | 8 | import ( 9 | "encoding/binary" 10 | "math" 11 | ) 12 | 13 | // An Embedder computes vector embeddings of a list of documents. 14 | // 15 | // EmbedDocs accepts an arbitrary number of documents and returns 16 | // their embeddings. If the underlying implementation has a limit on 17 | // the batch size, it should make multiple requests in order to process 18 | // all the documents. If an error occurs after some, but not all, documents 19 | // have been processed, EmbedDocs can return an error along with a 20 | // shortened vector slice giving the vectors for a prefix of the document slice. 21 | // 22 | // See [QuoteEmbedder] for a semantically useless embedder that 23 | // can nonetheless be helpful when writing tests, 24 | // and see [rsc.io/gaby/internal/gemini] for a real implementation. 25 | type Embedder interface { 26 | EmbedDocs(docs []EmbedDoc) ([]Vector, error) 27 | } 28 | 29 | // An EmbedDoc is a single document to be embedded. 30 | type EmbedDoc struct { 31 | Title string // title of document 32 | Text string // text of document 33 | } 34 | 35 | // A Vector is an embedding vector, typically a high-dimensional unit vector. 36 | type Vector []float32 37 | 38 | // Dot returns the dot product of v and w. 39 | // 40 | // TODO(rsc): Using a float64 for the result is slightly higher 41 | // precision and may be worth doing in the intermediate calculation 42 | // but may not be worth the type conversions involved to return a float64. 43 | // Perhaps the return type should still be float32 even if the math is float64. 
44 | func (v Vector) Dot(w Vector) float64 { 45 | v = v[:min(len(v), len(w))] 46 | w = w[:len(v)] // make "i in range for v" imply "i in range for w" to remove bounds check in loop 47 | t := float64(0) 48 | for i := range v { 49 | t += float64(v[i]) * float64(w[i]) 50 | } 51 | return t 52 | } 53 | 54 | // Encode returns a byte encoding of the vector v, 55 | // suitable for storing in a database. 56 | func (v Vector) Encode() []byte { 57 | val := make([]byte, 4*len(v)) 58 | for i, f := range v { 59 | binary.BigEndian.PutUint32(val[4*i:], math.Float32bits(f)) 60 | } 61 | return val 62 | } 63 | 64 | // Decode decodes the byte encoding enc into the vector v. 65 | // Enc should be a multiple of 4 bytes; any trailing bytes are ignored. 66 | func (v *Vector) Decode(enc []byte) { 67 | if len(*v) < len(enc)/4 { 68 | *v = make(Vector, len(enc)/4) 69 | } 70 | *v = (*v)[:0] 71 | for ; len(enc) >= 4; enc = enc[4:] { 72 | *v = append(*v, math.Float32frombits(binary.BigEndian.Uint32(enc))) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /internal/storage/vectordb.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package storage 6 | 7 | import ( 8 | "cmp" 9 | 10 | "rsc.io/gaby/internal/llm" 11 | ) 12 | 13 | // A VectorDB is a vector database that implements 14 | // nearest-neighbor search over embedding vectors 15 | // corresponding to documents. 16 | type VectorDB interface { 17 | // Set sets the vector associated with the given document ID to vec. 18 | Set(id string, vec llm.Vector) 19 | 20 | // TODO: Add Delete. 21 | 22 | // Get gets the vector associated with the given document ID. 23 | // If no such document exists, Get returns nil, false. 24 | // If a document exists, Get returns vec, true. 
	Get(id string) (llm.Vector, bool)

	// Batch returns a new [VectorBatch] that accumulates
	// vector database mutations to apply in an atomic operation.
	// It is more efficient than repeated calls to Set.
	Batch() VectorBatch

	// Search searches the database for the n vectors
	// most similar to vec, returning the document IDs
	// and similarity scores.
	Search(vec llm.Vector, n int) []VectorResult

	// Flush flushes storage to disk.
	Flush()
}

// A VectorBatch accumulates vector database mutations
// that are applied to a [VectorDB] in a single atomic operation.
// Applying bulk operations in a batch is also more efficient than
// making individual [VectorDB] method calls.
// The batched operations apply in the order they are made.
type VectorBatch interface {
	// Set sets the vector associated with the given document ID to vec.
	Set(id string, vec llm.Vector)

	// TODO: Add Delete.

	// MaybeApply calls Apply if the VectorBatch is getting close to full.
	// Every VectorBatch has a limit to how many operations can be batched,
	// so in a bulk operation where atomicity of the entire batch is not a concern,
	// calling MaybeApply gives the VectorBatch implementation
	// permission to flush the batch at specific “safe points”.
	// A typical limit for a batch is about 100MB worth of logged operations.
	//
	// MaybeApply reports whether it called Apply.
	MaybeApply() bool

	// Apply applies all the batched operations to the underlying VectorDB
	// as a single atomic unit.
	// When Apply returns, the VectorBatch is an empty batch ready for
	// more operations.
	Apply()
}

// A VectorResult is a single document returned by a VectorDB search.
type VectorResult struct {
	ID    string  // document ID
	Score float64 // similarity score in range [0, 1]; 1 is exact match
}

// cmp orders VectorResults by score, breaking ties by document ID,
// giving sorts a deterministic order.
func (x VectorResult) cmp(y VectorResult) int {
	if x.Score != y.Score {
		return cmp.Compare(x.Score, y.Score)
	}
	return cmp.Compare(x.ID, y.ID)
}
-------------------------------------------------------------------------------- /internal/gemini/gemini_test.go: --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package gemini

import (
	"bytes"
	"fmt"
	"net/http"
	"os"
	"strings"
	"testing"

	"rsc.io/gaby/internal/httprr"
	"rsc.io/gaby/internal/llm"
	"rsc.io/gaby/internal/secret"
	"rsc.io/gaby/internal/testutil"
)

// docs is a small corpus in which each document has exactly one
// intended nearest neighbor, recorded in matches.
var docs = []llm.EmbedDoc{
	{Text: "for loops"},
	{Text: "for all time, always"},
	{Text: "break statements"},
	{Text: "breakdancing"},
	{Text: "forever could never be long enough for me"},
	{Text: "the macarena"},
}

// matches maps each document text to the text it should embed closest to.
var matches = map[string]string{
	"for loops":            "break statements",
	"for all time, always": "forever could never be long enough for me",
	"breakdancing":         "the macarena",
}

func init() {
	// Make matches symmetric: each pair matches in both directions.
	for k, v := range matches {
		matches[v] = k
	}
}

// newTestClient returns a Client whose HTTP traffic is replayed from
// (or recorded to) rrfile, with credentials taken from the netrc file.
func newTestClient(t *testing.T, rrfile string) *Client {
	check := testutil.Checker(t)
	lg := testutil.Slogger(t)

	rr, err := httprr.Open(rrfile, http.DefaultTransport)
	check(err)
	rr.Scrub(Scrub)
	sdb := secret.Netrc()

	c, err := NewClient(lg, sdb, rr.Client())
	check(err)

	return c
}

// TestEmbedBatch checks that each document in docs embeds closest to
// its designated partner in matches.
func TestEmbedBatch(t *testing.T) {
	check := testutil.Checker(t)
	c := newTestClient(t, "testdata/embedbatch.httprr")
	vecs, err := c.EmbedDocs(docs)
	check(err)
	if len(vecs) != len(docs) {
		t.Fatalf("len(vecs) = %d, but len(docs) = %d", len(vecs), len(docs))
	}

	// Precompute the full dot-product matrix so it can be printed
	// (once) if any match check below fails.
	var buf bytes.Buffer
	for i := range docs {
		for j := range docs {
			fmt.Fprintf(&buf, " %.4f", vecs[i].Dot(vecs[j]))
		}
		fmt.Fprintf(&buf, "\n")
	}

	for i, d := range docs {
		// Find the most similar other document.
		best := ""
		bestDot := 0.0
		for j := range docs {
			if dot := vecs[i].Dot(vecs[j]); i != j && dot > bestDot {
				best, bestDot = docs[j].Text, dot
			}
		}
		if best != matches[d.Text] {
			if buf.Len() > 0 {
				t.Errorf("dot matrix:\n%s", buf.String())
				buf.Reset()
			}
			t.Errorf("%q: best=%q, want %q", d.Text, best, matches[d.Text])
		}
	}
}

// TestBigBatch checks that EmbedDocs handles an input larger than a
// single request can carry.
func TestBigBatch(t *testing.T) {
	check := testutil.Checker(t)
	c := newTestClient(t, "testdata/bigbatch.httprr")
	var docs []llm.EmbedDoc
	data, err := os.ReadFile("/usr/local/plan9/lib/words")
	check(err)
	for _, w := range strings.Fields(string(data)) {
		docs = append(docs, llm.EmbedDoc{Text: w})
	}
	// 251 docs: presumably larger than the server-side batch limit,
	// forcing multiple requests — TODO confirm the limit.
	docs = docs[:251]
	vecs, err := c.EmbedDocs(docs)
	check(err)
	if len(vecs) != len(docs) {
		t.Fatalf("len(vecs) = %d, but len(docs) = %d", len(vecs), len(docs))
	}
}
-------------------------------------------------------------------------------- /internal/githubdocs/sync_test.go: --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package githubdocs

import (
	"testing"

	"rsc.io/gaby/internal/docs"
	"rsc.io/gaby/internal/github"
	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/testutil"
)

// TestMarkdown checks that Sync converts every issue in the rsc/markdown
// fixture into a doc, that a second Sync does not rewrite docs for
// already-synced issues, and that Restart forces a full re-sync.
func TestMarkdown(t *testing.T) {
	check := testutil.Checker(t)
	lg := testutil.Slogger(t)
	db := storage.MemDB()
	gh := github.New(lg, db, nil, nil)
	check(gh.Testing().LoadTxtar("../testdata/markdown.txt"))

	dc := docs.New(db)
	Sync(lg, dc, gh)

	// Issue URLs in lexical (string) order, matching Docs iteration order.
	var want = []string{
		"https://github.com/rsc/markdown/issues/1",
		"https://github.com/rsc/markdown/issues/10",
		"https://github.com/rsc/markdown/issues/11",
		"https://github.com/rsc/markdown/issues/12",
		"https://github.com/rsc/markdown/issues/13",
		"https://github.com/rsc/markdown/issues/14",
		"https://github.com/rsc/markdown/issues/15",
		"https://github.com/rsc/markdown/issues/16",
		"https://github.com/rsc/markdown/issues/17",
		"https://github.com/rsc/markdown/issues/18",
		"https://github.com/rsc/markdown/issues/19",
		"https://github.com/rsc/markdown/issues/2",
		"https://github.com/rsc/markdown/issues/3",
		"https://github.com/rsc/markdown/issues/4",
		"https://github.com/rsc/markdown/issues/5",
		"https://github.com/rsc/markdown/issues/6",
		"https://github.com/rsc/markdown/issues/7",
		"https://github.com/rsc/markdown/issues/8",
		"https://github.com/rsc/markdown/issues/9",
	}
	for d := range dc.Docs("") {
		if len(want) == 0 {
			t.Fatalf("unexpected extra doc: %s", d.ID)
		}
		if d.ID != want[0] {
			t.Fatalf("doc mismatch: have %s, want %s", d.ID, want[0])
		}
		want = want[1:]
		// Spot-check the full content of issue #1.
		if d.ID == md1 {
			if d.Title != md1Title {
				t.Errorf("#1 Title = %q, want %q", d.Title, md1Title)
			}
			if d.Text != md1Text {
				t.Errorf("#1 Text = %q, want %q", d.Text, md1Text)
			}
		}
	}
	if len(want) > 0 {
		t.Fatalf("missing docs: %v", want)
	}

	// Overwrite #1 in the corpus; a second Sync sees no new events
	// and must leave the overwritten doc alone.
	dc.Add("https://github.com/rsc/markdown/issues/1", "OLD TITLE", "OLD TEXT")
	Sync(lg, dc, gh)
	d, _ := dc.Get(md1)
	if d.Title != "OLD TITLE" || d.Text != "OLD TEXT" {
		t.Errorf("Sync rewrote #1: Title=%q Text=%q, want OLD TITLE, OLD TEXT", d.Title, d.Text)
	}

	// After Restart, Sync replays everything and must rewrite #1.
	Restart(lg, gh)
	Sync(lg, dc, gh)
	d, _ = dc.Get(md1)
	if d.Title == "OLD TITLE" || d.Text == "OLD TEXT" {
		t.Errorf("Restart+Sync did not rewrite #1: Title=%q Text=%q", d.Title, d.Text)
	}
}

// Expected ID and content of issue #1 in the fixture.
var (
	md1      = "https://github.com/rsc/markdown/issues/1"
	md1Title = "Support Github Emojis"
	md1Text  = "This is an issue for supporting github emojis, such as `:smile:` for \n😄 . There's a github page that gives a mapping of emojis to image \nfile names that we can parse the hex representation out of here: \nhttps://api.github.com/emojis.\n"
)
-------------------------------------------------------------------------------- /internal/docs/docs_test.go: --------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4 | 5 | package docs 6 | 7 | import ( 8 | "slices" 9 | "strings" 10 | "testing" 11 | 12 | "rsc.io/gaby/internal/storage" 13 | ) 14 | 15 | func TestCorpus(t *testing.T) { 16 | db := storage.MemDB() 17 | 18 | corpus := New(db) 19 | corpus.Add("id1", "Title1", "text1") 20 | corpus.Add("id3", "Title3", "text3") 21 | corpus.Add("id2", "Title2", "text2") 22 | 23 | extra := make(map[string]string) 24 | var ids []string 25 | do := func(d *Doc) { 26 | t.Helper() 27 | if !strings.HasPrefix(d.ID, "id") { 28 | t.Fatalf("invalid prefix %q", d.ID) 29 | } 30 | n := d.ID[len("id"):] 31 | title := "Title" + n + extra[d.ID] 32 | text := "text" + n + extra[d.ID] 33 | if d.Title != title || d.Text != text { 34 | t.Fatalf("Doc id=%s has Title=%q, Text=%q, want %q, %q", d.ID, d.Title, d.Text, title, text) 35 | } 36 | ids = append(ids, d.ID) 37 | } 38 | 39 | // Basic iteration. 40 | for d := range corpus.Docs("") { 41 | do(d) 42 | } 43 | want := []string{"id1", "id2", "id3"} 44 | if !slices.Equal(ids, want) { 45 | t.Errorf("Docs() = %v, want %v", ids, want) 46 | } 47 | 48 | // Break during iteration. 49 | ids = nil 50 | for d := range corpus.Docs("") { 51 | do(d) 52 | if d.ID == "id2" { 53 | break 54 | } 55 | } 56 | want = []string{"id1", "id2"} 57 | if !slices.Equal(ids, want) { 58 | t.Errorf("Docs with break = %v, want %v", ids, want) 59 | } 60 | 61 | // DocsAfter iteration uses insert order. 62 | var last *Doc 63 | ids = nil 64 | for d := range corpus.DocsAfter(0, "") { 65 | do(d) 66 | last = d 67 | } 68 | want = []string{"id1", "id3", "id2"} 69 | if !slices.Equal(ids, want) { 70 | t.Errorf("Docs() = %v, want %v", ids, want) 71 | } 72 | 73 | // DocsAfter incremental iteration. 
74 | corpus.Add("id4", "Title4", "text4") 75 | extra["id2"] = "X" 76 | corpus.Add("id2", "Title2X", "text2X") // edits existing text 77 | corpus.Add("id3", "Title3", "text3") // no-op, ignored 78 | ids = nil 79 | for d := range corpus.DocsAfter(last.DBTime, "") { 80 | do(d) 81 | } 82 | want = []string{"id4", "id2"} 83 | if !slices.Equal(ids, want) { 84 | t.Errorf("DocsAfter(last.DBTime=%d) = %v, want %v", last.DBTime, ids, want) 85 | } 86 | 87 | // DocsAfter with break. 88 | ids = nil 89 | for d := range corpus.DocsAfter(last.DBTime, "") { 90 | do(d) 91 | break 92 | } 93 | want = []string{"id4"} 94 | if !slices.Equal(ids, want) { 95 | t.Errorf("DocsAfter(last.DBTime=%d) with break = %v, want %v", last.DBTime, ids, want) 96 | } 97 | 98 | // Docs with prefix. 99 | corpus.Add("id11", "Title11", "text11") 100 | ids = nil 101 | for d := range corpus.Docs("id1") { 102 | do(d) 103 | } 104 | want = []string{"id1", "id11"} 105 | if !slices.Equal(ids, want) { 106 | t.Errorf("Docs(id1) = %v, want %v", ids, want) 107 | } 108 | 109 | // DocsAfter with prefix. 
110 | ids = nil 111 | for d := range corpus.DocsAfter(0, "id1") { 112 | do(d) 113 | } 114 | want = []string{"id1", "id11"} 115 | if !slices.Equal(ids, want) { 116 | t.Errorf("DocsAfter(0, id1) = %v, want %v", ids, want) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module rsc.io/gaby 2 | 3 | go 1.23rc1 4 | 5 | require ( 6 | cloud.google.com/go/firestore v1.15.0 7 | github.com/cockroachdb/pebble v1.1.0 8 | github.com/google/generative-ai-go v0.13.0 9 | golang.org/x/net v0.26.0 10 | golang.org/x/tools v0.22.0 11 | google.golang.org/api v0.178.0 12 | rsc.io/markdown v0.0.0-20240603215554-74725d8a840a 13 | rsc.io/omap v1.0.0 14 | rsc.io/ordered v1.1.0 15 | rsc.io/top v1.0.2 16 | ) 17 | 18 | require ( 19 | cloud.google.com/go v0.113.0 // indirect 20 | cloud.google.com/go/ai v0.5.0 // indirect 21 | cloud.google.com/go/auth v0.4.0 // indirect 22 | cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect 23 | cloud.google.com/go/compute/metadata v0.3.0 // indirect 24 | cloud.google.com/go/longrunning v0.5.7 // indirect 25 | github.com/DataDog/zstd v1.4.5 // indirect 26 | github.com/beorn7/perks v1.0.1 // indirect 27 | github.com/cespare/xxhash/v2 v2.2.0 // indirect 28 | github.com/cockroachdb/errors v1.11.1 // indirect 29 | github.com/cockroachdb/logtags v0.0.0-20230118201751-21c54148d20b // indirect 30 | github.com/cockroachdb/redact v1.1.5 // indirect 31 | github.com/cockroachdb/tokenbucket v0.0.0-20230807174530-cc333fc44b06 // indirect 32 | github.com/felixge/httpsnoop v1.0.4 // indirect 33 | github.com/getsentry/sentry-go v0.18.0 // indirect 34 | github.com/go-logr/logr v1.4.1 // indirect 35 | github.com/go-logr/stdr v1.2.2 // indirect 36 | github.com/gogo/protobuf v1.3.2 // indirect 37 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 38 | github.com/golang/protobuf v1.5.4 // indirect 39 | 
github.com/golang/snappy v0.0.4 // indirect 40 | github.com/google/s2a-go v0.1.7 // indirect 41 | github.com/google/uuid v1.6.0 // indirect 42 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect 43 | github.com/googleapis/gax-go/v2 v2.12.4 // indirect 44 | github.com/klauspost/compress v1.15.15 // indirect 45 | github.com/kr/pretty v0.3.1 // indirect 46 | github.com/kr/text v0.2.0 // indirect 47 | github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect 48 | github.com/pkg/errors v0.9.1 // indirect 49 | github.com/prometheus/client_golang v1.12.0 // indirect 50 | github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a // indirect 51 | github.com/prometheus/common v0.32.1 // indirect 52 | github.com/prometheus/procfs v0.7.3 // indirect 53 | github.com/rogpeppe/go-internal v1.9.0 // indirect 54 | go.opencensus.io v0.24.0 // indirect 55 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect 56 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect 57 | go.opentelemetry.io/otel v1.26.0 // indirect 58 | go.opentelemetry.io/otel/metric v1.26.0 // indirect 59 | go.opentelemetry.io/otel/trace v1.26.0 // indirect 60 | golang.org/x/crypto v0.24.0 // indirect 61 | golang.org/x/exp v0.0.0-20230626212559-97b1e661b5df // indirect 62 | golang.org/x/oauth2 v0.20.0 // indirect 63 | golang.org/x/sync v0.7.0 // indirect 64 | golang.org/x/sys v0.21.0 // indirect 65 | golang.org/x/text v0.16.0 // indirect 66 | golang.org/x/time v0.5.0 // indirect 67 | google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda // indirect 68 | google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae // indirect 69 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240506185236-b8a5c65736ae // indirect 70 | google.golang.org/grpc v1.63.2 // indirect 71 | google.golang.org/protobuf v1.34.1 // indirect 72 | ) 73 | 
-------------------------------------------------------------------------------- /internal/storage/test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package storage 6 | 7 | import ( 8 | "fmt" 9 | "slices" 10 | "sync" 11 | "testing" 12 | 13 | "rsc.io/ordered" 14 | ) 15 | 16 | // TestDB runs basic tests on db. 17 | // It should be empty when TestDB is called. 18 | func TestDB(t *testing.T, db DB) { 19 | db.Set([]byte("key"), []byte("value")) 20 | if val, ok := db.Get([]byte("key")); string(val) != "value" || ok != true { 21 | // unreachable except for bad db 22 | t.Fatalf("Get(key) = %q, %v, want %q, true", val, ok, "value") 23 | } 24 | if val, ok := db.Get([]byte("missing")); val != nil || ok != false { 25 | // unreachable except for bad db 26 | t.Fatalf("Get(missing) = %v, %v, want nil, false", val, ok) 27 | } 28 | 29 | db.Delete([]byte("key")) 30 | if val, ok := db.Get([]byte("key")); val != nil || ok != false { 31 | // unreachable except for bad db 32 | t.Fatalf("Get(key) after delete = %v, %v, want nil, false", val, ok) 33 | } 34 | 35 | b := db.Batch() 36 | for i := range 10 { 37 | b.Set(ordered.Encode(i), []byte(fmt.Sprint(i))) 38 | b.MaybeApply() 39 | } 40 | b.Apply() 41 | 42 | collect := func(min, max, stop int) []int { 43 | t.Helper() 44 | var list []int 45 | for key, val := range db.Scan(ordered.Encode(min), ordered.Encode(max)) { 46 | var i int 47 | if err := ordered.Decode(key, &i); err != nil { 48 | // unreachable except for bad db 49 | t.Fatalf("db.Scan malformed key %v", Fmt(key)) 50 | } 51 | if sv, want := string(val()), fmt.Sprint(i); sv != want { 52 | // unreachable except for bad db 53 | t.Fatalf("db.Scan key %v val=%q, want %q", i, sv, want) 54 | } 55 | list = append(list, i) 56 | if i == stop { 57 | break 58 | } 59 | } 60 | return 
list 61 | } 62 | 63 | if scan, want := collect(3, 6, -1), []int{3, 4, 5, 6}; !slices.Equal(scan, want) { 64 | // unreachable except for bad db 65 | t.Fatalf("Scan(3, 6) = %v, want %v", scan, want) 66 | } 67 | 68 | if scan, want := collect(3, 6, 5), []int{3, 4, 5}; !slices.Equal(scan, want) { 69 | // unreachable except for bad db 70 | t.Fatalf("Scan(3, 6) with break at 5 = %v, want %v", scan, want) 71 | } 72 | 73 | db.DeleteRange(ordered.Encode(4), ordered.Encode(7)) 74 | if scan, want := collect(-1, 11, -1), []int{0, 1, 2, 3, 8, 9}; !slices.Equal(scan, want) { 75 | // unreachable except for bad db 76 | t.Fatalf("Scan(-1, 11) after Delete(4, 7) = %v, want %v", scan, want) 77 | } 78 | 79 | b = db.Batch() 80 | for i := range 5 { 81 | b.Delete(ordered.Encode(i)) 82 | b.Set(ordered.Encode(2*i), []byte(fmt.Sprint(2*i))) 83 | } 84 | b.DeleteRange(ordered.Encode(0), ordered.Encode(0)) 85 | b.Apply() 86 | if scan, want := collect(-1, 11, -1), []int{6, 8, 9}; !slices.Equal(scan, want) { 87 | // unreachable except for bad db 88 | t.Fatalf("Scan(-1, 11) after batch Delete+Set = %v, want %v", scan, want) 89 | } 90 | 91 | // Can't test much, but check that it doesn't crash. 
92 | db.Flush() 93 | 94 | testDBLock(t, db) 95 | } 96 | 97 | type locker interface { 98 | Lock(string) 99 | Unlock(string) 100 | } 101 | 102 | func testDBLock(t *testing.T, db locker) { 103 | var x int 104 | db.Lock("abc") 105 | var wg sync.WaitGroup 106 | wg.Add(1) 107 | go func() { 108 | db.Lock("abc") 109 | x = 2 // cause race if not synchronized 110 | db.Unlock("abc") 111 | wg.Done() 112 | }() 113 | x = 1 // cause race if not synchronized 114 | db.Unlock("abc") 115 | wg.Wait() 116 | _ = x 117 | 118 | func() { 119 | defer func() { 120 | recover() 121 | }() 122 | db.Unlock("def") 123 | t.Errorf("Unlock never-locked key did not panic") 124 | }() 125 | 126 | } 127 | -------------------------------------------------------------------------------- /internal/github/edit_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package github 6 | 7 | import ( 8 | "net/http" 9 | "slices" 10 | "testing" 11 | 12 | "rsc.io/gaby/internal/httprr" 13 | "rsc.io/gaby/internal/secret" 14 | "rsc.io/gaby/internal/storage" 15 | "rsc.io/gaby/internal/testutil" 16 | ) 17 | 18 | func TestMarkdownEditing(t *testing.T) { 19 | check := testutil.Checker(t) 20 | lg := testutil.Slogger(t) 21 | db := storage.MemDB() 22 | 23 | // Initial load. 
24 | rr, err := httprr.Open("../testdata/tmpedit.httprr", http.DefaultTransport) 25 | check(err) 26 | rr.Scrub(Scrub) 27 | sdb := secret.DB(secret.Map{"api.github.com": "user:pass"}) 28 | if rr.Recording() { 29 | sdb = secret.Netrc() 30 | } 31 | c := New(lg, db, sdb, rr.Client()) 32 | check(c.Add("rsc/tmp")) 33 | check(c.Sync()) 34 | 35 | var ei, ec *Event 36 | for e := range c.Events("rsc/tmp", 5, 5) { 37 | if ei == nil && e.API == "/issues" { 38 | ei = e 39 | } 40 | if ec == nil && e.API == "/issues/comments" { 41 | ec = e 42 | } 43 | } 44 | if ei == nil { 45 | t.Fatalf("did not find issue #5") 46 | } 47 | if ec == nil { 48 | t.Fatalf("did not find comment on issue #5") 49 | } 50 | 51 | issue := ei.Typed.(*Issue) 52 | issue1, err := c.DownloadIssue(issue.URL) 53 | check(err) 54 | if issue1.Title != issue.Title { 55 | t.Errorf("DownloadIssue: Title=%q, want %q", issue1.Title, issue.Title) 56 | } 57 | 58 | comment := ec.Typed.(*IssueComment) 59 | comment1, err := c.DownloadIssueComment(comment.URL) 60 | check(err) 61 | if comment1.Body != comment.Body { 62 | t.Errorf("DownloadIssueComment: Body=%q, want %q", comment1.Body, comment.Body) 63 | } 64 | 65 | c.testing = false // edit github directly (except for the httprr in the way) 66 | check(c.EditIssueComment(comment, &IssueCommentChanges{Body: rot13(comment.Body)})) 67 | check(c.PostIssueComment(issue, &IssueCommentChanges{Body: "testing. 
rot13 is the best."})) 68 | check(c.EditIssue(issue, &IssueChanges{Title: rot13(issue.Title)})) 69 | } 70 | 71 | func TestMarkdownDivertEdit(t *testing.T) { 72 | check := testutil.Checker(t) 73 | lg := testutil.Slogger(t) 74 | db := storage.MemDB() 75 | 76 | c := New(lg, db, nil, nil) 77 | check(c.Testing().LoadTxtar("../testdata/rsctmp.txt")) 78 | 79 | var ei, ec *Event 80 | for e := range c.Events("rsc/tmp", 5, 5) { 81 | if ei == nil && e.API == "/issues" { 82 | ei = e 83 | } 84 | if ec == nil && e.API == "/issues/comments" { 85 | ec = e 86 | } 87 | } 88 | if ei == nil { 89 | t.Fatalf("did not find issue #5") 90 | } 91 | if ec == nil { 92 | t.Fatalf("did not find comment on issue #5") 93 | } 94 | 95 | issue := ei.Typed.(*Issue) 96 | issue1, err := c.DownloadIssue(issue.URL) 97 | check(err) 98 | if issue1.Title != issue.Title { 99 | t.Errorf("DownloadIssue: Title=%q, want %q", issue1.Title, issue.Title) 100 | } 101 | 102 | comment := ec.Typed.(*IssueComment) 103 | comment1, err := c.DownloadIssueComment(comment.URL) 104 | check(err) 105 | if comment1.Body != comment.Body { 106 | t.Errorf("DownloadIssueComment: Body=%q, want %q", comment1.Body, comment.Body) 107 | } 108 | 109 | check(c.EditIssueComment(comment, &IssueCommentChanges{Body: rot13(comment.Body)})) 110 | check(c.PostIssueComment(issue, &IssueCommentChanges{Body: "testing. rot13 is the best."})) 111 | check(c.EditIssue(issue, &IssueChanges{Title: rot13(issue.Title), Labels: &[]string{"ebg13"}})) 112 | 113 | var edits []string 114 | for _, e := range c.Testing().Edits() { 115 | edits = append(edits, e.String()) 116 | } 117 | 118 | want := []string{ 119 | `EditIssueComment(rsc/tmp#5.10000000008, {"body":"Comment!\n"})`, 120 | `PostIssueComment(rsc/tmp#5, {"body":"testing. 
rot13 is the best."})`, 121 | `EditIssue(rsc/tmp#5, {"title":"another new issue","labels":["ebg13"]})`, 122 | } 123 | if !slices.Equal(edits, want) { 124 | t.Fatalf("Testing().Edits():\nhave %s\nwant %s", edits, want) 125 | } 126 | } 127 | 128 | func rot13(s string) string { 129 | b := []byte(s) 130 | for i, x := range b { 131 | if 'A' <= x && x <= 'M' || 'a' <= x && x <= 'm' { 132 | b[i] = x + 13 133 | } else if 'N' <= x && x <= 'Z' || 'n' <= x && x <= 'z' { 134 | b[i] = x - 13 135 | } 136 | } 137 | return string(b) 138 | } 139 | -------------------------------------------------------------------------------- /internal/testdata/omap.httprr: -------------------------------------------------------------------------------- 1 | httprr trace v1 2 | 172 1240 3 | GET https://api.github.com/repos/rsc/omap/issues?direction=asc&page=1&per_page=100&sort=updated&state=all HTTP/1.1 4 | Host: api.github.com 5 | User-Agent: Go-http-client/1.1 6 | 7 | HTTP/2.0 200 OK 8 | Content-Length: 2 9 | Accept-Ranges: bytes 10 | Access-Control-Allow-Origin: * 11 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 12 | Cache-Control: public, max-age=60, s-maxage=60 13 | Content-Security-Policy: default-src 'none' 14 | Content-Type: application/json; charset=utf-8 15 | Date: Tue, 04 Jun 2024 16:16:40 GMT 16 | Etag: "4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba873c2f11161202b945" 17 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 18 | Server: GitHub.com 19 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 20 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 21 | X-Content-Type-Options: nosniff 22 | X-Frame-Options: deny 23 | X-Github-Api-Version-Selected: 2022-11-28 24 | 
X-Github-Media-Type: github.v3; format=json 25 | X-Github-Request-Id: F9AB:A58D8:48273A5:7F79B5E:665F3DE8 26 | X-Ratelimit-Limit: 60 27 | X-Ratelimit-Remaining: 59 28 | X-Ratelimit-Reset: 1717521400 29 | X-Ratelimit-Resource: core 30 | X-Ratelimit-Used: 1 31 | X-Xss-Protection: 0 32 | 33 | []158 1240 34 | GET https://api.github.com/repos/rsc/omap/issues/comments?direction=asc&page=1&sort=updated HTTP/1.1 35 | Host: api.github.com 36 | User-Agent: Go-http-client/1.1 37 | 38 | HTTP/2.0 200 OK 39 | Content-Length: 2 40 | Accept-Ranges: bytes 41 | Access-Control-Allow-Origin: * 42 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 43 | Cache-Control: public, max-age=60, s-maxage=60 44 | Content-Security-Policy: default-src 'none' 45 | Content-Type: application/json; charset=utf-8 46 | Date: Tue, 04 Jun 2024 16:16:40 GMT 47 | Etag: "4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba873c2f11161202b945" 48 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 49 | Server: GitHub.com 50 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 51 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 52 | X-Content-Type-Options: nosniff 53 | X-Frame-Options: deny 54 | X-Github-Api-Version-Selected: 2022-11-28 55 | X-Github-Media-Type: github.v3; format=json 56 | X-Github-Request-Id: F9AB:A58D8:48273F3:7F79BDD:665F3DE8 57 | X-Ratelimit-Limit: 60 58 | X-Ratelimit-Remaining: 58 59 | X-Ratelimit-Reset: 1717521400 60 | X-Ratelimit-Resource: core 61 | X-Ratelimit-Used: 2 62 | X-Xss-Protection: 0 63 | 64 | []142 1240 65 | GET https://api.github.com/repos/rsc/omap/issues/events?page=1&per_page=100 HTTP/1.1 66 | Host: api.github.com 67 | User-Agent: Go-http-client/1.1 68 | 69 | 
HTTP/2.0 200 OK 70 | Content-Length: 2 71 | Accept-Ranges: bytes 72 | Access-Control-Allow-Origin: * 73 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 74 | Cache-Control: public, max-age=60, s-maxage=60 75 | Content-Security-Policy: default-src 'none' 76 | Content-Type: application/json; charset=utf-8 77 | Date: Tue, 04 Jun 2024 16:16:40 GMT 78 | Etag: "4f53cda18c2baa0c0354bb5f9a3ecbe5ed12ab4d8e11ba873c2f11161202b945" 79 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 80 | Server: GitHub.com 81 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 82 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 83 | X-Content-Type-Options: nosniff 84 | X-Frame-Options: deny 85 | X-Github-Api-Version-Selected: 2022-11-28 86 | X-Github-Media-Type: github.v3; format=json 87 | X-Github-Request-Id: F9AB:A58D8:482744D:7F79C70:665F3DE8 88 | X-Ratelimit-Limit: 60 89 | X-Ratelimit-Remaining: 57 90 | X-Ratelimit-Reset: 1717521400 91 | X-Ratelimit-Resource: core 92 | X-Ratelimit-Used: 3 93 | X-Xss-Protection: 0 94 | 95 | [] -------------------------------------------------------------------------------- /internal/docs/docs.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package docs implements a corpus of text documents identified by document IDs. 6 | // It allows retrieving the documents by ID as well as retrieving documents that are 7 | // new since a previous scan. 
package docs

import (
	"iter"
	"strings"

	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/storage/timed"
	"rsc.io/ordered"
)

// This package stores the following key schemas in the database:
//
//	["docs.Doc", URL] => [DBTime, Title, Text]
//	["docs.DocByTime", DBTime, URL] => []
//
// DocByTime is an index of Docs by DBTime, which is the time when the
// record was added to the database. Code that processes new docs can
// record which DBTime it has most recently processed and then scan forward in
// the index to learn about new docs.

// A Corpus is the collection of documents stored in a database.
type Corpus struct {
	db storage.DB
}

// New returns a new Corpus representing the documents stored in db.
func New(db storage.DB) *Corpus {
	return &Corpus{db}
}

// A Doc is a single document in the Corpus.
type Doc struct {
	DBTime timed.DBTime // database time (from storage.Now) when Doc was written
	ID     string       // document identifier (such as a URL)
	Title  string       // title of document
	Text   string       // text of document
}

// decodeDoc decodes the document in the timed key-value pair.
// It calls c.db.Panic if the key-value pair is malformed.
func (c *Corpus) decodeDoc(t *timed.Entry) *Doc {
	d := new(Doc)
	d.DBTime = t.ModTime
	if err := ordered.Decode(t.Key, &d.ID); err != nil {
		// unreachable unless db corruption
		c.db.Panic("docs decode", "key", storage.Fmt(t.Key), "err", err)
	}
	if err := ordered.Decode(t.Val, &d.Title, &d.Text); err != nil {
		// unreachable unless db corruption
		c.db.Panic("docs decode", "key", storage.Fmt(t.Key), "val", storage.Fmt(t.Val), "err", err)
	}
	return d
}

// Get returns the document with the given id.
// It returns nil, false if no document is found.
// It returns d, true otherwise.
func (c *Corpus) Get(id string) (doc *Doc, ok bool) {
	t, ok := timed.Get(c.db, "docs.Doc", ordered.Encode(id))
	if !ok {
		return nil, false
	}
	return c.decodeDoc(t), true
}

// Add adds a document with the given id, title, and text.
// If the document already exists in the corpus with the same title and text,
// Add is a no-op.
// Otherwise, if the document already exists in the corpus, it is replaced.
func (c *Corpus) Add(id, title, text string) {
	old, ok := c.Get(id)
	if ok && old.Title == title && old.Text == text {
		// Unchanged: skip the write so the doc's DBTime (and hence the
		// DocByTime index) is not disturbed.
		return
	}
	b := c.db.Batch()
	timed.Set(c.db, b, "docs.Doc", ordered.Encode(id), ordered.Encode(title, text))
	b.Apply()
}

// Docs returns an iterator over all documents in the corpus
// with IDs starting with a given prefix.
// The documents are ordered by ID.
func (c *Corpus) Docs(prefix string) iter.Seq[*Doc] {
	return func(yield func(*Doc) bool) {
		// NOTE(review): prefix+"\xff" as the scan upper bound assumes IDs
		// never contain a 0xff byte (true for URLs) — confirm for other ID kinds.
		for t := range timed.Scan(c.db, "docs.Doc", ordered.Encode(prefix), ordered.Encode(prefix+"\xff")) {
			if !yield(c.decodeDoc(t)) {
				return
			}
		}
	}
}

// DocsAfter returns an iterator over all documents with DBTime
// greater than dbtime and with IDs starting with the prefix.
// The documents are ordered by DBTime.
func (c *Corpus) DocsAfter(dbtime timed.DBTime, prefix string) iter.Seq[*Doc] {
	// filter reports whether a raw "docs.Doc" key has an ID starting with
	// prefix, letting ScanAfter skip non-matching entries before decoding values.
	filter := func(key []byte) bool {
		if prefix == "" {
			return true
		}
		var id string
		if err := ordered.Decode(key, &id); err != nil {
			// unreachable unless db corruption
			c.db.Panic("docs scan decode", "key", storage.Fmt(key), "err", err)
		}
		return strings.HasPrefix(id, prefix)
	}
	return func(yield func(*Doc) bool) {
		for t := range timed.ScanAfter(c.db, "docs.Doc", dbtime, filter) {
			if !yield(c.decodeDoc(t)) {
				return
			}
		}
	}
}

// DocWatcher returns a new [storage.Watcher] with the given name.
// It picks up where any previous Watcher of the same name left off.
func (c *Corpus) DocWatcher(name string) *timed.Watcher[*Doc] {
	return timed.NewWatcher(c.db, name, "docs.Doc", c.decodeDoc)
}
--------------------------------------------------------------------------------
/internal/gemini/gemini.go:
--------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package gemini implements access to Google's Gemini model.
//
// [Client] implements [llm.Embedder]. Use [NewClient] to connect.
package gemini

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"maps"
	"net/http"
	"slices"
	"strings"
	_ "unsafe" // for linkname

	"github.com/google/generative-ai-go/genai"
	"google.golang.org/api/option"
	"rsc.io/gaby/internal/httprr"
	"rsc.io/gaby/internal/llm"
	"rsc.io/gaby/internal/secret"
)

// Scrub is a request scrubber for use with [rsc.io/httprr].
// It removes credential and version headers and canonicalizes JSON request
// bodies so that recorded traces compare deterministically.
func Scrub(req *http.Request) error {
	delete(req.Header, "x-goog-api-key")    // genai does not canonicalize
	req.Header.Del("X-Goog-Api-Key")        // in case it starts
	delete(req.Header, "x-goog-api-client") // contains version numbers
	req.Header.Del("X-Goog-Api-Client")

	if ctype := req.Header.Get("Content-Type"); ctype == "application/json" || strings.HasPrefix(ctype, "application/json;") {
		// Canonicalize JSON body.
		// google.golang.org/protobuf/internal/encoding.json
		// goes out of its way to randomize the JSON encodings
		// of protobuf messages by adding or not adding spaces
		// after commas. Derandomize by compacting the JSON.
		b := req.Body.(*httprr.Body)
		var buf bytes.Buffer
		if err := json.Compact(&buf, b.Data); err == nil {
			b.Data = buf.Bytes()
		}
	}
	return nil
}

// A Client represents a connection to Gemini.
type Client struct {
	slog  *slog.Logger
	genai *genai.Client
}

// NewClient returns a connection to Gemini, using the given logger and HTTP client.
// It expects to find a secret of the form "AIza..." or "user:AIza..." in sdb
// under the name "ai.google.dev".
func NewClient(lg *slog.Logger, sdb secret.DB, hc *http.Client) (*Client, error) {
	key, ok := sdb.Get("ai.google.dev")
	if !ok {
		return nil, fmt.Errorf("missing api key for ai.google.dev")
	}
	// If key is from .netrc, ignore user name.
	if _, pass, ok := strings.Cut(key, ":"); ok {
		key = pass
	}

	// Ideally this would use “option.WithAPIKey(key), option.WithHTTPClient(hc),”
	// but using option.WithHTTPClient bypasses the code that passes along the API key.
	// Instead we make our own derived http.Client that re-adds the key.
	// And then we still have to say option.WithAPIKey("ignored") because
	// otherwise NewClient complains that we haven't passed in a key.
	// (If we pass in the key, it ignores it, but if we don't pass it in,
	// it complains that we didn't give it a key.)
	ai, err := genai.NewClient(context.Background(),
		option.WithAPIKey("ignored"),
		option.WithHTTPClient(withKey(hc, key)))
	if err != nil {
		return nil, err
	}

	return &Client{slog: lg, genai: ai}, nil
}

// withKey returns a new http.Client that is the same as hc
// except that it adds "x-goog-api-key: key" to every request.
func withKey(hc *http.Client, key string) *http.Client {
	c := *hc
	t := c.Transport
	if t == nil {
		t = http.DefaultTransport
	}
	c.Transport = &transportWithKey{t, key}
	return &c
}

// transportWithKey is the same as rt
// except that it adds "x-goog-api-key: key" to every request.
type transportWithKey struct {
	rt  http.RoundTripper
	key string
}

// RoundTrip implements [http.RoundTripper].
// It shallow-copies the request and clones its header map so the
// caller's request is never mutated, then injects the API key header.
func (t *transportWithKey) RoundTrip(req *http.Request) (resp *http.Response, err error) {
	r := *req
	r.Header = maps.Clone(req.Header)
	r.Header["x-goog-api-key"] = []string{t.key}
	return t.rt.RoundTrip(&r)
}

const maxBatch = 100 // empirical limit

// EmbedDocs returns the vector embeddings for the docs,
// implementing [llm.Embedder].
func (c *Client) EmbedDocs(docs []llm.EmbedDoc) ([]llm.Vector, error) {
	model := c.genai.EmbeddingModel("text-embedding-004")
	var vecs []llm.Vector
	// The embedding API limits batch sizes, so send at most maxBatch docs per call.
	for docs := range slices.Chunk(docs, maxBatch) {
		b := model.NewBatch()
		for _, d := range docs {
			b.AddContentWithTitle(d.Title, genai.Text(d.Text))
		}
		resp, err := model.BatchEmbedContents(context.Background(), b)
		if err != nil {
			// Return the embeddings computed so far along with the error,
			// so callers can make partial progress.
			return vecs, err
		}
		for _, e := range resp.Embeddings {
			vecs = append(vecs, e.Values)
		}
	}
	return vecs, nil
}
--------------------------------------------------------------------------------
/internal/embeddocs/sync_test.go:
--------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package embeddocs

import (
	"fmt"
	"strings"
	"testing"

	"rsc.io/gaby/internal/docs"
	"rsc.io/gaby/internal/llm"
	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/testutil"
)

// texts are the sample documents used by the sync tests.
var texts = []string{
	"for loops",
	"for all time, always",
	"break statements",
	"breakdancing",
	"forever could never be long enough for me",
	"the macarena",
}

// checker returns a function that fails the test on a non-nil error.
func checker(t *testing.T) func(error) {
	return func(err error) {
		if err != nil {
			t.Helper()
			t.Fatal(err)
		}
	}
}

func TestSync(t *testing.T) {
	lg := testutil.Slogger(t)
	db := storage.MemDB()
	vdb := storage.MemVectorDB(db, lg, "step1")
	dc := docs.New(db)
	for i, text := range texts {
		dc.Add(fmt.Sprintf("URL%d", i), "", text)
	}

	Sync(lg, vdb, llm.QuoteEmbedder(), dc)
	for i, text := range texts {
		vec, ok := vdb.Get(fmt.Sprintf("URL%d", i))
		if !ok {
			t.Errorf("URL%d missing from vdb", i)
			continue
		}
		vtext := llm.UnquoteVector(vec)
		if vtext != text {
			t.Errorf("URL%d decoded to %q, want %q", i, vtext, text)
		}
	}

	// Second sync into a fresh vector DB: only the newly added rot13 docs
	// should be embedded, because the watcher remembers prior progress.
	for i, text := range texts {
		dc.Add(fmt.Sprintf("rot13%d", i), "", rot13(text))
	}
	vdb2 := storage.MemVectorDB(db, lg, "step2")
	Sync(lg, vdb2, llm.QuoteEmbedder(), dc)
	for i, text := range texts {
		vec, ok := vdb2.Get(fmt.Sprintf("URL%d", i))
		if ok {
			t.Errorf("URL%d written during second sync: %q", i, llm.UnquoteVector(vec))
			continue
		}

		// NOTE(review): ok is not checked here; a missing rot13 key would
		// surface as a decode mismatch rather than a clear "missing" error.
		vec, ok = vdb2.Get(fmt.Sprintf("rot13%d", i))
		vtext := llm.UnquoteVector(vec)
		if vtext != rot13(text) {
			t.Errorf("rot13%d decoded to %q, want %q", i, vtext, rot13(text))
		}
	}
}

func TestBigSync(t *testing.T) {
	const N = 10000

	lg := testutil.Slogger(t)
	db := storage.MemDB()
	vdb := storage.MemVectorDB(db, lg, "vdb")
	dc := docs.New(db)
	for i := range N {
		dc.Add(fmt.Sprintf("URL%d", i), "", fmt.Sprintf("Text%d", i))
	}

	Sync(lg, vdb, llm.QuoteEmbedder(), dc)
	for i := range N {
		vec, ok := vdb.Get(fmt.Sprintf("URL%d", i))
		if !ok {
			t.Errorf("URL%d missing from vdb", i)
			continue
		}
		text := fmt.Sprintf("Text%d", i)
		vtext := llm.UnquoteVector(vec)
		if vtext != text {
			t.Errorf("URL%d decoded to %q, want %q", i, vtext, text)
		}
	}
}

// TestBadEmbedders checks that Sync logs the failure modes of misbehaving
// embedders (too many vectors, an error mid-stream, too few vectors) and
// still records whatever progress it safely can.
func TestBadEmbedders(t *testing.T) {
	const N = 150
	db := storage.MemDB()
	dc := docs.New(db)
	for i := range N {
		dc.Add(fmt.Sprintf("URL%03d", i), "", fmt.Sprintf("Text%d", i))
	}

	lg, out := testutil.SlogBuffer()
	db = storage.MemDB()
	vdb := storage.MemVectorDB(db, lg, "vdb")
	Sync(lg, vdb, tooManyEmbed{}, dc)
	if !strings.Contains(out.String(), "embeddocs length mismatch") {
		t.Errorf("tooManyEmbed did not report error:\n%s", out)
	}

	lg, out = testutil.SlogBuffer()
	db = storage.MemDB()
	vdb = storage.MemVectorDB(db, lg, "vdb")
	Sync(lg, vdb, embedErr{}, dc)
	if !strings.Contains(out.String(), "EMBED ERROR") {
		t.Errorf("embedErr did not report error:\n%s", out)
	}
	if _, ok := vdb.Get("URL001"); !ok {
		t.Errorf("Sync did not write URL001 after embedErr")
	}

	lg, out = testutil.SlogBuffer()
	db = storage.MemDB()
	vdb = storage.MemVectorDB(db, lg, "vdb")
	Sync(lg, vdb, embedHalf{}, dc)
	if !strings.Contains(out.String(), "length mismatch") {
		t.Errorf("embedHalf did not report error:\n%s", out)
	}
	if _, ok := vdb.Get("URL001"); !ok {
		t.Errorf("Sync did not write URL001 after embedHalf")
	}
}

// rot13 applies the ROT13 substitution to ASCII letters in s.
func rot13(s string) string {
	b := []byte(s)
	for i, x := range b {
		if 'A' <= x && x <= 'M' || 'a' <= x && x <= 'm' {
			b[i] = x + 13
		} else if 'N' <= x && x <= 'Z' || 'n' <= x && x <= 'z' {
			b[i] = x - 13
		}
	}
	return string(b)
}

// tooManyEmbed returns twice as many vectors as docs.
type tooManyEmbed struct{}

func (tooManyEmbed) EmbedDocs(docs []llm.EmbedDoc) ([]llm.Vector, error) {
	vec, _ := llm.QuoteEmbedder().EmbedDocs(docs)
	vec = append(vec, vec...)
	return vec, nil
}

// embedErr returns correct vectors along with an error.
type embedErr struct{}

func (embedErr) EmbedDocs(docs []llm.EmbedDoc) ([]llm.Vector, error) {
	vec, _ := llm.QuoteEmbedder().EmbedDocs(docs)
	return vec, fmt.Errorf("EMBED ERROR")
}

// embedHalf returns vectors for only half the docs.
type embedHalf struct{}

func (embedHalf) EmbedDocs(docs []llm.EmbedDoc) ([]llm.Vector, error) {
	vec, _ := llm.QuoteEmbedder().EmbedDocs(docs)
	vec = vec[:len(vec)/2]
	return vec, nil
}
--------------------------------------------------------------------------------
/internal/pebble/pebble.go:
--------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package pebble implements a storage.DB using Pebble,
// a production-quality key-value database from CockroachDB.
package pebble

import (
	"bytes"
	"cmp"
	"iter"
	"log/slog"

	"github.com/cockroachdb/pebble"
	"rsc.io/gaby/internal/storage"
)

// Open opens an existing Pebble database in the named directory.
// The database must already exist.
func Open(lg *slog.Logger, dir string) (storage.DB, error) {
	return open(lg, dir, &pebble.Options{ErrorIfNotExists: true})
}

// Create creates a new Pebble database in the named directory.
// The database (and directory) must not already exist.
27 | func Create(lg *slog.Logger, dir string) (storage.DB, error) { 28 | return open(lg, dir, &pebble.Options{ErrorIfExists: true}) 29 | } 30 | 31 | func open(lg *slog.Logger, dir string, opts *pebble.Options) (storage.DB, error) { 32 | p, err := pebble.Open(dir, opts) 33 | if err != nil { 34 | lg.Error("pebble open", "dir", dir, "create", opts.ErrorIfExists, "err", err) 35 | return nil, err 36 | } 37 | return &db{p: p, slog: lg}, nil 38 | } 39 | 40 | type db struct { 41 | p *pebble.DB 42 | m storage.MemLocker 43 | slog *slog.Logger 44 | } 45 | 46 | type batch struct { 47 | db *db 48 | b *pebble.Batch 49 | } 50 | 51 | func (d *db) Lock(key string) { 52 | d.m.Lock(key) 53 | } 54 | 55 | func (d *db) Unlock(key string) { 56 | d.m.Unlock(key) 57 | } 58 | 59 | func (d *db) get(key []byte, yield func(val []byte)) { 60 | v, c, err := d.p.Get(key) 61 | if err == pebble.ErrNotFound { 62 | return 63 | } 64 | if err != nil { 65 | // unreachable except db error 66 | d.Panic("pebble get", "key", storage.Fmt(key), "err", err) 67 | } 68 | yield(v) 69 | c.Close() 70 | } 71 | 72 | func (d *db) Get(key []byte) (val []byte, ok bool) { 73 | d.get(key, func(v []byte) { 74 | val = bytes.Clone(v) 75 | ok = true 76 | }) 77 | return 78 | } 79 | 80 | var ( 81 | sync = &pebble.WriteOptions{Sync: true} 82 | noSync = &pebble.WriteOptions{Sync: false} 83 | ) 84 | 85 | func (d *db) Panic(msg string, args ...any) { 86 | d.slog.Error(msg, args...) 87 | storage.Panic(msg, args...) 
88 | } 89 | 90 | func (d *db) Set(key, val []byte) { 91 | if err := d.p.Set(key, val, noSync); err != nil { 92 | // unreachable except db error 93 | d.Panic("pebble set", "key", storage.Fmt(key), "val", storage.Fmt(val), "err", err) 94 | } 95 | } 96 | 97 | func (d *db) Delete(key []byte) { 98 | if err := d.p.Delete(key, noSync); err != nil { 99 | // unreachable except db error 100 | d.Panic("pebble delete", "key", storage.Fmt(key), "err", err) 101 | } 102 | } 103 | 104 | func (d *db) DeleteRange(start, end []byte) { 105 | err := cmp.Or( 106 | d.p.DeleteRange(start, end, noSync), 107 | d.p.Delete(end, noSync), 108 | ) 109 | if err != nil { 110 | // unreachable except db error 111 | d.Panic("pebble delete range", "start", storage.Fmt(start), "end", storage.Fmt(end), "err", err) 112 | } 113 | } 114 | 115 | func (d *db) Flush() { 116 | if err := d.p.Flush(); err != nil { 117 | // unreachable except db error 118 | d.Panic("pebble flush", "err", err) 119 | } 120 | } 121 | 122 | func (d *db) Close() { 123 | if err := d.p.Close(); err != nil { 124 | // unreachable except db error 125 | d.Panic("pebble close", "err", err) 126 | } 127 | } 128 | 129 | func (d *db) Scan(start, end []byte) iter.Seq2[[]byte, func() []byte] { 130 | start = bytes.Clone(start) 131 | end = bytes.Clone(end) 132 | return func(yield func(key []byte, val func() []byte) bool) { 133 | // Note: Pebble's UpperBound is non-inclusive (not included in the scan) 134 | // but we want to include the key end in the scan, 135 | // so do not use UpperBound; we check during the iteration instead. 
136 | iter, err := d.p.NewIter(&pebble.IterOptions{ 137 | LowerBound: start, 138 | }) 139 | if err != nil { 140 | // unreachable except db error 141 | d.Panic("pebble new iterator", "start", storage.Fmt(start), "err", err) 142 | } 143 | defer iter.Close() 144 | for iter.First(); iter.Valid(); iter.Next() { 145 | key := iter.Key() 146 | if bytes.Compare(key, end) > 0 { 147 | break 148 | } 149 | val := func() []byte { 150 | v, err := iter.ValueAndErr() 151 | if err != nil { 152 | // unreachable except db error 153 | d.Panic("pebble iterator value", "key", storage.Fmt(key), "err", err) 154 | } 155 | return v 156 | } 157 | if !yield(key, val) { 158 | return 159 | } 160 | } 161 | } 162 | } 163 | 164 | func (d *db) Batch() storage.Batch { 165 | return &batch{d, d.p.NewBatch()} 166 | } 167 | 168 | func (b *batch) Set(key, val []byte) { 169 | if err := b.b.Set(key, val, noSync); err != nil { 170 | // unreachable except db error 171 | b.db.Panic("pebble batch set", "key", storage.Fmt(key), "val", storage.Fmt(val), "err", err) 172 | } 173 | } 174 | 175 | func (b *batch) Delete(key []byte) { 176 | if err := b.b.Delete(key, noSync); err != nil { 177 | // unreachable except db error 178 | b.db.Panic("pebble batch delete", "key", storage.Fmt(key), "err", err) 179 | } 180 | } 181 | 182 | func (b *batch) DeleteRange(start, end []byte) { 183 | err := cmp.Or( 184 | b.b.DeleteRange(start, end, noSync), 185 | b.b.Delete(end, noSync), 186 | ) 187 | if err != nil { 188 | // unreachable except db error 189 | b.db.Panic("pebble batch delete range", "start", storage.Fmt(start), "end", storage.Fmt(end), "err", err) 190 | } 191 | } 192 | 193 | func (b *batch) MaybeApply() bool { 194 | if b.b.Len() > 100e6 { 195 | b.Apply() 196 | return true 197 | } 198 | return false 199 | } 200 | 201 | func (b *batch) Apply() { 202 | if err := b.db.p.Apply(b.b, noSync); err != nil { 203 | // unreachable except db error 204 | b.db.Panic("pebble batch apply", "err", err) 205 | } 206 | b.b.Reset() 207 | } 208 
--------------------------------------------------------------------------------
/internal/github/edit.go:
--------------------------------------------------------------------------------
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package github

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"slices"
	"strings"
	"testing"
)

// NOTE: It's possible that we should elevate TestingEdit to a general
// “deferred edits” facility for use in looking at potential changes.
// On the other hand, higher-level code usually needs to know
// whether it's making changes or not, so that it can record that
// the work has been done, so normally “deferred edits” should be
// as high in the stack as possible, and the GitHub client is not.

// PostIssueComment posts a new comment with the given body (written in Markdown) on issue.
// In testing mode the edit is diverted: it is recorded in c.testEdits
// instead of being sent to GitHub.
func (c *Client) PostIssueComment(issue *Issue, changes *IssueCommentChanges) error {
	if c.divertEdits() {
		c.testMu.Lock()
		defer c.testMu.Unlock()

		c.testEdits = append(c.testEdits, &TestingEdit{
			Project:             issue.Project(),
			Issue:               issue.Number,
			IssueCommentChanges: changes.clone(),
		})
		return nil
	}

	return c.post(issue.URL+"/comments", changes)
}

// DownloadIssue downloads the current issue JSON from the given URL
// and decodes it into an issue.
// Given an issue, c.DownloadIssue(issue.URL) fetches the very latest state for the issue.
func (c *Client) DownloadIssue(url string) (*Issue, error) {
	x := new(Issue)
	_, err := c.get(url, "", x)
	if err != nil {
		return nil, err
	}
	return x, nil
}

// DownloadIssueComment downloads the current comment JSON from the given URL
// and decodes it into an IssueComment.
// Given a comment, c.DownloadIssueComment(comment.URL) fetches the very latest state for the comment.
func (c *Client) DownloadIssueComment(url string) (*IssueComment, error) {
	x := new(IssueComment)
	_, err := c.get(url, "", x)
	if err != nil {
		return nil, err
	}
	return x, nil
}

// An IssueCommentChanges specifies changes to make to an issue comment.
type IssueCommentChanges struct {
	Body string `json:"body,omitempty"`
}

// clone returns a copy of ch, so that a diverted (testing) edit
// is unaffected by later mutation of the caller's value.
func (ch *IssueCommentChanges) clone() *IssueCommentChanges {
	x := *ch
	ch = &x
	return ch
}

// EditIssueComment changes the comment on GitHub to have the new body.
// It is typically a good idea to use c.DownloadIssueComment first and check
// that the live comment body matches the one obtained from the database,
// to minimize race windows.
func (c *Client) EditIssueComment(comment *IssueComment, changes *IssueCommentChanges) error {
	if c.divertEdits() {
		c.testMu.Lock()
		defer c.testMu.Unlock()

		c.testEdits = append(c.testEdits, &TestingEdit{
			Project:             comment.Project(),
			Issue:               comment.Issue(),
			Comment:             comment.CommentID(),
			IssueCommentChanges: changes.clone(),
		})
		return nil
	}

	return c.patch(comment.URL, changes)
}

// An IssueChanges specifies changes to make to an issue.
// Fields that are the empty string or a nil pointer are ignored.
//
// Note that Labels is the new set of all labels for the issue,
// not labels to add. If you are adding a single label,
// you need to include all the existing labels as well.
// Labels is a *[]string so that it can be set to new([]string)
// to clear the labels.
type IssueChanges struct {
	Title  string    `json:"title,omitempty"`
	Body   string    `json:"body,omitempty"`
	State  string    `json:"state,omitempty"`
	Labels *[]string `json:"labels,omitempty"`
}

// clone returns a deep copy of ch, including the Labels slice,
// so that a diverted (testing) edit is isolated from the caller.
func (ch *IssueChanges) clone() *IssueChanges {
	x := *ch
	ch = &x
	if ch.Labels != nil {
		x := slices.Clone(*ch.Labels)
		ch.Labels = &x
	}
	return ch
}

// EditIssue applies the changes to issue on GitHub.
// In testing mode the edit is diverted into c.testEdits instead.
func (c *Client) EditIssue(issue *Issue, changes *IssueChanges) error {
	if c.divertEdits() {
		c.testMu.Lock()
		defer c.testMu.Unlock()

		c.testEdits = append(c.testEdits, &TestingEdit{
			Project:      issue.Project(),
			Issue:        issue.Number,
			IssueChanges: changes.clone(),
		})
		return nil
	}

	return c.patch(issue.URL, changes)
}

// patch is like c.get but makes a PATCH request.
// Unlike c.get, it requires authentication.
func (c *Client) patch(url string, changes any) error {
	return c.json("PATCH", url, changes)
}

// post is like c.get but makes a POST request.
// Unlike c.get, it requires authentication.
func (c *Client) post(url string, body any) error {
	return c.json("POST", url, body)
}

// json is the general PATCH/POST implementation.
152 | func (c *Client) json(method, url string, body any) error { 153 | js, err := json.Marshal(body) 154 | if err != nil { 155 | return err 156 | } 157 | 158 | auth, ok := c.secret.Get("api.github.com") 159 | if !ok && !testing.Testing() { 160 | return fmt.Errorf("no secret for api.github.com") 161 | } 162 | user, pass, _ := strings.Cut(auth, ":") 163 | 164 | Redo: 165 | req, err := http.NewRequest(method, url, bytes.NewReader(js)) 166 | if err != nil { 167 | return err 168 | } 169 | req.Header.Set("Content-Type", "application/json; charset=utf-8") 170 | req.SetBasicAuth(user, pass) 171 | resp, err := c.http.Do(req) 172 | if err != nil { 173 | return err 174 | } 175 | data, err := io.ReadAll(resp.Body) 176 | resp.Body.Close() 177 | if err != nil { 178 | return fmt.Errorf("reading body: %v", err) 179 | } 180 | if c.rateLimit(resp) { 181 | goto Redo 182 | } 183 | if resp.StatusCode/10 != 20 { // allow 200, 201, maybe others 184 | return fmt.Errorf("%s\n%s", resp.Status, data) 185 | } 186 | return nil 187 | } 188 | -------------------------------------------------------------------------------- /internal/storage/db.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package storage defines the storage abstractions needed for Gaby: 6 | // [DB], a basic key-value store, and [VectorDB], a vector database. 7 | // The storage needs are intentionally minimal (avoiding, for example, 8 | // a requirement on SQL), to admit as many implementations as possible. 9 | package storage 10 | 11 | import ( 12 | "bytes" 13 | "encoding/json" 14 | "fmt" 15 | "iter" 16 | "log/slog" 17 | "strconv" 18 | "strings" 19 | 20 | "rsc.io/ordered" 21 | ) 22 | 23 | // A DB is a key-value database. 24 | // 25 | // DB operations are assumed not to fail. 
26 | // They panic, intending to take down the program, 27 | // if there is an error accessing the database. 28 | // The assumption is that the program cannot possibly 29 | // continue without the database, since that's where all the state is stored. 30 | // Similarly, clients of DB conventionally panic if the database 31 | // returned corrupted data. 32 | // Code using multiple parallel database operations can recover 33 | // at the outermost calls. 34 | // Clients of DB 35 | type DB interface { 36 | // Lock acquires a lock on the given name, which need not exist in the database. 37 | // After a successful Lock(name), 38 | // any other call to Lock(name) from any other client of the database 39 | // (including in another process, for shared databases) 40 | // must block until Unlock(name) has been called. 41 | // In a shared database, a lock may also unlock 42 | // when the client disconnects or times out. 43 | Lock(name string) 44 | 45 | // Unlock releases the lock with the given name, 46 | // which the caller must have locked. 47 | Unlock(name string) 48 | 49 | // Set sets the value associated with key to val. 50 | Set(key, val []byte) 51 | 52 | // Get looks up the value associated with key. 53 | // If there is no entry for key in the database, Get returns nil, false. 54 | // Otherwise it returns val, true. 55 | Get(key []byte) (val []byte, ok bool) 56 | 57 | // Scan returns an iterator over all key-value pairs with start ≤ key ≤ end. 58 | // The second value in each iteration pair is a function returning the value, 59 | // not the value itself: 60 | // 61 | // for key, getVal := range db.Scan([]byte("aaa"), []byte("zzz")) { 62 | // val := getVal() 63 | // fmt.Printf("%q: %q\n", key, val) 64 | // } 65 | // 66 | // In iterations that only need the keys or only need the values for a subset of keys, 67 | // some DB implementations may avoid work when the value function is not called. 
68 | Scan(start, end []byte) iter.Seq2[[]byte, func() []byte] 69 | 70 | // Delete deletes any value associated with key. 71 | // Delete of an unset key is a no-op. 72 | Delete(key []byte) 73 | 74 | // DeleteRange deletes all key-value pairs with start ≤ key ≤ end. 75 | DeleteRange(start, end []byte) 76 | 77 | // Batch returns a new [Batch] that accumulates database mutations 78 | // to apply in an atomic operation. In addition to the atomicity, using a 79 | // Batch for bulk operations is more efficient than making each 80 | // change using repeated calls to DB's Set, Delete, and DeleteRange methods. 81 | Batch() Batch 82 | 83 | // Flush flushes DB changes to permanent storage. 84 | // Flush must be called before the process crashes or exits, 85 | // or else any changes since the previous Flush may be lost. 86 | Flush() 87 | 88 | // Close closes the database. 89 | // Like the other routines, it panics if an error happens, 90 | // so there is no error result. 91 | Close() 92 | 93 | // Panic logs the error message and args using the database's slog.Logger 94 | // and then panics with the text formatting of its arguments. 95 | // It is meant to be called when database corruption or other 96 | // database-related “can't happen” conditions been detected. 97 | Panic(msg string, args ...any) 98 | } 99 | 100 | // A Batch accumulates database mutations that are applied to a [DB] 101 | // as a single atomic operation. Applying bulk operations in a batch 102 | // is also more efficient than making individual [DB] method calls. 103 | // The batched operations apply in the order they are made. 104 | // For example, Set("a", "b") followed by Delete("a") is the same as 105 | // Delete("a"), while Delete("a") followed by Set("a", "b") is the same 106 | // as Set("a", "b"). 107 | type Batch interface { 108 | // Delete deletes any value associated with key. 109 | // Delete of an unset key is a no-op. 
110 | Delete(key []byte) 111 | 112 | // DeleteRange deletes all key-value pairs with start ≤ key ≤ end. 113 | DeleteRange(start, end []byte) 114 | 115 | // Set sets the value associated with key to val. 116 | Set(key, val []byte) 117 | 118 | // MaybeApply calls Apply if the batch is getting close to full. 119 | // Every Batch has a limit to how many operations can be batched, 120 | // so in a bulk operation where atomicity of the entire batch is not a concern, 121 | // calling MaybeApply gives the Batch implementation 122 | // permission to flush the batch at specific “safe points”. 123 | // A typical limit for a batch is about 100MB worth of logged operations. 124 | // MaybeApply reports whether it called Apply. 125 | MaybeApply() bool 126 | 127 | // Apply applies all the batched operations to the underlying DB 128 | // as a single atomic unit. 129 | // When Apply returns, the Batch is an empty batch ready for 130 | // more operations. 131 | Apply() 132 | } 133 | 134 | // Panic panics with the text formatting of its arguments. 135 | // It is meant to be called for database errors or corruption, 136 | // which have been defined to be impossible. 137 | // (See the [DB] documentation.) 138 | // 139 | // Panic is expected to be used by DB implementations. 140 | // DB clients should use the [DB.Panic] method instead. 141 | func Panic(msg string, args ...any) { 142 | var b bytes.Buffer 143 | slog.New(slog.NewTextHandler(&b, nil)).Error(msg, args...) 144 | s := b.String() 145 | if _, rest, ok := strings.Cut(s, " level=ERROR msg="); ok { 146 | s = rest 147 | } 148 | panic(strings.TrimSpace(s)) 149 | } 150 | 151 | // JSON converts x to JSON and returns the result. 152 | // It panics if there is any error converting x to JSON. 153 | // Since whether x can be converted to JSON depends 154 | // almost entirely on its type, a marshaling error indicates a 155 | // bug at the call site. 
156 | // 157 | // (The exception is certain malformed UTF-8 and floating-point 158 | // infinity and NaN. Code must be careful not to use JSON with those.) 159 | func JSON(x any) []byte { 160 | js, err := json.Marshal(x) 161 | if err != nil { 162 | panic(fmt.Sprintf("json.Marshal: %v", err)) 163 | } 164 | return js 165 | } 166 | 167 | // Fmt formats data for printing, 168 | // first trying [ordered.DecodeFmt] in case data is an [ordered encoding], 169 | // then trying a backquoted string if possible 170 | // (handling simple JSON data), 171 | // and finally resorting to [strconv.QuoteToASCII]. 172 | func Fmt(data []byte) string { 173 | if s, err := ordered.DecodeFmt(data); err == nil { 174 | return s 175 | } 176 | s := string(data) 177 | if strconv.CanBackquote(s) { 178 | return "`" + s + "`" 179 | } 180 | return strconv.QuoteToASCII(s) 181 | } 182 | -------------------------------------------------------------------------------- /internal/related/related_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
package related

import (
	"fmt"
	"maps"
	"slices"
	"strings"
	"testing"
	"time"

	"rsc.io/gaby/internal/diff"
	"rsc.io/gaby/internal/docs"
	"rsc.io/gaby/internal/embeddocs"
	"rsc.io/gaby/internal/github"
	"rsc.io/gaby/internal/githubdocs"
	"rsc.io/gaby/internal/llm"
	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/testutil"
)

// Test runs the poster end to end over in-memory GitHub and database
// fixtures, comparing posted "Related Issues" comments against the
// golden bodies post13 and post19 under several configurations:
// posts disabled, posts enabled, repeat runs, skip filters,
// an impossible minimum score, a future time limit, and zero max results.
func Test(t *testing.T) {
	lg := testutil.Slogger(t)
	db := storage.MemDB()
	gh := github.New(lg, db, nil, nil)
	gh.Testing().LoadTxtar("../testdata/markdown.txt")
	gh.Testing().LoadTxtar("../testdata/rsctmp.txt")

	dc := docs.New(db)
	githubdocs.Sync(lg, dc, gh)

	vdb := storage.MemVectorDB(db, lg, "vecs")
	embeddocs.Sync(lg, vdb, llm.QuoteEmbedder(), dc)

	// NOTE(review): a second MemVectorDB is opened over the same db and
	// namespace — presumably to exercise reading the vectors back from
	// storage rather than from memory; confirm that is the intent.
	vdb = storage.MemVectorDB(db, lg, "vecs")
	p := New(lg, db, gh, vdb, dc, "postname")
	p.EnableProject("rsc/markdown")
	p.SetTimeLimit(time.Time{})
	p.Run()
	// Posts have not been enabled, so Run must not produce any edits.
	checkEdits(t, gh.Testing().Edits(), nil)
	gh.Testing().ClearEdits()

	p.EnablePosts()
	p.Run()
	checkEdits(t, gh.Testing().Edits(), map[int64]string{13: post13, 19: post19})
	gh.Testing().ClearEdits()

	// A poster with a fresh name still posts nothing: the earlier posts
	// are remembered (presumably recorded in db, not per poster name —
	// verify against the implementation).
	p = New(lg, db, gh, vdb, dc, "postname2")
	p.EnableProject("rsc/markdown")
	p.SetTimeLimit(time.Time{})
	p.EnablePosts()
	p.Run()
	checkEdits(t, gh.Testing().Edits(), nil)
	gh.Testing().ClearEdits()

	// Each skip filter below should suppress the post on issue 19
	// while leaving the post on issue 13 intact.
	for i := range 4 {
		p := New(lg, db, gh, vdb, dc, "postnameloop."+fmt.Sprint(i))
		p.EnableProject("rsc/markdown")
		p.SetTimeLimit(time.Time{})
		switch i {
		case 0:
			p.SkipTitlePrefix("feature: ")
		case 1:
			p.SkipTitleSuffix("for heading")
		case 2:
			p.SkipBodyContains("For example, this heading")
		case 3:
			p.SkipBodyContains("For example, this heading")
			p.SkipBodyContains("ZZZ")
		}
		p.EnablePosts()
		p.deletePosted()
		p.Run()
		checkEdits(t, gh.Testing().Edits(), map[int64]string{13: post13})
		gh.Testing().ClearEdits()
	}

	p = New(lg, db, gh, vdb, dc, "postname3")
	p.EnableProject("rsc/markdown")
	p.SetMinScore(2.0) // impossible
	p.SetTimeLimit(time.Time{})
	p.EnablePosts()
	p.deletePosted()
	p.Run()
	checkEdits(t, gh.Testing().Edits(), nil)
	gh.Testing().ClearEdits()

	p = New(lg, db, gh, vdb, dc, "postname4")
	p.EnableProject("rsc/markdown")
	p.SetMinScore(2.0) // impossible
	p.SetTimeLimit(time.Date(2222, 1, 1, 1, 1, 1, 1, time.UTC))
	p.EnablePosts()
	p.deletePosted()
	p.Run()
	checkEdits(t, gh.Testing().Edits(), nil)
	gh.Testing().ClearEdits()

	p = New(lg, db, gh, vdb, dc, "postname5")
	p.EnableProject("rsc/markdown")
	p.SetMinScore(0)   // everything
	p.SetMaxResults(0) // except none
	p.SetTimeLimit(time.Time{})
	p.EnablePosts()
	p.deletePosted()
	p.Run()
	checkEdits(t, gh.Testing().Edits(), nil)
	gh.Testing().ClearEdits()

}

// checkEdits checks that edits contains exactly one posted comment for each
// issue listed in want (issue number → expected body, compared after
// trimming surrounding space) on rsc/markdown, and nothing else.
// It consumes entries from want and stops the test on the first failure.
func checkEdits(t *testing.T, edits []*github.TestingEdit, want map[int64]string) {
	t.Helper()
	for _, e := range edits {
		if e.Project != "rsc/markdown" {
			t.Errorf("posted to unexpected project: %v", e)
			continue
		}
		if e.Comment != 0 || e.IssueCommentChanges == nil {
			t.Errorf("non-post edit: %v", e)
			continue
		}
		w, ok := want[e.Issue]
		if !ok {
			t.Errorf("post to unexpected issue: %v", e)
			continue
		}
		delete(want, e.Issue)
		if strings.TrimSpace(e.IssueCommentChanges.Body) != strings.TrimSpace(w) {
			t.Errorf("rsc/markdown#%d: wrong post:\n%s", e.Issue,
				string(diff.Diff("want", []byte(w), "have", []byte(e.IssueCommentChanges.Body))))
		}
	}
	for _, issue := range slices.Sorted(maps.Keys(want)) {
		t.Errorf("did not see post on rsc/markdown#%d", issue)
	}
	if t.Failed() {
		t.FailNow()
	}
}

// post13 and post19 are the golden comment bodies for rsc/markdown#13 and
// rsc/markdown#19. QUOT stands in for a backtick (see unQUOT), since a raw
// string literal cannot contain one.
var post13 = unQUOT(`**Related Issues**

 - [goldmark and markdown diff with h1 inside p #6 (closed)](https://github.com/rsc/markdown/issues/6)
 - [Support escaped \QUOT|\QUOT in table cells #9 (closed)](https://github.com/rsc/markdown/issues/9)
 - [markdown: fix markdown printing for inline code #12 (closed)](https://github.com/rsc/markdown/issues/12)
 - [markdown: emit Info in CodeBlock markdown #18 (closed)](https://github.com/rsc/markdown/issues/18)
 - [feature: synthesize lowercase anchors for heading #19](https://github.com/rsc/markdown/issues/19)
 - [Replace newlines with spaces in alt text #4 (closed)](https://github.com/rsc/markdown/issues/4)
 - [allow capital X in task list items #2 (closed)](https://github.com/rsc/markdown/issues/2)
 - [build(deps): bump golang.org/x/text from 0.3.6 to 0.3.8 in /rmplay #10](https://github.com/rsc/tmp/issues/10)
 - [Render reference links in Markdown #14 (closed)](https://github.com/rsc/markdown/issues/14)
 - [Render reference links in Markdown #15 (closed)](https://github.com/rsc/markdown/issues/15)

(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in [this discussion](https://github.com/golang/go/discussions/67901).)
`)

var post19 = unQUOT(`**Related Issues**

 - [allow capital X in task list items #2 (closed)](https://github.com/rsc/markdown/issues/2)
 - [Support escaped \QUOT|\QUOT in table cells #9 (closed)](https://github.com/rsc/markdown/issues/9)
 - [goldmark and markdown diff with h1 inside p #6 (closed)](https://github.com/rsc/markdown/issues/6)
 - [Render reference links in Markdown #14 (closed)](https://github.com/rsc/markdown/issues/14)
 - [Render reference links in Markdown #15 (closed)](https://github.com/rsc/markdown/issues/15)
 - [Empty column heading not recognized in table #7 (closed)](https://github.com/rsc/markdown/issues/7)
 - [Correctly render reference links in Markdown #13](https://github.com/rsc/markdown/issues/13)
 - [markdown: fix markdown printing for inline code #12 (closed)](https://github.com/rsc/markdown/issues/12)
 - [Replace newlines with spaces in alt text #4 (closed)](https://github.com/rsc/markdown/issues/4)
 - [build(deps): bump golang.org/x/text from 0.3.6 to 0.3.8 in /rmplay #10](https://github.com/rsc/tmp/issues/10)

(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in [this discussion](https://github.com/golang/go/discussions/67901).)
`)

// unQUOT replaces each QUOT with a backtick, letting the golden text above
// be written inside a raw string literal.
func unQUOT(s string) string { return strings.ReplaceAll(s, "QUOT", "`") }
-------------------------------------------------------------------------------- /internal/commentfix/fix_test.go: --------------------------------------------------------------------------------

// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package commentfix

import (
	"bytes"
	"io"
	"path/filepath"
	"strings"
	"testing"
	"text/template"
	"time"

	"golang.org/x/tools/txtar"
	"rsc.io/gaby/internal/diff"
	"rsc.io/gaby/internal/github"
	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/testutil"
)

// TestTestdata runs the golden tests in testdata/*.txt.
// Each txtar file's comment is a template executed against a Fixer to
// configure it; the files then come in .in/.out pairs, where Fix applied
// to the .in body must produce exactly the .out body.
func TestTestdata(t *testing.T) {
	files, err := filepath.Glob("testdata/*.txt")
	testutil.Check(t, err)
	for _, file := range files {
		t.Run(filepath.Base(file), func(t *testing.T) {
			a, err := txtar.ParseFile(file)
			testutil.Check(t, err)
			var f Fixer
			tmpl, err := new(template.Template).Parse(string(a.Comment))
			testutil.Check(t, err)
			testutil.Check(t, tmpl.Execute(io.Discard, &f))
			for i := 0; i+2 <= len(a.Files); {
				in := a.Files[i]
				out := a.Files[i+1]
				i += 2
				name := strings.TrimSuffix(in.Name, ".in")
				if name != strings.TrimSuffix(out.Name, ".out") {
					t.Fatalf("mismatched file pair: %s and %s", in.Name, out.Name)
				}
				t.Run(name, func(t *testing.T) {
					newBody, fixed := f.Fix(string(in.Data))
					// Fix must report fixed exactly when it returns a new body.
					if fixed != (newBody != "") {
						t.Fatalf("Fix() = %q, %v (len(newBody)=%d but fixed=%v)", newBody, fixed, len(newBody), fixed)
					}
					if newBody != string(out.Data) {
						t.Fatalf("Fix: incorrect output:\n%s", string(diff.Diff("want", []byte(out.Data), "have", []byte(newBody))))
					}
				})
			}
		})
	}
}

// TestPanics checks that EnableEdits, EnableProject, and Run all panic
// when called on a zero Fixer.
func TestPanics(t *testing.T) {
	callRecover := func() { recover() }

	func() {
		defer callRecover()
		var f Fixer
		f.EnableEdits()
		t.Errorf("EnableEdits on zero Fixer did not panic")
	}()

	func() {
		defer callRecover()
		var f Fixer
		f.EnableProject("abc/xyz")
		t.Errorf("EnableProject on zero Fixer did not panic")
	}()

	func() {
		defer callRecover()
		var f Fixer
		f.Run()
		t.Errorf("Run on zero Fixer did not panic")
	}()
}

// TestErrors checks that AutoLink, ReplaceText, and ReplaceURL reject an
// invalid regular expression with an error.
func TestErrors(t *testing.T) {
	var f Fixer
	if err := f.AutoLink(`\`, ""); err == nil {
		t.Fatalf("AutoLink succeeded on bad regexp")
	}
	if err := f.ReplaceText(`\`, ""); err == nil {
		t.Fatalf("ReplaceText succeeded on bad regexp")
	}
	if err := f.ReplaceURL(`\`, ""); err == nil {
		t.Fatalf("ReplaceText succeeded on bad regexp")
	}
}

// TestGitHub runs the Fixer against an in-memory GitHub fixture (one issue,
// one pull request, and two comments on the issue), checking the logged
// behavior under different cutoffs, edit enablement, and project settings.
func TestGitHub(t *testing.T) {
	// testGH builds a fresh in-memory GitHub client populated with the fixture.
	testGH := func() *github.Client {
		db := storage.MemDB()
		gh := github.New(testutil.Slogger(t), db, nil, nil)
		gh.Testing().AddIssue("rsc/tmp", &github.Issue{
			Number:    18,
			Title:     "spellchecking",
			Body:      "Contexts are cancelled.",
			CreatedAt: "2024-06-17T20:16:49-04:00",
			UpdatedAt: "2024-06-17T20:16:49-04:00",
		})
		gh.Testing().AddIssue("rsc/tmp", &github.Issue{
			Number:      19,
			Title:       "spellchecking",
			Body:        "Contexts are cancelled.",
			CreatedAt:   "2024-06-17T20:16:49-04:00",
			UpdatedAt:   "2024-06-17T20:16:49-04:00",
			PullRequest: new(struct{}),
		})

		gh.Testing().AddIssueComment("rsc/tmp", 18, &github.IssueComment{
			Body:      "No really, contexts are cancelled.",
			CreatedAt: "2024-06-17T20:16:49-04:00",
			UpdatedAt: "2024-06-17T20:16:49-04:00",
		})

		gh.Testing().AddIssueComment("rsc/tmp", 18, &github.IssueComment{
			Body:      "Completely unrelated.",
			CreatedAt: "2024-06-17T20:16:49-04:00",
			UpdatedAt: "2024-06-17T20:16:49-04:00",
		})

		return gh
	}

	// Check for comment with too-new cutoff and edits disabled.
	// Finds nothing but also no-op.
	gh := testGH()
	lg, buf := testutil.SlogBuffer()
	f := New(lg, gh, "fixer1")
	f.SetStderr(testutil.LogWriter(t))
	f.EnableProject("rsc/tmp")
	f.SetTimeLimit(time.Date(2222, 1, 1, 1, 1, 1, 1, time.UTC))
	f.ReplaceText("cancelled", "canceled")
	f.Run()
	// t.Logf("output:\n%s", buf)
	if bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs mention rewrite of old comment:\n%s", buf.Bytes())
	}

	// Check again with old enough cutoff.
	// Finds comment but does not edit, does not advance cursor.
	f = New(lg, gh, "fixer1")
	f.SetStderr(testutil.LogWriter(t))
	f.EnableProject("rsc/tmp")
	f.SetTimeLimit(time.Time{})
	f.ReplaceText("cancelled", "canceled")
	f.Run()
	// t.Logf("output:\n%s", buf)
	if !bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs do not mention rewrite of comment:\n%s", buf.Bytes())
	}
	if bytes.Contains(buf.Bytes(), []byte("editing github")) {
		t.Fatalf("logs incorrectly mention editing github:\n%s", buf.Bytes())
	}

	// Run with too-new cutoff and edits enabled, should make issue not seen again.
	buf.Truncate(0)
	f.SetTimeLimit(time.Date(2222, 1, 1, 1, 1, 1, 1, time.UTC))
	f.EnableEdits()
	f.Run()
	// t.Logf("output:\n%s", buf)
	if bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs incorrectly mention rewrite of comment:\n%s", buf.Bytes())
	}

	f.SetTimeLimit(time.Time{})
	f.Run()
	// t.Logf("output:\n%s", buf)
	if bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs incorrectly mention rewrite of comment:\n%s", buf.Bytes())
	}

	// Write comment (now using fixer2 to avoid 'marked as old' in fixer1).
	lg, buf = testutil.SlogBuffer()
	f = New(lg, gh, "fixer2")
	f.SetStderr(testutil.LogWriter(t))
	f.EnableProject("rsc/tmp")
	f.ReplaceText("cancelled", "canceled")
	f.SetTimeLimit(time.Time{})
	f.EnableEdits()
	f.Run()
	// t.Logf("output:\n%s", buf)
	if !bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs do not mention rewrite of comment:\n%s", buf.Bytes())
	}
	if !bytes.Contains(buf.Bytes(), []byte("editing github")) {
		t.Fatalf("logs do not mention editing github:\n%s", buf.Bytes())
	}
	if !bytes.Contains(buf.Bytes(), []byte(`editing github" url=https://api.github.com/repos/rsc/tmp/issues/18`)) {
		t.Fatalf("logs do not mention editing issue body:\n%s", buf.Bytes())
	}
	if bytes.Contains(buf.Bytes(), []byte(`editing github" url=https://api.github.com/repos/rsc/tmp/issues/19`)) {
		t.Fatalf("logs incorrectly mention editing pull request body:\n%s", buf.Bytes())
	}
	if !bytes.Contains(buf.Bytes(), []byte(`editing github" url=https://api.github.com/repos/rsc/tmp/issues/comments/10000000001`)) {
		t.Fatalf("logs do not mention editing issue comment:\n%s", buf.Bytes())
	}
	if bytes.Contains(buf.Bytes(), []byte("ERROR")) {
		t.Fatalf("editing failed:\n%s", buf.Bytes())
	}

	// Try again; comment should now be marked old in watcher.
	lg, buf = testutil.SlogBuffer()
	f = New(lg, gh, "fixer2")
	f.SetStderr(testutil.LogWriter(t))
	f.EnableProject("rsc/tmp")
	f.ReplaceText("cancelled", "canceled")
	f.EnableEdits()
	f.SetTimeLimit(time.Time{})
	f.Run()
	// t.Logf("output:\n%s", buf)
	if bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs incorrectly mention rewrite of comment:\n%s", buf.Bytes())
	}

	// Check that not enabling the project doesn't edit comments.
	lg, buf = testutil.SlogBuffer()
	f = New(lg, gh, "fixer3")
	f.SetStderr(testutil.LogWriter(t))
	f.EnableProject("xyz/tmp")
	f.ReplaceText("cancelled", "canceled")
	f.EnableEdits()
	f.SetTimeLimit(time.Time{})
	f.Run()
	// t.Logf("output:\n%s", buf)
	if bytes.Contains(buf.Bytes(), []byte("commentfix rewrite")) {
		t.Fatalf("logs incorrectly mention rewrite of comment:\n%s", buf.Bytes())
	}
}
-------------------------------------------------------------------------------- /internal/diff/diff.go: --------------------------------------------------------------------------------

// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package diff

import (
	"bytes"
	"fmt"
	"sort"
	"strings"
)

// A pair is a pair of values tracked for both the x and y side of a diff.
// It is typically a pair of line indexes.
type pair struct{ x, y int }

// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions.
An anchored diff is usually clearer 34 | // than a standard diff, because the algorithm does not try to 35 | // reuse unrelated blank lines or closing braces. 36 | // The algorithm also guarantees to run in O(n log n) time 37 | // instead of the standard O(n²) time. 38 | // 39 | // Some systems call this approach a “patience diff,” named for 40 | // the “patience sorting” algorithm, itself named for a solitaire card game. 41 | // We avoid that name for two reasons. First, the name has been used 42 | // for a few different variants of the algorithm, so it is imprecise. 43 | // Second, the name is frequently interpreted as meaning that you have 44 | // to wait longer (to be patient) for the diff, meaning that it is a slower algorithm, 45 | // when in fact the algorithm is faster than the standard one. 46 | func Diff(oldName string, old []byte, newName string, new []byte) []byte { 47 | if bytes.Equal(old, new) { 48 | return nil 49 | } 50 | x := lines(old) 51 | y := lines(new) 52 | 53 | // Print diff header. 54 | var out bytes.Buffer 55 | fmt.Fprintf(&out, "diff %s %s\n", oldName, newName) 56 | fmt.Fprintf(&out, "--- %s\n", oldName) 57 | fmt.Fprintf(&out, "+++ %s\n", newName) 58 | 59 | // Loop over matches to consider, 60 | // expanding each match to include surrounding lines, 61 | // and then printing diff chunks. 62 | // To avoid setup/teardown cases outside the loop, 63 | // tgs returns a leading {0,0} and trailing {len(x), len(y)} pair 64 | // in the sequence of matches. 65 | var ( 66 | done pair // printed up to x[:done.x] and y[:done.y] 67 | chunk pair // start lines of current chunk 68 | count pair // number of lines from each side in current chunk 69 | ctext []string // lines for current chunk 70 | ) 71 | for _, m := range tgs(x, y) { 72 | if m.x < done.x { 73 | // Already handled scanning forward from earlier match. 74 | continue 75 | } 76 | 77 | // Expand matching lines as far as possible, 78 | // establishing that x[start.x:end.x] == y[start.y:end.y]. 
79 | // Note that on the first (or last) iteration we may (or definitely do) 80 | // have an empty match: start.x==end.x and start.y==end.y. 81 | start := m 82 | for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] { 83 | start.x-- 84 | start.y-- 85 | } 86 | end := m 87 | for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] { 88 | end.x++ 89 | end.y++ 90 | } 91 | 92 | // Emit the mismatched lines before start into this chunk. 93 | // (No effect on first sentinel iteration, when start = {0,0}.) 94 | for _, s := range x[done.x:start.x] { 95 | ctext = append(ctext, "-"+s) 96 | count.x++ 97 | } 98 | for _, s := range y[done.y:start.y] { 99 | ctext = append(ctext, "+"+s) 100 | count.y++ 101 | } 102 | 103 | // If we're not at EOF and have too few common lines, 104 | // the chunk includes all the common lines and continues. 105 | const C = 3 // number of context lines 106 | if (end.x < len(x) || end.y < len(y)) && 107 | (end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) { 108 | for _, s := range x[start.x:end.x] { 109 | ctext = append(ctext, " "+s) 110 | count.x++ 111 | count.y++ 112 | } 113 | done = end 114 | continue 115 | } 116 | 117 | // End chunk with common lines for context. 118 | if len(ctext) > 0 { 119 | n := end.x - start.x 120 | if n > C { 121 | n = C 122 | } 123 | for _, s := range x[start.x : start.x+n] { 124 | ctext = append(ctext, " "+s) 125 | count.x++ 126 | count.y++ 127 | } 128 | done = pair{start.x + n, start.y + n} 129 | 130 | // Format and emit chunk. 131 | // Convert line numbers to 1-indexed. 132 | // Special case: empty file shows up as 0,0 not 1,0. 133 | if count.x > 0 { 134 | chunk.x++ 135 | } 136 | if count.y > 0 { 137 | chunk.y++ 138 | } 139 | fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y) 140 | for _, s := range ctext { 141 | out.WriteString(s) 142 | } 143 | count.x = 0 144 | count.y = 0 145 | ctext = ctext[:0] 146 | } 147 | 148 | // If we reached EOF, we're done. 
149 | if end.x >= len(x) && end.y >= len(y) { 150 | break 151 | } 152 | 153 | // Otherwise start a new chunk. 154 | chunk = pair{end.x - C, end.y - C} 155 | for _, s := range x[chunk.x:end.x] { 156 | ctext = append(ctext, " "+s) 157 | count.x++ 158 | count.y++ 159 | } 160 | done = end 161 | } 162 | 163 | return out.Bytes() 164 | } 165 | 166 | // lines returns the lines in the file x, including newlines. 167 | // If the file does not end in a newline, one is supplied 168 | // along with a warning about the missing newline. 169 | func lines(x []byte) []string { 170 | l := strings.SplitAfter(string(x), "\n") 171 | if l[len(l)-1] == "" { 172 | l = l[:len(l)-1] 173 | } else { 174 | // Treat last line as having a message about the missing newline attached, 175 | // using the same text as BSD/GNU diff (including the leading backslash). 176 | l[len(l)-1] += "\n\\ No newline at end of file\n" 177 | } 178 | return l 179 | } 180 | 181 | // tgs returns the pairs of indexes of the longest common subsequence 182 | // of unique lines in x and y, where a unique line is one that appears 183 | // once in x and once in y. 184 | // 185 | // The longest common subsequence algorithm is as described in 186 | // Thomas G. Szymanski, “A Special Case of the Maximal Common 187 | // Subsequence Problem,” Princeton TR #170 (January 1975), 188 | // available at https://research.swtch.com/tgs170.pdf. 189 | func tgs(x, y []string) []pair { 190 | // Count the number of times each string appears in a and b. 191 | // We only care about 0, 1, many, counted as 0, -1, -2 192 | // for the x side and 0, -4, -8 for the y side. 193 | // Using negative numbers now lets us distinguish positive line numbers later. 194 | m := make(map[string]int) 195 | for _, s := range x { 196 | if c := m[s]; c > -2 { 197 | m[s] = c - 1 198 | } 199 | } 200 | for _, s := range y { 201 | if c := m[s]; c > -8 { 202 | m[s] = c - 4 203 | } 204 | } 205 | 206 | // Now unique strings can be identified by m[s] = -1+-4. 
207 | // 208 | // Gather the indexes of those strings in x and y, building: 209 | // xi[i] = increasing indexes of unique strings in x. 210 | // yi[i] = increasing indexes of unique strings in y. 211 | // inv[i] = index j such that x[xi[i]] = y[yi[j]]. 212 | var xi, yi, inv []int 213 | for i, s := range y { 214 | if m[s] == -1+-4 { 215 | m[s] = len(yi) 216 | yi = append(yi, i) 217 | } 218 | } 219 | for i, s := range x { 220 | if j, ok := m[s]; ok && j >= 0 { 221 | xi = append(xi, i) 222 | inv = append(inv, j) 223 | } 224 | } 225 | 226 | // Apply Algorithm A from Szymanski's paper. 227 | // In those terms, A = J = inv and B = [0, n). 228 | // We add sentinel pairs {0,0}, and {len(x),len(y)} 229 | // to the returned sequence, to help the processing loop. 230 | J := inv 231 | n := len(xi) 232 | T := make([]int, n) 233 | L := make([]int, n) 234 | for i := range T { 235 | T[i] = n + 1 236 | } 237 | for i := 0; i < n; i++ { 238 | k := sort.Search(n, func(k int) bool { 239 | return T[k] >= J[i] 240 | }) 241 | T[k] = J[i] 242 | L[i] = k + 1 243 | } 244 | k := 0 245 | for _, v := range L { 246 | if k < v { 247 | k = v 248 | } 249 | } 250 | seq := make([]pair, 2+k) 251 | seq[1+k] = pair{len(x), len(y)} // sentinel at end 252 | lastj := n 253 | for i := n - 1; i >= 0; i-- { 254 | if L[i] == k && J[i] < lastj { 255 | seq[k] = pair{xi[i], yi[J[i]]} 256 | k-- 257 | } 258 | } 259 | seq[0] = pair{0, 0} // sentinel at start 260 | return seq 261 | } 262 | -------------------------------------------------------------------------------- /internal/storage/mem.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 

package storage

import (
	"bytes"
	"fmt"
	"iter"
	"log/slog"
	"slices"
	"sync"

	"rsc.io/gaby/internal/llm"
	"rsc.io/omap"
	"rsc.io/ordered"
	"rsc.io/top"
)

// A MemLocker is a single-process implementation
// of the database Lock and Unlock methods,
// suitable if there is only one process accessing the
// database at a time.
type MemLocker struct {
	mu    sync.Mutex // guards locks map
	locks map[string]*sync.Mutex
}

// Lock locks the mutex with the given name,
// creating it on first use.
func (l *MemLocker) Lock(name string) {
	l.mu.Lock()
	if l.locks == nil {
		l.locks = make(map[string]*sync.Mutex)
	}
	mu := l.locks[name]
	if mu == nil {
		mu = new(sync.Mutex)
		l.locks[name] = mu
	}
	l.mu.Unlock()

	// Acquire the named mutex after releasing l.mu, so that
	// blocking on one name does not block operations on other names.
	mu.Lock()
}

// Unlock unlocks the mutex with the given name.
// It panics if the named mutex has never been locked.
func (l *MemLocker) Unlock(name string) {
	l.mu.Lock()
	mu := l.locks[name]
	l.mu.Unlock()
	if mu == nil {
		panic("Unlock of never locked key")
	}
	mu.Unlock()
}

// MemDB returns an in-memory DB implementation.
func MemDB() DB {
	return new(memDB)
}

// A memDB is an in-memory DB implementation.
type memDB struct {
	MemLocker
	mu   sync.RWMutex // guards data
	data omap.Map[string, []byte]
}

func (*memDB) Close() {}

func (*memDB) Panic(msg string, args ...any) {
	Panic(msg, args...)
}

// Get returns the value associated with the key.
func (db *memDB) Get(key []byte) (val []byte, ok bool) {
	db.mu.RLock()
	v, ok := db.data.Get(string(key))
	db.mu.RUnlock()
	if ok {
		// Clone so the caller cannot mutate the stored value.
		v = bytes.Clone(v)
	}
	return v, ok
}

// Scan returns an iterator over all key-value pairs
// in the range start ≤ key ≤ end.
88 | func (db *memDB) Scan(start, end []byte) iter.Seq2[[]byte, func() []byte] { 89 | lo := string(start) 90 | hi := string(end) 91 | return func(yield func(key []byte, val func() []byte) bool) { 92 | db.mu.RLock() 93 | locked := true 94 | defer func() { 95 | if locked { 96 | db.mu.RUnlock() 97 | } 98 | }() 99 | for k, v := range db.data.Scan(lo, hi) { 100 | key := []byte(k) 101 | val := func() []byte { return bytes.Clone(v) } 102 | db.mu.RUnlock() 103 | locked = false 104 | if !yield(key, val) { 105 | return 106 | } 107 | db.mu.RLock() 108 | locked = true 109 | } 110 | } 111 | } 112 | 113 | // Delete deletes any entry with the given key. 114 | func (db *memDB) Delete(key []byte) { 115 | db.mu.Lock() 116 | defer db.mu.Unlock() 117 | 118 | db.data.Delete(string(key)) 119 | } 120 | 121 | // DeleteRange deletes all entries with start ≤ key ≤ end. 122 | func (db *memDB) DeleteRange(start, end []byte) { 123 | db.mu.Lock() 124 | defer db.mu.Unlock() 125 | 126 | db.data.DeleteRange(string(start), string(end)) 127 | } 128 | 129 | // Set sets the value associated with key to val. 130 | func (db *memDB) Set(key, val []byte) { 131 | db.mu.Lock() 132 | defer db.mu.Unlock() 133 | 134 | db.data.Set(string(key), bytes.Clone(val)) 135 | } 136 | 137 | // Batch returns a new batch. 138 | func (db *memDB) Batch() Batch { 139 | return &memBatch{db: db} 140 | } 141 | 142 | // Flush flushes everything to persistent storage. 143 | // Since this is an in-memory database, the memory is as persistent as it gets. 144 | func (db *memDB) Flush() { 145 | } 146 | 147 | // A memBatch is a Batch for a memDB. 
148 | type memBatch struct { 149 | db *memDB // underlying database 150 | ops []func() // operations to apply 151 | } 152 | 153 | func (b *memBatch) Set(key, val []byte) { 154 | k := string(key) 155 | v := bytes.Clone(val) 156 | b.ops = append(b.ops, func() { b.db.data.Set(k, v) }) 157 | } 158 | 159 | func (b *memBatch) Delete(key []byte) { 160 | k := string(key) 161 | b.ops = append(b.ops, func() { b.db.data.Delete(k) }) 162 | } 163 | 164 | func (b *memBatch) DeleteRange(start, end []byte) { 165 | s := string(start) 166 | e := string(end) 167 | b.ops = append(b.ops, func() { b.db.data.DeleteRange(s, e) }) 168 | } 169 | 170 | func (b *memBatch) MaybeApply() bool { 171 | return false 172 | } 173 | 174 | func (b *memBatch) Apply() { 175 | b.db.mu.Lock() 176 | defer b.db.mu.Unlock() 177 | 178 | for _, op := range b.ops { 179 | op() 180 | } 181 | } 182 | 183 | // A memVectorDB is a VectorDB implementing in-memory search 184 | // but storing its vectors in an underlying DB. 185 | type memVectorDB struct { 186 | storage DB 187 | slog *slog.Logger 188 | namespace string 189 | 190 | mu sync.RWMutex 191 | cache map[string][]float32 // in-memory cache of all vectors, indexed by id 192 | } 193 | 194 | // MemVectorDB returns a VectorDB that stores its vectors in db 195 | // but uses a cached, in-memory copy to implement Search using 196 | // a brute-force scan. 197 | // 198 | // The namespace is incorporated into the keys used in the underlying db, 199 | // to allow multiple vector databases to be stored in a single [DB]. 200 | // 201 | // When MemVectorDB is called, it reads all previously stored vectors 202 | // from db; after that, changes must be made using the MemVectorDB 203 | // Set method. 204 | // 205 | // A MemVectorDB requires approximately 3kB of memory per stored vector. 206 | // 207 | // The db keys used by a MemVectorDB have the form 208 | // 209 | // ordered.Encode("llm.Vector", namespace, id) 210 | // 211 | // where id is the document ID passed to Set. 
212 | func MemVectorDB(db DB, lg *slog.Logger, namespace string) VectorDB { 213 | // NOTE: The worst case score error in a dot product over 768 entries 214 | // caused by quantization error of e is approximately 54e, 215 | // so quantizing to int16s would only introduce a maximum score 216 | // error of 0.00165, which would not change results significantly. 217 | // So we could cut the memory per stored vector in half by 218 | // quantizing to int16. 219 | 220 | vdb := &memVectorDB{ 221 | storage: db, 222 | slog: lg, 223 | namespace: namespace, 224 | cache: make(map[string][]float32), 225 | } 226 | 227 | // Load all the previously-stored vectors. 228 | vdb.cache = make(map[string][]float32) 229 | for key, getVal := range vdb.storage.Scan( 230 | ordered.Encode("llm.Vector", namespace), 231 | ordered.Encode("llm.Vector", namespace, ordered.Inf)) { 232 | 233 | var id string 234 | if err := ordered.Decode(key, nil, nil, &id); err != nil { 235 | // unreachable except data corruption 236 | panic(fmt.Errorf("MemVectorDB decode key=%v: %v", Fmt(key), err)) 237 | } 238 | val := getVal() 239 | if len(val)%4 != 0 { 240 | // unreachable except data corruption 241 | panic(fmt.Errorf("MemVectorDB decode key=%v bad len(val)=%d", Fmt(key), len(val))) 242 | } 243 | var vec llm.Vector 244 | vec.Decode(val) 245 | vdb.cache[id] = vec 246 | } 247 | 248 | vdb.slog.Info("loaded vectordb", "n", len(vdb.cache), "namespace", namespace) 249 | return vdb 250 | } 251 | 252 | func (db *memVectorDB) Set(id string, vec llm.Vector) { 253 | db.storage.Set(ordered.Encode("llm.Vector", db.namespace, id), vec.Encode()) 254 | 255 | db.mu.Lock() 256 | db.cache[id] = slices.Clone(vec) 257 | db.mu.Unlock() 258 | } 259 | 260 | func (db *memVectorDB) Get(name string) (llm.Vector, bool) { 261 | db.mu.RLock() 262 | vec, ok := db.cache[name] 263 | db.mu.RUnlock() 264 | return vec, ok 265 | } 266 | 267 | func (db *memVectorDB) Search(target llm.Vector, n int) []VectorResult { 268 | db.mu.RLock() 269 | defer 
db.mu.RUnlock() 270 | best := top.New(n, VectorResult.cmp) 271 | for name, vec := range db.cache { 272 | if len(vec) != len(target) { 273 | continue 274 | } 275 | best.Add(VectorResult{name, target.Dot(vec)}) 276 | } 277 | return best.Take() 278 | } 279 | 280 | func (db *memVectorDB) Flush() { 281 | db.storage.Flush() 282 | } 283 | 284 | // memVectorBatch implements VectorBatch for a memVectorDB. 285 | type memVectorBatch struct { 286 | db *memVectorDB // underlying memVectorDB 287 | sb Batch // batch for underlying DB 288 | w map[string]llm.Vector // vectors to write 289 | } 290 | 291 | func (db *memVectorDB) Batch() VectorBatch { 292 | return &memVectorBatch{db, db.storage.Batch(), make(map[string]llm.Vector)} 293 | } 294 | 295 | func (b *memVectorBatch) Set(name string, vec llm.Vector) { 296 | b.sb.Set(ordered.Encode("llm.Vector", b.db.namespace, name), vec.Encode()) 297 | 298 | b.w[name] = slices.Clone(vec) 299 | } 300 | 301 | func (b *memVectorBatch) MaybeApply() bool { 302 | if !b.sb.MaybeApply() { 303 | return false 304 | } 305 | b.Apply() 306 | return true 307 | } 308 | 309 | func (b *memVectorBatch) Apply() { 310 | b.sb.Apply() 311 | 312 | b.db.mu.Lock() 313 | defer b.db.mu.Unlock() 314 | 315 | for name, vec := range b.w { 316 | b.db.cache[name] = vec 317 | } 318 | clear(b.w) 319 | } 320 | -------------------------------------------------------------------------------- /internal/github/data.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 

package github

import (
	"encoding/json"
	"fmt"
	"iter"
	"math"
	"strconv"
	"strings"

	"rsc.io/gaby/internal/storage"
	"rsc.io/gaby/internal/storage/timed"
	"rsc.io/ordered"
)

// LookupIssueURL looks up an issue by URL,
// only consulting the database (not actual GitHub).
func (c *Client) LookupIssueURL(url string) (*Issue, error) {
	// bad reports that url is not a well-formed GitHub issue URL.
	bad := func() (*Issue, error) {
		return nil, fmt.Errorf("not a github URL: %q", url)
	}
	proj, ok := strings.CutPrefix(url, "https://github.com/")
	if !ok {
		return bad()
	}
	i := strings.LastIndex(proj, "/issues/")
	if i < 0 {
		return bad()
	}
	proj, num := proj[:i], proj[i+len("/issues/"):]
	n, err := strconv.ParseInt(num, 10, 64)
	if err != nil || n <= 0 {
		return bad()
	}

	// Scan the single-issue range [n, n], looking for the
	// issue creation event ("/issues").
	for e := range c.Events(proj, n, n) {
		if e.API == "/issues" {
			return e.Typed.(*Issue), nil
		}
	}
	return nil, fmt.Errorf("%s#%d not in database", proj, n)
}

// An Event is a single GitHub issue event stored in the database.
type Event struct {
	DBTime  timed.DBTime // when event was last written
	Project string       // project ("golang/go")
	Issue   int64        // issue number
	API     string       // API endpoint for event: "/issues", "/issues/comments", or "/issues/events"
	ID      int64        // ID of event; each API has a different ID space. (Project, Issue, API, ID) is assumed unique
	JSON    []byte       // JSON for the event data
	Typed   any          // Typed unmarshaling of the event data, of type *Issue, *IssueComment, or *IssueEvent
}

// Events returns an iterator over issue events for the given project,
// limited to issues in the range issueMin ≤ issue ≤ issueMax.
// If issueMax < 0, there is no upper limit.
62 | // The events are iterated over in (Project, Issue, API, ID) order, 63 | // so "/issues" events come first, then "/issues/comments", then "/issues/events". 64 | // Within a specific API, the events are ordered by increasing ID, 65 | // which corresponds to increasing event time on GitHub. 66 | func (c *Client) Events(project string, issueMin, issueMax int64) iter.Seq[*Event] { 67 | return func(yield func(*Event) bool) { 68 | start := o(project, issueMin) 69 | if issueMax < 0 { 70 | issueMax = math.MaxInt64 71 | } 72 | end := o(project, issueMax, ordered.Inf) 73 | for t := range timed.Scan(c.db, "githubdl.Event", start, end) { 74 | if !yield(c.decodeEvent(t)) { 75 | return 76 | } 77 | } 78 | } 79 | } 80 | 81 | // EventsAfter returns an iterator over events in the given project after DBTime t, 82 | // which should be e.DBTime from the most recent processed event. 83 | // The events are iterated over in DBTime order, so the DBTime of the last 84 | // successfully processed event can be used in a future call to EventsAfter. 85 | // If project is the empty string, then events from all projects are returned. 86 | func (c *Client) EventsAfter(t timed.DBTime, project string) iter.Seq[*Event] { 87 | filter := func(key []byte) bool { 88 | if project == "" { 89 | return true 90 | } 91 | var p string 92 | if _, err := ordered.DecodePrefix(key, &p); err != nil { 93 | c.db.Panic("github EventsAfter decode", "key", storage.Fmt(key), "err", err) 94 | } 95 | return p == project 96 | } 97 | 98 | return func(yield func(*Event) bool) { 99 | for e := range timed.ScanAfter(c.db, "githubdl.Event", t, filter) { 100 | if !yield(c.decodeEvent(e)) { 101 | return 102 | } 103 | } 104 | } 105 | } 106 | 107 | // decodeEvent decodes the key, val pair into an Event. 108 | // It calls c.db.Panic for malformed data. 
func (c *Client) decodeEvent(t *timed.Entry) *Event {
	var e Event
	e.DBTime = t.ModTime
	// The key encodes (Project, Issue, API, ID).
	if err := ordered.Decode(t.Key, &e.Project, &e.Issue, &e.API, &e.ID); err != nil {
		c.db.Panic("github event decode", "key", storage.Fmt(t.Key), "err", err)
	}

	// The value is a single ordered-encoded raw JSON blob.
	var js ordered.Raw
	if err := ordered.Decode(t.Val, &js); err != nil {
		c.db.Panic("github event val decode", "key", storage.Fmt(t.Key), "val", storage.Fmt(t.Val), "err", err)
	}
	e.JSON = js
	// Pick the typed form based on which API endpoint produced the event.
	switch e.API {
	default:
		c.db.Panic("github event invalid API", "api", e.API)
	case "/issues":
		e.Typed = new(Issue)
	case "/issues/comments":
		e.Typed = new(IssueComment)
	case "/issues/events":
		e.Typed = new(IssueEvent)
	}
	if err := json.Unmarshal(js, e.Typed); err != nil {
		c.db.Panic("github event json", "js", string(js), "err", err)
	}
	return &e
}

// EventWatcher returns a new [timed.Watcher] with the given name.
// It picks up where any previous Watcher of the same name left off.
func (c *Client) EventWatcher(name string) *timed.Watcher[*Event] {
	return timed.NewWatcher(c.db, name, "githubdl.Event", c.decodeEvent)
}

// IssueEvent is the GitHub JSON structure for an issue metadata event.
type IssueEvent struct {
	// NOTE: Issue field is not present when downloading for a specific issue,
	// only in the master feed for the whole repo. So do not add it here.
147 | ID int64 148 | URL string 149 | Actor User `json:"actor"` 150 | Event string `json:"event"` 151 | Labels []Label `json:"labels"` 152 | LockReason string `json:"lock_reason"` 153 | CreatedAt string `json:"created_at"` 154 | CommitID string `json:"commit_id"` 155 | Assigner User `json:"assigner"` 156 | Assignees []User `json:"assignees"` 157 | Milestone Milestone `json:"milestone"` 158 | Rename Rename `json:"rename"` 159 | } 160 | 161 | // A User represents a user or organization account in GitHub JSON. 162 | type User struct { 163 | Login string 164 | } 165 | 166 | // A Label represents a project issue tracker label in GitHub JSON. 167 | type Label struct { 168 | Name string 169 | } 170 | 171 | // A Milestone represents a project issue milestone in GitHub JSON. 172 | type Milestone struct { 173 | Title string 174 | } 175 | 176 | // A Rename describes an issue title renaming in GitHub JSON. 177 | type Rename struct { 178 | From string 179 | To string 180 | } 181 | 182 | func urlToProject(u string) string { 183 | u, ok := strings.CutPrefix(u, "https://api.github.com/repos/") 184 | if !ok { 185 | return "" 186 | } 187 | i := strings.Index(u, "/") 188 | if i < 0 { 189 | return "" 190 | } 191 | j := strings.Index(u[i+1:], "/") 192 | if j < 0 { 193 | return "" 194 | } 195 | return u[:i+1+j] 196 | } 197 | 198 | func baseToInt64(u string) int64 { 199 | i, err := strconv.ParseInt(u[strings.LastIndex(u, "/")+1:], 10, 64) 200 | if i <= 0 || err != nil { 201 | return 0 202 | } 203 | return i 204 | } 205 | 206 | // IssueComment is the GitHub JSON structure for an issue comment event. 207 | type IssueComment struct { 208 | URL string `json:"url"` 209 | IssueURL string `json:"issue_url"` 210 | HTMLURL string `json:"html_url"` 211 | User User `json:"user"` 212 | CreatedAt string `json:"created_at"` 213 | UpdatedAt string `json:"updated_at"` 214 | Body string `json:"body"` 215 | } 216 | 217 | // Project returns the issue comment's GitHub project (for example, "golang/go"). 
218 | func (x *IssueComment) Project() string { 219 | return urlToProject(x.URL) 220 | } 221 | 222 | // Issue returns the issue comment's issue number. 223 | func (x *IssueComment) Issue() int64 { 224 | u, _, _ := strings.Cut(x.HTMLURL, "#") 225 | return baseToInt64(u) 226 | } 227 | 228 | // CommentID returns the issue comment's numeric ID. 229 | // The ID appears to be unique across all comments on GitHub, 230 | // but we only assume it is unique within a single issue. 231 | func (x *IssueComment) CommentID() int64 { 232 | return baseToInt64(x.URL) 233 | } 234 | 235 | // Issue is the GitHub JSON structure for an issue creation event. 236 | type Issue struct { 237 | URL string `json:"url"` 238 | HTMLURL string `json:"html_url"` 239 | Number int64 `json:"number"` 240 | User User `json:"user"` 241 | Title string `json:"title"` 242 | CreatedAt string `json:"created_at"` 243 | UpdatedAt string `json:"updated_at"` 244 | ClosedAt string `json:"closed_at"` 245 | Body string `json:"body"` 246 | Assignees []User `json:"assignees"` 247 | Milestone Milestone `json:"milestone"` 248 | State string `json:"state"` 249 | PullRequest *struct{} `json:"pull_request"` 250 | Locked bool 251 | ActiveLockReason string `json:"active_lock_reason"` 252 | Labels []Label `json:"labels"` 253 | } 254 | 255 | // Project returns the issue's GitHub project (for example, "golang/go"). 256 | func (x *Issue) Project() string { 257 | return urlToProject(x.URL) 258 | } 259 | -------------------------------------------------------------------------------- /internal/httprr/rr_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package httprr 6 | 7 | import ( 8 | "errors" 9 | "io" 10 | "net/http" 11 | "net/http/httptest" 12 | "os" 13 | "strings" 14 | "testing" 15 | "testing/iotest" 16 | ) 17 | 18 | func handler(w http.ResponseWriter, r *http.Request) { 19 | if strings.HasSuffix(r.URL.Path, "/redirect") { 20 | http.Error(w, "redirect me!", 304) 21 | return 22 | } 23 | if r.Method == "GET" { 24 | if r.Header.Get("Secret") != "key" { 25 | http.Error(w, "missing secret", 666) 26 | return 27 | } 28 | } 29 | if r.Method == "POST" { 30 | data, err := io.ReadAll(r.Body) 31 | if err != nil { 32 | panic(err) 33 | } 34 | if !strings.Contains(string(data), "my Secret") { 35 | http.Error(w, "missing body secret", 667) 36 | return 37 | } 38 | } 39 | } 40 | 41 | func always555(w http.ResponseWriter, r *http.Request) { 42 | http.Error(w, "should not be making HTTP requests", 555) 43 | } 44 | 45 | func dropPort(r *http.Request) error { 46 | if r.URL.Port() != "" { 47 | r.URL.Host = r.URL.Host[:strings.LastIndex(r.URL.Host, ":")] 48 | r.Host = r.Host[:strings.LastIndex(r.Host, ":")] 49 | } 50 | return nil 51 | } 52 | 53 | func dropSecretHeader(r *http.Request) error { 54 | r.Header.Del("Secret") 55 | return nil 56 | } 57 | 58 | func hideSecretBody(r *http.Request) error { 59 | if r.Body != nil { 60 | body := r.Body.(*Body) 61 | body.Data = []byte("redacted") 62 | } 63 | return nil 64 | } 65 | 66 | func TestRecordReplay(t *testing.T) { 67 | dir := t.TempDir() 68 | file := dir + "/rr" 69 | 70 | // 4 passes: 71 | // 0: create 72 | // 1: open 73 | // 2: Open with -httprecord="r+" 74 | // 3: Open with -httprecord="" 75 | for pass := range 4 { 76 | start := open 77 | h := always555 78 | *record = "" 79 | switch pass { 80 | case 0: 81 | start = create 82 | h = handler 83 | case 2: 84 | start = Open 85 | *record = "r+" 86 | h = handler 87 | case 3: 88 | start = Open 89 | } 90 | rr, err := start(file, http.DefaultTransport) 91 | if err != nil { 92 | t.Fatal(err) 93 | } 94 | if rr.Recording() { 95 | 
t.Log("RECORDING") 96 | } else { 97 | t.Log("REPLAYING") 98 | } 99 | rr.Scrub(dropPort, dropSecretHeader) 100 | rr.Scrub(hideSecretBody) 101 | 102 | mustNewRequest := func(method, url string, body io.Reader) *http.Request { 103 | req, err := http.NewRequest(method, url, body) 104 | if err != nil { 105 | t.Helper() 106 | t.Fatal(err) 107 | } 108 | return req 109 | } 110 | 111 | mustDo := func(req *http.Request, status int) { 112 | resp, err := rr.Client().Do(req) 113 | if err != nil { 114 | t.Helper() 115 | t.Fatal(err) 116 | } 117 | body, _ := io.ReadAll(resp.Body) 118 | resp.Body.Close() 119 | if resp.StatusCode != status { 120 | t.Helper() 121 | t.Fatalf("%v: %s\n%s", req.URL, resp.Status, body) 122 | } 123 | } 124 | 125 | srv := httptest.NewServer(http.HandlerFunc(h)) 126 | defer srv.Close() 127 | 128 | req := mustNewRequest("GET", srv.URL+"/myrequest", nil) 129 | req.Header.Set("Secret", "key") 130 | mustDo(req, 200) 131 | 132 | req = mustNewRequest("POST", srv.URL+"/myrequest", strings.NewReader("my Secret")) 133 | mustDo(req, 200) 134 | 135 | req = mustNewRequest("GET", srv.URL+"/redirect", nil) 136 | mustDo(req, 304) 137 | 138 | if !rr.Recording() { 139 | req = mustNewRequest("GET", srv.URL+"/uncached", nil) 140 | resp, err := rr.Client().Do(req) 141 | if err == nil { 142 | body, _ := io.ReadAll(resp.Body) 143 | t.Fatalf("%v: %s\n%s", req.URL, resp.Status, body) 144 | } 145 | } 146 | 147 | if err := rr.Close(); err != nil { 148 | t.Fatal(err) 149 | } 150 | } 151 | 152 | data, err := os.ReadFile(file) 153 | if err != nil { 154 | t.Fatal(err) 155 | } 156 | if strings.Contains(string(data), "Secret") { 157 | t.Fatalf("rr file contains Secret:\n%s", data) 158 | } 159 | } 160 | 161 | var badResponseTrace = []byte("httprr trace v1\n" + 162 | "92 75\n" + 163 | "GET http://127.0.0.1/myrequest HTTP/1.1\r\n" + 164 | "Host: 127.0.0.1\r\n" + 165 | "User-Agent: Go-http-client/1.1\r\n" + 166 | "\r\n" + 167 | "HZZP/1.1 200 OK\r\n" + 168 | "Date: Wed, 12 Jun 2024 13:55:02 
GMT\r\n" + 169 | "Content-Length: 0\r\n" + 170 | "\r\n") 171 | 172 | func TestErrors(t *testing.T) { 173 | // -httprecord regexp parsing 174 | *record = "+" 175 | if _, err := Open(os.DevNull, nil); err == nil || !strings.Contains(err.Error(), "invalid -httprecord flag") { 176 | t.Errorf("did not diagnose bad -httprecord: err = %v", err) 177 | } 178 | *record = "" 179 | 180 | // invalid httprr trace 181 | if _, err := Open(os.DevNull, nil); err == nil || !strings.Contains(err.Error(), "not an httprr trace") { 182 | t.Errorf("did not diagnose invalid httprr trace: err = %v", err) 183 | } 184 | 185 | // corrupt httprr trace 186 | dir := t.TempDir() 187 | os.WriteFile(dir+"/rr", []byte("httprr trace v1\ngarbage\n"), 0666) 188 | if _, err := Open(dir+"/rr", nil); err == nil || !strings.Contains(err.Error(), "corrupt httprr trace") { 189 | t.Errorf("did not diagnose invalid httprr trace: err = %v", err) 190 | } 191 | 192 | // os.Create error creating trace 193 | if _, err := create("invalid\x00file", nil); err == nil { 194 | t.Errorf("did not report failure from os.Create: err = %v", err) 195 | } 196 | 197 | // os.ReadAll error reading trace 198 | if _, err := open("nonexistent", nil); err == nil { 199 | t.Errorf("did not report failure from os.ReadFile: err = %v", err) 200 | } 201 | 202 | // error reading body 203 | rr, err := create(os.DevNull, nil) 204 | if err != nil { 205 | t.Fatal(err) 206 | } 207 | if _, err := rr.Client().Post("http://127.0.0.1/nonexist", "x/error", iotest.ErrReader(errors.New("MY ERROR"))); err == nil || !strings.Contains(err.Error(), "MY ERROR") { 208 | t.Errorf("did not report failure from io.ReadAll(body): err = %v", err) 209 | } 210 | 211 | // error during scrub 212 | rr.Scrub(func(*http.Request) error { return errors.New("SCRUB ERROR") }) 213 | if _, err := rr.Client().Get("http://127.0.0.1/nonexist"); err == nil || !strings.Contains(err.Error(), "SCRUB ERROR") { 214 | t.Errorf("did not report failure from scrub: err = %v", err) 215 | } 
216 | rr.Close() 217 | 218 | // error during rkey.WriteProxy 219 | rr, err = create(os.DevNull, nil) 220 | if err != nil { 221 | t.Fatal(err) 222 | } 223 | rr.Scrub(func(req *http.Request) error { 224 | req.URL = nil 225 | req.Host = "" 226 | return nil 227 | }) 228 | if _, err := rr.Client().Get("http://127.0.0.1/nonexist"); err == nil || !strings.Contains(err.Error(), "no Host or URL set") { 229 | t.Errorf("did not report failure from rkey.WriteProxy: err = %v", err) 230 | } 231 | rr.Close() 232 | 233 | // error during resp.Write 234 | rr, err = create(os.DevNull, badRespTransport{}) 235 | if err != nil { 236 | t.Fatal(err) 237 | } 238 | if _, err := rr.Client().Get("http://127.0.0.1/nonexist"); err == nil || !strings.Contains(err.Error(), "TRANSPORT ERROR") { 239 | t.Errorf("did not report failure from resp.Write: err = %v", err) 240 | } 241 | rr.Close() 242 | 243 | // error during Write logging request 244 | srv := httptest.NewServer(http.HandlerFunc(always555)) 245 | defer srv.Close() 246 | rr, err = create(os.DevNull, http.DefaultTransport) 247 | if err != nil { 248 | t.Fatal(err) 249 | } 250 | rr.Scrub(dropPort) 251 | rr.record.Close() // cause write error 252 | if _, err := rr.Client().Get(srv.URL + "/redirect"); err == nil || !strings.Contains(err.Error(), "file already closed") { 253 | t.Errorf("did not report failure from record write: err = %v", err) 254 | } 255 | rr.broken = errors.New("BROKEN ERROR") 256 | if _, err := rr.Client().Get(srv.URL + "/redirect"); err == nil || !strings.Contains(err.Error(), "BROKEN ERROR") { 257 | t.Errorf("did not report previous write failure: err = %v", err) 258 | } 259 | if err := rr.Close(); err == nil || !strings.Contains(err.Error(), "BROKEN ERROR") { 260 | t.Errorf("did not report write failure during close: err = %v", err) 261 | } 262 | 263 | // error during RoundTrip 264 | rr, err = create(os.DevNull, errTransport{errors.New("TRANSPORT ERROR")}) 265 | if err != nil { 266 | t.Fatal(err) 267 | } 268 | if _, err := 
rr.Client().Get(srv.URL); err == nil || !strings.Contains(err.Error(), "TRANSPORT ERROR") { 269 | t.Errorf("did not report failure from transport: err = %v", err) 270 | } 271 | 272 | // error during http.ReadResponse: trace is structurally okay but has malformed response inside 273 | if err := os.WriteFile(dir+"/rr", badResponseTrace, 0666); err != nil { 274 | t.Fatal(err) 275 | } 276 | rr, err = Open(dir+"/rr", nil) 277 | if err != nil { 278 | t.Fatal(err) 279 | } 280 | if _, err := rr.Client().Get("http://127.0.0.1/myrequest"); err == nil || !strings.Contains(err.Error(), "corrupt httprr trace:") { 281 | t.Errorf("did not diagnose invalid httprr trace: err = %v", err) 282 | } 283 | } 284 | 285 | type errTransport struct{ err error } 286 | 287 | func (e errTransport) RoundTrip(req *http.Request) (*http.Response, error) { 288 | return nil, e.err 289 | } 290 | 291 | type badRespTransport struct{} 292 | 293 | func (badRespTransport) RoundTrip(req *http.Request) (*http.Response, error) { 294 | resp := new(http.Response) 295 | resp.Body = io.NopCloser(iotest.ErrReader(errors.New("TRANSPORT ERROR"))) 296 | return resp, nil 297 | } 298 | -------------------------------------------------------------------------------- /internal/httprr/rr.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package httprr implements HTTP record and replay, mainly for use in tests. 6 | // 7 | // [Open] creates a new [RecordReplay]. Whether it is recording or replaying 8 | // is controlled by the -httprecord flag, which is defined by this package 9 | // only in test programs (built by “go test”). 10 | // See the [Open] documentation for more details. 
11 | package httprr 12 | 13 | import ( 14 | "bufio" 15 | "bytes" 16 | "cmp" 17 | "context" 18 | "flag" 19 | "fmt" 20 | "io" 21 | "net/http" 22 | "os" 23 | "regexp" 24 | "strconv" 25 | "strings" 26 | "sync" 27 | "testing" 28 | ) 29 | 30 | var record = new(string) 31 | 32 | func init() { 33 | if testing.Testing() { 34 | record = flag.String("httprecord", "", "re-record traces for files matching `regexp`") 35 | } 36 | } 37 | 38 | // A RecordReplay is an [http.RoundTripper] that can operate in two modes: record and replay. 39 | // 40 | // In record mode, the RecordReplay invokes another RoundTripper 41 | // and logs the (request, response) pairs to a file. 42 | // 43 | // In replay mode, the RecordReplay responds to requests by finding 44 | // an identical request in the log and sending the logged response. 45 | type RecordReplay struct { 46 | file string 47 | real http.RoundTripper 48 | 49 | mu sync.Mutex 50 | broken error 51 | record *os.File 52 | replay map[string]string 53 | scrub []func(*http.Request) error 54 | } 55 | 56 | // Scrub adds new scrubbing functions to rr. 57 | // 58 | // Before using a request as a lookup key or saving it in the record/replay log, 59 | // the RecordReplay calls each scrub function, in the order they were registered, 60 | // to canonicalize non-deterministic parts of the request and remove secrets. 61 | // Scrubbing only applies to a copy of the request used in the record/replay log; 62 | // the unmodified original request is sent to the actual server in recording mode. 63 | // A scrub function can assume that if req.Body is not nil, then it has type [*Body]. 64 | // 65 | // Calling Scrub adds to the list of registered scrubbing functions; 66 | // it does not replace those registered by earlier calls. 67 | func (rr *RecordReplay) Scrub(scrubs ...func(req *http.Request) error) { 68 | rr.scrub = append(rr.scrub, scrubs...) 69 | } 70 | 71 | // Recording reports whether the rr is in recording mode. 
func (rr *RecordReplay) Recording() bool {
	return rr.record != nil
}

// Open opens a new record/replay log in the named file and
// returns a [RecordReplay] backed by that file.
//
// By default Open expects the file to exist and contain a
// previously-recorded log of (request, response) pairs,
// which [RecordReplay.RoundTrip] consults to prepare its responses.
//
// If the command-line flag -httprecord is set to a non-empty
// regular expression that matches file, then Open creates
// the file as a new log. In that mode, [RecordReplay.RoundTrip]
// makes actual HTTP requests using rt but then logs the requests and
// responses to the file for replaying in a future run.
func Open(file string, rt http.RoundTripper) (*RecordReplay, error) {
	if *record != "" {
		re, err := regexp.Compile(*record)
		if err != nil {
			return nil, fmt.Errorf("invalid -httprecord flag: %v", err)
		}
		if re.MatchString(file) {
			return create(file, rt)
		}
	}
	return open(file, rt)
}

// create creates a new record-mode RecordReplay in the file.
// TODO maybe export
func create(file string, rt http.RoundTripper) (*RecordReplay, error) {
	f, err := os.Create(file)
	if err != nil {
		return nil, err
	}
	// Write the trace header first so open can recognize the format.
	if _, err := fmt.Fprintf(f, "httprr trace v1\n"); err != nil {
		// unreachable unless write error immediately after os.Create
		f.Close()
		return nil, err
	}
	rr := &RecordReplay{
		file:   file,
		real:   rt,
		record: f,
	}
	return rr, nil
}

// open opens a replay-mode RecordReplay using the data in the file.
122 | func open(file string, rt http.RoundTripper) (*RecordReplay, error) { 123 | // Note: To handle larger traces without storing entirely in memory, 124 | // could instead read the file incrementally, storing a map[hash]offsets 125 | // and then reread the relevant part of the file during RoundTrip. 126 | 127 | bdata, err := os.ReadFile(file) 128 | if err != nil { 129 | return nil, err 130 | } 131 | data := string(bdata) 132 | line, data, ok := strings.Cut(data, "\n") 133 | if !ok || line != "httprr trace v1" { 134 | return nil, fmt.Errorf("read %s: not an httprr trace", file) 135 | } 136 | replay := make(map[string]string) 137 | for data != "" { 138 | line, data, ok = strings.Cut(data, "\n") 139 | f1, f2, _ := strings.Cut(line, " ") 140 | n1, err1 := strconv.Atoi(f1) 141 | n2, err2 := strconv.Atoi(f2) 142 | if !ok || err1 != nil || err2 != nil || n1 > len(data) || n2 > len(data[n1:]) { 143 | return nil, fmt.Errorf("read %s: corrupt httprr trace", file) 144 | } 145 | var req, resp string 146 | req, resp, data = data[:n1], data[n1:n1+n2], data[n1+n2:] 147 | replay[req] = resp 148 | } 149 | 150 | rr := &RecordReplay{ 151 | file: file, 152 | real: rt, 153 | replay: replay, 154 | } 155 | return rr, nil 156 | } 157 | 158 | // Client returns an http.Client using rr as its transport. 159 | // It is a shorthand for: 160 | // 161 | // return &http.Client{Transport: rr} 162 | // 163 | // For more complicated uses, use rr or the [RecordReplay.RoundTrip] method directly. 164 | func (rr *RecordReplay) Client() *http.Client { 165 | return &http.Client{Transport: rr} 166 | } 167 | 168 | // A Body is an io.ReadCloser used as an HTTP request body. 169 | // In a Scrubber, if req.Body != nil, then req.Body is guaranteed 170 | // to have type *Body, making it easy to access the body to change it. 171 | type Body struct { 172 | Data []byte 173 | ReadOffset int 174 | } 175 | 176 | // Read reads from the body, implementing io.Reader. 
177 | func (b *Body) Read(p []byte) (int, error) { 178 | n := copy(p, b.Data[b.ReadOffset:]) 179 | if n == 0 { 180 | return 0, io.EOF 181 | } 182 | b.ReadOffset += n 183 | return n, nil 184 | } 185 | 186 | // Close is a no-op, implementing io.Closer. 187 | func (b *Body) Close() error { 188 | return nil 189 | } 190 | 191 | // RoundTrip implements [http.RoundTripper]. 192 | // 193 | // If rr has been opened in record mode, RoundTrip passes the requests on to 194 | // the RoundTripper specified in the call to [Open] and then logs the 195 | // (request, response) pair to the underlying file. 196 | // 197 | // If rr has been opened in replay mode, RoundTrip looks up the request in the log 198 | // and then responds with the previously logged response. 199 | // If the log does not contain req, RoundTrip returns an error. 200 | func (rr *RecordReplay) RoundTrip(req *http.Request) (*http.Response, error) { 201 | // rkey is the scrubbed request used as a lookup key. 202 | rkey := req.Clone(context.Background()) 203 | if req.Body != nil { 204 | body, err := io.ReadAll(req.Body) 205 | req.Body.Close() 206 | if err != nil { 207 | return nil, err 208 | } 209 | req.Body = &Body{Data: body} 210 | rkey.Body = &Body{Data: bytes.Clone(body)} 211 | rkey.ContentLength = -1 212 | } 213 | 214 | if len(rr.scrub) > 0 { 215 | // Canonicalize and scrub body. 216 | for _, scrub := range rr.scrub { 217 | if err := scrub(rkey); err != nil { 218 | return nil, err 219 | } 220 | } 221 | if rkey.Body != nil { 222 | rkey.ContentLength = int64(len(rkey.Body.(*Body).Data)) 223 | } 224 | } 225 | 226 | // Use WriteProxy instead of Write to preserve the URL scheme. 
227 | var bkey strings.Builder 228 | if err := rkey.WriteProxy(&bkey); err != nil { 229 | return nil, err 230 | } 231 | key := bkey.String() 232 | 233 | if rr.replay != nil { 234 | if respWire, ok := rr.replay[key]; ok { 235 | resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(respWire)), req) 236 | if err != nil { 237 | return nil, fmt.Errorf("read %s: corrupt httprr trace: %v", rr.file, err) 238 | } 239 | return resp, nil 240 | } 241 | return nil, fmt.Errorf("cached HTTP response not found for:\n%s", key) 242 | } 243 | 244 | rr.mu.Lock() 245 | err := rr.broken 246 | rr.mu.Unlock() 247 | if err != nil { 248 | return nil, err 249 | } 250 | 251 | resp, err := rr.real.RoundTrip(req) 252 | if err != nil { 253 | return nil, err 254 | } 255 | 256 | var respBuf strings.Builder 257 | if err := resp.Write(&respBuf); err != nil { 258 | return nil, err 259 | } 260 | respWire := respBuf.String() 261 | 262 | resp, err = http.ReadResponse(bufio.NewReader(strings.NewReader(respWire)), req) 263 | if err != nil { 264 | // unreachable unless resp.Write does not round-trip with http.ReadResponse 265 | return nil, err 266 | } 267 | 268 | rr.mu.Lock() 269 | defer rr.mu.Unlock() 270 | if rr.broken != nil { 271 | // unreachable unless concurrent I/O error; checked above 272 | return nil, rr.broken 273 | } 274 | _, err1 := fmt.Fprintf(rr.record, "%d %d\n", len(key), len(respWire)) 275 | _, err2 := rr.record.WriteString(key) 276 | _, err3 := rr.record.WriteString(respWire) 277 | if err := cmp.Or(err1, err2, err3); err != nil { 278 | rr.broken = err 279 | rr.record.Close() 280 | os.Remove(rr.file) 281 | return nil, err 282 | } 283 | return resp, nil 284 | } 285 | 286 | // Close closes the RecordReplay. 287 | // It is a no-op in replay mode. 
288 | func (rr *RecordReplay) Close() error { 289 | if rr.broken != nil { 290 | return rr.broken 291 | } 292 | if rr.record != nil { 293 | return rr.record.Close() 294 | } 295 | return nil 296 | } 297 | -------------------------------------------------------------------------------- /internal/related/related.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | /*TODO 6 | 7 | p.EnableProject("golang/go") 8 | p.IgnoreBody("— [watchflakes](https://go.dev/wiki/Watchflakes)") 9 | p.IgnoreTitlePrefix("x/tools/gopls: release version v") 10 | p.IgnoreTitleSuffix(" backport]") 11 | 12 | */ 13 | 14 | // Package related implements posting about related issues to GitHub. 15 | package related 16 | 17 | import ( 18 | "bytes" 19 | "fmt" 20 | "log/slog" 21 | "strings" 22 | "time" 23 | 24 | "rsc.io/gaby/internal/docs" 25 | "rsc.io/gaby/internal/github" 26 | "rsc.io/gaby/internal/storage" 27 | "rsc.io/gaby/internal/storage/timed" 28 | "rsc.io/ordered" 29 | ) 30 | 31 | // A Poster posts to GitHub about related issues (and eventually other documents). 32 | type Poster struct { 33 | slog *slog.Logger 34 | db storage.DB 35 | vdb storage.VectorDB 36 | github *github.Client 37 | docs *docs.Corpus 38 | projects map[string]bool 39 | watcher *timed.Watcher[*github.Event] 40 | name string 41 | timeLimit time.Time 42 | ignores []func(*github.Issue) bool 43 | maxResults int 44 | scoreCutoff float64 45 | post bool 46 | } 47 | 48 | // New creates and returns a new Poster. It logs to lg, stores state in db, 49 | // watches for new GitHub issues using gh, looks up related documents in vdb, 50 | // and reads the document content from docs. 51 | // For the purposes of storing its own state, it uses the given name. 52 | // Future calls to New with the same name will use the same state. 
53 | // 54 | // Use the [Poster] methods to configure the posting parameters 55 | // (especially [Poster.EnableProject] and [Poster.EnablePosts]) 56 | // before calling [Poster.Run]. 57 | func New(lg *slog.Logger, db storage.DB, gh *github.Client, vdb storage.VectorDB, docs *docs.Corpus, name string) *Poster { 58 | return &Poster{ 59 | slog: lg, 60 | db: db, 61 | vdb: vdb, 62 | github: gh, 63 | docs: docs, 64 | projects: make(map[string]bool), 65 | watcher: gh.EventWatcher("related.Poster:" + name), 66 | name: name, 67 | timeLimit: time.Now().Add(-defaultTooOld), 68 | maxResults: defaultMaxResults, 69 | scoreCutoff: defaultScoreCutoff, 70 | } 71 | } 72 | 73 | // SetTimeLimit controls how old an issue can be for the Poster to post to it. 74 | // Issues created before time t will be skipped. 75 | // The default is not to post to issues that are more than 48 hours old 76 | // at the time of the call to [New]. 77 | func (p *Poster) SetTimeLimit(t time.Time) { 78 | p.timeLimit = t 79 | } 80 | 81 | const defaultTooOld = 48 * time.Hour 82 | 83 | // SetMaxResults sets the maximum number of related documents to 84 | // post to the issue. 85 | // The default is 10. 86 | func (p *Poster) SetMaxResults(max int) { 87 | p.maxResults = max 88 | } 89 | 90 | const defaultMaxResults = 10 91 | 92 | // SetMinScore sets the minimum vector search score that a 93 | // [storage.VectorResult] must have to be considered a related document 94 | // The default is 0.82, which was determined empirically. 95 | func (p *Poster) SetMinScore(min float64) { 96 | p.scoreCutoff = min 97 | } 98 | 99 | const defaultScoreCutoff = 0.82 100 | 101 | // SkipBodyContains configures the Poster to skip issues with a body containing 102 | // the given text. 
103 | func (p *Poster) SkipBodyContains(text string) { 104 | p.ignores = append(p.ignores, func(issue *github.Issue) bool { 105 | return strings.Contains(issue.Body, text) 106 | }) 107 | } 108 | 109 | // SkipTitlePrefix configures the Poster to skip issues with a title starting 110 | // with the given prefix. 111 | func (p *Poster) SkipTitlePrefix(prefix string) { 112 | p.ignores = append(p.ignores, func(issue *github.Issue) bool { 113 | return strings.HasPrefix(issue.Title, prefix) 114 | }) 115 | } 116 | 117 | // SkipTitleSuffix configures the Poster to skip issues with a title ending 118 | // with the given suffix. 119 | func (p *Poster) SkipTitleSuffix(suffix string) { 120 | p.ignores = append(p.ignores, func(issue *github.Issue) bool { 121 | return strings.HasSuffix(issue.Title, suffix) 122 | }) 123 | } 124 | 125 | // EnableProject enables the Poster to post on issues in the given GitHub project (for example "golang/go"). 126 | // See also [Poster.EnablePosts], which must also be called to post anything to GitHub. 127 | func (p *Poster) EnableProject(project string) { 128 | p.projects[project] = true 129 | } 130 | 131 | // EnablePosts enables the Poster to post to GitHub. 132 | // If EnablePosts has not been called, [Poster.Run] logs what it would post but does not post the messages. 133 | // See also [Poster.EnableProject], which must also be called to set the projects being considered. 134 | func (p *Poster) EnablePosts() { 135 | p.post = true 136 | } 137 | 138 | // deletePosted deletes all the “posted on this issue” notes. 139 | func (p *Poster) deletePosted() { 140 | p.db.DeleteRange(ordered.Encode("triage.Posted"), ordered.Encode("triage.Posted", ordered.Inf)) 141 | } 142 | 143 | // Run runs a single round of posting to GitHub. 144 | // It scans all open issues that have been created since the last call to [Poster.Run] 145 | // using a Poster with the same name (see [New]). 146 | // Run skips closed issues, and it also skips pull requests. 
147 | // 148 | // For each issue that matches the configured posting constraints 149 | // (see [Poster.EnableProject], [Poster.SetTimeLimit], [Poster.SkipBodyContains], [Poster.SkipTitlePrefix], and [Poster.SkipTitleSuffix]), 150 | // Run computes an embedding of the issue body text (ignoring comments) 151 | // and looks in the vector database for other documents (currently only issues) 152 | // that are aligned closely enough with that body text 153 | // (see [Poster.SetMinScore]) and posts a limited number of matches 154 | // (see [Poster.SetMaxResults]). 155 | // 156 | // Run logs each post to the [slog.Logger] passed to [New]. 157 | // If [Poster.EnablePosts] has been called, then [Run] also posts the comment to GitHub, 158 | // records in the database that it has posted to GitHub to make sure it never posts to that issue again, 159 | // and advances its GitHub issue watcher's incremental cursor to speed future calls to [Run]. 160 | // 161 | // When [Poster.EnablePosts] has not been called, Run only logs the comments it would post. 162 | // Future calls to Run will reprocess the same issues and re-log the same comments. 
163 | func (p *Poster) Run() { 164 | p.slog.Info("related.Poster start", "name", p.name) 165 | defer p.slog.Info("related.Poster end", "name", p.name) 166 | 167 | defer p.watcher.Flush() 168 | 169 | Watcher: 170 | for e := range p.watcher.Recent() { 171 | if !p.projects[e.Project] || e.API != "/issues" { 172 | continue 173 | } 174 | issue := e.Typed.(*github.Issue) 175 | if issue.State == "closed" || issue.PullRequest != nil { 176 | continue 177 | } 178 | tm, err := time.Parse(time.RFC3339, issue.CreatedAt) 179 | if err != nil { 180 | p.slog.Error("triage parse createdat", "CreatedAt", issue.CreatedAt, "err", err) 181 | continue 182 | } 183 | if tm.Before(p.timeLimit) { 184 | continue 185 | } 186 | for _, ig := range p.ignores { 187 | if ig(issue) { 188 | continue Watcher 189 | } 190 | } 191 | 192 | // TODO: Perhaps this key should include p.name, but perhaps not. 193 | // This makes sure we only every post to each issue once. 194 | posted := ordered.Encode("triage.Posted", e.Project, e.Issue) 195 | if _, ok := p.db.Get(posted); ok { 196 | continue 197 | } 198 | 199 | u := fmt.Sprintf("https://github.com/%s/issues/%d", e.Project, e.Issue) 200 | p.slog.Debug("triage client consider", "url", u) 201 | vec, ok := p.vdb.Get(u) 202 | if !ok { 203 | p.slog.Error("triage lookup failed", "url", u) 204 | continue 205 | } 206 | results := p.vdb.Search(vec, p.maxResults+5) 207 | if len(results) > 0 && results[0].ID == u { 208 | results = results[1:] 209 | } 210 | for i, r := range results { 211 | if r.Score < p.scoreCutoff { 212 | results = results[:i] 213 | break 214 | } 215 | } 216 | if len(results) > p.maxResults { 217 | results = results[:p.maxResults] 218 | } 219 | if len(results) == 0 { 220 | if p.post { 221 | p.watcher.MarkOld(e.DBTime) 222 | } 223 | continue 224 | } 225 | var buf bytes.Buffer 226 | fmt.Fprintf(&buf, "**Related Issues**\n\n") 227 | for _, r := range results { 228 | title := r.ID 229 | if d, ok := p.docs.Get(r.ID); ok { 230 | title = d.Title 231 | } 232 
| info := "" 233 | if issue, err := p.github.LookupIssueURL(r.ID); err == nil { 234 | info = fmt.Sprint(" #", issue.Number) 235 | if issue.ClosedAt != "" { 236 | info += " (closed)" 237 | } 238 | } 239 | // The score is recorded in an invisible HTML comment so the Markdown rendering 240 | // is unchanged but the value survives in the posted text for later analysis. 241 | // (The original format string had three verbs but four arguments, which go vet's 242 | // printf check flags and which rendered a %!(EXTRA float64=...) artifact.) 243 | fmt.Fprintf(&buf, " - [%s%s](%s) <!-- score=%.5f -->\n", markdownEscape(title), info, r.ID, r.Score) 244 | } 245 | fmt.Fprintf(&buf, "\n(Emoji vote if this was helpful or unhelpful; more detailed feedback welcome in [this discussion](https://github.com/golang/go/discussions/67901).)\n") 246 | 247 | p.slog.Info("related.Poster post", "name", p.name, "project", e.Project, "issue", e.Issue, "comment", buf.String()) 248 | 249 | if !p.post { 250 | continue 251 | } 252 | 253 | if err := p.github.PostIssueComment(issue, &github.IssueCommentChanges{Body: buf.String()}); err != nil { 254 | p.slog.Error("PostIssueComment", "issue", e.Issue, "err", err) 255 | continue 256 | } 257 | p.db.Set(posted, nil) 258 | p.watcher.MarkOld(e.DBTime) 259 | 260 | // Flush immediately to make sure we don't re-post if interrupted later in the loop. 261 | p.watcher.Flush() 262 | p.db.Flush() 263 | } 264 | } 265 | 266 | // markdownEscaper escapes Markdown metacharacters so issue titles render literally. 267 | var markdownEscaper = strings.NewReplacer( 268 | "_", `\_`, 269 | "*", `\*`, 270 | "`", "\\`", 271 | "[", `\[`, 272 | "]", `\]`, 273 | "<", `\<`, 274 | ">", `\>`, 275 | "&", `\&`, 276 | ) 277 | 278 | // markdownEscape returns s with Markdown metacharacters backslash-escaped. 279 | func markdownEscape(s string) string { 280 | return markdownEscaper.Replace(s) 281 | } 282 | -------------------------------------------------------------------------------- /internal/storage/timed/timed_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package timed 6 | 7 | import ( 8 | "slices" 9 | "strings" 10 | "testing" 11 | 12 | "rsc.io/gaby/internal/storage" 13 | ) 14 | 15 | func Test(t *testing.T) { 16 | db := storage.MemDB() 17 | b := db.Batch() 18 | 19 | Set(db, b, "kind", []byte("key"), []byte("val")) 20 | if e, ok := Get(db, "kind", []byte("key")); e != nil || ok != false { 21 | t.Errorf("Set wrote to db instead of b: Get = %v, %v, want nil, false", e, ok) 22 | } 23 | b.Apply() 24 | if e, ok := Get(db, "kind", []byte("key")); !ok || e == nil || e.Kind != "kind" || string(e.Key) != "key" || string(e.Val) != "val" || e.ModTime == 0 { 25 | t.Errorf("Get after Set = %+v, %v, want {>0, kind, key, val}, true", e, ok) 26 | } 27 | 28 | Delete(db, b, "kind", []byte("missing")) 29 | b.Apply() 30 | if e, ok := Get(db, "kind", []byte("key")); !ok || e == nil || e.Kind != "kind" || string(e.Key) != "key" || string(e.Val) != "val" || e.ModTime == 0 { 31 | t.Errorf("Get after Delete = %+v, %v, want {>0, kind, key, val}, true", e, ok) 32 | } 33 | 34 | Delete(db, b, "kind", []byte("key")) 35 | b.Apply() 36 | if e, ok := Get(db, "kind", []byte("key")); e != nil || ok != false { 37 | t.Errorf("Delete didn't delete key: Get = %v, %v, want nil, false", e, ok) 38 | } 39 | 40 | var keys []string 41 | var last DBTime 42 | do := func(e *Entry) { 43 | t.Helper() 44 | if last != -1 { 45 | if e.ModTime <= last { 46 | t.Fatalf("%+v: ModTime %v <= last %v", e, e.ModTime, last) 47 | } 48 | last = e.ModTime 49 | } 50 | if string(e.Kind) != "kind" { 51 | t.Fatalf("%+v: Kind=%q, want %q", e, e.Kind, "kind") 52 | } 53 | key := string(e.Key) 54 | if !strings.HasPrefix(key, "k") { 55 | t.Fatalf("%+v: Key=%q, want k prefix", e, e.Key) 56 | } 57 | if want := "v" + key[1:]; string(e.Val) != want { 58 | t.Fatalf("%+v: Val=%q, want %q", e, e.Val, want) 59 | } 60 | keys = append(keys, key) 61 | } 62 | 63 | Set(db, b, "kind", []byte("k1"), []byte("v1")) 64 | Set(db, b, "kind", []byte("k3"), []byte("v3")) 65 | Set(db, b, "kind", 
[]byte("k2"), []byte("v2")) 66 | b.Apply() 67 | 68 | // Basic iteration. 69 | last = -1 70 | keys = nil 71 | for e := range Scan(db, "kind", nil, []byte("\xff")) { 72 | do(e) 73 | } 74 | if want := []string{"k1", "k2", "k3"}; !slices.Equal(keys, want) { 75 | t.Errorf("Scan() = %v, want %v", keys, want) 76 | } 77 | 78 | keys = nil 79 | for e := range Scan(db, "kind", []byte("k1x"), []byte("k2z")) { 80 | do(e) 81 | } 82 | if want := []string{"k2"}; !slices.Equal(keys, want) { 83 | t.Errorf("Scan(k1x, k2z) = %v, want %v", keys, want) 84 | } 85 | 86 | keys = nil 87 | for e := range Scan(db, "kind", []byte("k2"), []byte("\xff")) { 88 | do(e) 89 | } 90 | if want := []string{"k2", "k3"}; !slices.Equal(keys, want) { 91 | t.Errorf("Scan(k2) = %v, want %v", keys, want) 92 | } 93 | 94 | keys = nil 95 | for e := range Scan(db, "kind", []byte("k2"), []byte("\xff")) { 96 | do(e) 97 | break 98 | } 99 | if want := []string{"k2"}; !slices.Equal(keys, want) { 100 | t.Errorf("Scan(k2) with break = %v, want %v", keys, want) 101 | } 102 | 103 | // Timed iteration. 104 | last = 0 105 | keys = nil 106 | for e := range ScanAfter(db, "kind", 0, nil) { 107 | do(e) 108 | } 109 | if want := []string{"k1", "k3", "k2"}; !slices.Equal(keys, want) { 110 | t.Errorf("ScanAfter(0) = %v, want %v", keys, want) 111 | } 112 | t123 := last 113 | 114 | // Watcher. 115 | last = 0 116 | keys = nil 117 | w := NewWatcher(db, "name", "kind", func(e *Entry) *Entry { return e }) 118 | for e := range w.Recent() { 119 | do(e) 120 | w.MarkOld(e.ModTime) 121 | w.MarkOld(e.ModTime - 1) // no-op 122 | } 123 | if want := []string{"k1", "k3", "k2"}; !slices.Equal(keys, want) { 124 | t.Errorf("Watcher.Recent() = %v, want %v", keys, want) 125 | } 126 | 127 | // Timed iteration with break. 
128 | last = 0 129 | keys = nil 130 | for e := range ScanAfter(db, "kind", 0, nil) { 131 | do(e) 132 | break 133 | } 134 | if want := []string{"k1"}; !slices.Equal(keys, want) { 135 | t.Errorf("ScanAfter(0) with break = %v, want %v", keys, want) 136 | } 137 | 138 | // Incremental iteration 139 | Set(db, b, "kind", []byte("k5"), []byte("v5")) 140 | Set(db, b, "kind", []byte("k4"), []byte("v4")) 141 | Set(db, b, "kind", []byte("k2"), []byte("v2")) 142 | b.Apply() 143 | 144 | // Check full scan. 145 | last = 0 146 | keys = nil 147 | for e := range ScanAfter(db, "kind", 0, nil) { 148 | do(e) 149 | } 150 | if want := []string{"k1", "k3", "k5", "k4", "k2"}; !slices.Equal(keys, want) { 151 | t.Errorf("ScanAfter(0) = %v, want %v", keys, want) 152 | } 153 | 154 | // Check incremental scan. 155 | last = 0 156 | keys = nil 157 | for e := range ScanAfter(db, "kind", t123, nil) { 158 | do(e) 159 | } 160 | if want := []string{"k5", "k4", "k2"}; !slices.Equal(keys, want) { 161 | t.Errorf("ScanAfter(t123) = %v, want %v", keys, want) 162 | } 163 | 164 | // Full (new) watcher. 165 | last = 0 166 | keys = nil 167 | w = NewWatcher(db, "name2", "kind", func(e *Entry) *Entry { return e }) 168 | for e := range w.Recent() { 169 | do(e) 170 | } 171 | if want := []string{"k1", "k3", "k5", "k4", "k2"}; !slices.Equal(keys, want) { 172 | t.Errorf("Watcher.Recent() full = %v, want %v", keys, want) 173 | } 174 | 175 | // Watcher with break 176 | last = 0 177 | keys = nil 178 | w = NewWatcher(db, "name2", "kind", func(e *Entry) *Entry { return e }) 179 | for e := range w.Recent() { 180 | do(e) 181 | break 182 | } 183 | if want := []string{"k1"}; !slices.Equal(keys, want) { 184 | t.Errorf("Watcher.Recent() full = %v, want %v", keys, want) 185 | } 186 | 187 | // Incremental (old) watcher. 
188 | last = 0 189 | keys = nil 190 | w = NewWatcher(db, "name", "kind", func(e *Entry) *Entry { return e }) 191 | for e := range w.Recent() { 192 | do(e) 193 | } 194 | if want := []string{"k5", "k4", "k2"}; !slices.Equal(keys, want) { 195 | t.Errorf("Watcher.Recent() incremental = %v, want %v", keys, want) 196 | } 197 | 198 | // Restart incremental watcher. 199 | last = 0 200 | keys = nil 201 | w.Restart() 202 | for e := range w.Recent() { 203 | do(e) 204 | } 205 | if want := []string{"k1", "k3", "k5", "k4", "k2"}; !slices.Equal(keys, want) { 206 | t.Errorf("Watcher.Recent() after Reset = %v, want %v", keys, want) 207 | } 208 | 209 | // Filtered scan. 210 | last = 0 211 | keys = nil 212 | filter := func(key []byte) bool { return strings.HasSuffix(string(key), "3") } 213 | for e := range ScanAfter(db, "kind", 0, filter) { 214 | do(e) 215 | } 216 | if want := []string{"k3"}; !slices.Equal(keys, want) { 217 | t.Errorf("ScanAfter(0, suffix3) = %v, want %v", keys, want) 218 | } 219 | 220 | // Accidentally doing multiple Sets of a single key 221 | // will leave behind a stale timestamp record. 222 | Set(db, b, "kind", []byte("k3"), []byte("v3")) 223 | Set(db, b, "kind", []byte("k3"), []byte("v3")) 224 | b.Apply() 225 | 226 | // Stale timestamp should not result in multiple k3 visits. 227 | last = 0 228 | keys = nil 229 | for e := range ScanAfter(db, "kind", 0, nil) { 230 | do(e) 231 | } 232 | if want := []string{"k1", "k5", "k4", "k2", "k3"}; !slices.Equal(keys, want) { 233 | t.Errorf("ScanAfter(0) = %v, want %v", keys, want) 234 | } 235 | 236 | // Deleting k3 now will still leave the stale timestamp record. 237 | // Make sure it is ignored and doesn't cause a lookup crash. 238 | Delete(db, b, "kind", []byte("k3")) 239 | b.Apply() 240 | 241 | // Stale timestamp should not crash on k3. 
242 | last = 0 243 | keys = nil 244 | for e := range ScanAfter(db, "kind", 0, nil) { 245 | do(e) 246 | } 247 | if want := []string{"k1", "k5", "k4", "k2"}; !slices.Equal(keys, want) { 248 | t.Errorf("ScanAfter(0) = %v, want %v", keys, want) 249 | } 250 | 251 | // Range deletion. 252 | DeleteRange(db, b, "kind", []byte("k1z"), []byte("k33")) 253 | b.Apply() 254 | 255 | last = -1 256 | keys = nil 257 | for e := range Scan(db, "kind", nil, []byte("\xff")) { 258 | do(e) 259 | } 260 | if want := []string{"k1", "k4", "k5"}; !slices.Equal(keys, want) { 261 | t.Errorf("Scan() after DeleteRange = %v, want %v", keys, want) 262 | } 263 | 264 | last = 0 265 | keys = nil 266 | for e := range ScanAfter(db, "kind", 0, nil) { 267 | do(e) 268 | } 269 | if want := []string{"k1", "k5", "k4"}; !slices.Equal(keys, want) { 270 | t.Errorf("ScanAfter(0) after DeleteRange = %v, want %v", keys, want) 271 | } 272 | 273 | Set(db, b, "kind", []byte("k2"), []byte("v2")) 274 | b.Apply() 275 | } 276 | 277 | func TestLocking(t *testing.T) { 278 | db := storage.MemDB() 279 | b := db.Batch() 280 | Set(db, b, "kind", []byte("key"), []byte("val")) 281 | b.Apply() 282 | 283 | w := NewWatcher(db, "name", "kind", func(e *Entry) *Entry { return e }) 284 | callRecover := func() { recover() } 285 | 286 | w.lock() 287 | func() { 288 | defer callRecover() 289 | w.lock() 290 | t.Fatalf("second w.lock did not panic") 291 | }() 292 | 293 | w.unlock() 294 | func() { 295 | defer callRecover() 296 | w.unlock() 297 | t.Fatalf("second w.unlock did not panic") 298 | }() 299 | 300 | func() { 301 | defer callRecover() 302 | w.MarkOld(0) 303 | t.Fatalf("MarkOld outside iteration did not panic") 304 | }() 305 | 306 | did := false 307 | for _ = range w.Recent() { 308 | did = true 309 | func() { 310 | defer callRecover() 311 | w.Restart() 312 | t.Fatalf("Restart inside iteration did not panic") 313 | }() 314 | 315 | func() { 316 | defer callRecover() 317 | for _ = range w.Recent() { 318 | } 319 | t.Fatalf("iteration inside 
iteration did not panic") 320 | }() 321 | } 322 | if !did { 323 | t.Fatalf("range over Recent did not find any entries") 324 | } 325 | } 326 | 327 | func TestNow(t *testing.T) { 328 | t1 := now() 329 | for range 1000 { 330 | t2 := now() 331 | if t2 <= t1 { 332 | t.Errorf("now(), now() = %d, %d (out of order)", t1, t2) 333 | } 334 | t1 = t2 335 | } 336 | } 337 | -------------------------------------------------------------------------------- /internal/testdata/markdown3.httprr: -------------------------------------------------------------------------------- 1 | httprr trace v1 2 | 207 6198 3 | GET https://api.github.com/repos/rsc/markdown/issues?direction=asc&page=1&per_page=100&since=2024-06-04T02%3A57%3A22Z&sort=updated&state=all HTTP/1.1 4 | Host: api.github.com 5 | User-Agent: Go-http-client/1.1 6 | 7 | HTTP/2.0 200 OK 8 | Accept-Ranges: bytes 9 | Access-Control-Allow-Origin: * 10 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 11 | Cache-Control: public, max-age=60, s-maxage=60 12 | Content-Security-Policy: default-src 'none' 13 | Content-Type: application/json; charset=utf-8 14 | Date: Tue, 04 Jun 2024 12:28:22 GMT 15 | Etag: W/"97372b4a1a5b329a038fade04cd5934c5acca1f74d0515121101e2ce66b1ba4e" 16 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 17 | Server: GitHub.com 18 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 19 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 20 | X-Content-Type-Options: nosniff 21 | X-Frame-Options: deny 22 | X-Github-Api-Version-Selected: 2022-11-28 23 | X-Github-Media-Type: github.v3; format=json 24 | X-Github-Request-Id: DD96:493B9:15322F77:248A5191:665F0866 25 | X-Ratelimit-Limit: 60 26 | 
X-Ratelimit-Remaining: 59 27 | X-Ratelimit-Reset: 1717507702 28 | X-Ratelimit-Resource: core 29 | X-Ratelimit-Used: 1 30 | X-Xss-Protection: 0 31 | 32 | [{"url":"https://api.github.com/repos/rsc/markdown/issues/18","repository_url":"https://api.github.com/repos/rsc/markdown","labels_url":"https://api.github.com/repos/rsc/markdown/issues/18/labels{/name}","comments_url":"https://api.github.com/repos/rsc/markdown/issues/18/comments","events_url":"https://api.github.com/repos/rsc/markdown/issues/18/events","html_url":"https://github.com/rsc/markdown/pull/18","id":2276848742,"node_id":"PR_kwDOKnFwjc5ubgV0","number":18,"title":"markdown: emit Info in CodeBlock markdown","user":{"login":"juliaogris","id":1596871,"node_id":"MDQ6VXNlcjE1OTY4NzE=","avatar_url":"https://avatars.githubusercontent.com/u/1596871?v=4","gravatar_id":"","url":"https://api.github.com/users/juliaogris","html_url":"https://github.com/juliaogris","followers_url":"https://api.github.com/users/juliaogris/followers","following_url":"https://api.github.com/users/juliaogris/following{/other_user}","gists_url":"https://api.github.com/users/juliaogris/gists{/gist_id}","starred_url":"https://api.github.com/users/juliaogris/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/juliaogris/subscriptions","organizations_url":"https://api.github.com/users/juliaogris/orgs","repos_url":"https://api.github.com/users/juliaogris/repos","events_url":"https://api.github.com/users/juliaogris/events{/privacy}","received_events_url":"https://api.github.com/users/juliaogris/received_events","type":"User","site_admin":false},"labels":[],"state":"closed","locked":false,"assignee":null,"assignees":[],"milestone":null,"comments":2,"created_at":"2024-05-03T03:59:00Z","updated_at":"2024-06-04T02:57:22Z","closed_at":"2024-06-03T21:40:04Z","author_association":"CONTRIBUTOR","active_lock_reason":null,"draft":false,"pull_request":{"url":"https://api.github.com/repos/rsc/markdown/pulls/18","html_url":"https://github.
com/rsc/markdown/pull/18","diff_url":"https://github.com/rsc/markdown/pull/18.diff","patch_url":"https://github.com/rsc/markdown/pull/18.patch","merged_at":"2024-06-03T21:40:04Z"},"body":"Emit the Info field of CodeBlock in the CodeBlock.printMardown function so \r\nthat a round trip from markdown to markdown will preserve the language Info.","reactions":{"url":"https://api.github.com/repos/rsc/markdown/issues/18/reactions","total_count":1,"+1":1,"-1":0,"laugh":0,"hooray":0,"confused":0,"heart":0,"rocket":0,"eyes":0},"timeline_url":"https://api.github.com/repos/rsc/markdown/issues/18/timeline","performed_via_github_app":null,"state_reason":null},{"url":"https://api.github.com/repos/rsc/markdown/issues/19","repository_url":"https://api.github.com/repos/rsc/markdown","labels_url":"https://api.github.com/repos/rsc/markdown/issues/19/labels{/name}","comments_url":"https://api.github.com/repos/rsc/markdown/issues/19/comments","events_url":"https://api.github.com/repos/rsc/markdown/issues/19/events","html_url":"https://github.com/rsc/markdown/issues/19","id":2308816936,"node_id":"I_kwDOKnFwjc6JncAo","number":19,"title":"feature: synthesize lowercase anchors for 
heading","user":{"login":"adonovan","id":5658175,"node_id":"MDQ6VXNlcjU2NTgxNzU=","avatar_url":"https://avatars.githubusercontent.com/u/5658175?v=4","gravatar_id":"","url":"https://api.github.com/users/adonovan","html_url":"https://github.com/adonovan","followers_url":"https://api.github.com/users/adonovan/followers","following_url":"https://api.github.com/users/adonovan/following{/other_user}","gists_url":"https://api.github.com/users/adonovan/gists{/gist_id}","starred_url":"https://api.github.com/users/adonovan/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/adonovan/subscriptions","organizations_url":"https://api.github.com/users/adonovan/orgs","repos_url":"https://api.github.com/users/adonovan/repos","events_url":"https://api.github.com/users/adonovan/events{/privacy}","received_events_url":"https://api.github.com/users/adonovan/received_events","type":"User","site_admin":false},"labels":[],"state":"open","locked":false,"assignee":null,"assignees":[],"milestone":null,"comments":1,"created_at":"2024-05-21T17:56:12Z","updated_at":"2024-06-04T12:27:49Z","closed_at":null,"author_association":"NONE","active_lock_reason":null,"body":"GitHub's markdown renderer creates lowercase anchors for headings. For example, this heading, `## Diagnostic`, can be found using either of these two URLs, which differ in the case of their fragment ID:\r\n\r\nhttps://github.com/golang/tools/blob/master/gopls/doc/settings.md#diagnostic\r\nhttps://github.com/golang/tools/blob/master/gopls/doc/settings.md#Diagnostic\r\n\r\nPerhaps your markdown renderer (which has been really useful--thanks!) 
could do the same.\r\n","reactions":{"url":"https://api.github.com/repos/rsc/markdown/issues/19/reactions","total_count":0,"+1":0,"-1":0,"laugh":0,"hooray":0,"confused":0,"heart":0,"rocket":0,"eyes":0},"timeline_url":"https://api.github.com/repos/rsc/markdown/issues/19/timeline","performed_via_github_app":null,"state_reason":null}]193 2704 33 | GET https://api.github.com/repos/rsc/markdown/issues/comments?direction=asc&page=1&since=2024-06-04T02%3A57%3A21Z&sort=updated HTTP/1.1 34 | Host: api.github.com 35 | User-Agent: Go-http-client/1.1 36 | 37 | HTTP/2.0 200 OK 38 | Accept-Ranges: bytes 39 | Access-Control-Allow-Origin: * 40 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 41 | Cache-Control: public, max-age=60, s-maxage=60 42 | Content-Security-Policy: default-src 'none' 43 | Content-Type: application/json; charset=utf-8 44 | Date: Tue, 04 Jun 2024 12:28:22 GMT 45 | Etag: W/"1075157bc09a784524c573b03b8c28dabbce2697b7dc47f380a9ad6d44a8badf" 46 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 47 | Server: GitHub.com 48 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 49 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 50 | X-Content-Type-Options: nosniff 51 | X-Frame-Options: deny 52 | X-Github-Api-Version-Selected: 2022-11-28 53 | X-Github-Media-Type: github.v3; format=json 54 | X-Github-Request-Id: DD96:493B9:15323000:248A5264:665F0866 55 | X-Ratelimit-Limit: 60 56 | X-Ratelimit-Remaining: 58 57 | X-Ratelimit-Reset: 1717507702 58 | X-Ratelimit-Resource: core 59 | X-Ratelimit-Used: 2 60 | X-Xss-Protection: 0 61 | 62 | 
[{"url":"https://api.github.com/repos/rsc/markdown/issues/comments/2146475274","html_url":"https://github.com/rsc/markdown/pull/18#issuecomment-2146475274","issue_url":"https://api.github.com/repos/rsc/markdown/issues/18","id":2146475274,"node_id":"IC_kwDOKnFwjc5_8J0K","user":{"login":"rsc","id":104030,"node_id":"MDQ6VXNlcjEwNDAzMA==","avatar_url":"https://avatars.githubusercontent.com/u/104030?v=4","gravatar_id":"","url":"https://api.github.com/users/rsc","html_url":"https://github.com/rsc","followers_url":"https://api.github.com/users/rsc/followers","following_url":"https://api.github.com/users/rsc/following{/other_user}","gists_url":"https://api.github.com/users/rsc/gists{/gist_id}","starred_url":"https://api.github.com/users/rsc/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/rsc/subscriptions","organizations_url":"https://api.github.com/users/rsc/orgs","repos_url":"https://api.github.com/users/rsc/repos","events_url":"https://api.github.com/users/rsc/events{/privacy}","received_events_url":"https://api.github.com/users/rsc/received_events","type":"User","site_admin":false},"created_at":"2024-06-04T02:57:21Z","updated_at":"2024-06-04T02:57:21Z","author_association":"OWNER","body":"Thanks very much!\r\n","reactions":{"url":"https://api.github.com/repos/rsc/markdown/issues/comments/2146475274/reactions","total_count":0,"+1":0,"-1":0,"laugh":0,"hooray":0,"confused":0,"heart":0,"rocket":0,"eyes":0},"performed_via_github_app":null}]231 1235 63 | GET https://api.github.com/repos/rsc/markdown/issues/events?page=1&per_page=100 HTTP/1.1 64 | Host: api.github.com 65 | User-Agent: Go-http-client/1.1 66 | If-None-Match: W/"5f8cdae3e0a577c993191ba0140691c76a0df6b824580833fc3662906ef5aaf3" 67 | 68 | HTTP/2.0 304 Not Modified 69 | Access-Control-Allow-Origin: * 70 | Access-Control-Expose-Headers: ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, 
X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset 71 | Cache-Control: public, max-age=60, s-maxage=60 72 | Content-Encoding: gzip 73 | Content-Security-Policy: default-src 'none' 74 | Content-Type: application/json; charset=utf-8 75 | Date: Tue, 04 Jun 2024 12:28:23 GMT 76 | Etag: W/"5f8cdae3e0a577c993191ba0140691c76a0df6b824580833fc3662906ef5aaf3" 77 | Referrer-Policy: origin-when-cross-origin, strict-origin-when-cross-origin 78 | Server: GitHub.com 79 | Strict-Transport-Security: max-age=31536000; includeSubdomains; preload 80 | Vary: Accept, Accept-Encoding, Accept, X-Requested-With 81 | X-Content-Type-Options: nosniff 82 | X-Frame-Options: deny 83 | X-Github-Api-Version-Selected: 2022-11-28 84 | X-Github-Media-Type: github.v3; format=json 85 | X-Github-Request-Id: DD96:493B9:1532304C:248A52EE:665F0866 86 | X-Ratelimit-Limit: 60 87 | X-Ratelimit-Remaining: 57 88 | X-Ratelimit-Reset: 1717507702 89 | X-Ratelimit-Resource: core 90 | X-Ratelimit-Used: 3 91 | X-Xss-Protection: 0 92 | 93 | -------------------------------------------------------------------------------- /internal/github/sync_test.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
4 | 5 | package github 6 | 7 | import ( 8 | "bytes" 9 | "errors" 10 | "iter" 11 | "net/http" 12 | "os" 13 | "path/filepath" 14 | "slices" 15 | "strings" 16 | "testing" 17 | 18 | "rsc.io/gaby/internal/httprr" 19 | "rsc.io/gaby/internal/secret" 20 | "rsc.io/gaby/internal/storage" 21 | "rsc.io/gaby/internal/storage/timed" 22 | "rsc.io/gaby/internal/testutil" 23 | ) 24 | 25 | func githubAuth() (string, string) { 26 | data, err := os.ReadFile(filepath.Join(os.Getenv("HOME"), ".netrc")) 27 | if err != nil { 28 | return "", "" 29 | } 30 | for _, line := range strings.Split(string(data), "\n") { 31 | f := strings.Fields(line) 32 | if len(f) == 6 && f[0] == "machine" && f[1] == "api.github.com" && f[2] == "login" && f[4] == "password" { 33 | return f[3], f[5] 34 | } 35 | } 36 | return "", "" 37 | } 38 | 39 | func TestMarkdown(t *testing.T) { 40 | check := testutil.Checker(t) 41 | lg := testutil.Slogger(t) 42 | db := storage.MemDB() 43 | 44 | // Initial load. 45 | rr, err := httprr.Open("../testdata/markdown.httprr", http.DefaultTransport) 46 | check(err) 47 | rr.Scrub(Scrub) 48 | sdb := secret.Empty() 49 | if rr.Recording() { 50 | sdb = secret.Netrc() 51 | } 52 | c := New(lg, db, sdb, rr.Client()) 53 | check(c.Add("rsc/markdown")) 54 | check(c.Sync()) 55 | 56 | w := c.EventWatcher("test1") 57 | for e := range w.Recent() { 58 | w.MarkOld(e.DBTime) 59 | } 60 | 61 | // Incremental update. 62 | rr, err = httprr.Open("../testdata/markdown2.httprr", http.DefaultTransport) 63 | check(err) 64 | rr.Scrub(Scrub) 65 | sdb = secret.Empty() 66 | if rr.Recording() { 67 | sdb = secret.Netrc() 68 | } 69 | c = New(lg, db, sdb, rr.Client()) 70 | check(c.Sync()) 71 | 72 | // Test that EventWatcher sees the updates. 73 | diffEvents(t, 74 | collectEventsAfter(t, 0, c.EventWatcher("test1").Recent()), 75 | markdownNewEvents) 76 | 77 | // Test that without MarkOld, Recent leaves the cursor where it was. 
78 | diffEvents(t, 79 | collectEventsAfter(t, 0, c.EventWatcher("test1").Recent()), 80 | markdownNewEvents) 81 | 82 | // Incremental update. 83 | rr, err = httprr.Open("../testdata/markdown3.httprr", http.DefaultTransport) 84 | check(err) 85 | rr.Scrub(Scrub) 86 | sdb = secret.Empty() 87 | if rr.Recording() { 88 | sdb = secret.Netrc() 89 | } 90 | c = New(lg, db, sdb, rr.Client()) 91 | check(c.Sync()) 92 | 93 | testMarkdownEvents(t, c) 94 | } 95 | 96 | func TestMarkdownIncrementalSync(t *testing.T) { 97 | check := testutil.Checker(t) 98 | lg := testutil.Slogger(t) 99 | db := storage.MemDB() 100 | 101 | // Initial load. 102 | rr, err := httprr.Open("../testdata/markdowninc.httprr", http.DefaultTransport) 103 | check(err) 104 | rr.Scrub(Scrub) 105 | sdb := secret.Empty() 106 | if rr.Recording() { 107 | sdb = secret.Netrc() 108 | } 109 | c := New(lg, db, sdb, rr.Client()) 110 | check(c.Add("rsc/markdown")) 111 | 112 | testFullSyncStop = errors.New("stop for testing") 113 | defer func() { 114 | testFullSyncStop = nil 115 | }() 116 | for { 117 | err := c.Sync() 118 | if err == nil { 119 | break 120 | } 121 | if !errors.Is(err, testFullSyncStop) { 122 | t.Fatal(err) 123 | } 124 | } 125 | 126 | testMarkdownEvents(t, c) 127 | } 128 | 129 | func testMarkdownEvents(t *testing.T, c *Client) { 130 | // All the events should be present in order. 131 | have := collectEvents(c.Events("rsc/markdown", -1, -1)) 132 | diffEvents(t, have, markdownEvents) 133 | 134 | // Again with an early break. 135 | have = have[:0] 136 | for e := range c.Events("rsc/markdown", -1, 100) { 137 | have = append(have, o(e.Project, e.Issue, e.API, e.ID)) 138 | if len(have) == len(markdownEvents)/2 { 139 | break 140 | } 141 | } 142 | diffEvents(t, have, markdownEvents[:len(markdownEvents)/2]) 143 | 144 | // Again with a different project. 
145 | for _ = range c.Events("fauxlang/faux", -1, 100) { 146 | t.Errorf("EventsAfter: project filter failed") 147 | } 148 | 149 | // The EventsByTime list should not have any duplicates, even though 150 | // the incremental sync revisited some issues. 151 | have = collectEventsAfter(t, 0, c.EventsAfter(0, "")) 152 | diffEvents(t, have, markdownEvents) 153 | 154 | // Again with an early break. 155 | have = have[:0] 156 | for e := range c.EventsAfter(0, "") { 157 | have = append(have, o(e.Project, e.Issue, e.API, e.ID)) 158 | if len(have) == len(markdownEarlyEvents) { 159 | break 160 | } 161 | } 162 | diffEvents(t, have, markdownEarlyEvents) 163 | 164 | // Again with a different project. 165 | for _ = range c.EventsAfter(0, "fauxlang/faux") { 166 | t.Errorf("EventsAfter: project filter failed") 167 | } 168 | } 169 | 170 | func diffEvents(t *testing.T, have, want [][]byte) { 171 | t.Helper() 172 | for _, key := range have { 173 | for len(want) > 0 && bytes.Compare(want[0], key) < 0 { 174 | t.Errorf("Events: missing %s", storage.Fmt(want[0])) 175 | want = want[1:] 176 | } 177 | if len(want) > 0 && bytes.Equal(key, want[0]) { 178 | want = want[1:] 179 | continue 180 | } 181 | t.Errorf("Events: unexpected %s", storage.Fmt(key)) 182 | } 183 | for len(want) > 0 { 184 | t.Errorf("Events: missing %s", storage.Fmt(want[0])) 185 | want = want[1:] 186 | } 187 | } 188 | 189 | func collectEvents(seq iter.Seq[*Event]) [][]byte { 190 | var keys [][]byte 191 | for e := range seq { 192 | keys = append(keys, o(e.Project, e.Issue, e.API, e.ID)) 193 | } 194 | return keys 195 | } 196 | 197 | func collectEventsAfter(t *testing.T, dbtime timed.DBTime, seq iter.Seq[*Event]) [][]byte { 198 | var keys [][]byte 199 | for e := range seq { 200 | if e.DBTime <= dbtime { 201 | // TODO(rsc): t.Helper probably doesn't apply here but should. 
202 | t.Errorf("EventsSince: DBTime inversion: e.DBTime %d <= last %d", e.DBTime, dbtime) 203 | } 204 | dbtime = e.DBTime 205 | keys = append(keys, o(e.Project, e.Issue, e.API, e.ID)) 206 | } 207 | slices.SortFunc(keys, bytes.Compare) 208 | return keys 209 | } 210 | 211 | func TestIvy(t *testing.T) { 212 | check := testutil.Checker(t) 213 | lg := testutil.Slogger(t) 214 | db := storage.MemDB() 215 | rr, err := httprr.Open("../testdata/ivy.httprr", http.DefaultTransport) 216 | check(err) 217 | rr.Scrub(Scrub) 218 | sdb := secret.Empty() 219 | if rr.Recording() { 220 | sdb = secret.Netrc() 221 | } 222 | c := New(lg, db, sdb, rr.Client()) 223 | check(c.Add("robpike/ivy")) 224 | check(c.Sync()) 225 | } 226 | 227 | func TestOmap(t *testing.T) { 228 | check := testutil.Checker(t) 229 | lg := testutil.Slogger(t) 230 | db := storage.MemDB() 231 | rr, err := httprr.Open("../testdata/omap.httprr", http.DefaultTransport) 232 | check(err) 233 | rr.Scrub(Scrub) 234 | sdb := secret.Empty() 235 | if rr.Recording() { 236 | sdb = secret.Netrc() 237 | } 238 | c := New(lg, db, sdb, rr.Client()) 239 | check(c.Add("rsc/omap")) 240 | check(c.Sync()) 241 | } 242 | 243 | var markdownEarlyEvents = [][]byte{ 244 | o("rsc/markdown", 3, "/issues", 2038510799), 245 | o("rsc/markdown", 2, "/issues", 2038502414), 246 | o("rsc/markdown", 4, "/issues", 2038521730), 247 | o("rsc/markdown", 1, "/issues", 2038380363), 248 | o("rsc/markdown", 6, "/issues", 2038573328), 249 | } 250 | 251 | var markdownNewEvents = [][]byte{ 252 | o("rsc/markdown", 16, "/issues", 2189605425), 253 | o("rsc/markdown", 16, "/issues/comments", 2146194902), 254 | o("rsc/markdown", 16, "/issues/events", 13027435265), 255 | o("rsc/markdown", 17, "/issues", 2189605911), 256 | o("rsc/markdown", 17, "/issues/comments", 2146194573), 257 | o("rsc/markdown", 17, "/issues/comments", 2146421109), 258 | o("rsc/markdown", 17, "/issues/events", 13027432818), 259 | o("rsc/markdown", 17, "/issues/events", 13028910699), 260 | 
o("rsc/markdown", 17, "/issues/events", 13028910702), 261 | o("rsc/markdown", 18, "/issues", 2276848742), 262 | o("rsc/markdown", 18, "/issues/comments", 2097019306), 263 | o("rsc/markdown", 18, "/issues/comments", 2146475274), 264 | o("rsc/markdown", 18, "/issues/events", 13027289256), 265 | o("rsc/markdown", 18, "/issues/events", 13027289270), 266 | o("rsc/markdown", 18, "/issues/events", 13027289466), 267 | o("rsc/markdown", 19, "/issues", 2308816936), 268 | o("rsc/markdown", 19, "/issues/comments", 2146197528), 269 | } 270 | 271 | var markdownEvents = [][]byte{ 272 | o("rsc/markdown", 1, "/issues", 2038380363), 273 | o("rsc/markdown", 1, "/issues/events", 11230676272), 274 | o("rsc/markdown", 2, "/issues", 2038502414), 275 | o("rsc/markdown", 2, "/issues/events", 11230676151), 276 | o("rsc/markdown", 3, "/issues", 2038510799), 277 | o("rsc/markdown", 3, "/issues/comments", 1852808662), 278 | o("rsc/markdown", 3, "/issues/events", 11228615168), 279 | o("rsc/markdown", 3, "/issues/events", 11228628324), 280 | o("rsc/markdown", 3, "/issues/events", 11230676181), 281 | o("rsc/markdown", 4, "/issues", 2038521730), 282 | o("rsc/markdown", 4, "/issues/events", 11230676170), 283 | o("rsc/markdown", 5, "/issues", 2038530418), 284 | o("rsc/markdown", 5, "/issues/comments", 1852919031), 285 | o("rsc/markdown", 5, "/issues/comments", 1854409176), 286 | o("rsc/markdown", 5, "/issues/events", 11230676200), 287 | o("rsc/markdown", 5, "/issues/events", 11239005964), 288 | o("rsc/markdown", 6, "/issues", 2038573328), 289 | o("rsc/markdown", 6, "/issues/events", 11230676238), 290 | o("rsc/markdown", 7, "/issues", 2040197050), 291 | o("rsc/markdown", 7, "/issues/events", 11241620840), 292 | o("rsc/markdown", 8, "/issues", 2040277497), 293 | o("rsc/markdown", 8, "/issues/comments", 1854835554), 294 | o("rsc/markdown", 8, "/issues/comments", 1854837832), 295 | o("rsc/markdown", 8, "/issues/comments", 1856133592), 296 | o("rsc/markdown", 8, "/issues/comments", 1856151124), 297 | 
o("rsc/markdown", 8, "/issues/events", 11250194227), 298 | o("rsc/markdown", 9, "/issues", 2040303458), 299 | o("rsc/markdown", 9, "/issues/events", 11241620809), 300 | o("rsc/markdown", 10, "/issues", 2076625629), 301 | o("rsc/markdown", 10, "/issues/comments", 1894927765), 302 | o("rsc/markdown", 10, "/issues/events", 11456466988), 303 | o("rsc/markdown", 10, "/issues/events", 11506360992), 304 | o("rsc/markdown", 11, "/issues", 2076798270), 305 | o("rsc/markdown", 11, "/issues/comments", 1894929190), 306 | o("rsc/markdown", 11, "/issues/events", 11506369300), 307 | o("rsc/markdown", 12, "/issues", 2137605063), 308 | o("rsc/markdown", 12, "/issues/events", 11822212932), 309 | o("rsc/markdown", 12, "/issues/events", 11942808811), 310 | o("rsc/markdown", 12, "/issues/events", 11942812866), 311 | o("rsc/markdown", 12, "/issues/events", 12028957331), 312 | o("rsc/markdown", 12, "/issues/events", 12028957356), 313 | o("rsc/markdown", 12, "/issues/events", 12028957676), 314 | o("rsc/markdown", 13, "/issues", 2182527101), 315 | o("rsc/markdown", 13, "/issues/events", 12122378461), 316 | o("rsc/markdown", 14, "/issues", 2182534654), 317 | o("rsc/markdown", 14, "/issues/events", 12122340938), 318 | o("rsc/markdown", 14, "/issues/events", 12122495521), 319 | o("rsc/markdown", 14, "/issues/events", 12122495545), 320 | o("rsc/markdown", 14, "/issues/events", 12122501258), 321 | o("rsc/markdown", 14, "/issues/events", 12122508555), 322 | o("rsc/markdown", 15, "/issues", 2187046263), 323 | o("rsc/markdown", 16, "/issues", 2189605425), 324 | o("rsc/markdown", 16, "/issues/comments", 2146194902), 325 | o("rsc/markdown", 16, "/issues/events", 13027435265), 326 | o("rsc/markdown", 17, "/issues", 2189605911), 327 | o("rsc/markdown", 17, "/issues/comments", 2146194573), 328 | o("rsc/markdown", 17, "/issues/comments", 2146421109), 329 | o("rsc/markdown", 17, "/issues/events", 12137686933), 330 | o("rsc/markdown", 17, "/issues/events", 12137688071), 331 | o("rsc/markdown", 17, 
"/issues/events", 13027432818), 332 | o("rsc/markdown", 17, "/issues/events", 13028910699), 333 | o("rsc/markdown", 17, "/issues/events", 13028910702), 334 | o("rsc/markdown", 18, "/issues", 2276848742), 335 | o("rsc/markdown", 18, "/issues/comments", 2097019306), 336 | o("rsc/markdown", 18, "/issues/comments", 2146475274), 337 | o("rsc/markdown", 18, "/issues/events", 12721108829), 338 | o("rsc/markdown", 18, "/issues/events", 13027289256), 339 | o("rsc/markdown", 18, "/issues/events", 13027289270), 340 | o("rsc/markdown", 18, "/issues/events", 13027289466), 341 | o("rsc/markdown", 19, "/issues", 2308816936), 342 | o("rsc/markdown", 19, "/issues/comments", 2146197528), 343 | } 344 | -------------------------------------------------------------------------------- /internal/commentfix/fix.go: -------------------------------------------------------------------------------- 1 | // Copyright 2024 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // Package commentfix implements rule-based rewriting of issue comments. 6 | package commentfix 7 | 8 | import ( 9 | "fmt" 10 | "io" 11 | "log/slog" 12 | "os" 13 | "reflect" 14 | "regexp" 15 | "strings" 16 | "testing" 17 | "time" 18 | 19 | "rsc.io/gaby/internal/diff" 20 | "rsc.io/gaby/internal/github" 21 | "rsc.io/gaby/internal/storage/timed" 22 | "rsc.io/markdown" 23 | ) 24 | 25 | // A Fixer rewrites issue texts and issue comments using a set of rules. 26 | // After creating a fixer with [New], new rules can be added using 27 | // the [Fixer.AutoLink], [Fixer.ReplaceText], and [Fixer.ReplaceURL] methods, 28 | // and then repeated calls to [Fixer.Run] apply the replacements on GitHub. 29 | // 30 | // The zero value of a Fixer can be used in “offline” mode with [Fixer.Fix], 31 | // which returns rewritten Markdown. 32 | // 33 | // TODO(rsc): Separate the GitHub logic more cleanly from the rewrite logic. 
34 | type Fixer struct { 35 | slog *slog.Logger 36 | github *github.Client 37 | watcher *timed.Watcher[*github.Event] 38 | fixes []func(any, int) any 39 | projects map[string]bool 40 | edit bool 41 | timeLimit time.Time 42 | 43 | stderrw io.Writer 44 | } 45 | 46 | func (f *Fixer) stderr() io.Writer { 47 | if f.stderrw != nil { 48 | return f.stderrw 49 | } 50 | return os.Stderr 51 | } 52 | 53 | func (f *Fixer) SetStderr(w io.Writer) { 54 | f.stderrw = w 55 | } 56 | 57 | // New creates a new Fixer using the given logger and GitHub client. 58 | // 59 | // The Fixer logs status and errors to lg; if lg is nil, the Fixer does not log anything. 60 | // 61 | // The GitHub client is used to watch for new issues and comments 62 | // and to edit issues and comments. If gh is nil, the Fixer can still be 63 | // configured and applied to Markdown using [Fixer.Fix], but calling 64 | // [Fixer.Run] will panic. 65 | // 66 | // The name is the handle by which the Fixer's “last position” is retrieved 67 | // across multiple program invocations; each differently configured 68 | // Fixer needs a different name. 69 | func New(lg *slog.Logger, gh *github.Client, name string) *Fixer { 70 | f := &Fixer{ 71 | slog: lg, 72 | github: gh, 73 | projects: make(map[string]bool), 74 | timeLimit: time.Now().Add(-30 * 24 * time.Hour), 75 | } 76 | f.init() // set f.slog if lg==nil 77 | if gh != nil { 78 | f.watcher = gh.EventWatcher("commentfix.Fixer:" + name) 79 | } 80 | return f 81 | } 82 | 83 | // SetTimeLimit sets the time before which comments are not edited. 84 | func (f *Fixer) SetTimeLimit(limit time.Time) { 85 | f.timeLimit = limit 86 | } 87 | 88 | // init makes sure slog is non-nil. 
89 | func (f *Fixer) init() { 90 | if f.slog == nil { 91 | f.slog = slog.New(slog.NewTextHandler(io.Discard, nil)) 92 | } 93 | } 94 | 95 | func (f *Fixer) EnableProject(name string) { 96 | f.init() 97 | if f.github == nil { 98 | panic("commentfix.Fixer: EnableProject missing GitHub client") 99 | } 100 | f.projects[name] = true 101 | } 102 | 103 | // EnableEdits configures the fixer to make edits to comments on GitHub. 104 | // If EnableEdits is not called, the Fixer only prints what it would do, 105 | // and it does not mark the issues and comments as “old”. 106 | // This default mode is useful for experimenting with a Fixer 107 | // to gauge its effects. 108 | // 109 | // EnableEdits panics if the Fixer was not constructed by calling [New] 110 | // with a non-nil [github.Client]. 111 | func (f *Fixer) EnableEdits() { 112 | f.init() 113 | if f.github == nil { 114 | panic("commentfix.Fixer: EnableEdits missing GitHub client") 115 | } 116 | f.edit = true 117 | } 118 | 119 | // AutoLink instructs the fixer to turn any text matching the 120 | // regular expression pattern into a link to the URL. 121 | // The URL can contain substitution values like $1 122 | // as supported by [regexp.Regexp.Expand]. 
123 | // 124 | // For example, to link CL nnn to https://go.dev/cl/nnn, 125 | // you could use: 126 | // 127 | // f.AutoLink(`\bCL (\d+)\b`, "https://go.dev/cl/$1") 128 | func (f *Fixer) AutoLink(pattern, url string) error { 129 | f.init() 130 | re, err := regexp.Compile(pattern) 131 | if err != nil { 132 | return err 133 | } 134 | f.fixes = append(f.fixes, func(x any, flags int) any { 135 | if flags&flagLink != 0 { 136 | // already inside link 137 | return nil 138 | } 139 | plain, ok := x.(*markdown.Plain) 140 | if !ok { 141 | return nil 142 | } 143 | var out []markdown.Inline 144 | start := 0 145 | text := plain.Text 146 | for _, m := range re.FindAllStringSubmatchIndex(text, -1) { 147 | if start < m[0] { 148 | out = append(out, &markdown.Plain{Text: text[start:m[0]]}) 149 | } 150 | link := string(re.ExpandString(nil, url, text, m)) 151 | out = append(out, &markdown.Link{ 152 | Inner: []markdown.Inline{&markdown.Plain{Text: text[m[0]:m[1]]}}, 153 | URL: link, 154 | }) 155 | start = m[1] 156 | } 157 | if start == 0 { 158 | return nil 159 | } 160 | out = append(out, &markdown.Plain{Text: text[start:]}) 161 | return out 162 | }) 163 | return nil 164 | } 165 | 166 | // ReplaceText instructs the fixer to replace any text 167 | // matching the regular expression pattern with the replacement repl. 168 | // The replacement can contain substitution values like $1 169 | // as supported by [regexp.Regexp.Expand]. 170 | // 171 | // ReplaceText only applies in Markdown plain text. 172 | // It does not apply in backticked code text, or in backticked 173 | // or indented code blocks, or to URLs. 174 | // It does apply to the plain text inside headings, 175 | // inside bold, italic, or link markup. 
176 | // 177 | // For example, you could correct “cancelled” to “canceled”, 178 | // following Go's usual conventions, with: 179 | // 180 | // f.ReplaceText(`cancelled`, "canceled") 181 | func (f *Fixer) ReplaceText(pattern, repl string) error { 182 | f.init() 183 | re, err := regexp.Compile(pattern) 184 | if err != nil { 185 | return err 186 | } 187 | f.fixes = append(f.fixes, func(x any, flags int) any { 188 | plain, ok := x.(*markdown.Plain) 189 | if !ok { 190 | return nil 191 | } 192 | if re.FindStringSubmatchIndex(plain.Text) == nil { 193 | return nil 194 | } 195 | plain.Text = re.ReplaceAllString(plain.Text, repl) 196 | return plain 197 | }) 198 | return nil 199 | } 200 | 201 | // ReplaceURL instructs the fixer to replace any linked URLs 202 | // matching the regular expression pattern with the replacement URL repl. 203 | // The replacement can contain substitution values like $1 204 | // as supported by [regexp.Regexp.Expand]. 205 | // 206 | // The regular expression pattern is automatically anchored 207 | // to the start of the URL: there is no need to start it with \A or ^. 
208 | // 209 | // For example, to replace links to golang.org with links to go.dev, 210 | // you could use: 211 | // 212 | // f.ReplaceURL(`https://golang\.org(/?)`, "https://go.dev$1") 213 | func (f *Fixer) ReplaceURL(pattern, repl string) error { 214 | f.init() 215 | re, err := regexp.Compile(`\A(?:` + pattern + `)`) 216 | if err != nil { 217 | return err 218 | } 219 | f.fixes = append(f.fixes, func(x any, flags int) any { 220 | switch x := x.(type) { 221 | case *markdown.AutoLink: 222 | old := x.URL 223 | x.URL = re.ReplaceAllString(x.URL, repl) 224 | if x.URL == old { 225 | return nil 226 | } 227 | if x.Text == old { 228 | x.Text = x.URL 229 | } 230 | return x 231 | case *markdown.Link: 232 | old := x.URL 233 | x.URL = re.ReplaceAllString(x.URL, repl) 234 | if x.URL == old { 235 | return nil 236 | } 237 | if len(x.Inner) == 1 { 238 | if p, ok := x.Inner[0].(*markdown.Plain); ok && p.Text == old { 239 | p.Text = x.URL 240 | } 241 | } 242 | return x 243 | } 244 | return nil 245 | }) 246 | return nil 247 | } 248 | 249 | // Run applies the configured rewrites to issue texts and comments on GitHub 250 | // that have been updated since the last call to Run for this fixer with edits enabled 251 | // (including in different program invocations using the same fixer name). 252 | // Run ignores issues texts and comments more than 30 days old. 253 | // 254 | // Run prints diffs of its edits to standard error in addition to logging them, 255 | // because slog logs the diffs as single-line Go quoted strings that are 256 | // too difficult to skim. 257 | // 258 | // If [Fixer.EnableEdits] has not been called, Run processes recent issue texts 259 | // and comments and prints diffs of its intended edits to standard error, 260 | // but it does not make the changes. It also does not mark the issues and comments as processed, 261 | // so that a future call to Run with edits enabled can rewrite them on GitHub. 262 | // 263 | // Run sleeps for 1 second after each GitHub edit. 
264 | // 265 | // Run panics if the Fixer was not constructed by calling [New] 266 | // with a non-nil [github.Client]. 267 | func (f *Fixer) Run() { 268 | if f.watcher == nil { 269 | panic("commentfix.Fixer: Run missing GitHub client") 270 | } 271 | for e := range f.watcher.Recent() { 272 | if !f.projects[e.Project] { 273 | continue 274 | } 275 | var ic *issueOrComment 276 | switch x := e.Typed.(type) { 277 | default: 278 | continue 279 | case *github.Issue: 280 | if x.PullRequest != nil { 281 | // Do not edit pull request bodies, 282 | // because they turn into commit messages 283 | // and cannot contain things like hyperlinks. 284 | continue 285 | } 286 | ic = &issueOrComment{issue: x} 287 | case *github.IssueComment: 288 | ic = &issueOrComment{comment: x} 289 | } 290 | if tm, err := time.Parse(time.RFC3339, ic.updatedAt()); err == nil && tm.Before(f.timeLimit) { 291 | if f.edit { 292 | f.watcher.MarkOld(e.DBTime) 293 | } 294 | continue 295 | } 296 | body, updated := f.Fix(ic.body()) 297 | if !updated { 298 | continue 299 | } 300 | live, err := ic.download(f.github) 301 | if err != nil { 302 | // unreachable unless github error 303 | f.slog.Error("commentfix download error", "project", e.Project, "issue", e.Issue, "url", ic.url(), "err", err) 304 | continue 305 | } 306 | if live.body() != ic.body() { 307 | f.slog.Info("commentfix stale", "project", e.Project, "issue", e.Issue, "url", ic.url()) 308 | continue 309 | } 310 | f.slog.Info("commentfix rewrite", "project", e.Project, "issue", e.Issue, "url", ic.url(), "edit", f.edit, "diff", bodyDiff(ic.body(), body)) 311 | fmt.Fprintf(f.stderr(), "Fix %s:\n%s\n", ic.url(), bodyDiff(ic.body(), body)) 312 | if f.edit { 313 | f.slog.Info("commentfix editing github", "url", ic.url()) 314 | if err := ic.editBody(f.github, body); err != nil { 315 | // unreachable unless github error 316 | f.slog.Error("commentfix edit", "project", e.Project, "issue", e.Issue, "err", err) 317 | continue 318 | } 319 | 
f.watcher.MarkOld(e.DBTime) 320 | f.watcher.Flush() 321 | if !testing.Testing() { 322 | // unreachable in tests 323 | time.Sleep(1 * time.Second) 324 | } 325 | } 326 | } 327 | } 328 | 329 | type issueOrComment struct { 330 | issue *github.Issue 331 | comment *github.IssueComment 332 | } 333 | 334 | func (ic *issueOrComment) updatedAt() string { 335 | if ic.issue != nil { 336 | return ic.issue.UpdatedAt 337 | } 338 | return ic.comment.UpdatedAt 339 | } 340 | 341 | func (ic *issueOrComment) body() string { 342 | if ic.issue != nil { 343 | return ic.issue.Body 344 | } 345 | return ic.comment.Body 346 | } 347 | 348 | func (ic *issueOrComment) download(gh *github.Client) (*issueOrComment, error) { 349 | if ic.issue != nil { 350 | live, err := gh.DownloadIssue(ic.issue.URL) 351 | return &issueOrComment{issue: live}, err 352 | } 353 | live, err := gh.DownloadIssueComment(ic.comment.URL) 354 | return &issueOrComment{comment: live}, err 355 | } 356 | 357 | func (ic *issueOrComment) url() string { 358 | if ic.issue != nil { 359 | return ic.issue.URL 360 | } 361 | return ic.comment.URL 362 | } 363 | 364 | func (ic *issueOrComment) editBody(gh *github.Client, body string) error { 365 | if ic.issue != nil { 366 | return gh.EditIssue(ic.issue, &github.IssueChanges{Body: body}) 367 | } 368 | return gh.EditIssueComment(ic.comment, &github.IssueCommentChanges{Body: body}) 369 | } 370 | 371 | // Fix applies the configured rewrites to the markdown text. 372 | // If no fixes apply, it returns "", false. 373 | // If any fixes apply, it returns the updated text and true. 
374 | func (f *Fixer) Fix(text string) (newText string, fixed bool) { 375 | p := &markdown.Parser{ 376 | AutoLinkText: true, 377 | Strikethrough: true, 378 | HeadingIDs: true, 379 | Emoji: true, 380 | } 381 | doc := p.Parse(text) 382 | for _, fixer := range f.fixes { 383 | if f.fixOne(fixer, doc) { 384 | fixed = true 385 | } 386 | } 387 | if !fixed { 388 | return "", false 389 | } 390 | return markdown.ToMarkdown(doc), true 391 | } 392 | 393 | const ( 394 | // flagLink means this inline is link text, 395 | // so it is inappropriate/impossible to turn 396 | // it into a (nested) hyperlink. 397 | flagLink = 1 << iota 398 | ) 399 | 400 | // fixOne runs one fix function over doc, 401 | // reporting whether doc was changed. 402 | func (f *Fixer) fixOne(fix func(any, int) any, doc *markdown.Document) (fixed bool) { 403 | var ( 404 | fixBlock func(markdown.Block) 405 | fixInlines func(*[]markdown.Inline) 406 | ) 407 | fixBlock = func(x markdown.Block) { 408 | switch x := x.(type) { 409 | case *markdown.Document: 410 | for _, sub := range x.Blocks { 411 | fixBlock(sub) 412 | } 413 | case *markdown.Quote: 414 | for _, sub := range x.Blocks { 415 | fixBlock(sub) 416 | } 417 | case *markdown.List: 418 | for _, sub := range x.Items { 419 | fixBlock(sub) 420 | } 421 | case *markdown.Item: 422 | for _, sub := range x.Blocks { 423 | fixBlock(sub) 424 | } 425 | case *markdown.Heading: 426 | fixBlock(x.Text) 427 | case *markdown.Paragraph: 428 | fixBlock(x.Text) 429 | case *markdown.Text: 430 | fixInlines(&x.Inline) 431 | } 432 | } 433 | 434 | link := 0 435 | fixInlines = func(inlines *[]markdown.Inline) { 436 | changed := false 437 | var out []markdown.Inline 438 | for _, x := range *inlines { 439 | switch x := x.(type) { 440 | case *markdown.Del: 441 | fixInlines(&x.Inner) 442 | case *markdown.Emph: 443 | fixInlines(&x.Inner) 444 | case *markdown.Strong: 445 | fixInlines(&x.Inner) 446 | case *markdown.Link: 447 | link++ 448 | fixInlines(&x.Inner) 449 | link-- 450 | } 451 | flags 
:= 0 452 | if link > 0 { 453 | flags = flagLink 454 | } 455 | switch fx := fix(x, flags).(type) { 456 | default: 457 | // unreachable unless bug in fix func 458 | f.slog.Error("fixer returned invalid type", "old", reflect.TypeOf(x).String(), "new", reflect.TypeOf(fx).String()) 459 | out = append(out, x) 460 | case nil: 461 | out = append(out, x) 462 | case markdown.Inline: 463 | changed = true 464 | out = append(out, fx) 465 | case []markdown.Inline: 466 | changed = true 467 | out = append(out, fx...) 468 | } 469 | } 470 | if changed { 471 | *inlines = out 472 | fixed = true 473 | } 474 | } 475 | 476 | fixBlock(doc) 477 | return fixed 478 | } 479 | 480 | func bodyDiff(old, new string) string { 481 | old = strings.TrimRight(old, "\n") + "\n" 482 | old = strings.ReplaceAll(old, "\r\n", "\n") 483 | 484 | new = strings.TrimRight(new, "\n") + "\n" 485 | new = strings.ReplaceAll(new, "\r\n", "\n") 486 | 487 | return string(diff.Diff("old", []byte(old), "new", []byte(new))) 488 | } 489 | --------------------------------------------------------------------------------