├── .github └── workflows │ ├── ci.yml │ └── go.yml ├── .gitignore ├── LICENSE ├── README.md ├── TODO.md ├── assets └── gopher-eating.svg ├── chew.go ├── chew_test.go ├── cmd └── chew │ └── wrapper.go ├── codecov.yml ├── docs ├── golang.md ├── python.md ├── ruby.md └── setup.md ├── examples ├── main.go ├── main.py ├── main.rb └── transcription │ ├── google.go │ └── whisper.go ├── go.mod ├── go.sum ├── internal ├── audio │ ├── flac.go │ ├── flac_test.go │ ├── mp3.go │ ├── mp3_test.go │ ├── processor.go │ ├── processor_test.go │ ├── types.go │ ├── wav.go │ └── wav_test.go ├── common │ └── types.go ├── document │ ├── docx.go │ ├── docx_test.go │ ├── epub.go │ ├── epub_test.go │ ├── pdf.go │ ├── pdf_test.go │ ├── pptx.go │ └── pptx_test.go ├── text │ ├── csv.go │ ├── csv_test.go │ ├── html.go │ ├── html_test.go │ ├── json.go │ ├── json_test.go │ ├── markdown.go │ ├── plaintext.go │ ├── plaintext_test.go │ ├── xml.go │ ├── xml_test.go │ ├── yaml.go │ └── yaml_test.go ├── transcribe │ ├── google_transcriber.go │ ├── google_transcriber_test.go │ ├── transcribe.go │ ├── transcribe_test.go │ ├── types.go │ ├── whisper.go │ └── whisper_test.go └── utils │ ├── gcs │ ├── gcs_utils.go │ └── gcs_utils_test.go │ ├── utils.go │ └── utils_test.go └── testdata ├── audio ├── test.flac ├── test.mp3 ├── test.ogg └── test.wav └── files ├── test.epub └── test.pdf /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Workflow for Codecov 2 | on: [push, pull_request] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Set up Go 8 | uses: actions/setup-go@v5 9 | with: 10 | go-version: '1.23' 11 | id: go 12 | 13 | - name: Check out code into the Go module directory 14 | uses: actions/checkout@v4 15 | 16 | - name: Get dependencies 17 | run: | 18 | go get -v -t -d ./... 
19 | if [ -f Gopkg.toml ]; then 20 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 21 | dep ensure 22 | fi 23 | 24 | - name: Generate coverage report 25 | run: | 26 | go test `go list ./... | grep -v -E 'docs|cmd|examples'` -coverprofile=coverage.txt -covermode=atomic 27 | 28 | - name: Upload coverage to Codecov 29 | uses: codecov/codecov-action@v4 30 | with: 31 | verbose: true 32 | env: 33 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | jobs: 10 | 11 | test: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Set up Go 17 | uses: actions/setup-go@v4 18 | with: 19 | go-version: '1.22' 20 | 21 | - name: Test 22 | run: go test -v -race ./... -cover -covermode=atomic 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | audio 3 | *.json 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Daniel M. 
Matongo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | chew logo 7 | 8 | [![Go Report Card](https://goreportcard.com/badge/github.com/mmatongo/chew)](https://goreportcard.com/report/github.com/mmatongo/chew) 9 | [![GoDoc](https://godoc.org/github.com/mmatongo/chew?status.svg)](https://pkg.go.dev/github.com/mmatongo/chew) 10 | [![Maintainability](https://api.codeclimate.com/v1/badges/441cfd36f310c0c48878/maintainability)](https://codeclimate.com/github/mmatongo/chew/maintainability) 11 | [![codecov](https://codecov.io/github/mmatongo/chew/graph/badge.svg?token=6OOK91QQRC)](https://codecov.io/github/mmatongo/chew) 12 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE) 13 |
14 | 15 | >

A Go library for processing various content types into markdown/plaintext.

16 | 17 | ## About 18 | 19 | *Chew* is a Go library that processes various content types into markdown or plaintext. It supports multiple content types, including HTML, PDF, CSV, JSON, YAML, DOCX, PPTX, Markdown, Plaintext, MP3, FLAC, and WAVE. 20 | 21 | ## Installation 22 | 23 | ```bash 24 | go get github.com/mmatongo/chew 25 | ``` 26 | 27 | ## Usage 28 | 29 | Here's a basic example of how to use Chew: 30 | 31 | ```go 32 | package main 33 | 34 | import ( 35 | "context" 36 | "fmt" 37 | "log" 38 | "time" 39 | 40 | "github.com/mmatongo/chew/v1" 41 | ) 42 | 43 | func main() { 44 | urls := []string{ 45 | "https://example.com", 46 | } 47 | 48 | config := chew.Config{ 49 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)", 50 | RetryLimit: 3, 51 | RetryDelay: 5 * time.Second, 52 | CrawlDelay: 10 * time.Second, 53 | ProxyList: []string{}, // Add your proxies here, or leave empty 54 | RateLimit: 2 * time.Second, 55 | RateBurst: 3, 56 | IgnoreRobotsTxt: false, 57 | } 58 | 59 | haChew := chew.New(config) 60 | 61 | // The context is optional, but can be used to cancel the operation after a certain time 62 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 63 | defer cancel() 64 | 65 | chunks, err := haChew.Process(ctx, urls) 66 | if err != nil { 67 | if err == context.DeadlineExceeded { 68 | log.Println("Operation timed out") 69 | } else { 70 | log.Printf("Error processing URLs: %v", err) 71 | } 72 | return 73 | } 74 | 75 | for _, chunk := range chunks { 76 | fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content) 77 | } 78 | } 79 | ``` 80 | 81 | Output 82 | 83 | ```bash 84 | Source: https://example.com 85 | Content: Example Domain 86 | 87 | Source: https://example.com 88 | Content: This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission. 89 | 90 | Source: https://example.com 91 | Content: More information... 
92 | ``` 93 | 94 | You can find more examples in the [examples](./examples) directory as well as instructions on how to use Chew with Ruby and Python. 95 | 96 | ## Contributing 97 | 98 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have any suggestions or improvements. 99 | 100 | ## License 101 | 102 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details. 103 | 104 | ### Logo 105 | 106 | The [logo](https://github.com/MariaLetta/free-gophers-pack) was made by the amazing [MariaLetta](https://github.com/MariaLetta). 107 | 108 | 109 | ### Similar Projects 110 | [docconv](https://github.com/sajari/docconv) 111 | 112 | ### Roadmap 113 | The roadmap for this project is available [here](./TODO.md). It's meant more as a guide than a strict plan because I only work on this project in my free time. 114 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 2 | ### TODO 3 | --- 4 | 5 | - [x] Add tests 6 | - [ ] Improve error handling 7 | - [x] Add support for more content types 8 | - [x] Implement rate limiting for URL fetching 9 | - [x] Use a free PDF processing library 10 | - [x] How to handle text/plain content type 11 | - [x] Add transcription support 12 | - [x] Customisable user agent 13 | - [ ] Allow users to choose what to target in the HTML, e.g. body, title, etc. 14 | - [ ] More examples, documentation and use cases 15 | - [ ] Improve PPTX and DOCX processing (currently using a hacky method I cobbled together from various sources) 16 | - [ ] Use a common interface for all content types 17 | -------------------------------------------------------------------------------- /chew.go: -------------------------------------------------------------------------------- 1 | /* 2 | Package chew provides a simple way to process URLs and files. 
It allows you to process a list of URLs 3 | and files, and returns the content of the URLs and files as a list of Chunks. It also provides a way to 4 | transcribe audio files using the Google Cloud Speech-to-Text API or the OpenAI Whisper API. 5 | 6 | The library respects rules defined in robots.txt file and crawl delays, and allows you to set a custom http.Client for making requests. 7 | 8 | Note on Responsible Usage: 9 | 10 | This library is designed for processing data from both local files and web sources. Users should be aware of the following considerations: 11 | 12 | 1. Web Scraping: 13 | - When scraping websites, ensure compliance with the target website's terms of service and robots.txt rules. 14 | - Respect rate limits and crawl delays to avoid overwhelming target servers. 15 | - Be aware that web scraping may be subject to legal restrictions in some jurisdictions. 16 | - While the library will attempt to respect robots.txt rules by default, users are responsible for ensuring 17 | that their usage complies with the target website's terms of service and legal requirements. 18 | 19 | 2. File Processing: 20 | - Exercise caution when processing files from untrusted sources. 21 | - Ensure you have appropriate permissions to access and process the files. 22 | - Be mindful of potential sensitive information in processed files and handle it securely. 23 | 24 | 3. Data Handling: 25 | - Properly secure and manage any data extracted or processed using this library, especially if it contains personal or sensitive information. 26 | - Comply with relevant data protection regulations (e.g., GDPR, CCPA) when handling personal data. 27 | 28 | 4. System Resource Usage: 29 | - Be aware that processing large files or numerous web pages can be resource-intensive. Monitor and manage system resources accordingly. 30 | 31 | 5. 
Have Fun 32 | 33 | Users of this library are responsible for ensuring their usage complies with applicable laws, regulations, and ethical considerations in their jurisdiction and context of use. 34 | */ 35 | package chew 36 | 37 | import ( 38 | "context" 39 | "fmt" 40 | "io" 41 | "net/http" 42 | "net/url" 43 | "strings" 44 | "sync" 45 | "time" 46 | 47 | "github.com/mmatongo/chew/v1/internal/common" 48 | "github.com/mmatongo/chew/v1/internal/document" 49 | "github.com/mmatongo/chew/v1/internal/text" 50 | "github.com/mmatongo/chew/v1/internal/transcribe" 51 | "github.com/mmatongo/chew/v1/internal/utils" 52 | "github.com/temoto/robotstxt" 53 | "golang.org/x/time/rate" 54 | ) 55 | 56 | const ( 57 | contentTypeHTML = "text/html" 58 | contentTypeText = "text/plain" 59 | contentTypeXML = "application/xml" 60 | contentTypeTextXML = "text/xml" 61 | contentTypePDF = "application/pdf" 62 | contentTypeCSV = "text/csv" 63 | contentTypeJSON = "application/json" 64 | contentTypeYAML = "application/x-yaml" 65 | contentTypeMarkdown = "text/markdown" 66 | contentTypeEPUB = "application/epub+zip" 67 | contentTypeDocx = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 68 | contentTypePptx = "application/vnd.openxmlformats-officedocument.presentationml.presentation" 69 | ) 70 | 71 | var contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){ 72 | contentTypeHTML: text.ProcessHTML, 73 | contentTypeCSV: text.ProcessCSV, 74 | contentTypeJSON: text.ProcessJSON, 75 | contentTypeYAML: text.ProcessYAML, 76 | contentTypeMarkdown: text.ProcessText, 77 | contentTypeText: text.ProcessText, 78 | contentTypeXML: text.ProcessXML, 79 | contentTypeTextXML: text.ProcessXML, 80 | contentTypeDocx: document.ProcessDocx, 81 | contentTypePptx: document.ProcessPptx, 82 | contentTypePDF: document.ProcessPDF, 83 | contentTypeEPUB: document.ProcessEpub, 84 | } 85 | 86 | type Chew struct { 87 | config common.Config 88 | httpClient *http.Client 89 | rateLimiter 
RateLimiter 90 | rateLimiterMu sync.RWMutex 91 | robotsCache map[string]*robotstxt.RobotsData 92 | robotsMu sync.RWMutex 93 | lastAccess map[string]time.Time 94 | lastAccessMu sync.Mutex 95 | proxyIndex int 96 | proxyMu sync.Mutex 97 | } 98 | 99 | type RateLimiter interface { 100 | Wait(context.Context) error 101 | } 102 | 103 | func (c *Chew) SetRateLimiter(rl RateLimiter) { 104 | c.rateLimiterMu.Lock() 105 | defer c.rateLimiterMu.Unlock() 106 | c.rateLimiter = rl 107 | } 108 | 109 | /* 110 | New creates a Chew instance with the given configuration options for URL processing. It takes a Config struct. 111 | 112 | Usage: 113 | 114 | config := chew.Config{ 115 | UserAgent: "MyBot/1.0 (+https://example.com/bot)", 116 | RetryLimit: 3, 117 | RetryDelay: 5 * time.Second, 118 | CrawlDelay: 10 * time.Second, 119 | ProxyList: []string{"http://proxy1.com", "http://proxy2.com"}, 120 | RateLimit: 2 * time.Second, 121 | RateBurst: 3, 122 | IgnoreRobotsTxt: false, 123 | } 124 | 125 | c := chew.New(config) 126 | */ 127 | func New(config common.Config) *Chew { 128 | c := &Chew{ 129 | config: config, 130 | robotsCache: make(map[string]*robotstxt.RobotsData), 131 | lastAccess: make(map[string]time.Time), 132 | } 133 | c.initHTTPClient() 134 | 135 | limit := rate.Every(config.RateLimit) 136 | c.rateLimiter = rate.NewLimiter(limit, config.RateBurst) 137 | 138 | return c 139 | } 140 | 141 | /* 142 | Transcribe is a function that transcribes audio files using either the Google Cloud Speech-to-Text API 143 | or the Whisper API. It handles uploading the audio file to Google Cloud Storage if necessary, 144 | manages the transcription process, and returns the resulting transcript. 145 | 146 | For detailed usage instructions, see the TranscribeOptions struct documentation. 147 | */ 148 | var Transcribe = transcribe.Transcribe 149 | 150 | /* 151 | The TranscribeOptions struct contains the options for transcribing an audio file. 
It allows the user 152 | to specify the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code 153 | to use for transcription, an option to enable diarization (the process of separating and labeling 154 | speakers in an audio stream) including the min and max speakers and 155 | an option to clean up the audio file from Google Cloud Storage (GCS) after transcription is complete. 156 | 157 | It also allows the user to specify whether to use the Whisper API for transcription, and if so, 158 | the API key, model, and prompt to use. 159 | 160 | Usage: 161 | 162 | opts := chew.TranscribeOptions{ 163 | CredentialsJSON: []byte("..."), 164 | Bucket: "my-bucket", 165 | LanguageCode: "en-US", 166 | EnableDiarization: true, 167 | MinSpeakers: 2, 168 | MaxSpeakers: 4, 169 | CleanupOnComplete: true, 170 | UseWhisper: true, // You can only have one of these enabled, by default it uses the Google Cloud Speech-to-Text API 171 | WhisperAPIKey: "my-whisper-api-key", 172 | WhisperModel: "whisper-1", 173 | } 174 | */ 175 | type TranscribeOptions = transcribe.TranscribeOptions 176 | 177 | /* 178 | Config struct contains the configuration options for URL processing. 
179 | 180 | Fields: 181 | - UserAgent: The user agent string to use for requests (e.g., "MyBot/1.0 (+https://example.com/bot)") 182 | - RetryLimit: Number of retries to attempt in case of failure (e.g., 3) 183 | - RetryDelay: Delay between retries (e.g., 5 * time.Second) 184 | - CrawlDelay: Delay between requests to the same domain (e.g., 10 * time.Second) 185 | - ProxyList: List of proxy URLs to use for requests (e.g., []string{"http://proxy1.com", "http://proxy2.com"}) 186 | - RateLimit: Minimum interval between requests; New passes it to rate.Every (e.g., 2 * time.Second) 187 | - RateBurst: Maximum burst size for rate limiting (e.g., 3) 188 | - IgnoreRobotsTxt: Whether to ignore robots.txt rules (e.g., false) 189 | 190 | Usage: 191 | 192 | config := chew.Config{ 193 | UserAgent: "MyBot/1.0 (+https://example.com/bot)", 194 | RetryLimit: 3, 195 | RetryDelay: 5 * time.Second, 196 | CrawlDelay: 10 * time.Second, 197 | ProxyList: []string{"http://proxy1.com", "http://proxy2.com"}, 198 | RateLimit: 2 * time.Second, 199 | RateBurst: 3, 200 | IgnoreRobotsTxt: false, 201 | } 202 | */ 203 | type Config = common.Config 204 | 205 | /* 206 | This is meant as a fallback in case the content type is not recognized and to enforce 207 | the content type based on the file extension instead of the content type 208 | returned by the server. i.e. if the server returns text/plain but the file is a markdown file 209 | the content types are the biggest culprits of this 210 | */ 211 | var validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){ 212 | ".md": text.ProcessText, 213 | ".csv": text.ProcessCSV, 214 | ".json": text.ProcessJSON, 215 | ".yaml": text.ProcessYAML, 216 | ".html": text.ProcessHTML, 217 | ".epub": document.ProcessEpub, 218 | } 219 | 220 | /* 221 | SetHTTPClient allows you to set a custom http.Client to use for making requests. 222 | 223 | This would be useful in the event custom logging, tracing, or other functionality is 224 | required for the requests made by the library. 
225 | 226 | Usage: 227 | 228 | client := &http.Client{ 229 | Transport: loggingRoundTripper{wrapped: http.DefaultTransport}, 230 | } 231 | 232 | chew.SetHTTPClient(client) 233 | */ 234 | 235 | func (c *Chew) SetHTTPClient(client *http.Client) { 236 | c.httpClient = client 237 | } 238 | 239 | func (c *Chew) initHTTPClient() { 240 | transport := &http.Transport{ 241 | Proxy: c.getProxy, 242 | } 243 | c.httpClient = &http.Client{ 244 | Timeout: 30 * time.Second, 245 | Transport: transport, 246 | } 247 | } 248 | 249 | /* 250 | For content types that can also return text/plain as their content types we need to manually check 251 | their extension to properly process them. I feel like this could be done better but this is my solution for now. 252 | */ 253 | func getProcessor(contentType, url string) (func(io.Reader, string) ([]common.Chunk, error), error) { 254 | for key, proc := range contentTypeProcessors { 255 | if strings.Contains(contentType, key) { 256 | return proc, nil 257 | } 258 | } 259 | 260 | ext, err := utils.GetFileExtension(url) 261 | if err != nil { 262 | return nil, fmt.Errorf("couldn't get file extension from url %s: %s", url, err) 263 | } 264 | 265 | if proc, ok := validExtensions[ext]; ok { 266 | return proc, nil 267 | } 268 | 269 | return nil, fmt.Errorf("unsupported content type: %s", contentType) 270 | } 271 | 272 | /* 273 | Process takes a list of URLs and returns a list of Chunks 274 | 275 | The slice of strings to be processed can be URLs or file paths 276 | The context is optional and can be used to cancel the processing 277 | of the URLs after a certain amount of time 278 | 279 | This function is safe for concurrent use. 
280 | 281 | Usage: 282 | 283 | chunks, err := chew.Process([]string{"https://example.com", "file://path/to/file.txt"}) 284 | if err != nil { 285 | log.Fatalf("Error processing URLs: %v", err) 286 | } 287 | 288 | for _, chunk := range chunks { 289 | log.Printf("Chunk: %s\n Source: %s\n", chunk.Content, chunk.Source) 290 | } 291 | */ 292 | func (c *Chew) Process(ctx context.Context, urls []string) ([]common.Chunk, error) { 293 | var ( 294 | result []common.Chunk 295 | mu sync.Mutex 296 | errCh = make(chan error, len(urls)) 297 | resCh = make(chan []common.Chunk, len(urls)) 298 | ) 299 | 300 | for _, url := range urls { 301 | go func(url string) { 302 | select { 303 | case <-ctx.Done(): 304 | errCh <- ctx.Err() 305 | return 306 | default: 307 | c.rateLimiterMu.RLock() 308 | rateLimiter := c.rateLimiter 309 | c.rateLimiterMu.RUnlock() 310 | 311 | if err := rateLimiter.Wait(ctx); err != nil { 312 | errCh <- fmt.Errorf("rate limit exceeded for %s: %w", url, err) 313 | return 314 | } 315 | 316 | if !c.config.IgnoreRobotsTxt { 317 | allowed, crawlDelay, err := c.getRobotsTxtInfo(url) 318 | if err != nil { 319 | errCh <- fmt.Errorf("checking robots.txt for %s: %w", url, err) 320 | return 321 | } 322 | if !allowed { 323 | errCh <- fmt.Errorf("access to %s is disallowed by robots.txt", url) 324 | return 325 | } 326 | if err := c.respectCrawlDelay(ctx, url, crawlDelay); err != nil { 327 | errCh <- fmt.Errorf("respecting crawl delay for %s: %w", url, err) 328 | return 329 | } 330 | } 331 | 332 | chunks, err := c.processWithRetry(ctx, url) 333 | if err != nil { 334 | errCh <- fmt.Errorf("processing %s: %w", url, err) 335 | return 336 | } 337 | 338 | resCh <- chunks 339 | } 340 | }(url) 341 | } 342 | 343 | for i := 0; i < len(urls); i++ { 344 | select { 345 | case <-ctx.Done(): 346 | return nil, ctx.Err() 347 | case err := <-errCh: 348 | return nil, err 349 | case chunks := <-resCh: 350 | mu.Lock() 351 | result = append(result, chunks...) 
352 | mu.Unlock() 353 | } 354 | } 355 | 356 | return result, nil 357 | } 358 | 359 | /* 360 | processURL handles the actual processing of a single URL or file 361 | file paths are processed directly while URLs are fetched and processed 362 | */ 363 | func (c *Chew) processURL(ctx context.Context, url string) ([]common.Chunk, error) { 364 | // if the url is a file path we can just open the file and process it directly 365 | if filePath, found := strings.CutPrefix(url, "file://"); found { 366 | file, err := utils.OpenFile(filePath) 367 | if err != nil { 368 | return nil, fmt.Errorf("opening file: %w", err) 369 | } 370 | defer file.Close() 371 | 372 | ext, _ := utils.GetFileExtension(filePath) 373 | /* 374 | Will leave this in here for now, but I think it's better to just check the file extension 375 | instead of the content type returned. 376 | */ 377 | contentType := utils.GetFileContentType(file) 378 | 379 | proc, err := getProcessor(contentType, filePath) 380 | if err != nil { 381 | proc, ok := validExtensions[ext] 382 | if !ok { 383 | return nil, fmt.Errorf("unsupported file type: %s", ext) 384 | } 385 | return proc(file, url) 386 | } 387 | 388 | return proc(file, url) 389 | } 390 | 391 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) 392 | if err != nil { 393 | return nil, fmt.Errorf("creating request: %w", err) 394 | } 395 | 396 | req.Header.Set("User-Agent", c.config.UserAgent) 397 | 398 | resp, err := c.httpClient.Do(req) 399 | if err != nil { 400 | return nil, fmt.Errorf("making request: %w", err) 401 | } 402 | defer resp.Body.Close() 403 | 404 | contentType := resp.Header.Get("Content-Type") 405 | 406 | processor, err := getProcessor(contentType, url) 407 | if err != nil { 408 | return nil, err 409 | } 410 | 411 | return processor(resp.Body, url) 412 | } 413 | 414 | func (c *Chew) getRobotsTxtInfo(urlStr string) (bool, time.Duration, error) { 415 | parsedURL, err := url.Parse(urlStr) 416 | if err != nil { 417 | return false, 0, err 
418 | } 419 | 420 | robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host) 421 | 422 | c.robotsMu.RLock() 423 | robotsData, exists := c.robotsCache[robotsURL] 424 | c.robotsMu.RUnlock() 425 | 426 | if !exists { 427 | resp, err := http.Get(robotsURL) 428 | if err != nil { 429 | return true, c.config.CrawlDelay, nil 430 | } 431 | defer resp.Body.Close() 432 | 433 | robotsData, err = robotstxt.FromResponse(resp) 434 | if err != nil { 435 | return true, c.config.CrawlDelay, nil 436 | } 437 | 438 | c.robotsMu.Lock() 439 | c.robotsCache[robotsURL] = robotsData 440 | c.robotsMu.Unlock() 441 | } 442 | 443 | allowed := robotsData.TestAgent(parsedURL.Path, c.config.UserAgent) 444 | 445 | return allowed, c.config.CrawlDelay, nil 446 | } 447 | 448 | // respectCrawlDelay ensures that subsequent requests to the same domain respect the specified crawl delay. 449 | func (c *Chew) respectCrawlDelay(ctx context.Context, urlStr string, delay time.Duration) error { 450 | parsedURL, err := url.Parse(urlStr) 451 | if err != nil { 452 | return err 453 | } 454 | 455 | domain := parsedURL.Hostname() 456 | 457 | c.lastAccessMu.Lock() 458 | lastAccess, exists := c.lastAccess[domain] 459 | if exists { 460 | timeToWait := time.Until(lastAccess.Add(delay)) 461 | if timeToWait > 0 { 462 | c.lastAccessMu.Unlock() 463 | select { 464 | case <-time.After(timeToWait): 465 | case <-ctx.Done(): 466 | return ctx.Err() 467 | } 468 | c.lastAccessMu.Lock() 469 | } 470 | } 471 | 472 | c.lastAccess[domain] = time.Now() 473 | c.lastAccessMu.Unlock() 474 | return nil 475 | } 476 | 477 | func (c *Chew) processWithRetry(ctx context.Context, url string) ([]common.Chunk, error) { 478 | var ( 479 | chunks []common.Chunk 480 | err error 481 | ) 482 | 483 | var retries int 484 | for { 485 | chunks, err = c.processURL(ctx, url) 486 | if err == nil { 487 | return chunks, nil 488 | } 489 | if retries > c.config.RetryLimit { 490 | break 491 | } 492 | retries++ 493 | c.wait(ctx, 
c.config.RetryDelay) 494 | } 495 | 496 | return nil, err 497 | } 498 | 499 | func (c *Chew) wait(ctx context.Context, d time.Duration) { 500 | select { 501 | case <-time.After(d): 502 | case <-ctx.Done(): 503 | } 504 | } 505 | 506 | func (c *Chew) getProxy(req *http.Request) (*url.URL, error) { 507 | c.proxyMu.Lock() 508 | defer c.proxyMu.Unlock() 509 | 510 | if len(c.config.ProxyList) == 0 { 511 | return nil, nil 512 | } 513 | 514 | proxyURL, err := url.Parse(c.config.ProxyList[c.proxyIndex]) 515 | if err != nil { 516 | return nil, err 517 | } 518 | 519 | c.proxyIndex = (c.proxyIndex + 1) % len(c.config.ProxyList) 520 | return proxyURL, nil 521 | } 522 | -------------------------------------------------------------------------------- /chew_test.go: -------------------------------------------------------------------------------- 1 | package chew 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "io" 7 | "net/http" 8 | "net/http/httptest" 9 | "net/url" 10 | "os" 11 | "path/filepath" 12 | "reflect" 13 | "strings" 14 | "testing" 15 | "time" 16 | 17 | "github.com/mmatongo/chew/v1/internal/common" 18 | "github.com/mmatongo/chew/v1/internal/text" 19 | "golang.org/x/time/rate" 20 | ) 21 | 22 | func mockProcessor(r io.Reader, url string) ([]common.Chunk, error) { 23 | content, err := io.ReadAll(r) 24 | if err != nil { 25 | return nil, err 26 | } 27 | return []common.Chunk{{Content: string(content), Source: url}}, nil 28 | } 29 | 30 | type mockTransport struct { 31 | response *http.Response 32 | err error 33 | } 34 | 35 | func (m *mockTransport) RoundTrip(*http.Request) (*http.Response, error) { 36 | return m.response, m.err 37 | } 38 | 39 | type mockRateLimiter struct { 40 | waitErr error 41 | } 42 | 43 | func (m *mockRateLimiter) Wait(ctx context.Context) error { 44 | return m.waitErr 45 | } 46 | 47 | func Test_processURL(t *testing.T) { 48 | originalHTTPClient := http.DefaultClient 49 | originalContentTypeProcessors := contentTypeProcessors 50 | originalValidExtensions := 
validExtensions 51 | 52 | defer func() { 53 | http.DefaultClient = originalHTTPClient 54 | contentTypeProcessors = originalContentTypeProcessors 55 | validExtensions = originalValidExtensions 56 | }() 57 | 58 | mockClient := &http.Client{ 59 | Transport: &mockTransport{ 60 | response: &http.Response{ 61 | StatusCode: 200, 62 | Body: io.NopCloser(strings.NewReader("Test content")), 63 | Header: http.Header{"Content-Type": []string{"text/html"}}, 64 | }, 65 | }, 66 | } 67 | chew := New(Config{}) 68 | ctx := context.Background() 69 | 70 | chew.SetHTTPClient(mockClient) 71 | defer chew.SetHTTPClient(nil) 72 | 73 | contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){ 74 | "text/html": mockProcessor, 75 | "text/plain": mockProcessor, 76 | } 77 | validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){ 78 | ".html": mockProcessor, 79 | ".txt": mockProcessor, 80 | } 81 | 82 | tempDir := t.TempDir() 83 | testHTMLPath := filepath.Join(tempDir, "test.html") 84 | testTXTPath := filepath.Join(tempDir, "test.txt") 85 | testUnsupportedPath := filepath.Join(tempDir, "test.unsupported") 86 | 87 | err := os.WriteFile(testHTMLPath, []byte("html content"), 0644) 88 | if err != nil { 89 | t.Fatalf("failed to create test html file: %v", err) 90 | } 91 | 92 | err = os.WriteFile(testTXTPath, []byte("text content"), 0644) 93 | if err != nil { 94 | t.Fatalf("failed to create test text file: %v", err) 95 | } 96 | 97 | err = os.WriteFile(testUnsupportedPath, []byte("unsupported content"), 0644) 98 | if err != nil { 99 | t.Fatalf("failed to create test unsupported file: %v", err) 100 | } 101 | 102 | tests := []struct { 103 | name string 104 | url string 105 | want []common.Chunk 106 | wantErr bool 107 | }{ 108 | { 109 | name: "success", 110 | url: "https://example.com/page.html", 111 | want: []common.Chunk{{Content: "Test content", Source: "https://example.com/page.html"}}, 112 | wantErr: false, 113 | }, 114 | { 115 | name: "success html", 
116 | url: "file://" + testHTMLPath, 117 | want: []common.Chunk{{Content: "html content", Source: "file://" + testHTMLPath}}, 118 | wantErr: false, 119 | }, 120 | { 121 | name: "success txt", 122 | url: "file://" + testTXTPath, 123 | want: []common.Chunk{{Content: "text content", Source: "file://" + testTXTPath}}, 124 | wantErr: false, 125 | }, 126 | { 127 | name: "unsupported file type", 128 | url: "file://" + testUnsupportedPath, 129 | want: nil, 130 | wantErr: true, 131 | }, 132 | { 133 | name: "non-existent file", 134 | url: "file:///non-existent.md", 135 | want: nil, 136 | wantErr: true, 137 | }, 138 | } 139 | 140 | for _, tt := range tests { 141 | t.Run(tt.name, func(t *testing.T) { 142 | got, err := chew.processURL(ctx, tt.url) 143 | if (err != nil) != tt.wantErr { 144 | t.Errorf("processURL() error = %v, wantErr %v", err, tt.wantErr) 145 | return 146 | } 147 | if !reflect.DeepEqual(got, tt.want) { 148 | t.Errorf("processURL() = %v, want %v", got, tt.want) 149 | } 150 | }) 151 | } 152 | } 153 | 154 | func Test_getProcessor(t *testing.T) { 155 | type args struct { 156 | contentType string 157 | url string 158 | } 159 | tests := []struct { 160 | name string 161 | args args 162 | want func(io.Reader, string) ([]common.Chunk, error) 163 | wantErr bool 164 | }{ 165 | { 166 | name: "success", 167 | args: args{ 168 | contentType: "text/html", 169 | url: "https://example.com/page.html", 170 | }, 171 | want: mockProcessor, 172 | wantErr: false, 173 | }, 174 | { 175 | name: "unknown content type", 176 | args: args{ 177 | contentType: "octet/stream", 178 | url: "https://example.com/page.html", 179 | }, 180 | want: text.ProcessHTML, 181 | wantErr: false, 182 | }, 183 | { 184 | name: "unsupported content type", 185 | args: args{ 186 | contentType: "application/octet-stream", 187 | url: "https://example.com/page.htt", 188 | }, 189 | want: nil, 190 | wantErr: true, 191 | }, 192 | { 193 | name: "no extension", 194 | args: args{ 195 | contentType: "octet/stream", 196 | url: 
"https://example.com/page", 197 | }, 198 | want: nil, 199 | wantErr: true, 200 | }, 201 | } 202 | for _, tt := range tests { 203 | t.Run(tt.name, func(t *testing.T) { 204 | got, err := getProcessor(tt.args.contentType, tt.args.url) 205 | if (err != nil) != tt.wantErr { 206 | t.Errorf("getProcessor() error = %v, wantErr %v", err, tt.wantErr) 207 | return 208 | } 209 | 210 | if got == nil && tt.want != nil { 211 | t.Errorf("getProcessor() returned nil, want non-nil") 212 | } else if got != nil && tt.want == nil { 213 | t.Errorf("getProcessor() returned non-nil, want nil") 214 | } else if got != nil { 215 | gotType := reflect.TypeOf(got) 216 | wantType := reflect.TypeOf(tt.want) 217 | if gotType != wantType { 218 | t.Errorf("getProcessor() returned function of type %v, want %v", gotType, wantType) 219 | } 220 | } 221 | }) 222 | } 223 | } 224 | 225 | func TestProcess(t *testing.T) { 226 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 227 | switch r.URL.Path { 228 | case "/robots.txt": 229 | w.Header().Set("Content-Type", "text/plain") 230 | w.Write([]byte("User-agent: *\nDisallow: /disallowed\nCrawl-delay: 1")) 231 | case "/text": 232 | w.Header().Set("Content-Type", "text/plain") 233 | w.Write([]byte("A plain text file.")) 234 | case "/html": 235 | w.Header().Set("Content-Type", "text/html") 236 | w.Write([]byte("

An HTML file.

")) 237 | case "/markdown": 238 | w.Header().Set("Content-Type", "text/plain") 239 | w.Write([]byte("# A Markdown file")) 240 | case "/disallowed": 241 | w.Header().Set("Content-Type", "text/plain") 242 | w.Write([]byte("This page is disallowed by robots.txt")) 243 | case "/rate-limited": 244 | time.Sleep(2 * time.Second) 245 | w.Write([]byte("Rate limited content")) 246 | } 247 | })) 248 | defer server.Close() 249 | 250 | containsChunk := func(chunks []common.Chunk, chunk common.Chunk) bool { 251 | for _, c := range chunks { 252 | if c.Content == chunk.Content && c.Source == chunk.Source { 253 | return true 254 | } 255 | } 256 | return false 257 | } 258 | 259 | chew := New(Config{ 260 | IgnoreRobotsTxt: false, 261 | UserAgent: "TestBot/1.0", 262 | RetryLimit: 3, 263 | RetryDelay: 100 * time.Millisecond, 264 | CrawlDelay: 1 * time.Second, 265 | RateLimit: 500 * time.Millisecond, 266 | RateBurst: 1, 267 | }) 268 | 269 | chew.httpClient.Timeout = 5 * time.Second 270 | 271 | type args struct { 272 | urls []string 273 | ctxs []context.Context 274 | } 275 | tests := []struct { 276 | name string 277 | args args 278 | want []common.Chunk 279 | wantErr bool 280 | expectedErrText string 281 | ignoreRobotsTxt bool 282 | orderIndependent bool 283 | rateLimiter RateLimiter 284 | }{ 285 | { 286 | name: "plain text", 287 | args: args{ 288 | urls: []string{server.URL + "/text"}, 289 | }, 290 | want: []common.Chunk{ 291 | {Content: "A plain text file.", Source: server.URL + "/text"}, 292 | }, 293 | wantErr: false, 294 | }, 295 | { 296 | name: "HTML", 297 | args: args{ 298 | urls: []string{server.URL + "/html"}, 299 | }, 300 | want: []common.Chunk{ 301 | {Content: "An HTML file.", Source: server.URL + "/html"}, 302 | }, 303 | wantErr: false, 304 | }, 305 | { 306 | name: "markdown", 307 | args: args{ 308 | urls: []string{server.URL + "/markdown"}, 309 | }, 310 | want: []common.Chunk{ 311 | {Content: "# A Markdown file", Source: server.URL + "/markdown"}, 312 | }, 313 | wantErr: 
false, 314 | }, 315 | { 316 | name: "multiple URLs", 317 | args: args{ 318 | urls: []string{server.URL + "/text", server.URL + "/html"}, 319 | }, 320 | want: []common.Chunk{ 321 | {Content: "An HTML file.", Source: server.URL + "/html"}, 322 | {Content: "A plain text file.", Source: server.URL + "/text"}, 323 | }, 324 | wantErr: false, 325 | orderIndependent: true, 326 | }, 327 | { 328 | name: "invalid URL", 329 | args: args{ 330 | urls: []string{"ftp://invalid.url"}, 331 | }, 332 | want: nil, 333 | wantErr: true, 334 | }, 335 | { 336 | name: "context cancellation", 337 | args: args{ 338 | urls: []string{server.URL + "/text"}, 339 | ctxs: []context.Context{func() context.Context { 340 | ctx, cancel := context.WithCancel(context.Background()) 341 | go func() { 342 | time.Sleep(50 * time.Millisecond) 343 | cancel() 344 | }() 345 | return ctx 346 | }()}, 347 | }, 348 | want: nil, 349 | wantErr: true, 350 | expectedErrText: "context canceled", 351 | }, 352 | { 353 | name: "with more than one context", 354 | args: args{ 355 | urls: []string{server.URL + "/text"}, 356 | ctxs: []context.Context{context.Background(), context.Background()}, 357 | }, 358 | want: []common.Chunk{{Content: "A plain text file.", Source: server.URL + "/text"}}, 359 | wantErr: false, 360 | }, 361 | { 362 | name: "respects robots.txt", 363 | args: args{ 364 | urls: []string{server.URL + "/disallowed"}, 365 | }, 366 | want: nil, 367 | wantErr: true, 368 | }, 369 | { 370 | name: "ignores robots.txt when configured", 371 | args: args{ 372 | urls: []string{server.URL + "/disallowed"}, 373 | }, 374 | want: []common.Chunk{ 375 | {Content: "This page is disallowed by robots.txt", Source: server.URL + "/disallowed"}, 376 | }, 377 | wantErr: false, 378 | ignoreRobotsTxt: true, 379 | }, 380 | { 381 | name: "robots.txt disallowed", 382 | args: args{ 383 | urls: []string{server.URL + "/disallowed"}, 384 | }, 385 | want: nil, 386 | wantErr: true, 387 | expectedErrText: "access to", 388 | }, 389 | { 390 | name: 
"respects crawl delay", 391 | args: args{ 392 | urls: []string{server.URL + "/text", server.URL + "/html"}, 393 | }, 394 | want: []common.Chunk{ 395 | {Content: "A plain text file.", Source: server.URL + "/text"}, 396 | {Content: "An HTML file.", Source: server.URL + "/html"}, 397 | }, 398 | wantErr: false, 399 | orderIndependent: true, 400 | }, 401 | { 402 | name: "rate limiting error", 403 | args: args{ 404 | urls: []string{server.URL + "/rate-limited", server.URL + "/rate-limited"}, 405 | }, 406 | want: nil, 407 | wantErr: true, 408 | expectedErrText: "rate limit exceeded", 409 | rateLimiter: &mockRateLimiter{waitErr: fmt.Errorf("rate limit exceeded")}, 410 | }, 411 | { 412 | name: "crawl delay respect error", 413 | args: args{ 414 | urls: []string{server.URL + "/text", server.URL + "/text"}, 415 | ctxs: []context.Context{func() context.Context { 416 | ctx, cancel := context.WithTimeout(context.Background(), 1500*time.Millisecond) 417 | defer cancel() 418 | return ctx 419 | }()}, 420 | }, 421 | want: nil, 422 | wantErr: true, 423 | expectedErrText: "context canceled", 424 | }, 425 | } 426 | 427 | for _, tt := range tests { 428 | t.Run(tt.name, func(t *testing.T) { 429 | oldState := chew.config.IgnoreRobotsTxt 430 | chew.config.IgnoreRobotsTxt = tt.ignoreRobotsTxt 431 | defer func() { chew.config.IgnoreRobotsTxt = oldState }() 432 | 433 | if tt.rateLimiter != nil { 434 | chew.SetRateLimiter(tt.rateLimiter) 435 | } else { 436 | chew.SetRateLimiter(rate.NewLimiter(rate.Every(chew.config.RateLimit), chew.config.RateBurst)) 437 | } 438 | 439 | ctx := context.Background() 440 | if len(tt.args.ctxs) > 0 { 441 | ctx = tt.args.ctxs[0] 442 | } 443 | 444 | got, err := chew.Process(ctx, tt.args.urls) 445 | 446 | if tt.wantErr { 447 | if err == nil { 448 | t.Errorf("Process() error = nil, wantErr %v", tt.wantErr) 449 | return 450 | } 451 | if tt.expectedErrText != "" && !strings.Contains(err.Error(), tt.expectedErrText) { 452 | t.Errorf("Process() error = %v, expectedErrText 
%v", err, tt.expectedErrText) 453 | return 454 | } 455 | } else { 456 | if err != nil { 457 | t.Errorf("Process() unexpected error: %v", err) 458 | return 459 | } 460 | if !tt.orderIndependent && !reflect.DeepEqual(got, tt.want) { 461 | t.Errorf("Process() = %v, want %v", got, tt.want) 462 | } 463 | if tt.orderIndependent { 464 | if len(got) != len(tt.want) { 465 | t.Errorf("Process() returned %d chunks, want %d", len(got), len(tt.want)) 466 | } 467 | for _, wantChunk := range tt.want { 468 | if !containsChunk(got, wantChunk) { 469 | t.Errorf("Process() did not return chunk %v", wantChunk) 470 | } 471 | } 472 | } 473 | } 474 | }) 475 | } 476 | } 477 | 478 | func Test_getProxy(t *testing.T) { 479 | tests := []struct { 480 | name string 481 | config Config 482 | requests int 483 | wantProxy []*url.URL 484 | }{ 485 | { 486 | name: "no proxies", 487 | config: Config{}, 488 | requests: 1, 489 | wantProxy: []*url.URL{nil}, 490 | }, 491 | { 492 | name: "single proxy", 493 | config: Config{ 494 | ProxyList: []string{"http://proxy1.example.com"}, 495 | }, 496 | requests: 2, 497 | wantProxy: []*url.URL{must(url.Parse("http://proxy1.example.com")), must(url.Parse("http://proxy1.example.com"))}, 498 | }, 499 | { 500 | name: "multiple proxies", 501 | config: Config{ 502 | ProxyList: []string{"http://proxy1.example.com", "http://proxy2.example.com", "http://proxy3.example.com"}, 503 | }, 504 | requests: 5, 505 | wantProxy: []*url.URL{ 506 | must(url.Parse("http://proxy1.example.com")), 507 | must(url.Parse("http://proxy2.example.com")), 508 | must(url.Parse("http://proxy3.example.com")), 509 | must(url.Parse("http://proxy1.example.com")), 510 | must(url.Parse("http://proxy2.example.com")), 511 | }, 512 | }, 513 | } 514 | 515 | for _, tt := range tests { 516 | t.Run(tt.name, func(t *testing.T) { 517 | c := New(tt.config) 518 | for i := 0; i < tt.requests; i++ { 519 | got, err := c.getProxy(&http.Request{}) 520 | if err != nil { 521 | t.Errorf("getProxy() error = %v", err) 522 | 
return 523 | } 524 | if !reflect.DeepEqual(got, tt.wantProxy[i]) { 525 | t.Errorf("getProxy() = %v, want %v", got, tt.wantProxy[i]) 526 | } 527 | } 528 | }) 529 | } 530 | } 531 | 532 | func must(u *url.URL, err error) *url.URL { 533 | if err != nil { 534 | panic(err) 535 | } 536 | return u 537 | } 538 | 539 | func TestRespectCrawlDelay(t *testing.T) { 540 | chew := New(Config{}) 541 | ctx := context.Background() 542 | 543 | tests := []struct { 544 | name string 545 | ctx context.Context 546 | url string 547 | delay time.Duration 548 | wantWait bool 549 | }{ 550 | { 551 | name: "first access", 552 | url: "https://example.com", 553 | delay: time.Second, 554 | wantWait: false, 555 | }, 556 | { 557 | name: "second access", 558 | url: "https://example.com", 559 | delay: time.Second, 560 | wantWait: true, 561 | }, 562 | } 563 | 564 | for _, tt := range tests { 565 | t.Run(tt.name, func(t *testing.T) { 566 | start := time.Now() 567 | err := chew.respectCrawlDelay(ctx, tt.url, tt.delay) 568 | duration := time.Since(start) 569 | 570 | if err != nil { 571 | t.Errorf("respectCrawlDelay() error = %v", err) 572 | return 573 | } 574 | 575 | if tt.wantWait && duration < tt.delay { 576 | t.Errorf("respectCrawlDelay() didn't wait long enough. Duration: %v, Expected: %v", duration, tt.delay) 577 | } 578 | if !tt.wantWait && duration >= tt.delay { 579 | t.Errorf("respectCrawlDelay() waited unnecessarily. 
Duration: %v", duration) 580 | } 581 | }) 582 | } 583 | } 584 | -------------------------------------------------------------------------------- /cmd/chew/wrapper.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | /* 4 | #include <stdlib.h> 5 | */ 6 | import "C" 7 | 8 | import ( 9 | "context" 10 | "fmt" 11 | "strings" 12 | "time" 13 | "unsafe" 14 | 15 | "github.com/mmatongo/chew/v1" 16 | ) 17 | 18 | //export Process 19 | func Process(urls *C.char) *C.char { 20 | urlsSlice := strings.Split(C.GoString(urls), ",") 21 | 22 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 23 | defer cancel() 24 | 25 | c := chew.New(chew.Config{ 26 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)", 27 | RetryLimit: 3, 28 | RetryDelay: time.Second, 29 | CrawlDelay: time.Second, 30 | RateLimit: time.Second, 31 | RateBurst: 1, 32 | IgnoreRobotsTxt: false, 33 | }) 34 | 35 | chunks, err := c.Process(ctx, urlsSlice) 36 | if err != nil { 37 | if err == context.DeadlineExceeded { 38 | return C.CString("Operation timed out") 39 | } 40 | return C.CString(fmt.Sprintf("Error processing URLs: %v", err)) 41 | } 42 | 43 | var result strings.Builder 44 | for _, chunk := range chunks { 45 | result.WriteString(fmt.Sprintf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)) 46 | } 47 | 48 | return C.CString(result.String()) 49 | } 50 | 51 | //export FreeString 52 | func FreeString(ptr *C.char) { 53 | C.free(unsafe.Pointer(ptr)) 54 | } 55 | 56 | func main() {} 57 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 2 3 | round: up 4 | range: "70...100" 5 | 6 | ignore: 7 | - ".idea" 8 | - "docs" 9 | - "cmd" 10 | - "testdata" 11 | - "examples" 12 | - "assets" 13 | -------------------------------------------------------------------------------- /docs/golang.md: 
-------------------------------------------------------------------------------- 1 | Chew is native to Go and can be used as a library in your Go project with ease. Here is a simple example of how to use Chew in your Go project. 2 | 3 | ```go 4 | package main 5 | 6 | import ( 7 | "context" 8 | "fmt" 9 | "log" 10 | "time" 11 | 12 | "github.com/mmatongo/chew/v1" 13 | ) 14 | 15 | func main() { 16 | urls := []string{ 17 | "https://example.com", 18 | } 19 | 20 | c := chew.New(chew.Config{ 21 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)", 22 | RetryLimit: 3, 23 | RetryDelay: time.Second, 24 | CrawlDelay: time.Second, 25 | RateLimit: time.Second, 26 | RateBurst: 1, 27 | IgnoreRobotsTxt: false, 28 | }) 29 | 30 | // The context bounds the whole operation 31 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 32 | defer cancel() 33 | 34 | chunks, err := c.Process(ctx, urls) 35 | if err != nil { 36 | if err == context.DeadlineExceeded { 37 | log.Println("Operation timed out") 38 | } else { 39 | log.Printf("Error processing URLs: %v", err) 40 | } 41 | return 42 | } 43 | 44 | for _, chunk := range chunks { 45 | fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content) 46 | } 47 | } 48 | ``` 49 | 50 | The above code snippet demonstrates how to use Chew in your Go project. `chew.New` creates a client from a `chew.Config`, and the client's `Process` method takes a context and a list of URLs and returns a list of `Chunk` objects. Each `Chunk` object contains the source URL and the content of the URL. The context is the first argument and can be used to set a timeout for the operation. If the operation times out, the function will return a `context.DeadlineExceeded` error. 51 | 52 | Markdown formatting is not enforced in the content of the `Chunk` object. However, the output is always going to be plain text so you can format it as you wish. 53 | -------------------------------------------------------------------------------- /docs/python.md: -------------------------------------------------------------------------------- 1 | To use Chew with Python you need to first build the package to create the shared object file and header file. 
You can do this by running the following command: 2 | 3 | ```bash 4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go 5 | ``` 6 | 7 | This will create a `chew.so` and `chew.h` file in the current directory. You can then use these files in your Python project to use Chew. Here is an example of how to use Chew in your Python project: 8 | 9 | ```python 10 | import ctypes 11 | 12 | chew_lib = ctypes.CDLL('./chew.so') 13 | 14 | chew_lib.Process.argtypes = [ctypes.c_char_p] 15 | chew_lib.Process.restype = ctypes.c_char_p 16 | 17 | url = "https://example.com" 18 | result = chew_lib.Process(url.encode('utf-8')) 19 | 20 | print(result.decode('utf-8')) 21 | ``` 22 | 23 | With the above code snippet, you can now use Chew in your Python project. I can't speak for the limitations of using Chew in Python as I have not extensively tested it myself. 24 | -------------------------------------------------------------------------------- /docs/ruby.md: -------------------------------------------------------------------------------- 1 | To use Chew with Ruby you need to first build the package to create the shared object file and header file. You can do this by running the following command: 2 | 3 | ```bash 4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go 5 | ``` 6 | 7 | This will create a `chew.so` and `chew.h` file in the current directory. You can then use these files in your Ruby project to use Chew. 
Here is an example of how to use Chew in your Ruby project: 8 | 9 | ```ruby 10 | require 'fiddle' 11 | require 'fiddle/import' 12 | 13 | module ChewLib 14 | extend Fiddle::Importer 15 | dlload './chew.so' 16 | 17 | extern 'char* Process(char*)' 18 | end 19 | 20 | urls = ['https://example.com', 'https://example.com'] 21 | for url in urls 22 | result_ptr = ChewLib.Process(url) 23 | result = result_ptr.to_s 24 | Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID).call(result_ptr) 25 | 26 | puts result 27 | end 28 | ``` 29 | 30 | Using Chew like this comes with obvious limitations; however, this is a simple example of how to use Chew in your Ruby project. 31 | -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # Setting up Google Cloud Services for Speech-to-Text 2 | 3 | 1. **Create a Google Cloud Project** 4 | - Go to the [Google Cloud Console](https://console.cloud.google.com/) 5 | - Click on the project dropdown and select "New Project" 6 | - Enter a project name and click "Create" 7 | 8 | 2. **Enable the Cloud Speech-to-Text API** 9 | - In the Google Cloud Console, under "Quick access" go to "APIs & Services" 10 | - Click on "+ ENABLE APIS AND SERVICES" 11 | - Search for "Cloud Speech-to-Text API" and select it 12 | - Click "Enable" 13 | 14 | 3. **Create a Service Account** 15 | - In the Google Cloud Console, go to "IAM & Admin" > "Service Accounts" (or use [this link](https://console.cloud.google.com/iam-admin/serviceaccounts)) 16 | - Click "Create Service Account" 17 | - Enter a name for the service account and click "Create" 18 | - For the role, choose "Project" > "Owner" (or a more restrictive role if preferred) 19 | - Click "Continue" and then "Done" 20 | 21 | 4. 
**Generate a Key for the Service Account** 22 | - In the Service Accounts list, find the account you just created 23 | - Click on the three dots menu (⋮) and select "Manage keys" 24 | - Click "Add Key" > "Create new key" 25 | - Choose "JSON" as the key type and click "Create" 26 | - The key file will be downloaded to your computer 27 | 28 | 5. **Set the GOOGLE_APPLICATION_CREDENTIALS Environment Variable** 29 | - On Linux or macOS: 30 | ``` 31 | export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-key.json" 32 | ``` 33 | - If you want to set the environment variable permanently, you can add it to your shell profile (e.g., `~/.bashrc`, `~/.zshrc`, etc.) 34 | 35 | You can optionally set the environment variable in your code as well: 36 | ```python 37 | import os 38 | 39 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json" 40 | ``` 41 | 42 | ```go 43 | import "os" 44 | 45 | os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", "/path/to/your/service-account-key.json") 46 | ``` 47 | 48 | ```ruby 49 | ENV["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json" 50 | ``` 51 | -------------------------------------------------------------------------------- /examples/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "log" 7 | "time" 8 | 9 | "github.com/mmatongo/chew/v1" 10 | ) 11 | 12 | func main() { 13 | urls := []string{ 14 | "https://example.com", 15 | } 16 | 17 | config := chew.Config{ 18 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)", 19 | RetryLimit: 3, 20 | RetryDelay: 5 * time.Second, 21 | CrawlDelay: 10 * time.Second, 22 | ProxyList: []string{}, // Add your proxies here, or leave empty 23 | RateLimit: 2 * time.Second, 24 | RateBurst: 3, 25 | IgnoreRobotsTxt: false, 26 | } 27 | 28 | haChew := chew.New(config) 29 | 30 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) 
31 | defer cancel() 32 | 33 | chunks, err := haChew.Process(ctx, urls) 34 | if err != nil { 35 | if err == context.DeadlineExceeded { 36 | log.Println("Operation timed out") 37 | } else { 38 | log.Printf("Error processing URLs: %v", err) 39 | } 40 | return 41 | } 42 | 43 | for _, chunk := range chunks { 44 | fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /examples/main.py: -------------------------------------------------------------------------------- 1 | # please see the documentation on how to build chew for use with python 2 | 3 | import ctypes 4 | 5 | chew_lib = ctypes.CDLL('./chew.so') 6 | 7 | chew_lib.Process.argtypes = [ctypes.c_char_p] 8 | chew_lib.Process.restype = ctypes.c_char_p 9 | 10 | urls = "https://example.com" 11 | result = chew_lib.Process(urls.encode('utf-8')) 12 | 13 | print(result.decode('utf-8')) 14 | -------------------------------------------------------------------------------- /examples/main.rb: -------------------------------------------------------------------------------- 1 | # please see the documentation on how to build chew for use with ruby 2 | 3 | require 'fiddle' 4 | require 'fiddle/import' 5 | 6 | module ChewLib 7 | extend Fiddle::Importer 8 | dlload './chew.so' 9 | 10 | extern 'char* Process(char*)' 11 | end 12 | 13 | urls = ['https://example.com', 'https://example.com'] 14 | for url in urls 15 | result_ptr = ChewLib.Process(url) 16 | result = result_ptr.to_s 17 | Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID).call(result_ptr) 18 | 19 | puts result 20 | end 21 | -------------------------------------------------------------------------------- /examples/transcription/google.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "context" 7 | "log" 8 | "os" 9 | "time" 10 | 11 | 
"github.com/mmatongo/chew/v1" 12 | ) 13 | 14 | func main() { 15 | credentialsFile := "chew-go.json" 16 | credentialsJSON, err := os.ReadFile(credentialsFile) 17 | if err != nil { 18 | log.Fatalf("Failed to read credentials file: %v", err) 19 | } 20 | 21 | err = os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", credentialsFile) 22 | if err != nil { 23 | log.Fatalf("Failed to set environment variable: %v", err) 24 | } 25 | 26 | config := chew.TranscribeOptions{ 27 | CredentialsJSON: credentialsJSON, 28 | Bucket: "chew-go", 29 | LanguageCode: "en-US", 30 | } 31 | 32 | log.Println("transcribing files...") 33 | /* 34 | Transcriptions can take a bit of time so ensure that the timeout you set 35 | is enough for the process to finish 36 | 37 | In a test with MLK Jr's speech it took about 3min to complete 38 | 39 | The two audio files used in this example can be obtained from the following links: 40 | - Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav 41 | - MLKDream_64kb.mp3: https://archive.org/details/MLKDream 42 | */ 43 | 44 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 45 | defer cancel() 46 | 47 | filenames := []string{ 48 | "audio/Conference.wav", 49 | "audio/MLKDream_64kb.mp3", 50 | } 51 | 52 | results, err := chew.Transcribe(ctx, filenames, config) 53 | if err != nil { 54 | log.Fatalf("failed to transcribe: %v", err) 55 | } 56 | 57 | for filename, transcript := range results { 58 | log.Printf("Transcript for %s: %s\n", filename, transcript) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/transcription/whisper.go: -------------------------------------------------------------------------------- 1 | //go:build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "context" 7 | "log" 8 | "os" 9 | "time" 10 | 11 | "github.com/mmatongo/chew/v1" 12 | ) 13 | 14 | func main() { 15 | key := os.Getenv("OPENAI_API_KEY") 16 | if key == "" { 17 | log.Fatalf("Please set the 
OPENAI_API_KEY= environment variable") 18 | } 19 | 20 | whisperOpts := chew.TranscribeOptions{ 21 | UseWhisper: true, 22 | WhisperAPIKey: key, 23 | WhisperModel: "whisper-1", 24 | } 25 | 26 | log.Println("transcribing files...") 27 | /* 28 | The whisper model is a bit faster than the google cloud speech-to-text api 29 | so the timeout can be set to a lower value. 30 | 31 | In a test with MLK Jr's speech it took about 32s to complete 32 | 33 | The two audio files used in this example can be obtained from the following links: 34 | - Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav 35 | - MLKDream_64kb.mp3: https://archive.org/details/MLKDream 36 | */ 37 | 38 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) 39 | defer cancel() 40 | 41 | audioFiles := []string{ 42 | "audio/Conference.wav", 43 | "audio/MLKDream_64kb.mp3", 44 | } 45 | 46 | results, err := chew.Transcribe(ctx, audioFiles, whisperOpts) 47 | 48 | if err != nil { 49 | log.Fatalf("Error transcribing with OpenAI Whisper: %v", err) 50 | } 51 | 52 | for filename, transcript := range results { 53 | log.Printf("Transcript for %s: %s\n", filename, transcript) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/mmatongo/chew/v1 2 | 3 | go 1.23 4 | 5 | require ( 6 | cloud.google.com/go/speech v1.23.1 7 | github.com/PuerkitoBio/goquery v1.9.2 8 | github.com/amanitaverna/go-mp3 v0.4.0 9 | github.com/go-audio/wav v1.1.0 10 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 11 | github.com/mewkiz/flac v1.0.11 12 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115 13 | github.com/temoto/robotstxt v1.1.2 14 | golang.org/x/time v0.6.0 15 | google.golang.org/api v0.187.0 16 | ) 17 | 18 | require ( 19 | cloud.google.com/go v0.115.0 // indirect 20 | cloud.google.com/go/auth v0.6.1 // indirect 
21 | cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect 22 | cloud.google.com/go/compute/metadata v0.3.0 // indirect 23 | cloud.google.com/go/iam v1.1.8 // indirect 24 | cloud.google.com/go/longrunning v0.5.7 // indirect 25 | github.com/felixge/httpsnoop v1.0.4 // indirect 26 | github.com/go-audio/audio v1.0.0 // indirect 27 | github.com/go-audio/riff v1.0.0 // indirect 28 | github.com/go-logr/logr v1.4.1 // indirect 29 | github.com/go-logr/stdr v1.2.2 // indirect 30 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 31 | github.com/golang/protobuf v1.5.4 // indirect 32 | github.com/google/s2a-go v0.1.7 // indirect 33 | github.com/google/uuid v1.6.0 // indirect 34 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect 35 | github.com/googleapis/gax-go/v2 v2.12.5 // indirect 36 | github.com/icza/bitio v1.1.0 // indirect 37 | github.com/kr/pretty v0.1.0 // indirect 38 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 // indirect 39 | go.opencensus.io v0.24.0 // indirect 40 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect 41 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect 42 | go.opentelemetry.io/otel v1.24.0 // indirect 43 | go.opentelemetry.io/otel/metric v1.24.0 // indirect 44 | go.opentelemetry.io/otel/trace v1.24.0 // indirect 45 | golang.org/x/crypto v0.25.0 // indirect 46 | golang.org/x/oauth2 v0.21.0 // indirect 47 | golang.org/x/sync v0.7.0 // indirect 48 | golang.org/x/sys v0.22.0 // indirect 49 | golang.org/x/text v0.16.0 // indirect 50 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect 51 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect 52 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect 53 | google.golang.org/grpc v1.64.0 // indirect 54 | google.golang.org/protobuf v1.34.2 // indirect 55 | gopkg.in/check.v1 
v1.0.0-20180628173108-788fd7840127 // indirect 56 | ) 57 | 58 | require ( 59 | cloud.google.com/go/storage v1.43.0 60 | github.com/andybalholm/cascadia v1.3.2 // indirect 61 | golang.org/x/net v0.27.0 // indirect 62 | gopkg.in/yaml.v3 v3.0.1 63 | ) 64 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= 2 | cloud.google.com/go v0.115.0 h1:CnFSK6Xo3lDYRoBKEcAtia6VSC837/ZkJuRduSFnr14= 3 | cloud.google.com/go v0.115.0/go.mod h1:8jIM5vVgoAEoiVxQ/O4BFTfHqulPZgs/ufEzMcFMdWU= 4 | cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38= 5 | cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4= 6 | cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= 7 | cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= 8 | cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc= 9 | cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= 10 | cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= 11 | cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= 12 | cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU= 13 | cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng= 14 | cloud.google.com/go/speech v1.23.1 h1:TcWEAOLQH1Lb2fhHS6/GjvAh+ue0dt4xUDHXHG6vF04= 15 | cloud.google.com/go/speech v1.23.1/go.mod h1:UNgzNxhNBuo/OxpF1rMhA/U2rdai7ILL6PBXFs70wq0= 16 | cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs= 17 | cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0= 18 | 
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 19 | github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= 20 | github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= 21 | github.com/amanitaverna/go-mp3 v0.4.0 h1:ZZ5maCStIh7+M9NZSk58Eww23q0B4IuJtQW+Y6u4kkw= 22 | github.com/amanitaverna/go-mp3 v0.4.0/go.mod h1:b9idBPNUTSU/5D+GATwLkJx5xqDYTEeRg7/O7K7gZF0= 23 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 24 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 25 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 26 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= 27 | github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= 28 | github.com/d4l3k/messagediff v1.2.2-0.20190829033028-7e0a312ae40b/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkEQxENCrlLo= 29 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 30 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 31 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 32 | github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 33 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= 34 | github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= 35 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= 36 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= 37 | github.com/felixge/httpsnoop v1.0.4/go.mod 
h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= 38 | github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= 39 | github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= 40 | github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= 41 | github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= 42 | github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= 43 | github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= 44 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= 45 | github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= 46 | github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= 47 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= 48 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= 49 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 50 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 51 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= 52 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= 53 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= 54 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 55 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= 56 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= 57 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod 
h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= 58 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= 59 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= 60 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= 61 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= 62 | github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= 63 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= 64 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= 65 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= 66 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 67 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= 68 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 69 | github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 70 | github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= 71 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= 72 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= 73 | github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc= 74 | github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0= 75 | github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o= 76 | github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw= 77 | github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 78 | github.com/google/uuid v1.6.0 
h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 79 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= 80 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs= 81 | github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0= 82 | github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA= 83 | github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E= 84 | github.com/icza/bitio v1.1.0 h1:ysX4vtldjdi3Ygai5m1cWy4oLkhWTAi+SyO6HC8L9T0= 85 | github.com/icza/bitio v1.1.0/go.mod h1:0jGnlLAx8MKMr9VGnn/4YrvZiprkvBelsVIbA9Jjr9A= 86 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6 h1:8UsGZ2rr2ksmEru6lToqnXgA8Mz1DP11X4zSJ159C3k= 87 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6/go.mod h1:xQig96I1VNBDIWGCdTt54nHt6EeI639SmHycLYL7FkA= 88 | github.com/jszwec/csvutil v1.5.1/go.mod h1:Rpu7Uu9giO9subDyMCIQfHVDuLrcaC36UA4YcJjGBkg= 89 | github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= 90 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= 91 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= 92 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= 93 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= 94 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU= 95 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= 96 | github.com/mewkiz/flac v1.0.11 h1:2KFoMH/P72qhZ/E4bI7ZuK79lCPE1zZM3/6WnrMOTH4= 97 | github.com/mewkiz/flac v1.0.11/go.mod h1:1UeXlFRJp4ft2mfZnPLRpQTd7cSjb/s17o7JQzzyrCA= 98 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 h1:tnAPMExbRERsyEYkmR1YjhTgDM0iqyiBYf8ojRXxdbA= 99 | 
github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14/go.mod h1:QYCFBiH5q6XTHEbWhR0uhR3M9qNPoD2CSQzr0g75kE4= 100 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 101 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 102 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 103 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 104 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 105 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= 106 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= 107 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 108 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= 109 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= 110 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= 111 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= 112 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 113 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115 h1:OEAIMYp5l9kJ2kT9UPL5QSUriKIIDhnLmpJTy69sltA= 114 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115/go.mod h1:AIVbkIe1G7fpFHiKOdxZnU5p9tFPYNTQyH3H5IrRkGw= 115 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= 116 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= 117 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 118 | go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= 119 | 
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= 120 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg= 121 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0= 122 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk= 123 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw= 124 | go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= 125 | go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= 126 | go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI= 127 | go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco= 128 | go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw= 129 | go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg= 130 | go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= 131 | go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= 132 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 133 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= 134 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 135 | golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= 136 | golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= 137 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod 
h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= 138 | golang.org/x/image v0.5.0/go.mod h1:FVC7BI/5Ym8R25iw5OLsgshdUBbT1h5jZTpA+mvAdZ4= 139 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= 140 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= 141 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= 142 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 143 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 144 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 145 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 146 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 147 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 148 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 149 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 150 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= 151 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 152 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 153 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 154 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 155 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 156 | golang.org/x/net v0.27.0 
h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= 157 | golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= 158 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 159 | golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= 160 | golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= 161 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 162 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 163 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 164 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 165 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 166 | golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= 167 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 168 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 169 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 170 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 171 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 172 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 173 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 174 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 175 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 176 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 177 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 178 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= 179 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 180 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 181 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 182 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 183 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 184 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 185 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 186 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 187 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 188 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 189 | golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= 190 | golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= 191 | golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= 192 | golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= 193 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 194 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 195 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= 196 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod 
h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= 197 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= 198 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 199 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 200 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 201 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 202 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 203 | google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo= 204 | google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk= 205 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= 206 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 207 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= 208 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= 209 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= 210 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls= 211 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M= 212 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc= 213 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c= 214 | google.golang.org/genproto/googleapis/rpc 
v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk= 215 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY= 216 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= 217 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= 218 | google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= 219 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= 220 | google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= 221 | google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= 222 | google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= 223 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= 224 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= 225 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= 226 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= 227 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= 228 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 229 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 230 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= 231 | google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= 232 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= 233 | google.golang.org/protobuf v1.34.2/go.mod 
h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= 234 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 235 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= 236 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 237 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 238 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= 239 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= 240 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 241 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= 242 | -------------------------------------------------------------------------------- /internal/audio/flac.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/mewkiz/flac" 7 | ) 8 | 9 | type flacProcessor struct{} 10 | 11 | func (p *flacProcessor) process(filename string) (*audioInfo, error) { 12 | file, err := flac.Open(filename) 13 | if err != nil { 14 | return nil, fmt.Errorf("failed to open FLAC file: %w", err) 15 | } 16 | defer func(file *flac.Stream) { 17 | err := file.Close() 18 | if err != nil { 19 | fmt.Printf("failed to close FLAC file: %v\n", err) 20 | } 21 | }(file) 22 | 23 | return &audioInfo{ 24 | sampleRate: int(file.Info.SampleRate), 25 | numChannels: int(file.Info.NChannels), 26 | format: "FLAC", 27 | }, nil 28 | } 29 | -------------------------------------------------------------------------------- /internal/audio/flac_test.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 
8 | func Test_flacProcessor_process(t *testing.T) { 9 | type args struct { 10 | filename string 11 | } 12 | tests := []struct { 13 | name string 14 | p *flacProcessor 15 | args args 16 | want *audioInfo 17 | wantErr bool 18 | }{ 19 | { 20 | name: "success", 21 | p: &flacProcessor{}, 22 | args: args{ 23 | filename: getRootPath(t) + "/testdata/audio/test.flac", 24 | }, 25 | want: &audioInfo{ 26 | sampleRate: 96000, 27 | numChannels: 2, 28 | format: "FLAC", 29 | }, 30 | wantErr: false, 31 | }, 32 | { 33 | name: "file not found", 34 | p: &flacProcessor{}, 35 | args: args{ 36 | filename: getRootPath(t) + "/testdata/audio/test_new.flac", 37 | }, 38 | want: nil, 39 | wantErr: true, 40 | }, 41 | } 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | p := &flacProcessor{} 45 | got, err := p.process(tt.args.filename) 46 | if (err != nil) != tt.wantErr { 47 | t.Errorf("flacProcessor.process() error = %v, wantErr %v", err, tt.wantErr) 48 | return 49 | } 50 | if !reflect.DeepEqual(got, tt.want) { 51 | t.Errorf("flacProcessor.process() = %v, want %v", got, tt.want) 52 | } 53 | }) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /internal/audio/mp3.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "github.com/amanitaverna/go-mp3" 8 | ) 9 | 10 | type mp3Processor struct{} 11 | 12 | func (p *mp3Processor) process(filename string) (*audioInfo, error) { 13 | file, err := os.Open(filename) 14 | if err != nil { 15 | return nil, fmt.Errorf("failed to open MP3 file: %w", err) 16 | } 17 | 18 | defer func(file *os.File) { 19 | err := file.Close() 20 | if err != nil { 21 | fmt.Printf("failed to close MP3 file: %v\n", err) 22 | } 23 | }(file) 24 | 25 | decoder, err := mp3.NewDecoder(file) 26 | if err != nil { 27 | return nil, fmt.Errorf("failed to create MP3 decoder: %w", err) 28 | } 29 | 30 | return &audioInfo{ 31 | 
sampleRate: decoder.SampleRate(), 32 | /* 33 | This is a terrible assumption but seeing as the MP3 decoder 34 | doesn't expose this information, we'll have to live with it for now. 35 | */ 36 | numChannels: 2, 37 | format: "MP3", 38 | }, nil 39 | } 40 | -------------------------------------------------------------------------------- /internal/audio/mp3_test.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func Test_mp3Processor_process(t *testing.T) { 9 | type args struct { 10 | filename string 11 | } 12 | tests := []struct { 13 | name string 14 | p *mp3Processor 15 | args args 16 | want *audioInfo 17 | wantErr bool 18 | }{ 19 | { 20 | name: "success", 21 | p: &mp3Processor{}, 22 | args: args{ 23 | filename: getRootPath(t) + "/testdata/audio/test.mp3", 24 | }, 25 | want: &audioInfo{ 26 | sampleRate: 44100, 27 | numChannels: 2, 28 | format: "MP3", 29 | }, 30 | wantErr: false, 31 | }, 32 | { 33 | name: "file not found", 34 | p: &mp3Processor{}, 35 | args: args{ 36 | filename: getRootPath(t) + "/testdata/audio/test_new.mp3", 37 | }, 38 | want: nil, 39 | wantErr: true, 40 | }, 41 | { 42 | name: "invalid MP3 file", 43 | p: &mp3Processor{}, 44 | args: args{ 45 | filename: getRootPath(t) + "/testdata/audio/test.flac", 46 | }, 47 | want: nil, 48 | wantErr: true, 49 | }, 50 | } 51 | for _, tt := range tests { 52 | t.Run(tt.name, func(t *testing.T) { 53 | p := &mp3Processor{} 54 | got, err := p.process(tt.args.filename) 55 | if (err != nil) != tt.wantErr { 56 | t.Errorf("mp3Processor.process() error = %v, wantErr %v", err, tt.wantErr) 57 | return 58 | } 59 | if !reflect.DeepEqual(got, tt.want) { 60 | t.Errorf("mp3Processor.process() = %v, want %v", got, tt.want) 61 | } 62 | }) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /internal/audio/processor.go: 
-------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "fmt" 5 | "path/filepath" 6 | "strings" 7 | 8 | "cloud.google.com/go/speech/apiv1/speechpb" 9 | ) 10 | 11 | var ( 12 | defaultFactory = &defaultAudioProcessorFactory{} 13 | retriever = newAudioInfoRetriever(defaultFactory) 14 | ) 15 | 16 | var encodingMap = map[string]speechpb.RecognitionConfig_AudioEncoding{ 17 | "WAV": speechpb.RecognitionConfig_LINEAR16, 18 | "MP3": speechpb.RecognitionConfig_MP3, 19 | "FLAC": speechpb.RecognitionConfig_FLAC, 20 | } 21 | 22 | func (f *defaultAudioProcessorFactory) createProcessor(ext string) (audioProcessor, error) { 23 | switch strings.ToLower(ext) { 24 | case ".mp3": 25 | return &mp3Processor{}, nil 26 | case ".flac": 27 | return &flacProcessor{}, nil 28 | case ".wav": 29 | return &wavProcessor{}, nil 30 | default: 31 | return nil, fmt.Errorf("unsupported file format: %s", ext) 32 | } 33 | } 34 | 35 | func newAudioInfoRetriever(factory audioProcessorFactory) *audioInfoRetriever { 36 | return &audioInfoRetriever{ 37 | factory: factory, 38 | } 39 | } 40 | 41 | func GetAudioInfo(filename string) (*speechpb.RecognitionConfig, error) { 42 | info, err := retriever.audioInfo(filename) 43 | if err != nil { 44 | return nil, err 45 | } 46 | 47 | return &speechpb.RecognitionConfig{ 48 | Encoding: getEncoding(info.format), 49 | SampleRateHertz: int32(info.sampleRate), 50 | AudioChannelCount: int32(info.numChannels), 51 | }, nil 52 | } 53 | 54 | func (r *audioInfoRetriever) audioInfo(filename string) (*audioInfo, error) { 55 | ext := filepath.Ext(filename) 56 | processor, err := r.factory.createProcessor(ext) 57 | if err != nil { 58 | return nil, err 59 | } 60 | return processor.process(filename) 61 | } 62 | 63 | func getEncoding(format string) speechpb.RecognitionConfig_AudioEncoding { 64 | if encoding, ok := encodingMap[format]; ok { 65 | return encoding 66 | } 67 | return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED 
68 | } 69 | -------------------------------------------------------------------------------- /internal/audio/processor_test.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "errors" 5 | "os" 6 | "path/filepath" 7 | "reflect" 8 | "testing" 9 | 10 | "cloud.google.com/go/speech/apiv1/speechpb" 11 | ) 12 | 13 | type mockProcessor struct { 14 | info *audioInfo 15 | err error 16 | } 17 | 18 | func (m *mockProcessor) process(string) (*audioInfo, error) { 19 | return m.info, m.err 20 | } 21 | 22 | type mockFactory struct { 23 | processor audioProcessor 24 | err error 25 | } 26 | 27 | func (m *mockFactory) createProcessor(string) (audioProcessor, error) { 28 | return m.processor, m.err 29 | } 30 | 31 | func getRootPath(t *testing.T) string { 32 | t.Helper() 33 | pwd, err := os.Getwd() 34 | if err != nil { 35 | t.Fatalf("getting current folder: %s", err) 36 | } 37 | pwd = filepath.Dir(filepath.Dir(pwd)) 38 | return pwd 39 | } 40 | 41 | func Test_getEncoding(t *testing.T) { 42 | tests := []struct { 43 | name string 44 | format string 45 | want speechpb.RecognitionConfig_AudioEncoding 46 | }{ 47 | { 48 | name: "WAV format", 49 | format: "WAV", 50 | want: speechpb.RecognitionConfig_LINEAR16, 51 | }, 52 | { 53 | name: "MP3 format", 54 | format: "MP3", 55 | want: speechpb.RecognitionConfig_MP3, 56 | }, 57 | { 58 | name: "FLAC format", 59 | format: "FLAC", 60 | want: speechpb.RecognitionConfig_FLAC, 61 | }, 62 | { 63 | name: "Unsupported format", 64 | format: "AAC", 65 | want: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED, 66 | }, 67 | } 68 | 69 | for _, tt := range tests { 70 | t.Run(tt.name, func(t *testing.T) { 71 | if got := getEncoding(tt.format); !reflect.DeepEqual(got, tt.want) { 72 | t.Errorf("getEncoding() = %v, want %v", got, tt.want) 73 | } 74 | }) 75 | } 76 | } 77 | 78 | func Test_audioInfoRetriever_audioInfo(t *testing.T) { 79 | tests := []struct { 80 | name string 81 | factory 
audioProcessorFactory 82 | filename string 83 | want *audioInfo 84 | wantErr bool 85 | errMsg string 86 | }{ 87 | { 88 | name: "MP3 file - successful processing", 89 | factory: &mockFactory{ 90 | processor: &mockProcessor{ 91 | info: &audioInfo{ 92 | sampleRate: 44100, 93 | numChannels: 2, 94 | format: "MP3", 95 | }, 96 | err: nil, 97 | }, 98 | err: nil, 99 | }, 100 | filename: "test.mp3", 101 | want: &audioInfo{ 102 | sampleRate: 44100, 103 | numChannels: 2, 104 | format: "MP3", 105 | }, 106 | wantErr: false, 107 | }, 108 | { 109 | name: "FLAC file - successful processing", 110 | factory: &mockFactory{ 111 | processor: &mockProcessor{ 112 | info: &audioInfo{ 113 | sampleRate: 96000, 114 | numChannels: 2, 115 | format: "FLAC", 116 | }, 117 | err: nil, 118 | }, 119 | err: nil, 120 | }, 121 | filename: "test.flac", 122 | want: &audioInfo{ 123 | sampleRate: 96000, 124 | numChannels: 2, 125 | format: "FLAC", 126 | }, 127 | wantErr: false, 128 | }, 129 | { 130 | name: "WAV file - processing error", 131 | factory: &mockFactory{ 132 | processor: &mockProcessor{ 133 | info: nil, 134 | err: errors.New("failed to process WAV file"), 135 | }, 136 | err: nil, 137 | }, 138 | filename: "test.wav", 139 | want: nil, 140 | wantErr: true, 141 | errMsg: "failed to process WAV file", 142 | }, 143 | { 144 | name: "Unsupported file format", 145 | factory: &mockFactory{ 146 | processor: nil, 147 | err: errors.New("unsupported file format: .aac"), 148 | }, 149 | filename: "test.aac", 150 | want: nil, 151 | wantErr: true, 152 | errMsg: "unsupported file format: .aac", 153 | }, 154 | } 155 | 156 | for _, tt := range tests { 157 | t.Run(tt.name, func(t *testing.T) { 158 | r := newAudioInfoRetriever(tt.factory) 159 | got, err := r.audioInfo(tt.filename) 160 | if (err != nil) != tt.wantErr { 161 | t.Errorf("audioInfoRetriever.audioInfo() error = %v, wantErr %v", err, tt.wantErr) 162 | return 163 | } 164 | if err != nil && err.Error() != tt.errMsg { 165 | 
t.Errorf("audioInfoRetriever.audioInfo() error = %v, expected error %v", err, tt.errMsg) 166 | } 167 | if !reflect.DeepEqual(got, tt.want) { 168 | t.Errorf("audioInfoRetriever.audioInfo() = %v, want %v", got, tt.want) 169 | } 170 | }) 171 | } 172 | } 173 | 174 | func Test_newAudioInfoRetriever(t *testing.T) { 175 | factory := &defaultAudioProcessorFactory{} 176 | tests := []struct { 177 | name string 178 | want *audioInfoRetriever 179 | }{ 180 | { 181 | name: "Create new audioInfoRetriever", 182 | want: &audioInfoRetriever{ 183 | factory: factory, 184 | }, 185 | }, 186 | } 187 | for _, tt := range tests { 188 | t.Run(tt.name, func(t *testing.T) { 189 | if got := newAudioInfoRetriever(factory); !reflect.DeepEqual(got, tt.want) { 190 | t.Errorf("newAudioInfoRetriever() = %v, want %v", got, tt.want) 191 | } 192 | }) 193 | } 194 | } 195 | 196 | func Test_getAudioInfo(t *testing.T) { 197 | type args struct { 198 | filename string 199 | } 200 | tests := []struct { 201 | name string 202 | args args 203 | want *speechpb.RecognitionConfig 204 | wantErr bool 205 | }{ 206 | { 207 | name: "MP3 file", 208 | args: args{ 209 | filename: getRootPath(t) + "/testdata/audio/test.mp3", 210 | }, 211 | want: &speechpb.RecognitionConfig{ 212 | Encoding: speechpb.RecognitionConfig_MP3, 213 | SampleRateHertz: 44100, 214 | AudioChannelCount: 2, 215 | }, 216 | wantErr: false, 217 | }, 218 | { 219 | name: "FLAC file", 220 | args: args{ 221 | filename: getRootPath(t) + "/testdata/audio/test.flac", 222 | }, 223 | want: &speechpb.RecognitionConfig{ 224 | Encoding: speechpb.RecognitionConfig_FLAC, 225 | SampleRateHertz: 96000, 226 | AudioChannelCount: 2, 227 | }, 228 | wantErr: false, 229 | }, 230 | { 231 | name: "WAV file", 232 | args: args{ 233 | filename: getRootPath(t) + "/testdata/audio/test.wav", 234 | }, 235 | want: &speechpb.RecognitionConfig{ 236 | Encoding: speechpb.RecognitionConfig_LINEAR16, 237 | SampleRateHertz: 44100, 238 | AudioChannelCount: 2, 239 | }, 240 | wantErr: false, 241 | 
}, 242 | { 243 | name: "Unsupported file format", 244 | args: args{ 245 | filename: getRootPath(t) + "/testdata/audio/test.ogg", 246 | }, 247 | want: nil, 248 | wantErr: true, 249 | }, 250 | } 251 | for _, tt := range tests { 252 | t.Run(tt.name, func(t *testing.T) { 253 | got, err := GetAudioInfo(tt.args.filename) 254 | if (err != nil) != tt.wantErr { 255 | t.Errorf("getAudioInfo() error = %v, wantErr %v", err, tt.wantErr) 256 | return 257 | } 258 | if !reflect.DeepEqual(got, tt.want) { 259 | t.Errorf("getAudioInfo() = %v, want %v", got, tt.want) 260 | } 261 | }) 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /internal/audio/types.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | type audioInfo struct { 4 | sampleRate int 5 | numChannels int 6 | format string 7 | } 8 | 9 | type audioProcessor interface { 10 | process(filename string) (*audioInfo, error) 11 | } 12 | 13 | type audioProcessorFactory interface { 14 | createProcessor(fileExtension string) (audioProcessor, error) 15 | } 16 | 17 | type defaultAudioProcessorFactory struct{} 18 | 19 | type audioInfoRetriever struct { 20 | factory audioProcessorFactory 21 | } 22 | -------------------------------------------------------------------------------- /internal/audio/wav.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/go-audio/wav" 9 | ) 10 | 11 | type wavProcessor struct{} 12 | 13 | func (p *wavProcessor) process(filename string) (*audioInfo, error) { 14 | file, err := os.Open(filename) 15 | if err != nil { 16 | return nil, fmt.Errorf("failed to open WAV file: %w", err) 17 | } 18 | defer func(file *os.File) { 19 | err := file.Close() 20 | if err != nil { 21 | fmt.Printf("failed to close WAV file: %v\n", err) 22 | } 23 | }(file) 24 | 25 | decoder := wav.NewDecoder(file) 26 | if 
!decoder.IsValidFile() { 27 | return nil, errors.New("invalid WAV file") 28 | } 29 | 30 | return &audioInfo{ 31 | sampleRate: int(decoder.SampleRate), 32 | numChannels: int(decoder.NumChans), 33 | format: "WAV", 34 | }, nil 35 | } 36 | -------------------------------------------------------------------------------- /internal/audio/wav_test.go: -------------------------------------------------------------------------------- 1 | package audio 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | ) 7 | 8 | func Test_wavProcessor_process(t *testing.T) { 9 | type args struct { 10 | filename string 11 | } 12 | tests := []struct { 13 | name string 14 | p *wavProcessor 15 | args args 16 | want *audioInfo 17 | wantErr bool 18 | }{ 19 | { 20 | name: "success", 21 | p: &wavProcessor{}, 22 | args: args{ 23 | filename: getRootPath(t) + "/testdata/audio/test.wav", 24 | }, 25 | want: &audioInfo{ 26 | sampleRate: 44100, 27 | numChannels: 2, 28 | format: "WAV", 29 | }, 30 | }, 31 | { 32 | name: "file not found", 33 | p: &wavProcessor{}, 34 | args: args{ 35 | filename: getRootPath(t) + "/testdata/audio/test_new.wav", 36 | }, 37 | want: nil, 38 | wantErr: true, 39 | }, 40 | { 41 | name: "invalid WAV file", 42 | p: &wavProcessor{}, 43 | args: args{ 44 | filename: getRootPath(t) + "/testdata/audio/test.flac", 45 | }, 46 | want: nil, 47 | wantErr: true, 48 | }, 49 | } 50 | for _, tt := range tests { 51 | t.Run(tt.name, func(t *testing.T) { 52 | p := &wavProcessor{} 53 | got, err := p.process(tt.args.filename) 54 | if (err != nil) != tt.wantErr { 55 | t.Errorf("wavProcessor.process() error = %v, wantErr %v", err, tt.wantErr) 56 | return 57 | } 58 | if !reflect.DeepEqual(got, tt.want) { 59 | t.Errorf("wavProcessor.process() = %v, want %v", got, tt.want) 60 | } 61 | }) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /internal/common/types.go: -------------------------------------------------------------------------------- 1 | package common 2 
| 3 | import ( 4 | "time" 5 | ) 6 | 7 | type TranscribeOptions struct { 8 | CredentialsJSON []byte 9 | Bucket string 10 | LanguageCode string 11 | EnableDiarization bool 12 | MinSpeakers int 13 | MaxSpeakers int 14 | CleanupOnComplete bool 15 | UseWhisper bool 16 | WhisperAPIKey string 17 | WhisperModel string 18 | WhisperPrompt string 19 | } 20 | 21 | type Config struct { 22 | UserAgent string 23 | RetryLimit int 24 | RetryDelay time.Duration 25 | CrawlDelay time.Duration 26 | ProxyList []string 27 | RateLimit time.Duration 28 | RateBurst int 29 | IgnoreRobotsTxt bool 30 | } 31 | 32 | type Chunk struct { 33 | Content string 34 | Source string 35 | } 36 | -------------------------------------------------------------------------------- /internal/document/docx.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "io" 7 | "strings" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | "github.com/mmatongo/chew/v1/internal/utils" 11 | ) 12 | 13 | func processDocxContent(r io.Reader) ([]string, error) { 14 | data, err := io.ReadAll(r) 15 | if err != nil { 16 | return nil, err 17 | } 18 | 19 | zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data))) 20 | if err != nil { 21 | return nil, err 22 | } 23 | 24 | var contents []string 25 | 26 | for _, file := range zipReader.File { 27 | if file.Name == "word/document.xml" { 28 | contents, err = utils.ExtractTextFromXML(file) 29 | if err != nil { 30 | return nil, err 31 | } 32 | break 33 | } 34 | } 35 | 36 | var allContent strings.Builder 37 | for _, content := range contents { 38 | allContent.WriteString(content) 39 | allContent.WriteString(" ") 40 | } 41 | 42 | return []string{allContent.String()}, nil 43 | 44 | /* 45 | // In the event we just want chunks we can just return contents 46 | return contents, nil 47 | */ 48 | } 49 | 50 | func ProcessDocx(r io.Reader, url string) ([]common.Chunk, error) { 51 
// createDocxWithContent builds an in-memory zip archive whose only entry,
// "word/document.xml", holds content, and returns a reader over the archive.
//
// It panics if the archive cannot be written, so that a broken fixture stops
// the test run at its source instead of surfacing later as an unrelated
// parsing failure.
func createDocxWithContent(content string) io.Reader {
	buf := new(bytes.Buffer)
	w := zip.NewWriter(buf)

	f, err := w.Create("word/document.xml")
	if err != nil {
		panic("createDocxWithContent: creating word/document.xml: " + err.Error())
	}
	if _, err := io.WriteString(f, content); err != nil {
		panic("createDocxWithContent: writing word/document.xml: " + err.Error())
	}
	// Close finishes the archive by writing the central directory; the bytes
	// in buf are not a readable zip file until it has succeeded.
	if err := w.Close(); err != nil {
		panic("createDocxWithContent: closing archive: " + err.Error())
	}

	return bytes.NewReader(buf.Bytes())
}

` + content + `

`) 37 | } 38 | 39 | func TestProcessDocx(t *testing.T) { 40 | type args struct { 41 | r io.Reader 42 | url string 43 | } 44 | tests := []struct { 45 | name string 46 | args args 47 | want []common.Chunk 48 | wantErr bool 49 | }{ 50 | { 51 | name: "Empty docx file", 52 | args: args{ 53 | r: createEmptyDocx(), 54 | url: "http://example.com", 55 | }, 56 | want: nil, 57 | wantErr: false, 58 | }, 59 | { 60 | name: "Single paragraph docx file", 61 | args: args{ 62 | r: createSingleParagraphDocx("Hello from chew!"), 63 | url: "http://example.com", 64 | }, 65 | want: []common.Chunk{ 66 | { 67 | Content: "Hello from chew! ", 68 | Source: "http://example.com", 69 | }, 70 | }, 71 | wantErr: false, 72 | }, 73 | } 74 | for _, tt := range tests { 75 | t.Run(tt.name, func(t *testing.T) { 76 | got, err := ProcessDocx(tt.args.r, tt.args.url) 77 | if (err != nil) != tt.wantErr { 78 | t.Errorf("ProcessDocx() error = %v, wantErr %v", err, tt.wantErr) 79 | return 80 | } 81 | if !reflect.DeepEqual(got, tt.want) { 82 | t.Errorf("ProcessDocx() = %v, want %v", got, tt.want) 83 | } 84 | }) 85 | } 86 | } 87 | 88 | func TestProcessDocx_Error_ReadAll(t *testing.T) { 89 | _, err := processPptxContent(&errorReader{}) 90 | if err == nil { 91 | t.Error("ProcessDocx() did not return an error, but one was expected") 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /internal/document/epub.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strings" 8 | 9 | "github.com/PuerkitoBio/goquery" 10 | "github.com/mmatongo/chew/v1/internal/common" 11 | "github.com/taylorskalyo/goreader/epub" 12 | ) 13 | 14 | func processEpubContent(r io.Reader) ([]common.Chunk, error) { 15 | content, err := io.ReadAll(r) 16 | if err != nil { 17 | return nil, fmt.Errorf("failed to read EPUB content: %w", err) 18 | } 19 | 20 | reader, err := 
epub.NewReader(bytes.NewReader(content), int64(len(content))) 21 | if err != nil { 22 | return nil, fmt.Errorf("failed to create EPUB reader: %w", err) 23 | } 24 | 25 | if len(reader.Rootfiles) == 0 { 26 | return nil, fmt.Errorf("EPUB contains no content") 27 | } 28 | 29 | contents := reader.Rootfiles[0] 30 | var chunks []common.Chunk 31 | 32 | for _, item := range contents.Manifest.Items { 33 | if !strings.HasSuffix(item.HREF, ".xhtml") && !strings.HasSuffix(item.HREF, ".html") { 34 | continue 35 | } 36 | 37 | file, err := item.Open() 38 | if err != nil { 39 | return nil, fmt.Errorf("failed to open item %s: %w", item.HREF, err) 40 | } 41 | 42 | text, err := extractTextFromHTML(file) 43 | file.Close() 44 | if err != nil { 45 | return nil, fmt.Errorf("failed to extract text from %s: %w", item.HREF, err) 46 | } 47 | 48 | text = strings.TrimSpace(text) 49 | if text == "" { 50 | continue 51 | } 52 | chunks = append(chunks, common.Chunk{Content: text, Source: item.HREF}) 53 | } 54 | 55 | return chunks, nil 56 | } 57 | 58 | func ProcessEpub(r io.Reader, url string) ([]common.Chunk, error) { 59 | chunks, err := processEpubContent(r) 60 | if err != nil { 61 | return nil, err 62 | } 63 | 64 | for i := range chunks { 65 | chunks[i].Source = url 66 | } 67 | 68 | return chunks, nil 69 | } 70 | 71 | func extractTextFromHTML(r io.Reader) (string, error) { 72 | doc, err := goquery.NewDocumentFromReader(r) 73 | if err != nil { 74 | return "", err 75 | } 76 | 77 | doc.Find("script, style,nav, header, footer").Remove() 78 | 79 | var buf strings.Builder 80 | /* 81 | We're only interested in the text content of the HTML document 82 | however this is a very naive approach and might not work well 83 | for all HTML documents unfortunately. 84 | This is a known issue and I'm working on a better solution. 
85 | see: https://github.com/mmatongo/chew/issues/22 86 | 87 | TODO: Allow users to specify a CSS selector to extract text from 88 | */ 89 | doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) { 90 | buf.WriteString(strings.TrimSpace(s.Text())) 91 | buf.WriteString("\n\n") 92 | }) 93 | 94 | return strings.TrimSpace(buf.String()), nil 95 | } 96 | -------------------------------------------------------------------------------- /internal/document/epub_test.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "io" 5 | "os" 6 | "path/filepath" 7 | "reflect" 8 | "strings" 9 | "testing" 10 | 11 | "github.com/mmatongo/chew/v1/internal/common" 12 | "github.com/mmatongo/chew/v1/internal/utils" 13 | ) 14 | 15 | func Test_processEpubContent(t *testing.T) { 16 | type args struct { 17 | r io.Reader 18 | } 19 | tests := []struct { 20 | name string 21 | args args 22 | want []common.Chunk 23 | wantErr bool 24 | }{ 25 | { 26 | name: "success", 27 | args: args{ 28 | r: func() io.Reader { 29 | f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub")) 30 | return f 31 | }(), 32 | }, 33 | want: []common.Chunk{ 34 | { 35 | Content: "A pdf for testing", 36 | Source: "index.html", 37 | }, 38 | }, 39 | wantErr: false, 40 | }, 41 | } 42 | for _, tt := range tests { 43 | t.Run(tt.name, func(t *testing.T) { 44 | got, err := processEpubContent(tt.args.r) 45 | if (err != nil) != tt.wantErr { 46 | t.Errorf("processEpubContent() error = %v, wantErr %v", err, tt.wantErr) 47 | return 48 | } 49 | if !reflect.DeepEqual(got, tt.want) { 50 | t.Errorf("processEpubContent() = %v, want %v", got, tt.want) 51 | } 52 | }) 53 | } 54 | } 55 | 56 | func TestProcessEpub(t *testing.T) { 57 | type args struct { 58 | r io.Reader 59 | url string 60 | } 61 | tests := []struct { 62 | name string 63 | args args 64 | want []common.Chunk 65 | wantErr bool 66 | }{ 67 | { 68 | name: "success", 69 | 
args: args{ 70 | r: func() io.Reader { 71 | f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub")) 72 | return f 73 | }(), 74 | url: "https://example.com/test.epub", 75 | }, 76 | want: []common.Chunk{ 77 | { 78 | Content: "A pdf for testing", 79 | Source: "https://example.com/test.epub", 80 | }, 81 | }, 82 | wantErr: false, 83 | }, 84 | { 85 | name: "error", 86 | args: args{ 87 | r: strings.NewReader("key: value, key2: value2"), 88 | url: "https://example.com/data.yaml", 89 | }, 90 | want: nil, 91 | wantErr: true, 92 | }, 93 | { 94 | name: "empty", 95 | args: args{ 96 | r: strings.NewReader(""), 97 | url: "https://example.com", 98 | }, 99 | want: nil, 100 | wantErr: true, 101 | }, 102 | { 103 | name: "unreadable", 104 | args: args{ 105 | r: func() io.Reader { f, _ := os.Open("nonexistent.epub"); return f }(), 106 | url: "https://example.com/nonexistent.epub", 107 | }, 108 | want: nil, 109 | wantErr: true, 110 | }, 111 | } 112 | for _, tt := range tests { 113 | t.Run(tt.name, func(t *testing.T) { 114 | got, err := ProcessEpub(tt.args.r, tt.args.url) 115 | if (err != nil) != tt.wantErr { 116 | t.Errorf("ProcessEpub() error = %v, wantErr %v", err, tt.wantErr) 117 | return 118 | } 119 | if !reflect.DeepEqual(got, tt.want) { 120 | t.Errorf("ProcessEpub() = %v, want %v", got, tt.want) 121 | } 122 | }) 123 | } 124 | } 125 | 126 | func Test_extractTextFromHTML(t *testing.T) { 127 | file, _ := utils.OpenFile("testdata/invalid.html") 128 | type args struct { 129 | r io.Reader 130 | } 131 | tests := []struct { 132 | name string 133 | args args 134 | want string 135 | wantErr bool 136 | }{ 137 | { 138 | name: "success", 139 | args: args{ 140 | r: strings.NewReader("

some content

"), 141 | }, 142 | want: "some content", 143 | wantErr: false, 144 | }, 145 | { 146 | name: "error", 147 | args: args{ 148 | r: file, 149 | }, 150 | want: "", 151 | wantErr: true, 152 | }, 153 | } 154 | for _, tt := range tests { 155 | t.Run(tt.name, func(t *testing.T) { 156 | got, err := extractTextFromHTML(tt.args.r) 157 | if (err != nil) != tt.wantErr { 158 | t.Errorf("extractTextFromHTML() error = %v, wantErr %v", err, tt.wantErr) 159 | return 160 | } 161 | if got != tt.want { 162 | t.Errorf("extractTextFromHTML() = %v, want %v", got, tt.want) 163 | } 164 | }) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /internal/document/pdf.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "log" 8 | "strings" 9 | 10 | "github.com/ledongthuc/pdf" 11 | "github.com/mmatongo/chew/v1/internal/common" 12 | ) 13 | 14 | func ProcessPDF(r io.Reader, url string) ([]common.Chunk, error) { 15 | pdfData, err := io.ReadAll(r) 16 | if err != nil { 17 | return nil, err 18 | } 19 | 20 | f, err := pdf.NewReader(bytes.NewReader(pdfData), int64(len(pdfData))) 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | var chunks []common.Chunk 26 | for i := 1; i <= f.NumPage(); i++ { 27 | p := f.Page(i) 28 | if p.V.IsNull() { 29 | continue 30 | } 31 | text, err := p.GetPlainText(nil) 32 | if err != nil { 33 | log.Printf("Error extracting text from page %d: %v\n\n", i, err) 34 | continue 35 | } 36 | 37 | text = strings.TrimSpace(text) 38 | text = strings.ReplaceAll(text, "\n", "\n\n") 39 | 40 | chunks = append(chunks, common.Chunk{ 41 | Content: text, 42 | Source: fmt.Sprintf("%s#page=%d", url, i), 43 | }) 44 | } 45 | 46 | if len(chunks) == 0 { 47 | return nil, err 48 | } 49 | 50 | return chunks, nil 51 | } 52 | -------------------------------------------------------------------------------- /internal/document/pdf_test.go: 
// getRootPath returns the directory two levels above the current working
// directory. It fails the test immediately if the working directory cannot be
// determined.
func getRootPath(t *testing.T) string {
	t.Helper()

	workDir, err := os.Getwd()
	if err != nil {
		t.Fatalf("getting current folder: %s", err)
	}

	parent := filepath.Dir(workDir)
	return filepath.Dir(parent)
}
t.Errorf("ProcessPDF() = %v, want %v", got, tt.want) 89 | } 90 | }) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /internal/document/pptx.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "io" 7 | "strings" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | "github.com/mmatongo/chew/v1/internal/utils" 11 | ) 12 | 13 | func processPptxContent(r io.Reader) ([]string, error) { 14 | data, err := io.ReadAll(r) 15 | if err != nil { 16 | return nil, err 17 | } 18 | 19 | zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data))) 20 | if err != nil { 21 | return nil, err 22 | } 23 | 24 | var contents []string 25 | 26 | for _, file := range zipReader.File { 27 | if strings.HasPrefix(file.Name, "ppt/slides/") { 28 | slideText, err := utils.ExtractTextFromXML(file) 29 | if err != nil { 30 | return nil, err 31 | } 32 | contents = append(contents, slideText...) 
33 | } 34 | } 35 | 36 | var allContent strings.Builder 37 | for _, content := range contents { 38 | allContent.WriteString(content) 39 | allContent.WriteString(" ") 40 | } 41 | 42 | return []string{allContent.String()}, nil 43 | 44 | /* 45 | // In the event we just want chunks we can just return contents 46 | return contents, nil 47 | */ 48 | } 49 | 50 | func ProcessPptx(r io.Reader, url string) ([]common.Chunk, error) { 51 | content, err := processPptxContent(r) 52 | if err != nil { 53 | return nil, err 54 | } 55 | 56 | var chunks []common.Chunk 57 | for _, chunk := range content { 58 | if strings.TrimSpace(string(chunk)) != "" { 59 | chunks = append(chunks, common.Chunk{Content: string(chunk), Source: url}) 60 | } 61 | } 62 | 63 | return chunks, nil 64 | } 65 | -------------------------------------------------------------------------------- /internal/document/pptx_test.go: -------------------------------------------------------------------------------- 1 | package document 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "io" 7 | "reflect" 8 | "testing" 9 | 10 | "github.com/mmatongo/chew/v1/internal/common" 11 | ) 12 | 13 | func createPptxWithContent(content string) io.Reader { 14 | buf := new(bytes.Buffer) 15 | w := zip.NewWriter(buf) 16 | f, _ := w.Create("ppt/slides/slide1.xml") 17 | f.Write([]byte(content)) 18 | w.Close() 19 | return bytes.NewReader(buf.Bytes()) 20 | } 21 | 22 | func createEmptyPptx() io.Reader { 23 | return createPptxWithContent(``) 24 | } 25 | 26 | func createSingleParagraphPptx(content string) io.Reader { 27 | return createPptxWithContent(`

` + content + `

`) 28 | } 29 | 30 | func TestProcessPptx(t *testing.T) { 31 | type args struct { 32 | r io.Reader 33 | url string 34 | } 35 | tests := []struct { 36 | name string 37 | args args 38 | want []common.Chunk 39 | wantErr bool 40 | }{ 41 | { 42 | name: "Empty pptx file", 43 | args: args{r: createEmptyPptx(), url: "http://example.com"}, 44 | want: nil, 45 | wantErr: false, 46 | }, 47 | { 48 | name: "Single paragraph pptx file", 49 | args: args{r: createSingleParagraphPptx("Hello from chew!"), url: "http://example.com"}, 50 | want: []common.Chunk{{Content: "Hello from chew! ", Source: "http://example.com"}}, 51 | wantErr: false, 52 | }, 53 | } 54 | for _, tt := range tests { 55 | t.Run(tt.name, func(t *testing.T) { 56 | got, err := ProcessPptx(tt.args.r, tt.args.url) 57 | if (err != nil) != tt.wantErr { 58 | t.Errorf("ProcessPptx() error = %v, wantErr %v", err, tt.wantErr) 59 | return 60 | } 61 | if !reflect.DeepEqual(got, tt.want) { 62 | t.Errorf("ProcessPptx() = %v, want %v", got, tt.want) 63 | } 64 | }) 65 | } 66 | } 67 | 68 | func TestProcessPptx_Error_ReadAll(t *testing.T) { 69 | _, err := processPptxContent(&errorReader{}) 70 | if err == nil { 71 | t.Error("ProcessPptx() did not return an error, but one was expected") 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /internal/text/csv.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "encoding/csv" 5 | "io" 6 | "strings" 7 | 8 | "github.com/mmatongo/chew/v1/internal/common" 9 | ) 10 | 11 | func ProcessCSV(r io.Reader, url string) ([]common.Chunk, error) { 12 | csvReader := csv.NewReader(r) 13 | var records [][]string 14 | var err error 15 | 16 | records, err = csvReader.ReadAll() 17 | if err != nil { 18 | return nil, err 19 | } 20 | 21 | var chunks []common.Chunk 22 | for _, record := range records { 23 | chunks = append(chunks, common.Chunk{Content: strings.Join(record, ", "), Source: url}) 24 
| } 25 | 26 | return chunks, nil 27 | } 28 | -------------------------------------------------------------------------------- /internal/text/csv_test.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | ) 11 | 12 | func TestProcessCSV(t *testing.T) { 13 | type args struct { 14 | r io.Reader 15 | url string 16 | } 17 | tests := []struct { 18 | name string 19 | args args 20 | want []common.Chunk 21 | wantErr bool 22 | }{ 23 | { 24 | name: "success", 25 | args: args{ 26 | r: strings.NewReader("Test content"), 27 | url: "https://example.com", 28 | }, 29 | want: []common.Chunk{{ 30 | Content: "Test content", 31 | Source: "https://example.com", 32 | }}, 33 | wantErr: false, 34 | }, 35 | { 36 | name: "empty", 37 | args: args{ 38 | r: strings.NewReader(""), 39 | url: "https://example.com", 40 | }, 41 | want: nil, 42 | wantErr: false, 43 | }, 44 | { 45 | name: "CSV with quoted fields", 46 | args: args{ 47 | r: strings.NewReader("\"header 1\",\"header 2\"\n\"value, with comma\",\"value2\""), 48 | url: "https://example.com/quoted.csv", 49 | }, 50 | want: []common.Chunk{ 51 | {Content: "header 1, header 2", Source: "https://example.com/quoted.csv"}, 52 | {Content: "value, with comma, value2", Source: "https://example.com/quoted.csv"}, 53 | }, 54 | wantErr: false, 55 | }, 56 | } 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | got, err := ProcessCSV(tt.args.r, tt.args.url) 60 | if (err != nil) != tt.wantErr { 61 | t.Errorf("ProcessCSV() error = %v, wantErr %v", err, tt.wantErr) 62 | return 63 | } 64 | if !reflect.DeepEqual(got, tt.want) { 65 | t.Errorf("ProcessCSV() = %v, want %v", got, tt.want) 66 | } 67 | }) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /internal/text/html.go: 
-------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "strings" 7 | 8 | "github.com/PuerkitoBio/goquery" 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | ) 11 | 12 | func ProcessHTML(r io.Reader, url string) ([]common.Chunk, error) { 13 | doc, err := goquery.NewDocumentFromReader(r) 14 | if err != nil { 15 | return nil, fmt.Errorf("failed to parse HTML: %w", err) 16 | } 17 | 18 | var chunks []common.Chunk 19 | /* 20 | We're only interested in the text content of the HTML document 21 | so we're going to ignore the tags that don't contain useful text. 22 | This is a very naive approach and might not work for all HTML documents unfortunately 23 | */ 24 | 25 | doc.Find("nav, header, footer").Remove() 26 | 27 | doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) { 28 | text := strings.TrimSpace(s.Text()) 29 | if text != "" { 30 | chunks = append(chunks, common.Chunk{Content: text, Source: url}) 31 | } 32 | }) 33 | 34 | return chunks, nil 35 | } 36 | -------------------------------------------------------------------------------- /internal/text/html_test.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | "github.com/mmatongo/chew/v1/internal/utils" 11 | ) 12 | 13 | func TestProcessHTML(t *testing.T) { 14 | file, _ := utils.OpenFile("testdata/invalid.html") 15 | type args struct { 16 | r io.Reader 17 | url string 18 | } 19 | tests := []struct { 20 | name string 21 | args args 22 | want []common.Chunk 23 | wantErr bool 24 | }{ 25 | { 26 | name: "success", 27 | args: args{ 28 | r: strings.NewReader(` 29 | 30 | 31 | 32 | Test HTML 33 | 34 | 35 |

Test content

36 |

This is a test paragraph.

37 | 38 | 39 | `), 40 | url: "https://example.com/page.html", 41 | }, 42 | want: []common.Chunk{ 43 | { 44 | Content: "Test content", 45 | Source: "https://example.com/page.html", 46 | }, 47 | { 48 | Content: "This is a test paragraph.", 49 | Source: "https://example.com/page.html", 50 | }, 51 | }, 52 | wantErr: false, 53 | }, 54 | { 55 | name: "empty", 56 | args: args{ 57 | r: strings.NewReader(""), 58 | url: "https://example.com", 59 | }, 60 | want: nil, 61 | wantErr: false, 62 | }, 63 | { 64 | name: "invalid content as a reader", 65 | args: args{ 66 | r: file, 67 | url: "https://example.com", 68 | }, 69 | want: nil, 70 | wantErr: true, 71 | }, 72 | } 73 | for _, tt := range tests { 74 | t.Run(tt.name, func(t *testing.T) { 75 | got, err := ProcessHTML(tt.args.r, tt.args.url) 76 | if (err != nil) != tt.wantErr { 77 | t.Errorf("processHTML() error = %v, wantErr %v", err, tt.wantErr) 78 | return 79 | } 80 | if !reflect.DeepEqual(got, tt.want) { 81 | t.Errorf("processHTML() = %v, want %v", got, tt.want) 82 | } 83 | }) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /internal/text/json.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | 8 | "github.com/mmatongo/chew/v1/internal/common" 9 | ) 10 | 11 | func ProcessJSON(r io.Reader, url string) ([]common.Chunk, error) { 12 | var data interface{} 13 | if err := json.NewDecoder(r).Decode(&data); err != nil { 14 | return nil, err 15 | } 16 | 17 | jsonStr, err := json.MarshalIndent(data, "", " ") 18 | if err != nil { 19 | return nil, fmt.Errorf("failed to marshal json: %w", err) 20 | } 21 | 22 | return []common.Chunk{{Content: string(jsonStr), Source: url}}, nil 23 | } 24 | -------------------------------------------------------------------------------- /internal/text/json_test.go: -------------------------------------------------------------------------------- 1 | 
package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | ) 11 | 12 | func TestProcessJSON(t *testing.T) { 13 | type args struct { 14 | r io.Reader 15 | url string 16 | } 17 | tests := []struct { 18 | name string 19 | args args 20 | want []common.Chunk 21 | wantErr bool 22 | }{ 23 | { 24 | name: "success", 25 | args: args{ 26 | r: strings.NewReader(`{"key": "value"}`), 27 | url: "https://example.com/data.json", 28 | }, 29 | want: []common.Chunk{{ 30 | Content: "{\n \"key\": \"value\"\n}", 31 | Source: "https://example.com/data.json", 32 | }}, 33 | wantErr: false, 34 | }, 35 | { 36 | name: "empty", 37 | args: args{ 38 | r: strings.NewReader(""), 39 | url: "https://example.com", 40 | }, 41 | want: nil, 42 | wantErr: true, 43 | }, 44 | { 45 | name: "valid empty json", 46 | args: args{ 47 | r: strings.NewReader("{}"), 48 | url: "https://example.com", 49 | }, 50 | want: []common.Chunk{{ 51 | Content: "{}", 52 | Source: "https://example.com", 53 | }}, 54 | wantErr: false, 55 | }, 56 | } 57 | for _, tt := range tests { 58 | t.Run(tt.name, func(t *testing.T) { 59 | got, err := ProcessJSON(tt.args.r, tt.args.url) 60 | if (err != nil) != tt.wantErr { 61 | t.Errorf("ProcessJSON() error = %v, wantErr %v", err, tt.wantErr) 62 | return 63 | } 64 | if !reflect.DeepEqual(got, tt.want) { 65 | t.Errorf("ProcessJSON() = %v, want %v", got, tt.want) 66 | } 67 | }) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /internal/text/markdown.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | var ProcessMd = ProcessText 4 | -------------------------------------------------------------------------------- /internal/text/plaintext.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | 6 | 
"github.com/mmatongo/chew/v1/internal/common" 7 | ) 8 | 9 | func ProcessText(r io.Reader, url string) ([]common.Chunk, error) { 10 | content, err := io.ReadAll(r) 11 | if err != nil { 12 | return nil, err 13 | } 14 | 15 | if len(content) == 0 { 16 | return nil, nil 17 | } 18 | 19 | return []common.Chunk{{Content: string(content), Source: url}}, nil 20 | } 21 | -------------------------------------------------------------------------------- /internal/text/plaintext_test.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | "github.com/mmatongo/chew/v1/internal/utils" 11 | ) 12 | 13 | func TestProcessText(t *testing.T) { 14 | file, _ := utils.OpenFile("testdata/invalid.html") 15 | type args struct { 16 | r io.Reader 17 | url string 18 | } 19 | tests := []struct { 20 | name string 21 | args args 22 | want []common.Chunk 23 | wantErr bool 24 | }{ 25 | { 26 | name: "success", 27 | args: args{ 28 | r: strings.NewReader("Test content"), 29 | url: "https://example.com", 30 | }, 31 | want: []common.Chunk{{ 32 | Content: "Test content", 33 | Source: "https://example.com", 34 | }}, 35 | wantErr: false, 36 | }, 37 | { 38 | name: "empty", 39 | args: args{ 40 | r: strings.NewReader(""), 41 | url: "https://example.com", 42 | }, 43 | want: nil, 44 | wantErr: false, 45 | }, 46 | { 47 | name: "invalid", 48 | args: args{ 49 | r: file, 50 | url: "https://example.com", 51 | }, 52 | want: nil, 53 | wantErr: true, 54 | }, 55 | } 56 | for _, tt := range tests { 57 | t.Run(tt.name, func(t *testing.T) { 58 | got, err := ProcessText(tt.args.r, tt.args.url) 59 | if (err != nil) != tt.wantErr { 60 | t.Errorf("ProcessText() error = %v, wantErr %v", err, tt.wantErr) 61 | return 62 | } 63 | if !reflect.DeepEqual(got, tt.want) { 64 | t.Errorf("ProcessText() = %v, want %v", got, tt.want) 65 | } 66 | }) 67 | } 68 | } 69 | 
-------------------------------------------------------------------------------- /internal/text/xml.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "bytes" 5 | "encoding/xml" 6 | "io" 7 | 8 | "github.com/mmatongo/chew/v1/internal/common" 9 | ) 10 | 11 | func ProcessXML(r io.Reader, url string) ([]common.Chunk, error) { 12 | decoder := xml.NewDecoder(r) 13 | var chunks []common.Chunk 14 | var currentElement string 15 | for { 16 | t, err := decoder.Token() 17 | if err == io.EOF { 18 | break 19 | } 20 | if err != nil { 21 | return nil, err 22 | } 23 | switch se := t.(type) { 24 | case xml.StartElement: 25 | currentElement = se.Name.Local 26 | case xml.CharData: 27 | content := string(bytes.TrimSpace(se)) 28 | if content != "" && currentElement != "" { 29 | chunks = append(chunks, common.Chunk{ 30 | Content: content, 31 | Source: url, 32 | }) 33 | } 34 | } 35 | } 36 | return chunks, nil 37 | } 38 | -------------------------------------------------------------------------------- /internal/text/xml_test.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | ) 11 | 12 | func TestProcessXML(t *testing.T) { 13 | type args struct { 14 | r io.Reader 15 | url string 16 | } 17 | tests := []struct { 18 | name string 19 | args args 20 | want []common.Chunk 21 | wantErr bool 22 | }{ 23 | { 24 | name: "success", 25 | args: args{ 26 | r: strings.NewReader("Test content"), 27 | url: "https://example.com", 28 | }, 29 | want: []common.Chunk{{ 30 | Content: "Test content", 31 | Source: "https://example.com", 32 | }}, 33 | 34 | wantErr: false, 35 | }, 36 | } 37 | for _, tt := range tests { 38 | t.Run(tt.name, func(t *testing.T) { 39 | got, err := ProcessXML(tt.args.r, tt.args.url) 40 | if (err != nil) != tt.wantErr { 41 | 
t.Errorf("ProcessXML() error = %v, wantErr %v", err, tt.wantErr) 42 | return 43 | } 44 | if !reflect.DeepEqual(got, tt.want) { 45 | t.Errorf("ProcessXML() = %v, want %v", got, tt.want) 46 | } 47 | }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /internal/text/yaml.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | 6 | "github.com/mmatongo/chew/v1/internal/common" 7 | "gopkg.in/yaml.v3" 8 | ) 9 | 10 | func ProcessYAML(r io.Reader, url string) ([]common.Chunk, error) { 11 | var data interface{} 12 | if err := yaml.NewDecoder(r).Decode(&data); err != nil { 13 | return nil, err 14 | } 15 | 16 | yamlStr, err := yaml.Marshal(data) 17 | if err != nil { 18 | return nil, err 19 | } 20 | 21 | return []common.Chunk{{Content: string(yamlStr), Source: url}}, nil 22 | } 23 | -------------------------------------------------------------------------------- /internal/text/yaml_test.go: -------------------------------------------------------------------------------- 1 | package text 2 | 3 | import ( 4 | "io" 5 | "reflect" 6 | "strings" 7 | "testing" 8 | 9 | "github.com/mmatongo/chew/v1/internal/common" 10 | ) 11 | 12 | func TestProcessYAML(t *testing.T) { 13 | type args struct { 14 | r io.Reader 15 | url string 16 | } 17 | tests := []struct { 18 | name string 19 | args args 20 | want []common.Chunk 21 | wantErr bool 22 | }{ 23 | { 24 | name: "success", 25 | args: args{ 26 | r: strings.NewReader("key: value\nkey2: value2"), 27 | url: "https://example.com/data.yaml", 28 | }, 29 | want: []common.Chunk{ 30 | { 31 | Content: "key: value\nkey2: value2\n", 32 | Source: "https://example.com/data.yaml", 33 | }, 34 | }, 35 | wantErr: false, 36 | }, 37 | { 38 | name: "error", 39 | args: args{ 40 | r: strings.NewReader("key: value, key2: value2"), 41 | url: "https://example.com/data.yaml", 42 | }, 43 | want: nil, 44 | wantErr: true, 45 | }, 46 | } 47 | for _, 
tt := range tests { 48 | t.Run(tt.name, func(t *testing.T) { 49 | got, err := ProcessYAML(tt.args.r, tt.args.url) 50 | if (err != nil) != tt.wantErr { 51 | t.Errorf("ProcessYAML() error = %v, wantErr %v", err, tt.wantErr) 52 | return 53 | } 54 | if !reflect.DeepEqual(got, tt.want) { 55 | t.Errorf("ProcessYAML() = %v, want %v", got, tt.want) 56 | } 57 | }) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /internal/transcribe/google_transcriber.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "path/filepath" 8 | 9 | "cloud.google.com/go/storage" 10 | 11 | "github.com/mmatongo/chew/v1/internal/audio" 12 | "github.com/mmatongo/chew/v1/internal/utils/gcs" 13 | ) 14 | 15 | type googleTranscriber struct{} 16 | 17 | /* 18 | This relies too heavily on external dependencies and is not easily testable. A refactor is needed to make it more testable and is currently in progress. 
19 | */ 20 | func (gt *googleTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) { 21 | client, err := gcs.NewSpeechClient(ctx, opts) 22 | if err != nil { 23 | return "", fmt.Errorf("failed to create speech client: %w", err) 24 | } 25 | defer func() { 26 | if cerr := client.Close(); cerr != nil { 27 | err = errors.Join(err, fmt.Errorf("failed to close transcribe client: %w", cerr)) 28 | } 29 | }() 30 | 31 | storageClient, err := gcs.NewStorageClient(ctx, opts) 32 | if err != nil { 33 | return "", err 34 | } 35 | defer func(storageClient *storage.Client) { 36 | err := storageClient.Close() 37 | if err != nil { 38 | fmt.Printf("failed to close storage client: %v\n", err) 39 | } 40 | }(storageClient) 41 | 42 | audioInfo, err := audio.GetAudioInfo(filename) 43 | if err != nil { 44 | return "", fmt.Errorf("failed to process audio file: %w", err) 45 | } 46 | 47 | gcsURI, err := gcs.UploadToGCS(ctx, storageClient, opts.Bucket, filename) 48 | if err != nil { 49 | return "", fmt.Errorf("failed to upload to GCS: %w", err) 50 | } 51 | 52 | if opts.CleanupOnComplete { 53 | defer func(ctx context.Context, client *storage.Client, bucket, objectName string) { 54 | err := gcs.DeleteFromGCS(ctx, client, bucket, objectName) 55 | if err != nil { 56 | fmt.Printf("failed to delete object from GCS: %v\n", err) 57 | } 58 | }(ctx, storageClient, opts.Bucket, filepath.Base(filename)) 59 | } 60 | 61 | req := gcs.NewRecognitionRequest(opts, audioInfo, gcsURI) 62 | 63 | op, err := client.LongRunningRecognize(ctx, req) 64 | if err != nil { 65 | return "", fmt.Errorf("failed to start long running recognition: %w", err) 66 | } 67 | 68 | resp, err := op.Wait(ctx) 69 | if err != nil { 70 | return "", fmt.Errorf("failed to get long running recognition results: %w", err) 71 | } 72 | 73 | return gcs.ExtractTranscript(resp), nil 74 | } 75 | -------------------------------------------------------------------------------- 
/internal/transcribe/google_transcriber_test.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "context" 5 | "testing" 6 | ) 7 | 8 | func Test_googleTranscriber_process(t *testing.T) { 9 | type args struct { 10 | ctx context.Context 11 | filename string 12 | opts TranscribeOptions 13 | } 14 | tests := []struct { 15 | name string 16 | gt *googleTranscriber 17 | args args 18 | want string 19 | wantErr bool 20 | }{ 21 | { 22 | name: "failed to create speech client", 23 | gt: &googleTranscriber{}, 24 | args: args{ 25 | ctx: context.Background(), 26 | filename: "test.mp3", 27 | opts: TranscribeOptions{}, 28 | }, 29 | want: "", 30 | wantErr: true, 31 | }, 32 | } 33 | for _, tt := range tests { 34 | t.Run(tt.name, func(t *testing.T) { 35 | gt := &googleTranscriber{} 36 | got, err := gt.process(tt.args.ctx, tt.args.filename, tt.args.opts) 37 | if (err != nil) != tt.wantErr { 38 | t.Errorf("googleTranscriber.process() error = %v, wantErr %v", err, tt.wantErr) 39 | return 40 | } 41 | if got != tt.want { 42 | t.Errorf("googleTranscriber.process() = %v, want %v", got, tt.want) 43 | } 44 | }) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /internal/transcribe/transcribe.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "sync" 7 | 8 | "github.com/mmatongo/chew/v1/internal/common" 9 | ) 10 | 11 | /* 12 | The TranscribeOptions struct contains the options for transcribing an audio file. It allows the user 13 | to specify the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code 14 | to use for transcription, a potion to enable diarization including the min and max speakers and 15 | an option to clean up the audio file from GCS after transcription is complete. 
16 | 17 | And also, it allows the user to specify whether to use the Whisper API for transcription, and if so, 18 | the API key, model, and prompt to use. 19 | */ 20 | type TranscribeOptions = common.TranscribeOptions 21 | 22 | // code is largely inspired by https://github.com/polyfact/polyfire-api 23 | 24 | type transcribeOption func(*transcribeConfig) 25 | 26 | type transcribeConfig struct { 27 | t transcriber 28 | } 29 | 30 | func WithTranscriber(t transcriber) transcribeOption { 31 | return func(config *transcribeConfig) { 32 | config.t = t 33 | } 34 | } 35 | 36 | /* 37 | Transcribe uses the Google Cloud Speech-to-Text API to transcribe an audio file. It takes 38 | a context, the filename of the audio file to transcribe, and a TranscribeOptions struct which 39 | contains the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code 40 | to use for transcription, a potion to enable diarization including the min and max speakers and 41 | an option to clean up the audio file from GCS after transcription is complete. 42 | It returns the transcript of the audio file as a string and an error if the transcription fails. 
43 | */ 44 | func Transcribe(ctx context.Context, filenames []string, opts TranscribeOptions, options ...transcribeOption) (map[string]string, error) { 45 | config := &transcribeConfig{} 46 | for _, option := range options { 47 | option(config) 48 | } 49 | 50 | if config.t == nil { 51 | if opts.UseWhisper { 52 | config.t = &whisperTranscriber{} 53 | } else { 54 | config.t = &googleTranscriber{} 55 | } 56 | } 57 | 58 | var ( 59 | results = make(map[string]string) 60 | wg sync.WaitGroup 61 | mu sync.Mutex 62 | errCh = make(chan error, len(filenames)) 63 | ) 64 | 65 | for _, filename := range filenames { 66 | wg.Add(1) 67 | go func(filename string) { 68 | defer wg.Done() 69 | 70 | transcript, err := config.t.process(ctx, filename, opts) 71 | if err != nil { 72 | select { 73 | case errCh <- fmt.Errorf("transcribing %s: %w", filename, err): 74 | default: 75 | } 76 | return 77 | } 78 | 79 | mu.Lock() 80 | results[filename] = transcript 81 | mu.Unlock() 82 | }(filename) 83 | } 84 | 85 | go func() { 86 | wg.Wait() 87 | close(errCh) 88 | }() 89 | 90 | select { 91 | case err := <-errCh: 92 | if err != nil { 93 | return nil, err 94 | } 95 | case <-ctx.Done(): 96 | return nil, ctx.Err() 97 | } 98 | 99 | return results, nil 100 | } 101 | -------------------------------------------------------------------------------- /internal/transcribe/transcribe_test.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "reflect" 7 | "testing" 8 | ) 9 | 10 | type mockTranscriber struct { 11 | processFn func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) 12 | } 13 | 14 | func (m *mockTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) { 15 | if m.processFn != nil { 16 | return m.processFn(ctx, filename, opts) 17 | } 18 | return "", nil 19 | } 20 | 21 | func TestTranscribe(t *testing.T) { 22 | type args struct { 23 | 
ctx context.Context 24 | filenames []string 25 | opts TranscribeOptions 26 | } 27 | tests := []struct { 28 | name string 29 | args args 30 | want map[string]string 31 | wantErr bool 32 | mockFn func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) 33 | }{ 34 | { 35 | name: "Test Transcribe", 36 | args: args{ 37 | ctx: context.Background(), 38 | filenames: []string{"test1.mp3", "test2.mp3"}, 39 | opts: TranscribeOptions{ 40 | CredentialsJSON: []byte("``"), 41 | Bucket: "test-bucket", 42 | LanguageCode: "en-US", 43 | EnableDiarization: false, 44 | MinSpeakers: 0, 45 | MaxSpeakers: 0, 46 | CleanupOnComplete: false, 47 | UseWhisper: false, 48 | WhisperAPIKey: "", 49 | WhisperModel: "", 50 | WhisperPrompt: "", 51 | }, 52 | }, 53 | want: map[string]string{ 54 | "test1.mp3": "transcript for test1.mp3", 55 | "test2.mp3": "transcript for test2.mp3", 56 | }, 57 | wantErr: false, 58 | mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) { 59 | return "transcript for " + filename, nil 60 | }, 61 | }, 62 | { 63 | name: "Test Transcribe Error", 64 | args: args{ 65 | ctx: context.Background(), 66 | filenames: []string{"test1.mp3", "test2.mp3"}, 67 | opts: TranscribeOptions{}, 68 | }, 69 | want: nil, 70 | wantErr: true, 71 | mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) { 72 | return "", fmt.Errorf("mock error") 73 | }, 74 | }, 75 | } 76 | for _, tt := range tests { 77 | t.Run(tt.name, func(t *testing.T) { 78 | mockT := &mockTranscriber{ 79 | processFn: tt.mockFn, 80 | } 81 | got, err := Transcribe(tt.args.ctx, tt.args.filenames, tt.args.opts, WithTranscriber(mockT)) 82 | if (err != nil) != tt.wantErr { 83 | t.Errorf("Transcribe() error = %v, wantErr %v", err, tt.wantErr) 84 | return 85 | } 86 | if !reflect.DeepEqual(got, tt.want) { 87 | t.Errorf("Transcribe() = %v, want %v", got, tt.want) 88 | } 89 | }) 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /internal/transcribe/types.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "context" 5 | "io" 6 | "net/http" 7 | ) 8 | 9 | type transcriber interface { 10 | process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) 11 | } 12 | 13 | type whisperTranscriber struct{} 14 | 15 | type httpClient interface { 16 | Do(req *http.Request) (*http.Response, error) 17 | } 18 | 19 | type fileOpener func(name string) (io.ReadCloser, error) 20 | -------------------------------------------------------------------------------- /internal/transcribe/whisper.go: -------------------------------------------------------------------------------- 1 | package transcribe 2 | 3 | import ( 4 | "bytes" 5 | "context" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | "mime/multipart" 10 | "net/http" 11 | "os" 12 | "path/filepath" 13 | ) 14 | 15 | func processWhisper(ctx context.Context, filename string, opts TranscribeOptions, client httpClient, opener fileOpener) (string, error) { 16 | if client == nil { 17 | client = &http.Client{} 18 | } 19 | if opener == nil { 20 | opener = func(name string) (io.ReadCloser, error) { 21 | return os.Open(name) 22 | } 23 | } 24 | 25 | file, err := opener(filename) 26 | if err != nil { 27 | return "", fmt.Errorf("failed to open file: %w", err) 28 | } 29 | defer func() { 30 | if cerr := file.Close(); cerr != nil { 31 | err = fmt.Errorf("failed to close file: %v (original error: %w)", cerr, err) 32 | } 33 | }() 34 | 35 | body := &bytes.Buffer{} 36 | writer := multipart.NewWriter(body) 37 | 38 | part, err := writer.CreateFormFile("file", filepath.Base(filename)) 39 | if err != nil { 40 | return "", fmt.Errorf("failed to create form file: %w", err) 41 | } 42 | if _, err = io.Copy(part, file); err != nil { 43 | return "", fmt.Errorf("failed to copy file content: %w", err) 44 | } 45 | 46 | 
if err = writeFields(writer, opts); err != nil { 47 | return "", err 48 | } 49 | 50 | if err = writer.Close(); err != nil { 51 | return "", fmt.Errorf("failed to close writer: %w", err) 52 | } 53 | 54 | req, err := http.NewRequestWithContext(ctx, "POST", "https://api.openai.com/v1/audio/transcriptions", body) 55 | if err != nil { 56 | return "", fmt.Errorf("failed to create request: %w", err) 57 | } 58 | 59 | req.Header.Set("Authorization", "Bearer "+opts.WhisperAPIKey) 60 | req.Header.Set("Content-Type", writer.FormDataContentType()) 61 | 62 | resp, err := client.Do(req) 63 | if err != nil { 64 | return "", fmt.Errorf("failed to send request: %w", err) 65 | } 66 | defer func() { 67 | if cerr := resp.Body.Close(); cerr != nil { 68 | err = fmt.Errorf("failed to close response body: %v (original error: %w)", cerr, err) 69 | } 70 | }() 71 | 72 | if resp.StatusCode != http.StatusOK { 73 | bodyBytes, _ := io.ReadAll(resp.Body) 74 | return "", fmt.Errorf("API request failed with status code %d: %s", resp.StatusCode, string(bodyBytes)) 75 | } 76 | 77 | var result struct { 78 | Text string `json:"text"` 79 | } 80 | if err = json.NewDecoder(resp.Body).Decode(&result); err != nil { 81 | return "", fmt.Errorf("failed to decode response: %w", err) 82 | } 83 | 84 | return result.Text, nil 85 | } 86 | 87 | func writeFields(writer *multipart.Writer, opts TranscribeOptions) error { 88 | fields := map[string]string{ 89 | "model": opts.WhisperModel, 90 | "language": opts.LanguageCode, 91 | "prompt": opts.WhisperPrompt, 92 | } 93 | 94 | for key, value := range fields { 95 | if value != "" { 96 | if err := writer.WriteField(key, value); err != nil { 97 | return fmt.Errorf("failed to write %s field: %w", key, err) 98 | } 99 | } 100 | } 101 | 102 | return nil 103 | } 104 | 105 | func (wt *whisperTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) { 106 | return processWhisper(ctx, filename, opts, nil, nil) 107 | } 108 | 
// mockHTTPClient is a test double for the httpClient interface whose
// behaviour is supplied per test through DoFunc.
type mockHTTPClient struct {
	DoFunc func(req *http.Request) (*http.Response, error)
}

// Do forwards req to DoFunc. A mock without DoFunc reports an error instead
// of panicking on the nil function, so a misconfigured test case fails with a
// readable message.
func (m *mockHTTPClient) Do(req *http.Request) (*http.Response, error) {
	if m.DoFunc == nil {
		return nil, errors.New("mockHTTPClient: DoFunc is not set")
	}
	return m.DoFunc(req)
}

// mockFile adapts an in-memory bytes.Reader to io.ReadCloser for tests.
type mockFile struct {
	*bytes.Reader
}

// Close always succeeds; there is no underlying resource to release.
func (m *mockFile) Close() error {
	return nil
}
56 | }`)), 57 | }, nil 58 | }, 59 | } 60 | 61 | errorMockClient := &mockHTTPClient{ 62 | DoFunc: func(req *http.Request) (*http.Response, error) { 63 | return nil, errors.New("mock HTTP error") 64 | }, 65 | } 66 | 67 | badResponseMockClient := &mockHTTPClient{ 68 | DoFunc: func(req *http.Request) (*http.Response, error) { 69 | return &http.Response{ 70 | StatusCode: 400, 71 | Body: io.NopCloser(bytes.NewBufferString(`{"error": "Bad Request"}`)), 72 | }, nil 73 | }, 74 | } 75 | 76 | invalidJSONMockClient := &mockHTTPClient{ 77 | DoFunc: func(req *http.Request) (*http.Response, error) { 78 | return &http.Response{ 79 | StatusCode: 200, 80 | Body: io.NopCloser(bytes.NewBufferString(`invalid JSON`)), 81 | }, nil 82 | }, 83 | } 84 | 85 | successfulMockFileOpener := func(name string) (io.ReadCloser, error) { 86 | return &mockFile{bytes.NewReader([]byte("file content"))}, nil 87 | } 88 | 89 | errorMockFileOpener := func(name string) (io.ReadCloser, error) { 90 | return nil, errors.New("file open error") 91 | } 92 | 93 | type args struct { 94 | ctx context.Context 95 | filename string 96 | opts TranscribeOptions 97 | client httpClient 98 | opener func(name string) (io.ReadCloser, error) 99 | } 100 | 101 | tests := []struct { 102 | name string 103 | args args 104 | want string 105 | wantErr bool 106 | }{ 107 | { 108 | name: "successful transcription", 109 | args: args{ 110 | ctx: context.Background(), 111 | filename: testFilePath, 112 | opts: TranscribeOptions{ 113 | WhisperAPIKey: "test-api-key", 114 | WhisperModel: "test-model", 115 | LanguageCode: "en-US", 116 | WhisperPrompt: "test-prompt", 117 | }, 118 | client: successfulMockClient, 119 | }, 120 | want: "this is a test transcription.", 121 | wantErr: false, 122 | }, 123 | { 124 | name: "file open error", 125 | args: args{ 126 | ctx: context.Background(), 127 | filename: "non-existent-file.mp3", 128 | opts: TranscribeOptions{}, 129 | client: successfulMockClient, 130 | }, 131 | want: "", 132 | wantErr: true, 133 | }, 
134 | { 135 | name: "file read error", 136 | args: args{ 137 | ctx: context.Background(), 138 | filename: unreadableFilePath, 139 | opts: TranscribeOptions{}, 140 | client: successfulMockClient, 141 | }, 142 | want: "", 143 | wantErr: true, 144 | }, 145 | { 146 | name: "HTTP client error", 147 | args: args{ 148 | ctx: context.Background(), 149 | filename: testFilePath, 150 | opts: TranscribeOptions{}, 151 | client: errorMockClient, 152 | }, 153 | want: "", 154 | wantErr: true, 155 | }, 156 | { 157 | name: "bad response from API", 158 | args: args{ 159 | ctx: context.Background(), 160 | filename: testFilePath, 161 | opts: TranscribeOptions{}, 162 | client: badResponseMockClient, 163 | }, 164 | want: "", 165 | wantErr: true, 166 | }, 167 | { 168 | name: "invalid JSON response", 169 | args: args{ 170 | ctx: context.Background(), 171 | filename: testFilePath, 172 | opts: TranscribeOptions{}, 173 | client: invalidJSONMockClient, 174 | }, 175 | want: "", 176 | wantErr: true, 177 | }, 178 | { 179 | name: "file open error", 180 | args: args{ 181 | ctx: context.Background(), 182 | filename: "test.mp3", 183 | opts: TranscribeOptions{}, 184 | client: successfulMockClient, 185 | opener: errorMockFileOpener, 186 | }, 187 | want: "", 188 | wantErr: true, 189 | }, 190 | { 191 | name: "HTTP client error", 192 | args: args{ 193 | ctx: context.Background(), 194 | filename: "test.mp3", 195 | opts: TranscribeOptions{}, 196 | client: errorMockClient, 197 | opener: successfulMockFileOpener, 198 | }, 199 | want: "", 200 | wantErr: true, 201 | }, 202 | } 203 | 204 | for _, tt := range tests { 205 | t.Run(tt.name, func(t *testing.T) { 206 | got, err := processWhisper(tt.args.ctx, tt.args.filename, tt.args.opts, tt.args.client, tt.args.opener) 207 | if (err != nil) != tt.wantErr { 208 | t.Errorf("processWhisper() error = %v, wantErr %v", err, tt.wantErr) 209 | return 210 | } 211 | if got != tt.want { 212 | t.Errorf("processWhisper() = %v, want %v", got, tt.want) 213 | } 214 | }) 215 | } 216 
| } 217 | 218 | func Test_writeFields(t *testing.T) { 219 | tests := []struct { 220 | name string 221 | opts TranscribeOptions 222 | wantErr bool 223 | }{ 224 | { 225 | name: "all fields present", 226 | opts: TranscribeOptions{ 227 | WhisperModel: "test-model", 228 | LanguageCode: "en-US", 229 | WhisperPrompt: "test-prompt", 230 | }, 231 | wantErr: false, 232 | }, 233 | } 234 | 235 | for _, tt := range tests { 236 | t.Run(tt.name, func(t *testing.T) { 237 | writer := multipart.NewWriter(&bytes.Buffer{}) 238 | if err := writeFields(writer, tt.opts); (err != nil) != tt.wantErr { 239 | t.Errorf("writeFields() error = %v, wantErr %v", err, tt.wantErr) 240 | } 241 | }) 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /internal/utils/gcs/gcs_utils.go: -------------------------------------------------------------------------------- 1 | package gcs 2 | 3 | import ( 4 | "context" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "os" 9 | "path/filepath" 10 | 11 | speech "cloud.google.com/go/speech/apiv1" 12 | "cloud.google.com/go/speech/apiv1/speechpb" 13 | "cloud.google.com/go/storage" 14 | "github.com/mmatongo/chew/v1/internal/common" 15 | "google.golang.org/api/option" 16 | ) 17 | 18 | func UploadToGCS(ctx context.Context, client *storage.Client, bucket, filename string) (string, error) { 19 | f, err := os.Open(filename) 20 | if err != nil { 21 | return "", fmt.Errorf("failed to open file: %w", err) 22 | } 23 | defer func() { 24 | if cerr := f.Close(); cerr != nil { 25 | err = errors.Join(err, fmt.Errorf("failed to close file: %w", cerr)) 26 | } 27 | }() 28 | 29 | objectName := filepath.Base(filename) 30 | w := client.Bucket(bucket).Object(objectName).NewWriter(ctx) 31 | if _, err = io.Copy(w, f); err != nil { 32 | return "", fmt.Errorf("failed to copy file to GCS: %w", err) 33 | } 34 | if err := w.Close(); err != nil { 35 | return "", fmt.Errorf("failed to close GCS writer: %w", err) 36 | } 37 | 38 | return 
fmt.Sprintf("gs://%s/%s", bucket, objectName), nil 39 | } 40 | 41 | func DeleteFromGCS(ctx context.Context, client *storage.Client, bucket, objectName string) error { 42 | if err := client.Bucket(bucket).Object(objectName).Delete(ctx); err != nil { 43 | return fmt.Errorf("failed to delete object from GCS: %w", err) 44 | } 45 | return nil 46 | } 47 | 48 | func NewStorageClient(ctx context.Context, opts common.TranscribeOptions) (*storage.Client, error) { 49 | var clientOpts []option.ClientOption 50 | if opts.CredentialsJSON != nil { 51 | clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON)) 52 | } 53 | return storage.NewClient(ctx, clientOpts...) 54 | } 55 | 56 | func NewSpeechClient(ctx context.Context, opts common.TranscribeOptions) (*speech.Client, error) { 57 | var clientOpts []option.ClientOption 58 | if opts.CredentialsJSON != nil { 59 | clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON)) 60 | } 61 | return speech.NewClient(ctx, clientOpts...) 
62 | } 63 | 64 | func NewRecognitionRequest(opts common.TranscribeOptions, audioInfo *speechpb.RecognitionConfig, gcsURI string) *speechpb.LongRunningRecognizeRequest { 65 | diarizationConfig := &speechpb.SpeakerDiarizationConfig{ 66 | EnableSpeakerDiarization: opts.EnableDiarization, 67 | MinSpeakerCount: int32(opts.MinSpeakers), 68 | MaxSpeakerCount: int32(opts.MaxSpeakers), 69 | } 70 | 71 | return &speechpb.LongRunningRecognizeRequest{ 72 | Config: &speechpb.RecognitionConfig{ 73 | Encoding: audioInfo.Encoding, 74 | SampleRateHertz: audioInfo.SampleRateHertz, 75 | AudioChannelCount: audioInfo.AudioChannelCount, 76 | LanguageCode: opts.LanguageCode, 77 | EnableAutomaticPunctuation: true, 78 | UseEnhanced: true, 79 | EnableWordConfidence: true, 80 | Model: "latest_long", 81 | DiarizationConfig: diarizationConfig, 82 | }, 83 | Audio: &speechpb.RecognitionAudio{ 84 | AudioSource: &speechpb.RecognitionAudio_Uri{ 85 | Uri: gcsURI, 86 | }, 87 | }, 88 | } 89 | } 90 | 91 | func ExtractTranscript(resp *speechpb.LongRunningRecognizeResponse) string { 92 | var transcript string 93 | for _, result := range resp.Results { 94 | for _, alt := range result.Alternatives { 95 | transcript += alt.Transcript 96 | } 97 | } 98 | return transcript 99 | } 100 | -------------------------------------------------------------------------------- /internal/utils/gcs/gcs_utils_test.go: -------------------------------------------------------------------------------- 1 | package gcs 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net/http" 7 | "net/http/httptest" 8 | "os" 9 | "path/filepath" 10 | "reflect" 11 | "strings" 12 | "testing" 13 | 14 | speech "cloud.google.com/go/speech/apiv1" 15 | "cloud.google.com/go/speech/apiv1/speechpb" 16 | "cloud.google.com/go/storage" 17 | "github.com/mmatongo/chew/v1/internal/common" 18 | "google.golang.org/api/option" 19 | ) 20 | 21 | func Test_extractTranscript(t *testing.T) { 22 | type args struct { 23 | resp *speechpb.LongRunningRecognizeResponse 24 | } 25 | 
tests := []struct { 26 | name string 27 | args args 28 | want string 29 | }{ 30 | { 31 | name: "empty response", 32 | args: args{ 33 | resp: &speechpb.LongRunningRecognizeResponse{}, 34 | }, 35 | want: "", 36 | }, 37 | { 38 | name: "response with no results", 39 | args: args{ 40 | resp: &speechpb.LongRunningRecognizeResponse{ 41 | Results: []*speechpb.SpeechRecognitionResult{}, 42 | }, 43 | }, 44 | want: "", 45 | }, 46 | { 47 | name: "response with no alternatives", 48 | args: args{ 49 | resp: &speechpb.LongRunningRecognizeResponse{ 50 | Results: []*speechpb.SpeechRecognitionResult{ 51 | {}, 52 | }, 53 | }, 54 | }, 55 | }, 56 | { 57 | name: "response with result and alternative", 58 | args: args{ 59 | resp: &speechpb.LongRunningRecognizeResponse{ 60 | Results: []*speechpb.SpeechRecognitionResult{ 61 | { 62 | Alternatives: []*speechpb.SpeechRecognitionAlternative{ 63 | { 64 | Transcript: "hello world", 65 | }, 66 | }, 67 | }, 68 | }, 69 | }, 70 | }, 71 | want: "hello world", 72 | }, 73 | { 74 | name: "response with multiple results and alternatives", 75 | args: args{ 76 | resp: &speechpb.LongRunningRecognizeResponse{ 77 | Results: []*speechpb.SpeechRecognitionResult{ 78 | { 79 | Alternatives: []*speechpb.SpeechRecognitionAlternative{ 80 | { 81 | Transcript: "hello world", 82 | Confidence: 0.9, 83 | }, 84 | { 85 | Transcript: "hello world", 86 | Confidence: 0.8, 87 | }, 88 | }, 89 | }, 90 | { 91 | Alternatives: []*speechpb.SpeechRecognitionAlternative{ 92 | { 93 | Transcript: "hello world", 94 | Confidence: 0.7, 95 | }, 96 | { 97 | Transcript: "hello world", 98 | Confidence: 0.6, 99 | }, 100 | }, 101 | }, 102 | }, 103 | }, 104 | }, 105 | want: "hello worldhello worldhello worldhello world", 106 | }, 107 | } 108 | for _, tt := range tests { 109 | t.Run(tt.name, func(t *testing.T) { 110 | if got := ExtractTranscript(tt.args.resp); got != tt.want { 111 | t.Errorf("extractTranscript() = %v, want %v", got, tt.want) 112 | } 113 | }) 114 | } 115 | } 116 | 117 | func 
Test_newRecognitionRequest(t *testing.T) { 118 | type args struct { 119 | opts common.TranscribeOptions 120 | audioInfo *speechpb.RecognitionConfig 121 | gcsURI string 122 | } 123 | tests := []struct { 124 | name string 125 | args args 126 | want *speechpb.LongRunningRecognizeRequest 127 | }{ 128 | { 129 | name: "create recognition request", 130 | args: args{ 131 | opts: common.TranscribeOptions{ 132 | EnableDiarization: true, 133 | MinSpeakers: 1, 134 | MaxSpeakers: 2, 135 | LanguageCode: "en-US", 136 | }, 137 | audioInfo: &speechpb.RecognitionConfig{ 138 | Encoding: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED, 139 | SampleRateHertz: 44100, 140 | AudioChannelCount: 2, 141 | }, 142 | gcsURI: "gs://bucket/object", 143 | }, 144 | want: &speechpb.LongRunningRecognizeRequest{ 145 | Config: &speechpb.RecognitionConfig{ 146 | Encoding: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED, 147 | SampleRateHertz: 44100, 148 | AudioChannelCount: 2, 149 | LanguageCode: "en-US", 150 | EnableAutomaticPunctuation: true, 151 | UseEnhanced: true, 152 | EnableWordConfidence: true, 153 | Model: "latest_long", 154 | DiarizationConfig: &speechpb.SpeakerDiarizationConfig{ 155 | EnableSpeakerDiarization: true, 156 | MinSpeakerCount: 1, 157 | MaxSpeakerCount: 2, 158 | }, 159 | }, 160 | Audio: &speechpb.RecognitionAudio{ 161 | AudioSource: &speechpb.RecognitionAudio_Uri{ 162 | Uri: "gs://bucket/object", 163 | }, 164 | }, 165 | }, 166 | }, 167 | } 168 | for _, tt := range tests { 169 | t.Run(tt.name, func(t *testing.T) { 170 | if got := NewRecognitionRequest(tt.args.opts, tt.args.audioInfo, tt.args.gcsURI); !reflect.DeepEqual(got, tt.want) { 171 | t.Errorf("createRecognitionRequest() = %v, want %v", got, tt.want) 172 | } 173 | }) 174 | } 175 | } 176 | 177 | /* 178 | All of the following tests are expected to fail because the credentials JSON is empty 179 | and the functions are not written in a way that allows for mocking of the GCP client libraries. 
180 | This is a limitation of the current implementation and should be refactored in the future. 181 | */ 182 | 183 | func Test_newSpeechClient(t *testing.T) { 184 | type args struct { 185 | ctx context.Context 186 | opts common.TranscribeOptions 187 | } 188 | tests := []struct { 189 | name string 190 | args args 191 | want *speech.Client 192 | wantErr bool 193 | }{ 194 | { 195 | name: "create speech client", 196 | args: args{ 197 | ctx: context.Background(), 198 | opts: common.TranscribeOptions{ 199 | CredentialsJSON: nil, 200 | }, 201 | }, 202 | want: nil, 203 | wantErr: true, 204 | }, 205 | { 206 | /* 207 | This test case is expected to fail because the credentials JSON is empty. 208 | 209 | TODO: Refactor to allow for mocking of the speech.NewClient function. 210 | */ 211 | name: "create speech client with credentials", 212 | args: args{ 213 | ctx: context.Background(), 214 | opts: common.TranscribeOptions{ 215 | CredentialsJSON: []byte(""), 216 | }, 217 | }, 218 | want: nil, 219 | wantErr: true, 220 | }, 221 | } 222 | for _, tt := range tests { 223 | t.Run(tt.name, func(t *testing.T) { 224 | got, err := NewSpeechClient(tt.args.ctx, tt.args.opts) 225 | if (err != nil) != tt.wantErr { 226 | t.Errorf("createSpeechClient() error = %v, wantErr %v", err, tt.wantErr) 227 | return 228 | } 229 | if !reflect.DeepEqual(got, tt.want) { 230 | t.Errorf("createSpeechClient() = %v, want %v", got, tt.want) 231 | } 232 | }) 233 | } 234 | } 235 | 236 | func Test_createStorageClient(t *testing.T) { 237 | type args struct { 238 | ctx context.Context 239 | opts common.TranscribeOptions 240 | } 241 | tests := []struct { 242 | name string 243 | args args 244 | want *storage.Client 245 | wantErr bool 246 | }{ 247 | { 248 | name: "create storage client", 249 | args: args{ 250 | ctx: context.Background(), 251 | opts: common.TranscribeOptions{ 252 | CredentialsJSON: nil, 253 | }, 254 | }, 255 | want: nil, 256 | wantErr: true, 257 | }, 258 | { 259 | /* 260 | This test case is expected to 
fail because the credentials JSON is empty. 261 | This does not affect the functionality of the createStorageClient function. 262 | 263 | TODO: Refactor to allow for mocking of the storage.NewClient function. 264 | */ 265 | 266 | name: "create storage client with credentials", 267 | args: args{ 268 | ctx: context.Background(), 269 | opts: common.TranscribeOptions{ 270 | CredentialsJSON: []byte(""), 271 | }, 272 | }, 273 | want: nil, 274 | wantErr: true, 275 | }, 276 | } 277 | for _, tt := range tests { 278 | t.Run(tt.name, func(t *testing.T) { 279 | got, err := NewStorageClient(tt.args.ctx, tt.args.opts) 280 | if (err != nil) != tt.wantErr { 281 | t.Errorf("createStorageClient() error = %v, wantErr %v", err, tt.wantErr) 282 | return 283 | } 284 | if !reflect.DeepEqual(got, tt.want) { 285 | t.Errorf("createStorageClient() = %v, want %v", got, tt.want) 286 | } 287 | }) 288 | } 289 | } 290 | 291 | func Test_uploadToGCS(t *testing.T) { 292 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 293 | if r.Method == "POST" && strings.Contains(r.URL.Path, "/upload/storage/v1/b/") { 294 | w.WriteHeader(http.StatusOK) 295 | fmt.Fprintf(w, `{"name": "uploaded-object"}`) 296 | } else { 297 | http.Error(w, "Unexpected request", http.StatusBadRequest) 298 | } 299 | })) 300 | defer server.Close() 301 | 302 | client, err := storage.NewClient(context.Background(), option.WithEndpoint(server.URL), option.WithHTTPClient(server.Client())) 303 | if err != nil { 304 | t.Fatalf("failed to create test client: %v", err) 305 | } 306 | defer client.Close() 307 | 308 | tempFile, err := os.CreateTemp("", "test-file-*.txt") 309 | if err != nil { 310 | t.Fatalf("failed to create temp file: %v", err) 311 | } 312 | defer os.Remove(tempFile.Name()) 313 | 314 | content := []byte("test content") 315 | if _, err := tempFile.Write(content); err != nil { 316 | t.Fatalf("failed to write to temp file: %v", err) 317 | } 318 | tempFile.Close() 319 | 320 | tests := 
[]struct { 321 | name string 322 | bucket string 323 | filename string 324 | want string 325 | wantErr bool 326 | }{ 327 | { 328 | name: "successful upload", 329 | bucket: "test-bucket", 330 | filename: tempFile.Name(), 331 | want: fmt.Sprintf("gs://test-bucket/%s", filepath.Base(tempFile.Name())), 332 | wantErr: false, 333 | }, 334 | { 335 | name: "non-existent file", 336 | bucket: "test-bucket", 337 | filename: "file.txt", 338 | want: "", 339 | wantErr: true, 340 | }, 341 | { 342 | name: "empty filename", 343 | bucket: "test-bucket", 344 | filename: "", 345 | want: "", 346 | wantErr: true, 347 | }, 348 | } 349 | 350 | for _, tt := range tests { 351 | t.Run(tt.name, func(t *testing.T) { 352 | got, err := UploadToGCS(context.Background(), client, tt.bucket, tt.filename) 353 | if (err != nil) != tt.wantErr { 354 | t.Errorf("uploadToGCS() error = %v, wantErr %v", err, tt.wantErr) 355 | return 356 | } 357 | if got != tt.want { 358 | t.Errorf("uploadToGCS() = %v, want %v", got, tt.want) 359 | } 360 | }) 361 | } 362 | } 363 | 364 | func Test_deleteFromGCS(t *testing.T) { 365 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 366 | if r.Method == "DELETE" && r.URL.Path == "/b/test-bucket/o/test-object.txt" { 367 | w.WriteHeader(http.StatusOK) 368 | } else { 369 | http.Error(w, "Unexpected request", http.StatusBadRequest) 370 | } 371 | })) 372 | defer server.Close() 373 | 374 | client, err := storage.NewClient(context.Background(), 375 | option.WithEndpoint(server.URL), 376 | option.WithHTTPClient(server.Client()), 377 | option.WithoutAuthentication()) 378 | if err != nil { 379 | t.Fatalf("failed to create test client: %v", err) 380 | } 381 | defer client.Close() 382 | 383 | tests := []struct { 384 | name string 385 | bucket string 386 | objectName string 387 | wantErr bool 388 | }{ 389 | { 390 | name: "successful delete", 391 | bucket: "test-bucket", 392 | objectName: "test-object.txt", 393 | wantErr: false, 394 | }, 395 | { 
// GetFileExtension returns the extension, including the leading dot, of the
// file named by rawURL. rawURL may be a remote URL, a file:// URL or a plain
// local path.
//
// When rawURL has a scheme only its path component is inspected, so query
// strings and fragments never end up in the result. An error is returned when
// rawURL cannot be parsed or carries no extension.
func GetFileExtension(rawURL string) (string, error) {
	u, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("invalid URL or file path: %w", err)
	}

	// Scheme-less input is a local path and is used verbatim; every other
	// form (file://, https://, ...) contributes only its path component.
	pathToCheck := u.Path
	if u.Scheme == "" {
		pathToCheck = rawURL
	}

	ext := filepath.Ext(pathToCheck)
	if ext == "" {
		return "", fmt.Errorf("no file extension found in %q", rawURL)
	}

	return ext, nil
}

// GetFileContentType returns the MIME type registered for the extension of
// file's name. It returns "" when file is nil or the extension is unknown.
// The file contents are never read.
func GetFileContentType(file *os.File) string {
	if file == nil {
		return ""
	}
	return mime.TypeByExtension(filepath.Ext(file.Name()))
}

// ExtractTextFromXML reads the XML document stored in the zip entry file and
// returns the whitespace-trimmed text of every element whose local name is
// "p", in document order. Paragraphs that are empty after trimming are
// skipped.
//
// An error is returned when file is nil, cannot be opened, or contains
// malformed XML.
func ExtractTextFromXML(file *zip.File) ([]string, error) {
	if file == nil {
		return nil, fmt.Errorf("zip file is nil")
	}

	fileReader, err := file.Open()
	if err != nil {
		return nil, err
	}
	defer fileReader.Close()

	decoder := xml.NewDecoder(fileReader)
	var contents []string
	var currentParagraph strings.Builder
	inParagraph := false

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, err
		}

		switch element := token.(type) {
		case xml.StartElement:
			// Only the local name is compared, so namespace prefixes on the
			// paragraph element do not matter.
			if element.Name.Local == "p" {
				inParagraph = true
				currentParagraph.Reset()
			}
		case xml.EndElement:
			if element.Name.Local == "p" {
				inParagraph = false
				if trimmed := strings.TrimSpace(currentParagraph.String()); trimmed != "" {
					contents = append(contents, trimmed)
				}
			}
		case xml.CharData:
			// Character data outside a paragraph is ignored.
			if inParagraph {
				currentParagraph.Write(element)
			}
		}
	}

	return contents, nil
}

// markdownRule pairs a compiled pattern with the replacement template applied
// to each of its matches.
type markdownRule struct {
	re          *regexp.Regexp
	replacement string
}

// newMarkdownRule compiles pattern in multi-line mode, so ^ and $ match at
// line boundaries.
func newMarkdownRule(pattern, replacement string) markdownRule {
	return markdownRule{
		re:          regexp.MustCompile("(?m)" + pattern),
		replacement: replacement,
	}
}

// markdownRules lists the rewrites RemoveMarkdownSyntax applies, in order.
// They are compiled once at package initialisation instead of on every call.
var markdownRules = []markdownRule{
	newMarkdownRule("(```[\\s\\S]*?```)", "$1"),                       // Code blocks (kept; backticks are stripped later)
	newMarkdownRule("(`[^`\n]+`)", "$1"),                              // Inline code (kept; backticks are stripped later)
	newMarkdownRule("!\\[([^\\]]*?)\\]\\(([^)]+)\\)", "$1 ($2)"),      // Images -> "alt (url)"
	newMarkdownRule("\\[([^\\]]+)\\]\\(([^)]+)\\)", "$1 ($2)"),        // Links -> "text (url)"
	newMarkdownRule("(__|\\*\\*|_|\\*)(.+?)(__|\\*\\*|_|\\*)", "$2"), // Bold and Italic
	newMarkdownRule("~~(.+?)~~", "$1"),                                // Strikethrough
	newMarkdownRule("^#{1,6}\\s(.*)$", "$1"),                          // Headers
	newMarkdownRule("^>\\s(.*)$", "$1"),                               // Blockquotes
	newMarkdownRule("^-{3,}$", ""),                                    // Horizontal rules are dropped
	newMarkdownRule("^\\s*[\\*\\-+]\\s+(.+)$", "$1"),                  // Unordered lists
	newMarkdownRule("^\\s*\\d+\\.\\s+(.+)$", "$1"),                    // Ordered lists
}

// markdownResidue deletes the Markdown punctuation left over once all
// markdownRules have been applied.
var markdownResidue = strings.NewReplacer(
	"*", "",
	"_", "",
	"`", "",
	"#", "",
	">", "",
	"+", "",
	"-", "",
)

// RemoveMarkdownSyntax removes Markdown syntax from text and returns the
// result with surrounding whitespace trimmed.
//
// Links and images are rewritten to "label (target)"; emphasis,
// strikethrough, header, blockquote and list markers are removed while their
// text is kept; horizontal rules are dropped. Finally every remaining
// '*', '_', '`', '#', '>', '+' and '-' character is deleted, including those
// inside ordinary words and URLs.
//
// Inspired by https://github.com/mmatongo/site/blob/master/cmd/dnlm/helpers.go#L62-L87
func RemoveMarkdownSyntax(text string) string {
	for _, rule := range markdownRules {
		text = rule.re.ReplaceAllString(text, rule.replacement)
	}

	// Remove any remaining Markdown characters
	text = markdownResidue.Replace(text)

	return strings.TrimSpace(text)
}

// OpenFile opens filePath for reading. A leading "file://" prefix is removed
// first, so both plain paths and file:// URLs are accepted. The caller is
// responsible for closing the returned file.
func OpenFile(filePath string) (*os.File, error) {
	filePath = strings.TrimPrefix(filePath, "file://")
	return os.Open(filePath)
}
err != nil { 34 | panic(err) 35 | } 36 | 37 | r, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len())) 38 | if err != nil { 39 | panic(err) 40 | } 41 | 42 | return r.File[0] 43 | } 44 | 45 | func TestRemoveMarkdownSyntax(t *testing.T) { 46 | type args struct { 47 | text string 48 | } 49 | tests := []struct { 50 | name string 51 | args args 52 | want string 53 | }{ 54 | { 55 | name: "Test 1", 56 | args: args{ 57 | text: "This is a **bold** text", 58 | }, 59 | want: "This is a bold text", 60 | }, 61 | { 62 | name: "Test 2", 63 | args: args{ 64 | text: "This is a *italic* text", 65 | }, 66 | want: "This is a italic text", 67 | }, 68 | { 69 | name: "Test 3", 70 | args: args{ 71 | text: "This is a [link](https://example.com) text", 72 | }, 73 | want: "This is a link (https://example.com) text", 74 | }, 75 | { 76 | name: "Test 4", 77 | args: args{ 78 | text: "This is a ![image](https://example.com/image.png) text", 79 | }, 80 | want: "This is a image (https://example.com/image.png) text", 81 | }, 82 | } 83 | for _, tt := range tests { 84 | t.Run(tt.name, func(t *testing.T) { 85 | if got := RemoveMarkdownSyntax(tt.args.text); got != tt.want { 86 | t.Errorf("RemoveMarkdownSyntax() = %v, want %v", got, tt.want) 87 | } 88 | }) 89 | } 90 | } 91 | 92 | func TestGetFileExtension(t *testing.T) { 93 | type args struct { 94 | rawUrl string 95 | } 96 | tests := []struct { 97 | name string 98 | args args 99 | want string 100 | wantErr bool 101 | }{ 102 | { 103 | name: "Test 1", 104 | args: args{ 105 | rawUrl: "https://example.com/test.csv", 106 | }, 107 | want: ".csv", 108 | wantErr: false, 109 | }, 110 | { 111 | name: "Test 2", 112 | args: args{ 113 | rawUrl: "", 114 | }, 115 | want: "", 116 | wantErr: true, 117 | }, 118 | { 119 | name: "Test 3", 120 | args: args{ 121 | rawUrl: "https://example.com/test", 122 | }, 123 | want: "", 124 | wantErr: true, 125 | }, 126 | { 127 | name: "Test 4", 128 | args: args{ 129 | rawUrl: "file:///test.csv", 130 | }, 131 | want: 
".csv", 132 | wantErr: false, 133 | }, 134 | { 135 | name: "Test 5", 136 | args: args{ 137 | rawUrl: "file:///test", 138 | }, 139 | want: "", 140 | wantErr: true, 141 | }, 142 | { 143 | name: "Test 6", 144 | args: args{ 145 | rawUrl: string([]byte{0x01, 0x02, 0x03, 0x04, 0x05}), 146 | }, 147 | want: "", 148 | wantErr: true, 149 | }, 150 | } 151 | for _, tt := range tests { 152 | t.Run(tt.name, func(t *testing.T) { 153 | got, err := GetFileExtension(tt.args.rawUrl) 154 | if (err != nil) != tt.wantErr { 155 | t.Errorf("GetFileExtensionFromUrl() error = %v, wantErr %v", err, tt.wantErr) 156 | return 157 | } 158 | if got != tt.want { 159 | t.Errorf("GetFileExtensionFromUrl() = %v, want %v", got, tt.want) 160 | } 161 | }) 162 | } 163 | } 164 | 165 | func TestExtractTextFromXML(t *testing.T) { 166 | type args struct { 167 | file *zip.File 168 | } 169 | tests := []struct { 170 | name string 171 | args args 172 | want []string 173 | wantErr bool 174 | }{ 175 | { 176 | name: "valid XML with paragraphs", 177 | args: args{ 178 | file: createMockZipFile(` 179 | 180 | 181 |

First paragraph

182 |

Second paragraph

183 |

Third paragraph

184 |
185 | `), 186 | }, 187 | want: []string{"First paragraph", "Second paragraph", "Third paragraph"}, 188 | wantErr: false, 189 | }, 190 | { 191 | name: "XML with empty paragraphs", 192 | args: args{ 193 | file: createMockZipFile(` 194 | 195 | 196 |

First paragraph

197 |

198 |

Third paragraph

199 |
200 | `), 201 | }, 202 | want: []string{"First paragraph", "Third paragraph"}, 203 | wantErr: false, 204 | }, 205 | { 206 | name: "invalid XML", 207 | args: args{ 208 | file: createMockZipFile(` 209 | 210 | 211 |

Unclosed paragraph 212 | 213 | `), 214 | }, 215 | want: nil, 216 | wantErr: true, 217 | }, 218 | } 219 | for _, tt := range tests { 220 | t.Run(tt.name, func(t *testing.T) { 221 | got, err := ExtractTextFromXML(tt.args.file) 222 | if (err != nil) != tt.wantErr { 223 | t.Errorf("ExtractTextFromXML() error = %v, wantErr %v", err, tt.wantErr) 224 | return 225 | } 226 | if !reflect.DeepEqual(got, tt.want) { 227 | t.Errorf("ExtractTextFromXML() = %v, want %v", got, tt.want) 228 | } 229 | }) 230 | } 231 | } 232 | 233 | func TestOpenFile(t *testing.T) { 234 | type args struct { 235 | filePath string 236 | } 237 | tests := []struct { 238 | name string 239 | args args 240 | want *os.File 241 | wantErr bool 242 | }{ 243 | { 244 | name: "valid file", 245 | args: args{ 246 | filePath: "testdata/test.pdf", 247 | }, 248 | want: nil, 249 | wantErr: true, 250 | }, 251 | } 252 | for _, tt := range tests { 253 | t.Run(tt.name, func(t *testing.T) { 254 | got, err := OpenFile(tt.args.filePath) 255 | if (err != nil) != tt.wantErr { 256 | t.Errorf("OpenFile() error = %v, wantErr %v", err, tt.wantErr) 257 | return 258 | } 259 | if !reflect.DeepEqual(got, tt.want) { 260 | t.Errorf("OpenFile() = %v, want %v", got, tt.want) 261 | } 262 | }) 263 | } 264 | } 265 | 266 | func TestGetFileContentType(t *testing.T) { 267 | tempDir := t.TempDir() 268 | testHTMLPath := filepath.Join(tempDir, "test.html") 269 | 270 | err := os.WriteFile(testHTMLPath, []byte("html content"), 0644) 271 | if err != nil { 272 | t.Fatalf("failed to create test html file: %v", err) 273 | } 274 | 275 | filepath, _ := OpenFile(testHTMLPath) 276 | type args struct { 277 | file *os.File 278 | } 279 | tests := []struct { 280 | name string 281 | args args 282 | want string 283 | }{ 284 | { 285 | name: "Test 1", 286 | args: args{ 287 | file: filepath, 288 | }, 289 | want: "text/html; charset=utf-8", 290 | }, 291 | } 292 | for _, tt := range tests { 293 | t.Run(tt.name, func(t *testing.T) { 294 | if got := 
GetFileContentType(tt.args.file); got != tt.want { 295 | t.Errorf("GetFileContentType() = %v, want %v", got, tt.want) 296 | } 297 | }) 298 | } 299 | } 300 | -------------------------------------------------------------------------------- /testdata/audio/test.flac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.flac -------------------------------------------------------------------------------- /testdata/audio/test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.mp3 -------------------------------------------------------------------------------- /testdata/audio/test.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.ogg -------------------------------------------------------------------------------- /testdata/audio/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.wav -------------------------------------------------------------------------------- /testdata/files/test.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.epub -------------------------------------------------------------------------------- /testdata/files/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.pdf 
--------------------------------------------------------------------------------