├── .github
└── workflows
│ ├── ci.yml
│ └── go.yml
├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── assets
└── gopher-eating.svg
├── chew.go
├── chew_test.go
├── cmd
└── chew
│ └── wrapper.go
├── codecov.yml
├── docs
├── golang.md
├── python.md
├── ruby.md
└── setup.md
├── examples
├── main.go
├── main.py
├── main.rb
└── transcription
│ ├── google.go
│ └── whisper.go
├── go.mod
├── go.sum
├── internal
├── audio
│ ├── flac.go
│ ├── flac_test.go
│ ├── mp3.go
│ ├── mp3_test.go
│ ├── processor.go
│ ├── processor_test.go
│ ├── types.go
│ ├── wav.go
│ └── wav_test.go
├── common
│ └── types.go
├── document
│ ├── docx.go
│ ├── docx_test.go
│ ├── epub.go
│ ├── epub_test.go
│ ├── pdf.go
│ ├── pdf_test.go
│ ├── pptx.go
│ └── pptx_test.go
├── text
│ ├── csv.go
│ ├── csv_test.go
│ ├── html.go
│ ├── html_test.go
│ ├── json.go
│ ├── json_test.go
│ ├── markdown.go
│ ├── plaintext.go
│ ├── plaintext_test.go
│ ├── xml.go
│ ├── xml_test.go
│ ├── yaml.go
│ └── yaml_test.go
├── transcribe
│ ├── google_transcriber.go
│ ├── google_transcriber_test.go
│ ├── transcribe.go
│ ├── transcribe_test.go
│ ├── types.go
│ ├── whisper.go
│ └── whisper_test.go
└── utils
│ ├── gcs
│ ├── gcs_utils.go
│ └── gcs_utils_test.go
│ ├── utils.go
│ └── utils_test.go
└── testdata
├── audio
├── test.flac
├── test.mp3
├── test.ogg
└── test.wav
└── files
├── test.epub
└── test.pdf
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Workflow for Codecov
2 | on: [push, pull_request]
3 | jobs:
4 | run:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - name: Set up Go
8 | uses: actions/setup-go@v5
9 | with:
10 | go-version: '1.23'
11 | id: go
12 |
13 | - name: Check out code into the Go module directory
14 | uses: actions/checkout@v4
15 |
16 | - name: Get dependencies
17 | run: |
18 | go get -v -t -d ./...
19 | if [ -f Gopkg.toml ]; then
20 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
21 | dep ensure
22 | fi
23 |
24 | - name: Generate coverage report
25 | run: |
26 | go test `go list ./... | grep -v -E 'docs|cmd|examples'` -coverprofile=coverage.txt -covermode=atomic
27 |
28 | - name: Upload coverage to Codecov
29 | uses: codecov/codecov-action@v4
30 | with:
31 | verbose: true
32 | env:
33 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
34 |
--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on:
4 | push:
5 | branches: [ "master" ]
6 | pull_request:
7 | branches: [ "master" ]
8 |
9 | jobs:
10 |
11 | test:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v4
15 |
16 | - name: Set up Go
17 | uses: actions/setup-go@v4
18 | with:
19 | go-version: '1.22'
20 |
21 | - name: Test
22 | run: go test -v -race ./... -cover -covermode=atomic
23 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | audio
3 | *.json
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Daniel M. Matongo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

7 |
8 | [](https://goreportcard.com/report/github.com/mmatongo/chew)
9 | [](https://pkg.go.dev/github.com/mmatongo/chew)
10 | [](https://codeclimate.com/github/mmatongo/chew/maintainability)
11 | [](https://codecov.io/github/mmatongo/chew)
12 | [](./LICENSE)
13 |
14 |
15 | > A Go library for processing various content types into markdown/plaintext.
16 |
17 | ## About
18 |
19 | *Chew* is a Go library that processes various content types into markdown or plaintext. It supports multiple content types, including HTML, PDF, CSV, JSON, YAML, DOCX, PPTX, Markdown, Plaintext, MP3, FLAC, and WAVE.
20 |
21 | ## Installation
22 |
23 | ```bash
24 | go get github.com/mmatongo/chew/v1
25 | ```
26 |
27 | ## Usage
28 |
29 | Here's a basic example of how to use Chew:
30 |
31 | ```go
32 | package main
33 |
34 | import (
35 | "context"
36 | "fmt"
37 | "log"
38 | "time"
39 |
40 | "github.com/mmatongo/chew/v1"
41 | )
42 |
43 | func main() {
44 | urls := []string{
45 | "https://example.com",
46 | }
47 |
48 | config := chew.Config{
49 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)",
50 | RetryLimit: 3,
51 | RetryDelay: 5 * time.Second,
52 | CrawlDelay: 10 * time.Second,
53 | ProxyList: []string{}, // Add your proxies here, or leave empty
54 | RateLimit: 2 * time.Second,
55 | RateBurst: 3,
56 | IgnoreRobotsTxt: false,
57 | }
58 |
59 | haChew := chew.New(config)
60 |
61 | // The context is optional, but can be used to cancel the operation after a certain time
62 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
63 | defer cancel()
64 |
65 | chunks, err := haChew.Process(ctx, urls)
66 | if err != nil {
67 | if err == context.DeadlineExceeded {
68 | log.Println("Operation timed out")
69 | } else {
70 | log.Printf("Error processing URLs: %v", err)
71 | }
72 | return
73 | }
74 |
75 | for _, chunk := range chunks {
76 | fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
77 | }
78 | }
79 | ```
80 |
81 | Output
82 |
83 | ```bash
84 | Source: https://example.com
85 | Content: Example Domain
86 |
87 | Source: https://example.com
88 | Content: This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.
89 |
90 | Source: https://example.com
91 | Content: More information...
92 | ```
93 |
94 | You can find more examples in the [examples](./examples) directory as well as instructions on how to use Chew with Ruby and Python.
95 |
96 | ## Contributing
97 |
98 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have any suggestions or improvements.
99 |
100 | ## License
101 |
102 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.
103 |
104 | ### Logo
105 |
106 | The [logo](https://github.com/MariaLetta/free-gophers-pack) was made by the amazing [MariaLetta](https://github.com/MariaLetta).
107 |
108 |
109 | ### Similar Projects
110 | [docconv](https://github.com/sajari/docconv)
111 |
112 | ### Roadmap
113 | The roadmap for this project is available [here](./TODO.md). It's meant more as a guide than a strict plan because I only work on this project in my free time.
114 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 |
2 | ### TODO
3 | ---
4 |
5 | - [x] Add tests
6 | - [ ] Improve error handling
7 | - [x] Add support for more content types
8 | - [x] Implement rate limiting for URL fetching
9 | - [x] Use a free PDF processing library
10 | - [x] How to handle text/plain content type
11 | - [x] Add transcription support
12 | - [x] Customisable user agent
13 | - [ ] Allow users to choose what to target in the HTML, i.e. body, title, etc.
14 | - [ ] More examples, documentation and use cases
15 | - [ ] Improve PPTX and DOCX processing (currently using a hacky method I cobbled together from various sources)
16 | - [ ] Use a common interface for all content types
17 |
--------------------------------------------------------------------------------
/chew.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package chew provides a simple way to process URLs and files. It allows you to process a list of URLs
3 | and files, and returns the content of the URLs and files as a list of Chunks. It also provides a way to
4 | transcribe audio files using the Google Cloud Speech-to-Text API or the OpenAI Whisper API.
5 |
6 | The library respects rules defined in robots.txt file and crawl delays, and allows you to set a custom http.Client for making requests.
7 |
8 | Note on Responsible Usage:
9 |
10 | This library is designed for processing data from both local files and web sources. Users should be aware of the following considerations:
11 |
12 | 1. Web Scraping:
13 | - When scraping websites, ensure compliance with the target website's terms of service and robots.txt rules.
14 | - Respect rate limits and crawl delays to avoid overwhelming target servers.
15 | - Be aware that web scraping may be subject to legal restrictions in some jurisdictions.
16 | - While the library will attempt to respect robots.txt rules by default, users are responsible for ensuring
17 | that their usage complies with the target website's terms of service and legal requirements.
18 |
19 | 2. File Processing:
20 | - Exercise caution when processing files from untrusted sources.
21 | - Ensure you have appropriate permissions to access and process the files.
22 | - Be mindful of potential sensitive information in processed files and handle it securely.
23 |
24 | 3. Data Handling:
25 | - Properly secure and manage any data extracted or processed using this library, especially if it contains personal or sensitive information.
26 | - Comply with relevant data protection regulations (e.g., GDPR, CCPA) when handling personal data.
27 |
28 | 4. System Resource Usage:
29 | - Be aware that processing large files or numerous web pages can be resource-intensive. Monitor and manage system resources accordingly.
30 |
31 | 5. Have Fun
32 |
33 | Users of this library are responsible for ensuring their usage complies with applicable laws, regulations, and ethical considerations in their jurisdiction and context of use.
34 | */
35 | package chew
36 |
37 | import (
38 | "context"
39 | "fmt"
40 | "io"
41 | "net/http"
42 | "net/url"
43 | "strings"
44 | "sync"
45 | "time"
46 |
47 | "github.com/mmatongo/chew/v1/internal/common"
48 | "github.com/mmatongo/chew/v1/internal/document"
49 | "github.com/mmatongo/chew/v1/internal/text"
50 | "github.com/mmatongo/chew/v1/internal/transcribe"
51 | "github.com/mmatongo/chew/v1/internal/utils"
52 | "github.com/temoto/robotstxt"
53 | "golang.org/x/time/rate"
54 | )
55 |
// Content types Chew can dispatch on. getProcessor matches each value as a
// substring of the reported content type, so parameters such as
// "; charset=utf-8" do not prevent a match.
const (
	contentTypeHTML     = "text/html"
	contentTypeText     = "text/plain"
	contentTypeXML      = "application/xml"
	contentTypeTextXML  = "text/xml"
	contentTypePDF      = "application/pdf"
	contentTypeCSV      = "text/csv"
	contentTypeJSON     = "application/json"
	contentTypeYAML     = "application/x-yaml"
	contentTypeMarkdown = "text/markdown"
	contentTypeEPUB     = "application/epub+zip"
	contentTypeDocx     = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	contentTypePptx     = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
70 |
// contentTypeProcessors maps a content type to the function that turns a body
// of that type into Chunks. Each processor receives the body and a string
// identifying the source (processURL passes the original URL).
// It is a package-level variable so tests can swap in mock processors.
var contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){
	contentTypeHTML:     text.ProcessHTML,
	contentTypeCSV:      text.ProcessCSV,
	contentTypeJSON:     text.ProcessJSON,
	contentTypeYAML:     text.ProcessYAML,
	contentTypeMarkdown: text.ProcessText,
	contentTypeText:     text.ProcessText,
	contentTypeXML:      text.ProcessXML,
	contentTypeTextXML:  text.ProcessXML,
	contentTypeDocx:     document.ProcessDocx,
	contentTypePptx:     document.ProcessPptx,
	contentTypePDF:      document.ProcessPDF,
	contentTypeEPUB:     document.ProcessEpub,
}
85 |
// Chew holds the configuration and shared state used while processing URLs
// and files. Create one with New. It contains mutexes, so it must be used
// through a pointer and never copied.
type Chew struct {
	config     common.Config // settings supplied to New
	httpClient *http.Client  // client used by processURL; replaceable via SetHTTPClient

	rateLimiter   RateLimiter  // waited on once per URL in Process
	rateLimiterMu sync.RWMutex // guards rateLimiter

	robotsCache map[string]*robotstxt.RobotsData // parsed robots.txt keyed by its URL
	robotsMu    sync.RWMutex                     // guards robotsCache

	lastAccess   map[string]time.Time // last access time keyed by hostname
	lastAccessMu sync.Mutex           // guards lastAccess

	proxyIndex int        // index of the next entry of config.ProxyList to hand out
	proxyMu    sync.Mutex // guards proxyIndex
}
98 |
// RateLimiter is the throttling hook used by Process. Process calls Wait once
// for every URL before touching it and gives up on that URL when Wait returns
// an error. New installs a limiter from golang.org/x/time/rate; use
// SetRateLimiter to substitute another implementation.
type RateLimiter interface {
	Wait(context.Context) error
}
102 |
103 | func (c *Chew) SetRateLimiter(rl RateLimiter) {
104 | c.rateLimiterMu.Lock()
105 | defer c.rateLimiterMu.Unlock()
106 | c.rateLimiter = rl
107 | }
108 |
109 | /*
110 | NewConfig allows you to set the configuration options for URL processing. It takes a Config struct.
111 |
112 | Usage:
113 |
114 | config := chew.Config{
115 | UserAgent: "MyBot/1.0 (+https://example.com/bot)",
116 | RetryLimit: 3,
117 | RetryDelay: 5 * time.Second,
118 | CrawlDelay: 10 * time.Second,
119 | ProxyList: []string{"http://proxy1.com", "http://proxy2.com"},
120 | RateLimit: 2 * time.Second,
121 | RateBurst: 3,
122 | IgnoreRobotsTxt: false,
123 | }
124 |
125 | chew.NewConfig(config)
126 | */
127 | func New(config common.Config) *Chew {
128 | c := &Chew{
129 | config: config,
130 | robotsCache: make(map[string]*robotstxt.RobotsData),
131 | lastAccess: make(map[string]time.Time),
132 | }
133 | c.initHTTPClient()
134 |
135 | limit := rate.Every(config.RateLimit)
136 | c.rateLimiter = rate.NewLimiter(limit, config.RateBurst)
137 |
138 | return c
139 | }
140 |
/*
Transcribe transcribes audio files using either the Google Cloud Speech-to-Text API
or the Whisper API. It handles uploading the audio file to Google Cloud Storage if necessary,
manages the transcription process, and returns the resulting transcript.

It is re-exported from the internal transcribe package.
For detailed usage instructions, see the TranscribeOptions documentation.
*/
var Transcribe = transcribe.Transcribe
149 |
/*
TranscribeOptions contains the options for transcribing an audio file. It lets the caller
specify the Google Cloud credentials, the Google Cloud Storage (GCS) bucket to upload the
audio file to, the language code to use for transcription, whether to enable diarization
(the process of separating and labeling speakers in an audio stream) together with the
minimum and maximum number of speakers, and whether to remove the uploaded audio file
from GCS once transcription is complete.

It also lets the caller choose the Whisper API for transcription instead and, if so,
the API key, model, and prompt to use.

Usage:

	opts := chew.TranscribeOptions{
		CredentialsJSON:   []byte("..."),
		Bucket:            "my-bucket",
		LanguageCode:      "en-US",
		EnableDiarization: true,
		MinSpeakers:       2,
		MaxSpeakers:       4,
		CleanupOnComplete: true,
		UseWhisper:        true, // Only one backend is used; the default is the Google Cloud Speech-to-Text API
		WhisperAPIKey:     "my-whisper-api-key",
		WhisperModel:      "whisper-1",
	}
*/
type TranscribeOptions = transcribe.TranscribeOptions
176 |
/*
Config contains the configuration options for URL processing.

Fields:
  - UserAgent: The user agent string to use for requests (e.g., "MyBot/1.0 (+https://example.com/bot)")
  - RetryLimit: Number of retries to attempt in case of failure (e.g., 3)
  - RetryDelay: Delay between retries (e.g., 5 * time.Second)
  - CrawlDelay: Delay between requests to the same domain (e.g., 10 * time.Second)
  - ProxyList: List of proxy URLs to use for requests (e.g., []string{"http://proxy1.com", "http://proxy2.com"})
  - RateLimit: Minimum interval between requests; New passes it to rate.Every (e.g., 2 * time.Second)
  - RateBurst: Maximum burst size for rate limiting (e.g., 3)
  - IgnoreRobotsTxt: Whether to ignore robots.txt rules (e.g., false)

Usage:

	config := chew.Config{
		UserAgent:       "MyBot/1.0 (+https://example.com/bot)",
		RetryLimit:      3,
		RetryDelay:      5 * time.Second,
		CrawlDelay:      10 * time.Second,
		ProxyList:       []string{"http://proxy1.com", "http://proxy2.com"},
		RateLimit:       2 * time.Second,
		RateBurst:       3,
		IgnoreRobotsTxt: false,
	}
*/
type Config = common.Config
204 |
/*
validExtensions maps file extensions to processors. getProcessor falls back to
it when the reported content type matches nothing in contentTypeProcessors, and
processURL uses it as a last resort for local files. It exists because the
reported content type is often too generic to identify the real format.

NOTE(review): a recognised content type is matched before this table is
consulted, so a .csv file served as text/plain is handled by text.ProcessText,
not text.ProcessCSV. Confirm whether the extension was meant to take precedence.
*/
var validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){
	".md":   text.ProcessText,
	".csv":  text.ProcessCSV,
	".json": text.ProcessJSON,
	".yaml": text.ProcessYAML,
	".html": text.ProcessHTML,
	".epub": document.ProcessEpub,
}
219 |
220 | /*
221 | SetHTTPClient allows you to set a custom http.Client to use for making requests.
222 |
223 | This would be useful in the event custom logging, tracing, or other functionality is
224 | required for the requests made by the library.
225 |
226 | Usage:
227 |
228 | client := &http.Client{
229 | Transport: loggingRoundTripper{wrapped: http.DefaultTransport},
230 | }
231 |
232 | chew.SetHTTPClient(client)
233 | */
234 |
235 | func (c *Chew) SetHTTPClient(client *http.Client) {
236 | c.httpClient = client
237 | }
238 |
239 | func (c *Chew) initHTTPClient() {
240 | transport := &http.Transport{
241 | Proxy: c.getProxy,
242 | }
243 | c.httpClient = &http.Client{
244 | Timeout: 30 * time.Second,
245 | Transport: transport,
246 | }
247 | }
248 |
249 | /*
250 | For content types that can also return text/plain as their content types we need to manually check
251 | their extension to properly process them. I feel like this could be done better but this is my solution for now.
252 | */
253 | func getProcessor(contentType, url string) (func(io.Reader, string) ([]common.Chunk, error), error) {
254 | for key, proc := range contentTypeProcessors {
255 | if strings.Contains(contentType, key) {
256 | return proc, nil
257 | }
258 | }
259 |
260 | ext, err := utils.GetFileExtension(url)
261 | if err != nil {
262 | return nil, fmt.Errorf("couldn't get file extension from url %s: %s", url, err)
263 | }
264 |
265 | if proc, ok := validExtensions[ext]; ok {
266 | return proc, nil
267 | }
268 |
269 | return nil, fmt.Errorf("unsupported content type: %s", contentType)
270 | }
271 |
272 | /*
273 | Process takes a list of URLs and returns a list of Chunks
274 |
275 | The slice of strings to be processed can be URLs or file paths
276 | The context is optional and can be used to cancel the processing
277 | of the URLs after a certain amount of time
278 |
279 | This function is safe for concurrent use.
280 |
281 | Usage:
282 |
283 | chunks, err := chew.Process([]string{"https://example.com", "file://path/to/file.txt"})
284 | if err != nil {
285 | log.Fatalf("Error processing URLs: %v", err)
286 | }
287 |
288 | for _, chunk := range chunks {
289 | log.Printf("Chunk: %s\n Source: %s\n", chunk.Content, chunk.Source)
290 | }
291 | */
292 | func (c *Chew) Process(ctx context.Context, urls []string) ([]common.Chunk, error) {
293 | var (
294 | result []common.Chunk
295 | mu sync.Mutex
296 | errCh = make(chan error, len(urls))
297 | resCh = make(chan []common.Chunk, len(urls))
298 | )
299 |
300 | for _, url := range urls {
301 | go func(url string) {
302 | select {
303 | case <-ctx.Done():
304 | errCh <- ctx.Err()
305 | return
306 | default:
307 | c.rateLimiterMu.RLock()
308 | rateLimiter := c.rateLimiter
309 | c.rateLimiterMu.RUnlock()
310 |
311 | if err := rateLimiter.Wait(ctx); err != nil {
312 | errCh <- fmt.Errorf("rate limit exceeded for %s: %w", url, err)
313 | return
314 | }
315 |
316 | if !c.config.IgnoreRobotsTxt {
317 | allowed, crawlDelay, err := c.getRobotsTxtInfo(url)
318 | if err != nil {
319 | errCh <- fmt.Errorf("checking robots.txt for %s: %w", url, err)
320 | return
321 | }
322 | if !allowed {
323 | errCh <- fmt.Errorf("access to %s is disallowed by robots.txt", url)
324 | return
325 | }
326 | if err := c.respectCrawlDelay(ctx, url, crawlDelay); err != nil {
327 | errCh <- fmt.Errorf("respecting crawl delay for %s: %w", url, err)
328 | return
329 | }
330 | }
331 |
332 | chunks, err := c.processWithRetry(ctx, url)
333 | if err != nil {
334 | errCh <- fmt.Errorf("processing %s: %w", url, err)
335 | return
336 | }
337 |
338 | resCh <- chunks
339 | }
340 | }(url)
341 | }
342 |
343 | for i := 0; i < len(urls); i++ {
344 | select {
345 | case <-ctx.Done():
346 | return nil, ctx.Err()
347 | case err := <-errCh:
348 | return nil, err
349 | case chunks := <-resCh:
350 | mu.Lock()
351 | result = append(result, chunks...)
352 | mu.Unlock()
353 | }
354 | }
355 |
356 | return result, nil
357 | }
358 |
359 | /*
360 | processURL handles the actual processing of a single URL or file
361 | file paths are processed directly while URLs are fetched and processed
362 | */
363 | func (c *Chew) processURL(ctx context.Context, url string) ([]common.Chunk, error) {
364 | // if the url is a file path we can just open the file and process it directly
365 | if filePath, found := strings.CutPrefix(url, "file://"); found {
366 | file, err := utils.OpenFile(filePath)
367 | if err != nil {
368 | return nil, fmt.Errorf("opening file: %w", err)
369 | }
370 | defer file.Close()
371 |
372 | ext, _ := utils.GetFileExtension(filePath)
373 | /*
374 | Will leave this in here for now, but I think it's better to just check the file extension
375 | instead of the content type returned.
376 | */
377 | contentType := utils.GetFileContentType(file)
378 |
379 | proc, err := getProcessor(contentType, filePath)
380 | if err != nil {
381 | proc, ok := validExtensions[ext]
382 | if !ok {
383 | return nil, fmt.Errorf("unsupported file type: %s", ext)
384 | }
385 | return proc(file, url)
386 | }
387 |
388 | return proc(file, url)
389 | }
390 |
391 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
392 | if err != nil {
393 | return nil, fmt.Errorf("creating request: %w", err)
394 | }
395 |
396 | req.Header.Set("User-Agent", c.config.UserAgent)
397 |
398 | resp, err := c.httpClient.Do(req)
399 | if err != nil {
400 | return nil, fmt.Errorf("making request: %w", err)
401 | }
402 | defer resp.Body.Close()
403 |
404 | contentType := resp.Header.Get("Content-Type")
405 |
406 | processor, err := getProcessor(contentType, url)
407 | if err != nil {
408 | return nil, err
409 | }
410 |
411 | return processor(resp.Body, url)
412 | }
413 |
414 | func (c *Chew) getRobotsTxtInfo(urlStr string) (bool, time.Duration, error) {
415 | parsedURL, err := url.Parse(urlStr)
416 | if err != nil {
417 | return false, 0, err
418 | }
419 |
420 | robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
421 |
422 | c.robotsMu.RLock()
423 | robotsData, exists := c.robotsCache[robotsURL]
424 | c.robotsMu.RUnlock()
425 |
426 | if !exists {
427 | resp, err := http.Get(robotsURL)
428 | if err != nil {
429 | return true, c.config.CrawlDelay, nil
430 | }
431 | defer resp.Body.Close()
432 |
433 | robotsData, err = robotstxt.FromResponse(resp)
434 | if err != nil {
435 | return true, c.config.CrawlDelay, nil
436 | }
437 |
438 | c.robotsMu.Lock()
439 | c.robotsCache[robotsURL] = robotsData
440 | c.robotsMu.Unlock()
441 | }
442 |
443 | allowed := robotsData.TestAgent(parsedURL.Path, c.config.UserAgent)
444 |
445 | return allowed, c.config.CrawlDelay, nil
446 | }
447 |
448 | // respectCrawlDelay ensures that subsequent requests to the same domain respect the specified crawl delay.
449 | func (c *Chew) respectCrawlDelay(ctx context.Context, urlStr string, delay time.Duration) error {
450 | parsedURL, err := url.Parse(urlStr)
451 | if err != nil {
452 | return err
453 | }
454 |
455 | domain := parsedURL.Hostname()
456 |
457 | c.lastAccessMu.Lock()
458 | lastAccess, exists := c.lastAccess[domain]
459 | if exists {
460 | timeToWait := time.Until(lastAccess.Add(delay))
461 | if timeToWait > 0 {
462 | c.lastAccessMu.Unlock()
463 | select {
464 | case <-time.After(timeToWait):
465 | case <-ctx.Done():
466 | return ctx.Err()
467 | }
468 | c.lastAccessMu.Lock()
469 | }
470 | }
471 |
472 | c.lastAccess[domain] = time.Now()
473 | c.lastAccessMu.Unlock()
474 | return nil
475 | }
476 |
477 | func (c *Chew) processWithRetry(ctx context.Context, url string) ([]common.Chunk, error) {
478 | var (
479 | chunks []common.Chunk
480 | err error
481 | )
482 |
483 | var retries int
484 | for {
485 | chunks, err = c.processURL(ctx, url)
486 | if err == nil {
487 | return chunks, nil
488 | }
489 | if retries > c.config.RetryLimit {
490 | break
491 | }
492 | retries++
493 | c.wait(ctx, c.config.RetryDelay)
494 | }
495 |
496 | return nil, err
497 | }
498 |
499 | func (c *Chew) wait(ctx context.Context, d time.Duration) {
500 | select {
501 | case <-time.After(d):
502 | case <-ctx.Done():
503 | }
504 | }
505 |
506 | func (c *Chew) getProxy(req *http.Request) (*url.URL, error) {
507 | c.proxyMu.Lock()
508 | defer c.proxyMu.Unlock()
509 |
510 | if len(c.config.ProxyList) == 0 {
511 | return nil, nil
512 | }
513 |
514 | proxyURL, err := url.Parse(c.config.ProxyList[c.proxyIndex])
515 | if err != nil {
516 | return nil, err
517 | }
518 |
519 | c.proxyIndex = (c.proxyIndex + 1) % len(c.config.ProxyList)
520 | return proxyURL, nil
521 | }
522 |
--------------------------------------------------------------------------------
/chew_test.go:
--------------------------------------------------------------------------------
1 | package chew
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "io"
7 | "net/http"
8 | "net/http/httptest"
9 | "net/url"
10 | "os"
11 | "path/filepath"
12 | "reflect"
13 | "strings"
14 | "testing"
15 | "time"
16 |
17 | "github.com/mmatongo/chew/v1/internal/common"
18 | "github.com/mmatongo/chew/v1/internal/text"
19 | "golang.org/x/time/rate"
20 | )
21 |
22 | func mockProcessor(r io.Reader, url string) ([]common.Chunk, error) {
23 | content, err := io.ReadAll(r)
24 | if err != nil {
25 | return nil, err
26 | }
27 | return []common.Chunk{{Content: string(content), Source: url}}, nil
28 | }
29 |
// mockTransport is an http.RoundTripper that answers every request with the
// same canned response and error, without touching the network.
type mockTransport struct {
	response *http.Response
	err      error
}

// RoundTrip ignores the request and returns the configured response and error.
// The same *http.Response is returned on every call, so its body can only be
// read once.
func (m *mockTransport) RoundTrip(*http.Request) (*http.Response, error) {
	return m.response, m.err
}
38 |
// mockRateLimiter is a RateLimiter whose Wait never blocks and always returns
// waitErr.
type mockRateLimiter struct {
	waitErr error
}

// Wait returns the configured error immediately; ctx is ignored.
func (m *mockRateLimiter) Wait(ctx context.Context) error {
	return m.waitErr
}
46 |
47 | func Test_processURL(t *testing.T) {
48 | originalHTTPClient := http.DefaultClient
49 | originalContentTypeProcessors := contentTypeProcessors
50 | originalValidExtensions := validExtensions
51 |
52 | defer func() {
53 | http.DefaultClient = originalHTTPClient
54 | contentTypeProcessors = originalContentTypeProcessors
55 | validExtensions = originalValidExtensions
56 | }()
57 |
58 | mockClient := &http.Client{
59 | Transport: &mockTransport{
60 | response: &http.Response{
61 | StatusCode: 200,
62 | Body: io.NopCloser(strings.NewReader("Test content")),
63 | Header: http.Header{"Content-Type": []string{"text/html"}},
64 | },
65 | },
66 | }
67 | chew := New(Config{})
68 | ctx := context.Background()
69 |
70 | chew.SetHTTPClient(mockClient)
71 | defer chew.SetHTTPClient(nil)
72 |
73 | contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){
74 | "text/html": mockProcessor,
75 | "text/plain": mockProcessor,
76 | }
77 | validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){
78 | ".html": mockProcessor,
79 | ".txt": mockProcessor,
80 | }
81 |
82 | tempDir := t.TempDir()
83 | testHTMLPath := filepath.Join(tempDir, "test.html")
84 | testTXTPath := filepath.Join(tempDir, "test.txt")
85 | testUnsupportedPath := filepath.Join(tempDir, "test.unsupported")
86 |
87 | err := os.WriteFile(testHTMLPath, []byte("html content"), 0644)
88 | if err != nil {
89 | t.Fatalf("failed to create test html file: %v", err)
90 | }
91 |
92 | err = os.WriteFile(testTXTPath, []byte("text content"), 0644)
93 | if err != nil {
94 | t.Fatalf("failed to create test text file: %v", err)
95 | }
96 |
97 | err = os.WriteFile(testUnsupportedPath, []byte("unsupported content"), 0644)
98 | if err != nil {
99 | t.Fatalf("failed to create test unsupported file: %v", err)
100 | }
101 |
102 | tests := []struct {
103 | name string
104 | url string
105 | want []common.Chunk
106 | wantErr bool
107 | }{
108 | {
109 | name: "success",
110 | url: "https://example.com/page.html",
111 | want: []common.Chunk{{Content: "Test content", Source: "https://example.com/page.html"}},
112 | wantErr: false,
113 | },
114 | {
115 | name: "success html",
116 | url: "file://" + testHTMLPath,
117 | want: []common.Chunk{{Content: "html content", Source: "file://" + testHTMLPath}},
118 | wantErr: false,
119 | },
120 | {
121 | name: "success txt",
122 | url: "file://" + testTXTPath,
123 | want: []common.Chunk{{Content: "text content", Source: "file://" + testTXTPath}},
124 | wantErr: false,
125 | },
126 | {
127 | name: "unsupported file type",
128 | url: "file://" + testUnsupportedPath,
129 | want: nil,
130 | wantErr: true,
131 | },
132 | {
133 | name: "non-existent file",
134 | url: "file:///non-existent.md",
135 | want: nil,
136 | wantErr: true,
137 | },
138 | }
139 |
140 | for _, tt := range tests {
141 | t.Run(tt.name, func(t *testing.T) {
142 | got, err := chew.processURL(ctx, tt.url)
143 | if (err != nil) != tt.wantErr {
144 | t.Errorf("processURL() error = %v, wantErr %v", err, tt.wantErr)
145 | return
146 | }
147 | if !reflect.DeepEqual(got, tt.want) {
148 | t.Errorf("processURL() = %v, want %v", got, tt.want)
149 | }
150 | })
151 | }
152 | }
153 |
154 | func Test_getProcessor(t *testing.T) {
155 | type args struct {
156 | contentType string
157 | url string
158 | }
159 | tests := []struct {
160 | name string
161 | args args
162 | want func(io.Reader, string) ([]common.Chunk, error)
163 | wantErr bool
164 | }{
165 | {
166 | name: "success",
167 | args: args{
168 | contentType: "text/html",
169 | url: "https://example.com/page.html",
170 | },
171 | want: mockProcessor,
172 | wantErr: false,
173 | },
174 | {
175 | name: "unknown content type",
176 | args: args{
177 | contentType: "octet/stream",
178 | url: "https://example.com/page.html",
179 | },
180 | want: text.ProcessHTML,
181 | wantErr: false,
182 | },
183 | {
184 | name: "unsupported content type",
185 | args: args{
186 | contentType: "application/octet-stream",
187 | url: "https://example.com/page.htt",
188 | },
189 | want: nil,
190 | wantErr: true,
191 | },
192 | {
193 | name: "no extension",
194 | args: args{
195 | contentType: "octet/stream",
196 | url: "https://example.com/page",
197 | },
198 | want: nil,
199 | wantErr: true,
200 | },
201 | }
202 | for _, tt := range tests {
203 | t.Run(tt.name, func(t *testing.T) {
204 | got, err := getProcessor(tt.args.contentType, tt.args.url)
205 | if (err != nil) != tt.wantErr {
206 | t.Errorf("getProcessor() error = %v, wantErr %v", err, tt.wantErr)
207 | return
208 | }
209 |
210 | if got == nil && tt.want != nil {
211 | t.Errorf("getProcessor() returned nil, want non-nil")
212 | } else if got != nil && tt.want == nil {
213 | t.Errorf("getProcessor() returned non-nil, want nil")
214 | } else if got != nil {
215 | gotType := reflect.TypeOf(got)
216 | wantType := reflect.TypeOf(tt.want)
217 | if gotType != wantType {
218 | t.Errorf("getProcessor() returned function of type %v, want %v", gotType, wantType)
219 | }
220 | }
221 | })
222 | }
223 | }
224 |
225 | func TestProcess(t *testing.T) {
226 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
227 | switch r.URL.Path {
228 | case "/robots.txt":
229 | w.Header().Set("Content-Type", "text/plain")
230 | w.Write([]byte("User-agent: *\nDisallow: /disallowed\nCrawl-delay: 1"))
231 | case "/text":
232 | w.Header().Set("Content-Type", "text/plain")
233 | w.Write([]byte("A plain text file."))
234 | case "/html":
235 | w.Header().Set("Content-Type", "text/html")
236 | w.Write([]byte("An HTML file.
"))
237 | case "/markdown":
238 | w.Header().Set("Content-Type", "text/plain")
239 | w.Write([]byte("# A Markdown file"))
240 | case "/disallowed":
241 | w.Header().Set("Content-Type", "text/plain")
242 | w.Write([]byte("This page is disallowed by robots.txt"))
243 | case "/rate-limited":
244 | time.Sleep(2 * time.Second)
245 | w.Write([]byte("Rate limited content"))
246 | }
247 | }))
248 | defer server.Close()
249 |
250 | containsChunk := func(chunks []common.Chunk, chunk common.Chunk) bool {
251 | for _, c := range chunks {
252 | if c.Content == chunk.Content && c.Source == chunk.Source {
253 | return true
254 | }
255 | }
256 | return false
257 | }
258 |
259 | chew := New(Config{
260 | IgnoreRobotsTxt: false,
261 | UserAgent: "TestBot/1.0",
262 | RetryLimit: 3,
263 | RetryDelay: 100 * time.Millisecond,
264 | CrawlDelay: 1 * time.Second,
265 | RateLimit: 500 * time.Millisecond,
266 | RateBurst: 1,
267 | })
268 |
269 | chew.httpClient.Timeout = 5 * time.Second
270 |
271 | type args struct {
272 | urls []string
273 | ctxs []context.Context
274 | }
275 | tests := []struct {
276 | name string
277 | args args
278 | want []common.Chunk
279 | wantErr bool
280 | expectedErrText string
281 | ignoreRobotsTxt bool
282 | orderIndependent bool
283 | rateLimiter RateLimiter
284 | }{
285 | {
286 | name: "plain text",
287 | args: args{
288 | urls: []string{server.URL + "/text"},
289 | },
290 | want: []common.Chunk{
291 | {Content: "A plain text file.", Source: server.URL + "/text"},
292 | },
293 | wantErr: false,
294 | },
295 | {
296 | name: "HTML",
297 | args: args{
298 | urls: []string{server.URL + "/html"},
299 | },
300 | want: []common.Chunk{
301 | {Content: "An HTML file.", Source: server.URL + "/html"},
302 | },
303 | wantErr: false,
304 | },
305 | {
306 | name: "markdown",
307 | args: args{
308 | urls: []string{server.URL + "/markdown"},
309 | },
310 | want: []common.Chunk{
311 | {Content: "# A Markdown file", Source: server.URL + "/markdown"},
312 | },
313 | wantErr: false,
314 | },
315 | {
316 | name: "multiple URLs",
317 | args: args{
318 | urls: []string{server.URL + "/text", server.URL + "/html"},
319 | },
320 | want: []common.Chunk{
321 | {Content: "An HTML file.", Source: server.URL + "/html"},
322 | {Content: "A plain text file.", Source: server.URL + "/text"},
323 | },
324 | wantErr: false,
325 | orderIndependent: true,
326 | },
327 | {
328 | name: "invalid URL",
329 | args: args{
330 | urls: []string{"ftp://invalid.url"},
331 | },
332 | want: nil,
333 | wantErr: true,
334 | },
335 | {
336 | name: "context cancellation",
337 | args: args{
338 | urls: []string{server.URL + "/text"},
339 | ctxs: []context.Context{func() context.Context {
340 | ctx, cancel := context.WithCancel(context.Background())
341 | go func() {
342 | time.Sleep(50 * time.Millisecond)
343 | cancel()
344 | }()
345 | return ctx
346 | }()},
347 | },
348 | want: nil,
349 | wantErr: true,
350 | expectedErrText: "context canceled",
351 | },
352 | {
353 | name: "with more than one context",
354 | args: args{
355 | urls: []string{server.URL + "/text"},
356 | ctxs: []context.Context{context.Background(), context.Background()},
357 | },
358 | want: []common.Chunk{{Content: "A plain text file.", Source: server.URL + "/text"}},
359 | wantErr: false,
360 | },
361 | {
362 | name: "respects robots.txt",
363 | args: args{
364 | urls: []string{server.URL + "/disallowed"},
365 | },
366 | want: nil,
367 | wantErr: true,
368 | },
369 | {
370 | name: "ignores robots.txt when configured",
371 | args: args{
372 | urls: []string{server.URL + "/disallowed"},
373 | },
374 | want: []common.Chunk{
375 | {Content: "This page is disallowed by robots.txt", Source: server.URL + "/disallowed"},
376 | },
377 | wantErr: false,
378 | ignoreRobotsTxt: true,
379 | },
380 | {
381 | name: "robots.txt disallowed",
382 | args: args{
383 | urls: []string{server.URL + "/disallowed"},
384 | },
385 | want: nil,
386 | wantErr: true,
387 | expectedErrText: "access to",
388 | },
389 | {
390 | name: "respects crawl delay",
391 | args: args{
392 | urls: []string{server.URL + "/text", server.URL + "/html"},
393 | },
394 | want: []common.Chunk{
395 | {Content: "A plain text file.", Source: server.URL + "/text"},
396 | {Content: "An HTML file.", Source: server.URL + "/html"},
397 | },
398 | wantErr: false,
399 | orderIndependent: true,
400 | },
401 | {
402 | name: "rate limiting error",
403 | args: args{
404 | urls: []string{server.URL + "/rate-limited", server.URL + "/rate-limited"},
405 | },
406 | want: nil,
407 | wantErr: true,
408 | expectedErrText: "rate limit exceeded",
409 | rateLimiter: &mockRateLimiter{waitErr: fmt.Errorf("rate limit exceeded")},
410 | },
411 | {
412 | name: "crawl delay respect error",
413 | args: args{
414 | urls: []string{server.URL + "/text", server.URL + "/text"},
415 | ctxs: []context.Context{func() context.Context {
416 | ctx, cancel := context.WithTimeout(context.Background(), 1500*time.Millisecond)
417 | defer cancel()
418 | return ctx
419 | }()},
420 | },
421 | want: nil,
422 | wantErr: true,
423 | expectedErrText: "context canceled",
424 | },
425 | }
426 |
427 | for _, tt := range tests {
428 | t.Run(tt.name, func(t *testing.T) {
429 | oldState := chew.config.IgnoreRobotsTxt
430 | chew.config.IgnoreRobotsTxt = tt.ignoreRobotsTxt
431 | defer func() { chew.config.IgnoreRobotsTxt = oldState }()
432 |
433 | if tt.rateLimiter != nil {
434 | chew.SetRateLimiter(tt.rateLimiter)
435 | } else {
436 | chew.SetRateLimiter(rate.NewLimiter(rate.Every(chew.config.RateLimit), chew.config.RateBurst))
437 | }
438 |
439 | ctx := context.Background()
440 | if len(tt.args.ctxs) > 0 {
441 | ctx = tt.args.ctxs[0]
442 | }
443 |
444 | got, err := chew.Process(ctx, tt.args.urls)
445 |
446 | if tt.wantErr {
447 | if err == nil {
448 | t.Errorf("Process() error = nil, wantErr %v", tt.wantErr)
449 | return
450 | }
451 | if tt.expectedErrText != "" && !strings.Contains(err.Error(), tt.expectedErrText) {
452 | t.Errorf("Process() error = %v, expectedErrText %v", err, tt.expectedErrText)
453 | return
454 | }
455 | } else {
456 | if err != nil {
457 | t.Errorf("Process() unexpected error: %v", err)
458 | return
459 | }
460 | if !tt.orderIndependent && !reflect.DeepEqual(got, tt.want) {
461 | t.Errorf("Process() = %v, want %v", got, tt.want)
462 | }
463 | if tt.orderIndependent {
464 | if len(got) != len(tt.want) {
465 | t.Errorf("Process() returned %d chunks, want %d", len(got), len(tt.want))
466 | }
467 | for _, wantChunk := range tt.want {
468 | if !containsChunk(got, wantChunk) {
469 | t.Errorf("Process() did not return chunk %v", wantChunk)
470 | }
471 | }
472 | }
473 | }
474 | })
475 | }
476 | }
477 |
478 | func Test_getProxy(t *testing.T) {
479 | tests := []struct {
480 | name string
481 | config Config
482 | requests int
483 | wantProxy []*url.URL
484 | }{
485 | {
486 | name: "no proxies",
487 | config: Config{},
488 | requests: 1,
489 | wantProxy: []*url.URL{nil},
490 | },
491 | {
492 | name: "single proxy",
493 | config: Config{
494 | ProxyList: []string{"http://proxy1.example.com"},
495 | },
496 | requests: 2,
497 | wantProxy: []*url.URL{must(url.Parse("http://proxy1.example.com")), must(url.Parse("http://proxy1.example.com"))},
498 | },
499 | {
500 | name: "multiple proxies",
501 | config: Config{
502 | ProxyList: []string{"http://proxy1.example.com", "http://proxy2.example.com", "http://proxy3.example.com"},
503 | },
504 | requests: 5,
505 | wantProxy: []*url.URL{
506 | must(url.Parse("http://proxy1.example.com")),
507 | must(url.Parse("http://proxy2.example.com")),
508 | must(url.Parse("http://proxy3.example.com")),
509 | must(url.Parse("http://proxy1.example.com")),
510 | must(url.Parse("http://proxy2.example.com")),
511 | },
512 | },
513 | }
514 |
515 | for _, tt := range tests {
516 | t.Run(tt.name, func(t *testing.T) {
517 | c := New(tt.config)
518 | for i := 0; i < tt.requests; i++ {
519 | got, err := c.getProxy(&http.Request{})
520 | if err != nil {
521 | t.Errorf("getProxy() error = %v", err)
522 | return
523 | }
524 | if !reflect.DeepEqual(got, tt.wantProxy[i]) {
525 | t.Errorf("getProxy() = %v, want %v", got, tt.wantProxy[i])
526 | }
527 | }
528 | })
529 | }
530 | }
531 |
// must unwraps a (*url.URL, error) pair such as the result of url.Parse.
// It returns u when err is nil and panics with err otherwise, which keeps
// test tables free of error plumbing.
func must(u *url.URL, err error) *url.URL {
	if err == nil {
		return u
	}
	panic(err)
}
538 |
539 | func TestRespectCrawlDelay(t *testing.T) {
540 | chew := New(Config{})
541 | ctx := context.Background()
542 |
543 | tests := []struct {
544 | name string
545 | ctx context.Context
546 | url string
547 | delay time.Duration
548 | wantWait bool
549 | }{
550 | {
551 | name: "first access",
552 | url: "https://example.com",
553 | delay: time.Second,
554 | wantWait: false,
555 | },
556 | {
557 | name: "second access",
558 | url: "https://example.com",
559 | delay: time.Second,
560 | wantWait: true,
561 | },
562 | }
563 |
564 | for _, tt := range tests {
565 | t.Run(tt.name, func(t *testing.T) {
566 | start := time.Now()
567 | err := chew.respectCrawlDelay(ctx, tt.url, tt.delay)
568 | duration := time.Since(start)
569 |
570 | if err != nil {
571 | t.Errorf("respectCrawlDelay() error = %v", err)
572 | return
573 | }
574 |
575 | if tt.wantWait && duration < tt.delay {
576 | t.Errorf("respectCrawlDelay() didn't wait long enough. Duration: %v, Expected: %v", duration, tt.delay)
577 | }
578 | if !tt.wantWait && duration >= tt.delay {
579 | t.Errorf("respectCrawlDelay() waited unnecessarily. Duration: %v", duration)
580 | }
581 | })
582 | }
583 | }
584 |
--------------------------------------------------------------------------------
/cmd/chew/wrapper.go:
--------------------------------------------------------------------------------
1 | package main
2 |
/*
#include <stdlib.h>
*/
import "C"

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"
	"unsafe"

	"github.com/mmatongo/chew/v1"
)
17 |
18 | //export Process
19 | func Process(urls *C.char) *C.char {
20 | urlsSlice := strings.Split(C.GoString(urls), ",")
21 |
22 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
23 | defer cancel()
24 |
25 | c := chew.New(chew.Config{
26 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)",
27 | RetryLimit: 3,
28 | RetryDelay: time.Second,
29 | CrawlDelay: time.Second,
30 | RateLimit: time.Second,
31 | RateBurst: 1,
32 | IgnoreRobotsTxt: false,
33 | })
34 |
35 | chunks, err := c.Process(ctx, urlsSlice)
36 | if err != nil {
37 | if err == context.DeadlineExceeded {
38 | return C.CString("Operation timed out")
39 | }
40 | return C.CString(fmt.Sprintf("Error processing URLs: %v", err))
41 | }
42 |
43 | var result strings.Builder
44 | for _, chunk := range chunks {
45 | result.WriteString(fmt.Sprintf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content))
46 | }
47 |
48 | return C.CString(result.String())
49 | }
50 |
51 | //export FreeString
52 | func FreeString(ptr *C.char) {
53 | C.free(unsafe.Pointer(ptr))
54 | }
55 |
// main is intentionally empty: this package is built with
// -buildmode=c-shared, which needs a main package, and exposes its
// functionality through the exported functions above instead.
func main() {
}
57 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | precision: 2
3 | round: up
4 | range: "70...100"
5 |
6 | ignore:
7 | - ".idea"
8 | - "docs"
9 | - "cmd"
10 | - "testdata"
11 | - "examples"
12 | - "assets"
13 |
--------------------------------------------------------------------------------
/docs/golang.md:
--------------------------------------------------------------------------------
Chew is native to Go and can be used as a library in your Go project with ease. Here is a simple example of how to use Chew in your Go project.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"log"
	"time"

	"github.com/mmatongo/chew/v1"
)

func main() {
	urls := []string{
		"https://example.com",
	}

	config := chew.Config{
		UserAgent:       "Chew/1.0 (+https://github.com/mmatongo/chew)",
		RetryLimit:      3,
		RetryDelay:      5 * time.Second,
		CrawlDelay:      10 * time.Second,
		ProxyList:       []string{}, // Add your proxies here, or leave empty
		RateLimit:       2 * time.Second,
		RateBurst:       3,
		IgnoreRobotsTxt: false,
	}

	c := chew.New(config)

	// The context bounds the whole operation.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	chunks, err := c.Process(ctx, urls)
	if err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			log.Println("Operation timed out")
		} else {
			log.Printf("Error processing URLs: %v", err)
		}
		return
	}

	for _, chunk := range chunks {
		fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
	}
}
```

The above code snippet demonstrates how to use Chew in your Go project. `chew.New` creates a client from a `chew.Config`, and the client's `Process` method takes a context and a list of URLs and returns a list of `Chunk` objects. Each `Chunk` object contains the source URL and the content of the URL. The context can be used to set a timeout for the operation. If the operation times out, the returned error can be checked against `context.DeadlineExceeded` with `errors.Is`.

Markdown formatting is not enforced in the content of the `Chunk` object. However, the output is always going to be plain text so you can format it as you wish.
43 |
--------------------------------------------------------------------------------
/docs/python.md:
--------------------------------------------------------------------------------
1 | To use Chew with Python you need to first build the package to create the shared object file and header file. You can do this by running the following command:
2 |
3 | ```bash
4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go
5 | ```
6 |
7 | This will create a `chew.so` and `chew.h` file in the current directory. You can then use these files in your Python project to use Chew. Here is an example of how to use Chew in your Python project:
8 |
9 | ```python
import ctypes

chew_lib = ctypes.CDLL('./chew.so')

chew_lib.Process.argtypes = [ctypes.c_char_p]
# Use c_void_p rather than c_char_p for the result: with c_char_p ctypes copies
# the bytes and drops the pointer, so the string allocated by the library could
# never be freed.
chew_lib.Process.restype = ctypes.c_void_p
chew_lib.FreeString.argtypes = [ctypes.c_void_p]
chew_lib.FreeString.restype = None

url = "https://example.com"
result_ptr = chew_lib.Process(url.encode('utf-8'))
try:
    result = ctypes.string_at(result_ptr).decode('utf-8')
finally:
    # Hand the buffer back to the library once it has been copied.
    chew_lib.FreeString(result_ptr)

print(result)
21 | ```
22 |
23 | With the above code snippet, you can now use Chew in your Python project. I can't speak for the limitations of using Chew in Python as I have not extensively tested it myself.
24 |
--------------------------------------------------------------------------------
/docs/ruby.md:
--------------------------------------------------------------------------------
1 | To use Chew with Ruby you need to first build the package to create the shared object file and header file. You can do this by running the following command:
2 |
3 | ```bash
4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go
5 | ```
6 |
7 | This will create a `chew.so` and `chew.h` file in the current directory. You can then use these files in your Ruby project to use Chew. Here is an example of how to use Chew in your Ruby project:
8 |
9 | ```ruby
require 'fiddle'
require 'fiddle/import'

# Binding for the Process function exported by chew.so.
module ChewLib
  extend Fiddle::Importer
  dlload './chew.so'

  extern 'char* Process(char*)'
end

# Look up free once; it is called on every pointer returned by Process after
# the result has been copied into a Ruby String.
free_fn = Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)

['https://example.com', 'https://example.com'].each do |url|
  result_ptr = ChewLib.Process(url)
  result = result_ptr.to_s
  free_fn.call(result_ptr)

  puts result
end
28 | ```
29 |
30 | Using Chew like this will come with obvious limitations; however, this is a simple example of how to use Chew in your Ruby project.
31 |
--------------------------------------------------------------------------------
/docs/setup.md:
--------------------------------------------------------------------------------
1 | # Setting up Google Cloud Services for Speech-to-Text
2 |
3 | 1. **Create a Google Cloud Project**
4 | - Go to the [Google Cloud Console](https://console.cloud.google.com/)
5 | - Click on the project dropdown and select "New Project"
6 | - Enter a project name and click "Create"
7 |
8 | 2. **Enable the Cloud Speech-to-Text API**
9 | - In the Google Cloud Console, under "Quick access" go to "APIs & Services"
10 | - Click on "+ ENABLE APIS AND SERVICES"
11 | - Search for "Cloud Speech-to-Text API" and select it
12 | - Click "Enable"
13 |
14 | 3. **Create a Service Account**
15 | - In the Google Cloud Console, go to "IAM & Admin" > "Service Accounts" (or use [this link](https://console.cloud.google.com/iam-admin/serviceaccounts))
16 | - Click "Create Service Account"
17 | - Enter a name for the service account and click "Create"
18 | - For the role, choose "Project" > "Owner" (or a more restrictive role if preferred)
19 | - Click "Continue" and then "Done"
20 |
21 | 4. **Generate a Key for the Service Account**
22 | - In the Service Accounts list, find the account you just created
23 | - Click on the three dots menu (⋮) and select "Manage keys"
24 | - Click "Add Key" > "Create new key"
25 | - Choose "JSON" as the key type and click "Create"
26 | - The key file will be downloaded to your computer
27 |
28 | 5. **Set the GOOGLE_APPLICATION_CREDENTIALS Environment Variable**
29 | - On Linux or macOS:
30 | ```bash
31 | export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-key.json"
32 | ```
33 | - If you want to set the environment variable permanently, you can add it to your shell profile (e.g., `~/.bashrc`, `~/.zshrc`, etc.)
34 |
35 | You can optionally set the environment variable in your code as well:
36 | ```python
37 | import os
38 |
39 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json"
40 | ```
41 |
42 | ```go
43 | import "os"
44 |
45 | os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", "/path/to/your/service-account-key.json")
46 | ```
47 |
48 | ```ruby
49 | ENV["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json"
50 | ```
51 |
--------------------------------------------------------------------------------
/examples/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "log"
7 | "time"
8 |
9 | "github.com/mmatongo/chew/v1"
10 | )
11 |
12 | func main() {
13 | urls := []string{
14 | "https://example.com",
15 | }
16 |
17 | config := chew.Config{
18 | UserAgent: "Chew/1.0 (+https://github.com/mmatongo/chew)",
19 | RetryLimit: 3,
20 | RetryDelay: 5 * time.Second,
21 | CrawlDelay: 10 * time.Second,
22 | ProxyList: []string{}, // Add your proxies here, or leave empty
23 | RateLimit: 2 * time.Second,
24 | RateBurst: 3,
25 | IgnoreRobotsTxt: false,
26 | }
27 |
28 | haChew := chew.New(config)
29 |
30 | ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
31 | defer cancel()
32 |
33 | chunks, err := haChew.Process(ctx, urls)
34 | if err != nil {
35 | if err == context.DeadlineExceeded {
36 | log.Println("Operation timed out")
37 | } else {
38 | log.Printf("Error processing URLs: %v", err)
39 | }
40 | return
41 | }
42 |
43 | for _, chunk := range chunks {
44 | fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/examples/main.py:
--------------------------------------------------------------------------------
# please see the documentation on how to build chew for use with python

import ctypes

chew_lib = ctypes.CDLL('./chew.so')

chew_lib.Process.argtypes = [ctypes.c_char_p]
# Use c_void_p rather than c_char_p for the result: with c_char_p ctypes copies
# the bytes and drops the pointer, so the string allocated by the library could
# never be freed.
chew_lib.Process.restype = ctypes.c_void_p
chew_lib.FreeString.argtypes = [ctypes.c_void_p]
chew_lib.FreeString.restype = None

urls = "https://example.com"
result_ptr = chew_lib.Process(urls.encode('utf-8'))
try:
    result = ctypes.string_at(result_ptr).decode('utf-8')
finally:
    # Hand the buffer back to the library once it has been copied.
    chew_lib.FreeString(result_ptr)

print(result)
14 |
--------------------------------------------------------------------------------
/examples/main.rb:
--------------------------------------------------------------------------------
# please see the documentation on how to build chew for use with ruby

require 'fiddle'
require 'fiddle/import'

# Binding for the Process function exported by chew.so.
module ChewLib
  extend Fiddle::Importer
  dlload './chew.so'

  extern 'char* Process(char*)'
end

# Look up free once; it is called on every pointer returned by Process after
# the result has been copied into a Ruby String.
free_fn = Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID)

['https://example.com', 'https://example.com'].each do |url|
  result_ptr = ChewLib.Process(url)
  result = result_ptr.to_s
  free_fn.call(result_ptr)

  puts result
end
21 |
--------------------------------------------------------------------------------
/examples/transcription/google.go:
--------------------------------------------------------------------------------
1 | //go:build ignore
2 |
3 | package main
4 |
5 | import (
6 | "context"
7 | "log"
8 | "os"
9 | "time"
10 |
11 | "github.com/mmatongo/chew/v1"
12 | )
13 |
14 | func main() {
15 | credentialsFile := "chew-go.json"
16 | credentialsJSON, err := os.ReadFile(credentialsFile)
17 | if err != nil {
18 | log.Fatalf("Failed to read credentials file: %v", err)
19 | }
20 |
21 | err = os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", credentialsFile)
22 | if err != nil {
23 | log.Fatalf("Failed to set environment variable: %v", err)
24 | }
25 |
26 | config := chew.TranscribeOptions{
27 | CredentialsJSON: credentialsJSON,
28 | Bucket: "chew-go",
29 | LanguageCode: "en-US",
30 | }
31 |
32 | log.Println("transcribing files...")
33 | /*
34 | Transcriptions can take a bit of time so ensure that the timeout you set
35 | is enough for the process to finish
36 |
37 | In a test with MLK Jr's speech it took about 3min to complete
38 |
39 | The two audio files used in this example can be obtained from the following links:
40 | - Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav
41 | - MLKDream_64kb.mp3: https://archive.org/details/MLKDream
42 | */
43 |
44 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
45 | defer cancel()
46 |
47 | filenames := []string{
48 | "audio/Conference.wav",
49 | "audio/MLKDream_64kb.mp3",
50 | }
51 |
52 | results, err := chew.Transcribe(ctx, filenames, config)
53 | if err != nil {
54 | log.Fatalf("failed to transcribe: %v", err)
55 | }
56 |
57 | for filename, transcript := range results {
58 | log.Printf("Transcript for %s: %s\n", filename, transcript)
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/examples/transcription/whisper.go:
--------------------------------------------------------------------------------
1 | //go:build ignore
2 |
3 | package main
4 |
5 | import (
6 | "context"
7 | "log"
8 | "os"
9 | "time"
10 |
11 | "github.com/mmatongo/chew/v1"
12 | )
13 |
14 | func main() {
15 | key := os.Getenv("OPENAI_API_KEY")
16 | if key == "" {
17 | log.Fatalf("Please set the OPENAI_API_KEY= environment variable")
18 | }
19 |
20 | whisperOpts := chew.TranscribeOptions{
21 | UseWhisper: true,
22 | WhisperAPIKey: key,
23 | WhisperModel: "whisper-1",
24 | }
25 |
26 | log.Println("transcribing files...")
27 | /*
28 | The whisper model is a bit faster than the google cloud speech-to-text api
29 | so the timeout can be set to a lower value.
30 |
31 | In a test with MLK Jr's speech it took about 32s to complete
32 |
33 | The two audio files used in this example can be obtained from the following links:
34 | - Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav
35 | - MLKDream_64kb.mp3: https://archive.org/details/MLKDream
36 | */
37 |
38 | ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
39 | defer cancel()
40 |
41 | audioFiles := []string{
42 | "audio/Conference.wav",
43 | "audio/MLKDream_64kb.mp3",
44 | }
45 |
46 | results, err := chew.Transcribe(ctx, audioFiles, whisperOpts)
47 |
48 | if err != nil {
49 | log.Fatalf("Error transcribing with OpenAI Whisper: %v", err)
50 | }
51 |
52 | for filename, transcript := range results {
53 | log.Printf("Transcript for %s: %s\n", filename, transcript)
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/mmatongo/chew/v1
2 |
3 | go 1.23
4 |
5 | require (
6 | cloud.google.com/go/speech v1.23.1
7 | github.com/PuerkitoBio/goquery v1.9.2
8 | github.com/amanitaverna/go-mp3 v0.4.0
9 | github.com/go-audio/wav v1.1.0
10 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
11 | github.com/mewkiz/flac v1.0.11
12 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115
13 | github.com/temoto/robotstxt v1.1.2
14 | golang.org/x/time v0.6.0
15 | google.golang.org/api v0.187.0
16 | )
17 |
18 | require (
19 | cloud.google.com/go v0.115.0 // indirect
20 | cloud.google.com/go/auth v0.6.1 // indirect
21 | cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
22 | cloud.google.com/go/compute/metadata v0.3.0 // indirect
23 | cloud.google.com/go/iam v1.1.8 // indirect
24 | cloud.google.com/go/longrunning v0.5.7 // indirect
25 | github.com/felixge/httpsnoop v1.0.4 // indirect
26 | github.com/go-audio/audio v1.0.0 // indirect
27 | github.com/go-audio/riff v1.0.0 // indirect
28 | github.com/go-logr/logr v1.4.1 // indirect
29 | github.com/go-logr/stdr v1.2.2 // indirect
30 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
31 | github.com/golang/protobuf v1.5.4 // indirect
32 | github.com/google/s2a-go v0.1.7 // indirect
33 | github.com/google/uuid v1.6.0 // indirect
34 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
35 | github.com/googleapis/gax-go/v2 v2.12.5 // indirect
36 | github.com/icza/bitio v1.1.0 // indirect
37 | github.com/kr/pretty v0.1.0 // indirect
38 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 // indirect
39 | go.opencensus.io v0.24.0 // indirect
40 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
41 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
42 | go.opentelemetry.io/otel v1.24.0 // indirect
43 | go.opentelemetry.io/otel/metric v1.24.0 // indirect
44 | go.opentelemetry.io/otel/trace v1.24.0 // indirect
45 | golang.org/x/crypto v0.25.0 // indirect
46 | golang.org/x/oauth2 v0.21.0 // indirect
47 | golang.org/x/sync v0.7.0 // indirect
48 | golang.org/x/sys v0.22.0 // indirect
49 | golang.org/x/text v0.16.0 // indirect
50 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect
51 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect
52 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect
53 | google.golang.org/grpc v1.64.0 // indirect
54 | google.golang.org/protobuf v1.34.2 // indirect
55 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
56 | )
57 |
58 | require (
59 | cloud.google.com/go/storage v1.43.0
60 | github.com/andybalholm/cascadia v1.3.2 // indirect
61 | golang.org/x/net v0.27.0 // indirect
62 | gopkg.in/yaml.v3 v3.0.1
63 | )
64 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
2 | cloud.google.com/go v0.115.0 h1:CnFSK6Xo3lDYRoBKEcAtia6VSC837/ZkJuRduSFnr14=
3 | cloud.google.com/go v0.115.0/go.mod h1:8jIM5vVgoAEoiVxQ/O4BFTfHqulPZgs/ufEzMcFMdWU=
4 | cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38=
5 | cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4=
6 | cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4=
7 | cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q=
8 | cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc=
9 | cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
10 | cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0=
11 | cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE=
12 | cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU=
13 | cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng=
14 | cloud.google.com/go/speech v1.23.1 h1:TcWEAOLQH1Lb2fhHS6/GjvAh+ue0dt4xUDHXHG6vF04=
15 | cloud.google.com/go/speech v1.23.1/go.mod h1:UNgzNxhNBuo/OxpF1rMhA/U2rdai7ILL6PBXFs70wq0=
16 | cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
17 | cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
18 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
19 | github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
20 | github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
21 | github.com/amanitaverna/go-mp3 v0.4.0 h1:ZZ5maCStIh7+M9NZSk58Eww23q0B4IuJtQW+Y6u4kkw=
22 | github.com/amanitaverna/go-mp3 v0.4.0/go.mod h1:b9idBPNUTSU/5D+GATwLkJx5xqDYTEeRg7/O7K7gZF0=
23 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
24 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
25 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
26 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
27 | github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
28 | github.com/d4l3k/messagediff v1.2.2-0.20190829033028-7e0a312ae40b/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkEQxENCrlLo=
29 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
30 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
31 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
32 | github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
33 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
34 | github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
35 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
36 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
37 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
38 | github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
39 | github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
40 | github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
41 | github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
42 | github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
43 | github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
44 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
45 | github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
46 | github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
47 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
48 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
49 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
50 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
51 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
52 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
53 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
54 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
55 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
56 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
57 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
58 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
59 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
60 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
61 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
62 | github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
63 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
64 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
65 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
66 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
67 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
68 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
69 | github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
70 | github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
71 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
72 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
73 | github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc=
74 | github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0=
75 | github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o=
76 | github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw=
77 | github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
78 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
79 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
80 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs=
81 | github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0=
82 | github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA=
83 | github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E=
84 | github.com/icza/bitio v1.1.0 h1:ysX4vtldjdi3Ygai5m1cWy4oLkhWTAi+SyO6HC8L9T0=
85 | github.com/icza/bitio v1.1.0/go.mod h1:0jGnlLAx8MKMr9VGnn/4YrvZiprkvBelsVIbA9Jjr9A=
86 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6 h1:8UsGZ2rr2ksmEru6lToqnXgA8Mz1DP11X4zSJ159C3k=
87 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6/go.mod h1:xQig96I1VNBDIWGCdTt54nHt6EeI639SmHycLYL7FkA=
88 | github.com/jszwec/csvutil v1.5.1/go.mod h1:Rpu7Uu9giO9subDyMCIQfHVDuLrcaC36UA4YcJjGBkg=
89 | github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
90 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
91 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
92 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
93 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
94 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
95 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
96 | github.com/mewkiz/flac v1.0.11 h1:2KFoMH/P72qhZ/E4bI7ZuK79lCPE1zZM3/6WnrMOTH4=
97 | github.com/mewkiz/flac v1.0.11/go.mod h1:1UeXlFRJp4ft2mfZnPLRpQTd7cSjb/s17o7JQzzyrCA=
98 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 h1:tnAPMExbRERsyEYkmR1YjhTgDM0iqyiBYf8ojRXxdbA=
99 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14/go.mod h1:QYCFBiH5q6XTHEbWhR0uhR3M9qNPoD2CSQzr0g75kE4=
100 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
101 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
102 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
103 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
104 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
105 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
106 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
107 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
108 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
109 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
110 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
111 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
112 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
113 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115 h1:OEAIMYp5l9kJ2kT9UPL5QSUriKIIDhnLmpJTy69sltA=
114 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115/go.mod h1:AIVbkIe1G7fpFHiKOdxZnU5p9tFPYNTQyH3H5IrRkGw=
115 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
116 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
117 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
118 | go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
119 | go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
120 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg=
121 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0=
122 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk=
123 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw=
124 | go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
125 | go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo=
126 | go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI=
127 | go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco=
128 | go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw=
129 | go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg=
130 | go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI=
131 | go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU=
132 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
133 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
134 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
135 | golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30=
136 | golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
137 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
138 | golang.org/x/image v0.5.0/go.mod h1:FVC7BI/5Ym8R25iw5OLsgshdUBbT1h5jZTpA+mvAdZ4=
139 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
140 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
141 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
142 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
143 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
144 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
145 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
146 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
147 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
148 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
149 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
150 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
151 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
152 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
153 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
154 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
155 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
156 | golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
157 | golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
158 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
159 | golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
160 | golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
161 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
162 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
163 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
164 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
165 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
166 | golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
167 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
168 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
169 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
170 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
171 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
172 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
173 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
174 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
175 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
176 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
177 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
178 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
179 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
180 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
181 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
182 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
183 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
184 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
185 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
186 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
187 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
188 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
189 | golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
190 | golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
191 | golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
192 | golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
193 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
194 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
195 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
196 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
197 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
198 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
199 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
200 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
201 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
202 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
203 | google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo=
204 | google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk=
205 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
206 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
207 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
208 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
209 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
210 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls=
211 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M=
212 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc=
213 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c=
214 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk=
215 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
216 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
217 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
218 | google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
219 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
220 | google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
221 | google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
222 | google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
223 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
224 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
225 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
226 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
227 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
228 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
229 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
230 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
231 | google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
232 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
233 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
234 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
235 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
236 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
237 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
238 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
239 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
240 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
241 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
242 |
--------------------------------------------------------------------------------
/internal/audio/flac.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/mewkiz/flac"
7 | )
8 |
9 | type flacProcessor struct{}
10 |
11 | func (p *flacProcessor) process(filename string) (*audioInfo, error) {
12 | file, err := flac.Open(filename)
13 | if err != nil {
14 | return nil, fmt.Errorf("failed to open FLAC file: %w", err)
15 | }
16 | defer func(file *flac.Stream) {
17 | err := file.Close()
18 | if err != nil {
19 | fmt.Printf("failed to close FLAC file: %v\n", err)
20 | }
21 | }(file)
22 |
23 | return &audioInfo{
24 | sampleRate: int(file.Info.SampleRate),
25 | numChannels: int(file.Info.NChannels),
26 | format: "FLAC",
27 | }, nil
28 | }
29 |
--------------------------------------------------------------------------------
/internal/audio/flac_test.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
8 | func Test_flacProcessor_process(t *testing.T) {
9 | type args struct {
10 | filename string
11 | }
12 | tests := []struct {
13 | name string
14 | p *flacProcessor
15 | args args
16 | want *audioInfo
17 | wantErr bool
18 | }{
19 | {
20 | name: "success",
21 | p: &flacProcessor{},
22 | args: args{
23 | filename: getRootPath(t) + "/testdata/audio/test.flac",
24 | },
25 | want: &audioInfo{
26 | sampleRate: 96000,
27 | numChannels: 2,
28 | format: "FLAC",
29 | },
30 | wantErr: false,
31 | },
32 | {
33 | name: "file not found",
34 | p: &flacProcessor{},
35 | args: args{
36 | filename: getRootPath(t) + "/testdata/audio/test_new.flac",
37 | },
38 | want: nil,
39 | wantErr: true,
40 | },
41 | }
42 | for _, tt := range tests {
43 | t.Run(tt.name, func(t *testing.T) {
44 | p := &flacProcessor{}
45 | got, err := p.process(tt.args.filename)
46 | if (err != nil) != tt.wantErr {
47 | t.Errorf("flacProcessor.process() error = %v, wantErr %v", err, tt.wantErr)
48 | return
49 | }
50 | if !reflect.DeepEqual(got, tt.want) {
51 | t.Errorf("flacProcessor.process() = %v, want %v", got, tt.want)
52 | }
53 | })
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/internal/audio/mp3.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "fmt"
5 | "os"
6 |
7 | "github.com/amanitaverna/go-mp3"
8 | )
9 |
10 | type mp3Processor struct{}
11 |
12 | func (p *mp3Processor) process(filename string) (*audioInfo, error) {
13 | file, err := os.Open(filename)
14 | if err != nil {
15 | return nil, fmt.Errorf("failed to open MP3 file: %w", err)
16 | }
17 |
18 | defer func(file *os.File) {
19 | err := file.Close()
20 | if err != nil {
21 | fmt.Printf("failed to close MP3 file: %v\n", err)
22 | }
23 | }(file)
24 |
25 | decoder, err := mp3.NewDecoder(file)
26 | if err != nil {
27 | return nil, fmt.Errorf("failed to create MP3 decoder: %w", err)
28 | }
29 |
30 | return &audioInfo{
31 | sampleRate: decoder.SampleRate(),
32 | /*
33 | This is a terrible assumption but seeing as the MP3 decoder
34 | doesn't expose this information, we'll have to live with it for now.
35 | */
36 | numChannels: 2,
37 | format: "MP3",
38 | }, nil
39 | }
40 |
--------------------------------------------------------------------------------
/internal/audio/mp3_test.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
8 | func Test_mp3Processor_process(t *testing.T) {
9 | type args struct {
10 | filename string
11 | }
12 | tests := []struct {
13 | name string
14 | p *mp3Processor
15 | args args
16 | want *audioInfo
17 | wantErr bool
18 | }{
19 | {
20 | name: "success",
21 | p: &mp3Processor{},
22 | args: args{
23 | filename: getRootPath(t) + "/testdata/audio/test.mp3",
24 | },
25 | want: &audioInfo{
26 | sampleRate: 44100,
27 | numChannels: 2,
28 | format: "MP3",
29 | },
30 | wantErr: false,
31 | },
32 | {
33 | name: "file not found",
34 | p: &mp3Processor{},
35 | args: args{
36 | filename: getRootPath(t) + "/testdata/audio/test_new.mp3",
37 | },
38 | want: nil,
39 | wantErr: true,
40 | },
41 | {
42 | name: "invalid MP3 file",
43 | p: &mp3Processor{},
44 | args: args{
45 | filename: getRootPath(t) + "/testdata/audio/test.flac",
46 | },
47 | want: nil,
48 | wantErr: true,
49 | },
50 | }
51 | for _, tt := range tests {
52 | t.Run(tt.name, func(t *testing.T) {
53 | p := &mp3Processor{}
54 | got, err := p.process(tt.args.filename)
55 | if (err != nil) != tt.wantErr {
56 | t.Errorf("mp3Processor.process() error = %v, wantErr %v", err, tt.wantErr)
57 | return
58 | }
59 | if !reflect.DeepEqual(got, tt.want) {
60 | t.Errorf("mp3Processor.process() = %v, want %v", got, tt.want)
61 | }
62 | })
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/internal/audio/processor.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "fmt"
5 | "path/filepath"
6 | "strings"
7 |
8 | "cloud.google.com/go/speech/apiv1/speechpb"
9 | )
10 |
11 | var (
12 | defaultFactory = &defaultAudioProcessorFactory{}
13 | retriever = newAudioInfoRetriever(defaultFactory)
14 | )
15 |
16 | var encodingMap = map[string]speechpb.RecognitionConfig_AudioEncoding{
17 | "WAV": speechpb.RecognitionConfig_LINEAR16,
18 | "MP3": speechpb.RecognitionConfig_MP3,
19 | "FLAC": speechpb.RecognitionConfig_FLAC,
20 | }
21 |
22 | func (f *defaultAudioProcessorFactory) createProcessor(ext string) (audioProcessor, error) {
23 | switch strings.ToLower(ext) {
24 | case ".mp3":
25 | return &mp3Processor{}, nil
26 | case ".flac":
27 | return &flacProcessor{}, nil
28 | case ".wav":
29 | return &wavProcessor{}, nil
30 | default:
31 | return nil, fmt.Errorf("unsupported file format: %s", ext)
32 | }
33 | }
34 |
35 | func newAudioInfoRetriever(factory audioProcessorFactory) *audioInfoRetriever {
36 | return &audioInfoRetriever{
37 | factory: factory,
38 | }
39 | }
40 |
41 | func GetAudioInfo(filename string) (*speechpb.RecognitionConfig, error) {
42 | info, err := retriever.audioInfo(filename)
43 | if err != nil {
44 | return nil, err
45 | }
46 |
47 | return &speechpb.RecognitionConfig{
48 | Encoding: getEncoding(info.format),
49 | SampleRateHertz: int32(info.sampleRate),
50 | AudioChannelCount: int32(info.numChannels),
51 | }, nil
52 | }
53 |
54 | func (r *audioInfoRetriever) audioInfo(filename string) (*audioInfo, error) {
55 | ext := filepath.Ext(filename)
56 | processor, err := r.factory.createProcessor(ext)
57 | if err != nil {
58 | return nil, err
59 | }
60 | return processor.process(filename)
61 | }
62 |
63 | func getEncoding(format string) speechpb.RecognitionConfig_AudioEncoding {
64 | if encoding, ok := encodingMap[format]; ok {
65 | return encoding
66 | }
67 | return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
68 | }
69 |
--------------------------------------------------------------------------------
/internal/audio/processor_test.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "errors"
5 | "os"
6 | "path/filepath"
7 | "reflect"
8 | "testing"
9 |
10 | "cloud.google.com/go/speech/apiv1/speechpb"
11 | )
12 |
13 | type mockProcessor struct {
14 | info *audioInfo
15 | err error
16 | }
17 |
18 | func (m *mockProcessor) process(string) (*audioInfo, error) {
19 | return m.info, m.err
20 | }
21 |
22 | type mockFactory struct {
23 | processor audioProcessor
24 | err error
25 | }
26 |
27 | func (m *mockFactory) createProcessor(string) (audioProcessor, error) {
28 | return m.processor, m.err
29 | }
30 |
// getRootPath returns the directory two levels above the current working
// directory, which the tests use as the base for testdata paths. It fails the
// test immediately when the working directory cannot be determined.
func getRootPath(t *testing.T) string {
	t.Helper()

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatalf("getting current folder: %s", err)
	}

	parent := filepath.Dir(cwd)
	return filepath.Dir(parent)
}
40 |
41 | func Test_getEncoding(t *testing.T) {
42 | tests := []struct {
43 | name string
44 | format string
45 | want speechpb.RecognitionConfig_AudioEncoding
46 | }{
47 | {
48 | name: "WAV format",
49 | format: "WAV",
50 | want: speechpb.RecognitionConfig_LINEAR16,
51 | },
52 | {
53 | name: "MP3 format",
54 | format: "MP3",
55 | want: speechpb.RecognitionConfig_MP3,
56 | },
57 | {
58 | name: "FLAC format",
59 | format: "FLAC",
60 | want: speechpb.RecognitionConfig_FLAC,
61 | },
62 | {
63 | name: "Unsupported format",
64 | format: "AAC",
65 | want: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
66 | },
67 | }
68 |
69 | for _, tt := range tests {
70 | t.Run(tt.name, func(t *testing.T) {
71 | if got := getEncoding(tt.format); !reflect.DeepEqual(got, tt.want) {
72 | t.Errorf("getEncoding() = %v, want %v", got, tt.want)
73 | }
74 | })
75 | }
76 | }
77 |
78 | func Test_audioInfoRetriever_audioInfo(t *testing.T) {
79 | tests := []struct {
80 | name string
81 | factory audioProcessorFactory
82 | filename string
83 | want *audioInfo
84 | wantErr bool
85 | errMsg string
86 | }{
87 | {
88 | name: "MP3 file - successful processing",
89 | factory: &mockFactory{
90 | processor: &mockProcessor{
91 | info: &audioInfo{
92 | sampleRate: 44100,
93 | numChannels: 2,
94 | format: "MP3",
95 | },
96 | err: nil,
97 | },
98 | err: nil,
99 | },
100 | filename: "test.mp3",
101 | want: &audioInfo{
102 | sampleRate: 44100,
103 | numChannels: 2,
104 | format: "MP3",
105 | },
106 | wantErr: false,
107 | },
108 | {
109 | name: "FLAC file - successful processing",
110 | factory: &mockFactory{
111 | processor: &mockProcessor{
112 | info: &audioInfo{
113 | sampleRate: 96000,
114 | numChannels: 2,
115 | format: "FLAC",
116 | },
117 | err: nil,
118 | },
119 | err: nil,
120 | },
121 | filename: "test.flac",
122 | want: &audioInfo{
123 | sampleRate: 96000,
124 | numChannels: 2,
125 | format: "FLAC",
126 | },
127 | wantErr: false,
128 | },
129 | {
130 | name: "WAV file - processing error",
131 | factory: &mockFactory{
132 | processor: &mockProcessor{
133 | info: nil,
134 | err: errors.New("failed to process WAV file"),
135 | },
136 | err: nil,
137 | },
138 | filename: "test.wav",
139 | want: nil,
140 | wantErr: true,
141 | errMsg: "failed to process WAV file",
142 | },
143 | {
144 | name: "Unsupported file format",
145 | factory: &mockFactory{
146 | processor: nil,
147 | err: errors.New("unsupported file format: .aac"),
148 | },
149 | filename: "test.aac",
150 | want: nil,
151 | wantErr: true,
152 | errMsg: "unsupported file format: .aac",
153 | },
154 | }
155 |
156 | for _, tt := range tests {
157 | t.Run(tt.name, func(t *testing.T) {
158 | r := newAudioInfoRetriever(tt.factory)
159 | got, err := r.audioInfo(tt.filename)
160 | if (err != nil) != tt.wantErr {
161 | t.Errorf("audioInfoRetriever.audioInfo() error = %v, wantErr %v", err, tt.wantErr)
162 | return
163 | }
164 | if err != nil && err.Error() != tt.errMsg {
165 | t.Errorf("audioInfoRetriever.audioInfo() error = %v, expected error %v", err, tt.errMsg)
166 | }
167 | if !reflect.DeepEqual(got, tt.want) {
168 | t.Errorf("audioInfoRetriever.audioInfo() = %v, want %v", got, tt.want)
169 | }
170 | })
171 | }
172 | }
173 |
174 | func Test_newAudioInfoRetriever(t *testing.T) {
175 | factory := &defaultAudioProcessorFactory{}
176 | tests := []struct {
177 | name string
178 | want *audioInfoRetriever
179 | }{
180 | {
181 | name: "Create new audioInfoRetriever",
182 | want: &audioInfoRetriever{
183 | factory: factory,
184 | },
185 | },
186 | }
187 | for _, tt := range tests {
188 | t.Run(tt.name, func(t *testing.T) {
189 | if got := newAudioInfoRetriever(factory); !reflect.DeepEqual(got, tt.want) {
190 | t.Errorf("newAudioInfoRetriever() = %v, want %v", got, tt.want)
191 | }
192 | })
193 | }
194 | }
195 |
196 | func Test_getAudioInfo(t *testing.T) {
197 | type args struct {
198 | filename string
199 | }
200 | tests := []struct {
201 | name string
202 | args args
203 | want *speechpb.RecognitionConfig
204 | wantErr bool
205 | }{
206 | {
207 | name: "MP3 file",
208 | args: args{
209 | filename: getRootPath(t) + "/testdata/audio/test.mp3",
210 | },
211 | want: &speechpb.RecognitionConfig{
212 | Encoding: speechpb.RecognitionConfig_MP3,
213 | SampleRateHertz: 44100,
214 | AudioChannelCount: 2,
215 | },
216 | wantErr: false,
217 | },
218 | {
219 | name: "FLAC file",
220 | args: args{
221 | filename: getRootPath(t) + "/testdata/audio/test.flac",
222 | },
223 | want: &speechpb.RecognitionConfig{
224 | Encoding: speechpb.RecognitionConfig_FLAC,
225 | SampleRateHertz: 96000,
226 | AudioChannelCount: 2,
227 | },
228 | wantErr: false,
229 | },
230 | {
231 | name: "WAV file",
232 | args: args{
233 | filename: getRootPath(t) + "/testdata/audio/test.wav",
234 | },
235 | want: &speechpb.RecognitionConfig{
236 | Encoding: speechpb.RecognitionConfig_LINEAR16,
237 | SampleRateHertz: 44100,
238 | AudioChannelCount: 2,
239 | },
240 | wantErr: false,
241 | },
242 | {
243 | name: "Unsupported file format",
244 | args: args{
245 | filename: getRootPath(t) + "/testdata/audio/test.ogg",
246 | },
247 | want: nil,
248 | wantErr: true,
249 | },
250 | }
251 | for _, tt := range tests {
252 | t.Run(tt.name, func(t *testing.T) {
253 | got, err := GetAudioInfo(tt.args.filename)
254 | if (err != nil) != tt.wantErr {
255 | t.Errorf("getAudioInfo() error = %v, wantErr %v", err, tt.wantErr)
256 | return
257 | }
258 | if !reflect.DeepEqual(got, tt.want) {
259 | t.Errorf("getAudioInfo() = %v, want %v", got, tt.want)
260 | }
261 | })
262 | }
263 | }
264 |
--------------------------------------------------------------------------------
/internal/audio/types.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
type (
	// audioInfo carries the metadata a processor reports for one audio file:
	// its sample rate, its channel count and a format label such as "WAV".
	audioInfo struct {
		sampleRate  int
		numChannels int
		format      string
	}

	// audioProcessor inspects the file at filename and returns its audioInfo,
	// or an error when the file cannot be processed.
	audioProcessor interface {
		process(filename string) (*audioInfo, error)
	}

	// audioProcessorFactory hands out the audioProcessor to use for a given
	// file extension, or an error when it has none for that extension.
	audioProcessorFactory interface {
		createProcessor(fileExtension string) (audioProcessor, error)
	}

	// defaultAudioProcessorFactory is a stateless factory type; its methods
	// are declared outside this file.
	defaultAudioProcessorFactory struct{}

	// audioInfoRetriever holds the factory used to look up processors.
	audioInfoRetriever struct {
		factory audioProcessorFactory
	}
)
22 |
--------------------------------------------------------------------------------
/internal/audio/wav.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "errors"
5 | "fmt"
6 | "os"
7 |
8 | "github.com/go-audio/wav"
9 | )
10 |
11 | type wavProcessor struct{}
12 |
13 | func (p *wavProcessor) process(filename string) (*audioInfo, error) {
14 | file, err := os.Open(filename)
15 | if err != nil {
16 | return nil, fmt.Errorf("failed to open WAV file: %w", err)
17 | }
18 | defer func(file *os.File) {
19 | err := file.Close()
20 | if err != nil {
21 | fmt.Printf("failed to close WAV file: %v\n", err)
22 | }
23 | }(file)
24 |
25 | decoder := wav.NewDecoder(file)
26 | if !decoder.IsValidFile() {
27 | return nil, errors.New("invalid WAV file")
28 | }
29 |
30 | return &audioInfo{
31 | sampleRate: int(decoder.SampleRate),
32 | numChannels: int(decoder.NumChans),
33 | format: "WAV",
34 | }, nil
35 | }
36 |
--------------------------------------------------------------------------------
/internal/audio/wav_test.go:
--------------------------------------------------------------------------------
1 | package audio
2 |
3 | import (
4 | "reflect"
5 | "testing"
6 | )
7 |
8 | func Test_wavProcessor_process(t *testing.T) {
9 | type args struct {
10 | filename string
11 | }
12 | tests := []struct {
13 | name string
14 | p *wavProcessor
15 | args args
16 | want *audioInfo
17 | wantErr bool
18 | }{
19 | {
20 | name: "success",
21 | p: &wavProcessor{},
22 | args: args{
23 | filename: getRootPath(t) + "/testdata/audio/test.wav",
24 | },
25 | want: &audioInfo{
26 | sampleRate: 44100,
27 | numChannels: 2,
28 | format: "WAV",
29 | },
30 | },
31 | {
32 | name: "file not found",
33 | p: &wavProcessor{},
34 | args: args{
35 | filename: getRootPath(t) + "/testdata/audio/test_new.wav",
36 | },
37 | want: nil,
38 | wantErr: true,
39 | },
40 | {
41 | name: "invalid WAV file",
42 | p: &wavProcessor{},
43 | args: args{
44 | filename: getRootPath(t) + "/testdata/audio/test.flac",
45 | },
46 | want: nil,
47 | wantErr: true,
48 | },
49 | }
50 | for _, tt := range tests {
51 | t.Run(tt.name, func(t *testing.T) {
52 | p := &wavProcessor{}
53 | got, err := p.process(tt.args.filename)
54 | if (err != nil) != tt.wantErr {
55 | t.Errorf("wavProcessor.process() error = %v, wantErr %v", err, tt.wantErr)
56 | return
57 | }
58 | if !reflect.DeepEqual(got, tt.want) {
59 | t.Errorf("wavProcessor.process() = %v, want %v", got, tt.want)
60 | }
61 | })
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/internal/common/types.go:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import (
4 | "time"
5 | )
6 |
// TranscribeOptions collects the settings a caller supplies for a
// transcription request.
//
// NOTE(review): the per-field remarks are inferred from the field names only;
// confirm them against the transcribe package before relying on them.
type TranscribeOptions struct {
	// CredentialsJSON holds raw credentials as JSON bytes.
	CredentialsJSON []byte

	// Bucket and LanguageCode are presumably a storage bucket name and the
	// language code of the audio.
	Bucket, LanguageCode string

	// EnableDiarization presumably switches speaker diarization on, with
	// MinSpeakers and MaxSpeakers bounding the expected speaker count.
	EnableDiarization bool

	MinSpeakers, MaxSpeakers int

	// CleanupOnComplete and UseWhisper are plain switches; presumably the
	// latter selects the Whisper backend configured by the Whisper* fields.
	CleanupOnComplete, UseWhisper bool

	WhisperAPIKey, WhisperModel, WhisperPrompt string
}
20 |
// Config groups request-related settings: a user agent, retry and delay
// values, a proxy list, rate limiting values and a robots.txt switch.
//
// NOTE(review): how each field is consumed is not visible in this file;
// confirm against the code that reads Config.
type Config struct {
	UserAgent  string
	RetryLimit int

	RetryDelay, CrawlDelay time.Duration

	ProxyList []string

	RateLimit time.Duration
	RateBurst int

	IgnoreRobotsTxt bool
}
31 |
// Chunk is one piece of extracted text together with the source it came from.
// The processors in this repository set Source to the URL they were given
// (the PDF processor appends "#page=N").
type Chunk struct {
	Content, Source string
}
36 |
--------------------------------------------------------------------------------
/internal/document/docx.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "io"
7 | "strings"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | "github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 |
13 | func processDocxContent(r io.Reader) ([]string, error) {
14 | data, err := io.ReadAll(r)
15 | if err != nil {
16 | return nil, err
17 | }
18 |
19 | zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
20 | if err != nil {
21 | return nil, err
22 | }
23 |
24 | var contents []string
25 |
26 | for _, file := range zipReader.File {
27 | if file.Name == "word/document.xml" {
28 | contents, err = utils.ExtractTextFromXML(file)
29 | if err != nil {
30 | return nil, err
31 | }
32 | break
33 | }
34 | }
35 |
36 | var allContent strings.Builder
37 | for _, content := range contents {
38 | allContent.WriteString(content)
39 | allContent.WriteString(" ")
40 | }
41 |
42 | return []string{allContent.String()}, nil
43 |
44 | /*
45 | // In the event we just want chunks we can just return contents
46 | return contents, nil
47 | */
48 | }
49 |
50 | func ProcessDocx(r io.Reader, url string) ([]common.Chunk, error) {
51 | content, err := processDocxContent(r)
52 | if err != nil {
53 | return nil, err
54 | }
55 |
56 | var chunks []common.Chunk
57 | for _, chunk := range content {
58 | if strings.TrimSpace(string(chunk)) != "" {
59 | chunks = append(chunks, common.Chunk{Content: string(chunk), Source: url})
60 | }
61 | }
62 |
63 | return chunks, nil
64 | }
65 |
--------------------------------------------------------------------------------
/internal/document/docx_test.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "errors"
7 | "io"
8 | "reflect"
9 | "testing"
10 |
11 | "github.com/mmatongo/chew/v1/internal/common"
12 | )
13 |
// errMockRead is the error returned by every errorReader.Read call.
var errMockRead = errors.New("mock read error")

// errorReader is a reader whose Read always fails, used to exercise read-error
// paths.
type errorReader struct{}

// Read reports zero bytes read together with errMockRead.
func (*errorReader) Read([]byte) (int, error) {
	return 0, errMockRead
}
21 |
// createDocxWithContent builds an in-memory zip archive whose only entry,
// word/document.xml, contains content, and returns a reader over the archive
// bytes. Errors from the zip writer are deliberately ignored: this is a test
// fixture writing to a memory buffer.
func createDocxWithContent(content string) io.Reader {
	var archive bytes.Buffer

	zw := zip.NewWriter(&archive)
	entry, _ := zw.Create("word/document.xml")
	_, _ = entry.Write([]byte(content))
	_ = zw.Close()

	return bytes.NewReader(archive.Bytes())
}
30 |
31 | func createEmptyDocx() io.Reader {
32 | return createDocxWithContent(``)
33 | }
34 |
35 | func createSingleParagraphDocx(content string) io.Reader {
36 | return createDocxWithContent(`` + content + `
`)
37 | }
38 |
39 | func TestProcessDocx(t *testing.T) {
40 | type args struct {
41 | r io.Reader
42 | url string
43 | }
44 | tests := []struct {
45 | name string
46 | args args
47 | want []common.Chunk
48 | wantErr bool
49 | }{
50 | {
51 | name: "Empty docx file",
52 | args: args{
53 | r: createEmptyDocx(),
54 | url: "http://example.com",
55 | },
56 | want: nil,
57 | wantErr: false,
58 | },
59 | {
60 | name: "Single paragraph docx file",
61 | args: args{
62 | r: createSingleParagraphDocx("Hello from chew!"),
63 | url: "http://example.com",
64 | },
65 | want: []common.Chunk{
66 | {
67 | Content: "Hello from chew! ",
68 | Source: "http://example.com",
69 | },
70 | },
71 | wantErr: false,
72 | },
73 | }
74 | for _, tt := range tests {
75 | t.Run(tt.name, func(t *testing.T) {
76 | got, err := ProcessDocx(tt.args.r, tt.args.url)
77 | if (err != nil) != tt.wantErr {
78 | t.Errorf("ProcessDocx() error = %v, wantErr %v", err, tt.wantErr)
79 | return
80 | }
81 | if !reflect.DeepEqual(got, tt.want) {
82 | t.Errorf("ProcessDocx() = %v, want %v", got, tt.want)
83 | }
84 | })
85 | }
86 | }
87 |
88 | func TestProcessDocx_Error_ReadAll(t *testing.T) {
89 | _, err := processPptxContent(&errorReader{})
90 | if err == nil {
91 | t.Error("ProcessDocx() did not return an error, but one was expected")
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/internal/document/epub.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "strings"
8 |
9 | "github.com/PuerkitoBio/goquery"
10 | "github.com/mmatongo/chew/v1/internal/common"
11 | "github.com/taylorskalyo/goreader/epub"
12 | )
13 |
14 | func processEpubContent(r io.Reader) ([]common.Chunk, error) {
15 | content, err := io.ReadAll(r)
16 | if err != nil {
17 | return nil, fmt.Errorf("failed to read EPUB content: %w", err)
18 | }
19 |
20 | reader, err := epub.NewReader(bytes.NewReader(content), int64(len(content)))
21 | if err != nil {
22 | return nil, fmt.Errorf("failed to create EPUB reader: %w", err)
23 | }
24 |
25 | if len(reader.Rootfiles) == 0 {
26 | return nil, fmt.Errorf("EPUB contains no content")
27 | }
28 |
29 | contents := reader.Rootfiles[0]
30 | var chunks []common.Chunk
31 |
32 | for _, item := range contents.Manifest.Items {
33 | if !strings.HasSuffix(item.HREF, ".xhtml") && !strings.HasSuffix(item.HREF, ".html") {
34 | continue
35 | }
36 |
37 | file, err := item.Open()
38 | if err != nil {
39 | return nil, fmt.Errorf("failed to open item %s: %w", item.HREF, err)
40 | }
41 |
42 | text, err := extractTextFromHTML(file)
43 | file.Close()
44 | if err != nil {
45 | return nil, fmt.Errorf("failed to extract text from %s: %w", item.HREF, err)
46 | }
47 |
48 | text = strings.TrimSpace(text)
49 | if text == "" {
50 | continue
51 | }
52 | chunks = append(chunks, common.Chunk{Content: text, Source: item.HREF})
53 | }
54 |
55 | return chunks, nil
56 | }
57 |
58 | func ProcessEpub(r io.Reader, url string) ([]common.Chunk, error) {
59 | chunks, err := processEpubContent(r)
60 | if err != nil {
61 | return nil, err
62 | }
63 |
64 | for i := range chunks {
65 | chunks[i].Source = url
66 | }
67 |
68 | return chunks, nil
69 | }
70 |
71 | func extractTextFromHTML(r io.Reader) (string, error) {
72 | doc, err := goquery.NewDocumentFromReader(r)
73 | if err != nil {
74 | return "", err
75 | }
76 |
77 | doc.Find("script, style,nav, header, footer").Remove()
78 |
79 | var buf strings.Builder
80 | /*
81 | We're only interested in the text content of the HTML document
82 | however this is a very naive approach and might not work well
83 | for all HTML documents unfortunately.
84 | This is a known issue and I'm working on a better solution.
85 | see: https://github.com/mmatongo/chew/issues/22
86 |
87 | TODO: Allow users to specify a CSS selector to extract text from
88 | */
89 | doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) {
90 | buf.WriteString(strings.TrimSpace(s.Text()))
91 | buf.WriteString("\n\n")
92 | })
93 |
94 | return strings.TrimSpace(buf.String()), nil
95 | }
96 |
--------------------------------------------------------------------------------
/internal/document/epub_test.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "io"
5 | "os"
6 | "path/filepath"
7 | "reflect"
8 | "strings"
9 | "testing"
10 |
11 | "github.com/mmatongo/chew/v1/internal/common"
12 | "github.com/mmatongo/chew/v1/internal/utils"
13 | )
14 |
15 | func Test_processEpubContent(t *testing.T) {
16 | type args struct {
17 | r io.Reader
18 | }
19 | tests := []struct {
20 | name string
21 | args args
22 | want []common.Chunk
23 | wantErr bool
24 | }{
25 | {
26 | name: "success",
27 | args: args{
28 | r: func() io.Reader {
29 | f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub"))
30 | return f
31 | }(),
32 | },
33 | want: []common.Chunk{
34 | {
35 | Content: "A pdf for testing",
36 | Source: "index.html",
37 | },
38 | },
39 | wantErr: false,
40 | },
41 | }
42 | for _, tt := range tests {
43 | t.Run(tt.name, func(t *testing.T) {
44 | got, err := processEpubContent(tt.args.r)
45 | if (err != nil) != tt.wantErr {
46 | t.Errorf("processEpubContent() error = %v, wantErr %v", err, tt.wantErr)
47 | return
48 | }
49 | if !reflect.DeepEqual(got, tt.want) {
50 | t.Errorf("processEpubContent() = %v, want %v", got, tt.want)
51 | }
52 | })
53 | }
54 | }
55 |
56 | func TestProcessEpub(t *testing.T) {
57 | type args struct {
58 | r io.Reader
59 | url string
60 | }
61 | tests := []struct {
62 | name string
63 | args args
64 | want []common.Chunk
65 | wantErr bool
66 | }{
67 | {
68 | name: "success",
69 | args: args{
70 | r: func() io.Reader {
71 | f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub"))
72 | return f
73 | }(),
74 | url: "https://example.com/test.epub",
75 | },
76 | want: []common.Chunk{
77 | {
78 | Content: "A pdf for testing",
79 | Source: "https://example.com/test.epub",
80 | },
81 | },
82 | wantErr: false,
83 | },
84 | {
85 | name: "error",
86 | args: args{
87 | r: strings.NewReader("key: value, key2: value2"),
88 | url: "https://example.com/data.yaml",
89 | },
90 | want: nil,
91 | wantErr: true,
92 | },
93 | {
94 | name: "empty",
95 | args: args{
96 | r: strings.NewReader(""),
97 | url: "https://example.com",
98 | },
99 | want: nil,
100 | wantErr: true,
101 | },
102 | {
103 | name: "unreadable",
104 | args: args{
105 | r: func() io.Reader { f, _ := os.Open("nonexistent.epub"); return f }(),
106 | url: "https://example.com/nonexistent.epub",
107 | },
108 | want: nil,
109 | wantErr: true,
110 | },
111 | }
112 | for _, tt := range tests {
113 | t.Run(tt.name, func(t *testing.T) {
114 | got, err := ProcessEpub(tt.args.r, tt.args.url)
115 | if (err != nil) != tt.wantErr {
116 | t.Errorf("ProcessEpub() error = %v, wantErr %v", err, tt.wantErr)
117 | return
118 | }
119 | if !reflect.DeepEqual(got, tt.want) {
120 | t.Errorf("ProcessEpub() = %v, want %v", got, tt.want)
121 | }
122 | })
123 | }
124 | }
125 |
126 | func Test_extractTextFromHTML(t *testing.T) {
127 | file, _ := utils.OpenFile("testdata/invalid.html")
128 | type args struct {
129 | r io.Reader
130 | }
131 | tests := []struct {
132 | name string
133 | args args
134 | want string
135 | wantErr bool
136 | }{
137 | {
138 | name: "success",
139 | args: args{
140 | r: strings.NewReader("some content
"),
141 | },
142 | want: "some content",
143 | wantErr: false,
144 | },
145 | {
146 | name: "error",
147 | args: args{
148 | r: file,
149 | },
150 | want: "",
151 | wantErr: true,
152 | },
153 | }
154 | for _, tt := range tests {
155 | t.Run(tt.name, func(t *testing.T) {
156 | got, err := extractTextFromHTML(tt.args.r)
157 | if (err != nil) != tt.wantErr {
158 | t.Errorf("extractTextFromHTML() error = %v, wantErr %v", err, tt.wantErr)
159 | return
160 | }
161 | if got != tt.want {
162 | t.Errorf("extractTextFromHTML() = %v, want %v", got, tt.want)
163 | }
164 | })
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/internal/document/pdf.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "log"
8 | "strings"
9 |
10 | "github.com/ledongthuc/pdf"
11 | "github.com/mmatongo/chew/v1/internal/common"
12 | )
13 |
14 | func ProcessPDF(r io.Reader, url string) ([]common.Chunk, error) {
15 | pdfData, err := io.ReadAll(r)
16 | if err != nil {
17 | return nil, err
18 | }
19 |
20 | f, err := pdf.NewReader(bytes.NewReader(pdfData), int64(len(pdfData)))
21 | if err != nil {
22 | return nil, err
23 | }
24 |
25 | var chunks []common.Chunk
26 | for i := 1; i <= f.NumPage(); i++ {
27 | p := f.Page(i)
28 | if p.V.IsNull() {
29 | continue
30 | }
31 | text, err := p.GetPlainText(nil)
32 | if err != nil {
33 | log.Printf("Error extracting text from page %d: %v\n\n", i, err)
34 | continue
35 | }
36 |
37 | text = strings.TrimSpace(text)
38 | text = strings.ReplaceAll(text, "\n", "\n\n")
39 |
40 | chunks = append(chunks, common.Chunk{
41 | Content: text,
42 | Source: fmt.Sprintf("%s#page=%d", url, i),
43 | })
44 | }
45 |
46 | if len(chunks) == 0 {
47 | return nil, err
48 | }
49 |
50 | return chunks, nil
51 | }
52 |
--------------------------------------------------------------------------------
/internal/document/pdf_test.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "io"
5 | "os"
6 | "path/filepath"
7 | "reflect"
8 | "strings"
9 | "testing"
10 |
11 | "github.com/mmatongo/chew/v1/internal/common"
12 | )
13 |
// getRootPath returns the directory two levels above the current working
// directory. It fails the test immediately if the working directory cannot be
// determined.
func getRootPath(t *testing.T) string {
	t.Helper()

	wd, err := os.Getwd()
	if err != nil {
		t.Fatalf("getting current folder: %s", err)
	}

	parent := filepath.Dir(wd)
	return filepath.Dir(parent)
}
23 |
24 | func TestProcessPDF(t *testing.T) {
25 | type args struct {
26 | r io.Reader
27 | url string
28 | }
29 | tests := []struct {
30 | name string
31 | args args
32 | want []common.Chunk
33 | wantErr bool
34 | }{
35 | {
36 | name: "success",
37 | args: args{
38 | r: func() io.Reader {
39 | f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.pdf"))
40 | return f
41 | }(),
42 | url: "https://example.com/test.pdf",
43 | },
44 | want: []common.Chunk{
45 | {
46 | Content: "Apdffortesting",
47 | Source: "https://example.com/test.pdf#page=1",
48 | },
49 | },
50 | wantErr: false,
51 | },
52 | {
53 | name: "error",
54 | args: args{
55 | r: strings.NewReader("key: value, key2: value2"),
56 | url: "https://example.com/data.yaml",
57 | },
58 | want: nil,
59 | wantErr: true,
60 | },
61 | {
62 | name: "empty",
63 | args: args{
64 | r: strings.NewReader(""),
65 | url: "https://example.com",
66 | },
67 | want: nil,
68 | wantErr: true,
69 | },
70 | {
71 | name: "unreadable",
72 | args: args{
73 | r: func() io.Reader { f, _ := os.Open("nonexistent.pdf"); return f }(),
74 | url: "https://example.com/nonexistent.pdf",
75 | },
76 | want: nil,
77 | wantErr: true,
78 | },
79 | }
80 | for _, tt := range tests {
81 | t.Run(tt.name, func(t *testing.T) {
82 | got, err := ProcessPDF(tt.args.r, tt.args.url)
83 | if (err != nil) != tt.wantErr {
84 | t.Errorf("ProcessPDF() error = %v, wantErr %v", err, tt.wantErr)
85 | return
86 | }
87 | if !reflect.DeepEqual(got, tt.want) {
88 | t.Errorf("ProcessPDF() = %v, want %v", got, tt.want)
89 | }
90 | })
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/internal/document/pptx.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "io"
7 | "strings"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | "github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 |
13 | func processPptxContent(r io.Reader) ([]string, error) {
14 | data, err := io.ReadAll(r)
15 | if err != nil {
16 | return nil, err
17 | }
18 |
19 | zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
20 | if err != nil {
21 | return nil, err
22 | }
23 |
24 | var contents []string
25 |
26 | for _, file := range zipReader.File {
27 | if strings.HasPrefix(file.Name, "ppt/slides/") {
28 | slideText, err := utils.ExtractTextFromXML(file)
29 | if err != nil {
30 | return nil, err
31 | }
32 | contents = append(contents, slideText...)
33 | }
34 | }
35 |
36 | var allContent strings.Builder
37 | for _, content := range contents {
38 | allContent.WriteString(content)
39 | allContent.WriteString(" ")
40 | }
41 |
42 | return []string{allContent.String()}, nil
43 |
44 | /*
45 | // In the event we just want chunks we can just return contents
46 | return contents, nil
47 | */
48 | }
49 |
50 | func ProcessPptx(r io.Reader, url string) ([]common.Chunk, error) {
51 | content, err := processPptxContent(r)
52 | if err != nil {
53 | return nil, err
54 | }
55 |
56 | var chunks []common.Chunk
57 | for _, chunk := range content {
58 | if strings.TrimSpace(string(chunk)) != "" {
59 | chunks = append(chunks, common.Chunk{Content: string(chunk), Source: url})
60 | }
61 | }
62 |
63 | return chunks, nil
64 | }
65 |
--------------------------------------------------------------------------------
/internal/document/pptx_test.go:
--------------------------------------------------------------------------------
1 | package document
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "io"
7 | "reflect"
8 | "testing"
9 |
10 | "github.com/mmatongo/chew/v1/internal/common"
11 | )
12 |
// createPptxWithContent builds an in-memory zip archive whose only entry,
// ppt/slides/slide1.xml, contains content, and returns a reader over the
// archive bytes. Errors from the zip writer are deliberately ignored: this is
// a test fixture writing to a memory buffer.
func createPptxWithContent(content string) io.Reader {
	var archive bytes.Buffer

	zw := zip.NewWriter(&archive)
	slide, _ := zw.Create("ppt/slides/slide1.xml")
	_, _ = slide.Write([]byte(content))
	_ = zw.Close()

	return bytes.NewReader(archive.Bytes())
}
21 |
22 | func createEmptyPptx() io.Reader {
23 | return createPptxWithContent(``)
24 | }
25 |
26 | func createSingleParagraphPptx(content string) io.Reader {
27 | return createPptxWithContent(`` + content + `
`)
28 | }
29 |
30 | func TestProcessPptx(t *testing.T) {
31 | type args struct {
32 | r io.Reader
33 | url string
34 | }
35 | tests := []struct {
36 | name string
37 | args args
38 | want []common.Chunk
39 | wantErr bool
40 | }{
41 | {
42 | name: "Empty pptx file",
43 | args: args{r: createEmptyPptx(), url: "http://example.com"},
44 | want: nil,
45 | wantErr: false,
46 | },
47 | {
48 | name: "Single paragraph pptx file",
49 | args: args{r: createSingleParagraphPptx("Hello from chew!"), url: "http://example.com"},
50 | want: []common.Chunk{{Content: "Hello from chew! ", Source: "http://example.com"}},
51 | wantErr: false,
52 | },
53 | }
54 | for _, tt := range tests {
55 | t.Run(tt.name, func(t *testing.T) {
56 | got, err := ProcessPptx(tt.args.r, tt.args.url)
57 | if (err != nil) != tt.wantErr {
58 | t.Errorf("ProcessPptx() error = %v, wantErr %v", err, tt.wantErr)
59 | return
60 | }
61 | if !reflect.DeepEqual(got, tt.want) {
62 | t.Errorf("ProcessPptx() = %v, want %v", got, tt.want)
63 | }
64 | })
65 | }
66 | }
67 |
68 | func TestProcessPptx_Error_ReadAll(t *testing.T) {
69 | _, err := processPptxContent(&errorReader{})
70 | if err == nil {
71 | t.Error("ProcessPptx() did not return an error, but one was expected")
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/internal/text/csv.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "encoding/csv"
5 | "io"
6 | "strings"
7 |
8 | "github.com/mmatongo/chew/v1/internal/common"
9 | )
10 |
11 | func ProcessCSV(r io.Reader, url string) ([]common.Chunk, error) {
12 | csvReader := csv.NewReader(r)
13 | var records [][]string
14 | var err error
15 |
16 | records, err = csvReader.ReadAll()
17 | if err != nil {
18 | return nil, err
19 | }
20 |
21 | var chunks []common.Chunk
22 | for _, record := range records {
23 | chunks = append(chunks, common.Chunk{Content: strings.Join(record, ", "), Source: url})
24 | }
25 |
26 | return chunks, nil
27 | }
28 |
--------------------------------------------------------------------------------
/internal/text/csv_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | )
11 |
12 | func TestProcessCSV(t *testing.T) {
13 | type args struct {
14 | r io.Reader
15 | url string
16 | }
17 | tests := []struct {
18 | name string
19 | args args
20 | want []common.Chunk
21 | wantErr bool
22 | }{
23 | {
24 | name: "success",
25 | args: args{
26 | r: strings.NewReader("Test content"),
27 | url: "https://example.com",
28 | },
29 | want: []common.Chunk{{
30 | Content: "Test content",
31 | Source: "https://example.com",
32 | }},
33 | wantErr: false,
34 | },
35 | {
36 | name: "empty",
37 | args: args{
38 | r: strings.NewReader(""),
39 | url: "https://example.com",
40 | },
41 | want: nil,
42 | wantErr: false,
43 | },
44 | {
45 | name: "CSV with quoted fields",
46 | args: args{
47 | r: strings.NewReader("\"header 1\",\"header 2\"\n\"value, with comma\",\"value2\""),
48 | url: "https://example.com/quoted.csv",
49 | },
50 | want: []common.Chunk{
51 | {Content: "header 1, header 2", Source: "https://example.com/quoted.csv"},
52 | {Content: "value, with comma, value2", Source: "https://example.com/quoted.csv"},
53 | },
54 | wantErr: false,
55 | },
56 | }
57 | for _, tt := range tests {
58 | t.Run(tt.name, func(t *testing.T) {
59 | got, err := ProcessCSV(tt.args.r, tt.args.url)
60 | if (err != nil) != tt.wantErr {
61 | t.Errorf("ProcessCSV() error = %v, wantErr %v", err, tt.wantErr)
62 | return
63 | }
64 | if !reflect.DeepEqual(got, tt.want) {
65 | t.Errorf("ProcessCSV() = %v, want %v", got, tt.want)
66 | }
67 | })
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/internal/text/html.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "strings"
7 |
8 | "github.com/PuerkitoBio/goquery"
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | )
11 |
12 | func ProcessHTML(r io.Reader, url string) ([]common.Chunk, error) {
13 | doc, err := goquery.NewDocumentFromReader(r)
14 | if err != nil {
15 | return nil, fmt.Errorf("failed to parse HTML: %w", err)
16 | }
17 |
18 | var chunks []common.Chunk
19 | /*
20 | We're only interested in the text content of the HTML document
21 | so we're going to ignore the tags that don't contain useful text.
22 | This is a very naive approach and might not work for all HTML documents unfortunately
23 | */
24 |
25 | doc.Find("nav, header, footer").Remove()
26 |
27 | doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) {
28 | text := strings.TrimSpace(s.Text())
29 | if text != "" {
30 | chunks = append(chunks, common.Chunk{Content: text, Source: url})
31 | }
32 | })
33 |
34 | return chunks, nil
35 | }
36 |
--------------------------------------------------------------------------------
/internal/text/html_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | "github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 |
13 | func TestProcessHTML(t *testing.T) {
14 | file, _ := utils.OpenFile("testdata/invalid.html")
15 | type args struct {
16 | r io.Reader
17 | url string
18 | }
19 | tests := []struct {
20 | name string
21 | args args
22 | want []common.Chunk
23 | wantErr bool
24 | }{
25 | {
26 | name: "success",
27 | args: args{
28 | r: strings.NewReader(`
29 |
30 |
31 |
32 | Test HTML
33 |
34 |
35 | Test content
36 | This is a test paragraph.
37 |
38 |
39 | `),
40 | url: "https://example.com/page.html",
41 | },
42 | want: []common.Chunk{
43 | {
44 | Content: "Test content",
45 | Source: "https://example.com/page.html",
46 | },
47 | {
48 | Content: "This is a test paragraph.",
49 | Source: "https://example.com/page.html",
50 | },
51 | },
52 | wantErr: false,
53 | },
54 | {
55 | name: "empty",
56 | args: args{
57 | r: strings.NewReader(""),
58 | url: "https://example.com",
59 | },
60 | want: nil,
61 | wantErr: false,
62 | },
63 | {
64 | name: "invalid content as a reader",
65 | args: args{
66 | r: file,
67 | url: "https://example.com",
68 | },
69 | want: nil,
70 | wantErr: true,
71 | },
72 | }
73 | for _, tt := range tests {
74 | t.Run(tt.name, func(t *testing.T) {
75 | got, err := ProcessHTML(tt.args.r, tt.args.url)
76 | if (err != nil) != tt.wantErr {
77 | t.Errorf("processHTML() error = %v, wantErr %v", err, tt.wantErr)
78 | return
79 | }
80 | if !reflect.DeepEqual(got, tt.want) {
81 | t.Errorf("processHTML() = %v, want %v", got, tt.want)
82 | }
83 | })
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/internal/text/json.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io"
7 |
8 | "github.com/mmatongo/chew/v1/internal/common"
9 | )
10 |
11 | func ProcessJSON(r io.Reader, url string) ([]common.Chunk, error) {
12 | var data interface{}
13 | if err := json.NewDecoder(r).Decode(&data); err != nil {
14 | return nil, err
15 | }
16 |
17 | jsonStr, err := json.MarshalIndent(data, "", " ")
18 | if err != nil {
19 | return nil, fmt.Errorf("failed to marshal json: %w", err)
20 | }
21 |
22 | return []common.Chunk{{Content: string(jsonStr), Source: url}}, nil
23 | }
24 |
--------------------------------------------------------------------------------
/internal/text/json_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | )
11 |
12 | func TestProcessJSON(t *testing.T) {
13 | type args struct {
14 | r io.Reader
15 | url string
16 | }
17 | tests := []struct {
18 | name string
19 | args args
20 | want []common.Chunk
21 | wantErr bool
22 | }{
23 | {
24 | name: "success",
25 | args: args{
26 | r: strings.NewReader(`{"key": "value"}`),
27 | url: "https://example.com/data.json",
28 | },
29 | want: []common.Chunk{{
30 | Content: "{\n \"key\": \"value\"\n}",
31 | Source: "https://example.com/data.json",
32 | }},
33 | wantErr: false,
34 | },
35 | {
36 | name: "empty",
37 | args: args{
38 | r: strings.NewReader(""),
39 | url: "https://example.com",
40 | },
41 | want: nil,
42 | wantErr: true,
43 | },
44 | {
45 | name: "valid empty json",
46 | args: args{
47 | r: strings.NewReader("{}"),
48 | url: "https://example.com",
49 | },
50 | want: []common.Chunk{{
51 | Content: "{}",
52 | Source: "https://example.com",
53 | }},
54 | wantErr: false,
55 | },
56 | }
57 | for _, tt := range tests {
58 | t.Run(tt.name, func(t *testing.T) {
59 | got, err := ProcessJSON(tt.args.r, tt.args.url)
60 | if (err != nil) != tt.wantErr {
61 | t.Errorf("ProcessJSON() error = %v, wantErr %v", err, tt.wantErr)
62 | return
63 | }
64 | if !reflect.DeepEqual(got, tt.want) {
65 | t.Errorf("ProcessJSON() = %v, want %v", got, tt.want)
66 | }
67 | })
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/internal/text/markdown.go:
--------------------------------------------------------------------------------
1 | package text
2 |
// ProcessMd handles Markdown input. It is an alias for ProcessText, so
// Markdown is passed through as plain text without any Markdown-specific
// parsing.
var ProcessMd = ProcessText
4 |
--------------------------------------------------------------------------------
/internal/text/plaintext.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/mmatongo/chew/v1/internal/common"
7 | )
8 |
9 | func ProcessText(r io.Reader, url string) ([]common.Chunk, error) {
10 | content, err := io.ReadAll(r)
11 | if err != nil {
12 | return nil, err
13 | }
14 |
15 | if len(content) == 0 {
16 | return nil, nil
17 | }
18 |
19 | return []common.Chunk{{Content: string(content), Source: url}}, nil
20 | }
21 |
--------------------------------------------------------------------------------
/internal/text/plaintext_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | "github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 |
13 | func TestProcessText(t *testing.T) {
14 | file, _ := utils.OpenFile("testdata/invalid.html")
15 | type args struct {
16 | r io.Reader
17 | url string
18 | }
19 | tests := []struct {
20 | name string
21 | args args
22 | want []common.Chunk
23 | wantErr bool
24 | }{
25 | {
26 | name: "success",
27 | args: args{
28 | r: strings.NewReader("Test content"),
29 | url: "https://example.com",
30 | },
31 | want: []common.Chunk{{
32 | Content: "Test content",
33 | Source: "https://example.com",
34 | }},
35 | wantErr: false,
36 | },
37 | {
38 | name: "empty",
39 | args: args{
40 | r: strings.NewReader(""),
41 | url: "https://example.com",
42 | },
43 | want: nil,
44 | wantErr: false,
45 | },
46 | {
47 | name: "invalid",
48 | args: args{
49 | r: file,
50 | url: "https://example.com",
51 | },
52 | want: nil,
53 | wantErr: true,
54 | },
55 | }
56 | for _, tt := range tests {
57 | t.Run(tt.name, func(t *testing.T) {
58 | got, err := ProcessText(tt.args.r, tt.args.url)
59 | if (err != nil) != tt.wantErr {
60 | t.Errorf("ProcessText() error = %v, wantErr %v", err, tt.wantErr)
61 | return
62 | }
63 | if !reflect.DeepEqual(got, tt.want) {
64 | t.Errorf("ProcessText() = %v, want %v", got, tt.want)
65 | }
66 | })
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/internal/text/xml.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "bytes"
5 | "encoding/xml"
6 | "io"
7 |
8 | "github.com/mmatongo/chew/v1/internal/common"
9 | )
10 |
11 | func ProcessXML(r io.Reader, url string) ([]common.Chunk, error) {
12 | decoder := xml.NewDecoder(r)
13 | var chunks []common.Chunk
14 | var currentElement string
15 | for {
16 | t, err := decoder.Token()
17 | if err == io.EOF {
18 | break
19 | }
20 | if err != nil {
21 | return nil, err
22 | }
23 | switch se := t.(type) {
24 | case xml.StartElement:
25 | currentElement = se.Name.Local
26 | case xml.CharData:
27 | content := string(bytes.TrimSpace(se))
28 | if content != "" && currentElement != "" {
29 | chunks = append(chunks, common.Chunk{
30 | Content: content,
31 | Source: url,
32 | })
33 | }
34 | }
35 | }
36 | return chunks, nil
37 | }
38 |
--------------------------------------------------------------------------------
/internal/text/xml_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | )
11 |
12 | func TestProcessXML(t *testing.T) {
13 | type args struct {
14 | r io.Reader
15 | url string
16 | }
17 | tests := []struct {
18 | name string
19 | args args
20 | want []common.Chunk
21 | wantErr bool
22 | }{
23 | {
24 | name: "success",
25 | args: args{
26 | r: strings.NewReader("Test content"),
27 | url: "https://example.com",
28 | },
29 | want: []common.Chunk{{
30 | Content: "Test content",
31 | Source: "https://example.com",
32 | }},
33 |
34 | wantErr: false,
35 | },
36 | }
37 | for _, tt := range tests {
38 | t.Run(tt.name, func(t *testing.T) {
39 | got, err := ProcessXML(tt.args.r, tt.args.url)
40 | if (err != nil) != tt.wantErr {
41 | t.Errorf("ProcessXML() error = %v, wantErr %v", err, tt.wantErr)
42 | return
43 | }
44 | if !reflect.DeepEqual(got, tt.want) {
45 | t.Errorf("ProcessXML() = %v, want %v", got, tt.want)
46 | }
47 | })
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/internal/text/yaml.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/mmatongo/chew/v1/internal/common"
7 | "gopkg.in/yaml.v3"
8 | )
9 |
10 | func ProcessYAML(r io.Reader, url string) ([]common.Chunk, error) {
11 | var data interface{}
12 | if err := yaml.NewDecoder(r).Decode(&data); err != nil {
13 | return nil, err
14 | }
15 |
16 | yamlStr, err := yaml.Marshal(data)
17 | if err != nil {
18 | return nil, err
19 | }
20 |
21 | return []common.Chunk{{Content: string(yamlStr), Source: url}}, nil
22 | }
23 |
--------------------------------------------------------------------------------
/internal/text/yaml_test.go:
--------------------------------------------------------------------------------
1 | package text
2 |
3 | import (
4 | "io"
5 | "reflect"
6 | "strings"
7 | "testing"
8 |
9 | "github.com/mmatongo/chew/v1/internal/common"
10 | )
11 |
12 | func TestProcessYAML(t *testing.T) {
13 | type args struct {
14 | r io.Reader
15 | url string
16 | }
17 | tests := []struct {
18 | name string
19 | args args
20 | want []common.Chunk
21 | wantErr bool
22 | }{
23 | {
24 | name: "success",
25 | args: args{
26 | r: strings.NewReader("key: value\nkey2: value2"),
27 | url: "https://example.com/data.yaml",
28 | },
29 | want: []common.Chunk{
30 | {
31 | Content: "key: value\nkey2: value2\n",
32 | Source: "https://example.com/data.yaml",
33 | },
34 | },
35 | wantErr: false,
36 | },
37 | {
38 | name: "error",
39 | args: args{
40 | r: strings.NewReader("key: value, key2: value2"),
41 | url: "https://example.com/data.yaml",
42 | },
43 | want: nil,
44 | wantErr: true,
45 | },
46 | }
47 | for _, tt := range tests {
48 | t.Run(tt.name, func(t *testing.T) {
49 | got, err := ProcessYAML(tt.args.r, tt.args.url)
50 | if (err != nil) != tt.wantErr {
51 | t.Errorf("ProcessYAML() error = %v, wantErr %v", err, tt.wantErr)
52 | return
53 | }
54 | if !reflect.DeepEqual(got, tt.want) {
55 | t.Errorf("ProcessYAML() = %v, want %v", got, tt.want)
56 | }
57 | })
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/internal/transcribe/google_transcriber.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "path/filepath"
8 |
9 | "cloud.google.com/go/storage"
10 |
11 | "github.com/mmatongo/chew/v1/internal/audio"
12 | "github.com/mmatongo/chew/v1/internal/utils/gcs"
13 | )
14 |
15 | type googleTranscriber struct{}
16 |
17 | /*
18 | This relies too heavily on external dependencies and is not easily testable. A refactor is needed to make it more testable and is currently in progress.
19 | */
20 | func (gt *googleTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
21 | client, err := gcs.NewSpeechClient(ctx, opts)
22 | if err != nil {
23 | return "", fmt.Errorf("failed to create speech client: %w", err)
24 | }
25 | defer func() {
26 | if cerr := client.Close(); cerr != nil {
27 | err = errors.Join(err, fmt.Errorf("failed to close transcribe client: %w", cerr))
28 | }
29 | }()
30 |
31 | storageClient, err := gcs.NewStorageClient(ctx, opts)
32 | if err != nil {
33 | return "", err
34 | }
35 | defer func(storageClient *storage.Client) {
36 | err := storageClient.Close()
37 | if err != nil {
38 | fmt.Printf("failed to close storage client: %v\n", err)
39 | }
40 | }(storageClient)
41 |
42 | audioInfo, err := audio.GetAudioInfo(filename)
43 | if err != nil {
44 | return "", fmt.Errorf("failed to process audio file: %w", err)
45 | }
46 |
47 | gcsURI, err := gcs.UploadToGCS(ctx, storageClient, opts.Bucket, filename)
48 | if err != nil {
49 | return "", fmt.Errorf("failed to upload to GCS: %w", err)
50 | }
51 |
52 | if opts.CleanupOnComplete {
53 | defer func(ctx context.Context, client *storage.Client, bucket, objectName string) {
54 | err := gcs.DeleteFromGCS(ctx, client, bucket, objectName)
55 | if err != nil {
56 | fmt.Printf("failed to delete object from GCS: %v\n", err)
57 | }
58 | }(ctx, storageClient, opts.Bucket, filepath.Base(filename))
59 | }
60 |
61 | req := gcs.NewRecognitionRequest(opts, audioInfo, gcsURI)
62 |
63 | op, err := client.LongRunningRecognize(ctx, req)
64 | if err != nil {
65 | return "", fmt.Errorf("failed to start long running recognition: %w", err)
66 | }
67 |
68 | resp, err := op.Wait(ctx)
69 | if err != nil {
70 | return "", fmt.Errorf("failed to get long running recognition results: %w", err)
71 | }
72 |
73 | return gcs.ExtractTranscript(resp), nil
74 | }
75 |
--------------------------------------------------------------------------------
/internal/transcribe/google_transcriber_test.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "context"
5 | "testing"
6 | )
7 |
8 | func Test_googleTranscriber_process(t *testing.T) {
9 | type args struct {
10 | ctx context.Context
11 | filename string
12 | opts TranscribeOptions
13 | }
14 | tests := []struct {
15 | name string
16 | gt *googleTranscriber
17 | args args
18 | want string
19 | wantErr bool
20 | }{
21 | {
22 | name: "failed to create speech client",
23 | gt: &googleTranscriber{},
24 | args: args{
25 | ctx: context.Background(),
26 | filename: "test.mp3",
27 | opts: TranscribeOptions{},
28 | },
29 | want: "",
30 | wantErr: true,
31 | },
32 | }
33 | for _, tt := range tests {
34 | t.Run(tt.name, func(t *testing.T) {
35 | gt := &googleTranscriber{}
36 | got, err := gt.process(tt.args.ctx, tt.args.filename, tt.args.opts)
37 | if (err != nil) != tt.wantErr {
38 | t.Errorf("googleTranscriber.process() error = %v, wantErr %v", err, tt.wantErr)
39 | return
40 | }
41 | if got != tt.want {
42 | t.Errorf("googleTranscriber.process() = %v, want %v", got, tt.want)
43 | }
44 | })
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/internal/transcribe/transcribe.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "sync"
7 |
8 | "github.com/mmatongo/chew/v1/internal/common"
9 | )
10 |
/*
TranscribeOptions is an alias for common.TranscribeOptions and holds the
options for transcribing an audio file. It lets the caller specify the Google
Cloud credentials, the GCS bucket to upload the audio file to, the language
code to use for transcription, an option to enable diarization (including the
minimum and maximum number of speakers) and an option to delete the audio file
from GCS once transcription is complete.

It also lets the caller choose the Whisper API for transcription instead and,
if so, the API key, model and prompt to use.
*/
type TranscribeOptions = common.TranscribeOptions
21 |
22 | // code is largely inspired by https://github.com/polyfact/polyfire-api
23 |
24 | type transcribeOption func(*transcribeConfig)
25 |
26 | type transcribeConfig struct {
27 | t transcriber
28 | }
29 |
30 | func WithTranscriber(t transcriber) transcribeOption {
31 | return func(config *transcribeConfig) {
32 | config.t = t
33 | }
34 | }
35 |
36 | /*
37 | Transcribe uses the Google Cloud Speech-to-Text API to transcribe an audio file. It takes
38 | a context, the filename of the audio file to transcribe, and a TranscribeOptions struct which
39 | contains the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code
40 | to use for transcription, a potion to enable diarization including the min and max speakers and
41 | an option to clean up the audio file from GCS after transcription is complete.
42 | It returns the transcript of the audio file as a string and an error if the transcription fails.
43 | */
44 | func Transcribe(ctx context.Context, filenames []string, opts TranscribeOptions, options ...transcribeOption) (map[string]string, error) {
45 | config := &transcribeConfig{}
46 | for _, option := range options {
47 | option(config)
48 | }
49 |
50 | if config.t == nil {
51 | if opts.UseWhisper {
52 | config.t = &whisperTranscriber{}
53 | } else {
54 | config.t = &googleTranscriber{}
55 | }
56 | }
57 |
58 | var (
59 | results = make(map[string]string)
60 | wg sync.WaitGroup
61 | mu sync.Mutex
62 | errCh = make(chan error, len(filenames))
63 | )
64 |
65 | for _, filename := range filenames {
66 | wg.Add(1)
67 | go func(filename string) {
68 | defer wg.Done()
69 |
70 | transcript, err := config.t.process(ctx, filename, opts)
71 | if err != nil {
72 | select {
73 | case errCh <- fmt.Errorf("transcribing %s: %w", filename, err):
74 | default:
75 | }
76 | return
77 | }
78 |
79 | mu.Lock()
80 | results[filename] = transcript
81 | mu.Unlock()
82 | }(filename)
83 | }
84 |
85 | go func() {
86 | wg.Wait()
87 | close(errCh)
88 | }()
89 |
90 | select {
91 | case err := <-errCh:
92 | if err != nil {
93 | return nil, err
94 | }
95 | case <-ctx.Done():
96 | return nil, ctx.Err()
97 | }
98 |
99 | return results, nil
100 | }
101 |
--------------------------------------------------------------------------------
/internal/transcribe/transcribe_test.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "reflect"
7 | "testing"
8 | )
9 |
10 | type mockTranscriber struct {
11 | processFn func(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
12 | }
13 |
14 | func (m *mockTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
15 | if m.processFn != nil {
16 | return m.processFn(ctx, filename, opts)
17 | }
18 | return "", nil
19 | }
20 |
21 | func TestTranscribe(t *testing.T) {
22 | type args struct {
23 | ctx context.Context
24 | filenames []string
25 | opts TranscribeOptions
26 | }
27 | tests := []struct {
28 | name string
29 | args args
30 | want map[string]string
31 | wantErr bool
32 | mockFn func(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
33 | }{
34 | {
35 | name: "Test Transcribe",
36 | args: args{
37 | ctx: context.Background(),
38 | filenames: []string{"test1.mp3", "test2.mp3"},
39 | opts: TranscribeOptions{
40 | CredentialsJSON: []byte("``"),
41 | Bucket: "test-bucket",
42 | LanguageCode: "en-US",
43 | EnableDiarization: false,
44 | MinSpeakers: 0,
45 | MaxSpeakers: 0,
46 | CleanupOnComplete: false,
47 | UseWhisper: false,
48 | WhisperAPIKey: "",
49 | WhisperModel: "",
50 | WhisperPrompt: "",
51 | },
52 | },
53 | want: map[string]string{
54 | "test1.mp3": "transcript for test1.mp3",
55 | "test2.mp3": "transcript for test2.mp3",
56 | },
57 | wantErr: false,
58 | mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
59 | return "transcript for " + filename, nil
60 | },
61 | },
62 | {
63 | name: "Test Transcribe Error",
64 | args: args{
65 | ctx: context.Background(),
66 | filenames: []string{"test1.mp3", "test2.mp3"},
67 | opts: TranscribeOptions{},
68 | },
69 | want: nil,
70 | wantErr: true,
71 | mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
72 | return "", fmt.Errorf("mock error")
73 | },
74 | },
75 | }
76 | for _, tt := range tests {
77 | t.Run(tt.name, func(t *testing.T) {
78 | mockT := &mockTranscriber{
79 | processFn: tt.mockFn,
80 | }
81 | got, err := Transcribe(tt.args.ctx, tt.args.filenames, tt.args.opts, WithTranscriber(mockT))
82 | if (err != nil) != tt.wantErr {
83 | t.Errorf("Transcribe() error = %v, wantErr %v", err, tt.wantErr)
84 | return
85 | }
86 | if !reflect.DeepEqual(got, tt.want) {
87 | t.Errorf("Transcribe() = %v, want %v", got, tt.want)
88 | }
89 | })
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/internal/transcribe/types.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "context"
5 | "io"
6 | "net/http"
7 | )
8 |
9 | type transcriber interface {
10 | process(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
11 | }
12 |
13 | type whisperTranscriber struct{}
14 |
15 | type httpClient interface {
16 | Do(req *http.Request) (*http.Response, error)
17 | }
18 |
19 | type fileOpener func(name string) (io.ReadCloser, error)
20 |
--------------------------------------------------------------------------------
/internal/transcribe/whisper.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "encoding/json"
7 | "fmt"
8 | "io"
9 | "mime/multipart"
10 | "net/http"
11 | "os"
12 | "path/filepath"
13 | )
14 |
15 | func processWhisper(ctx context.Context, filename string, opts TranscribeOptions, client httpClient, opener fileOpener) (string, error) {
16 | if client == nil {
17 | client = &http.Client{}
18 | }
19 | if opener == nil {
20 | opener = func(name string) (io.ReadCloser, error) {
21 | return os.Open(name)
22 | }
23 | }
24 |
25 | file, err := opener(filename)
26 | if err != nil {
27 | return "", fmt.Errorf("failed to open file: %w", err)
28 | }
29 | defer func() {
30 | if cerr := file.Close(); cerr != nil {
31 | err = fmt.Errorf("failed to close file: %v (original error: %w)", cerr, err)
32 | }
33 | }()
34 |
35 | body := &bytes.Buffer{}
36 | writer := multipart.NewWriter(body)
37 |
38 | part, err := writer.CreateFormFile("file", filepath.Base(filename))
39 | if err != nil {
40 | return "", fmt.Errorf("failed to create form file: %w", err)
41 | }
42 | if _, err = io.Copy(part, file); err != nil {
43 | return "", fmt.Errorf("failed to copy file content: %w", err)
44 | }
45 |
46 | if err = writeFields(writer, opts); err != nil {
47 | return "", err
48 | }
49 |
50 | if err = writer.Close(); err != nil {
51 | return "", fmt.Errorf("failed to close writer: %w", err)
52 | }
53 |
54 | req, err := http.NewRequestWithContext(ctx, "POST", "https://api.openai.com/v1/audio/transcriptions", body)
55 | if err != nil {
56 | return "", fmt.Errorf("failed to create request: %w", err)
57 | }
58 |
59 | req.Header.Set("Authorization", "Bearer "+opts.WhisperAPIKey)
60 | req.Header.Set("Content-Type", writer.FormDataContentType())
61 |
62 | resp, err := client.Do(req)
63 | if err != nil {
64 | return "", fmt.Errorf("failed to send request: %w", err)
65 | }
66 | defer func() {
67 | if cerr := resp.Body.Close(); cerr != nil {
68 | err = fmt.Errorf("failed to close response body: %v (original error: %w)", cerr, err)
69 | }
70 | }()
71 |
72 | if resp.StatusCode != http.StatusOK {
73 | bodyBytes, _ := io.ReadAll(resp.Body)
74 | return "", fmt.Errorf("API request failed with status code %d: %s", resp.StatusCode, string(bodyBytes))
75 | }
76 |
77 | var result struct {
78 | Text string `json:"text"`
79 | }
80 | if err = json.NewDecoder(resp.Body).Decode(&result); err != nil {
81 | return "", fmt.Errorf("failed to decode response: %w", err)
82 | }
83 |
84 | return result.Text, nil
85 | }
86 |
87 | func writeFields(writer *multipart.Writer, opts TranscribeOptions) error {
88 | fields := map[string]string{
89 | "model": opts.WhisperModel,
90 | "language": opts.LanguageCode,
91 | "prompt": opts.WhisperPrompt,
92 | }
93 |
94 | for key, value := range fields {
95 | if value != "" {
96 | if err := writer.WriteField(key, value); err != nil {
97 | return fmt.Errorf("failed to write %s field: %w", key, err)
98 | }
99 | }
100 | }
101 |
102 | return nil
103 | }
104 |
105 | func (wt *whisperTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
106 | return processWhisper(ctx, filename, opts, nil, nil)
107 | }
108 |
--------------------------------------------------------------------------------
/internal/transcribe/whisper_test.go:
--------------------------------------------------------------------------------
1 | package transcribe
2 |
3 | import (
4 | "bytes"
5 | "context"
6 | "errors"
7 | "io"
8 | "mime/multipart"
9 | "net/http"
10 | "os"
11 | "path/filepath"
12 | "testing"
13 | )
14 |
// mockHTTPClient is an httpClient test double; every request is answered by
// DoFunc, which must be set.
type mockHTTPClient struct {
	DoFunc func(req *http.Request) (*http.Response, error)
}

// Do hands req to DoFunc and returns whatever it returns.
func (m *mockHTTPClient) Do(req *http.Request) (*http.Response, error) {
	respond := m.DoFunc
	return respond(req)
}

// mockFile is an in-memory stand-in for an opened file: reads come from the
// embedded bytes.Reader.
type mockFile struct {
	*bytes.Reader
}

// Close always succeeds; there is nothing to release.
func (m *mockFile) Close() error {
	var noErr error
	return noErr
}
30 |
31 | func Test_processWhisper(t *testing.T) {
32 | /* mocks */
33 |
34 | tempDir, err := os.MkdirTemp("", "whisper_test")
35 | if err != nil {
36 | t.Fatalf("failed to create temp dir: %v", err)
37 | }
38 | defer os.RemoveAll(tempDir)
39 |
40 | testFilePath := filepath.Join(tempDir, "test.mp3")
41 | if err := os.WriteFile(testFilePath, []byte("dummy audio content"), 0644); err != nil {
42 | t.Fatalf("failed to create test file: %v", err)
43 | }
44 |
45 | unreadableFilePath := filepath.Join(tempDir, "unreadable.mp3")
46 | if err := os.WriteFile(unreadableFilePath, []byte("unreadable content"), 0000); err != nil {
47 | t.Fatalf("failed to create unreadable file: %v", err)
48 | }
49 |
50 | successfulMockClient := &mockHTTPClient{
51 | DoFunc: func(req *http.Request) (*http.Response, error) {
52 | return &http.Response{
53 | StatusCode: 200,
54 | Body: io.NopCloser(bytes.NewBufferString(`{
55 | "text": "this is a test transcription."
56 | }`)),
57 | }, nil
58 | },
59 | }
60 |
61 | errorMockClient := &mockHTTPClient{
62 | DoFunc: func(req *http.Request) (*http.Response, error) {
63 | return nil, errors.New("mock HTTP error")
64 | },
65 | }
66 |
67 | badResponseMockClient := &mockHTTPClient{
68 | DoFunc: func(req *http.Request) (*http.Response, error) {
69 | return &http.Response{
70 | StatusCode: 400,
71 | Body: io.NopCloser(bytes.NewBufferString(`{"error": "Bad Request"}`)),
72 | }, nil
73 | },
74 | }
75 |
76 | invalidJSONMockClient := &mockHTTPClient{
77 | DoFunc: func(req *http.Request) (*http.Response, error) {
78 | return &http.Response{
79 | StatusCode: 200,
80 | Body: io.NopCloser(bytes.NewBufferString(`invalid JSON`)),
81 | }, nil
82 | },
83 | }
84 |
85 | successfulMockFileOpener := func(name string) (io.ReadCloser, error) {
86 | return &mockFile{bytes.NewReader([]byte("file content"))}, nil
87 | }
88 |
89 | errorMockFileOpener := func(name string) (io.ReadCloser, error) {
90 | return nil, errors.New("file open error")
91 | }
92 |
93 | type args struct {
94 | ctx context.Context
95 | filename string
96 | opts TranscribeOptions
97 | client httpClient
98 | opener func(name string) (io.ReadCloser, error)
99 | }
100 |
101 | tests := []struct {
102 | name string
103 | args args
104 | want string
105 | wantErr bool
106 | }{
107 | {
108 | name: "successful transcription",
109 | args: args{
110 | ctx: context.Background(),
111 | filename: testFilePath,
112 | opts: TranscribeOptions{
113 | WhisperAPIKey: "test-api-key",
114 | WhisperModel: "test-model",
115 | LanguageCode: "en-US",
116 | WhisperPrompt: "test-prompt",
117 | },
118 | client: successfulMockClient,
119 | },
120 | want: "this is a test transcription.",
121 | wantErr: false,
122 | },
123 | {
124 | name: "file open error",
125 | args: args{
126 | ctx: context.Background(),
127 | filename: "non-existent-file.mp3",
128 | opts: TranscribeOptions{},
129 | client: successfulMockClient,
130 | },
131 | want: "",
132 | wantErr: true,
133 | },
134 | {
135 | name: "file read error",
136 | args: args{
137 | ctx: context.Background(),
138 | filename: unreadableFilePath,
139 | opts: TranscribeOptions{},
140 | client: successfulMockClient,
141 | },
142 | want: "",
143 | wantErr: true,
144 | },
145 | {
146 | name: "HTTP client error",
147 | args: args{
148 | ctx: context.Background(),
149 | filename: testFilePath,
150 | opts: TranscribeOptions{},
151 | client: errorMockClient,
152 | },
153 | want: "",
154 | wantErr: true,
155 | },
156 | {
157 | name: "bad response from API",
158 | args: args{
159 | ctx: context.Background(),
160 | filename: testFilePath,
161 | opts: TranscribeOptions{},
162 | client: badResponseMockClient,
163 | },
164 | want: "",
165 | wantErr: true,
166 | },
167 | {
168 | name: "invalid JSON response",
169 | args: args{
170 | ctx: context.Background(),
171 | filename: testFilePath,
172 | opts: TranscribeOptions{},
173 | client: invalidJSONMockClient,
174 | },
175 | want: "",
176 | wantErr: true,
177 | },
178 | {
179 | name: "file open error",
180 | args: args{
181 | ctx: context.Background(),
182 | filename: "test.mp3",
183 | opts: TranscribeOptions{},
184 | client: successfulMockClient,
185 | opener: errorMockFileOpener,
186 | },
187 | want: "",
188 | wantErr: true,
189 | },
190 | {
191 | name: "HTTP client error",
192 | args: args{
193 | ctx: context.Background(),
194 | filename: "test.mp3",
195 | opts: TranscribeOptions{},
196 | client: errorMockClient,
197 | opener: successfulMockFileOpener,
198 | },
199 | want: "",
200 | wantErr: true,
201 | },
202 | }
203 |
204 | for _, tt := range tests {
205 | t.Run(tt.name, func(t *testing.T) {
206 | got, err := processWhisper(tt.args.ctx, tt.args.filename, tt.args.opts, tt.args.client, tt.args.opener)
207 | if (err != nil) != tt.wantErr {
208 | t.Errorf("processWhisper() error = %v, wantErr %v", err, tt.wantErr)
209 | return
210 | }
211 | if got != tt.want {
212 | t.Errorf("processWhisper() = %v, want %v", got, tt.want)
213 | }
214 | })
215 | }
216 | }
217 |
218 | func Test_writeFields(t *testing.T) {
219 | tests := []struct {
220 | name string
221 | opts TranscribeOptions
222 | wantErr bool
223 | }{
224 | {
225 | name: "all fields present",
226 | opts: TranscribeOptions{
227 | WhisperModel: "test-model",
228 | LanguageCode: "en-US",
229 | WhisperPrompt: "test-prompt",
230 | },
231 | wantErr: false,
232 | },
233 | }
234 |
235 | for _, tt := range tests {
236 | t.Run(tt.name, func(t *testing.T) {
237 | writer := multipart.NewWriter(&bytes.Buffer{})
238 | if err := writeFields(writer, tt.opts); (err != nil) != tt.wantErr {
239 | t.Errorf("writeFields() error = %v, wantErr %v", err, tt.wantErr)
240 | }
241 | })
242 | }
243 | }
244 |
--------------------------------------------------------------------------------
/internal/utils/gcs/gcs_utils.go:
--------------------------------------------------------------------------------
1 | package gcs
2 |
3 | import (
4 | "context"
5 | "errors"
6 | "fmt"
7 | "io"
8 | "os"
9 | "path/filepath"
10 |
11 | speech "cloud.google.com/go/speech/apiv1"
12 | "cloud.google.com/go/speech/apiv1/speechpb"
13 | "cloud.google.com/go/storage"
14 | "github.com/mmatongo/chew/v1/internal/common"
15 | "google.golang.org/api/option"
16 | )
17 |
18 | func UploadToGCS(ctx context.Context, client *storage.Client, bucket, filename string) (string, error) {
19 | f, err := os.Open(filename)
20 | if err != nil {
21 | return "", fmt.Errorf("failed to open file: %w", err)
22 | }
23 | defer func() {
24 | if cerr := f.Close(); cerr != nil {
25 | err = errors.Join(err, fmt.Errorf("failed to close file: %w", cerr))
26 | }
27 | }()
28 |
29 | objectName := filepath.Base(filename)
30 | w := client.Bucket(bucket).Object(objectName).NewWriter(ctx)
31 | if _, err = io.Copy(w, f); err != nil {
32 | return "", fmt.Errorf("failed to copy file to GCS: %w", err)
33 | }
34 | if err := w.Close(); err != nil {
35 | return "", fmt.Errorf("failed to close GCS writer: %w", err)
36 | }
37 |
38 | return fmt.Sprintf("gs://%s/%s", bucket, objectName), nil
39 | }
40 |
41 | func DeleteFromGCS(ctx context.Context, client *storage.Client, bucket, objectName string) error {
42 | if err := client.Bucket(bucket).Object(objectName).Delete(ctx); err != nil {
43 | return fmt.Errorf("failed to delete object from GCS: %w", err)
44 | }
45 | return nil
46 | }
47 |
48 | func NewStorageClient(ctx context.Context, opts common.TranscribeOptions) (*storage.Client, error) {
49 | var clientOpts []option.ClientOption
50 | if opts.CredentialsJSON != nil {
51 | clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON))
52 | }
53 | return storage.NewClient(ctx, clientOpts...)
54 | }
55 |
56 | func NewSpeechClient(ctx context.Context, opts common.TranscribeOptions) (*speech.Client, error) {
57 | var clientOpts []option.ClientOption
58 | if opts.CredentialsJSON != nil {
59 | clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON))
60 | }
61 | return speech.NewClient(ctx, clientOpts...)
62 | }
63 |
64 | func NewRecognitionRequest(opts common.TranscribeOptions, audioInfo *speechpb.RecognitionConfig, gcsURI string) *speechpb.LongRunningRecognizeRequest {
65 | diarizationConfig := &speechpb.SpeakerDiarizationConfig{
66 | EnableSpeakerDiarization: opts.EnableDiarization,
67 | MinSpeakerCount: int32(opts.MinSpeakers),
68 | MaxSpeakerCount: int32(opts.MaxSpeakers),
69 | }
70 |
71 | return &speechpb.LongRunningRecognizeRequest{
72 | Config: &speechpb.RecognitionConfig{
73 | Encoding: audioInfo.Encoding,
74 | SampleRateHertz: audioInfo.SampleRateHertz,
75 | AudioChannelCount: audioInfo.AudioChannelCount,
76 | LanguageCode: opts.LanguageCode,
77 | EnableAutomaticPunctuation: true,
78 | UseEnhanced: true,
79 | EnableWordConfidence: true,
80 | Model: "latest_long",
81 | DiarizationConfig: diarizationConfig,
82 | },
83 | Audio: &speechpb.RecognitionAudio{
84 | AudioSource: &speechpb.RecognitionAudio_Uri{
85 | Uri: gcsURI,
86 | },
87 | },
88 | }
89 | }
90 |
91 | func ExtractTranscript(resp *speechpb.LongRunningRecognizeResponse) string {
92 | var transcript string
93 | for _, result := range resp.Results {
94 | for _, alt := range result.Alternatives {
95 | transcript += alt.Transcript
96 | }
97 | }
98 | return transcript
99 | }
100 |
--------------------------------------------------------------------------------
/internal/utils/gcs/gcs_utils_test.go:
--------------------------------------------------------------------------------
1 | package gcs
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "net/http"
7 | "net/http/httptest"
8 | "os"
9 | "path/filepath"
10 | "reflect"
11 | "strings"
12 | "testing"
13 |
14 | speech "cloud.google.com/go/speech/apiv1"
15 | "cloud.google.com/go/speech/apiv1/speechpb"
16 | "cloud.google.com/go/storage"
17 | "github.com/mmatongo/chew/v1/internal/common"
18 | "google.golang.org/api/option"
19 | )
20 |
21 | func Test_extractTranscript(t *testing.T) {
22 | type args struct {
23 | resp *speechpb.LongRunningRecognizeResponse
24 | }
25 | tests := []struct {
26 | name string
27 | args args
28 | want string
29 | }{
30 | {
31 | name: "empty response",
32 | args: args{
33 | resp: &speechpb.LongRunningRecognizeResponse{},
34 | },
35 | want: "",
36 | },
37 | {
38 | name: "response with no results",
39 | args: args{
40 | resp: &speechpb.LongRunningRecognizeResponse{
41 | Results: []*speechpb.SpeechRecognitionResult{},
42 | },
43 | },
44 | want: "",
45 | },
46 | {
47 | name: "response with no alternatives",
48 | args: args{
49 | resp: &speechpb.LongRunningRecognizeResponse{
50 | Results: []*speechpb.SpeechRecognitionResult{
51 | {},
52 | },
53 | },
54 | },
55 | },
56 | {
57 | name: "response with result and alternative",
58 | args: args{
59 | resp: &speechpb.LongRunningRecognizeResponse{
60 | Results: []*speechpb.SpeechRecognitionResult{
61 | {
62 | Alternatives: []*speechpb.SpeechRecognitionAlternative{
63 | {
64 | Transcript: "hello world",
65 | },
66 | },
67 | },
68 | },
69 | },
70 | },
71 | want: "hello world",
72 | },
73 | {
74 | name: "response with multiple results and alternatives",
75 | args: args{
76 | resp: &speechpb.LongRunningRecognizeResponse{
77 | Results: []*speechpb.SpeechRecognitionResult{
78 | {
79 | Alternatives: []*speechpb.SpeechRecognitionAlternative{
80 | {
81 | Transcript: "hello world",
82 | Confidence: 0.9,
83 | },
84 | {
85 | Transcript: "hello world",
86 | Confidence: 0.8,
87 | },
88 | },
89 | },
90 | {
91 | Alternatives: []*speechpb.SpeechRecognitionAlternative{
92 | {
93 | Transcript: "hello world",
94 | Confidence: 0.7,
95 | },
96 | {
97 | Transcript: "hello world",
98 | Confidence: 0.6,
99 | },
100 | },
101 | },
102 | },
103 | },
104 | },
105 | want: "hello worldhello worldhello worldhello world",
106 | },
107 | }
108 | for _, tt := range tests {
109 | t.Run(tt.name, func(t *testing.T) {
110 | if got := ExtractTranscript(tt.args.resp); got != tt.want {
111 | t.Errorf("extractTranscript() = %v, want %v", got, tt.want)
112 | }
113 | })
114 | }
115 | }
116 |
117 | func Test_newRecognitionRequest(t *testing.T) {
118 | type args struct {
119 | opts common.TranscribeOptions
120 | audioInfo *speechpb.RecognitionConfig
121 | gcsURI string
122 | }
123 | tests := []struct {
124 | name string
125 | args args
126 | want *speechpb.LongRunningRecognizeRequest
127 | }{
128 | {
129 | name: "create recognition request",
130 | args: args{
131 | opts: common.TranscribeOptions{
132 | EnableDiarization: true,
133 | MinSpeakers: 1,
134 | MaxSpeakers: 2,
135 | LanguageCode: "en-US",
136 | },
137 | audioInfo: &speechpb.RecognitionConfig{
138 | Encoding: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
139 | SampleRateHertz: 44100,
140 | AudioChannelCount: 2,
141 | },
142 | gcsURI: "gs://bucket/object",
143 | },
144 | want: &speechpb.LongRunningRecognizeRequest{
145 | Config: &speechpb.RecognitionConfig{
146 | Encoding: speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
147 | SampleRateHertz: 44100,
148 | AudioChannelCount: 2,
149 | LanguageCode: "en-US",
150 | EnableAutomaticPunctuation: true,
151 | UseEnhanced: true,
152 | EnableWordConfidence: true,
153 | Model: "latest_long",
154 | DiarizationConfig: &speechpb.SpeakerDiarizationConfig{
155 | EnableSpeakerDiarization: true,
156 | MinSpeakerCount: 1,
157 | MaxSpeakerCount: 2,
158 | },
159 | },
160 | Audio: &speechpb.RecognitionAudio{
161 | AudioSource: &speechpb.RecognitionAudio_Uri{
162 | Uri: "gs://bucket/object",
163 | },
164 | },
165 | },
166 | },
167 | }
168 | for _, tt := range tests {
169 | t.Run(tt.name, func(t *testing.T) {
170 | if got := NewRecognitionRequest(tt.args.opts, tt.args.audioInfo, tt.args.gcsURI); !reflect.DeepEqual(got, tt.want) {
171 | t.Errorf("createRecognitionRequest() = %v, want %v", got, tt.want)
172 | }
173 | })
174 | }
175 | }
176 |
/*
The client-constructor tests below only cover the error path: they pass nil or
empty credentials JSON and expect construction to return an error.
The functions are not written in a way that allows for mocking of the GCP client
libraries, so the success path cannot be exercised here. This is a limitation of
the current implementation and should be refactored in the future.
*/
182 |
183 | func Test_newSpeechClient(t *testing.T) {
184 | type args struct {
185 | ctx context.Context
186 | opts common.TranscribeOptions
187 | }
188 | tests := []struct {
189 | name string
190 | args args
191 | want *speech.Client
192 | wantErr bool
193 | }{
194 | {
195 | name: "create speech client",
196 | args: args{
197 | ctx: context.Background(),
198 | opts: common.TranscribeOptions{
199 | CredentialsJSON: nil,
200 | },
201 | },
202 | want: nil,
203 | wantErr: true,
204 | },
205 | {
206 | /*
207 | This test case is expected to fail because the credentials JSON is empty.
208 |
209 | TODO: Refactor to allow for mocking of the speech.NewClient function.
210 | */
211 | name: "create speech client with credentials",
212 | args: args{
213 | ctx: context.Background(),
214 | opts: common.TranscribeOptions{
215 | CredentialsJSON: []byte(""),
216 | },
217 | },
218 | want: nil,
219 | wantErr: true,
220 | },
221 | }
222 | for _, tt := range tests {
223 | t.Run(tt.name, func(t *testing.T) {
224 | got, err := NewSpeechClient(tt.args.ctx, tt.args.opts)
225 | if (err != nil) != tt.wantErr {
226 | t.Errorf("createSpeechClient() error = %v, wantErr %v", err, tt.wantErr)
227 | return
228 | }
229 | if !reflect.DeepEqual(got, tt.want) {
230 | t.Errorf("createSpeechClient() = %v, want %v", got, tt.want)
231 | }
232 | })
233 | }
234 | }
235 |
236 | func Test_createStorageClient(t *testing.T) {
237 | type args struct {
238 | ctx context.Context
239 | opts common.TranscribeOptions
240 | }
241 | tests := []struct {
242 | name string
243 | args args
244 | want *storage.Client
245 | wantErr bool
246 | }{
247 | {
248 | name: "create storage client",
249 | args: args{
250 | ctx: context.Background(),
251 | opts: common.TranscribeOptions{
252 | CredentialsJSON: nil,
253 | },
254 | },
255 | want: nil,
256 | wantErr: true,
257 | },
258 | {
259 | /*
260 | This test case is expected to fail because the credentials JSON is empty.
261 | This does not affect the functionality of the createStorageClient function.
262 |
263 | TODO: Refactor to allow for mocking of the storage.NewClient function.
264 | */
265 |
266 | name: "create storage client with credentials",
267 | args: args{
268 | ctx: context.Background(),
269 | opts: common.TranscribeOptions{
270 | CredentialsJSON: []byte(""),
271 | },
272 | },
273 | want: nil,
274 | wantErr: true,
275 | },
276 | }
277 | for _, tt := range tests {
278 | t.Run(tt.name, func(t *testing.T) {
279 | got, err := NewStorageClient(tt.args.ctx, tt.args.opts)
280 | if (err != nil) != tt.wantErr {
281 | t.Errorf("createStorageClient() error = %v, wantErr %v", err, tt.wantErr)
282 | return
283 | }
284 | if !reflect.DeepEqual(got, tt.want) {
285 | t.Errorf("createStorageClient() = %v, want %v", got, tt.want)
286 | }
287 | })
288 | }
289 | }
290 |
291 | func Test_uploadToGCS(t *testing.T) {
292 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
293 | if r.Method == "POST" && strings.Contains(r.URL.Path, "/upload/storage/v1/b/") {
294 | w.WriteHeader(http.StatusOK)
295 | fmt.Fprintf(w, `{"name": "uploaded-object"}`)
296 | } else {
297 | http.Error(w, "Unexpected request", http.StatusBadRequest)
298 | }
299 | }))
300 | defer server.Close()
301 |
302 | client, err := storage.NewClient(context.Background(), option.WithEndpoint(server.URL), option.WithHTTPClient(server.Client()))
303 | if err != nil {
304 | t.Fatalf("failed to create test client: %v", err)
305 | }
306 | defer client.Close()
307 |
308 | tempFile, err := os.CreateTemp("", "test-file-*.txt")
309 | if err != nil {
310 | t.Fatalf("failed to create temp file: %v", err)
311 | }
312 | defer os.Remove(tempFile.Name())
313 |
314 | content := []byte("test content")
315 | if _, err := tempFile.Write(content); err != nil {
316 | t.Fatalf("failed to write to temp file: %v", err)
317 | }
318 | tempFile.Close()
319 |
320 | tests := []struct {
321 | name string
322 | bucket string
323 | filename string
324 | want string
325 | wantErr bool
326 | }{
327 | {
328 | name: "successful upload",
329 | bucket: "test-bucket",
330 | filename: tempFile.Name(),
331 | want: fmt.Sprintf("gs://test-bucket/%s", filepath.Base(tempFile.Name())),
332 | wantErr: false,
333 | },
334 | {
335 | name: "non-existent file",
336 | bucket: "test-bucket",
337 | filename: "file.txt",
338 | want: "",
339 | wantErr: true,
340 | },
341 | {
342 | name: "empty filename",
343 | bucket: "test-bucket",
344 | filename: "",
345 | want: "",
346 | wantErr: true,
347 | },
348 | }
349 |
350 | for _, tt := range tests {
351 | t.Run(tt.name, func(t *testing.T) {
352 | got, err := UploadToGCS(context.Background(), client, tt.bucket, tt.filename)
353 | if (err != nil) != tt.wantErr {
354 | t.Errorf("uploadToGCS() error = %v, wantErr %v", err, tt.wantErr)
355 | return
356 | }
357 | if got != tt.want {
358 | t.Errorf("uploadToGCS() = %v, want %v", got, tt.want)
359 | }
360 | })
361 | }
362 | }
363 |
364 | func Test_deleteFromGCS(t *testing.T) {
365 | server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
366 | if r.Method == "DELETE" && r.URL.Path == "/b/test-bucket/o/test-object.txt" {
367 | w.WriteHeader(http.StatusOK)
368 | } else {
369 | http.Error(w, "Unexpected request", http.StatusBadRequest)
370 | }
371 | }))
372 | defer server.Close()
373 |
374 | client, err := storage.NewClient(context.Background(),
375 | option.WithEndpoint(server.URL),
376 | option.WithHTTPClient(server.Client()),
377 | option.WithoutAuthentication())
378 | if err != nil {
379 | t.Fatalf("failed to create test client: %v", err)
380 | }
381 | defer client.Close()
382 |
383 | tests := []struct {
384 | name string
385 | bucket string
386 | objectName string
387 | wantErr bool
388 | }{
389 | {
390 | name: "successful delete",
391 | bucket: "test-bucket",
392 | objectName: "test-object.txt",
393 | wantErr: false,
394 | },
395 | {
396 | name: "empty object name",
397 | bucket: "test-bucket",
398 | objectName: "",
399 | wantErr: true,
400 | },
401 | {
402 | name: "empty bucket name",
403 | bucket: "",
404 | objectName: "test-object.txt",
405 | wantErr: true,
406 | },
407 | }
408 |
409 | for _, tt := range tests {
410 | t.Run(tt.name, func(t *testing.T) {
411 | err := DeleteFromGCS(context.Background(), client, tt.bucket, tt.objectName)
412 | if (err != nil) != tt.wantErr {
413 | t.Errorf("deleteFromGCS() error = %v, wantErr %v", err, tt.wantErr)
414 | return
415 | }
416 | })
417 | }
418 | }
419 |
--------------------------------------------------------------------------------
/internal/utils/utils.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "archive/zip"
5 | "encoding/xml"
6 | "fmt"
7 | "io"
8 | "mime"
9 | "net/url"
10 | "os"
11 | "path/filepath"
12 | "regexp"
13 | "strings"
14 | )
15 |
// GetFileExtension returns the extension, leading dot included, of the file
// named by rawURL, which may be a URL or a plain file path.
//
// It returns an error when rawURL cannot be parsed or when no extension is
// present.
func GetFileExtension(rawURL string) (string, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return "", fmt.Errorf("invalid URL or file path: %w", err)
	}

	// Scheme-less input is treated as a plain path and inspected verbatim;
	// anything with a scheme (file:// included) contributes only its URL path.
	candidate := parsed.Path
	if parsed.Scheme == "" {
		candidate = rawURL
	}

	if ext := filepath.Ext(candidate); ext != "" {
		return ext, nil
	}
	return "", fmt.Errorf("no file extension found in %q", rawURL)
}
39 |
// GetFileContentType returns the MIME type associated with the extension of
// file's name, as reported by mime.TypeByExtension. The file contents are not
// inspected.
//
// It returns "" when file is nil or when the extension has no known type.
func GetFileContentType(file *os.File) string {
	// Calling Name on a nil *os.File panics; report "unknown" instead.
	if file == nil {
		return ""
	}
	return mime.TypeByExtension(filepath.Ext(file.Name()))
}
43 |
// ExtractTextFromXML reads the XML stored in the given zip entry and returns
// the text of each <p> element (matched on local name, so any namespace
// prefix is accepted), trimmed of surrounding whitespace. Paragraphs that are
// empty after trimming are omitted.
//
// It returns an error if the entry cannot be opened or the XML is malformed;
// no partial result is returned in that case.
func ExtractTextFromXML(file *zip.File) ([]string, error) {
	rc, err := file.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()

	var (
		paragraphs []string
		buf        strings.Builder
		insideP    bool
	)

	dec := xml.NewDecoder(rc)
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				// Clean end of document.
				return paragraphs, nil
			}
			return nil, err
		}

		switch t := tok.(type) {
		case xml.CharData:
			// Text is collected only while a paragraph is open, including
			// text inside elements nested within it.
			if insideP {
				buf.Write(t)
			}
		case xml.StartElement:
			if t.Name.Local == "p" {
				insideP = true
				buf.Reset()
			}
		case xml.EndElement:
			if t.Name.Local != "p" {
				continue
			}
			insideP = false
			if text := strings.TrimSpace(buf.String()); text != "" {
				paragraphs = append(paragraphs, text)
			}
		}
	}
}
87 |
88 | /*
89 | Wondering if this is even necessary but I can see how it can be useful
90 | as it also removes links, images, and code blocks.
91 |
92 | I'm not sure if this is the best way to remove markdown syntax.
93 | Inspired by https://github.com/mmatongo/site/blob/master/cmd/dnlm/helpers.go#L62-L87
94 | */
95 |
96 | /* RemoveMarkdownSyntax removes markdown syntax from a string */
97 | func RemoveMarkdownSyntax(text string) string {
98 | patterns := []string{
99 | "(```[\\s\\S]*?```)", // Code blocks
100 | "(`[^`\n]+`)", // Inline code
101 | "!\\[([^\\]]*?)\\]\\(([^)]+)\\)", // Images
102 | "\\[([^\\]]+)\\]\\(([^)]+)\\)", // Links
103 | "(__|\\*\\*|_|\\*)(.+?)(__|\\*\\*|_|\\*)", // Bold and Italic
104 | "~~(.+?)~~", // Strikethrough
105 | "^#{1,6}\\s(.*)$", // Headers
106 | "^>\\s(.*)$", // Blockquotes
107 | "^-{3,}$", // Horizontal rules
108 | "^\\s*[\\*\\-+]\\s+(.+)$", // Unordered lists
109 | "^\\s*\\d+\\.\\s+(.+)$", // Ordered lists
110 | }
111 |
112 | for _, pattern := range patterns {
113 | re := regexp.MustCompile("(?m)" + pattern)
114 | switch {
115 | case strings.HasPrefix(pattern, "(```"):
116 | text = re.ReplaceAllString(text, "$1")
117 | case strings.HasPrefix(pattern, "(`"):
118 | text = re.ReplaceAllString(text, "$1")
119 | case strings.HasPrefix(pattern, "!\\["):
120 | text = re.ReplaceAllString(text, "$1 ($2)")
121 | case strings.HasPrefix(pattern, "\\["):
122 | text = re.ReplaceAllString(text, "$1 ($2)")
123 | case strings.Contains(pattern, "(__|\\*\\*|_|\\*)"):
124 | text = re.ReplaceAllString(text, "$2")
125 | case strings.Contains(pattern, "~~"):
126 | text = re.ReplaceAllString(text, "$1")
127 | case strings.HasPrefix(pattern, "^#"):
128 | text = re.ReplaceAllString(text, "$1")
129 | case strings.HasPrefix(pattern, "^>"):
130 | text = re.ReplaceAllString(text, "$1")
131 | case strings.HasPrefix(pattern, "^\\s*[\\*\\-+]"):
132 | text = re.ReplaceAllString(text, "$1")
133 | case strings.HasPrefix(pattern, "^\\s*\\d+"):
134 | text = re.ReplaceAllString(text, "$1")
135 | default:
136 | text = re.ReplaceAllString(text, "")
137 | }
138 | }
139 |
140 | // Remove any remaining Markdown characters
141 | text = strings.NewReplacer(
142 | "*", "",
143 | "_", "",
144 | "`", "",
145 | "#", "",
146 | ">", "",
147 | "+", "",
148 | "-", "",
149 | ).Replace(text)
150 |
151 | return strings.TrimSpace(text)
152 | }
153 |
// OpenFile opens the named file for reading. A leading "file://" is removed
// from filePath before it is handed to os.Open; no other URL decoding is
// performed.
func OpenFile(filePath string) (*os.File, error) {
	const scheme = "file://"
	if strings.HasPrefix(filePath, scheme) {
		filePath = filePath[len(scheme):]
	}
	return os.Open(filePath)
}
158 |
--------------------------------------------------------------------------------
/internal/utils/utils_test.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "os"
7 | "path/filepath"
8 | "reflect"
9 | "testing"
10 | )
11 |
// createMockZipFile builds an in-memory zip archive holding a single entry,
// "document.xml", whose body is content, and returns that entry.
// Any failure while building or re-reading the archive panics.
func createMockZipFile(content string) *zip.File {
	must := func(err error) {
		if err != nil {
			panic(err)
		}
	}

	var archive bytes.Buffer
	zw := zip.NewWriter(&archive)

	entry, err := zw.Create("document.xml")
	must(err)
	_, err = entry.Write([]byte(content))
	must(err)
	// Close writes the central directory; the archive is unreadable without it.
	must(zw.Close())

	zr, err := zip.NewReader(bytes.NewReader(archive.Bytes()), int64(archive.Len()))
	must(err)

	return zr.File[0]
}
44 |
45 | func TestRemoveMarkdownSyntax(t *testing.T) {
46 | type args struct {
47 | text string
48 | }
49 | tests := []struct {
50 | name string
51 | args args
52 | want string
53 | }{
54 | {
55 | name: "Test 1",
56 | args: args{
57 | text: "This is a **bold** text",
58 | },
59 | want: "This is a bold text",
60 | },
61 | {
62 | name: "Test 2",
63 | args: args{
64 | text: "This is a *italic* text",
65 | },
66 | want: "This is a italic text",
67 | },
68 | {
69 | name: "Test 3",
70 | args: args{
71 | text: "This is a [link](https://example.com) text",
72 | },
73 | want: "This is a link (https://example.com) text",
74 | },
75 | {
76 | name: "Test 4",
77 | args: args{
78 | text: "This is a  text",
79 | },
80 | want: "This is a image (https://example.com/image.png) text",
81 | },
82 | }
83 | for _, tt := range tests {
84 | t.Run(tt.name, func(t *testing.T) {
85 | if got := RemoveMarkdownSyntax(tt.args.text); got != tt.want {
86 | t.Errorf("RemoveMarkdownSyntax() = %v, want %v", got, tt.want)
87 | }
88 | })
89 | }
90 | }
91 |
92 | func TestGetFileExtension(t *testing.T) {
93 | type args struct {
94 | rawUrl string
95 | }
96 | tests := []struct {
97 | name string
98 | args args
99 | want string
100 | wantErr bool
101 | }{
102 | {
103 | name: "Test 1",
104 | args: args{
105 | rawUrl: "https://example.com/test.csv",
106 | },
107 | want: ".csv",
108 | wantErr: false,
109 | },
110 | {
111 | name: "Test 2",
112 | args: args{
113 | rawUrl: "",
114 | },
115 | want: "",
116 | wantErr: true,
117 | },
118 | {
119 | name: "Test 3",
120 | args: args{
121 | rawUrl: "https://example.com/test",
122 | },
123 | want: "",
124 | wantErr: true,
125 | },
126 | {
127 | name: "Test 4",
128 | args: args{
129 | rawUrl: "file:///test.csv",
130 | },
131 | want: ".csv",
132 | wantErr: false,
133 | },
134 | {
135 | name: "Test 5",
136 | args: args{
137 | rawUrl: "file:///test",
138 | },
139 | want: "",
140 | wantErr: true,
141 | },
142 | {
143 | name: "Test 6",
144 | args: args{
145 | rawUrl: string([]byte{0x01, 0x02, 0x03, 0x04, 0x05}),
146 | },
147 | want: "",
148 | wantErr: true,
149 | },
150 | }
151 | for _, tt := range tests {
152 | t.Run(tt.name, func(t *testing.T) {
153 | got, err := GetFileExtension(tt.args.rawUrl)
154 | if (err != nil) != tt.wantErr {
155 | t.Errorf("GetFileExtensionFromUrl() error = %v, wantErr %v", err, tt.wantErr)
156 | return
157 | }
158 | if got != tt.want {
159 | t.Errorf("GetFileExtensionFromUrl() = %v, want %v", got, tt.want)
160 | }
161 | })
162 | }
163 | }
164 |
165 | func TestExtractTextFromXML(t *testing.T) {
166 | type args struct {
167 | file *zip.File
168 | }
169 | tests := []struct {
170 | name string
171 | args args
172 | want []string
173 | wantErr bool
174 | }{
175 | {
176 | name: "valid XML with paragraphs",
177 | args: args{
178 | file: createMockZipFile(`
179 |
180 |
181 | First paragraph
182 | Second paragraph
183 | Third paragraph
184 |
185 | `),
186 | },
187 | want: []string{"First paragraph", "Second paragraph", "Third paragraph"},
188 | wantErr: false,
189 | },
190 | {
191 | name: "XML with empty paragraphs",
192 | args: args{
193 | file: createMockZipFile(`
194 |
195 |
196 | First paragraph
197 |
198 | Third paragraph
199 |
200 | `),
201 | },
202 | want: []string{"First paragraph", "Third paragraph"},
203 | wantErr: false,
204 | },
205 | {
206 | name: "invalid XML",
207 | args: args{
208 | file: createMockZipFile(`
209 |
210 |
211 | Unclosed paragraph
212 |
213 | `),
214 | },
215 | want: nil,
216 | wantErr: true,
217 | },
218 | }
219 | for _, tt := range tests {
220 | t.Run(tt.name, func(t *testing.T) {
221 | got, err := ExtractTextFromXML(tt.args.file)
222 | if (err != nil) != tt.wantErr {
223 | t.Errorf("ExtractTextFromXML() error = %v, wantErr %v", err, tt.wantErr)
224 | return
225 | }
226 | if !reflect.DeepEqual(got, tt.want) {
227 | t.Errorf("ExtractTextFromXML() = %v, want %v", got, tt.want)
228 | }
229 | })
230 | }
231 | }
232 |
233 | func TestOpenFile(t *testing.T) {
234 | type args struct {
235 | filePath string
236 | }
237 | tests := []struct {
238 | name string
239 | args args
240 | want *os.File
241 | wantErr bool
242 | }{
243 | {
244 | name: "valid file",
245 | args: args{
246 | filePath: "testdata/test.pdf",
247 | },
248 | want: nil,
249 | wantErr: true,
250 | },
251 | }
252 | for _, tt := range tests {
253 | t.Run(tt.name, func(t *testing.T) {
254 | got, err := OpenFile(tt.args.filePath)
255 | if (err != nil) != tt.wantErr {
256 | t.Errorf("OpenFile() error = %v, wantErr %v", err, tt.wantErr)
257 | return
258 | }
259 | if !reflect.DeepEqual(got, tt.want) {
260 | t.Errorf("OpenFile() = %v, want %v", got, tt.want)
261 | }
262 | })
263 | }
264 | }
265 |
266 | func TestGetFileContentType(t *testing.T) {
267 | tempDir := t.TempDir()
268 | testHTMLPath := filepath.Join(tempDir, "test.html")
269 |
270 | err := os.WriteFile(testHTMLPath, []byte("html content"), 0644)
271 | if err != nil {
272 | t.Fatalf("failed to create test html file: %v", err)
273 | }
274 |
275 | filepath, _ := OpenFile(testHTMLPath)
276 | type args struct {
277 | file *os.File
278 | }
279 | tests := []struct {
280 | name string
281 | args args
282 | want string
283 | }{
284 | {
285 | name: "Test 1",
286 | args: args{
287 | file: filepath,
288 | },
289 | want: "text/html; charset=utf-8",
290 | },
291 | }
292 | for _, tt := range tests {
293 | t.Run(tt.name, func(t *testing.T) {
294 | if got := GetFileContentType(tt.args.file); got != tt.want {
295 | t.Errorf("GetFileContentType() = %v, want %v", got, tt.want)
296 | }
297 | })
298 | }
299 | }
300 |
--------------------------------------------------------------------------------
/testdata/audio/test.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.flac
--------------------------------------------------------------------------------
/testdata/audio/test.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.mp3
--------------------------------------------------------------------------------
/testdata/audio/test.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.ogg
--------------------------------------------------------------------------------
/testdata/audio/test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.wav
--------------------------------------------------------------------------------
/testdata/files/test.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.epub
--------------------------------------------------------------------------------
/testdata/files/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.pdf
--------------------------------------------------------------------------------