├── .github
    └── workflows
    │   ├── ci.yml
    │   └── go.yml
├── .gitignore
├── LICENSE
├── README.md
├── TODO.md
├── assets
    └── gopher-eating.svg
├── chew.go
├── chew_test.go
├── cmd
    └── chew
    │   └── wrapper.go
├── codecov.yml
├── docs
    ├── golang.md
    ├── python.md
    ├── ruby.md
    └── setup.md
├── examples
    ├── main.go
    ├── main.py
    ├── main.rb
    └── transcription
    │   ├── google.go
    │   └── whisper.go
├── go.mod
├── go.sum
├── internal
    ├── audio
    │   ├── flac.go
    │   ├── flac_test.go
    │   ├── mp3.go
    │   ├── mp3_test.go
    │   ├── processor.go
    │   ├── processor_test.go
    │   ├── types.go
    │   ├── wav.go
    │   └── wav_test.go
    ├── common
    │   └── types.go
    ├── document
    │   ├── docx.go
    │   ├── docx_test.go
    │   ├── epub.go
    │   ├── epub_test.go
    │   ├── pdf.go
    │   ├── pdf_test.go
    │   ├── pptx.go
    │   └── pptx_test.go
    ├── text
    │   ├── csv.go
    │   ├── csv_test.go
    │   ├── html.go
    │   ├── html_test.go
    │   ├── json.go
    │   ├── json_test.go
    │   ├── markdown.go
    │   ├── plaintext.go
    │   ├── plaintext_test.go
    │   ├── xml.go
    │   ├── xml_test.go
    │   ├── yaml.go
    │   └── yaml_test.go
    ├── transcribe
    │   ├── google_transcriber.go
    │   ├── google_transcriber_test.go
    │   ├── transcribe.go
    │   ├── transcribe_test.go
    │   ├── types.go
    │   ├── whisper.go
    │   └── whisper_test.go
    └── utils
    │   ├── gcs
    │       ├── gcs_utils.go
    │       └── gcs_utils_test.go
    │   ├── utils.go
    │   └── utils_test.go
└── testdata
    ├── audio
        ├── test.flac
        ├── test.mp3
        ├── test.ogg
        └── test.wav
    └── files
        ├── test.epub
        └── test.pdf


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: Workflow for Codecov
 2 | on: [push, pull_request]
 3 | jobs:
 4 |   run:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - name: Set up Go
 8 |         uses: actions/setup-go@v5
 9 |         with:
10 |           go-version: '1.23'
11 |         id: go
12 | 
13 |       - name: Check out code into the Go module directory
14 |         uses: actions/checkout@v4
15 | 
16 |       - name: Get dependencies
17 |         run: |
18 |           go get -v -t -d ./...
19 |           if [ -f Gopkg.toml ]; then
20 |               curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
21 |               dep ensure
22 |           fi
23 | 
24 |       - name: Generate coverage report
25 |         run: |
26 |           go test `go list ./... | grep -v -E 'docs|cmd|examples'` -coverprofile=coverage.txt -covermode=atomic
27 | 
28 |       - name: Upload coverage to Codecov
29 |         uses: codecov/codecov-action@v4
30 |         with:
31 |           verbose: true
32 |         env:
33 |           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
34 | 


--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "master" ]
 6 |   pull_request:
 7 |     branches: [ "master" ]
 8 | 
 9 | jobs:
10 | 
11 |   test:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |     - uses: actions/checkout@v4
15 | 
16 |     - name: Set up Go
17 |       uses: actions/setup-go@v4
18 |       with:
19 |         go-version: '1.22'
20 | 
21 |     - name: Test
22 |       run: go test -v -race ./... -cover -covermode=atomic
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | audio
3 | *.json
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Daniel M. Matongo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <img
  3 |     width=40%
  4 |     src="https://raw.githubusercontent.com/mmatongo/chew/master/assets/gopher-eating.svg"
  5 |     alt="chew logo"
  6 | />
  7 | 
  8 | [![Go Report Card](https://goreportcard.com/badge/github.com/mmatongo/chew)](https://goreportcard.com/report/github.com/mmatongo/chew)
  9 | [![GoDoc](https://godoc.org/github.com/mmatongo/chew?status.svg)](https://pkg.go.dev/github.com/mmatongo/chew)
 10 | [![Maintainability](https://api.codeclimate.com/v1/badges/441cfd36f310c0c48878/maintainability)](https://codeclimate.com/github/mmatongo/chew/maintainability)
 11 | [![codecov](https://codecov.io/github/mmatongo/chew/graph/badge.svg?token=6OOK91QQRC)](https://codecov.io/github/mmatongo/chew)
 12 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE)
 13 | </div>
 14 | 
 15 | > <p align="center">A Go library for processing various content types into markdown/plaintext.</p>
 16 | 
 17 | ## About <a id="about"></a>
 18 | 
 19 | *Chew* is a Go library that processes various content types into markdown or plaintext. It supports multiple content types, including HTML, XML, PDF, EPUB, CSV, JSON, YAML, DOCX, PPTX, Markdown, Plaintext, MP3, FLAC, and WAVE.
 20 | 
 21 | ## Installation <a id="installation"></a>
 22 | 
 23 | ```bash
 24 | go get github.com/mmatongo/chew
 25 | ```
 26 | 
 27 | ## Usage <a id="usage"></a>
 28 | 
 29 | Here's a basic example of how to use Chew:
 30 | 
 31 | ```go
 32 | package main
 33 | 
 34 | import (
 35 | 	"context"
 36 | 	"fmt"
 37 | 	"log"
 38 | 	"time"
 39 | 
 40 | 	"github.com/mmatongo/chew/v1"
 41 | )
 42 | 
 43 | func main() {
 44 | 	urls := []string{
 45 | 		"https://example.com",
 46 | 	}
 47 | 
 48 | 	config := chew.Config{
 49 | 		UserAgent:       "Chew/1.0 (+https://github.com/mmatongo/chew)",
 50 | 		RetryLimit:      3,
 51 | 		RetryDelay:      5 * time.Second,
 52 | 		CrawlDelay:      10 * time.Second,
 53 | 		ProxyList:       []string{}, // Add your proxies here, or leave empty
 54 | 		RateLimit:       2 * time.Second,
 55 | 		RateBurst:       3,
 56 | 		IgnoreRobotsTxt: false,
 57 | 	}
 58 | 
 59 | 	haChew := chew.New(config)
 60 | 
 61 | 	// The context is optional, but can be used to cancel the operation after a certain time
 62 | 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 63 | 	defer cancel()
 64 | 
 65 | 	chunks, err := haChew.Process(ctx, urls)
 66 | 	if err != nil {
 67 | 		if err == context.DeadlineExceeded {
 68 | 			log.Println("Operation timed out")
 69 | 		} else {
 70 | 			log.Printf("Error processing URLs: %v", err)
 71 | 		}
 72 | 		return
 73 | 	}
 74 | 
 75 | 	for _, chunk := range chunks {
 76 | 		fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
 77 | 	}
 78 | }
 79 | ```
 80 | 
 81 | Output
 82 | 
 83 | ```bash
 84 | Source: https://example.com
 85 | Content: Example Domain
 86 | 
 87 | Source: https://example.com
 88 | Content: This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.
 89 | 
 90 | Source: https://example.com
 91 | Content: More information...
 92 | ```
 93 | 
 94 | You can find more examples in the [examples](./examples) directory as well as instructions on how to use Chew with Ruby and Python.
 95 | 
 96 | ## Contributing <a id="contributing"></a>
 97 | 
 98 | Contributions are welcome! Feel free to open an issue or submit a pull request if you have any suggestions or improvements.
 99 | 
100 | ## License <a id="license"></a>
101 | 
102 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.
103 | 
104 | ### Logo <a id="logo"></a>
105 | 
106 | The [logo](https://github.com/MariaLetta/free-gophers-pack) was made by the amazing [MariaLetta](https://github.com/MariaLetta).
107 | 
108 | 
109 | ### Similar Projects <a id="similar_projects"></a>
110 | [docconv](https://github.com/sajari/docconv)
111 | 
112 | ### Roadmap <a id="roadmap"></a>
113 | The roadmap for this project is available [here](./TODO.md). It's meant more as a guide than a strict plan because I only work on this project in my free time.
114 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ### TODO
 3 | ---
 4 | 
 5 | - [x] Add tests
 6 | - [ ] Improve error handling
 7 | - [x] Add support for more content types
 8 | - [x] Implement rate limiting for URL fetching
 9 | - [x] Use a free PDF processing library
10 | - [x] How to handle text/plain content type
11 | - [x] Add transcription support
12 | - [x] Customisable user agent
13 | - [ ] Allow users to choose what to target in the HTML, i.e. body, title, etc
14 | - [ ] More examples, documentation and use cases
15 | - [ ] Improve PPTX and DOCX processing (currently using a hacky method I cobbled together from various sources)
16 | - [ ] Use a common interface for all content types
17 | 


--------------------------------------------------------------------------------
/chew.go:
--------------------------------------------------------------------------------
  1 | /*
  2 | Package chew provides a simple way to process URLs and files. It allows you to process a list of URLs
  3 | and files, and returns the content of the URLs and files as a list of Chunks. It also provides a way to
  4 | transcribe audio files using the Google Cloud Speech-to-Text API or the OpenAI Whisper API.
  5 | 
  6 | The library respects the rules defined in robots.txt files and crawl delays, and allows you to set a custom http.Client for making requests.
  7 | 
  8 | Note on Responsible Usage:
  9 | 
 10 | This library is designed for processing data from both local files and web sources. Users should be aware of the following considerations:
 11 | 
 12 | 1. Web Scraping:
 13 |   - When scraping websites, ensure compliance with the target website's terms of service and robots.txt rules.
 14 |   - Respect rate limits and crawl delays to avoid overwhelming target servers.
 15 |   - Be aware that web scraping may be subject to legal restrictions in some jurisdictions.
 16 |   - While the library will attempt to respect robots.txt rules by default, users are responsible for ensuring
 17 |     that their usage complies with the target website's terms of service and legal requirements.
 18 | 
 19 | 2. File Processing:
 20 |   - Exercise caution when processing files from untrusted sources.
 21 |   - Ensure you have appropriate permissions to access and process the files.
 22 |   - Be mindful of potential sensitive information in processed files and handle it securely.
 23 | 
 24 | 3. Data Handling:
 25 |   - Properly secure and manage any data extracted or processed using this library, especially if it contains personal or sensitive information.
 26 |   - Comply with relevant data protection regulations (e.g., GDPR, CCPA) when handling personal data.
 27 | 
 28 | 4. System Resource Usage:
 29 |   - Be aware that processing large files or numerous web pages can be resource-intensive. Monitor and manage system resources accordingly.
 30 | 
 31 | 5. Have Fun
 32 | 
 33 | Users of this library are responsible for ensuring their usage complies with applicable laws, regulations, and ethical considerations in their jurisdiction and context of use.
 34 | */
 35 | package chew
 36 | 
 37 | import (
 38 | 	"context"
 39 | 	"fmt"
 40 | 	"io"
 41 | 	"net/http"
 42 | 	"net/url"
 43 | 	"strings"
 44 | 	"sync"
 45 | 	"time"
 46 | 
 47 | 	"github.com/mmatongo/chew/v1/internal/common"
 48 | 	"github.com/mmatongo/chew/v1/internal/document"
 49 | 	"github.com/mmatongo/chew/v1/internal/text"
 50 | 	"github.com/mmatongo/chew/v1/internal/transcribe"
 51 | 	"github.com/mmatongo/chew/v1/internal/utils"
 52 | 	"github.com/temoto/robotstxt"
 53 | 	"golang.org/x/time/rate"
 54 | )
 55 | 
 56 | const (
 57 | 	contentTypeHTML     = "text/html"
 58 | 	contentTypeText     = "text/plain"
 59 | 	contentTypeXML      = "application/xml"
 60 | 	contentTypeTextXML  = "text/xml"
 61 | 	contentTypePDF      = "application/pdf"
 62 | 	contentTypeCSV      = "text/csv"
 63 | 	contentTypeJSON     = "application/json"
 64 | 	contentTypeYAML     = "application/x-yaml"
 65 | 	contentTypeMarkdown = "text/markdown"
 66 | 	contentTypeEPUB     = "application/epub+zip"
 67 | 	contentTypeDocx     = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
 68 | 	contentTypePptx     = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 69 | )
 70 | 
 71 | var contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){
 72 | 	contentTypeHTML:     text.ProcessHTML,
 73 | 	contentTypeCSV:      text.ProcessCSV,
 74 | 	contentTypeJSON:     text.ProcessJSON,
 75 | 	contentTypeYAML:     text.ProcessYAML,
 76 | 	contentTypeMarkdown: text.ProcessText,
 77 | 	contentTypeText:     text.ProcessText,
 78 | 	contentTypeXML:      text.ProcessXML,
 79 | 	contentTypeTextXML:  text.ProcessXML,
 80 | 	contentTypeDocx:     document.ProcessDocx,
 81 | 	contentTypePptx:     document.ProcessPptx,
 82 | 	contentTypePDF:      document.ProcessPDF,
 83 | 	contentTypeEPUB:     document.ProcessEpub,
 84 | }
 85 | 
 86 | type Chew struct {
 87 | 	config        common.Config
 88 | 	httpClient    *http.Client
 89 | 	rateLimiter   RateLimiter
 90 | 	rateLimiterMu sync.RWMutex
 91 | 	robotsCache   map[string]*robotstxt.RobotsData
 92 | 	robotsMu      sync.RWMutex
 93 | 	lastAccess    map[string]time.Time
 94 | 	lastAccessMu  sync.Mutex
 95 | 	proxyIndex    int
 96 | 	proxyMu       sync.Mutex
 97 | }
 98 | 
 99 | type RateLimiter interface {
100 | 	Wait(context.Context) error
101 | }
102 | 
103 | func (c *Chew) SetRateLimiter(rl RateLimiter) {
104 | 	c.rateLimiterMu.Lock()
105 | 	defer c.rateLimiterMu.Unlock()
106 | 	c.rateLimiter = rl
107 | }
108 | 
109 | /*
110 | NewConfig allows you to set the configuration options for URL processing. It takes a Config struct.
111 | 
112 | Usage:
113 | 
114 | 	config := chew.Config{
115 | 		UserAgent:       "MyBot/1.0 (+https://example.com/bot)",
116 | 		RetryLimit:      3,
117 | 		RetryDelay:      5 * time.Second,
118 | 		CrawlDelay:      10 * time.Second,
119 | 		ProxyList:       []string{"http://proxy1.com", "http://proxy2.com"},
120 | 		RateLimit:       2 * time.Second,
121 | 		RateBurst:       3,
122 | 		IgnoreRobotsTxt: false,
123 | 	}
124 | 
125 | 	chew.NewConfig(config)
126 | */
127 | func New(config common.Config) *Chew {
128 | 	c := &Chew{
129 | 		config:      config,
130 | 		robotsCache: make(map[string]*robotstxt.RobotsData),
131 | 		lastAccess:  make(map[string]time.Time),
132 | 	}
133 | 	c.initHTTPClient()
134 | 
135 | 	limit := rate.Every(config.RateLimit)
136 | 	c.rateLimiter = rate.NewLimiter(limit, config.RateBurst)
137 | 
138 | 	return c
139 | }
140 | 
141 | /*
142 | Transcribe is a function that transcribes audio files using either the Google Cloud Speech-to-Text API
143 | or the Whisper API. It handles uploading the audio file to Google Cloud Storage if necessary,
144 | manages the transcription process, and returns the resulting transcript.
145 | 
146 | For detailed usage instructions, see the TranscribeOptions struct documentation.
147 | */
148 | var Transcribe = transcribe.Transcribe
149 | 
150 | /*
151 | The TranscribeOptions struct contains the options for transcribing an audio file. It allows the user
152 | to specify the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code
153 | to use for transcription, an option to enable diarization (the process of separating and labeling
154 | speakers in an audio stream) including the min and max speakers and
155 | an option to clean up the audio file from Google Cloud Storage (GCS) after transcription is complete.
156 | 
157 | And also, it allows the user to specify whether to use the Whisper API for transcription, and if so,
158 | the API key, model, and prompt to use.
159 | 
160 | Usage:
161 | 
162 | 	opts := chew.TranscribeOptions{
163 | 		CredentialsJSON:   []byte("..."),
164 | 		Bucket:            "my-bucket",
165 | 		LanguageCode:      "en-US",
166 | 		EnableDiarization: true,
167 | 		MinSpeakers:       2,
168 | 		MaxSpeakers:       4,
169 | 		CleanupOnComplete: true,
170 | 		UseWhisper:        true, // You can only have one of these enabled, by default it uses the Google Cloud Speech-to-Text API
171 | 		WhisperAPIKey:     "my-whisper-api-key",
172 | 		WhisperModel:      "whisper-1",
173 | 	}
174 | */
175 | type TranscribeOptions = transcribe.TranscribeOptions
176 | 
177 | /*
178 | Config struct contains the configuration options for URL processing.
179 | 
180 | Fields:
181 |   - UserAgent: The user agent string to use for requests (e.g., "MyBot/1.0 (+https://example.com/bot)")
182 |   - RetryLimit: Number of retries to attempt in case of failure (e.g., 3)
183 |   - RetryDelay: Delay between retries (e.g., 5 * time.Second)
184 |   - CrawlDelay: Delay between requests to the same domain (e.g., 10 * time.Second)
185 |   - ProxyList: List of proxy URLs to use for requests (e.g., []string{"http://proxy1.com", "http://proxy2.com"})
186 |   - RateLimit: Minimum interval between requests, as a time.Duration (e.g., 2 * time.Second)
187 |   - RateBurst: Maximum burst size for rate limiting (e.g., 3)
188 |   - IgnoreRobotsTxt: Whether to ignore robots.txt rules (e.g., false)
189 | 
190 | Usage:
191 | 
192 | 	config := chew.Config{
193 | 	    UserAgent:       "MyBot/1.0 (+https://example.com/bot)",
194 | 	    RetryLimit:      3,
195 | 	    RetryDelay:      5 * time.Second,
196 | 	    CrawlDelay:      10 * time.Second,
197 | 	    ProxyList:       []string{"http://proxy1.com", "http://proxy2.com"},
198 | 	    RateLimit:       2 * time.Second,
199 | 	    RateBurst:       3,
200 | 	    IgnoreRobotsTxt: false,
201 | 	}
202 | */
203 | type Config = common.Config
204 | 
205 | /*
206 | This is meant as a fallback in case the content type is not recognized and to enforce
207 | the content type based on the file extension instead of the content type
208 | returned by the server. i.e. if the server returns text/plain but the file is a markdown file
209 | the content types are the biggest culprits of this
210 | */
211 | var validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){
212 | 	".md":   text.ProcessText,
213 | 	".csv":  text.ProcessCSV,
214 | 	".json": text.ProcessJSON,
215 | 	".yaml": text.ProcessYAML,
216 | 	".html": text.ProcessHTML,
217 | 	".epub": document.ProcessEpub,
218 | }
219 | 
220 | /*
221 | SetHTTPClient allows you to set a custom http.Client to use for making requests.
222 | 
223 | This would be useful in the event custom logging, tracing, or other functionality is
224 | required for the requests made by the library.
225 | 
226 | Usage:
227 | 
228 | 	client := &http.Client{
229 | 		Transport: loggingRoundTripper{wrapped: http.DefaultTransport},
230 | 	}
231 | 
232 | 	chew.SetHTTPClient(client)
233 | */
234 | 
235 | func (c *Chew) SetHTTPClient(client *http.Client) {
236 | 	c.httpClient = client
237 | }
238 | 
239 | func (c *Chew) initHTTPClient() {
240 | 	transport := &http.Transport{
241 | 		Proxy: c.getProxy,
242 | 	}
243 | 	c.httpClient = &http.Client{
244 | 		Timeout:   30 * time.Second,
245 | 		Transport: transport,
246 | 	}
247 | }
248 | 
249 | /*
250 | For content types that can also return text/plain as their content types we need to manually check
251 | their extension to properly process them. I feel like this could be done better but this is my solution for now.
252 | */
253 | func getProcessor(contentType, url string) (func(io.Reader, string) ([]common.Chunk, error), error) {
254 | 	for key, proc := range contentTypeProcessors {
255 | 		if strings.Contains(contentType, key) {
256 | 			return proc, nil
257 | 		}
258 | 	}
259 | 
260 | 	ext, err := utils.GetFileExtension(url)
261 | 	if err != nil {
262 | 		return nil, fmt.Errorf("couldn't get file extension from url %s: %s", url, err)
263 | 	}
264 | 
265 | 	if proc, ok := validExtensions[ext]; ok {
266 | 		return proc, nil
267 | 	}
268 | 
269 | 	return nil, fmt.Errorf("unsupported content type: %s", contentType)
270 | }
271 | 
272 | /*
273 | Process takes a list of URLs and returns a list of Chunks
274 | 
275 | The slice of strings to be processed can be URLs or file paths
276 | The context is optional and can be used to cancel the processing
277 | of the URLs after a certain amount of time
278 | 
279 | This function is safe for concurrent use.
280 | 
281 | Usage:
282 | 
283 | 	chunks, err := chew.Process([]string{"https://example.com", "file://path/to/file.txt"})
284 | 	if err != nil {
285 | 		log.Fatalf("Error processing URLs: %v", err)
286 | 	}
287 | 
288 | 	for _, chunk := range chunks {
289 | 		log.Printf("Chunk: %s\n Source: %s\n", chunk.Content, chunk.Source)
290 | 	}
291 | */
292 | func (c *Chew) Process(ctx context.Context, urls []string) ([]common.Chunk, error) {
293 | 	var (
294 | 		result []common.Chunk
295 | 		mu     sync.Mutex
296 | 		errCh  = make(chan error, len(urls))
297 | 		resCh  = make(chan []common.Chunk, len(urls))
298 | 	)
299 | 
300 | 	for _, url := range urls {
301 | 		go func(url string) {
302 | 			select {
303 | 			case <-ctx.Done():
304 | 				errCh <- ctx.Err()
305 | 				return
306 | 			default:
307 | 				c.rateLimiterMu.RLock()
308 | 				rateLimiter := c.rateLimiter
309 | 				c.rateLimiterMu.RUnlock()
310 | 
311 | 				if err := rateLimiter.Wait(ctx); err != nil {
312 | 					errCh <- fmt.Errorf("rate limit exceeded for %s: %w", url, err)
313 | 					return
314 | 				}
315 | 
316 | 				if !c.config.IgnoreRobotsTxt {
317 | 					allowed, crawlDelay, err := c.getRobotsTxtInfo(url)
318 | 					if err != nil {
319 | 						errCh <- fmt.Errorf("checking robots.txt for %s: %w", url, err)
320 | 						return
321 | 					}
322 | 					if !allowed {
323 | 						errCh <- fmt.Errorf("access to %s is disallowed by robots.txt", url)
324 | 						return
325 | 					}
326 | 					if err := c.respectCrawlDelay(ctx, url, crawlDelay); err != nil {
327 | 						errCh <- fmt.Errorf("respecting crawl delay for %s: %w", url, err)
328 | 						return
329 | 					}
330 | 				}
331 | 
332 | 				chunks, err := c.processWithRetry(ctx, url)
333 | 				if err != nil {
334 | 					errCh <- fmt.Errorf("processing %s: %w", url, err)
335 | 					return
336 | 				}
337 | 
338 | 				resCh <- chunks
339 | 			}
340 | 		}(url)
341 | 	}
342 | 
343 | 	for i := 0; i < len(urls); i++ {
344 | 		select {
345 | 		case <-ctx.Done():
346 | 			return nil, ctx.Err()
347 | 		case err := <-errCh:
348 | 			return nil, err
349 | 		case chunks := <-resCh:
350 | 			mu.Lock()
351 | 			result = append(result, chunks...)
352 | 			mu.Unlock()
353 | 		}
354 | 	}
355 | 
356 | 	return result, nil
357 | }
358 | 
359 | /*
360 | processURL handles the actual processing of a single URL or file
361 | file paths are processed directly while URLs are fetched and processed
362 | */
363 | func (c *Chew) processURL(ctx context.Context, url string) ([]common.Chunk, error) {
364 | 	// if the url is a file path we can just open the file and process it directly
365 | 	if filePath, found := strings.CutPrefix(url, "file://"); found {
366 | 		file, err := utils.OpenFile(filePath)
367 | 		if err != nil {
368 | 			return nil, fmt.Errorf("opening file: %w", err)
369 | 		}
370 | 		defer file.Close()
371 | 
372 | 		ext, _ := utils.GetFileExtension(filePath)
373 | 		/*
374 | 			Will leave this in here for now, but I think it's better to just check the file extension
375 | 			instead of the content type returned.
376 | 		*/
377 | 		contentType := utils.GetFileContentType(file)
378 | 
379 | 		proc, err := getProcessor(contentType, filePath)
380 | 		if err != nil {
381 | 			proc, ok := validExtensions[ext]
382 | 			if !ok {
383 | 				return nil, fmt.Errorf("unsupported file type: %s", ext)
384 | 			}
385 | 			return proc(file, url)
386 | 		}
387 | 
388 | 		return proc(file, url)
389 | 	}
390 | 
391 | 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
392 | 	if err != nil {
393 | 		return nil, fmt.Errorf("creating request: %w", err)
394 | 	}
395 | 
396 | 	req.Header.Set("User-Agent", c.config.UserAgent)
397 | 
398 | 	resp, err := c.httpClient.Do(req)
399 | 	if err != nil {
400 | 		return nil, fmt.Errorf("making request: %w", err)
401 | 	}
402 | 	defer resp.Body.Close()
403 | 
404 | 	contentType := resp.Header.Get("Content-Type")
405 | 
406 | 	processor, err := getProcessor(contentType, url)
407 | 	if err != nil {
408 | 		return nil, err
409 | 	}
410 | 
411 | 	return processor(resp.Body, url)
412 | }
413 | 
414 | func (c *Chew) getRobotsTxtInfo(urlStr string) (bool, time.Duration, error) {
415 | 	parsedURL, err := url.Parse(urlStr)
416 | 	if err != nil {
417 | 		return false, 0, err
418 | 	}
419 | 
420 | 	robotsURL := fmt.Sprintf("%s://%s/robots.txt", parsedURL.Scheme, parsedURL.Host)
421 | 
422 | 	c.robotsMu.RLock()
423 | 	robotsData, exists := c.robotsCache[robotsURL]
424 | 	c.robotsMu.RUnlock()
425 | 
426 | 	if !exists {
427 | 		resp, err := http.Get(robotsURL)
428 | 		if err != nil {
429 | 			return true, c.config.CrawlDelay, nil
430 | 		}
431 | 		defer resp.Body.Close()
432 | 
433 | 		robotsData, err = robotstxt.FromResponse(resp)
434 | 		if err != nil {
435 | 			return true, c.config.CrawlDelay, nil
436 | 		}
437 | 
438 | 		c.robotsMu.Lock()
439 | 		c.robotsCache[robotsURL] = robotsData
440 | 		c.robotsMu.Unlock()
441 | 	}
442 | 
443 | 	allowed := robotsData.TestAgent(parsedURL.Path, c.config.UserAgent)
444 | 
445 | 	return allowed, c.config.CrawlDelay, nil
446 | }
447 | 
448 | // respectCrawlDelay ensures that subsequent requests to the same domain respect the specified crawl delay.
449 | func (c *Chew) respectCrawlDelay(ctx context.Context, urlStr string, delay time.Duration) error {
450 | 	parsedURL, err := url.Parse(urlStr)
451 | 	if err != nil {
452 | 		return err
453 | 	}
454 | 
455 | 	domain := parsedURL.Hostname()
456 | 
457 | 	c.lastAccessMu.Lock()
458 | 	lastAccess, exists := c.lastAccess[domain]
459 | 	if exists {
460 | 		timeToWait := time.Until(lastAccess.Add(delay))
461 | 		if timeToWait > 0 {
462 | 			c.lastAccessMu.Unlock()
463 | 			select {
464 | 			case <-time.After(timeToWait):
465 | 			case <-ctx.Done():
466 | 				return ctx.Err()
467 | 			}
468 | 			c.lastAccessMu.Lock()
469 | 		}
470 | 	}
471 | 
472 | 	c.lastAccess[domain] = time.Now()
473 | 	c.lastAccessMu.Unlock()
474 | 	return nil
475 | }
476 | 
477 | func (c *Chew) processWithRetry(ctx context.Context, url string) ([]common.Chunk, error) {
478 | 	var (
479 | 		chunks []common.Chunk
480 | 		err    error
481 | 	)
482 | 
483 | 	var retries int
484 | 	for {
485 | 		chunks, err = c.processURL(ctx, url)
486 | 		if err == nil {
487 | 			return chunks, nil
488 | 		}
489 | 		if retries > c.config.RetryLimit {
490 | 			break
491 | 		}
492 | 		retries++
493 | 		c.wait(ctx, c.config.RetryDelay)
494 | 	}
495 | 
496 | 	return nil, err
497 | }
498 | 
499 | func (c *Chew) wait(ctx context.Context, d time.Duration) {
500 | 	select {
501 | 	case <-time.After(d):
502 | 	case <-ctx.Done():
503 | 	}
504 | }
505 | 
506 | func (c *Chew) getProxy(req *http.Request) (*url.URL, error) {
507 | 	c.proxyMu.Lock()
508 | 	defer c.proxyMu.Unlock()
509 | 
510 | 	if len(c.config.ProxyList) == 0 {
511 | 		return nil, nil
512 | 	}
513 | 
514 | 	proxyURL, err := url.Parse(c.config.ProxyList[c.proxyIndex])
515 | 	if err != nil {
516 | 		return nil, err
517 | 	}
518 | 
519 | 	c.proxyIndex = (c.proxyIndex + 1) % len(c.config.ProxyList)
520 | 	return proxyURL, nil
521 | }
522 | 


--------------------------------------------------------------------------------
/chew_test.go:
--------------------------------------------------------------------------------
  1 | package chew
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"net/http"
  8 | 	"net/http/httptest"
  9 | 	"net/url"
 10 | 	"os"
 11 | 	"path/filepath"
 12 | 	"reflect"
 13 | 	"strings"
 14 | 	"testing"
 15 | 	"time"
 16 | 
 17 | 	"github.com/mmatongo/chew/v1/internal/common"
 18 | 	"github.com/mmatongo/chew/v1/internal/text"
 19 | 	"golang.org/x/time/rate"
 20 | )
 21 | 
 22 | func mockProcessor(r io.Reader, url string) ([]common.Chunk, error) {
 23 | 	content, err := io.ReadAll(r)
 24 | 	if err != nil {
 25 | 		return nil, err
 26 | 	}
 27 | 	return []common.Chunk{{Content: string(content), Source: url}}, nil
 28 | }
 29 | 
 30 | type mockTransport struct {
 31 | 	response *http.Response
 32 | 	err      error
 33 | }
 34 | 
 35 | func (m *mockTransport) RoundTrip(*http.Request) (*http.Response, error) {
 36 | 	return m.response, m.err
 37 | }
 38 | 
 39 | type mockRateLimiter struct {
 40 | 	waitErr error
 41 | }
 42 | 
 43 | func (m *mockRateLimiter) Wait(ctx context.Context) error {
 44 | 	return m.waitErr
 45 | }
 46 | 
 47 | func Test_processURL(t *testing.T) { // table-driven check of processURL for one HTTP URL and several file:// URLs
 48 | 	originalHTTPClient := http.DefaultClient // NOTE(review): saved and restored below, but never reassigned in this function
 49 | 	originalContentTypeProcessors := contentTypeProcessors
 50 | 	originalValidExtensions := validExtensions
 51 | 
 52 | 	defer func() { // restore the package-level tables that are replaced further down
 53 | 		http.DefaultClient = originalHTTPClient
 54 | 		contentTypeProcessors = originalContentTypeProcessors
 55 | 		validExtensions = originalValidExtensions
 56 | 	}()
 57 | 
 58 | 	mockClient := &http.Client{ // every request gets this canned 200 text/html response via mockTransport
 59 | 		Transport: &mockTransport{
 60 | 			response: &http.Response{
 61 | 				StatusCode: 200,
 62 | 				Body:       io.NopCloser(strings.NewReader("Test content")), // one reader shared by all requests, so it can be consumed only once
 63 | 				Header:     http.Header{"Content-Type": []string{"text/html"}},
 64 | 			},
 65 | 		},
 66 | 	}
 67 | 	chew := New(Config{})
 68 | 	ctx := context.Background()
 69 | 
 70 | 	chew.SetHTTPClient(mockClient)
 71 | 	defer chew.SetHTTPClient(nil)
 72 | 
 73 | 	contentTypeProcessors = map[string]func(io.Reader, string) ([]common.Chunk, error){ // route both content types to mockProcessor
 74 | 		"text/html":  mockProcessor,
 75 | 		"text/plain": mockProcessor,
 76 | 	}
 77 | 	validExtensions = map[string]func(io.Reader, string) ([]common.Chunk, error){ // route both extensions to mockProcessor
 78 | 		".html": mockProcessor,
 79 | 		".txt":  mockProcessor,
 80 | 	}
 81 | 
 82 | 	tempDir := t.TempDir() // on-disk fixtures for the file:// cases
 83 | 	testHTMLPath := filepath.Join(tempDir, "test.html")
 84 | 	testTXTPath := filepath.Join(tempDir, "test.txt")
 85 | 	testUnsupportedPath := filepath.Join(tempDir, "test.unsupported") // extension is absent from the validExtensions map above
 86 | 
 87 | 	err := os.WriteFile(testHTMLPath, []byte("html content"), 0644)
 88 | 	if err != nil {
 89 | 		t.Fatalf("failed to create test html file: %v", err)
 90 | 	}
 91 | 
 92 | 	err = os.WriteFile(testTXTPath, []byte("text content"), 0644)
 93 | 	if err != nil {
 94 | 		t.Fatalf("failed to create test text file: %v", err)
 95 | 	}
 96 | 
 97 | 	err = os.WriteFile(testUnsupportedPath, []byte("unsupported content"), 0644)
 98 | 	if err != nil {
 99 | 		t.Fatalf("failed to create test unsupported file: %v", err)
100 | 	}
101 | 
102 | 	tests := []struct { // presumably mockProcessor echoes its input as one chunk tagged with the URL — it is defined outside this view
103 | 		name    string
104 | 		url     string
105 | 		want    []common.Chunk
106 | 		wantErr bool
107 | 	}{
108 | 		{
109 | 			name:    "success",
110 | 			url:     "https://example.com/page.html",
111 | 			want:    []common.Chunk{{Content: "Test content", Source: "https://example.com/page.html"}},
112 | 			wantErr: false,
113 | 		},
114 | 		{
115 | 			name:    "success html",
116 | 			url:     "file://" + testHTMLPath,
117 | 			want:    []common.Chunk{{Content: "html content", Source: "file://" + testHTMLPath}},
118 | 			wantErr: false,
119 | 		},
120 | 		{
121 | 			name:    "success txt",
122 | 			url:     "file://" + testTXTPath,
123 | 			want:    []common.Chunk{{Content: "text content", Source: "file://" + testTXTPath}},
124 | 			wantErr: false,
125 | 		},
126 | 		{
127 | 			name:    "unsupported file type",
128 | 			url:     "file://" + testUnsupportedPath,
129 | 			want:    nil,
130 | 			wantErr: true,
131 | 		},
132 | 		{
133 | 			name:    "non-existent file",
134 | 			url:     "file:///non-existent.md",
135 | 			want:    nil,
136 | 			wantErr: true,
137 | 		},
138 | 	}
139 | 
140 | 	for _, tt := range tests {
141 | 		t.Run(tt.name, func(t *testing.T) {
142 | 			got, err := chew.processURL(ctx, tt.url)
143 | 			if (err != nil) != tt.wantErr {
144 | 				t.Errorf("processURL() error = %v, wantErr %v", err, tt.wantErr)
145 | 				return // the chunk comparison is skipped once the error expectation has failed
146 | 			}
147 | 			if !reflect.DeepEqual(got, tt.want) {
148 | 				t.Errorf("processURL() = %v, want %v", got, tt.want)
149 | 			}
150 | 		})
151 | 	}
152 | }
153 | 
154 | func Test_getProcessor(t *testing.T) { // checks which lookups in getProcessor succeed and which return an error
155 | 	type args struct {
156 | 		contentType string
157 | 		url         string
158 | 	}
159 | 	tests := []struct {
160 | 		name    string
161 | 		args    args
162 | 		want    func(io.Reader, string) ([]common.Chunk, error)
163 | 		wantErr bool
164 | 	}{
165 | 		{
166 | 			name: "success",
167 | 			args: args{
168 | 				contentType: "text/html",
169 | 				url:         "https://example.com/page.html",
170 | 			},
171 | 			want:    mockProcessor,
172 | 			wantErr: false,
173 | 		},
174 | 		{
175 | 			name: "unknown content type", // content type is not recognised; the expectation relies on the .html extension
176 | 			args: args{
177 | 				contentType: "octet/stream",
178 | 				url:         "https://example.com/page.html",
179 | 			},
180 | 			want:    text.ProcessHTML,
181 | 			wantErr: false,
182 | 		},
183 | 		{
184 | 			name: "unsupported content type",
185 | 			args: args{
186 | 				contentType: "application/octet-stream",
187 | 				url:         "https://example.com/page.htt",
188 | 			},
189 | 			want:    nil,
190 | 			wantErr: true,
191 | 		},
192 | 		{
193 | 			name: "no extension",
194 | 			args: args{
195 | 				contentType: "octet/stream",
196 | 				url:         "https://example.com/page",
197 | 			},
198 | 			want:    nil,
199 | 			wantErr: true,
200 | 		},
201 | 	}
202 | 	for _, tc := range tests {
203 | 		t.Run(tc.name, func(t *testing.T) {
204 | 			got, err := getProcessor(tc.args.contentType, tc.args.url)
205 | 			if (err != nil) != tc.wantErr {
206 | 				t.Errorf("getProcessor() error = %v, wantErr %v", err, tc.wantErr)
207 | 				return
208 | 			}
209 | 
210 | 			switch { // funcs cannot be compared with ==, so only nil-ness and the static type are checked
211 | 			case got == nil && tc.want != nil:
212 | 				t.Errorf("getProcessor() returned nil, want non-nil")
213 | 			case got != nil && tc.want == nil:
214 | 				t.Errorf("getProcessor() returned non-nil, want nil")
215 | 			case got != nil: // NOTE(review): a type match cannot tell two processors with the same signature apart
216 | 				gotType, wantType := reflect.TypeOf(got), reflect.TypeOf(tc.want)
217 | 				if gotType != wantType {
218 | 					t.Errorf("getProcessor() returned function of type %v, want %v", gotType, wantType)
219 | 				}
220 | 			}
221 | 		})
222 | 	}
223 | }
224 | 
225 | func TestProcess(t *testing.T) { // end-to-end test of Process against a local httptest server
226 | 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { // each path serves a fixed payload
227 | 		switch r.URL.Path {
228 | 		case "/robots.txt": // disallows /disallowed for every agent and advertises a crawl delay of 1
229 | 			w.Header().Set("Content-Type", "text/plain")
230 | 			w.Write([]byte("User-agent: *\nDisallow: /disallowed\nCrawl-delay: 1"))
231 | 		case "/text":
232 | 			w.Header().Set("Content-Type", "text/plain")
233 | 			w.Write([]byte("A plain text file."))
234 | 		case "/html":
235 | 			w.Header().Set("Content-Type", "text/html")
236 | 			w.Write([]byte("<html><body><p>An HTML file.</p></body></html>"))
237 | 		case "/markdown": // markdown source, but served with a text/plain content type
238 | 			w.Header().Set("Content-Type", "text/plain")
239 | 			w.Write([]byte("# A Markdown file"))
240 | 		case "/disallowed":
241 | 			w.Header().Set("Content-Type", "text/plain")
242 | 			w.Write([]byte("This page is disallowed by robots.txt"))
243 | 		case "/rate-limited": // deliberately slow: sleeps two seconds before answering
244 | 			time.Sleep(2 * time.Second)
245 | 			w.Write([]byte("Rate limited content"))
246 | 		}
247 | 	}))
248 | 	defer server.Close()
249 | 
250 | 	containsChunk := func(chunks []common.Chunk, chunk common.Chunk) bool { // membership test on Content+Source, used for order-independent cases
251 | 		for _, c := range chunks {
252 | 			if c.Content == chunk.Content && c.Source == chunk.Source {
253 | 				return true
254 | 			}
255 | 		}
256 | 		return false
257 | 	}
258 | 
259 | 	chew := New(Config{ // one instance shared by every subtest below
260 | 		IgnoreRobotsTxt: false,
261 | 		UserAgent:       "TestBot/1.0",
262 | 		RetryLimit:      3,
263 | 		RetryDelay:      100 * time.Millisecond,
264 | 		CrawlDelay:      1 * time.Second,
265 | 		RateLimit:       500 * time.Millisecond,
266 | 		RateBurst:       1,
267 | 	})
268 | 
269 | 	chew.httpClient.Timeout = 5 * time.Second
270 | 
271 | 	type args struct {
272 | 		urls []string
273 | 		ctxs []context.Context // only ctxs[0] is used by the loop below
274 | 	}
275 | 	tests := []struct {
276 | 		name             string
277 | 		args             args
278 | 		want             []common.Chunk
279 | 		wantErr          bool
280 | 		expectedErrText  string // substring the error must contain; empty means any error is accepted
281 | 		ignoreRobotsTxt  bool   // copied onto chew.config.IgnoreRobotsTxt for the duration of the subtest
282 | 		orderIndependent bool   // compare got/want as sets rather than with DeepEqual
283 | 		rateLimiter      RateLimiter // when nil, a fresh rate.Limiter built from the config is installed
284 | 	}{
285 | 		{
286 | 			name: "plain text",
287 | 			args: args{
288 | 				urls: []string{server.URL + "/text"},
289 | 			},
290 | 			want: []common.Chunk{
291 | 				{Content: "A plain text file.", Source: server.URL + "/text"},
292 | 			},
293 | 			wantErr: false,
294 | 		},
295 | 		{
296 | 			name: "HTML",
297 | 			args: args{
298 | 				urls: []string{server.URL + "/html"},
299 | 			},
300 | 			want: []common.Chunk{
301 | 				{Content: "An HTML file.", Source: server.URL + "/html"},
302 | 			},
303 | 			wantErr: false,
304 | 		},
305 | 		{
306 | 			name: "markdown",
307 | 			args: args{
308 | 				urls: []string{server.URL + "/markdown"},
309 | 			},
310 | 			want: []common.Chunk{
311 | 				{Content: "# A Markdown file", Source: server.URL + "/markdown"},
312 | 			},
313 | 			wantErr: false,
314 | 		},
315 | 		{
316 | 			name: "multiple URLs",
317 | 			args: args{
318 | 				urls: []string{server.URL + "/text", server.URL + "/html"},
319 | 			},
320 | 			want: []common.Chunk{
321 | 				{Content: "An HTML file.", Source: server.URL + "/html"},
322 | 				{Content: "A plain text file.", Source: server.URL + "/text"},
323 | 			},
324 | 			wantErr:          false,
325 | 			orderIndependent: true,
326 | 		},
327 | 		{
328 | 			name: "invalid URL",
329 | 			args: args{
330 | 				urls: []string{"ftp://invalid.url"},
331 | 			},
332 | 			want:    nil,
333 | 			wantErr: true,
334 | 		},
335 | 		{
336 | 			name: "context cancellation",
337 | 			args: args{
338 | 				urls: []string{server.URL + "/text"},
339 | 				ctxs: []context.Context{func() context.Context { // NOTE(review): this runs while the table is built, so the 50ms timer starts then, not when the subtest starts
340 | 					ctx, cancel := context.WithCancel(context.Background())
341 | 					go func() {
342 | 						time.Sleep(50 * time.Millisecond)
343 | 						cancel()
344 | 					}()
345 | 					return ctx
346 | 				}()},
347 | 			},
348 | 			want:            nil,
349 | 			wantErr:         true,
350 | 			expectedErrText: "context canceled",
351 | 		},
352 | 		{
353 | 			name: "with more than one context",
354 | 			args: args{
355 | 				urls: []string{server.URL + "/text"},
356 | 				ctxs: []context.Context{context.Background(), context.Background()},
357 | 			},
358 | 			want:    []common.Chunk{{Content: "A plain text file.", Source: server.URL + "/text"}},
359 | 			wantErr: false,
360 | 		},
361 | 		{
362 | 			name: "respects robots.txt",
363 | 			args: args{
364 | 				urls: []string{server.URL + "/disallowed"},
365 | 			},
366 | 			want:    nil,
367 | 			wantErr: true,
368 | 		},
369 | 		{
370 | 			name: "ignores robots.txt when configured",
371 | 			args: args{
372 | 				urls: []string{server.URL + "/disallowed"},
373 | 			},
374 | 			want: []common.Chunk{
375 | 				{Content: "This page is disallowed by robots.txt", Source: server.URL + "/disallowed"},
376 | 			},
377 | 			wantErr:         false,
378 | 			ignoreRobotsTxt: true,
379 | 		},
380 | 		{
381 | 			name: "robots.txt disallowed", // same request as "respects robots.txt", but also pins the error text
382 | 			args: args{
383 | 				urls: []string{server.URL + "/disallowed"},
384 | 			},
385 | 			want:            nil,
386 | 			wantErr:         true,
387 | 			expectedErrText: "access to",
388 | 		},
389 | 		{
390 | 			name: "respects crawl delay",
391 | 			args: args{
392 | 				urls: []string{server.URL + "/text", server.URL + "/html"},
393 | 			},
394 | 			want: []common.Chunk{
395 | 				{Content: "A plain text file.", Source: server.URL + "/text"},
396 | 				{Content: "An HTML file.", Source: server.URL + "/html"},
397 | 			},
398 | 			wantErr:          false,
399 | 			orderIndependent: true,
400 | 		},
401 | 		{
402 | 			name: "rate limiting error",
403 | 			args: args{
404 | 				urls: []string{server.URL + "/rate-limited", server.URL + "/rate-limited"},
405 | 			},
406 | 			want:            nil,
407 | 			wantErr:         true,
408 | 			expectedErrText: "rate limit exceeded",
409 | 			rateLimiter:     &mockRateLimiter{waitErr: fmt.Errorf("rate limit exceeded")}, // limiter stub whose Wait always fails
410 | 		},
411 | 		{
412 | 			name: "crawl delay respect error",
413 | 			args: args{
414 | 				urls: []string{server.URL + "/text", server.URL + "/text"},
415 | 				ctxs: []context.Context{func() context.Context {
416 | 					ctx, cancel := context.WithTimeout(context.Background(), 1500*time.Millisecond)
417 | 					defer cancel() // NOTE(review): fires as soon as this closure returns, so ctx is already canceled before Process is called
418 | 					return ctx
419 | 				}()},
420 | 			},
421 | 			want:            nil,
422 | 			wantErr:         true,
423 | 			expectedErrText: "context canceled",
424 | 		},
425 | 	}
426 | 
427 | 	for _, tt := range tests {
428 | 		t.Run(tt.name, func(t *testing.T) {
429 | 			oldState := chew.config.IgnoreRobotsTxt // per-case override on the shared instance, undone by the defer below
430 | 			chew.config.IgnoreRobotsTxt = tt.ignoreRobotsTxt
431 | 			defer func() { chew.config.IgnoreRobotsTxt = oldState }()
432 | 
433 | 			if tt.rateLimiter != nil {
434 | 				chew.SetRateLimiter(tt.rateLimiter)
435 | 			} else { // install a new limiter for this subtest instead of reusing the previous one
436 | 				chew.SetRateLimiter(rate.NewLimiter(rate.Every(chew.config.RateLimit), chew.config.RateBurst))
437 | 			}
438 | 
439 | 			ctx := context.Background()
440 | 			if len(tt.args.ctxs) > 0 {
441 | 				ctx = tt.args.ctxs[0]
442 | 			}
443 | 
444 | 			got, err := chew.Process(ctx, tt.args.urls)
445 | 
446 | 			if tt.wantErr {
447 | 				if err == nil {
448 | 					t.Errorf("Process() error = nil, wantErr %v", tt.wantErr)
449 | 					return
450 | 				}
451 | 				if tt.expectedErrText != "" && !strings.Contains(err.Error(), tt.expectedErrText) {
452 | 					t.Errorf("Process() error = %v, expectedErrText %v", err, tt.expectedErrText)
453 | 					return
454 | 				}
455 | 			} else {
456 | 				if err != nil {
457 | 					t.Errorf("Process() unexpected error: %v", err)
458 | 					return
459 | 				}
460 | 				if !tt.orderIndependent && !reflect.DeepEqual(got, tt.want) {
461 | 					t.Errorf("Process() = %v, want %v", got, tt.want)
462 | 				}
463 | 				if tt.orderIndependent { // same length plus every wanted chunk present
464 | 					if len(got) != len(tt.want) {
465 | 						t.Errorf("Process() returned %d chunks, want %d", len(got), len(tt.want))
466 | 					}
467 | 					for _, wantChunk := range tt.want {
468 | 						if !containsChunk(got, wantChunk) {
469 | 							t.Errorf("Process() did not return chunk %v", wantChunk)
470 | 						}
471 | 					}
472 | 				}
473 | 			}
474 | 		})
475 | 	}
476 | }
477 | 
478 | func Test_getProxy(t *testing.T) { // verifies the sequence of proxies getProxy hands out for a given ProxyList
479 | 	tests := []struct {
480 | 		name      string
481 | 		config    Config
482 | 		requests  int        // number of consecutive getProxy calls to make
483 | 		wantProxy []*url.URL // expected result of each call, in call order
484 | 	}{
485 | 		{
486 | 			name:      "no proxies",
487 | 			config:    Config{},
488 | 			requests:  1,
489 | 			wantProxy: []*url.URL{nil},
490 | 		},
491 | 		{
492 | 			name: "single proxy",
493 | 			config: Config{
494 | 				ProxyList: []string{"http://proxy1.example.com"},
495 | 			},
496 | 			requests:  2,
497 | 			wantProxy: []*url.URL{must(url.Parse("http://proxy1.example.com")), must(url.Parse("http://proxy1.example.com"))},
498 | 		},
499 | 		{
500 | 			name: "multiple proxies", // five calls over three proxies: the expected list wraps around to the first
501 | 			config: Config{
502 | 				ProxyList: []string{"http://proxy1.example.com", "http://proxy2.example.com", "http://proxy3.example.com"},
503 | 			},
504 | 			requests: 5,
505 | 			wantProxy: []*url.URL{
506 | 				must(url.Parse("http://proxy1.example.com")),
507 | 				must(url.Parse("http://proxy2.example.com")),
508 | 				must(url.Parse("http://proxy3.example.com")),
509 | 				must(url.Parse("http://proxy1.example.com")),
510 | 				must(url.Parse("http://proxy2.example.com")),
511 | 			},
512 | 		},
513 | 	}
514 | 
515 | 	for _, tc := range tests {
516 | 		t.Run(tc.name, func(t *testing.T) {
517 | 			client := New(tc.config)
518 | 			for _, want := range tc.wantProxy[:tc.requests] {
519 | 				proxyURL, err := client.getProxy(&http.Request{})
520 | 				if err != nil {
521 | 					t.Errorf("getProxy() error = %v", err)
522 | 					return
523 | 				}
524 | 				if !reflect.DeepEqual(proxyURL, want) {
525 | 					t.Errorf("getProxy() = %v, want %v", proxyURL, want)
526 | 				}
527 | 			}
528 | 		})
529 | 	}
530 | }
531 | 
532 | func must(u *url.URL, err error) *url.URL { // unwraps a (URL, error) pair so url.Parse can be used inside table literals
533 | 	if err == nil {
534 | 		return u
535 | 	}
536 | 	panic(err) // any non-nil error aborts via panic
537 | }
538 | 
539 | func TestRespectCrawlDelay(t *testing.T) { // the two cases share one instance and run in order, so the second hits the same URL again
540 | 	c := New(Config{})
541 | 	background := context.Background()
542 | 
543 | 	tests := []struct {
544 | 		name     string
545 | 		ctx      context.Context // never populated; every case runs with the background context above
546 | 		url      string
547 | 		delay    time.Duration
548 | 		wantWait bool // whether the call is expected to block for at least delay
549 | 	}{
550 | 		{
551 | 			name:     "first access",
552 | 			url:      "https://example.com",
553 | 			delay:    time.Second,
554 | 			wantWait: false,
555 | 		},
556 | 		{
557 | 			name:     "second access",
558 | 			url:      "https://example.com",
559 | 			delay:    time.Second,
560 | 			wantWait: true,
561 | 		},
562 | 	}
563 | 
564 | 	for _, tc := range tests {
565 | 		t.Run(tc.name, func(t *testing.T) {
566 | 			began := time.Now()
567 | 			err := c.respectCrawlDelay(background, tc.url, tc.delay)
568 | 			elapsed := time.Since(began)
569 | 
570 | 			if err != nil {
571 | 				t.Errorf("respectCrawlDelay() error = %v", err)
572 | 				return
573 | 			}
574 | 
575 | 			switch waited := elapsed >= tc.delay; {
576 | 			case tc.wantWait && !waited:
577 | 				t.Errorf("respectCrawlDelay() didn't wait long enough. Duration: %v, Expected: %v", elapsed, tc.delay)
578 | 			case !tc.wantWait && waited:
579 | 				t.Errorf("respectCrawlDelay() waited unnecessarily. Duration: %v", elapsed)
580 | 			}
581 | 		})
582 | 	}
583 | }
584 | 


--------------------------------------------------------------------------------
/cmd/chew/wrapper.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | /*
 4 | #include <stdlib.h>
 5 | */
 6 | import "C"
 7 | 
 8 | import (
 9 | 	"context"
10 | 	"fmt"
11 | 	"strings"
12 | 	"time"
13 | 	"unsafe"
14 | 
15 | 	"github.com/mmatongo/chew/v1"
16 | )
17 | 
18 | //export Process
19 | func Process(urls *C.char) *C.char { // C entry point: takes comma-separated URLs, returns a malloc'd C string the caller must release with FreeString
20 | 	urlsSlice := strings.Split(C.GoString(urls), ",")
21 | 
22 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) // hard 5s budget for the whole batch
23 | 	defer cancel()
24 | 
25 | 	c := chew.New(chew.Config{
26 | 		UserAgent:       "Chew/1.0 (+https://github.com/mmatongo/chew)",
27 | 		RetryLimit:      3,
28 | 		RetryDelay:      time.Second,
29 | 		CrawlDelay:      time.Second,
30 | 		RateLimit:       time.Second,
31 | 		RateBurst:       1,
32 | 		IgnoreRobotsTxt: false,
33 | 	})
34 | 
35 | 	chunks, err := c.Process(ctx, urlsSlice)
36 | 	if err != nil { // failures are reported in-band as the returned string; there is no separate error channel
37 | 		if err == context.DeadlineExceeded || ctx.Err() == context.DeadlineExceeded { // ctx.Err() also catches a timeout that came back wrapped in another error
38 | 			return C.CString("Operation timed out")
39 | 		}
40 | 		return C.CString(fmt.Sprintf("Error processing URLs: %v", err))
41 | 	}
42 | 
43 | 	var result strings.Builder
44 | 	for _, chunk := range chunks {
45 | 		fmt.Fprintf(&result, "Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content) // write straight into the builder, no intermediate string
46 | 	}
47 | 
48 | 	return C.CString(result.String())
49 | }
50 | 
51 | //export FreeString
52 | func FreeString(ptr *C.char) { // hands ptr to C.free; intended for strings returned by Process
53 | 	C.free(unsafe.Pointer(ptr))
54 | }
55 | 
56 | func main() {} // intentionally empty; NOTE(review): presumably present only because package main needs it for the c-shared build
57 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | coverage:  # display settings for the Codecov report
 2 |   precision: 2  # NOTE(review): presumably decimal places shown in percentages — confirm against Codecov docs
 3 |   round: up
 4 |   range: "70...100"  # NOTE(review): presumably the red-to-green colour band for coverage — confirm
 5 | 
 6 | ignore:  # paths left out of coverage: IDE files, docs, the cgo wrapper, fixtures, examples and assets
 7 |   - ".idea"
 8 |   - "docs"
 9 |   - "cmd"
10 |   - "testdata"
11 |   - "examples"
12 |   - "assets"
13 | 


--------------------------------------------------------------------------------
/docs/golang.md:
--------------------------------------------------------------------------------
 1 | Chew is native to Go and can be used as a library in your Go project with ease. Here is a simple example of how to use Chew in your Go project.
 2 | 
 3 | ```go
 4 | package main
 5 | 
 6 | import (
 7 | 	"context"
 8 | 	"fmt"
 9 | 	"log"
10 | 	"time"
11 | 
12 | 	"github.com/mmatongo/chew/v1"
13 | )
14 | 
15 | func main() {
16 | 	urls := []string{
17 | 		"https://example.com",
18 | 	}
19 | 
20 | 	// The context bounds the whole operation with a timeout
21 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
22 | 	defer cancel()
23 | 
24 | 	chunks, err := chew.New(chew.Config{}).Process(ctx, urls)
25 | 	if err != nil {
26 | 		if err == context.DeadlineExceeded {
27 | 			log.Println("Operation timed out")
28 | 		} else {
29 | 			log.Printf("Error processing URLs: %v", err)
30 | 		}
31 | 		return
32 | 	}
33 | 
34 | 	for _, chunk := range chunks {
35 | 		fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
36 | 	}
37 | }
38 | ```
39 | 
40 | The above code snippet demonstrates how to use Chew in your Go project. `chew.New` builds a client from a `chew.Config`, and the client's `Process` method takes a context and a list of URLs and returns a list of `Chunk` objects. Each `Chunk` object contains the source URL and the content of the URL. The context can be used to set a timeout for the operation. If the operation times out, the function will return a `context.DeadlineExceeded` error.
41 | 
42 | Markdown formatting is not enforced in the content of the `Chunk` object. However, the output is always going to be plain text so you can format it as you wish.
43 | 


--------------------------------------------------------------------------------
/docs/python.md:
--------------------------------------------------------------------------------
 1 | To use Chew with Python, you first need to build the package to create the shared object file and header file. You can do this by running the following command:
 2 | 
 3 | ```bash
 4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go
 5 | ```
 6 | 
 7 | This will create a `chew.so` and a `chew.h` file in the current directory. You can then use these files in your Python project. Here is an example of how to use Chew in your Python project:
 8 | 
 9 | ```python
10 | import ctypes
11 | 
12 | chew_lib = ctypes.CDLL('./chew.so')
13 | 
14 | chew_lib.Process.argtypes = [ctypes.c_char_p]
15 | chew_lib.Process.restype = ctypes.c_char_p
16 | 
17 | url = "https://example.com"
18 | result = chew_lib.Process(url.encode('utf-8'))
19 | 
20 | print(result.decode('utf-8'))
21 | ```
22 | 
23 | With the above code snippet, you can now use Chew in your Python project. I can't speak for the limitations of using Chew in Python as I have not extensively tested it myself.
24 | 


--------------------------------------------------------------------------------
/docs/ruby.md:
--------------------------------------------------------------------------------
 1 | To use Chew with Ruby you need to first build the package to create the shared object file and header file. You can do this by running the following command:
 2 | 
 3 | ```bash
 4 | go build -o chew.so -buildmode=c-shared ./cmd/chew/wrapper.go
 5 | ```
 6 | 
 7 | This will create a `chew.so` and `chew.h` file in the current directory. You can then use these files in your Ruby project to use Chew. Here is an example of how to use Chew in your Ruby project:
 8 | 
 9 | ```ruby
10 | require 'fiddle'
11 | require 'fiddle/import'
12 | 
13 | module ChewLib
14 |   extend Fiddle::Importer
15 |   dlload './chew.so'
16 | 
17 |   extern 'char* Process(char*)'
18 | end
19 | 
20 | urls = ['https://example.com', 'https://example.com']
21 | for url in urls
22 |   result_ptr = ChewLib.Process(url)
23 |   result = result_ptr.to_s
24 |   Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID).call(result_ptr)
25 | 
26 |   puts result
27 | end
28 | ```
29 | 
30 | Using Chew like this comes with obvious limitations; however, this is a simple example of how to use Chew in your Ruby project.
31 | 


--------------------------------------------------------------------------------
/docs/setup.md:
--------------------------------------------------------------------------------
 1 | # Setting up Google Cloud Services for Speech-to-Text
 2 | 
 3 | 1. **Create a Google Cloud Project**
 4 |    - Go to the [Google Cloud Console](https://console.cloud.google.com/)
 5 |    - Click on the project dropdown and select "New Project"
 6 |    - Enter a project name and click "Create"
 7 | 
 8 | 2. **Enable the Cloud Speech-to-Text API**
 9 |    - In the Google Cloud Console, under "Quick access" go to "APIs & Services"
10 |    - Click on "+ ENABLE APIS AND SERVICES"
11 |    - Search for "Cloud Speech-to-Text API" and select it
12 |    - Click "Enable"
13 | 
14 | 3. **Create a Service Account**
15 |    - In the Google Cloud Console, go to "IAM & Admin" > "Service Accounts" (or use [this link](https://console.cloud.google.com/iam-admin/serviceaccounts))
16 |    - Click "Create Service Account"
17 |    - Enter a name for the service account and click "Create"
18 |    - For the role, choose "Project" > "Owner" (or a more restrictive role if preferred)
19 |    - Click "Continue" and then "Done"
20 | 
21 | 4. **Generate a Key for the Service Account**
22 |    - In the Service Accounts list, find the account you just created
23 |    - Click on the three dots menu (⋮) and select "Manage keys"
24 |    - Click "Add Key" > "Create new key"
25 |    - Choose "JSON" as the key type and click "Create"
26 |    - The key file will be downloaded to your computer
27 | 
28 | 5. **Set the GOOGLE_APPLICATION_CREDENTIALS Environment Variable**
29 |    - On Linux or macOS:
30 |      ```
31 |      export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/service-account-key.json"
32 |      ```
33 |    - If you want to set the environment variable permanently, you can add it to your shell profile (e.g., `~/.bashrc`, `~/.zshrc`, etc.)
34 | 
35 |    You can optionally set the environment variable in your code as well:
36 |    ```python
37 |     import os
38 | 
39 |     os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json"
40 |     ```
41 | 
42 |     ```go
43 |     import "os"
44 | 
45 |     os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", "/path/to/your/service-account-key.json")
46 |     ```
47 | 
48 |     ```ruby
49 |     ENV["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service-account-key.json"
50 |     ```
51 | 


--------------------------------------------------------------------------------
/examples/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"log"
 7 | 	"time"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1"
10 | )
11 | 
12 | func main() { // fetches one URL with an explicit Config and prints every extracted chunk
13 | 	urls := []string{
14 | 		"https://example.com",
15 | 	}
16 | 
17 | 	config := chew.Config{
18 | 		UserAgent:       "Chew/1.0 (+https://github.com/mmatongo/chew)",
19 | 		RetryLimit:      3,
20 | 		RetryDelay:      5 * time.Second,
21 | 		CrawlDelay:      10 * time.Second,
22 | 		ProxyList:       []string{}, // Add your proxies here, or leave empty
23 | 		RateLimit:       2 * time.Second,
24 | 		RateBurst:       3,
25 | 		IgnoreRobotsTxt: false,
26 | 	}
27 | 
28 | 	haChew := chew.New(config)
29 | 
30 | 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // overall budget for the run
31 | 	defer cancel()
32 | 
33 | 	chunks, err := haChew.Process(ctx, urls)
34 | 	if err != nil {
35 | 		if err == context.DeadlineExceeded || ctx.Err() == context.DeadlineExceeded { // ctx.Err() also catches a timeout that came back wrapped in another error
36 | 			log.Println("Operation timed out")
37 | 		} else {
38 | 			log.Printf("Error processing URLs: %v", err)
39 | 		}
40 | 		return
41 | 	}
42 | 
43 | 	for _, chunk := range chunks {
44 | 		fmt.Printf("Source: %s\nContent: %s\n\n", chunk.Source, chunk.Content)
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/examples/main.py:
--------------------------------------------------------------------------------
 1 | # please see the documentation on how to build chew for use with python
 2 | 
 3 | import ctypes
 4 | 
 5 | chew_lib = ctypes.CDLL('./chew.so')
 6 | 
 7 | chew_lib.Process.argtypes = [ctypes.c_char_p]
 8 | chew_lib.Process.restype = ctypes.c_char_p
 9 | 
10 | urls = "https://example.com"
11 | result = chew_lib.Process(urls.encode('utf-8'))
12 | 
13 | print(result.decode('utf-8'))
14 | 


--------------------------------------------------------------------------------
/examples/main.rb:
--------------------------------------------------------------------------------
 1 | # please see the documentation on how to build chew for use with ruby
 2 | 
 3 | require 'fiddle'
 4 | require 'fiddle/import'
 5 | 
 6 | module ChewLib
 7 |   extend Fiddle::Importer
 8 |   dlload './chew.so'
 9 | 
10 |   extern 'char* Process(char*)'
11 | end
12 | 
13 | urls = ['https://example.com', 'https://example.com']
14 | for url in urls
15 |   result_ptr = ChewLib.Process(url)
16 |   result = result_ptr.to_s
17 |   Fiddle::Function.new(Fiddle::Handle['free'], [Fiddle::TYPE_VOIDP], Fiddle::TYPE_VOID).call(result_ptr)
18 | 
19 |   puts result
20 | end
21 | 


--------------------------------------------------------------------------------
/examples/transcription/google.go:
--------------------------------------------------------------------------------
 1 | //go:build ignore
 2 | 
 3 | package main
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"log"
 8 | 	"os"
 9 | 	"time"
10 | 
11 | 	"github.com/mmatongo/chew/v1"
12 | )
13 | 
14 | func main() { // transcribes two local audio files using a service-account key and a storage bucket
15 | 	credsPath := "chew-go.json"
16 | 	credsJSON, err := os.ReadFile(credsPath)
17 | 	if err != nil {
18 | 		log.Fatalf("Failed to read credentials file: %v", err)
19 | 	}
20 | 
21 | 	err = os.Setenv("GOOGLE_APPLICATION_CREDENTIALS", credsPath) // the key is supplied twice: as a path here and as raw JSON in the options below
22 | 	if err != nil {
23 | 		log.Fatalf("Failed to set environment variable: %v", err)
24 | 	}
25 | 
26 | 	opts := chew.TranscribeOptions{
27 | 		CredentialsJSON: credsJSON,
28 | 		Bucket:          "chew-go",
29 | 		LanguageCode:    "en-US",
30 | 	}
31 | 
32 | 	log.Println("transcribing files...")
33 | 	/*
34 | 		Transcription can be slow, so make sure the timeout set below
35 | 		is long enough for the process to finish.
36 | 
37 | 		In a test with MLK Jr's speech it took about 3min to complete.
38 | 
39 | 		The two audio files used in this example can be obtained from the following links:
40 | 		- Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav
41 | 		- MLKDream_64kb.mp3: https://archive.org/details/MLKDream
42 | 	*/
43 | 
44 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
45 | 	defer cancel()
46 | 
47 | 	audioFiles := []string{
48 | 		"audio/Conference.wav",
49 | 		"audio/MLKDream_64kb.mp3",
50 | 	}
51 | 
52 | 	transcripts, err := chew.Transcribe(ctx, audioFiles, opts)
53 | 	if err != nil {
54 | 		log.Fatalf("failed to transcribe: %v", err)
55 | 	}
56 | 
57 | 	for name, text := range transcripts { // results are ranged over as file name -> transcript
58 | 		log.Printf("Transcript for %s: %s\n", name, text)
59 | 	}
60 | }
61 | 


--------------------------------------------------------------------------------
/examples/transcription/whisper.go:
--------------------------------------------------------------------------------
 1 | //go:build ignore
 2 | 
 3 | package main
 4 | 
 5 | import (
 6 | 	"context"
 7 | 	"log"
 8 | 	"os"
 9 | 	"time"
10 | 
11 | 	"github.com/mmatongo/chew/v1"
12 | )
13 | 
14 | func main() { // transcribes two local audio files with the whisper-1 model; needs OPENAI_API_KEY
15 | 	apiKey := os.Getenv("OPENAI_API_KEY")
16 | 	if apiKey == "" {
17 | 		log.Fatalf("Please set the OPENAI_API_KEY= environment variable")
18 | 	}
19 | 
20 | 	opts := chew.TranscribeOptions{
21 | 		UseWhisper:    true,
22 | 		WhisperAPIKey: apiKey,
23 | 		WhisperModel:  "whisper-1",
24 | 	}
25 | 
26 | 	log.Println("transcribing files...")
27 | 	/*
28 | 		Whisper is a bit faster than the Google Cloud Speech-to-Text API,
29 | 		so the timeout can be set to a lower value.
30 | 
31 | 		In a test with MLK Jr's speech it took about 32s to complete.
32 | 
33 | 		The two audio files used in this example can be obtained from the following links:
34 | 		- Conference.wav: https://voiceage.com/wbsamples/in_stereo/Conference.wav
35 | 		- MLKDream_64kb.mp3: https://archive.org/details/MLKDream
36 | 	*/
37 | 
38 | 	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
39 | 	defer cancel()
40 | 
41 | 	filenames := []string{
42 | 		"audio/Conference.wav",
43 | 		"audio/MLKDream_64kb.mp3",
44 | 	}
45 | 
46 | 	transcripts, err := chew.Transcribe(ctx, filenames, opts)
47 | 
48 | 	if err != nil {
49 | 		log.Fatalf("Error transcribing with OpenAI Whisper: %v", err)
50 | 	}
51 | 
52 | 	for name, text := range transcripts { // results are ranged over as file name -> transcript
53 | 		log.Printf("Transcript for %s: %s\n", name, text)
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/mmatongo/chew/v1
 2 | 
 3 | go 1.23
 4 | 
 5 | require (
 6 | 	cloud.google.com/go/speech v1.23.1
 7 | 	github.com/PuerkitoBio/goquery v1.9.2
 8 | 	github.com/amanitaverna/go-mp3 v0.4.0
 9 | 	github.com/go-audio/wav v1.1.0
10 | 	github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
11 | 	github.com/mewkiz/flac v1.0.11
12 | 	github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115
13 | 	github.com/temoto/robotstxt v1.1.2
14 | 	golang.org/x/time v0.6.0
15 | 	google.golang.org/api v0.187.0
16 | )
17 | 
18 | require (
19 | 	cloud.google.com/go v0.115.0 // indirect
20 | 	cloud.google.com/go/auth v0.6.1 // indirect
21 | 	cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
22 | 	cloud.google.com/go/compute/metadata v0.3.0 // indirect
23 | 	cloud.google.com/go/iam v1.1.8 // indirect
24 | 	cloud.google.com/go/longrunning v0.5.7 // indirect
25 | 	github.com/felixge/httpsnoop v1.0.4 // indirect
26 | 	github.com/go-audio/audio v1.0.0 // indirect
27 | 	github.com/go-audio/riff v1.0.0 // indirect
28 | 	github.com/go-logr/logr v1.4.1 // indirect
29 | 	github.com/go-logr/stdr v1.2.2 // indirect
30 | 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
31 | 	github.com/golang/protobuf v1.5.4 // indirect
32 | 	github.com/google/s2a-go v0.1.7 // indirect
33 | 	github.com/google/uuid v1.6.0 // indirect
34 | 	github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
35 | 	github.com/googleapis/gax-go/v2 v2.12.5 // indirect
36 | 	github.com/icza/bitio v1.1.0 // indirect
37 | 	github.com/kr/pretty v0.1.0 // indirect
38 | 	github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 // indirect
39 | 	go.opencensus.io v0.24.0 // indirect
40 | 	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 // indirect
41 | 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 // indirect
42 | 	go.opentelemetry.io/otel v1.24.0 // indirect
43 | 	go.opentelemetry.io/otel/metric v1.24.0 // indirect
44 | 	go.opentelemetry.io/otel/trace v1.24.0 // indirect
45 | 	golang.org/x/crypto v0.25.0 // indirect
46 | 	golang.org/x/oauth2 v0.21.0 // indirect
47 | 	golang.org/x/sync v0.7.0 // indirect
48 | 	golang.org/x/sys v0.22.0 // indirect
49 | 	golang.org/x/text v0.16.0 // indirect
50 | 	google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect
51 | 	google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect
52 | 	google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d // indirect
53 | 	google.golang.org/grpc v1.64.0 // indirect
54 | 	google.golang.org/protobuf v1.34.2 // indirect
55 | 	gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
56 | )
57 | 
58 | require (
59 | 	cloud.google.com/go/storage v1.43.0
60 | 	github.com/andybalholm/cascadia v1.3.2 // indirect
61 | 	golang.org/x/net v0.27.0 // indirect
62 | 	gopkg.in/yaml.v3 v3.0.1
63 | )
64 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
  1 | cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
  2 | cloud.google.com/go v0.115.0 h1:CnFSK6Xo3lDYRoBKEcAtia6VSC837/ZkJuRduSFnr14=
  3 | cloud.google.com/go v0.115.0/go.mod h1:8jIM5vVgoAEoiVxQ/O4BFTfHqulPZgs/ufEzMcFMdWU=
  4 | cloud.google.com/go/auth v0.6.1 h1:T0Zw1XM5c1GlpN2HYr2s+m3vr1p2wy+8VN+Z1FKxW38=
  5 | cloud.google.com/go/auth v0.6.1/go.mod h1:eFHG7zDzbXHKmjJddFG/rBlcGp6t25SwRUiEQSlO4x4=
  6 | cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4=
  7 | cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q=
  8 | cloud.google.com/go/compute/metadata v0.3.0 h1:Tz+eQXMEqDIKRsmY3cHTL6FVaynIjX2QxYC4trgAKZc=
  9 | cloud.google.com/go/compute/metadata v0.3.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k=
 10 | cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0=
 11 | cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE=
 12 | cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU=
 13 | cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng=
 14 | cloud.google.com/go/speech v1.23.1 h1:TcWEAOLQH1Lb2fhHS6/GjvAh+ue0dt4xUDHXHG6vF04=
 15 | cloud.google.com/go/speech v1.23.1/go.mod h1:UNgzNxhNBuo/OxpF1rMhA/U2rdai7ILL6PBXFs70wq0=
 16 | cloud.google.com/go/storage v1.43.0 h1:CcxnSohZwizt4LCzQHWvBf1/kvtHUn7gk9QERXPyXFs=
 17 | cloud.google.com/go/storage v1.43.0/go.mod h1:ajvxEa7WmZS1PxvKRq4bq0tFT3vMd502JwstCcYv0Q0=
 18 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 19 | github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
 20 | github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
 21 | github.com/amanitaverna/go-mp3 v0.4.0 h1:ZZ5maCStIh7+M9NZSk58Eww23q0B4IuJtQW+Y6u4kkw=
 22 | github.com/amanitaverna/go-mp3 v0.4.0/go.mod h1:b9idBPNUTSU/5D+GATwLkJx5xqDYTEeRg7/O7K7gZF0=
 23 | github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
 24 | github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
 25 | github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
 26 | github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
 27 | github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
 28 | github.com/d4l3k/messagediff v1.2.2-0.20190829033028-7e0a312ae40b/go.mod h1:Oozbb1TVXFac9FtSIxHBMnBCq2qeH/2KkEQxENCrlLo=
 29 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 30 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 31 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 32 | github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 33 | github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
 34 | github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
 35 | github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
 36 | github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 37 | github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
 38 | github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
 39 | github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
 40 | github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
 41 | github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
 42 | github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 43 | github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 44 | github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 45 | github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
 46 | github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 47 | github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
 48 | github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 49 | github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
 50 | github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
 51 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
 52 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
 53 | github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
 54 | github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 55 | github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 56 | github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
 57 | github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
 58 | github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
 59 | github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
 60 | github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
 61 | github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
 62 | github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
 63 | github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
 64 | github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
 65 | github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
 66 | github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 67 | github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
 68 | github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 69 | github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 70 | github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
 71 | github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 72 | github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 73 | github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc=
 74 | github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0=
 75 | github.com/google/s2a-go v0.1.7 h1:60BLSyTrOV4/haCDW4zb1guZItoSq8foHCXrAnjBo/o=
 76 | github.com/google/s2a-go v0.1.7/go.mod h1:50CgR4k1jNlWBu4UfS4AcfhVe1r6pdZPygJ3R8F0Qdw=
 77 | github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 78 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 79 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 80 | github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfFxPRy3Bf7vr3h0cechB90XaQs=
 81 | github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0=
 82 | github.com/googleapis/gax-go/v2 v2.12.5 h1:8gw9KZK8TiVKB6q3zHY3SBzLnrGp6HQjyfYBYGmXdxA=
 83 | github.com/googleapis/gax-go/v2 v2.12.5/go.mod h1:BUDKcWo+RaKq5SC9vVYL0wLADa3VcfswbOMMRmB9H3E=
 84 | github.com/icza/bitio v1.1.0 h1:ysX4vtldjdi3Ygai5m1cWy4oLkhWTAi+SyO6HC8L9T0=
 85 | github.com/icza/bitio v1.1.0/go.mod h1:0jGnlLAx8MKMr9VGnn/4YrvZiprkvBelsVIbA9Jjr9A=
 86 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6 h1:8UsGZ2rr2ksmEru6lToqnXgA8Mz1DP11X4zSJ159C3k=
 87 | github.com/icza/mighty v0.0.0-20180919140131-cfd07d671de6/go.mod h1:xQig96I1VNBDIWGCdTt54nHt6EeI639SmHycLYL7FkA=
 88 | github.com/jszwec/csvutil v1.5.1/go.mod h1:Rpu7Uu9giO9subDyMCIQfHVDuLrcaC36UA4YcJjGBkg=
 89 | github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 90 | github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 91 | github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 92 | github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 93 | github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 94 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
 95 | github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 96 | github.com/mewkiz/flac v1.0.11 h1:2KFoMH/P72qhZ/E4bI7ZuK79lCPE1zZM3/6WnrMOTH4=
 97 | github.com/mewkiz/flac v1.0.11/go.mod h1:1UeXlFRJp4ft2mfZnPLRpQTd7cSjb/s17o7JQzzyrCA=
 98 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14 h1:tnAPMExbRERsyEYkmR1YjhTgDM0iqyiBYf8ojRXxdbA=
 99 | github.com/mewkiz/pkg v0.0.0-20230226050401-4010bf0fec14/go.mod h1:QYCFBiH5q6XTHEbWhR0uhR3M9qNPoD2CSQzr0g75kE4=
100 | github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
101 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
102 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
103 | github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
104 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
105 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
106 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
107 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
108 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
109 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
110 | github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
111 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
112 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
113 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115 h1:OEAIMYp5l9kJ2kT9UPL5QSUriKIIDhnLmpJTy69sltA=
114 | github.com/taylorskalyo/goreader v0.0.0-20230626212555-e7f5644f8115/go.mod h1:AIVbkIe1G7fpFHiKOdxZnU5p9tFPYNTQyH3H5IrRkGw=
115 | github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
116 | github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
117 | github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
118 | go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
119 | go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
120 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0 h1:4Pp6oUg3+e/6M4C0A/3kJ2VYa++dsWVTtGgLVj5xtHg=
121 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.49.0/go.mod h1:Mjt1i1INqiaoZOMGR1RIUJN+i3ChKoFRqzrRQhlkbs0=
122 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0 h1:jq9TW8u3so/bN+JPT166wjOI6/vQPF6Xe7nMNIltagk=
123 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.49.0/go.mod h1:p8pYQP+m5XfbZm9fxtSKAbM6oIllS7s2AfxrChvc7iw=
124 | go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
125 | go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo=
126 | go.opentelemetry.io/otel/metric v1.24.0 h1:6EhoGWWK28x1fbpA4tYTOWBkPefTDQnb8WSGXlc88kI=
127 | go.opentelemetry.io/otel/metric v1.24.0/go.mod h1:VYhLe1rFfxuTXLgj4CBiyz+9WYBA8pNGJgDcSFRKBco=
128 | go.opentelemetry.io/otel/sdk v1.24.0 h1:YMPPDNymmQN3ZgczicBY3B6sf9n62Dlj9pWD3ucgoDw=
129 | go.opentelemetry.io/otel/sdk v1.24.0/go.mod h1:KVrIYw6tEubO9E96HQpcmpTKDVn9gdv35HoYiQWGDFg=
130 | go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI=
131 | go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU=
132 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
133 | golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
134 | golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
135 | golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30=
136 | golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
137 | golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
138 | golang.org/x/image v0.5.0/go.mod h1:FVC7BI/5Ym8R25iw5OLsgshdUBbT1h5jZTpA+mvAdZ4=
139 | golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
140 | golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
141 | golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
142 | golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
143 | golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
144 | golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
145 | golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
146 | golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
147 | golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
148 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
149 | golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
150 | golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
151 | golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
152 | golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
153 | golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
154 | golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
155 | golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
156 | golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys=
157 | golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
158 | golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
159 | golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs=
160 | golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
161 | golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
162 | golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
163 | golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
164 | golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
165 | golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
166 | golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
167 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
168 | golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
169 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
170 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
171 | golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
172 | golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
173 | golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
174 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
175 | golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
176 | golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
177 | golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
178 | golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
179 | golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
180 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
181 | golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
182 | golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
183 | golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
184 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
185 | golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
186 | golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
187 | golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
188 | golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
189 | golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
190 | golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
191 | golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
192 | golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
193 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
194 | golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
195 | golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
196 | golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
197 | golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
198 | golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
199 | golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
200 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
201 | golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
202 | golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
203 | google.golang.org/api v0.187.0 h1:Mxs7VATVC2v7CY+7Xwm4ndkX71hpElcvx0D1Ji/p1eo=
204 | google.golang.org/api v0.187.0/go.mod h1:KIHlTc4x7N7gKKuVsdmfBXN13yEEWXWFURWY6SBp2gk=
205 | google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
206 | google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
207 | google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
208 | google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
209 | google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
210 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d h1:PksQg4dV6Sem3/HkBX+Ltq8T0ke0PKIRBNBatoDTVls=
211 | google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:s7iA721uChleev562UJO2OYB0PPT9CMFjV+Ce7VJH5M=
212 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 h1:MuYw1wJzT+ZkybKfaOXKp5hJiZDn2iHaXRw0mRYdHSc=
213 | google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4/go.mod h1:px9SlOOZBg1wM1zdnr8jEL4CNGUBZ+ZKYtNPApNQc4c=
214 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d h1:k3zyW3BYYR30e8v3x0bTDdE9vpYFjZHK+HcyqkrppWk=
215 | google.golang.org/genproto/googleapis/rpc v0.0.0-20240624140628-dc46fd24d27d/go.mod h1:Ue6ibwXGpU+dqIcODieyLOcgj7z8+IcskoNIgZxtrFY=
216 | google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
217 | google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
218 | google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
219 | google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
220 | google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
221 | google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
222 | google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
223 | google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
224 | google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
225 | google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
226 | google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
227 | google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
228 | google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
229 | google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
230 | google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
231 | google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
232 | google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
233 | google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
234 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
235 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
236 | gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
237 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
238 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
239 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
240 | honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
241 | honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
242 | 


--------------------------------------------------------------------------------
/internal/audio/flac.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/mewkiz/flac"
 7 | )
 8 | 
 9 | // flacProcessor extracts audio metadata from FLAC files.
10 | type flacProcessor struct{}
11 | 
12 | // process opens the FLAC file at filename and reports the sample rate and
13 | // channel count from its stream info. A failed close is only printed.
14 | func (p *flacProcessor) process(filename string) (*audioInfo, error) {
15 | 	stream, openErr := flac.Open(filename)
16 | 	if openErr != nil {
17 | 		return nil, fmt.Errorf("failed to open FLAC file: %w", openErr)
18 | 	}
19 | 	defer func() {
20 | 		if closeErr := stream.Close(); closeErr != nil {
21 | 			fmt.Printf("failed to close FLAC file: %v\n", closeErr)
22 | 		}
23 | 	}()
24 | 	info := &audioInfo{format: "FLAC"}
25 | 	info.sampleRate = int(stream.Info.SampleRate)
26 | 	info.numChannels = int(stream.Info.NChannels)
27 | 	return info, nil
28 | }
29 | 


--------------------------------------------------------------------------------
/internal/audio/flac_test.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_flacProcessor_process(t *testing.T) {
 9 | 	type args struct {
10 | 		filename string
11 | 	}
12 | 	tests := []struct {
13 | 		name    string
14 | 		p       *flacProcessor
15 | 		args    args
16 | 		want    *audioInfo
17 | 		wantErr bool
18 | 	}{
19 | 		{
20 | 			name: "success",
21 | 			p:    &flacProcessor{},
22 | 			args: args{
23 | 				filename: getRootPath(t) + "/testdata/audio/test.flac",
24 | 			},
25 | 			want: &audioInfo{
26 | 				sampleRate:  96000,
27 | 				numChannels: 2,
28 | 				format:      "FLAC",
29 | 			},
30 | 			wantErr: false,
31 | 		},
32 | 		{
33 | 			name: "file not found",
34 | 			p:    &flacProcessor{},
35 | 			args: args{
36 | 				filename: getRootPath(t) + "/testdata/audio/test_new.flac",
37 | 			},
38 | 			want:    nil,
39 | 			wantErr: true,
40 | 		},
41 | 	}
42 | 	for _, tt := range tests {
43 | 		t.Run(tt.name, func(t *testing.T) {
44 | 			p := &flacProcessor{}
45 | 			got, err := p.process(tt.args.filename)
46 | 			if (err != nil) != tt.wantErr {
47 | 				t.Errorf("flacProcessor.process() error = %v, wantErr %v", err, tt.wantErr)
48 | 				return
49 | 			}
50 | 			if !reflect.DeepEqual(got, tt.want) {
51 | 				t.Errorf("flacProcessor.process() = %v, want %v", got, tt.want)
52 | 			}
53 | 		})
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/internal/audio/mp3.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"os"
 6 | 
 7 | 	"github.com/amanitaverna/go-mp3"
 8 | )
 9 | 
10 | // mp3Processor extracts audio metadata from MP3 files.
11 | type mp3Processor struct{}
12 | 
13 | // process opens filename, wraps it in an MP3 decoder and reports the sample
14 | // rate the decoder exposes. The channel count is a fixed assumption (see
15 | // below). A failed close is only printed and never alters the result.
16 | func (p *mp3Processor) process(filename string) (*audioInfo, error) {
17 | 	src, openErr := os.Open(filename)
18 | 	if openErr != nil {
19 | 		return nil, fmt.Errorf("failed to open MP3 file: %w", openErr)
20 | 	}
21 | 	defer func() {
22 | 		if closeErr := src.Close(); closeErr != nil {
23 | 			fmt.Printf("failed to close MP3 file: %v\n", closeErr)
24 | 		}
25 | 	}()
26 | 
27 | 	dec, decErr := mp3.NewDecoder(src)
28 | 	if decErr != nil {
29 | 		return nil, fmt.Errorf("failed to create MP3 decoder: %w", decErr)
30 | 	}
31 | 
32 | 	info := &audioInfo{format: "MP3"}
33 | 	info.sampleRate = dec.SampleRate()
34 | 	// Known limitation: the MP3 decoder does not expose the channel count,
35 | 	// so every file is reported as stereo regardless of its real layout.
36 | 	// This is an assumption, not a value read from the file.
37 | 	info.numChannels = 2
38 | 	return info, nil
39 | }
40 | 


--------------------------------------------------------------------------------
/internal/audio/mp3_test.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_mp3Processor_process(t *testing.T) {
 9 | 	type args struct {
10 | 		filename string
11 | 	}
12 | 	tests := []struct {
13 | 		name    string
14 | 		p       *mp3Processor
15 | 		args    args
16 | 		want    *audioInfo
17 | 		wantErr bool
18 | 	}{
19 | 		{
20 | 			name: "success",
21 | 			p:    &mp3Processor{},
22 | 			args: args{
23 | 				filename: getRootPath(t) + "/testdata/audio/test.mp3",
24 | 			},
25 | 			want: &audioInfo{
26 | 				sampleRate:  44100,
27 | 				numChannels: 2,
28 | 				format:      "MP3",
29 | 			},
30 | 			wantErr: false,
31 | 		},
32 | 		{
33 | 			name: "file not found",
34 | 			p:    &mp3Processor{},
35 | 			args: args{
36 | 				filename: getRootPath(t) + "/testdata/audio/test_new.mp3",
37 | 			},
38 | 			want:    nil,
39 | 			wantErr: true,
40 | 		},
41 | 		{
42 | 			name: "invalid MP3 file",
43 | 			p:    &mp3Processor{},
44 | 			args: args{
45 | 				filename: getRootPath(t) + "/testdata/audio/test.flac",
46 | 			},
47 | 			want:    nil,
48 | 			wantErr: true,
49 | 		},
50 | 	}
51 | 	for _, tt := range tests {
52 | 		t.Run(tt.name, func(t *testing.T) {
53 | 			p := &mp3Processor{}
54 | 			got, err := p.process(tt.args.filename)
55 | 			if (err != nil) != tt.wantErr {
56 | 				t.Errorf("mp3Processor.process() error = %v, wantErr %v", err, tt.wantErr)
57 | 				return
58 | 			}
59 | 			if !reflect.DeepEqual(got, tt.want) {
60 | 				t.Errorf("mp3Processor.process() = %v, want %v", got, tt.want)
61 | 			}
62 | 		})
63 | 	}
64 | }
65 | 


--------------------------------------------------------------------------------
/internal/audio/processor.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"path/filepath"
 6 | 	"strings"
 7 | 
 8 | 	"cloud.google.com/go/speech/apiv1/speechpb"
 9 | )
10 | 
11 | var (
12 | 	defaultFactory = &defaultAudioProcessorFactory{}
13 | 	retriever      = newAudioInfoRetriever(defaultFactory)
14 | 	// encodingMap translates an audioInfo format name into a Speech API encoding.
15 | 	encodingMap = map[string]speechpb.RecognitionConfig_AudioEncoding{
16 | 		"WAV":  speechpb.RecognitionConfig_LINEAR16,
17 | 		"MP3":  speechpb.RecognitionConfig_MP3,
18 | 		"FLAC": speechpb.RecognitionConfig_FLAC,
19 | 	}
20 | )
21 | 
22 | // createProcessor returns the audio processor registered for ext. The
23 | // extension is matched case-insensitively and must include its leading dot;
24 | // anything other than .mp3, .flac or .wav is rejected with an error.
25 | func (f *defaultAudioProcessorFactory) createProcessor(ext string) (audioProcessor, error) {
26 | 	byExt := map[string]audioProcessor{
27 | 		".mp3": &mp3Processor{}, ".flac": &flacProcessor{}, ".wav": &wavProcessor{},
28 | 	}
29 | 	if proc, found := byExt[strings.ToLower(ext)]; found {
30 | 		return proc, nil
31 | 	}
32 | 	return nil, fmt.Errorf("unsupported file format: %s", ext)
33 | }
34 | 
35 | func newAudioInfoRetriever(factory audioProcessorFactory) *audioInfoRetriever {
36 | 	return &audioInfoRetriever{
37 | 		factory: factory,
38 | 	}
39 | }
40 | 
41 | func GetAudioInfo(filename string) (*speechpb.RecognitionConfig, error) {
42 | 	info, err := retriever.audioInfo(filename)
43 | 	if err != nil {
44 | 		return nil, err
45 | 	}
46 | 
47 | 	return &speechpb.RecognitionConfig{
48 | 		Encoding:          getEncoding(info.format),
49 | 		SampleRateHertz:   int32(info.sampleRate),
50 | 		AudioChannelCount: int32(info.numChannels),
51 | 	}, nil
52 | }
53 | 
54 | func (r *audioInfoRetriever) audioInfo(filename string) (*audioInfo, error) {
55 | 	ext := filepath.Ext(filename)
56 | 	processor, err := r.factory.createProcessor(ext)
57 | 	if err != nil {
58 | 		return nil, err
59 | 	}
60 | 	return processor.process(filename)
61 | }
62 | 
63 | func getEncoding(format string) speechpb.RecognitionConfig_AudioEncoding {
64 | 	if encoding, ok := encodingMap[format]; ok {
65 | 		return encoding
66 | 	}
67 | 	return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
68 | }
69 | 


--------------------------------------------------------------------------------
/internal/audio/processor_test.go:
--------------------------------------------------------------------------------
  1 | package audio
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"os"
  6 | 	"path/filepath"
  7 | 	"reflect"
  8 | 	"testing"
  9 | 
 10 | 	"cloud.google.com/go/speech/apiv1/speechpb"
 11 | )
 12 | 
 13 | type mockProcessor struct {
 14 | 	info *audioInfo
 15 | 	err  error
 16 | }
 17 | 
 18 | func (m *mockProcessor) process(string) (*audioInfo, error) {
 19 | 	return m.info, m.err
 20 | }
 21 | 
 22 | type mockFactory struct {
 23 | 	processor audioProcessor
 24 | 	err       error
 25 | }
 26 | 
 27 | func (m *mockFactory) createProcessor(string) (audioProcessor, error) {
 28 | 	return m.processor, m.err
 29 | }
 30 | 
 31 | func getRootPath(t *testing.T) string {
 32 | 	t.Helper()
 33 | 	pwd, err := os.Getwd()
 34 | 	if err != nil {
 35 | 		t.Fatalf("getting current folder: %s", err)
 36 | 	}
 37 | 	pwd = filepath.Dir(filepath.Dir(pwd))
 38 | 	return pwd
 39 | }
 40 | 
 41 | func Test_getEncoding(t *testing.T) {
 42 | 	tests := []struct {
 43 | 		name   string
 44 | 		format string
 45 | 		want   speechpb.RecognitionConfig_AudioEncoding
 46 | 	}{
 47 | 		{
 48 | 			name:   "WAV format",
 49 | 			format: "WAV",
 50 | 			want:   speechpb.RecognitionConfig_LINEAR16,
 51 | 		},
 52 | 		{
 53 | 			name:   "MP3 format",
 54 | 			format: "MP3",
 55 | 			want:   speechpb.RecognitionConfig_MP3,
 56 | 		},
 57 | 		{
 58 | 			name:   "FLAC format",
 59 | 			format: "FLAC",
 60 | 			want:   speechpb.RecognitionConfig_FLAC,
 61 | 		},
 62 | 		{
 63 | 			name:   "Unsupported format",
 64 | 			format: "AAC",
 65 | 			want:   speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
 66 | 		},
 67 | 	}
 68 | 
 69 | 	for _, tt := range tests {
 70 | 		t.Run(tt.name, func(t *testing.T) {
 71 | 			if got := getEncoding(tt.format); !reflect.DeepEqual(got, tt.want) {
 72 | 				t.Errorf("getEncoding() = %v, want %v", got, tt.want)
 73 | 			}
 74 | 		})
 75 | 	}
 76 | }
 77 | 
 78 | func Test_audioInfoRetriever_audioInfo(t *testing.T) {
 79 | 	tests := []struct {
 80 | 		name     string
 81 | 		factory  audioProcessorFactory
 82 | 		filename string
 83 | 		want     *audioInfo
 84 | 		wantErr  bool
 85 | 		errMsg   string
 86 | 	}{
 87 | 		{
 88 | 			name: "MP3 file - successful processing",
 89 | 			factory: &mockFactory{
 90 | 				processor: &mockProcessor{
 91 | 					info: &audioInfo{
 92 | 						sampleRate:  44100,
 93 | 						numChannels: 2,
 94 | 						format:      "MP3",
 95 | 					},
 96 | 					err: nil,
 97 | 				},
 98 | 				err: nil,
 99 | 			},
100 | 			filename: "test.mp3",
101 | 			want: &audioInfo{
102 | 				sampleRate:  44100,
103 | 				numChannels: 2,
104 | 				format:      "MP3",
105 | 			},
106 | 			wantErr: false,
107 | 		},
108 | 		{
109 | 			name: "FLAC file - successful processing",
110 | 			factory: &mockFactory{
111 | 				processor: &mockProcessor{
112 | 					info: &audioInfo{
113 | 						sampleRate:  96000,
114 | 						numChannels: 2,
115 | 						format:      "FLAC",
116 | 					},
117 | 					err: nil,
118 | 				},
119 | 				err: nil,
120 | 			},
121 | 			filename: "test.flac",
122 | 			want: &audioInfo{
123 | 				sampleRate:  96000,
124 | 				numChannels: 2,
125 | 				format:      "FLAC",
126 | 			},
127 | 			wantErr: false,
128 | 		},
129 | 		{
130 | 			name: "WAV file - processing error",
131 | 			factory: &mockFactory{
132 | 				processor: &mockProcessor{
133 | 					info: nil,
134 | 					err:  errors.New("failed to process WAV file"),
135 | 				},
136 | 				err: nil,
137 | 			},
138 | 			filename: "test.wav",
139 | 			want:     nil,
140 | 			wantErr:  true,
141 | 			errMsg:   "failed to process WAV file",
142 | 		},
143 | 		{
144 | 			name: "Unsupported file format",
145 | 			factory: &mockFactory{
146 | 				processor: nil,
147 | 				err:       errors.New("unsupported file format: .aac"),
148 | 			},
149 | 			filename: "test.aac",
150 | 			want:     nil,
151 | 			wantErr:  true,
152 | 			errMsg:   "unsupported file format: .aac",
153 | 		},
154 | 	}
155 | 
156 | 	for _, tt := range tests {
157 | 		t.Run(tt.name, func(t *testing.T) {
158 | 			r := newAudioInfoRetriever(tt.factory)
159 | 			got, err := r.audioInfo(tt.filename)
160 | 			if (err != nil) != tt.wantErr {
161 | 				t.Errorf("audioInfoRetriever.audioInfo() error = %v, wantErr %v", err, tt.wantErr)
162 | 				return
163 | 			}
164 | 			if err != nil && err.Error() != tt.errMsg {
165 | 				t.Errorf("audioInfoRetriever.audioInfo() error = %v, expected error %v", err, tt.errMsg)
166 | 			}
167 | 			if !reflect.DeepEqual(got, tt.want) {
168 | 				t.Errorf("audioInfoRetriever.audioInfo() = %v, want %v", got, tt.want)
169 | 			}
170 | 		})
171 | 	}
172 | }
173 | 
174 | func Test_newAudioInfoRetriever(t *testing.T) {
175 | 	factory := &defaultAudioProcessorFactory{}
176 | 	tests := []struct {
177 | 		name string
178 | 		want *audioInfoRetriever
179 | 	}{
180 | 		{
181 | 			name: "Create new audioInfoRetriever",
182 | 			want: &audioInfoRetriever{
183 | 				factory: factory,
184 | 			},
185 | 		},
186 | 	}
187 | 	for _, tt := range tests {
188 | 		t.Run(tt.name, func(t *testing.T) {
189 | 			if got := newAudioInfoRetriever(factory); !reflect.DeepEqual(got, tt.want) {
190 | 				t.Errorf("newAudioInfoRetriever() = %v, want %v", got, tt.want)
191 | 			}
192 | 		})
193 | 	}
194 | }
195 | 
196 | func Test_getAudioInfo(t *testing.T) {
197 | 	type args struct {
198 | 		filename string
199 | 	}
200 | 	tests := []struct {
201 | 		name    string
202 | 		args    args
203 | 		want    *speechpb.RecognitionConfig
204 | 		wantErr bool
205 | 	}{
206 | 		{
207 | 			name: "MP3 file",
208 | 			args: args{
209 | 				filename: getRootPath(t) + "/testdata/audio/test.mp3",
210 | 			},
211 | 			want: &speechpb.RecognitionConfig{
212 | 				Encoding:          speechpb.RecognitionConfig_MP3,
213 | 				SampleRateHertz:   44100,
214 | 				AudioChannelCount: 2,
215 | 			},
216 | 			wantErr: false,
217 | 		},
218 | 		{
219 | 			name: "FLAC file",
220 | 			args: args{
221 | 				filename: getRootPath(t) + "/testdata/audio/test.flac",
222 | 			},
223 | 			want: &speechpb.RecognitionConfig{
224 | 				Encoding:          speechpb.RecognitionConfig_FLAC,
225 | 				SampleRateHertz:   96000,
226 | 				AudioChannelCount: 2,
227 | 			},
228 | 			wantErr: false,
229 | 		},
230 | 		{
231 | 			name: "WAV file",
232 | 			args: args{
233 | 				filename: getRootPath(t) + "/testdata/audio/test.wav",
234 | 			},
235 | 			want: &speechpb.RecognitionConfig{
236 | 				Encoding:          speechpb.RecognitionConfig_LINEAR16,
237 | 				SampleRateHertz:   44100,
238 | 				AudioChannelCount: 2,
239 | 			},
240 | 			wantErr: false,
241 | 		},
242 | 		{
243 | 			name: "Unsupported file format",
244 | 			args: args{
245 | 				filename: getRootPath(t) + "/testdata/audio/test.ogg",
246 | 			},
247 | 			want:    nil,
248 | 			wantErr: true,
249 | 		},
250 | 	}
251 | 	for _, tt := range tests {
252 | 		t.Run(tt.name, func(t *testing.T) {
253 | 			got, err := GetAudioInfo(tt.args.filename)
254 | 			if (err != nil) != tt.wantErr {
255 | 				t.Errorf("getAudioInfo() error = %v, wantErr %v", err, tt.wantErr)
256 | 				return
257 | 			}
258 | 			if !reflect.DeepEqual(got, tt.want) {
259 | 				t.Errorf("getAudioInfo() = %v, want %v", got, tt.want)
260 | 			}
261 | 		})
262 | 	}
263 | }
264 | 


--------------------------------------------------------------------------------
/internal/audio/types.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | type audioInfo struct {
 4 | 	sampleRate  int
 5 | 	numChannels int
 6 | 	format      string
 7 | }
 8 | 
 9 | type audioProcessor interface {
10 | 	process(filename string) (*audioInfo, error)
11 | }
12 | 
13 | type audioProcessorFactory interface {
14 | 	createProcessor(fileExtension string) (audioProcessor, error)
15 | }
16 | 
17 | type defaultAudioProcessorFactory struct{}
18 | 
19 | type audioInfoRetriever struct {
20 | 	factory audioProcessorFactory
21 | }
22 | 


--------------------------------------------------------------------------------
/internal/audio/wav.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"errors"
 5 | 	"fmt"
 6 | 	"os"
 7 | 
 8 | 	"github.com/go-audio/wav"
 9 | )
10 | 
11 | type wavProcessor struct{}
12 | 
13 | func (p *wavProcessor) process(filename string) (*audioInfo, error) {
14 | 	file, err := os.Open(filename)
15 | 	if err != nil {
16 | 		return nil, fmt.Errorf("failed to open WAV file: %w", err)
17 | 	}
18 | 	defer func(file *os.File) {
19 | 		err := file.Close()
20 | 		if err != nil {
21 | 			fmt.Printf("failed to close WAV file: %v\n", err)
22 | 		}
23 | 	}(file)
24 | 
25 | 	decoder := wav.NewDecoder(file)
26 | 	if !decoder.IsValidFile() {
27 | 		return nil, errors.New("invalid WAV file")
28 | 	}
29 | 
30 | 	return &audioInfo{
31 | 		sampleRate:  int(decoder.SampleRate),
32 | 		numChannels: int(decoder.NumChans),
33 | 		format:      "WAV",
34 | 	}, nil
35 | }
36 | 


--------------------------------------------------------------------------------
/internal/audio/wav_test.go:
--------------------------------------------------------------------------------
 1 | package audio
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_wavProcessor_process(t *testing.T) {
 9 | 	type args struct {
10 | 		filename string
11 | 	}
12 | 	tests := []struct {
13 | 		name    string
14 | 		p       *wavProcessor
15 | 		args    args
16 | 		want    *audioInfo
17 | 		wantErr bool
18 | 	}{
19 | 		{
20 | 			name: "success",
21 | 			p:    &wavProcessor{},
22 | 			args: args{
23 | 				filename: getRootPath(t) + "/testdata/audio/test.wav",
24 | 			},
25 | 			want: &audioInfo{
26 | 				sampleRate:  44100,
27 | 				numChannels: 2,
28 | 				format:      "WAV",
29 | 			},
30 | 		},
31 | 		{
32 | 			name: "file not found",
33 | 			p:    &wavProcessor{},
34 | 			args: args{
35 | 				filename: getRootPath(t) + "/testdata/audio/test_new.wav",
36 | 			},
37 | 			want:    nil,
38 | 			wantErr: true,
39 | 		},
40 | 		{
41 | 			name: "invalid WAV file",
42 | 			p:    &wavProcessor{},
43 | 			args: args{
44 | 				filename: getRootPath(t) + "/testdata/audio/test.flac",
45 | 			},
46 | 			want:    nil,
47 | 			wantErr: true,
48 | 		},
49 | 	}
50 | 	for _, tt := range tests {
51 | 		t.Run(tt.name, func(t *testing.T) {
52 | 			p := &wavProcessor{}
53 | 			got, err := p.process(tt.args.filename)
54 | 			if (err != nil) != tt.wantErr {
55 | 				t.Errorf("wavProcessor.process() error = %v, wantErr %v", err, tt.wantErr)
56 | 				return
57 | 			}
58 | 			if !reflect.DeepEqual(got, tt.want) {
59 | 				t.Errorf("wavProcessor.process() = %v, want %v", got, tt.want)
60 | 			}
61 | 		})
62 | 	}
63 | }
64 | 


--------------------------------------------------------------------------------
/internal/common/types.go:
--------------------------------------------------------------------------------
 1 | package common
 2 | 
 3 | import (
 4 | 	"time"
 5 | )
 6 | 
 7 | type TranscribeOptions struct {
 8 | 	CredentialsJSON   []byte
 9 | 	Bucket            string
10 | 	LanguageCode      string
11 | 	EnableDiarization bool
12 | 	MinSpeakers       int
13 | 	MaxSpeakers       int
14 | 	CleanupOnComplete bool
15 | 	UseWhisper        bool
16 | 	WhisperAPIKey     string
17 | 	WhisperModel      string
18 | 	WhisperPrompt     string
19 | }
20 | 
21 | type Config struct {
22 | 	UserAgent       string
23 | 	RetryLimit      int
24 | 	RetryDelay      time.Duration
25 | 	CrawlDelay      time.Duration
26 | 	ProxyList       []string
27 | 	RateLimit       time.Duration
28 | 	RateBurst       int
29 | 	IgnoreRobotsTxt bool
30 | }
31 | 
32 | type Chunk struct {
33 | 	Content string
34 | 	Source  string
35 | }
36 | 


--------------------------------------------------------------------------------
/internal/document/docx.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"archive/zip"
 5 | 	"bytes"
 6 | 	"io"
 7 | 	"strings"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | 	"github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 | 
13 | func processDocxContent(r io.Reader) ([]string, error) {
14 | 	data, err := io.ReadAll(r)
15 | 	if err != nil {
16 | 		return nil, err
17 | 	}
18 | 
19 | 	zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
20 | 	if err != nil {
21 | 		return nil, err
22 | 	}
23 | 
24 | 	var contents []string
25 | 
26 | 	for _, file := range zipReader.File {
27 | 		if file.Name == "word/document.xml" {
28 | 			contents, err = utils.ExtractTextFromXML(file)
29 | 			if err != nil {
30 | 				return nil, err
31 | 			}
32 | 			break
33 | 		}
34 | 	}
35 | 
36 | 	var allContent strings.Builder
37 | 	for _, content := range contents {
38 | 		allContent.WriteString(content)
39 | 		allContent.WriteString(" ")
40 | 	}
41 | 
42 | 	return []string{allContent.String()}, nil
43 | 
44 | 	/*
45 | 		// In the event we just want chunks we can just return contents
46 | 		return contents, nil
47 | 	*/
48 | }
49 | 
50 | func ProcessDocx(r io.Reader, url string) ([]common.Chunk, error) {
51 | 	content, err := processDocxContent(r)
52 | 	if err != nil {
53 | 		return nil, err
54 | 	}
55 | 
56 | 	var chunks []common.Chunk
57 | 	for _, chunk := range content {
58 | 		if strings.TrimSpace(string(chunk)) != "" {
59 | 			chunks = append(chunks, common.Chunk{Content: string(chunk), Source: url})
60 | 		}
61 | 	}
62 | 
63 | 	return chunks, nil
64 | }
65 | 


--------------------------------------------------------------------------------
/internal/document/docx_test.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"archive/zip"
 5 | 	"bytes"
 6 | 	"errors"
 7 | 	"io"
 8 | 	"reflect"
 9 | 	"testing"
10 | 
11 | 	"github.com/mmatongo/chew/v1/internal/common"
12 | )
13 | 
// errMockRead is the sentinel error produced by every errorReader.Read call.
var errMockRead = errors.New("mock read error")

// errorReader is an io.Reader whose Read always fails.
type errorReader struct{}

// Read reports zero bytes read together with errMockRead.
func (r *errorReader) Read(_ []byte) (int, error) {
	return 0, errMockRead
}
21 | 
22 | func createDocxWithContent(content string) io.Reader {
23 | 	buf := new(bytes.Buffer)
24 | 	w := zip.NewWriter(buf)
25 | 	f, _ := w.Create("word/document.xml")
26 | 	f.Write([]byte(content))
27 | 	w.Close()
28 | 	return bytes.NewReader(buf.Bytes())
29 | }
30 | 
31 | func createEmptyDocx() io.Reader {
32 | 	return createDocxWithContent(`<?xml version="1.0" encoding="UTF-8"?><document></document>`)
33 | }
34 | 
35 | func createSingleParagraphDocx(content string) io.Reader {
36 | 	return createDocxWithContent(`<?xml version="1.0" encoding="UTF-8"?><document><p>` + content + `</p></document>`)
37 | }
38 | 
39 | func TestProcessDocx(t *testing.T) {
40 | 	type args struct {
41 | 		r   io.Reader
42 | 		url string
43 | 	}
44 | 	tests := []struct {
45 | 		name    string
46 | 		args    args
47 | 		want    []common.Chunk
48 | 		wantErr bool
49 | 	}{
50 | 		{
51 | 			name: "Empty docx file",
52 | 			args: args{
53 | 				r:   createEmptyDocx(),
54 | 				url: "http://example.com",
55 | 			},
56 | 			want:    nil,
57 | 			wantErr: false,
58 | 		},
59 | 		{
60 | 			name: "Single paragraph docx file",
61 | 			args: args{
62 | 				r:   createSingleParagraphDocx("Hello from chew!"),
63 | 				url: "http://example.com",
64 | 			},
65 | 			want: []common.Chunk{
66 | 				{
67 | 					Content: "Hello from chew! ",
68 | 					Source:  "http://example.com",
69 | 				},
70 | 			},
71 | 			wantErr: false,
72 | 		},
73 | 	}
74 | 	for _, tt := range tests {
75 | 		t.Run(tt.name, func(t *testing.T) {
76 | 			got, err := ProcessDocx(tt.args.r, tt.args.url)
77 | 			if (err != nil) != tt.wantErr {
78 | 				t.Errorf("ProcessDocx() error = %v, wantErr %v", err, tt.wantErr)
79 | 				return
80 | 			}
81 | 			if !reflect.DeepEqual(got, tt.want) {
82 | 				t.Errorf("ProcessDocx() = %v, want %v", got, tt.want)
83 | 			}
84 | 		})
85 | 	}
86 | }
87 | 
88 | func TestProcessDocx_Error_ReadAll(t *testing.T) {
89 | 	_, err := processPptxContent(&errorReader{})
90 | 	if err == nil {
91 | 		t.Error("ProcessDocx() did not return an error, but one was expected")
92 | 	}
93 | }
94 | 


--------------------------------------------------------------------------------
/internal/document/epub.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"io"
 7 | 	"strings"
 8 | 
 9 | 	"github.com/PuerkitoBio/goquery"
10 | 	"github.com/mmatongo/chew/v1/internal/common"
11 | 	"github.com/taylorskalyo/goreader/epub"
12 | )
13 | 
14 | func processEpubContent(r io.Reader) ([]common.Chunk, error) {
15 | 	content, err := io.ReadAll(r)
16 | 	if err != nil {
17 | 		return nil, fmt.Errorf("failed to read EPUB content: %w", err)
18 | 	}
19 | 
20 | 	reader, err := epub.NewReader(bytes.NewReader(content), int64(len(content)))
21 | 	if err != nil {
22 | 		return nil, fmt.Errorf("failed to create EPUB reader: %w", err)
23 | 	}
24 | 
25 | 	if len(reader.Rootfiles) == 0 {
26 | 		return nil, fmt.Errorf("EPUB contains no content")
27 | 	}
28 | 
29 | 	contents := reader.Rootfiles[0]
30 | 	var chunks []common.Chunk
31 | 
32 | 	for _, item := range contents.Manifest.Items {
33 | 		if !strings.HasSuffix(item.HREF, ".xhtml") && !strings.HasSuffix(item.HREF, ".html") {
34 | 			continue
35 | 		}
36 | 
37 | 		file, err := item.Open()
38 | 		if err != nil {
39 | 			return nil, fmt.Errorf("failed to open item %s: %w", item.HREF, err)
40 | 		}
41 | 
42 | 		text, err := extractTextFromHTML(file)
43 | 		file.Close()
44 | 		if err != nil {
45 | 			return nil, fmt.Errorf("failed to extract text from %s: %w", item.HREF, err)
46 | 		}
47 | 
48 | 		text = strings.TrimSpace(text)
49 | 		if text == "" {
50 | 			continue
51 | 		}
52 | 		chunks = append(chunks, common.Chunk{Content: text, Source: item.HREF})
53 | 	}
54 | 
55 | 	return chunks, nil
56 | }
57 | 
58 | func ProcessEpub(r io.Reader, url string) ([]common.Chunk, error) {
59 | 	chunks, err := processEpubContent(r)
60 | 	if err != nil {
61 | 		return nil, err
62 | 	}
63 | 
64 | 	for i := range chunks {
65 | 		chunks[i].Source = url
66 | 	}
67 | 
68 | 	return chunks, nil
69 | }
70 | 
71 | func extractTextFromHTML(r io.Reader) (string, error) {
72 | 	doc, err := goquery.NewDocumentFromReader(r)
73 | 	if err != nil {
74 | 		return "", err
75 | 	}
76 | 
77 | 	doc.Find("script, style,nav, header, footer").Remove()
78 | 
79 | 	var buf strings.Builder
80 | 	/*
81 | 		We're only interested in the text content of the HTML document
82 | 		however this is a very naive approach and might not work well
83 | 		for all HTML documents unfortunately.
84 | 		This is a known issue and I'm working on a better solution.
85 | 		see: https://github.com/mmatongo/chew/issues/22
86 | 
87 | 		TODO: Allow users to specify a CSS selector to extract text from
88 | 	*/
89 | 	doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) {
90 | 		buf.WriteString(strings.TrimSpace(s.Text()))
91 | 		buf.WriteString("\n\n")
92 | 	})
93 | 
94 | 	return strings.TrimSpace(buf.String()), nil
95 | }
96 | 


--------------------------------------------------------------------------------
/internal/document/epub_test.go:
--------------------------------------------------------------------------------
  1 | package document
  2 | 
  3 | import (
  4 | 	"io"
  5 | 	"os"
  6 | 	"path/filepath"
  7 | 	"reflect"
  8 | 	"strings"
  9 | 	"testing"
 10 | 
 11 | 	"github.com/mmatongo/chew/v1/internal/common"
 12 | 	"github.com/mmatongo/chew/v1/internal/utils"
 13 | )
 14 | 
 15 | func Test_processEpubContent(t *testing.T) {
 16 | 	type args struct {
 17 | 		r io.Reader
 18 | 	}
 19 | 	tests := []struct {
 20 | 		name    string
 21 | 		args    args
 22 | 		want    []common.Chunk
 23 | 		wantErr bool
 24 | 	}{
 25 | 		{
 26 | 			name: "success",
 27 | 			args: args{
 28 | 				r: func() io.Reader {
 29 | 					f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub"))
 30 | 					return f
 31 | 				}(),
 32 | 			},
 33 | 			want: []common.Chunk{
 34 | 				{
 35 | 					Content: "A pdf for testing",
 36 | 					Source:  "index.html",
 37 | 				},
 38 | 			},
 39 | 			wantErr: false,
 40 | 		},
 41 | 	}
 42 | 	for _, tt := range tests {
 43 | 		t.Run(tt.name, func(t *testing.T) {
 44 | 			got, err := processEpubContent(tt.args.r)
 45 | 			if (err != nil) != tt.wantErr {
 46 | 				t.Errorf("processEpubContent() error = %v, wantErr %v", err, tt.wantErr)
 47 | 				return
 48 | 			}
 49 | 			if !reflect.DeepEqual(got, tt.want) {
 50 | 				t.Errorf("processEpubContent() = %v, want %v", got, tt.want)
 51 | 			}
 52 | 		})
 53 | 	}
 54 | }
 55 | 
 56 | func TestProcessEpub(t *testing.T) {
 57 | 	type args struct {
 58 | 		r   io.Reader
 59 | 		url string
 60 | 	}
 61 | 	tests := []struct {
 62 | 		name    string
 63 | 		args    args
 64 | 		want    []common.Chunk
 65 | 		wantErr bool
 66 | 	}{
 67 | 		{
 68 | 			name: "success",
 69 | 			args: args{
 70 | 				r: func() io.Reader {
 71 | 					f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.epub"))
 72 | 					return f
 73 | 				}(),
 74 | 				url: "https://example.com/test.epub",
 75 | 			},
 76 | 			want: []common.Chunk{
 77 | 				{
 78 | 					Content: "A pdf for testing",
 79 | 					Source:  "https://example.com/test.epub",
 80 | 				},
 81 | 			},
 82 | 			wantErr: false,
 83 | 		},
 84 | 		{
 85 | 			name: "error",
 86 | 			args: args{
 87 | 				r:   strings.NewReader("key: value, key2: value2"),
 88 | 				url: "https://example.com/data.yaml",
 89 | 			},
 90 | 			want:    nil,
 91 | 			wantErr: true,
 92 | 		},
 93 | 		{
 94 | 			name: "empty",
 95 | 			args: args{
 96 | 				r:   strings.NewReader(""),
 97 | 				url: "https://example.com",
 98 | 			},
 99 | 			want:    nil,
100 | 			wantErr: true,
101 | 		},
102 | 		{
103 | 			name: "unreadable",
104 | 			args: args{
105 | 				r:   func() io.Reader { f, _ := os.Open("nonexistent.epub"); return f }(),
106 | 				url: "https://example.com/nonexistent.epub",
107 | 			},
108 | 			want:    nil,
109 | 			wantErr: true,
110 | 		},
111 | 	}
112 | 	for _, tt := range tests {
113 | 		t.Run(tt.name, func(t *testing.T) {
114 | 			got, err := ProcessEpub(tt.args.r, tt.args.url)
115 | 			if (err != nil) != tt.wantErr {
116 | 				t.Errorf("ProcessEpub() error = %v, wantErr %v", err, tt.wantErr)
117 | 				return
118 | 			}
119 | 			if !reflect.DeepEqual(got, tt.want) {
120 | 				t.Errorf("ProcessEpub() = %v, want %v", got, tt.want)
121 | 			}
122 | 		})
123 | 	}
124 | }
125 | 
126 | func Test_extractTextFromHTML(t *testing.T) {
127 | 	file, _ := utils.OpenFile("testdata/invalid.html")
128 | 	type args struct {
129 | 		r io.Reader
130 | 	}
131 | 	tests := []struct {
132 | 		name    string
133 | 		args    args
134 | 		want    string
135 | 		wantErr bool
136 | 	}{
137 | 		{
138 | 			name: "success",
139 | 			args: args{
140 | 				r: strings.NewReader("<html><body><h1>some content</h1></body></html>"),
141 | 			},
142 | 			want:    "some content",
143 | 			wantErr: false,
144 | 		},
145 | 		{
146 | 			name: "error",
147 | 			args: args{
148 | 				r: file,
149 | 			},
150 | 			want:    "",
151 | 			wantErr: true,
152 | 		},
153 | 	}
154 | 	for _, tt := range tests {
155 | 		t.Run(tt.name, func(t *testing.T) {
156 | 			got, err := extractTextFromHTML(tt.args.r)
157 | 			if (err != nil) != tt.wantErr {
158 | 				t.Errorf("extractTextFromHTML() error = %v, wantErr %v", err, tt.wantErr)
159 | 				return
160 | 			}
161 | 			if got != tt.want {
162 | 				t.Errorf("extractTextFromHTML() = %v, want %v", got, tt.want)
163 | 			}
164 | 		})
165 | 	}
166 | }
167 | 


--------------------------------------------------------------------------------
/internal/document/pdf.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"fmt"
 6 | 	"io"
 7 | 	"log"
 8 | 	"strings"
 9 | 
10 | 	"github.com/ledongthuc/pdf"
11 | 	"github.com/mmatongo/chew/v1/internal/common"
12 | )
13 | 
14 | func ProcessPDF(r io.Reader, url string) ([]common.Chunk, error) {
15 | 	pdfData, err := io.ReadAll(r)
16 | 	if err != nil {
17 | 		return nil, err
18 | 	}
19 | 
20 | 	f, err := pdf.NewReader(bytes.NewReader(pdfData), int64(len(pdfData)))
21 | 	if err != nil {
22 | 		return nil, err
23 | 	}
24 | 
25 | 	var chunks []common.Chunk
26 | 	for i := 1; i <= f.NumPage(); i++ {
27 | 		p := f.Page(i)
28 | 		if p.V.IsNull() {
29 | 			continue
30 | 		}
31 | 		text, err := p.GetPlainText(nil)
32 | 		if err != nil {
33 | 			log.Printf("Error extracting text from page %d: %v\n\n", i, err)
34 | 			continue
35 | 		}
36 | 
37 | 		text = strings.TrimSpace(text)
38 | 		text = strings.ReplaceAll(text, "\n", "\n\n")
39 | 
40 | 		chunks = append(chunks, common.Chunk{
41 | 			Content: text,
42 | 			Source:  fmt.Sprintf("%s#page=%d", url, i),
43 | 		})
44 | 	}
45 | 
46 | 	if len(chunks) == 0 {
47 | 		return nil, err
48 | 	}
49 | 
50 | 	return chunks, nil
51 | }
52 | 


--------------------------------------------------------------------------------
/internal/document/pdf_test.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"os"
 6 | 	"path/filepath"
 7 | 	"reflect"
 8 | 	"strings"
 9 | 	"testing"
10 | 
11 | 	"github.com/mmatongo/chew/v1/internal/common"
12 | )
13 | 
14 | func getRootPath(t *testing.T) string {
15 | 	t.Helper()
16 | 	pwd, err := os.Getwd()
17 | 	if err != nil {
18 | 		t.Fatalf("getting current folder: %s", err)
19 | 	}
20 | 	pwd = filepath.Dir(filepath.Dir(pwd))
21 | 	return pwd
22 | }
23 | 
24 | func TestProcessPDF(t *testing.T) {
25 | 	type args struct {
26 | 		r   io.Reader
27 | 		url string
28 | 	}
29 | 	tests := []struct {
30 | 		name    string
31 | 		args    args
32 | 		want    []common.Chunk
33 | 		wantErr bool
34 | 	}{
35 | 		{
36 | 			name: "success",
37 | 			args: args{
38 | 				r: func() io.Reader {
39 | 					f, _ := os.Open(filepath.Join(getRootPath(t), "testdata", "files", "test.pdf"))
40 | 					return f
41 | 				}(),
42 | 				url: "https://example.com/test.pdf",
43 | 			},
44 | 			want: []common.Chunk{
45 | 				{
46 | 					Content: "Apdffortesting",
47 | 					Source:  "https://example.com/test.pdf#page=1",
48 | 				},
49 | 			},
50 | 			wantErr: false,
51 | 		},
52 | 		{
53 | 			name: "error",
54 | 			args: args{
55 | 				r:   strings.NewReader("key: value, key2: value2"),
56 | 				url: "https://example.com/data.yaml",
57 | 			},
58 | 			want:    nil,
59 | 			wantErr: true,
60 | 		},
61 | 		{
62 | 			name: "empty",
63 | 			args: args{
64 | 				r:   strings.NewReader(""),
65 | 				url: "https://example.com",
66 | 			},
67 | 			want:    nil,
68 | 			wantErr: true,
69 | 		},
70 | 		{
71 | 			name: "unreadable",
72 | 			args: args{
73 | 				r:   func() io.Reader { f, _ := os.Open("nonexistent.pdf"); return f }(),
74 | 				url: "https://example.com/nonexistent.pdf",
75 | 			},
76 | 			want:    nil,
77 | 			wantErr: true,
78 | 		},
79 | 	}
80 | 	for _, tt := range tests {
81 | 		t.Run(tt.name, func(t *testing.T) {
82 | 			got, err := ProcessPDF(tt.args.r, tt.args.url)
83 | 			if (err != nil) != tt.wantErr {
84 | 				t.Errorf("ProcessPDF() error = %v, wantErr %v", err, tt.wantErr)
85 | 				return
86 | 			}
87 | 			if !reflect.DeepEqual(got, tt.want) {
88 | 				t.Errorf("ProcessPDF() = %v, want %v", got, tt.want)
89 | 			}
90 | 		})
91 | 	}
92 | }
93 | 


--------------------------------------------------------------------------------
/internal/document/pptx.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"archive/zip"
 5 | 	"bytes"
 6 | 	"io"
 7 | 	"strings"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | 	"github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 | 
13 | func processPptxContent(r io.Reader) ([]string, error) {
14 | 	data, err := io.ReadAll(r)
15 | 	if err != nil {
16 | 		return nil, err
17 | 	}
18 | 
19 | 	zipReader, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
20 | 	if err != nil {
21 | 		return nil, err
22 | 	}
23 | 
24 | 	var contents []string
25 | 
26 | 	for _, file := range zipReader.File {
27 | 		if strings.HasPrefix(file.Name, "ppt/slides/") {
28 | 			slideText, err := utils.ExtractTextFromXML(file)
29 | 			if err != nil {
30 | 				return nil, err
31 | 			}
32 | 			contents = append(contents, slideText...)
33 | 		}
34 | 	}
35 | 
36 | 	var allContent strings.Builder
37 | 	for _, content := range contents {
38 | 		allContent.WriteString(content)
39 | 		allContent.WriteString(" ")
40 | 	}
41 | 
42 | 	return []string{allContent.String()}, nil
43 | 
44 | 	/*
45 | 		// In the event we just want chunks we can just return contents
46 | 		return contents, nil
47 | 	*/
48 | }
49 | 
50 | func ProcessPptx(r io.Reader, url string) ([]common.Chunk, error) {
51 | 	content, err := processPptxContent(r)
52 | 	if err != nil {
53 | 		return nil, err
54 | 	}
55 | 
56 | 	var chunks []common.Chunk
57 | 	for _, chunk := range content {
58 | 		if strings.TrimSpace(string(chunk)) != "" {
59 | 			chunks = append(chunks, common.Chunk{Content: string(chunk), Source: url})
60 | 		}
61 | 	}
62 | 
63 | 	return chunks, nil
64 | }
65 | 


--------------------------------------------------------------------------------
/internal/document/pptx_test.go:
--------------------------------------------------------------------------------
 1 | package document
 2 | 
 3 | import (
 4 | 	"archive/zip"
 5 | 	"bytes"
 6 | 	"io"
 7 | 	"reflect"
 8 | 	"testing"
 9 | 
10 | 	"github.com/mmatongo/chew/v1/internal/common"
11 | )
12 | 
13 | func createPptxWithContent(content string) io.Reader {
14 | 	buf := new(bytes.Buffer)
15 | 	w := zip.NewWriter(buf)
16 | 	f, _ := w.Create("ppt/slides/slide1.xml")
17 | 	f.Write([]byte(content))
18 | 	w.Close()
19 | 	return bytes.NewReader(buf.Bytes())
20 | }
21 | 
22 | func createEmptyPptx() io.Reader {
23 | 	return createPptxWithContent(`<?xml version="1.0" encoding="UTF-8"?><document></document>`)
24 | }
25 | 
26 | func createSingleParagraphPptx(content string) io.Reader {
27 | 	return createPptxWithContent(`<?xml version="1.0" encoding="UTF-8"?><document><p>` + content + `</p></document>`)
28 | }
29 | 
30 | func TestProcessPptx(t *testing.T) {
31 | 	type args struct {
32 | 		r   io.Reader
33 | 		url string
34 | 	}
35 | 	tests := []struct {
36 | 		name    string
37 | 		args    args
38 | 		want    []common.Chunk
39 | 		wantErr bool
40 | 	}{
41 | 		{
42 | 			name:    "Empty pptx file",
43 | 			args:    args{r: createEmptyPptx(), url: "http://example.com"},
44 | 			want:    nil,
45 | 			wantErr: false,
46 | 		},
47 | 		{
48 | 			name:    "Single paragraph pptx file",
49 | 			args:    args{r: createSingleParagraphPptx("Hello from chew!"), url: "http://example.com"},
50 | 			want:    []common.Chunk{{Content: "Hello from chew! ", Source: "http://example.com"}},
51 | 			wantErr: false,
52 | 		},
53 | 	}
54 | 	for _, tt := range tests {
55 | 		t.Run(tt.name, func(t *testing.T) {
56 | 			got, err := ProcessPptx(tt.args.r, tt.args.url)
57 | 			if (err != nil) != tt.wantErr {
58 | 				t.Errorf("ProcessPptx() error = %v, wantErr %v", err, tt.wantErr)
59 | 				return
60 | 			}
61 | 			if !reflect.DeepEqual(got, tt.want) {
62 | 				t.Errorf("ProcessPptx() = %v, want %v", got, tt.want)
63 | 			}
64 | 		})
65 | 	}
66 | }
67 | 
68 | func TestProcessPptx_Error_ReadAll(t *testing.T) {
69 | 	_, err := processPptxContent(&errorReader{})
70 | 	if err == nil {
71 | 		t.Error("ProcessPptx() did not return an error, but one was expected")
72 | 	}
73 | }
74 | 


--------------------------------------------------------------------------------
/internal/text/csv.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"encoding/csv"
 5 | 	"io"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/mmatongo/chew/v1/internal/common"
 9 | )
10 | 
11 | func ProcessCSV(r io.Reader, url string) ([]common.Chunk, error) {
12 | 	csvReader := csv.NewReader(r)
13 | 	var records [][]string
14 | 	var err error
15 | 
16 | 	records, err = csvReader.ReadAll()
17 | 	if err != nil {
18 | 		return nil, err
19 | 	}
20 | 
21 | 	var chunks []common.Chunk
22 | 	for _, record := range records {
23 | 		chunks = append(chunks, common.Chunk{Content: strings.Join(record, ", "), Source: url})
24 | 	}
25 | 
26 | 	return chunks, nil
27 | }
28 | 


--------------------------------------------------------------------------------
/internal/text/csv_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | )
11 | 
12 | func TestProcessCSV(t *testing.T) {
13 | 	type args struct {
14 | 		r   io.Reader
15 | 		url string
16 | 	}
17 | 	tests := []struct {
18 | 		name    string
19 | 		args    args
20 | 		want    []common.Chunk
21 | 		wantErr bool
22 | 	}{
23 | 		{
24 | 			name: "success",
25 | 			args: args{
26 | 				r:   strings.NewReader("Test content"),
27 | 				url: "https://example.com",
28 | 			},
29 | 			want: []common.Chunk{{
30 | 				Content: "Test content",
31 | 				Source:  "https://example.com",
32 | 			}},
33 | 			wantErr: false,
34 | 		},
35 | 		{
36 | 			name: "empty",
37 | 			args: args{
38 | 				r:   strings.NewReader(""),
39 | 				url: "https://example.com",
40 | 			},
41 | 			want:    nil,
42 | 			wantErr: false,
43 | 		},
44 | 		{
45 | 			name: "CSV with quoted fields",
46 | 			args: args{
47 | 				r:   strings.NewReader("\"header 1\",\"header 2\"\n\"value, with comma\",\"value2\""),
48 | 				url: "https://example.com/quoted.csv",
49 | 			},
50 | 			want: []common.Chunk{
51 | 				{Content: "header 1, header 2", Source: "https://example.com/quoted.csv"},
52 | 				{Content: "value, with comma, value2", Source: "https://example.com/quoted.csv"},
53 | 			},
54 | 			wantErr: false,
55 | 		},
56 | 	}
57 | 	for _, tt := range tests {
58 | 		t.Run(tt.name, func(t *testing.T) {
59 | 			got, err := ProcessCSV(tt.args.r, tt.args.url)
60 | 			if (err != nil) != tt.wantErr {
61 | 				t.Errorf("ProcessCSV() error = %v, wantErr %v", err, tt.wantErr)
62 | 				return
63 | 			}
64 | 			if !reflect.DeepEqual(got, tt.want) {
65 | 				t.Errorf("ProcessCSV() = %v, want %v", got, tt.want)
66 | 			}
67 | 		})
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/internal/text/html.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"io"
 6 | 	"strings"
 7 | 
 8 | 	"github.com/PuerkitoBio/goquery"
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | )
11 | 
12 | func ProcessHTML(r io.Reader, url string) ([]common.Chunk, error) {
13 | 	doc, err := goquery.NewDocumentFromReader(r)
14 | 	if err != nil {
15 | 		return nil, fmt.Errorf("failed to parse HTML: %w", err)
16 | 	}
17 | 
18 | 	var chunks []common.Chunk
19 | 	/*
20 | 		We're only interested in the text content of the HTML document
21 | 		so we're going to ignore the tags that don't contain useful text.
22 | 		This is a very naive approach and might not work for all HTML documents unfortunately
23 | 	*/
24 | 
25 | 	doc.Find("nav, header, footer").Remove()
26 | 
27 | 	doc.Find("p, h1, h2, h3, h4, h5, h6, li").Each(func(_ int, s *goquery.Selection) {
28 | 		text := strings.TrimSpace(s.Text())
29 | 		if text != "" {
30 | 			chunks = append(chunks, common.Chunk{Content: text, Source: url})
31 | 		}
32 | 	})
33 | 
34 | 	return chunks, nil
35 | }
36 | 


--------------------------------------------------------------------------------
/internal/text/html_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | 	"github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 | 
13 | func TestProcessHTML(t *testing.T) {
14 | 	file, _ := utils.OpenFile("testdata/invalid.html")
15 | 	type args struct {
16 | 		r   io.Reader
17 | 		url string
18 | 	}
19 | 	tests := []struct {
20 | 		name    string
21 | 		args    args
22 | 		want    []common.Chunk
23 | 		wantErr bool
24 | 	}{
25 | 		{
26 | 			name: "success",
27 | 			args: args{
28 | 				r: strings.NewReader(`
29 | 					<!DOCTYPE html>
30 | 					<html>
31 | 					<head>
32 | 						<title>Test HTML</title>
33 | 					</head>
34 | 					<body>
35 | 						<h1>Test content</h1>
36 | 						<p>This is a test paragraph.</p>
37 | 					</body>
38 | 					</html>
39 | 				`),
40 | 				url: "https://example.com/page.html",
41 | 			},
42 | 			want: []common.Chunk{
43 | 				{
44 | 					Content: "Test content",
45 | 					Source:  "https://example.com/page.html",
46 | 				},
47 | 				{
48 | 					Content: "This is a test paragraph.",
49 | 					Source:  "https://example.com/page.html",
50 | 				},
51 | 			},
52 | 			wantErr: false,
53 | 		},
54 | 		{
55 | 			name: "empty",
56 | 			args: args{
57 | 				r:   strings.NewReader(""),
58 | 				url: "https://example.com",
59 | 			},
60 | 			want:    nil,
61 | 			wantErr: false,
62 | 		},
63 | 		{
64 | 			name: "invalid content as a reader",
65 | 			args: args{
66 | 				r:   file,
67 | 				url: "https://example.com",
68 | 			},
69 | 			want:    nil,
70 | 			wantErr: true,
71 | 		},
72 | 	}
73 | 	for _, tt := range tests {
74 | 		t.Run(tt.name, func(t *testing.T) {
75 | 			got, err := ProcessHTML(tt.args.r, tt.args.url)
76 | 			if (err != nil) != tt.wantErr {
77 | 				t.Errorf("processHTML() error = %v, wantErr %v", err, tt.wantErr)
78 | 				return
79 | 			}
80 | 			if !reflect.DeepEqual(got, tt.want) {
81 | 				t.Errorf("processHTML() = %v, want %v", got, tt.want)
82 | 			}
83 | 		})
84 | 	}
85 | }
86 | 


--------------------------------------------------------------------------------
/internal/text/json.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"io"
 7 | 
 8 | 	"github.com/mmatongo/chew/v1/internal/common"
 9 | )
10 | 
11 | func ProcessJSON(r io.Reader, url string) ([]common.Chunk, error) {
12 | 	var data interface{}
13 | 	if err := json.NewDecoder(r).Decode(&data); err != nil {
14 | 		return nil, err
15 | 	}
16 | 
17 | 	jsonStr, err := json.MarshalIndent(data, "", "  ")
18 | 	if err != nil {
19 | 		return nil, fmt.Errorf("failed to marshal json: %w", err)
20 | 	}
21 | 
22 | 	return []common.Chunk{{Content: string(jsonStr), Source: url}}, nil
23 | }
24 | 


--------------------------------------------------------------------------------
/internal/text/json_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | )
11 | 
12 | func TestProcessJSON(t *testing.T) {
13 | 	type args struct {
14 | 		r   io.Reader
15 | 		url string
16 | 	}
17 | 	tests := []struct {
18 | 		name    string
19 | 		args    args
20 | 		want    []common.Chunk
21 | 		wantErr bool
22 | 	}{
23 | 		{
24 | 			name: "success",
25 | 			args: args{
26 | 				r:   strings.NewReader(`{"key": "value"}`),
27 | 				url: "https://example.com/data.json",
28 | 			},
29 | 			want: []common.Chunk{{
30 | 				Content: "{\n  \"key\": \"value\"\n}",
31 | 				Source:  "https://example.com/data.json",
32 | 			}},
33 | 			wantErr: false,
34 | 		},
35 | 		{
36 | 			name: "empty",
37 | 			args: args{
38 | 				r:   strings.NewReader(""),
39 | 				url: "https://example.com",
40 | 			},
41 | 			want:    nil,
42 | 			wantErr: true,
43 | 		},
44 | 		{
45 | 			name: "valid empty json",
46 | 			args: args{
47 | 				r:   strings.NewReader("{}"),
48 | 				url: "https://example.com",
49 | 			},
50 | 			want: []common.Chunk{{
51 | 				Content: "{}",
52 | 				Source:  "https://example.com",
53 | 			}},
54 | 			wantErr: false,
55 | 		},
56 | 	}
57 | 	for _, tt := range tests {
58 | 		t.Run(tt.name, func(t *testing.T) {
59 | 			got, err := ProcessJSON(tt.args.r, tt.args.url)
60 | 			if (err != nil) != tt.wantErr {
61 | 				t.Errorf("ProcessJSON() error = %v, wantErr %v", err, tt.wantErr)
62 | 				return
63 | 			}
64 | 			if !reflect.DeepEqual(got, tt.want) {
65 | 				t.Errorf("ProcessJSON() = %v, want %v", got, tt.want)
66 | 			}
67 | 		})
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/internal/text/markdown.go:
--------------------------------------------------------------------------------
1 | package text
2 | 
// ProcessMd is the processor for Markdown input. It is the same function as
// ProcessText, so the input is returned as plain text without any
// Markdown-specific parsing.
var ProcessMd = ProcessText
4 | 


--------------------------------------------------------------------------------
/internal/text/plaintext.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 
 6 | 	"github.com/mmatongo/chew/v1/internal/common"
 7 | )
 8 | 
 9 | func ProcessText(r io.Reader, url string) ([]common.Chunk, error) {
10 | 	content, err := io.ReadAll(r)
11 | 	if err != nil {
12 | 		return nil, err
13 | 	}
14 | 
15 | 	if len(content) == 0 {
16 | 		return nil, nil
17 | 	}
18 | 
19 | 	return []common.Chunk{{Content: string(content), Source: url}}, nil
20 | }
21 | 


--------------------------------------------------------------------------------
/internal/text/plaintext_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | 	"github.com/mmatongo/chew/v1/internal/utils"
11 | )
12 | 
13 | func TestProcessText(t *testing.T) {
14 | 	file, _ := utils.OpenFile("testdata/invalid.html")
15 | 	type args struct {
16 | 		r   io.Reader
17 | 		url string
18 | 	}
19 | 	tests := []struct {
20 | 		name    string
21 | 		args    args
22 | 		want    []common.Chunk
23 | 		wantErr bool
24 | 	}{
25 | 		{
26 | 			name: "success",
27 | 			args: args{
28 | 				r:   strings.NewReader("Test content"),
29 | 				url: "https://example.com",
30 | 			},
31 | 			want: []common.Chunk{{
32 | 				Content: "Test content",
33 | 				Source:  "https://example.com",
34 | 			}},
35 | 			wantErr: false,
36 | 		},
37 | 		{
38 | 			name: "empty",
39 | 			args: args{
40 | 				r:   strings.NewReader(""),
41 | 				url: "https://example.com",
42 | 			},
43 | 			want:    nil,
44 | 			wantErr: false,
45 | 		},
46 | 		{
47 | 			name: "invalid",
48 | 			args: args{
49 | 				r:   file,
50 | 				url: "https://example.com",
51 | 			},
52 | 			want:    nil,
53 | 			wantErr: true,
54 | 		},
55 | 	}
56 | 	for _, tt := range tests {
57 | 		t.Run(tt.name, func(t *testing.T) {
58 | 			got, err := ProcessText(tt.args.r, tt.args.url)
59 | 			if (err != nil) != tt.wantErr {
60 | 				t.Errorf("ProcessText() error = %v, wantErr %v", err, tt.wantErr)
61 | 				return
62 | 			}
63 | 			if !reflect.DeepEqual(got, tt.want) {
64 | 				t.Errorf("ProcessText() = %v, want %v", got, tt.want)
65 | 			}
66 | 		})
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/internal/text/xml.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"encoding/xml"
 6 | 	"io"
 7 | 
 8 | 	"github.com/mmatongo/chew/v1/internal/common"
 9 | )
10 | 
11 | func ProcessXML(r io.Reader, url string) ([]common.Chunk, error) {
12 | 	decoder := xml.NewDecoder(r)
13 | 	var chunks []common.Chunk
14 | 	var currentElement string
15 | 	for {
16 | 		t, err := decoder.Token()
17 | 		if err == io.EOF {
18 | 			break
19 | 		}
20 | 		if err != nil {
21 | 			return nil, err
22 | 		}
23 | 		switch se := t.(type) {
24 | 		case xml.StartElement:
25 | 			currentElement = se.Name.Local
26 | 		case xml.CharData:
27 | 			content := string(bytes.TrimSpace(se))
28 | 			if content != "" && currentElement != "" {
29 | 				chunks = append(chunks, common.Chunk{
30 | 					Content: content,
31 | 					Source:  url,
32 | 				})
33 | 			}
34 | 		}
35 | 	}
36 | 	return chunks, nil
37 | }
38 | 


--------------------------------------------------------------------------------
/internal/text/xml_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | )
11 | 
12 | func TestProcessXML(t *testing.T) {
13 | 	type args struct {
14 | 		r   io.Reader
15 | 		url string
16 | 	}
17 | 	tests := []struct {
18 | 		name    string
19 | 		args    args
20 | 		want    []common.Chunk
21 | 		wantErr bool
22 | 	}{
23 | 		{
24 | 			name: "success",
25 | 			args: args{
26 | 				r:   strings.NewReader("<root><child>Test content</child></root>"),
27 | 				url: "https://example.com",
28 | 			},
29 | 			want: []common.Chunk{{
30 | 				Content: "Test content",
31 | 				Source:  "https://example.com",
32 | 			}},
33 | 
34 | 			wantErr: false,
35 | 		},
36 | 	}
37 | 	for _, tt := range tests {
38 | 		t.Run(tt.name, func(t *testing.T) {
39 | 			got, err := ProcessXML(tt.args.r, tt.args.url)
40 | 			if (err != nil) != tt.wantErr {
41 | 				t.Errorf("ProcessXML() error = %v, wantErr %v", err, tt.wantErr)
42 | 				return
43 | 			}
44 | 			if !reflect.DeepEqual(got, tt.want) {
45 | 				t.Errorf("ProcessXML() = %v, want %v", got, tt.want)
46 | 			}
47 | 		})
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/internal/text/yaml.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 
 6 | 	"github.com/mmatongo/chew/v1/internal/common"
 7 | 	"gopkg.in/yaml.v3"
 8 | )
 9 | 
10 | func ProcessYAML(r io.Reader, url string) ([]common.Chunk, error) {
11 | 	var data interface{}
12 | 	if err := yaml.NewDecoder(r).Decode(&data); err != nil {
13 | 		return nil, err
14 | 	}
15 | 
16 | 	yamlStr, err := yaml.Marshal(data)
17 | 	if err != nil {
18 | 		return nil, err
19 | 	}
20 | 
21 | 	return []common.Chunk{{Content: string(yamlStr), Source: url}}, nil
22 | }
23 | 


--------------------------------------------------------------------------------
/internal/text/yaml_test.go:
--------------------------------------------------------------------------------
 1 | package text
 2 | 
 3 | import (
 4 | 	"io"
 5 | 	"reflect"
 6 | 	"strings"
 7 | 	"testing"
 8 | 
 9 | 	"github.com/mmatongo/chew/v1/internal/common"
10 | )
11 | 
12 | func TestProcessYAML(t *testing.T) {
13 | 	type args struct {
14 | 		r   io.Reader
15 | 		url string
16 | 	}
17 | 	tests := []struct {
18 | 		name    string
19 | 		args    args
20 | 		want    []common.Chunk
21 | 		wantErr bool
22 | 	}{
23 | 		{
24 | 			name: "success",
25 | 			args: args{
26 | 				r:   strings.NewReader("key: value\nkey2: value2"),
27 | 				url: "https://example.com/data.yaml",
28 | 			},
29 | 			want: []common.Chunk{
30 | 				{
31 | 					Content: "key: value\nkey2: value2\n",
32 | 					Source:  "https://example.com/data.yaml",
33 | 				},
34 | 			},
35 | 			wantErr: false,
36 | 		},
37 | 		{
38 | 			name: "error",
39 | 			args: args{
40 | 				r:   strings.NewReader("key: value, key2: value2"),
41 | 				url: "https://example.com/data.yaml",
42 | 			},
43 | 			want:    nil,
44 | 			wantErr: true,
45 | 		},
46 | 	}
47 | 	for _, tt := range tests {
48 | 		t.Run(tt.name, func(t *testing.T) {
49 | 			got, err := ProcessYAML(tt.args.r, tt.args.url)
50 | 			if (err != nil) != tt.wantErr {
51 | 				t.Errorf("ProcessYAML() error = %v, wantErr %v", err, tt.wantErr)
52 | 				return
53 | 			}
54 | 			if !reflect.DeepEqual(got, tt.want) {
55 | 				t.Errorf("ProcessYAML() = %v, want %v", got, tt.want)
56 | 			}
57 | 		})
58 | 	}
59 | }
60 | 


--------------------------------------------------------------------------------
/internal/transcribe/google_transcriber.go:
--------------------------------------------------------------------------------
 1 | package transcribe
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"errors"
 6 | 	"fmt"
 7 | 	"path/filepath"
 8 | 
 9 | 	"cloud.google.com/go/storage"
10 | 
11 | 	"github.com/mmatongo/chew/v1/internal/audio"
12 | 	"github.com/mmatongo/chew/v1/internal/utils/gcs"
13 | )
14 | 
15 | type googleTranscriber struct{}
16 | 
17 | /*
18 | This relies too heavily on external dependencies and is not easily testable. A refactor is needed to make it more testable and is currently in progress.
19 | */
20 | func (gt *googleTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
21 | 	client, err := gcs.NewSpeechClient(ctx, opts)
22 | 	if err != nil {
23 | 		return "", fmt.Errorf("failed to create speech client: %w", err)
24 | 	}
25 | 	defer func() {
26 | 		if cerr := client.Close(); cerr != nil {
27 | 			err = errors.Join(err, fmt.Errorf("failed to close transcribe client: %w", cerr))
28 | 		}
29 | 	}()
30 | 
31 | 	storageClient, err := gcs.NewStorageClient(ctx, opts)
32 | 	if err != nil {
33 | 		return "", err
34 | 	}
35 | 	defer func(storageClient *storage.Client) {
36 | 		err := storageClient.Close()
37 | 		if err != nil {
38 | 			fmt.Printf("failed to close storage client: %v\n", err)
39 | 		}
40 | 	}(storageClient)
41 | 
42 | 	audioInfo, err := audio.GetAudioInfo(filename)
43 | 	if err != nil {
44 | 		return "", fmt.Errorf("failed to process audio file: %w", err)
45 | 	}
46 | 
47 | 	gcsURI, err := gcs.UploadToGCS(ctx, storageClient, opts.Bucket, filename)
48 | 	if err != nil {
49 | 		return "", fmt.Errorf("failed to upload to GCS: %w", err)
50 | 	}
51 | 
52 | 	if opts.CleanupOnComplete {
53 | 		defer func(ctx context.Context, client *storage.Client, bucket, objectName string) {
54 | 			err := gcs.DeleteFromGCS(ctx, client, bucket, objectName)
55 | 			if err != nil {
56 | 				fmt.Printf("failed to delete object from GCS: %v\n", err)
57 | 			}
58 | 		}(ctx, storageClient, opts.Bucket, filepath.Base(filename))
59 | 	}
60 | 
61 | 	req := gcs.NewRecognitionRequest(opts, audioInfo, gcsURI)
62 | 
63 | 	op, err := client.LongRunningRecognize(ctx, req)
64 | 	if err != nil {
65 | 		return "", fmt.Errorf("failed to start long running recognition: %w", err)
66 | 	}
67 | 
68 | 	resp, err := op.Wait(ctx)
69 | 	if err != nil {
70 | 		return "", fmt.Errorf("failed to get long running recognition results: %w", err)
71 | 	}
72 | 
73 | 	return gcs.ExtractTranscript(resp), nil
74 | }
75 | 


--------------------------------------------------------------------------------
/internal/transcribe/google_transcriber_test.go:
--------------------------------------------------------------------------------
 1 | package transcribe
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_googleTranscriber_process(t *testing.T) {
 9 | 	type args struct {
10 | 		ctx      context.Context
11 | 		filename string
12 | 		opts     TranscribeOptions
13 | 	}
14 | 	tests := []struct {
15 | 		name    string
16 | 		gt      *googleTranscriber
17 | 		args    args
18 | 		want    string
19 | 		wantErr bool
20 | 	}{
21 | 		{
22 | 			name: "failed to create speech client",
23 | 			gt:   &googleTranscriber{},
24 | 			args: args{
25 | 				ctx:      context.Background(),
26 | 				filename: "test.mp3",
27 | 				opts:     TranscribeOptions{},
28 | 			},
29 | 			want:    "",
30 | 			wantErr: true,
31 | 		},
32 | 	}
33 | 	for _, tt := range tests {
34 | 		t.Run(tt.name, func(t *testing.T) {
35 | 			gt := &googleTranscriber{}
36 | 			got, err := gt.process(tt.args.ctx, tt.args.filename, tt.args.opts)
37 | 			if (err != nil) != tt.wantErr {
38 | 				t.Errorf("googleTranscriber.process() error = %v, wantErr %v", err, tt.wantErr)
39 | 				return
40 | 			}
41 | 			if got != tt.want {
42 | 				t.Errorf("googleTranscriber.process() = %v, want %v", got, tt.want)
43 | 			}
44 | 		})
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/internal/transcribe/transcribe.go:
--------------------------------------------------------------------------------
  1 | package transcribe
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"sync"
  7 | 
  8 | 	"github.com/mmatongo/chew/v1/internal/common"
  9 | )
 10 | 
/*
The TranscribeOptions struct contains the options for transcribing an audio file. It allows the user
to specify the Google Cloud credentials, the GCS bucket to upload the audio file to, the language code
to use for transcription, an option to enable diarization (including the min and max speakers), and
an option to clean up the audio file from GCS after transcription is complete.

It also allows the user to specify whether to use the Whisper API for transcription and, if so,
the API key, model, and prompt to use.

It is an alias of common.TranscribeOptions.
*/
type TranscribeOptions = common.TranscribeOptions
 21 | 
 22 | // code is largely inspired by https://github.com/polyfact/polyfire-api
 23 | 
// transcribeOption is a functional option that adjusts a transcribeConfig.
type transcribeOption func(*transcribeConfig)

// transcribeConfig holds the settings that transcribeOption values can set.
type transcribeConfig struct {
	// t is the transcriber to use; it stays nil unless an option sets it.
	t transcriber
}
 29 | 
 30 | func WithTranscriber(t transcriber) transcribeOption {
 31 | 	return func(config *transcribeConfig) {
 32 | 		config.t = t
 33 | 	}
 34 | }
 35 | 
/*
Transcribe transcribes each of the given audio files concurrently and returns a
map from filename to transcript.

The backend is chosen as follows: a transcriber supplied through
WithTranscriber is used if present; otherwise the Whisper transcriber is used
when opts.UseWhisper is set and the Google transcriber when it is not. The same
opts value is passed to the transcription of every file.

One goroutine is started per filename. Transcribe returns the first error it
receives, wrapped as "transcribing <filename>: ...", or ctx.Err() when the
context is done. In both of those cases the returned map is nil and goroutines
that are still running are not waited for. When every file succeeds, the
completed map is returned with a nil error.
*/
func Transcribe(ctx context.Context, filenames []string, opts TranscribeOptions, options ...transcribeOption) (map[string]string, error) {
	config := &transcribeConfig{}
	for _, option := range options {
		option(config)
	}

	// Fall back to a built-in transcriber when no option supplied one.
	if config.t == nil {
		if opts.UseWhisper {
			config.t = &whisperTranscriber{}
		} else {
			config.t = &googleTranscriber{}
		}
	}

	var (
		results = make(map[string]string)
		wg      sync.WaitGroup
		mu      sync.Mutex // guards results
		// Buffered with one slot per file, so a failing goroutine can always
		// send its error even after Transcribe has returned.
		errCh = make(chan error, len(filenames))
	)

	for _, filename := range filenames {
		wg.Add(1)
		go func(filename string) {
			defer wg.Done()

			transcript, err := config.t.process(ctx, filename, opts)
			if err != nil {
				// Non-blocking send: the error is dropped if the channel
				// cannot accept it.
				select {
				case errCh <- fmt.Errorf("transcribing %s: %w", filename, err):
				default:
				}
				return
			}

			mu.Lock()
			results[filename] = transcript
			mu.Unlock()
		}(filename)
	}

	// Close errCh once every goroutine has finished, so that the receive
	// below yields a nil error when no goroutine reported a failure.
	go func() {
		wg.Wait()
		close(errCh)
	}()

	select {
	case err := <-errCh:
		// A nil err here means errCh was closed without any error being sent.
		if err != nil {
			return nil, err
		}
	case <-ctx.Done():
		return nil, ctx.Err()
	}

	return results, nil
}
101 | 


--------------------------------------------------------------------------------
/internal/transcribe/transcribe_test.go:
--------------------------------------------------------------------------------
 1 | package transcribe
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"fmt"
 6 | 	"reflect"
 7 | 	"testing"
 8 | )
 9 | 
10 | type mockTranscriber struct {
11 | 	processFn func(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
12 | }
13 | 
14 | func (m *mockTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
15 | 	if m.processFn != nil {
16 | 		return m.processFn(ctx, filename, opts)
17 | 	}
18 | 	return "", nil
19 | }
20 | 
21 | func TestTranscribe(t *testing.T) {
22 | 	type args struct {
23 | 		ctx       context.Context
24 | 		filenames []string
25 | 		opts      TranscribeOptions
26 | 	}
27 | 	tests := []struct {
28 | 		name    string
29 | 		args    args
30 | 		want    map[string]string
31 | 		wantErr bool
32 | 		mockFn  func(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
33 | 	}{
34 | 		{
35 | 			name: "Test Transcribe",
36 | 			args: args{
37 | 				ctx:       context.Background(),
38 | 				filenames: []string{"test1.mp3", "test2.mp3"},
39 | 				opts: TranscribeOptions{
40 | 					CredentialsJSON:   []byte("``"),
41 | 					Bucket:            "test-bucket",
42 | 					LanguageCode:      "en-US",
43 | 					EnableDiarization: false,
44 | 					MinSpeakers:       0,
45 | 					MaxSpeakers:       0,
46 | 					CleanupOnComplete: false,
47 | 					UseWhisper:        false,
48 | 					WhisperAPIKey:     "",
49 | 					WhisperModel:      "",
50 | 					WhisperPrompt:     "",
51 | 				},
52 | 			},
53 | 			want: map[string]string{
54 | 				"test1.mp3": "transcript for test1.mp3",
55 | 				"test2.mp3": "transcript for test2.mp3",
56 | 			},
57 | 			wantErr: false,
58 | 			mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
59 | 				return "transcript for " + filename, nil
60 | 			},
61 | 		},
62 | 		{
63 | 			name: "Test Transcribe Error",
64 | 			args: args{
65 | 				ctx:       context.Background(),
66 | 				filenames: []string{"test1.mp3", "test2.mp3"},
67 | 				opts:      TranscribeOptions{},
68 | 			},
69 | 			want:    nil,
70 | 			wantErr: true,
71 | 			mockFn: func(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
72 | 				return "", fmt.Errorf("mock error")
73 | 			},
74 | 		},
75 | 	}
76 | 	for _, tt := range tests {
77 | 		t.Run(tt.name, func(t *testing.T) {
78 | 			mockT := &mockTranscriber{
79 | 				processFn: tt.mockFn,
80 | 			}
81 | 			got, err := Transcribe(tt.args.ctx, tt.args.filenames, tt.args.opts, WithTranscriber(mockT))
82 | 			if (err != nil) != tt.wantErr {
83 | 				t.Errorf("Transcribe() error = %v, wantErr %v", err, tt.wantErr)
84 | 				return
85 | 			}
86 | 			if !reflect.DeepEqual(got, tt.want) {
87 | 				t.Errorf("Transcribe() = %v, want %v", got, tt.want)
88 | 			}
89 | 		})
90 | 	}
91 | }
92 | 


--------------------------------------------------------------------------------
/internal/transcribe/types.go:
--------------------------------------------------------------------------------
 1 | package transcribe
 2 | 
 3 | import (
 4 | 	"context"
 5 | 	"io"
 6 | 	"net/http"
 7 | )
 8 | 
// transcriber is implemented by each transcription backend.
type transcriber interface {
	// process transcribes the audio file at filename using opts and returns
	// the transcript.
	process(ctx context.Context, filename string, opts TranscribeOptions) (string, error)
}

// whisperTranscriber is the transcriber for the Whisper backend. It holds no
// state.
type whisperTranscriber struct{}

// httpClient is the single method of an HTTP client that this package needs.
// *http.Client satisfies it, and tests can substitute their own
// implementation.
type httpClient interface {
	Do(req *http.Request) (*http.Response, error)
}

// fileOpener opens the named file for reading. It exists so that file access
// can be replaced in tests.
type fileOpener func(name string) (io.ReadCloser, error)
20 | 


--------------------------------------------------------------------------------
/internal/transcribe/whisper.go:
--------------------------------------------------------------------------------
  1 | package transcribe
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"context"
  6 | 	"encoding/json"
  7 | 	"fmt"
  8 | 	"io"
  9 | 	"mime/multipart"
 10 | 	"net/http"
 11 | 	"os"
 12 | 	"path/filepath"
 13 | )
 14 | 
 15 | func processWhisper(ctx context.Context, filename string, opts TranscribeOptions, client httpClient, opener fileOpener) (string, error) {
 16 | 	if client == nil {
 17 | 		client = &http.Client{}
 18 | 	}
 19 | 	if opener == nil {
 20 | 		opener = func(name string) (io.ReadCloser, error) {
 21 | 			return os.Open(name)
 22 | 		}
 23 | 	}
 24 | 
 25 | 	file, err := opener(filename)
 26 | 	if err != nil {
 27 | 		return "", fmt.Errorf("failed to open file: %w", err)
 28 | 	}
 29 | 	defer func() {
 30 | 		if cerr := file.Close(); cerr != nil {
 31 | 			err = fmt.Errorf("failed to close file: %v (original error: %w)", cerr, err)
 32 | 		}
 33 | 	}()
 34 | 
 35 | 	body := &bytes.Buffer{}
 36 | 	writer := multipart.NewWriter(body)
 37 | 
 38 | 	part, err := writer.CreateFormFile("file", filepath.Base(filename))
 39 | 	if err != nil {
 40 | 		return "", fmt.Errorf("failed to create form file: %w", err)
 41 | 	}
 42 | 	if _, err = io.Copy(part, file); err != nil {
 43 | 		return "", fmt.Errorf("failed to copy file content: %w", err)
 44 | 	}
 45 | 
 46 | 	if err = writeFields(writer, opts); err != nil {
 47 | 		return "", err
 48 | 	}
 49 | 
 50 | 	if err = writer.Close(); err != nil {
 51 | 		return "", fmt.Errorf("failed to close writer: %w", err)
 52 | 	}
 53 | 
 54 | 	req, err := http.NewRequestWithContext(ctx, "POST", "https://api.openai.com/v1/audio/transcriptions", body)
 55 | 	if err != nil {
 56 | 		return "", fmt.Errorf("failed to create request: %w", err)
 57 | 	}
 58 | 
 59 | 	req.Header.Set("Authorization", "Bearer "+opts.WhisperAPIKey)
 60 | 	req.Header.Set("Content-Type", writer.FormDataContentType())
 61 | 
 62 | 	resp, err := client.Do(req)
 63 | 	if err != nil {
 64 | 		return "", fmt.Errorf("failed to send request: %w", err)
 65 | 	}
 66 | 	defer func() {
 67 | 		if cerr := resp.Body.Close(); cerr != nil {
 68 | 			err = fmt.Errorf("failed to close response body: %v (original error: %w)", cerr, err)
 69 | 		}
 70 | 	}()
 71 | 
 72 | 	if resp.StatusCode != http.StatusOK {
 73 | 		bodyBytes, _ := io.ReadAll(resp.Body)
 74 | 		return "", fmt.Errorf("API request failed with status code %d: %s", resp.StatusCode, string(bodyBytes))
 75 | 	}
 76 | 
 77 | 	var result struct {
 78 | 		Text string `json:"text"`
 79 | 	}
 80 | 	if err = json.NewDecoder(resp.Body).Decode(&result); err != nil {
 81 | 		return "", fmt.Errorf("failed to decode response: %w", err)
 82 | 	}
 83 | 
 84 | 	return result.Text, nil
 85 | }
 86 | 
 87 | func writeFields(writer *multipart.Writer, opts TranscribeOptions) error {
 88 | 	fields := map[string]string{
 89 | 		"model":    opts.WhisperModel,
 90 | 		"language": opts.LanguageCode,
 91 | 		"prompt":   opts.WhisperPrompt,
 92 | 	}
 93 | 
 94 | 	for key, value := range fields {
 95 | 		if value != "" {
 96 | 			if err := writer.WriteField(key, value); err != nil {
 97 | 				return fmt.Errorf("failed to write %s field: %w", key, err)
 98 | 			}
 99 | 		}
100 | 	}
101 | 
102 | 	return nil
103 | }
104 | 
105 | func (wt *whisperTranscriber) process(ctx context.Context, filename string, opts TranscribeOptions) (string, error) {
106 | 	return processWhisper(ctx, filename, opts, nil, nil)
107 | }
108 | 


--------------------------------------------------------------------------------
/internal/transcribe/whisper_test.go:
--------------------------------------------------------------------------------
  1 | package transcribe
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"context"
  6 | 	"errors"
  7 | 	"io"
  8 | 	"mime/multipart"
  9 | 	"net/http"
 10 | 	"os"
 11 | 	"path/filepath"
 12 | 	"testing"
 13 | )
 14 | 
 15 | type mockHTTPClient struct {
 16 | 	DoFunc func(req *http.Request) (*http.Response, error)
 17 | }
 18 | 
 19 | func (m *mockHTTPClient) Do(req *http.Request) (*http.Response, error) {
 20 | 	return m.DoFunc(req)
 21 | }
 22 | 
 23 | type mockFile struct {
 24 | 	*bytes.Reader
 25 | }
 26 | 
 27 | func (m *mockFile) Close() error {
 28 | 	return nil
 29 | }
 30 | 
 31 | func Test_processWhisper(t *testing.T) {
 32 | 	/* mocks */
 33 | 
 34 | 	tempDir, err := os.MkdirTemp("", "whisper_test")
 35 | 	if err != nil {
 36 | 		t.Fatalf("failed to create temp dir: %v", err)
 37 | 	}
 38 | 	defer os.RemoveAll(tempDir)
 39 | 
 40 | 	testFilePath := filepath.Join(tempDir, "test.mp3")
 41 | 	if err := os.WriteFile(testFilePath, []byte("dummy audio content"), 0644); err != nil {
 42 | 		t.Fatalf("failed to create test file: %v", err)
 43 | 	}
 44 | 
 45 | 	unreadableFilePath := filepath.Join(tempDir, "unreadable.mp3")
 46 | 	if err := os.WriteFile(unreadableFilePath, []byte("unreadable content"), 0000); err != nil {
 47 | 		t.Fatalf("failed to create unreadable file: %v", err)
 48 | 	}
 49 | 
 50 | 	successfulMockClient := &mockHTTPClient{
 51 | 		DoFunc: func(req *http.Request) (*http.Response, error) {
 52 | 			return &http.Response{
 53 | 				StatusCode: 200,
 54 | 				Body: io.NopCloser(bytes.NewBufferString(`{
 55 | 					"text": "this is a test transcription."
 56 | 				}`)),
 57 | 			}, nil
 58 | 		},
 59 | 	}
 60 | 
 61 | 	errorMockClient := &mockHTTPClient{
 62 | 		DoFunc: func(req *http.Request) (*http.Response, error) {
 63 | 			return nil, errors.New("mock HTTP error")
 64 | 		},
 65 | 	}
 66 | 
 67 | 	badResponseMockClient := &mockHTTPClient{
 68 | 		DoFunc: func(req *http.Request) (*http.Response, error) {
 69 | 			return &http.Response{
 70 | 				StatusCode: 400,
 71 | 				Body:       io.NopCloser(bytes.NewBufferString(`{"error": "Bad Request"}`)),
 72 | 			}, nil
 73 | 		},
 74 | 	}
 75 | 
 76 | 	invalidJSONMockClient := &mockHTTPClient{
 77 | 		DoFunc: func(req *http.Request) (*http.Response, error) {
 78 | 			return &http.Response{
 79 | 				StatusCode: 200,
 80 | 				Body:       io.NopCloser(bytes.NewBufferString(`invalid JSON`)),
 81 | 			}, nil
 82 | 		},
 83 | 	}
 84 | 
 85 | 	successfulMockFileOpener := func(name string) (io.ReadCloser, error) {
 86 | 		return &mockFile{bytes.NewReader([]byte("file content"))}, nil
 87 | 	}
 88 | 
 89 | 	errorMockFileOpener := func(name string) (io.ReadCloser, error) {
 90 | 		return nil, errors.New("file open error")
 91 | 	}
 92 | 
 93 | 	type args struct {
 94 | 		ctx      context.Context
 95 | 		filename string
 96 | 		opts     TranscribeOptions
 97 | 		client   httpClient
 98 | 		opener   func(name string) (io.ReadCloser, error)
 99 | 	}
100 | 
101 | 	tests := []struct {
102 | 		name    string
103 | 		args    args
104 | 		want    string
105 | 		wantErr bool
106 | 	}{
107 | 		{
108 | 			name: "successful transcription",
109 | 			args: args{
110 | 				ctx:      context.Background(),
111 | 				filename: testFilePath,
112 | 				opts: TranscribeOptions{
113 | 					WhisperAPIKey: "test-api-key",
114 | 					WhisperModel:  "test-model",
115 | 					LanguageCode:  "en-US",
116 | 					WhisperPrompt: "test-prompt",
117 | 				},
118 | 				client: successfulMockClient,
119 | 			},
120 | 			want:    "this is a test transcription.",
121 | 			wantErr: false,
122 | 		},
123 | 		{
124 | 			name: "file open error",
125 | 			args: args{
126 | 				ctx:      context.Background(),
127 | 				filename: "non-existent-file.mp3",
128 | 				opts:     TranscribeOptions{},
129 | 				client:   successfulMockClient,
130 | 			},
131 | 			want:    "",
132 | 			wantErr: true,
133 | 		},
134 | 		{
135 | 			name: "file read error",
136 | 			args: args{
137 | 				ctx:      context.Background(),
138 | 				filename: unreadableFilePath,
139 | 				opts:     TranscribeOptions{},
140 | 				client:   successfulMockClient,
141 | 			},
142 | 			want:    "",
143 | 			wantErr: true,
144 | 		},
145 | 		{
146 | 			name: "HTTP client error",
147 | 			args: args{
148 | 				ctx:      context.Background(),
149 | 				filename: testFilePath,
150 | 				opts:     TranscribeOptions{},
151 | 				client:   errorMockClient,
152 | 			},
153 | 			want:    "",
154 | 			wantErr: true,
155 | 		},
156 | 		{
157 | 			name: "bad response from API",
158 | 			args: args{
159 | 				ctx:      context.Background(),
160 | 				filename: testFilePath,
161 | 				opts:     TranscribeOptions{},
162 | 				client:   badResponseMockClient,
163 | 			},
164 | 			want:    "",
165 | 			wantErr: true,
166 | 		},
167 | 		{
168 | 			name: "invalid JSON response",
169 | 			args: args{
170 | 				ctx:      context.Background(),
171 | 				filename: testFilePath,
172 | 				opts:     TranscribeOptions{},
173 | 				client:   invalidJSONMockClient,
174 | 			},
175 | 			want:    "",
176 | 			wantErr: true,
177 | 		},
178 | 		{
179 | 			name: "file open error",
180 | 			args: args{
181 | 				ctx:      context.Background(),
182 | 				filename: "test.mp3",
183 | 				opts:     TranscribeOptions{},
184 | 				client:   successfulMockClient,
185 | 				opener:   errorMockFileOpener,
186 | 			},
187 | 			want:    "",
188 | 			wantErr: true,
189 | 		},
190 | 		{
191 | 			name: "HTTP client error",
192 | 			args: args{
193 | 				ctx:      context.Background(),
194 | 				filename: "test.mp3",
195 | 				opts:     TranscribeOptions{},
196 | 				client:   errorMockClient,
197 | 				opener:   successfulMockFileOpener,
198 | 			},
199 | 			want:    "",
200 | 			wantErr: true,
201 | 		},
202 | 	}
203 | 
204 | 	for _, tt := range tests {
205 | 		t.Run(tt.name, func(t *testing.T) {
206 | 			got, err := processWhisper(tt.args.ctx, tt.args.filename, tt.args.opts, tt.args.client, tt.args.opener)
207 | 			if (err != nil) != tt.wantErr {
208 | 				t.Errorf("processWhisper() error = %v, wantErr %v", err, tt.wantErr)
209 | 				return
210 | 			}
211 | 			if got != tt.want {
212 | 				t.Errorf("processWhisper() = %v, want %v", got, tt.want)
213 | 			}
214 | 		})
215 | 	}
216 | }
217 | 
218 | func Test_writeFields(t *testing.T) {
219 | 	tests := []struct {
220 | 		name    string
221 | 		opts    TranscribeOptions
222 | 		wantErr bool
223 | 	}{
224 | 		{
225 | 			name: "all fields present",
226 | 			opts: TranscribeOptions{
227 | 				WhisperModel:  "test-model",
228 | 				LanguageCode:  "en-US",
229 | 				WhisperPrompt: "test-prompt",
230 | 			},
231 | 			wantErr: false,
232 | 		},
233 | 	}
234 | 
235 | 	for _, tt := range tests {
236 | 		t.Run(tt.name, func(t *testing.T) {
237 | 			writer := multipart.NewWriter(&bytes.Buffer{})
238 | 			if err := writeFields(writer, tt.opts); (err != nil) != tt.wantErr {
239 | 				t.Errorf("writeFields() error = %v, wantErr %v", err, tt.wantErr)
240 | 			}
241 | 		})
242 | 	}
243 | }
244 | 


--------------------------------------------------------------------------------
/internal/utils/gcs/gcs_utils.go:
--------------------------------------------------------------------------------
  1 | package gcs
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 
 11 | 	speech "cloud.google.com/go/speech/apiv1"
 12 | 	"cloud.google.com/go/speech/apiv1/speechpb"
 13 | 	"cloud.google.com/go/storage"
 14 | 	"github.com/mmatongo/chew/v1/internal/common"
 15 | 	"google.golang.org/api/option"
 16 | )
 17 | 
 18 | func UploadToGCS(ctx context.Context, client *storage.Client, bucket, filename string) (string, error) {
 19 | 	f, err := os.Open(filename)
 20 | 	if err != nil {
 21 | 		return "", fmt.Errorf("failed to open file: %w", err)
 22 | 	}
 23 | 	defer func() {
 24 | 		if cerr := f.Close(); cerr != nil {
 25 | 			err = errors.Join(err, fmt.Errorf("failed to close file: %w", cerr))
 26 | 		}
 27 | 	}()
 28 | 
 29 | 	objectName := filepath.Base(filename)
 30 | 	w := client.Bucket(bucket).Object(objectName).NewWriter(ctx)
 31 | 	if _, err = io.Copy(w, f); err != nil {
 32 | 		return "", fmt.Errorf("failed to copy file to GCS: %w", err)
 33 | 	}
 34 | 	if err := w.Close(); err != nil {
 35 | 		return "", fmt.Errorf("failed to close GCS writer: %w", err)
 36 | 	}
 37 | 
 38 | 	return fmt.Sprintf("gs://%s/%s", bucket, objectName), nil
 39 | }
 40 | 
 41 | func DeleteFromGCS(ctx context.Context, client *storage.Client, bucket, objectName string) error {
 42 | 	if err := client.Bucket(bucket).Object(objectName).Delete(ctx); err != nil {
 43 | 		return fmt.Errorf("failed to delete object from GCS: %w", err)
 44 | 	}
 45 | 	return nil
 46 | }
 47 | 
 48 | func NewStorageClient(ctx context.Context, opts common.TranscribeOptions) (*storage.Client, error) {
 49 | 	var clientOpts []option.ClientOption
 50 | 	if opts.CredentialsJSON != nil {
 51 | 		clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON))
 52 | 	}
 53 | 	return storage.NewClient(ctx, clientOpts...)
 54 | }
 55 | 
 56 | func NewSpeechClient(ctx context.Context, opts common.TranscribeOptions) (*speech.Client, error) {
 57 | 	var clientOpts []option.ClientOption
 58 | 	if opts.CredentialsJSON != nil {
 59 | 		clientOpts = append(clientOpts, option.WithCredentialsJSON(opts.CredentialsJSON))
 60 | 	}
 61 | 	return speech.NewClient(ctx, clientOpts...)
 62 | }
 63 | 
 64 | func NewRecognitionRequest(opts common.TranscribeOptions, audioInfo *speechpb.RecognitionConfig, gcsURI string) *speechpb.LongRunningRecognizeRequest {
 65 | 	diarizationConfig := &speechpb.SpeakerDiarizationConfig{
 66 | 		EnableSpeakerDiarization: opts.EnableDiarization,
 67 | 		MinSpeakerCount:          int32(opts.MinSpeakers),
 68 | 		MaxSpeakerCount:          int32(opts.MaxSpeakers),
 69 | 	}
 70 | 
 71 | 	return &speechpb.LongRunningRecognizeRequest{
 72 | 		Config: &speechpb.RecognitionConfig{
 73 | 			Encoding:                   audioInfo.Encoding,
 74 | 			SampleRateHertz:            audioInfo.SampleRateHertz,
 75 | 			AudioChannelCount:          audioInfo.AudioChannelCount,
 76 | 			LanguageCode:               opts.LanguageCode,
 77 | 			EnableAutomaticPunctuation: true,
 78 | 			UseEnhanced:                true,
 79 | 			EnableWordConfidence:       true,
 80 | 			Model:                      "latest_long",
 81 | 			DiarizationConfig:          diarizationConfig,
 82 | 		},
 83 | 		Audio: &speechpb.RecognitionAudio{
 84 | 			AudioSource: &speechpb.RecognitionAudio_Uri{
 85 | 				Uri: gcsURI,
 86 | 			},
 87 | 		},
 88 | 	}
 89 | }
 90 | 
 91 | func ExtractTranscript(resp *speechpb.LongRunningRecognizeResponse) string {
 92 | 	var transcript string
 93 | 	for _, result := range resp.Results {
 94 | 		for _, alt := range result.Alternatives {
 95 | 			transcript += alt.Transcript
 96 | 		}
 97 | 	}
 98 | 	return transcript
 99 | }
100 | 


--------------------------------------------------------------------------------
/internal/utils/gcs/gcs_utils_test.go:
--------------------------------------------------------------------------------
  1 | package gcs
  2 | 
  3 | import (
  4 | 	"context"
  5 | 	"fmt"
  6 | 	"net/http"
  7 | 	"net/http/httptest"
  8 | 	"os"
  9 | 	"path/filepath"
 10 | 	"reflect"
 11 | 	"strings"
 12 | 	"testing"
 13 | 
 14 | 	speech "cloud.google.com/go/speech/apiv1"
 15 | 	"cloud.google.com/go/speech/apiv1/speechpb"
 16 | 	"cloud.google.com/go/storage"
 17 | 	"github.com/mmatongo/chew/v1/internal/common"
 18 | 	"google.golang.org/api/option"
 19 | )
 20 | 
 21 | func Test_extractTranscript(t *testing.T) {
 22 | 	type args struct {
 23 | 		resp *speechpb.LongRunningRecognizeResponse
 24 | 	}
 25 | 	tests := []struct {
 26 | 		name string
 27 | 		args args
 28 | 		want string
 29 | 	}{
 30 | 		{
 31 | 			name: "empty response",
 32 | 			args: args{
 33 | 				resp: &speechpb.LongRunningRecognizeResponse{},
 34 | 			},
 35 | 			want: "",
 36 | 		},
 37 | 		{
 38 | 			name: "response with no results",
 39 | 			args: args{
 40 | 				resp: &speechpb.LongRunningRecognizeResponse{
 41 | 					Results: []*speechpb.SpeechRecognitionResult{},
 42 | 				},
 43 | 			},
 44 | 			want: "",
 45 | 		},
 46 | 		{
 47 | 			name: "response with no alternatives",
 48 | 			args: args{
 49 | 				resp: &speechpb.LongRunningRecognizeResponse{
 50 | 					Results: []*speechpb.SpeechRecognitionResult{
 51 | 						{},
 52 | 					},
 53 | 				},
 54 | 			},
 55 | 		},
 56 | 		{
 57 | 			name: "response with result and alternative",
 58 | 			args: args{
 59 | 				resp: &speechpb.LongRunningRecognizeResponse{
 60 | 					Results: []*speechpb.SpeechRecognitionResult{
 61 | 						{
 62 | 							Alternatives: []*speechpb.SpeechRecognitionAlternative{
 63 | 								{
 64 | 									Transcript: "hello world",
 65 | 								},
 66 | 							},
 67 | 						},
 68 | 					},
 69 | 				},
 70 | 			},
 71 | 			want: "hello world",
 72 | 		},
 73 | 		{
 74 | 			name: "response with multiple results and alternatives",
 75 | 			args: args{
 76 | 				resp: &speechpb.LongRunningRecognizeResponse{
 77 | 					Results: []*speechpb.SpeechRecognitionResult{
 78 | 						{
 79 | 							Alternatives: []*speechpb.SpeechRecognitionAlternative{
 80 | 								{
 81 | 									Transcript: "hello world",
 82 | 									Confidence: 0.9,
 83 | 								},
 84 | 								{
 85 | 									Transcript: "hello world",
 86 | 									Confidence: 0.8,
 87 | 								},
 88 | 							},
 89 | 						},
 90 | 						{
 91 | 							Alternatives: []*speechpb.SpeechRecognitionAlternative{
 92 | 								{
 93 | 									Transcript: "hello world",
 94 | 									Confidence: 0.7,
 95 | 								},
 96 | 								{
 97 | 									Transcript: "hello world",
 98 | 									Confidence: 0.6,
 99 | 								},
100 | 							},
101 | 						},
102 | 					},
103 | 				},
104 | 			},
105 | 			want: "hello worldhello worldhello worldhello world",
106 | 		},
107 | 	}
108 | 	for _, tt := range tests {
109 | 		t.Run(tt.name, func(t *testing.T) {
110 | 			if got := ExtractTranscript(tt.args.resp); got != tt.want {
111 | 				t.Errorf("extractTranscript() = %v, want %v", got, tt.want)
112 | 			}
113 | 		})
114 | 	}
115 | }
116 | 
117 | func Test_newRecognitionRequest(t *testing.T) {
118 | 	type args struct {
119 | 		opts      common.TranscribeOptions
120 | 		audioInfo *speechpb.RecognitionConfig
121 | 		gcsURI    string
122 | 	}
123 | 	tests := []struct {
124 | 		name string
125 | 		args args
126 | 		want *speechpb.LongRunningRecognizeRequest
127 | 	}{
128 | 		{
129 | 			name: "create recognition request",
130 | 			args: args{
131 | 				opts: common.TranscribeOptions{
132 | 					EnableDiarization: true,
133 | 					MinSpeakers:       1,
134 | 					MaxSpeakers:       2,
135 | 					LanguageCode:      "en-US",
136 | 				},
137 | 				audioInfo: &speechpb.RecognitionConfig{
138 | 					Encoding:          speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
139 | 					SampleRateHertz:   44100,
140 | 					AudioChannelCount: 2,
141 | 				},
142 | 				gcsURI: "gs://bucket/object",
143 | 			},
144 | 			want: &speechpb.LongRunningRecognizeRequest{
145 | 				Config: &speechpb.RecognitionConfig{
146 | 					Encoding:                   speechpb.RecognitionConfig_ENCODING_UNSPECIFIED,
147 | 					SampleRateHertz:            44100,
148 | 					AudioChannelCount:          2,
149 | 					LanguageCode:               "en-US",
150 | 					EnableAutomaticPunctuation: true,
151 | 					UseEnhanced:                true,
152 | 					EnableWordConfidence:       true,
153 | 					Model:                      "latest_long",
154 | 					DiarizationConfig: &speechpb.SpeakerDiarizationConfig{
155 | 						EnableSpeakerDiarization: true,
156 | 						MinSpeakerCount:          1,
157 | 						MaxSpeakerCount:          2,
158 | 					},
159 | 				},
160 | 				Audio: &speechpb.RecognitionAudio{
161 | 					AudioSource: &speechpb.RecognitionAudio_Uri{
162 | 						Uri: "gs://bucket/object",
163 | 					},
164 | 				},
165 | 			},
166 | 		},
167 | 	}
168 | 	for _, tt := range tests {
169 | 		t.Run(tt.name, func(t *testing.T) {
170 | 			if got := NewRecognitionRequest(tt.args.opts, tt.args.audioInfo, tt.args.gcsURI); !reflect.DeepEqual(got, tt.want) {
171 | 				t.Errorf("createRecognitionRequest() = %v, want %v", got, tt.want)
172 | 			}
173 | 		})
174 | 	}
175 | }
176 | 
/*
The two client-constructor tests below (speech and storage) only cover the error
path: every case expects the constructor to return an error, for example because
the credentials JSON is empty. The constructors are not written in a way that
allows the GCP client libraries to be mocked, so the success path is not tested.
This is a limitation of the current implementation and should be refactored in
the future.
*/
182 | 
183 | func Test_newSpeechClient(t *testing.T) {
184 | 	type args struct {
185 | 		ctx  context.Context
186 | 		opts common.TranscribeOptions
187 | 	}
188 | 	tests := []struct {
189 | 		name    string
190 | 		args    args
191 | 		want    *speech.Client
192 | 		wantErr bool
193 | 	}{
194 | 		{
195 | 			name: "create speech client",
196 | 			args: args{
197 | 				ctx: context.Background(),
198 | 				opts: common.TranscribeOptions{
199 | 					CredentialsJSON: nil,
200 | 				},
201 | 			},
202 | 			want:    nil,
203 | 			wantErr: true,
204 | 		},
205 | 		{
206 | 			/*
207 | 				This test case is expected to fail because the credentials JSON is empty.
208 | 
209 | 				TODO: Refactor to allow for mocking of the speech.NewClient function.
210 | 			*/
211 | 			name: "create speech client with credentials",
212 | 			args: args{
213 | 				ctx: context.Background(),
214 | 				opts: common.TranscribeOptions{
215 | 					CredentialsJSON: []byte(""),
216 | 				},
217 | 			},
218 | 			want:    nil,
219 | 			wantErr: true,
220 | 		},
221 | 	}
222 | 	for _, tt := range tests {
223 | 		t.Run(tt.name, func(t *testing.T) {
224 | 			got, err := NewSpeechClient(tt.args.ctx, tt.args.opts)
225 | 			if (err != nil) != tt.wantErr {
226 | 				t.Errorf("createSpeechClient() error = %v, wantErr %v", err, tt.wantErr)
227 | 				return
228 | 			}
229 | 			if !reflect.DeepEqual(got, tt.want) {
230 | 				t.Errorf("createSpeechClient() = %v, want %v", got, tt.want)
231 | 			}
232 | 		})
233 | 	}
234 | }
235 | 
236 | func Test_createStorageClient(t *testing.T) {
237 | 	type args struct {
238 | 		ctx  context.Context
239 | 		opts common.TranscribeOptions
240 | 	}
241 | 	tests := []struct {
242 | 		name    string
243 | 		args    args
244 | 		want    *storage.Client
245 | 		wantErr bool
246 | 	}{
247 | 		{
248 | 			name: "create storage client",
249 | 			args: args{
250 | 				ctx: context.Background(),
251 | 				opts: common.TranscribeOptions{
252 | 					CredentialsJSON: nil,
253 | 				},
254 | 			},
255 | 			want:    nil,
256 | 			wantErr: true,
257 | 		},
258 | 		{
259 | 			/*
260 | 				This test case is expected to fail because the credentials JSON is empty.
261 | 				This does not affect the functionality of the createStorageClient function.
262 | 
263 | 				TODO: Refactor to allow for mocking of the storage.NewClient function.
264 | 			*/
265 | 
266 | 			name: "create storage client with credentials",
267 | 			args: args{
268 | 				ctx: context.Background(),
269 | 				opts: common.TranscribeOptions{
270 | 					CredentialsJSON: []byte(""),
271 | 				},
272 | 			},
273 | 			want:    nil,
274 | 			wantErr: true,
275 | 		},
276 | 	}
277 | 	for _, tt := range tests {
278 | 		t.Run(tt.name, func(t *testing.T) {
279 | 			got, err := NewStorageClient(tt.args.ctx, tt.args.opts)
280 | 			if (err != nil) != tt.wantErr {
281 | 				t.Errorf("createStorageClient() error = %v, wantErr %v", err, tt.wantErr)
282 | 				return
283 | 			}
284 | 			if !reflect.DeepEqual(got, tt.want) {
285 | 				t.Errorf("createStorageClient() = %v, want %v", got, tt.want)
286 | 			}
287 | 		})
288 | 	}
289 | }
290 | 
291 | func Test_uploadToGCS(t *testing.T) {
292 | 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
293 | 		if r.Method == "POST" && strings.Contains(r.URL.Path, "/upload/storage/v1/b/") {
294 | 			w.WriteHeader(http.StatusOK)
295 | 			fmt.Fprintf(w, `{"name": "uploaded-object"}`)
296 | 		} else {
297 | 			http.Error(w, "Unexpected request", http.StatusBadRequest)
298 | 		}
299 | 	}))
300 | 	defer server.Close()
301 | 
302 | 	client, err := storage.NewClient(context.Background(), option.WithEndpoint(server.URL), option.WithHTTPClient(server.Client()))
303 | 	if err != nil {
304 | 		t.Fatalf("failed to create test client: %v", err)
305 | 	}
306 | 	defer client.Close()
307 | 
308 | 	tempFile, err := os.CreateTemp("", "test-file-*.txt")
309 | 	if err != nil {
310 | 		t.Fatalf("failed to create temp file: %v", err)
311 | 	}
312 | 	defer os.Remove(tempFile.Name())
313 | 
314 | 	content := []byte("test content")
315 | 	if _, err := tempFile.Write(content); err != nil {
316 | 		t.Fatalf("failed to write to temp file: %v", err)
317 | 	}
318 | 	tempFile.Close()
319 | 
320 | 	tests := []struct {
321 | 		name     string
322 | 		bucket   string
323 | 		filename string
324 | 		want     string
325 | 		wantErr  bool
326 | 	}{
327 | 		{
328 | 			name:     "successful upload",
329 | 			bucket:   "test-bucket",
330 | 			filename: tempFile.Name(),
331 | 			want:     fmt.Sprintf("gs://test-bucket/%s", filepath.Base(tempFile.Name())),
332 | 			wantErr:  false,
333 | 		},
334 | 		{
335 | 			name:     "non-existent file",
336 | 			bucket:   "test-bucket",
337 | 			filename: "file.txt",
338 | 			want:     "",
339 | 			wantErr:  true,
340 | 		},
341 | 		{
342 | 			name:     "empty filename",
343 | 			bucket:   "test-bucket",
344 | 			filename: "",
345 | 			want:     "",
346 | 			wantErr:  true,
347 | 		},
348 | 	}
349 | 
350 | 	for _, tt := range tests {
351 | 		t.Run(tt.name, func(t *testing.T) {
352 | 			got, err := UploadToGCS(context.Background(), client, tt.bucket, tt.filename)
353 | 			if (err != nil) != tt.wantErr {
354 | 				t.Errorf("uploadToGCS() error = %v, wantErr %v", err, tt.wantErr)
355 | 				return
356 | 			}
357 | 			if got != tt.want {
358 | 				t.Errorf("uploadToGCS() = %v, want %v", got, tt.want)
359 | 			}
360 | 		})
361 | 	}
362 | }
363 | 
364 | func Test_deleteFromGCS(t *testing.T) {
365 | 	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
366 | 		if r.Method == "DELETE" && r.URL.Path == "/b/test-bucket/o/test-object.txt" {
367 | 			w.WriteHeader(http.StatusOK)
368 | 		} else {
369 | 			http.Error(w, "Unexpected request", http.StatusBadRequest)
370 | 		}
371 | 	}))
372 | 	defer server.Close()
373 | 
374 | 	client, err := storage.NewClient(context.Background(),
375 | 		option.WithEndpoint(server.URL),
376 | 		option.WithHTTPClient(server.Client()),
377 | 		option.WithoutAuthentication())
378 | 	if err != nil {
379 | 		t.Fatalf("failed to create test client: %v", err)
380 | 	}
381 | 	defer client.Close()
382 | 
383 | 	tests := []struct {
384 | 		name       string
385 | 		bucket     string
386 | 		objectName string
387 | 		wantErr    bool
388 | 	}{
389 | 		{
390 | 			name:       "successful delete",
391 | 			bucket:     "test-bucket",
392 | 			objectName: "test-object.txt",
393 | 			wantErr:    false,
394 | 		},
395 | 		{
396 | 			name:       "empty object name",
397 | 			bucket:     "test-bucket",
398 | 			objectName: "",
399 | 			wantErr:    true,
400 | 		},
401 | 		{
402 | 			name:       "empty bucket name",
403 | 			bucket:     "",
404 | 			objectName: "test-object.txt",
405 | 			wantErr:    true,
406 | 		},
407 | 	}
408 | 
409 | 	for _, tt := range tests {
410 | 		t.Run(tt.name, func(t *testing.T) {
411 | 			err := DeleteFromGCS(context.Background(), client, tt.bucket, tt.objectName)
412 | 			if (err != nil) != tt.wantErr {
413 | 				t.Errorf("deleteFromGCS() error = %v, wantErr %v", err, tt.wantErr)
414 | 				return
415 | 			}
416 | 		})
417 | 	}
418 | }
419 | 


--------------------------------------------------------------------------------
/internal/utils/utils.go:
--------------------------------------------------------------------------------
  1 | package utils
  2 | 
  3 | import (
  4 | 	"archive/zip"
  5 | 	"encoding/xml"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"mime"
  9 | 	"net/url"
 10 | 	"os"
 11 | 	"path/filepath"
 12 | 	"regexp"
 13 | 	"strings"
 14 | )
 15 | 
 16 | func GetFileExtension(rawURL string) (string, error) {
 17 | 	u, err := url.Parse(rawURL)
 18 | 	if err != nil {
 19 | 		return "", fmt.Errorf("invalid URL or file path: %w", err)
 20 | 	}
 21 | 
 22 | 	var pathToCheck string
 23 | 	if u.Scheme == "" || u.Scheme == "file" {
 24 | 		pathToCheck = rawURL
 25 | 		if u.Scheme == "file" {
 26 | 			pathToCheck = u.Path
 27 | 		}
 28 | 	} else {
 29 | 		pathToCheck = u.Path
 30 | 	}
 31 | 
 32 | 	ext := filepath.Ext(pathToCheck)
 33 | 	if ext == "" {
 34 | 		return "", fmt.Errorf("no file extension found in %q", rawURL)
 35 | 	}
 36 | 
 37 | 	return ext, nil
 38 | }
 39 | 
 40 | func GetFileContentType(file *os.File) string {
 41 | 	return mime.TypeByExtension(filepath.Ext(file.Name()))
 42 | }
 43 | 
 44 | func ExtractTextFromXML(file *zip.File) ([]string, error) {
 45 | 	fileReader, err := file.Open()
 46 | 	if err != nil {
 47 | 		return nil, err
 48 | 	}
 49 | 	defer fileReader.Close()
 50 | 
 51 | 	decoder := xml.NewDecoder(fileReader)
 52 | 	var contents []string
 53 | 	var currentParagraph strings.Builder
 54 | 	inParagraph := false
 55 | 
 56 | 	for {
 57 | 		token, err := decoder.Token()
 58 | 		if err == io.EOF {
 59 | 			break
 60 | 		}
 61 | 		if err != nil {
 62 | 			return nil, err
 63 | 		}
 64 | 
 65 | 		switch element := token.(type) {
 66 | 		case xml.StartElement:
 67 | 			if element.Name.Local == "p" {
 68 | 				inParagraph = true
 69 | 				currentParagraph.Reset()
 70 | 			}
 71 | 		case xml.EndElement:
 72 | 			if element.Name.Local == "p" {
 73 | 				inParagraph = false
 74 | 				if trimmed := strings.TrimSpace(currentParagraph.String()); trimmed != "" {
 75 | 					contents = append(contents, trimmed)
 76 | 				}
 77 | 			}
 78 | 		case xml.CharData:
 79 | 			if inParagraph {
 80 | 				currentParagraph.Write(element)
 81 | 			}
 82 | 		}
 83 | 	}
 84 | 
 85 | 	return contents, nil
 86 | }
 87 | 
 88 | /*
 89 | Wondering if this is even necessary but I can see how it can be useful
 90 | as it also removes links, images, and code blocks.
 91 | 
 92 | I'm not sure if this is the best way to remove markdown syntax.
 93 | Inspired by https://github.com/mmatongo/site/blob/master/cmd/dnlm/helpers.go#L62-L87
 94 | */
 95 | 
 96 | /* RemoveMarkdownSyntax removes markdown syntax from a string */
 97 | func RemoveMarkdownSyntax(text string) string {
 98 | 	patterns := []string{
 99 | 		"(```[\\s\\S]*?```)",                      // Code blocks
100 | 		"(`[^`\n]+`)",                             // Inline code
101 | 		"!\\[([^\\]]*?)\\]\\(([^)]+)\\)",          // Images
102 | 		"\\[([^\\]]+)\\]\\(([^)]+)\\)",            // Links
103 | 		"(__|\\*\\*|_|\\*)(.+?)(__|\\*\\*|_|\\*)", // Bold and Italic
104 | 		"~~(.+?)~~",                               // Strikethrough
105 | 		"^#{1,6}\\s(.*)$",                         // Headers
106 | 		"^>\\s(.*)$",                              // Blockquotes
107 | 		"^-{3,}$",                                 // Horizontal rules
108 | 		"^\\s*[\\*\\-+]\\s+(.+)$",                 // Unordered lists
109 | 		"^\\s*\\d+\\.\\s+(.+)$",                   // Ordered lists
110 | 	}
111 | 
112 | 	for _, pattern := range patterns {
113 | 		re := regexp.MustCompile("(?m)" + pattern)
114 | 		switch {
115 | 		case strings.HasPrefix(pattern, "(```"):
116 | 			text = re.ReplaceAllString(text, "$1")
117 | 		case strings.HasPrefix(pattern, "(`"):
118 | 			text = re.ReplaceAllString(text, "$1")
119 | 		case strings.HasPrefix(pattern, "!\\["):
120 | 			text = re.ReplaceAllString(text, "$1 ($2)")
121 | 		case strings.HasPrefix(pattern, "\\["):
122 | 			text = re.ReplaceAllString(text, "$1 ($2)")
123 | 		case strings.Contains(pattern, "(__|\\*\\*|_|\\*)"):
124 | 			text = re.ReplaceAllString(text, "$2")
125 | 		case strings.Contains(pattern, "~~"):
126 | 			text = re.ReplaceAllString(text, "$1")
127 | 		case strings.HasPrefix(pattern, "^#"):
128 | 			text = re.ReplaceAllString(text, "$1")
129 | 		case strings.HasPrefix(pattern, "^>"):
130 | 			text = re.ReplaceAllString(text, "$1")
131 | 		case strings.HasPrefix(pattern, "^\\s*[\\*\\-+]"):
132 | 			text = re.ReplaceAllString(text, "$1")
133 | 		case strings.HasPrefix(pattern, "^\\s*\\d+"):
134 | 			text = re.ReplaceAllString(text, "$1")
135 | 		default:
136 | 			text = re.ReplaceAllString(text, "")
137 | 		}
138 | 	}
139 | 
140 | 	// Remove any remaining Markdown characters
141 | 	text = strings.NewReplacer(
142 | 		"*", "",
143 | 		"_", "",
144 | 		"`", "",
145 | 		"#", "",
146 | 		">", "",
147 | 		"+", "",
148 | 		"-", "",
149 | 	).Replace(text)
150 | 
151 | 	return strings.TrimSpace(text)
152 | }
153 | 
154 | func OpenFile(filePath string) (*os.File, error) {
155 | 	filePath = strings.TrimPrefix(filePath, "file://")
156 | 	return os.Open(filePath)
157 | }
158 | 


--------------------------------------------------------------------------------
/internal/utils/utils_test.go:
--------------------------------------------------------------------------------
  1 | package utils
  2 | 
  3 | import (
  4 | 	"archive/zip"
  5 | 	"bytes"
  6 | 	"os"
  7 | 	"path/filepath"
  8 | 	"reflect"
  9 | 	"testing"
 10 | )
 11 | 
 12 | func createMockZipFile(content string) *zip.File {
 13 | 	buf := new(bytes.Buffer)
 14 | 	w := zip.NewWriter(buf)
 15 | 
 16 | 	var files = []struct {
 17 | 		Name, Body string
 18 | 	}{
 19 | 		{"document.xml", content},
 20 | 	}
 21 | 	for _, file := range files {
 22 | 		f, err := w.Create(file.Name)
 23 | 		if err != nil {
 24 | 			panic(err)
 25 | 		}
 26 | 		_, err = f.Write([]byte(file.Body))
 27 | 		if err != nil {
 28 | 			panic(err)
 29 | 		}
 30 | 	}
 31 | 
 32 | 	err := w.Close()
 33 | 	if err != nil {
 34 | 		panic(err)
 35 | 	}
 36 | 
 37 | 	r, err := zip.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
 38 | 	if err != nil {
 39 | 		panic(err)
 40 | 	}
 41 | 
 42 | 	return r.File[0]
 43 | }
 44 | 
 45 | func TestRemoveMarkdownSyntax(t *testing.T) {
 46 | 	type args struct {
 47 | 		text string
 48 | 	}
 49 | 	tests := []struct {
 50 | 		name string
 51 | 		args args
 52 | 		want string
 53 | 	}{
 54 | 		{
 55 | 			name: "Test 1",
 56 | 			args: args{
 57 | 				text: "This is a **bold** text",
 58 | 			},
 59 | 			want: "This is a bold text",
 60 | 		},
 61 | 		{
 62 | 			name: "Test 2",
 63 | 			args: args{
 64 | 				text: "This is a *italic* text",
 65 | 			},
 66 | 			want: "This is a italic text",
 67 | 		},
 68 | 		{
 69 | 			name: "Test 3",
 70 | 			args: args{
 71 | 				text: "This is a [link](https://example.com) text",
 72 | 			},
 73 | 			want: "This is a link (https://example.com) text",
 74 | 		},
 75 | 		{
 76 | 			name: "Test 4",
 77 | 			args: args{
 78 | 				text: "This is a ![image](https://example.com/image.png) text",
 79 | 			},
 80 | 			want: "This is a image (https://example.com/image.png) text",
 81 | 		},
 82 | 	}
 83 | 	for _, tt := range tests {
 84 | 		t.Run(tt.name, func(t *testing.T) {
 85 | 			if got := RemoveMarkdownSyntax(tt.args.text); got != tt.want {
 86 | 				t.Errorf("RemoveMarkdownSyntax() = %v, want %v", got, tt.want)
 87 | 			}
 88 | 		})
 89 | 	}
 90 | }
 91 | 
 92 | func TestGetFileExtension(t *testing.T) {
 93 | 	type args struct {
 94 | 		rawUrl string
 95 | 	}
 96 | 	tests := []struct {
 97 | 		name    string
 98 | 		args    args
 99 | 		want    string
100 | 		wantErr bool
101 | 	}{
102 | 		{
103 | 			name: "Test 1",
104 | 			args: args{
105 | 				rawUrl: "https://example.com/test.csv",
106 | 			},
107 | 			want:    ".csv",
108 | 			wantErr: false,
109 | 		},
110 | 		{
111 | 			name: "Test 2",
112 | 			args: args{
113 | 				rawUrl: "",
114 | 			},
115 | 			want:    "",
116 | 			wantErr: true,
117 | 		},
118 | 		{
119 | 			name: "Test 3",
120 | 			args: args{
121 | 				rawUrl: "https://example.com/test",
122 | 			},
123 | 			want:    "",
124 | 			wantErr: true,
125 | 		},
126 | 		{
127 | 			name: "Test 4",
128 | 			args: args{
129 | 				rawUrl: "file:///test.csv",
130 | 			},
131 | 			want:    ".csv",
132 | 			wantErr: false,
133 | 		},
134 | 		{
135 | 			name: "Test 5",
136 | 			args: args{
137 | 				rawUrl: "file:///test",
138 | 			},
139 | 			want:    "",
140 | 			wantErr: true,
141 | 		},
142 | 		{
143 | 			name: "Test 6",
144 | 			args: args{
145 | 				rawUrl: string([]byte{0x01, 0x02, 0x03, 0x04, 0x05}),
146 | 			},
147 | 			want:    "",
148 | 			wantErr: true,
149 | 		},
150 | 	}
151 | 	for _, tt := range tests {
152 | 		t.Run(tt.name, func(t *testing.T) {
153 | 			got, err := GetFileExtension(tt.args.rawUrl)
154 | 			if (err != nil) != tt.wantErr {
155 | 				t.Errorf("GetFileExtensionFromUrl() error = %v, wantErr %v", err, tt.wantErr)
156 | 				return
157 | 			}
158 | 			if got != tt.want {
159 | 				t.Errorf("GetFileExtensionFromUrl() = %v, want %v", got, tt.want)
160 | 			}
161 | 		})
162 | 	}
163 | }
164 | 
165 | func TestExtractTextFromXML(t *testing.T) {
166 | 	type args struct {
167 | 		file *zip.File
168 | 	}
169 | 	tests := []struct {
170 | 		name    string
171 | 		args    args
172 | 		want    []string
173 | 		wantErr bool
174 | 	}{
175 | 		{
176 | 			name: "valid XML with paragraphs",
177 | 			args: args{
178 | 				file: createMockZipFile(`
179 | 					<?xml version="1.0" encoding="UTF-8"?>
180 | 					<document>
181 | 						<p>First paragraph</p>
182 | 						<p>Second paragraph</p>
183 | 						<p>Third paragraph</p>
184 | 					</document>
185 | 				`),
186 | 			},
187 | 			want:    []string{"First paragraph", "Second paragraph", "Third paragraph"},
188 | 			wantErr: false,
189 | 		},
190 | 		{
191 | 			name: "XML with empty paragraphs",
192 | 			args: args{
193 | 				file: createMockZipFile(`
194 | 					<?xml version="1.0" encoding="UTF-8"?>
195 | 					<document>
196 | 						<p>First paragraph</p>
197 | 						<p></p>
198 | 						<p>Third paragraph</p>
199 | 					</document>
200 | 				`),
201 | 			},
202 | 			want:    []string{"First paragraph", "Third paragraph"},
203 | 			wantErr: false,
204 | 		},
205 | 		{
206 | 			name: "invalid XML",
207 | 			args: args{
208 | 				file: createMockZipFile(`
209 | 					<?xml version="1.0" encoding="UTF-8"?>
210 | 					<document>
211 | 						<p>Unclosed paragraph
212 | 					</document>
213 | 				`),
214 | 			},
215 | 			want:    nil,
216 | 			wantErr: true,
217 | 		},
218 | 	}
219 | 	for _, tt := range tests {
220 | 		t.Run(tt.name, func(t *testing.T) {
221 | 			got, err := ExtractTextFromXML(tt.args.file)
222 | 			if (err != nil) != tt.wantErr {
223 | 				t.Errorf("ExtractTextFromXML() error = %v, wantErr %v", err, tt.wantErr)
224 | 				return
225 | 			}
226 | 			if !reflect.DeepEqual(got, tt.want) {
227 | 				t.Errorf("ExtractTextFromXML() = %v, want %v", got, tt.want)
228 | 			}
229 | 		})
230 | 	}
231 | }
232 | 
233 | func TestOpenFile(t *testing.T) {
234 | 	type args struct {
235 | 		filePath string
236 | 	}
237 | 	tests := []struct {
238 | 		name    string
239 | 		args    args
240 | 		want    *os.File
241 | 		wantErr bool
242 | 	}{
243 | 		{
244 | 			name: "valid file",
245 | 			args: args{
246 | 				filePath: "testdata/test.pdf",
247 | 			},
248 | 			want:    nil,
249 | 			wantErr: true,
250 | 		},
251 | 	}
252 | 	for _, tt := range tests {
253 | 		t.Run(tt.name, func(t *testing.T) {
254 | 			got, err := OpenFile(tt.args.filePath)
255 | 			if (err != nil) != tt.wantErr {
256 | 				t.Errorf("OpenFile() error = %v, wantErr %v", err, tt.wantErr)
257 | 				return
258 | 			}
259 | 			if !reflect.DeepEqual(got, tt.want) {
260 | 				t.Errorf("OpenFile() = %v, want %v", got, tt.want)
261 | 			}
262 | 		})
263 | 	}
264 | }
265 | 
266 | func TestGetFileContentType(t *testing.T) {
267 | 	tempDir := t.TempDir()
268 | 	testHTMLPath := filepath.Join(tempDir, "test.html")
269 | 
270 | 	err := os.WriteFile(testHTMLPath, []byte("html content"), 0644)
271 | 	if err != nil {
272 | 		t.Fatalf("failed to create test html file: %v", err)
273 | 	}
274 | 
275 | 	filepath, _ := OpenFile(testHTMLPath)
276 | 	type args struct {
277 | 		file *os.File
278 | 	}
279 | 	tests := []struct {
280 | 		name string
281 | 		args args
282 | 		want string
283 | 	}{
284 | 		{
285 | 			name: "Test 1",
286 | 			args: args{
287 | 				file: filepath,
288 | 			},
289 | 			want: "text/html; charset=utf-8",
290 | 		},
291 | 	}
292 | 	for _, tt := range tests {
293 | 		t.Run(tt.name, func(t *testing.T) {
294 | 			if got := GetFileContentType(tt.args.file); got != tt.want {
295 | 				t.Errorf("GetFileContentType() = %v, want %v", got, tt.want)
296 | 			}
297 | 		})
298 | 	}
299 | }
300 | 


--------------------------------------------------------------------------------
/testdata/audio/test.flac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.flac


--------------------------------------------------------------------------------
/testdata/audio/test.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.mp3


--------------------------------------------------------------------------------
/testdata/audio/test.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.ogg


--------------------------------------------------------------------------------
/testdata/audio/test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/audio/test.wav


--------------------------------------------------------------------------------
/testdata/files/test.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.epub


--------------------------------------------------------------------------------
/testdata/files/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mmatongo/chew/bf3ea330803108fd896caeb73a2103e03b0d5d1f/testdata/files/test.pdf


--------------------------------------------------------------------------------