├── .gitignore
├── go.mod
├── input
    └── Dracula.epub
├── go.sum
├── internal
    ├── book
    │   ├── epub.go
    │   ├── text-book.go
    │   └── epub-parser.go
    ├── consts
    │   └── consts.go
    ├── debug
    │   └── debug.go
    ├── str
    │   └── str.go
    ├── file
    │   └── file.go
    └── tts
    │   └── tts.go
├── main.go
├── LICENSE
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | output/
3 | .DS_Store


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module epub-tts
2 | 
3 | go 1.22.0
4 | 
5 | require golang.org/x/text v0.21.0
6 | 


--------------------------------------------------------------------------------
/input/Dracula.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rafael1mc/epub-tts/HEAD/input/Dracula.epub


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
2 | golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
3 | 


--------------------------------------------------------------------------------
/internal/book/epub.go:
--------------------------------------------------------------------------------
 1 | package book
 2 | 
// EpubSection is one spine entry of an ePUB: a single content document
// together with the title found for it in the table of contents.
type EpubSection struct {
	// ID is the manifest item id of the content document.
	ID          string `json:"id"`
	// Title is the table-of-contents label for the document; it is empty
	// when no label was found.
	Title       string `json:"title"`
	// HtmlContent is the raw content of the document as read from the
	// archive. Note that it is serialized under the JSON key "htmlString".
	HtmlContent string `json:"htmlString"`
}
 8 | 
// Epub is the parsed representation of an ePUB file.
type Epub struct {
	// Name is the book title taken from the NCX document.
	Name     string
	// Toc is initialized to an empty map by ParseEpub.
	// NOTE(review): nothing visible in this package fills it — confirm
	// whether it is still needed.
	Toc      map[string]string
	// Sections holds the content documents in spine (reading) order.
	Sections []EpubSection
}
14 | 


--------------------------------------------------------------------------------
/internal/consts/consts.go:
--------------------------------------------------------------------------------
 1 | package consts
 2 | 
// Run-time configuration of the tool.
const (
	// Perm is the permission mode passed to every directory and file the
	// tool creates.
	Perm          = 0777
	// InputFilePath is the ePUB file that gets converted.
	InputFilePath = "input/Dracula.epub"

	IsDryRun = false // if true, will generate text files, but not audio files
	IsDebug  = false // if true, will generate files for section json and html content as well

	SpeakProcessCompletion = true // if true, will say something at the end of the process
)
12 | 
// Names of the output directories. OutputRootDir is the top-level folder;
// the other three are sub-folder names created below a per-book folder
// inside it.
const (
	OutputRootDir  = "output"
	OutputTxtDir   = "txt"   // chapter text files
	OutputTmpDir   = "tmp"   // intermediate audio produced before conversion
	OutputDebugDir = "debug" // json/html dumps written when IsDebug is set
)
19 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"epub-tts/internal/book"
 5 | 	"epub-tts/internal/consts"
 6 | 	"epub-tts/internal/debug"
 7 | 	"epub-tts/internal/file"
 8 | 	"epub-tts/internal/tts"
 9 | 	"fmt"
10 | )
11 | 
12 | func main() {
13 | 	fmt.Println(" ---== Execution Started ==--- ")
14 | 
15 | 	epub, err := book.ParseEpub(consts.InputFilePath)
16 | 	if err != nil {
17 | 		panic(err)
18 | 	}
19 | 
20 | 	textBook := book.TextBookFromEpub(epub)
21 | 
22 | 	err = file.CreateOutputDirs(textBook.Name)
23 | 	if err != nil {
24 | 		panic(err)
25 | 	}
26 | 
27 | 	err = file.SaveChapters(textBook)
28 | 	if err != nil {
29 | 		panic(err)
30 | 	}
31 | 	debug.GenerateDebugFiles(epub)
32 | 
33 | 	tts := tts.NewTTS(3, textBook)
34 | 	tts.Run()
35 | 
36 | 	if consts.SpeakProcessCompletion {
37 | 		tts.Speak("TTS completed")
38 | 	}
39 | 	fmt.Println(" ---== Execution ended ==--- ")
40 | }
41 | 


--------------------------------------------------------------------------------
/internal/book/text-book.go:
--------------------------------------------------------------------------------
 1 | package book
 2 | 
 3 | import (
 4 | 	"epub-tts/internal/str"
 5 | 	"strings"
 6 | )
 7 | 
// Chapter is the plain-text form of one ePUB section.
type Chapter struct {
	ID      string // sanitized section id
	Name    string // sanitized section title without line breaks; may be empty
	Content string // section text with markup tags removed
}
13 | 
14 | func (c Chapter) NameOrID() string {
15 | 	if c.Name == "" {
16 | 		return c.ID
17 | 	}
18 | 	return c.Name
19 | }
20 | 
// TextBook is a book reduced to plain-text chapters, ready to be written to
// disk and narrated.
type TextBook struct {
	Name     string    // book title, copied from the source Epub
	Chapters []Chapter // chapters in the same order as the Epub sections
}
25 | 
26 | func TextBookFromEpub(input Epub) TextBook {
27 | 	chapters := []Chapter{}
28 | 
29 | 	for _, v := range input.Sections {
30 | 		name := str.SanitizeString(v.Title)
31 | 		name = strings.ReplaceAll(name, "\n", "")
32 | 		chapter := Chapter{
33 | 			ID:      str.SanitizeString(v.ID),
34 | 			Name:    name,
35 | 			Content: str.SanitizeString(str.RemoveTags(v.HtmlContent)),
36 | 		}
37 | 		chapters = append(chapters, chapter)
38 | 	}
39 | 
40 | 	return TextBook{
41 | 		Name:     input.Name,
42 | 		Chapters: chapters,
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Rafael
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/internal/debug/debug.go:
--------------------------------------------------------------------------------
 1 | package debug
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"epub-tts/internal/book"
 6 | 	"epub-tts/internal/consts"
 7 | 	"epub-tts/internal/file"
 8 | 	"errors"
 9 | 	"fmt"
10 | 	"os"
11 | )
12 | 
13 | func GenerateDebugFiles(epub book.Epub) {
14 | 	if !consts.IsDebug {
15 | 		return
16 | 	}
17 | 	fmt.Println("Saving debug files")
18 | 
19 | 	err := os.MkdirAll(file.DebugDir(epub.Name), consts.Perm)
20 | 	if err != nil && !errors.Is(err, os.ErrExist) {
21 | 		panic(err)
22 | 	}
23 | 
24 | 	for k, v := range epub.Sections {
25 | 		//
26 | 		// JSON
27 | 		//
28 | 		jsonContent, err := json.Marshal(v)
29 | 		if err != nil {
30 | 			panic(err)
31 | 		}
32 | 
33 | 		err = os.WriteFile(
34 | 			file.GetOutputPath(k, file.DebugDir(epub.Name), v.ID, "json"),
35 | 			jsonContent,
36 | 			consts.Perm,
37 | 		)
38 | 		if err != nil {
39 | 			fmt.Println("Failed to save json debug file")
40 | 		}
41 | 
42 | 		//
43 | 		// HTML
44 | 		//
45 | 		err = os.WriteFile(
46 | 			file.GetOutputPath(k, file.DebugDir(epub.Name), v.ID, "html"),
47 | 			[]byte(v.HtmlContent),
48 | 			consts.Perm,
49 | 		)
50 | 		if err != nil {
51 | 			fmt.Println("Failed to save html debug file")
52 | 		}
53 | 	}
54 | }
55 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # epub-tts
 2 | 
 3 | Convert ePUB into audio files.
 4 | 
 5 | The code parses the ePUB into sections (which roughly correlate to book chapters) and 'text-to-speech' each section into its own audio file.<br>Output will be prefixed with a number to maintain order.
 6 | 
 7 | <sub>
 8 | This is an alpha, proof of concept version.
 9 | To me, it's supposed to be a simple alternative for when eyes are tired but the mind is not :)
10 | </sub>
11 | 
12 | <br>
13 | 
14 | # Requirements
15 |  - Run on MacOS
16 |  - ffmpeg installed and available in $PATH
17 |  - Golang
18 | 
19 | # How to use
20 |  1. Clone this repo
21 |  2. Replace the file `input/Dracula.epub` with the book you want to convert to audio (keep the file name, or change `InputFilePath` in `internal/consts/consts.go`)
22 |  3. Execute the program (note that it will take quite some time, but you should see _some_ output during execution):
23 | ```
24 | go run .
25 | ```
26 |  4. You should see a new `output` folder with each text and audio file.
27 | 
28 | # TODO
29 |  - [x] Parse ePUB from Golang
30 |  - [x] Organize code
31 |  - [x] Add worker pools for batch conversion and less CPU strain
32 |  - [x] Reduce output audio size
33 |  - [x] Extract chapter info
34 |  - [ ] Add more sample ePUBs
35 |  - [ ] Add automated tests
36 |  - [x] Separate output by folder
37 |  - [ ] Handle multiple input
38 |  - [ ] Organize the code some more
39 |  - [ ] Support other languages beyond english
40 |  - [ ] Display progress
41 |  - [ ] Break down a big section to be TTS concurrently, and merge after whole section is done
42 |  - [ ] Add support for Ubuntu TTS
43 |  - [ ] Add Web UI to Drag and Drop epub files
44 |  - [ ] ?
45 | 
46 | ### Dependencies
47 |  - MacOS `say` command
48 |  - Note: The example book in this repo is taken from [Project Gutenberg](https://www.gutenberg.org/about/), with Copyright Status as "Public domain in the USA"
49 | <hr>
50 | 
51 | # License
52 | Check [LICENSE](https://github.com/rafael1mc/epub-tts/blob/main/LICENSE) file.


--------------------------------------------------------------------------------
/internal/str/str.go:
--------------------------------------------------------------------------------
 1 | package str
 2 | 
 3 | import (
 4 | 	"regexp"
 5 | 	"strings"
 6 | 	"unicode"
 7 | 
 8 | 	"golang.org/x/text/runes"
 9 | 	"golang.org/x/text/transform"
10 | 	"golang.org/x/text/unicode/norm"
11 | )
12 | 
// blankLineRegex matches lines that are empty or contain only whitespace.
// It is compiled once instead of on every SanitizeString call.
var blankLineRegex = regexp.MustCompile(`(?m)^\s*$`)

// SanitizeString cleans text for output:
//
//   - leading and trailing spaces, tabs and line breaks are trimmed;
//   - Windows line endings (\r\n) become \n;
//   - every rune that is neither printable nor a line break is dropped
//     (this includes tabs and stray carriage returns);
//   - runs of three or more line breaks are reduced to two, so paragraphs
//     are separated by at most one blank line.
//
// Input that is empty after trimming yields an empty string.
func SanitizeString(str string) string {
	str = strings.Trim(str, "\r\n\t ")
	if str == "" {
		// The blank-line regex matches the empty string and would turn it
		// into a lone "\n"; nothing is left to sanitize anyway.
		return ""
	}
	str = strings.ReplaceAll(str, "\r\n", "\n")

	// make lines with only spaces to be just lines so they can be grouped below
	str = blankLineRegex.ReplaceAllString(str, "\n")

	str = strings.Map(func(r rune) rune {
		if unicode.IsPrint(r) || r == '\n' {
			return r
		}
		return -1 // a negative value makes strings.Map drop the rune
	}, str)

	// remove excess line breaks
	for strings.Contains(str, "\n\n\n") {
		str = strings.ReplaceAll(str, "\n\n\n", "\n\n")
	}

	return str
}
35 | 
36 | // CleanFileName removes invalid characters for filenames
37 | // and also removes accents and special characters.
38 | func CleanFileName(input string) string {
39 | 	// Normalize the input string to remove accents
40 | 	normalized, err := normalize(input)
41 | 	if err != nil {
42 | 		// TODO add log
43 | 		normalized = input
44 | 	}
45 | 
46 | 	normalized = strings.ReplaceAll(normalized, "—", "_")
47 | 	normalized = strings.ReplaceAll(normalized, ":", "_")
48 | 
49 | 	// Define a regular expression that allows only alphanumeric characters, dashes, and underscores
50 | 	re := regexp.MustCompile(`[^a-zA-Z0-9\s\-_\.]`)
51 | 
52 | 	// Remove any character that is not a word character, whitespace, dash, or period
53 | 	cleaned := re.ReplaceAllString(normalized, "")
54 | 
55 | 	// Optionally replace spaces with underscores or dashes
56 | 	cleaned = strings.ReplaceAll(cleaned, " ", "_")
57 | 	cleaned = strings.ReplaceAll(cleaned, `\n`, "")
58 | 	cleaned = strings.ReplaceAll(cleaned, "\n", "")
59 | 
60 | 	return cleaned
61 | }
62 | 
63 | // https://stackoverflow.com/a/65981868
64 | func normalize(s string) (string, error) {
65 | 	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
66 | 	result, _, err := transform.String(t, s)
67 | 	if err != nil {
68 | 		return "", err
69 | 	}
70 | 
71 | 	return result, nil
72 | }
73 | 
// htmlTagRegex matches a single markup tag: a '<', at least one character
// that is not '>', and the closing '>'. It is compiled once instead of on
// every RemoveTags call.
var htmlTagRegex = regexp.MustCompile(`<[^>]+>`)

// RemoveTags deletes every markup tag from input and returns the remaining
// text. Only the tags themselves are removed: text between an opening and a
// closing tag is kept, and character entities such as "&amp;" are left
// untouched.
func RemoveTags(input string) string {
	return htmlTagRegex.ReplaceAllString(input, "")
}
80 | 


--------------------------------------------------------------------------------
/internal/file/file.go:
--------------------------------------------------------------------------------
  1 | package file
  2 | 
  3 | import (
  4 | 	"epub-tts/internal/book"
  5 | 	"epub-tts/internal/consts"
  6 | 	"epub-tts/internal/str"
  7 | 	"errors"
  8 | 	"fmt"
  9 | 	"os"
 10 | 	"path"
 11 | 	"path/filepath"
 12 | 	"strings"
 13 | )
 14 | 
 15 | func normalizeBookName(bookName string) string {
 16 | 	cleanName := strings.ToLower(str.CleanFileName(bookName))
 17 | 	nameLen := len(cleanName)
 18 | 	if nameLen > 50 {
 19 | 		nameLen = 50
 20 | 	}
 21 | 
 22 | 	return cleanName[:nameLen]
 23 | }
 24 | 
 25 | func rootDir(bookName string) string {
 26 | 	return path.Join(
 27 | 		consts.OutputRootDir,
 28 | 		normalizeBookName(bookName),
 29 | 	)
 30 | }
 31 | 
 32 | func txtDir(bookName string) string {
 33 | 	return path.Join(
 34 | 		rootDir(bookName),
 35 | 		consts.OutputTxtDir,
 36 | 	)
 37 | }
 38 | 
 39 | func TmpDir(bookName string) string {
 40 | 	return path.Join(
 41 | 		rootDir(bookName),
 42 | 		consts.OutputTmpDir,
 43 | 	)
 44 | }
 45 | 
 46 | func DebugDir(bookName string) string {
 47 | 	return path.Join(
 48 | 		rootDir(bookName),
 49 | 		consts.OutputDebugDir,
 50 | 	)
 51 | }
 52 | 
 53 | func CreateOutputDirs(bookName string) error {
 54 | 	var err error
 55 | 
 56 | 	fmt.Println("Creating tmp dir", TmpDir(bookName))
 57 | 	err = os.MkdirAll(TmpDir(bookName), consts.Perm)
 58 | 	if err != nil && !errors.Is(err, os.ErrExist) {
 59 | 		return err
 60 | 	}
 61 | 
 62 | 	err = os.MkdirAll(txtDir(bookName), consts.Perm)
 63 | 	if err != nil && !errors.Is(err, os.ErrExist) {
 64 | 		return err
 65 | 	}
 66 | 
 67 | 	return nil
 68 | }
 69 | 
 70 | func SaveChapters(textBook book.TextBook) error {
 71 | 	fmt.Println("Saving chapter text files.")
 72 | 	for k, v := range textBook.Chapters {
 73 | 		filename := GetTextfileName(k, textBook.Name, v)
 74 | 		err := os.WriteFile(
 75 | 			filename,
 76 | 			[]byte(v.Content),
 77 | 			consts.Perm,
 78 | 		)
 79 | 		if err != nil {
 80 | 			return err
 81 | 		}
 82 | 	}
 83 | 
 84 | 	return nil
 85 | }
 86 | 
 87 | func GetTextfileName(pos int, bookName string, chapter book.Chapter) string {
 88 | 	return GetOutputPath(pos, txtDir(bookName), chapter.NameOrID(), "txt")
 89 | }
 90 | 
 91 | func GetTtsAudioFilename(pos int, bookName string, chapter book.Chapter) string {
 92 | 	return GetOutputPath(pos, TmpDir(bookName), chapter.NameOrID(), "aiff")
 93 | }
 94 | 
 95 | func GetConvertedAudioFilename(pos int, bookName string, chapter book.Chapter) string {
 96 | 	return GetOutputPath(pos, rootDir(bookName), chapter.NameOrID(), "mp3")
 97 | }
 98 | 
 99 | func GetOutputPath(pos int, outputFolder string, name string, extension string) string {
100 | 	filename := fmt.Sprintf("%d-%s.%s", pos, name, extension)
101 | 	filename = strings.ToLower(filename)
102 | 	filename = str.CleanFileName(filename)
103 | 
104 | 	filePath := filepath.Join(outputFolder, filename)
105 | 
106 | 	return filePath
107 | }
108 | 


--------------------------------------------------------------------------------
/internal/tts/tts.go:
--------------------------------------------------------------------------------
  1 | package tts
  2 | 
  3 | import (
  4 | 	"epub-tts/internal/book"
  5 | 	"epub-tts/internal/consts"
  6 | 	"epub-tts/internal/file"
  7 | 	"fmt"
  8 | 	"os"
  9 | 	"os/exec"
 10 | )
 11 | 
// TTS narrates the chapters of a text book using a pool of workers.
type TTS struct {
	// workerCount is the number of worker goroutines started by Run.
	workerCount int

	// textBook is the book whose chapters are narrated.
	textBook book.TextBook
}
 17 | 
// job is one unit of work for a worker: a single chapter to narrate.
type job struct {
	ID       int          // position of the chapter within the book
	BookName string       // name of the book the chapter belongs to
	Chapter  book.Chapter // the chapter itself
}
 23 | 
// jobDone is what a worker reports after handling a job.
type jobDone struct {
	job
	// Error is reserved for the failure of the job; the workers do not set
	// it yet.
	Error error
}
 28 | 
 29 | func NewTTS(
 30 | 	workerCount int,
 31 | 	textBook book.TextBook,
 32 | ) *TTS {
 33 | 	return &TTS{
 34 | 		workerCount: workerCount,
 35 | 		textBook:    textBook,
 36 | 	}
 37 | }
 38 | 
 39 | func (t TTS) Run() {
 40 | 	fmt.Println("Running text-to-speech")
 41 | 
 42 | 	jobCount := len(t.textBook.Chapters)
 43 | 	jobInputChan := make(chan job, jobCount)
 44 | 	jobDoneChan := make(chan jobDone, jobCount)
 45 | 
 46 | 	t.launchWorkers(jobInputChan, jobDoneChan)
 47 | 
 48 | 	for k, v := range t.textBook.Chapters {
 49 | 		jobInputChan <- job{ID: k, BookName: t.textBook.Name, Chapter: v}
 50 | 	}
 51 | 	close(jobInputChan)
 52 | 
 53 | 	for range jobCount {
 54 | 		jobDone := <-jobDoneChan
 55 | 		if jobDone.Error != nil {
 56 | 			fmt.Println("Failed to process item", jobDone.Chapter.Name, "with error", jobDone.Error)
 57 | 		}
 58 | 	}
 59 | 
 60 | 	os.RemoveAll(file.TmpDir(t.textBook.Name))
 61 | }
 62 | 
 63 | func (t TTS) Speak(text string) {
 64 | 	cmd := fmt.Sprintf(`say "%s"`, text)
 65 | 	exec.Command("/bin/sh", "-c", cmd).Output()
 66 | }
 67 | 
 68 | func (t TTS) launchWorkers(jobInputChan <-chan job, jobDoneChan chan<- jobDone) {
 69 | 	fmt.Println("Launching", t.workerCount, "worker(s)")
 70 | 	for k := range t.workerCount {
 71 | 		go t.launchWorker(k, jobInputChan, jobDoneChan)
 72 | 	}
 73 | }
 74 | 
 75 | func (t TTS) launchWorker(id int, inputChan <-chan job, doneChan chan<- jobDone) {
 76 | 	// TODO: use worker id and doneChan with error
 77 | 	for i := range inputChan {
 78 | 		if consts.IsDryRun {
 79 | 			doneChan <- jobDone{job: i}
 80 | 			continue
 81 | 		}
 82 | 
 83 | 		_ = ttsChapter(i.ID, i.BookName, i.Chapter)
 84 | 		audioConvert(i.ID, i.BookName, i.Chapter)
 85 | 		// TODO: maybe already delete the aiff file here, to prevent growing then shriking
 86 | 		// some books generate GBs on aiff
 87 | 		doneChan <- jobDone{job: i} // not sending errors yet
 88 | 	}
 89 | }
 90 | 
 91 | func ttsChapter(pos int, bookName string, chapter book.Chapter) string {
 92 | 	audioName := file.GetTtsAudioFilename(pos, bookName, chapter)
 93 | 
 94 | 	fmt.Println("🎤 Narrating chapter: '" + audioName + "' 🎤")
 95 | 	cmdStr := fmt.Sprintf(`say -f "%s" -o "%s"`, file.GetTextfileName(pos, bookName, chapter), audioName)
 96 | 	out, _ := exec.Command("/bin/sh", "-c", cmdStr).Output()
 97 | 
 98 | 	return string(out)
 99 | }
100 | 
101 | func audioConvert(pos int, bookName string, chapter book.Chapter) string {
102 | 	ttsAudioName := file.GetTtsAudioFilename(pos, bookName, chapter)
103 | 	convertedAudioName := file.GetConvertedAudioFilename(pos, bookName, chapter)
104 | 
105 | 	fmt.Println("🔄 Converting chapter: '" + ttsAudioName + "' 🔄")
106 | 	cmdStr := fmt.Sprintf(`ffmpeg -y -i %s %s`, ttsAudioName, convertedAudioName)
107 | 	out, _ := exec.Command("/bin/sh", "-c", cmdStr).Output()
108 | 
109 | 	fmt.Println("✅ Chapter '" + convertedAudioName + "' converted ✅")
110 | 	return string(out)
111 | }
112 | 


--------------------------------------------------------------------------------
/internal/book/epub-parser.go:
--------------------------------------------------------------------------------
  1 | package book
  2 | 
  3 | import (
  4 | 	"archive/zip"
  5 | 	"encoding/xml"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"net/url"
  9 | 	"path/filepath"
 10 | 	"strings"
 11 | )
 12 | 
// Container maps META-INF/container.xml, the file that tells where the
// OPF package document of the ePUB is stored.
type Container struct {
	// Rootfiles lists the declared package documents; ParseEpub uses the
	// first entry.
	Rootfiles []Rootfile `xml:"rootfiles>rootfile"`
}
 17 | 
// Rootfile is a reference to an OPF package document.
type Rootfile struct {
	// FullPath is the location of the OPF file inside the archive.
	FullPath string `xml:"full-path,attr"`
}
 22 | 
// Package maps the parts of the OPF package document that are needed to
// read the book content.
type Package struct {
	// Manifest lists the files declared by the book.
	Manifest []Item    `xml:"manifest>item"`
	// Spine lists manifest item ids; ParseEpub reads the content in this
	// order.
	Spine    []Itemref `xml:"spine>itemref"`
}
 28 | 
// Item is one manifest entry of the OPF package document.
type Item struct {
	// ID is the identifier that spine entries refer to.
	ID   string `xml:"id,attr"`
	// Href is the file location; ParseEpub resolves it against the folder
	// of the OPF file.
	Href string `xml:"href,attr"`
}
 34 | 
// Itemref is one spine entry of the OPF package document.
type Itemref struct {
	// IDRef is the ID of the manifest Item this entry points to.
	IDRef string `xml:"idref,attr"`
}
 39 | 
// NavPoint represents a navigation point in the EPUB toc.ncx file.
//
// The fields are filled by the custom UnmarshalXML method of this type
// rather than through the struct tags below: the label text comes from
// navLabel>text and Src from the src attribute of the content element.
type NavPoint struct {
	Text string `xml:"navLabel>text"`
	// A tag such as `xml:"content>src,attr"` cannot express "attribute of a
	// child element", which is why UnmarshalXML exists.
	Src          string     `xml:"content,attr"`
	SubNavPoints []NavPoint `xml:"navPoint"`
}
 47 | 
 48 | func (n *NavPoint) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
 49 | 	var aux struct {
 50 | 		Text    string `xml:"navLabel>text"`
 51 | 		Content struct {
 52 | 			Src string `xml:"src,attr"`
 53 | 		} `xml:"content"`
 54 | 		SubNavPoints []NavPoint `xml:"navPoint"`
 55 | 	}
 56 | 	if err := d.DecodeElement(&aux, &start); err != nil {
 57 | 		return err
 58 | 	}
 59 | 
 60 | 	n.Text = aux.Text
 61 | 	n.Src = aux.Content.Src
 62 | 	n.SubNavPoints = aux.SubNavPoints
 63 | 
 64 | 	return nil
 65 | }
 66 | 
// NCX represents the structure of the toc.ncx file.
type NCX struct {
	// Title is the text of the docTitle element; ParseEpub uses it as the
	// book name.
	Title  string     `xml:"docTitle>text"`
	// NavMap holds the top-level navigation points.
	NavMap []NavPoint `xml:"navMap>navPoint"`
}
 72 | 
 73 | func ParseEpub(epubPath string) (Epub, error) {
 74 | 	// Open the ePUB file as a zip archive
 75 | 	r, err := zip.OpenReader(epubPath)
 76 | 	if err != nil {
 77 | 		return Epub{}, err
 78 | 	}
 79 | 	defer r.Close()
 80 | 
 81 | 	// Read the container.xml to locate the OPF file
 82 | 	container, err := readContainer(r)
 83 | 	if err != nil {
 84 | 		return Epub{}, err
 85 | 	}
 86 | 
 87 | 	// Parse the OPF file
 88 | 	packageData, err := readOPF(r, container.Rootfiles[0].FullPath)
 89 | 	if err != nil {
 90 | 		return Epub{}, err
 91 | 	}
 92 | 
 93 | 	basePath := extractBasePath(container.Rootfiles[0].FullPath)
 94 | 	tocFileName := findTocFileName(packageData.Manifest)
 95 | 
 96 | 	ncx, err := parseNCX(r, basePath, tocFileName)
 97 | 	if err != nil {
 98 | 		fmt.Println("Failed to parse ncx file")
 99 | 	}
100 | 
101 | 	// Parse table of contents
102 | 	tableOfContents, err := extractTableOfContents(ncx)
103 | 	if err != nil {
104 | 		fmt.Println("Failed to parse table of contents", err)
105 | 	}
106 | 
107 | 	book := Epub{
108 | 		Name:     ncx.Title,
109 | 		Toc:      map[string]string{},
110 | 		Sections: []EpubSection{},
111 | 	}
112 | 
113 | 	// Get content in order of the spine
114 | 	for _, spineItem := range packageData.Spine {
115 | 		manifestItem := findManifestItem(packageData.Manifest, spineItem.IDRef)
116 | 		if manifestItem != nil {
117 | 			currFile := filepath.Join(basePath, manifestItem.Href)
118 | 			content, err := readFileFromZip(r, currFile)
119 | 			if err != nil {
120 | 				return Epub{}, err
121 | 			}
122 | 
123 | 			title := tableOfContents[manifestItem.Href]
124 | 			if title == "" {
125 | 				title = tableOfContents[currFile]
126 | 			}
127 | 
128 | 			book.Sections = append(book.Sections, EpubSection{
129 | 				ID:          manifestItem.ID,
130 | 				Title:       title,
131 | 				HtmlContent: string(content),
132 | 			})
133 | 		}
134 | 	}
135 | 
136 | 	return book, nil
137 | }
138 | 
139 | // readContainer reads and parses the container.xml
140 | func readContainer(r *zip.ReadCloser) (*Container, error) {
141 | 	content, err := readFileFromZip(r, "META-INF/container.xml")
142 | 	if err != nil {
143 | 		return nil, err
144 | 	}
145 | 
146 | 	var container Container
147 | 	if err := xml.Unmarshal(content, &container); err != nil {
148 | 		return nil, err
149 | 	}
150 | 
151 | 	return &container, nil
152 | }
153 | 
154 | // readOPF reads and parses the OPF file
155 | func readOPF(r *zip.ReadCloser, opfPath string) (*Package, error) {
156 | 	content, err := readFileFromZip(r, opfPath)
157 | 	if err != nil {
158 | 		return nil, err
159 | 	}
160 | 
161 | 	var packageData Package
162 | 	if err := xml.Unmarshal(content, &packageData); err != nil {
163 | 		return nil, err
164 | 	}
165 | 
166 | 	return &packageData, nil
167 | }
168 | 
// readFileFromZip returns the full content of the archive entry whose name
// equals name exactly. When several entries share the name, the first one
// is used. It returns a "file not found" error when no entry matches.
func readFileFromZip(r *zip.ReadCloser, name string) ([]byte, error) {
	var entry *zip.File
	for _, candidate := range r.File {
		if candidate.Name == name {
			entry = candidate
			break
		}
	}
	if entry == nil {
		return nil, fmt.Errorf("file not found: %s", name)
	}

	rc, err := entry.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()

	return io.ReadAll(rc)
}
185 | 
186 | func extractTableOfContents(
187 | 	ncx *NCX,
188 | ) (map[string]string, error) {
189 | 	result := map[string]string{}
190 | 	addNavPoints(result, ncx.NavMap)
191 | 
192 | 	return result, nil
193 | }
194 | 
195 | func addNavPoints(m map[string]string, navPoints []NavPoint) {
196 | 	for _, v := range navPoints {
197 | 		parsedSrc, err := url.Parse(v.Src)
198 | 		if err != nil {
199 | 			// TODO: log
200 | 			m[v.Src] = v.Text
201 | 			continue
202 | 		}
203 | 
204 | 		parsedSrc.Fragment = ""
205 | 		parsedSrc.RawQuery = ""
206 | 		src := parsedSrc.String()
207 | 
208 | 		m[src] = v.Text
209 | 
210 | 		if len(v.SubNavPoints) > 0 {
211 | 			addNavPoints(m, v.SubNavPoints)
212 | 		}
213 | 	}
214 | }
215 | 
216 | func parseNCX(r *zip.ReadCloser, basePath string, tocFileName string) (*NCX, error) {
217 | 	ncxContent, err := readFileFromZip(r, filepath.Join(basePath, tocFileName))
218 | 	if err != nil {
219 | 		return nil, err
220 | 	}
221 | 
222 | 	// Parse the NCX XML
223 | 	var ncx NCX
224 | 	err = xml.Unmarshal(ncxContent, &ncx)
225 | 	if err != nil {
226 | 		return nil, fmt.Errorf("failed to parse XML: %w", err)
227 | 	}
228 | 
229 | 	return &ncx, nil
230 | }
231 | 
232 | // findManifestItem finds a manifest item by ID
233 | func findManifestItem(manifest []Item, id string) *Item {
234 | 	for _, item := range manifest {
235 | 		if item.ID == id {
236 | 			return &item
237 | 		}
238 | 	}
239 | 
240 | 	return nil
241 | }
242 | 
// extractBasePath returns the part of fullPath in front of its last "/",
// i.e. the folder of the file. It returns an empty string when fullPath
// contains no "/".
func extractBasePath(fullPath string) string {
	lastSlash := strings.LastIndex(fullPath, "/")
	if lastSlash < 0 {
		return ""
	}
	return fullPath[:lastSlash]
}
252 | 
253 | func findTocFileName(manifestItems []Item) string {
254 | 	for _, v := range manifestItems {
255 | 		if strings.Contains(v.ID, "ncx") &&
256 | 			strings.Contains(v.Href, "ncx") {
257 | 			return v.Href
258 | 		}
259 | 	}
260 | 
261 | 	return "toc.ncx" // TODO look for any ncx file inside the whole zip
262 | }
263 | 


--------------------------------------------------------------------------------