├── mboxrd_error.go ├── message_error.go ├── .gitignore ├── LICENSE.txt ├── cmd ├── mvre │ └── mvre.go ├── mboxrd_split │ └── mboxrd_split.go └── msgdate │ └── msgdate.go ├── message_admit.go ├── README.md ├── mboxrd.go ├── message_unpack.go ├── message_name.go └── message_original.go /mboxrd_error.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import "fmt" 4 | 5 | // MboxError type is returned when errors occur while 6 | // reading or splitting an mboxrd archive. 7 | type MboxError string 8 | 9 | func (mbe MboxError) Error() string { 10 | return fmt.Sprintf("MBox error: %s", string(mbe)) 11 | } 12 | -------------------------------------------------------------------------------- /message_error.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import "fmt" 4 | 5 | // MessageError type is returned when errors occur while 6 | // writing a message to the filesystem. 7 | type MessageError string 8 | 9 | func (msge MessageError) Error() string { 10 | return fmt.Sprintf("Message error: %s", string(msge)) 11 | } 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | .idea 11 | 12 | # Architecture specific extensions/prefixes 13 | *.[568vq] 14 | [568vq].out 15 | 16 | *.cgo1.go 17 | *.cgo2.c 18 | _cgo_defun.c 19 | _cgo_gotypes.go 20 | _cgo_export.* 21 | 22 | _testmain.go 23 | 24 | *.exe 25 | *.test 26 | *.prof 27 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Vlad Didenko 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | 8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 9 | 10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 13 | -------------------------------------------------------------------------------- /cmd/mvre/mvre.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | "path/filepath" 8 | "regexp" 9 | ) 10 | 11 | var ( 12 | find string 13 | findRE *regexp.Regexp 14 | repl string 15 | dir string 16 | preview bool 17 | ) 18 | 19 | func init() { 20 | flag.StringVar(&find, "find", "", "A required regular expression to search for. 
See https://golang.org/pkg/regexp/syntax/ for details.") 21 | flag.StringVar(&repl, "repl", "", "A string with placeholders to use for replacement. Defaults to an empty string. See https://golang.org/pkg/regexp/#Regexp.ReplaceAllString for details.") 22 | flag.StringVar(&dir, "dir", ".", "A directory to search for matching files.") 23 | flag.BoolVar(&preview, "preview", false, "Preview only, do not change matching files.") 24 | flag.Parse() 25 | 26 | if find == "" { 27 | flag.PrintDefaults() 28 | os.Exit(1) 29 | } 30 | 31 | findRE = regexp.MustCompile(find) 32 | } 33 | 34 | func main() { 35 | err := filepath.Walk(dir, func(oldpath string, info os.FileInfo, err error) error { 36 | if err != nil { 37 | return err 38 | } 39 | 40 | var ( 41 | newname string 42 | newpath string 43 | oldname = info.Name() 44 | ) 45 | 46 | if newname = findRE.ReplaceAllString(oldname, repl); newname != oldname { 47 | newpath = filepath.Join(filepath.Dir(oldpath), newname) 48 | 49 | if preview { 50 | fmt.Printf("%q => %q\n", oldpath, newpath) 51 | } else { 52 | er := os.Rename(oldpath, newpath) 53 | if er != nil { 54 | return er 55 | } 56 | } 57 | } 58 | return nil 59 | }) 60 | 61 | if err != nil { 62 | fmt.Println(err.Error()) 63 | os.Exit(1) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /message_admit.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | ) 7 | 8 | type Criterion struct { 9 | OnlyHeaders bool 10 | RE *regexp.Regexp 11 | } 12 | 13 | const ( 14 | sdrFmt = `^(From:|Sender:|Reply-To:|Return-Path:).*[\s<,:]%s[\s>,\]].*` 15 | rvrFmt = `^(To:\s|Cc:\s|Bcc:\s|(\s+)).*[<,:]%s[\s>,\]].*` 16 | ) 17 | 18 | var ( 19 | banChat = regexp.MustCompile(`^X-Gmail-Labels: Chat`) 20 | ) 21 | 22 | func AdmitAnyPattern(criteria []Criterion, vetos []Criterion, errors chan error) ByLineAdmit { 23 | 24 | var ( 25 | admitted = false 26 | banned = false 27 
| inHeaders = true 28 | ) 29 | 30 | return func(line string, errors chan error) bool { 31 | 32 | if banned { 33 | return false 34 | } 35 | 36 | if admitted { 37 | return true 38 | } 39 | 40 | if inHeaders && line == "" { 41 | inHeaders = false 42 | } 43 | 44 | for _, veto := range vetos { 45 | if veto.OnlyHeaders && !inHeaders { 46 | continue 47 | } 48 | 49 | if veto.RE.MatchString(line) { 50 | banned = true 51 | return false 52 | } 53 | } 54 | 55 | if len(criteria) == 0 { 56 | return true 57 | } 58 | 59 | for _, permit := range criteria { 60 | if permit.OnlyHeaders && !inHeaders { 61 | continue 62 | } 63 | 64 | if admitted = permit.RE.MatchString(line); admitted { 65 | return true 66 | } 67 | } 68 | 69 | return false 70 | } 71 | } 72 | 73 | func AllWith(addrs []string, errors chan error) ByLineAdmit { 74 | 75 | criteria := make([]Criterion, len(addrs)*2) 76 | vetos := make([]Criterion, 1) 77 | 78 | for i, addr := range addrs { 79 | pos := i * 2 80 | criteria[pos] = Criterion{true, regexp.MustCompile(fmt.Sprintf(sdrFmt, addr))} 81 | criteria[pos+1] = Criterion{true, regexp.MustCompile(fmt.Sprintf(rvrFmt, addr))} 82 | } 83 | 84 | vetos[0] = Criterion{true, banChat} 85 | 86 | return AdmitAnyPattern(criteria, vetos, errors) 87 | } 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | 3 | Split email archives downloaded from [Google Takeout (Download Your Data)][1] service into individual emails. Based on experimentation it looks like Google uses the _mboxrd_ dialect of _mbox_ format with CRLF lines as discussed at the [Wikipedia mbox article][2] 4 | 5 | # License 6 | 7 | The project is licensed under the [BSD 3-Clause License - see the `LICENSE.txt` file included with the package][3]. 8 | 9 | # Using the `mboxrd` package 10 | 11 | The package provides both libraries and a buildable executable. 
See the code documentation on using the libraries. 12 | 13 | [![GoDoc](https://godoc.org/github.com/didenko/mboxrd?status.svg)](https://godoc.org/github.com/didenko/mboxrd) 14 | 15 | # Using the `mboxrd_split` executable 16 | 17 | The executable takes the following parameters: 18 | 19 | -dir : A directory to put the resulting messages to. 20 | The directory must exist before running the program. 21 | 22 | -mbox : An mbox file to process and split into messages. 23 | 24 | -email
: An email address whose correspondence is to be captured. Only 25 | the actual address should be provided. 26 | 27 | The program does not preserve an unfinished last line of the last message in the archive. In the resulting files all message lines end with CRLF after the processing. 28 | 29 | During the processing it creates temporary message files and then moves each of them into a UTC-timestamped `.eml` file. If the destination filename is already taken by another message, then the later message does not overwrite it. It is left in the temporary file and the error is printed to `stderr`. 30 | 31 | A message also stays in a temporary file if the program fails to construct a name for the message file. Some forwarded messages, for example, lack the `Date: ` header. 32 | 33 | [1]: https://www.google.com/settings/takeout 34 | [2]: https://en.wikipedia.org/wiki/Mbox#Family 35 | [3]: ./LICENSE.txt 36 | 37 | -------------------------------------------------------------------------------- /mboxrd.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "regexp" 7 | ) 8 | 9 | const ( 10 | crlf = string("\r\n") 11 | ) 12 | 13 | var ( 14 | reNewMessage = regexp.MustCompile(`^From `) 15 | reUnescape = regexp.MustCompile(`^\>+From `) 16 | ) 17 | 18 | // Extract processes all lines from the mboxrd reader 19 | // and puts resulting messages each as its own channel into 20 | // the provided messages channel. 21 | // 22 | // It will stop only if it runs into non-empty lines prior 23 | // to a message header. Otherwise it will continue processing 24 | // the lines on the assumption that the message archive format 25 | // is correct. 26 | // 27 | // Each message's channel and the parent messages' channel 28 | // are closed after the mbox data is exhausted. 
29 | func Extract(mboxrd io.Reader, messages chan chan string, errors chan error) { 30 | 31 | var ( 32 | line string 33 | prevEmpty = true 34 | message chan string 35 | ) 36 | 37 | scanner := bufio.NewScanner(mboxrd) 38 | 39 | for scanner.Scan() { 40 | 41 | line = scanner.Text() 42 | 43 | switch { 44 | 45 | case line == "": 46 | 47 | if message != nil { 48 | if prevEmpty { 49 | message <- "" 50 | } 51 | prevEmpty = true 52 | } 53 | 54 | case reNewMessage.MatchString(line) && prevEmpty: 55 | 56 | if message != nil { 57 | close(message) 58 | } 59 | 60 | message = make(chan string) 61 | messages <- message 62 | 63 | message <- line 64 | prevEmpty = false 65 | 66 | case reUnescape.MatchString(line): 67 | 68 | line = line[1:] 69 | 70 | if message == nil { 71 | errors <- MboxError("Data found before a message beginning") 72 | return 73 | } 74 | 75 | if prevEmpty { 76 | message <- "" 77 | } 78 | message <- line 79 | prevEmpty = false 80 | 81 | default: 82 | 83 | if message == nil { 84 | errors <- MboxError("Data found before a message beginning") 85 | return 86 | } 87 | 88 | if prevEmpty { 89 | message <- "" 90 | } 91 | message <- line 92 | prevEmpty = false 93 | } 94 | } 95 | 96 | if message != nil { 97 | close(message) 98 | } 99 | 100 | if err := scanner.Err(); err != nil { 101 | errors <- err 102 | } 103 | 104 | close(messages) 105 | 106 | return 107 | } 108 | -------------------------------------------------------------------------------- /cmd/mboxrd_split/mboxrd_split.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "sync" 8 | 9 | "github.com/didenko/mboxrd" 10 | ) 11 | 12 | var ( 13 | lg *log.Logger = log.New(os.Stderr, "", log.LstdFlags) 14 | err error 15 | mbox string 16 | dir string 17 | addr string 18 | emlWG sync.WaitGroup 19 | origWG sync.WaitGroup 20 | workWG sync.WaitGroup 21 | 22 | messages = make(chan chan string) 23 | emlNames = make(chan string) 24 | 
errors = make(chan error) 25 | ) 26 | 27 | func logErrors(lg *log.Logger, errors chan error) { 28 | for err := range errors { 29 | lg.Println(err) 30 | } 31 | } 32 | 33 | func init() { 34 | 35 | flag.StringVar(&dir, "dir", "", "A directory to put the resulting messages to") 36 | flag.StringVar(&mbox, "mbox", "", "An mbox file to process") 37 | flag.StringVar(&addr, "email", "", "An email which correspondence to be captured") 38 | flag.Parse() 39 | 40 | if dir == "" || mbox == "" { 41 | lg.Fatal("Parameters 'dir' and 'mbox' are required") 42 | } 43 | 44 | fi, err := os.Stat(dir) 45 | if err != nil { 46 | lg.Fatalf("Failed to open the path %q: %s\n", dir, err) 47 | } 48 | 49 | if !fi.Mode().IsDir() { 50 | lg.Fatalf("Error: the %q path is not a directory", dir) 51 | } 52 | } 53 | 54 | func main() { 55 | 56 | mboxFile, err := os.Open(mbox) 57 | if err != nil { 58 | lg.Panic(err) 59 | } 60 | defer mboxFile.Close() 61 | 62 | go logErrors(lg, errors) 63 | 64 | go mboxrd.Extract(mboxFile, messages, errors) 65 | 66 | workWG.Add(1) 67 | go func() { 68 | defer workWG.Done() 69 | for message := range messages { 70 | 71 | origWG.Add(1) 72 | 73 | var admit mboxrd.ByLineAdmit 74 | if addr != "" { 75 | admit = mboxrd.AllWith([]string{addr}, errors) 76 | } else { 77 | admit = mboxrd.AllWith([]string{}, errors) 78 | } 79 | 80 | go mboxrd.WriteOriginal( 81 | message, 82 | emlNames, 83 | errors, 84 | dir, 85 | admit, 86 | mboxrd.NameFromTimeUser("%s_%s_%s.eml", errors), 87 | &origWG) 88 | } 89 | }() 90 | 91 | go func() { 92 | for eml := range emlNames { 93 | emlWG.Add(1) 94 | go mboxrd.UnpackMessage( 95 | eml, 96 | errors, 97 | &emlWG) 98 | } 99 | }() 100 | 101 | workWG.Wait() 102 | origWG.Wait() 103 | emlWG.Wait() 104 | } 105 | -------------------------------------------------------------------------------- /message_unpack.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import ( 4 | "encoding/base64" 5 | "fmt" 6 | "io" 7 
| "mime" 8 | "mime/multipart" 9 | "net/mail" 10 | "os" 11 | "strings" 12 | "sync" 13 | ) 14 | 15 | type getter interface { 16 | Get(key string) string 17 | } 18 | 19 | func partsIterate(head getter, body io.Reader, emlbase string, errors chan error) { 20 | 21 | mediaType, params, err := mime.ParseMediaType(head.Get("Content-Type")) 22 | if err != nil || !strings.HasPrefix(mediaType, "multipart/") { 23 | return 24 | } 25 | 26 | mr := multipart.NewReader(body, params["boundary"]) 27 | 28 | for { 29 | 30 | p, err := mr.NextPart() 31 | if err == io.EOF { 32 | return 33 | } 34 | if err != nil { 35 | errors <- MessageError( 36 | fmt.Sprintf( 37 | "Problem opening a part of the %q message: %s", 38 | emlbase, 39 | err.Error())) 40 | return 41 | } 42 | 43 | partsIterate(p.Header, p, emlbase, errors) 44 | unpackPart(p, emlbase, errors) 45 | } 46 | } 47 | 48 | func unpackPart(part *multipart.Part, emlbase string, errors chan error) { 49 | 50 | defer part.Close() 51 | 52 | partFileName := part.FileName() 53 | if partFileName == "" { 54 | return 55 | } 56 | 57 | attachmentFileName := emlbase + " " + partFileName 58 | 59 | attachmentFile, err := os.Create(attachmentFileName) 60 | if err != nil { 61 | errors <- MessageError( 62 | fmt.Sprintf( 63 | "Problem opening the %q file: %s", 64 | attachmentFileName, 65 | err.Error())) 66 | return 67 | } 68 | defer attachmentFile.Close() 69 | 70 | enc := part.Header.Get("Content-Transfer-Encoding") 71 | 72 | var partReader io.Reader 73 | 74 | switch enc { 75 | case "", "7bit", "8bit": 76 | partReader = part 77 | 78 | case "base64", "BASE64", "Base64": 79 | partReader = base64.NewDecoder(base64.StdEncoding, part) 80 | 81 | default: 82 | errors <- MessageError( 83 | fmt.Sprintf( 84 | "Attachment %q: unknown encoging %q", 85 | attachmentFileName, 86 | enc)) 87 | return 88 | } 89 | 90 | _, err = io.Copy(attachmentFile, partReader) 91 | if err != nil { 92 | errors <- MessageError( 93 | fmt.Sprintf( 94 | "Problem copying the %q part of the %q 
message: %s", 95 | attachmentFile, 96 | emlbase, 97 | err.Error())) 98 | return 99 | } 100 | } 101 | 102 | func cutExt(fname string) string { 103 | dotIdx := strings.LastIndex(fname, ".") 104 | if dotIdx < 0 { 105 | return fname 106 | } 107 | 108 | return fname[:dotIdx] 109 | } 110 | 111 | func UnpackMessage(eml string, errors chan error, wg *sync.WaitGroup) { 112 | 113 | if wg != nil { 114 | defer wg.Done() 115 | } 116 | 117 | f, err := os.Open(eml) 118 | if err != nil { 119 | errors <- MessageError( 120 | fmt.Sprintf( 121 | "Problem opening the %q message file for unpacking: %s", 122 | eml, 123 | err.Error())) 124 | return 125 | } 126 | defer f.Close() 127 | 128 | msg, err := mail.ReadMessage(f) 129 | if err != nil { 130 | errors <- MessageError( 131 | fmt.Sprintf( 132 | "Problem opening the %q message file for unpacking: %s", 133 | eml, 134 | err.Error())) 135 | return 136 | } 137 | 138 | partsIterate(msg.Header, msg.Body, cutExt(eml), errors) 139 | } 140 | -------------------------------------------------------------------------------- /cmd/msgdate/msgdate.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "io/ioutil" 8 | "log" 9 | "os" 10 | "regexp" 11 | "strings" 12 | "time" 13 | 14 | "github.com/didenko/mboxrd" 15 | ) 16 | 17 | var ( 18 | dirName string 19 | msgSuffix string 20 | loc *time.Location 21 | err error 22 | lg = log.New(os.Stderr, "", log.Lshortfile) 23 | prefixRE = regexp.MustCompile("([[:digit:]]{12})(.*)") 24 | dateRE = regexp.MustCompile(`(?i)^date: (.*)`) 25 | ) 26 | 27 | func init() { 28 | var tz string 29 | var defaults bool 30 | flag.StringVar(&dirName, "dir", ".", "A directory name to scan") 31 | flag.StringVar(&tz, "loc", "UTC", "A location name from the IANA Time Zone database") 32 | flag.StringVar(&msgSuffix, "ext", ".eml", "A file extension (including dot) to be recognised as a message file") 33 | flag.BoolVar(&defaults, 
"defaults", false, "Proceed with default values for parameters without a warning") 34 | flag.Parse() 35 | 36 | if flag.NFlag() < 1 && !defaults { 37 | flag.PrintDefaults() 38 | 39 | fmt.Fprintf(os.Stderr, "\n%s\n", "Kill this process (usually Ctrl-C) to avoid running it with the default parameter values.") 40 | fmt.Fprintf(os.Stderr, "%s\n", "Press Enter to continue processing.") 41 | fmt.Scanln() 42 | } 43 | 44 | loc, err = time.LoadLocation(tz) 45 | if err != nil { 46 | lg.Fatal(err) 47 | } 48 | } 49 | 50 | func stampInFile(fn string) (string, error) { 51 | 52 | f, err := os.Open(fn) 53 | if err != nil { 54 | return "", err 55 | } 56 | defer f.Close() 57 | 58 | scanner := bufio.NewScanner(f) 59 | for scanner.Scan() { 60 | if line := scanner.Text(); dateRE.MatchString(line) { 61 | return mboxrd.TimeFromLine(line, loc) 62 | } 63 | } 64 | if err = scanner.Err(); err != nil { 65 | return "", err 66 | } 67 | return "", fmt.Errorf("Timestamp not found in file: %q", fn) 68 | } 69 | 70 | type trans struct { 71 | tsNew string 72 | files []string 73 | } 74 | 75 | type message_heap map[string]*trans 76 | 77 | func scanMessages(dir string) message_heap { 78 | files, err := ioutil.ReadDir(dirName) 79 | if err != nil { 80 | lg.Fatal(err) 81 | } 82 | 83 | messages := make(map[string]*trans) 84 | 85 | for _, file := range files { 86 | 87 | if !file.Mode().IsRegular() { 88 | continue 89 | } 90 | 91 | fn := file.Name() 92 | parts := prefixRE.FindStringSubmatch(fn) 93 | if parts == nil { 94 | lg.Printf("Skipping %q\n", fn) 95 | continue 96 | } 97 | 98 | if msgInfo, ok := messages[parts[1]]; ok { 99 | msgInfo.files = append(msgInfo.files, parts[2]) 100 | } else { 101 | msgInfo = &trans{"", []string{parts[2]}} 102 | messages[parts[1]] = msgInfo 103 | } 104 | 105 | if strings.HasSuffix(fn, msgSuffix) { 106 | stamp, err := stampInFile(fn) 107 | if err != nil { 108 | lg.Fatal(err) 109 | } 110 | messages[parts[1]].tsNew = stamp 111 | } 112 | } 113 | return messages 114 | } 115 | 116 | 
func processMessages(msgs message_heap) { 117 | for tsOld, msg := range msgs { 118 | for _, file := range msg.files { 119 | if tsOld != msg.tsNew { 120 | oldName := tsOld + file 121 | newName := msg.tsNew + file 122 | os.Rename(oldName, newName) 123 | lg.Printf("%s => %s", oldName, newName) 124 | } 125 | } 126 | } 127 | } 128 | 129 | func main() { 130 | processMessages(scanMessages(dirName)) 131 | } 132 | -------------------------------------------------------------------------------- /message_name.go: -------------------------------------------------------------------------------- 1 | package mboxrd 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "regexp" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | const timestampFormat = "060102150405" 12 | 13 | var ( 14 | utc *time.Location 15 | addrRE = regexp.MustCompile(`(\w{1,})["']?(@.*)`) 16 | headRE = regexp.MustCompile(`^From (.*)(@.*)`) 17 | dateRE = regexp.MustCompile(`(?i)^date: (.*)`) 18 | fromRE = regexp.MustCompile(`(?i)^(from:)(.*)`) 19 | indentRE = regexp.MustCompile(`^\s`) 20 | ) 21 | 22 | func init() { 23 | var err error 24 | utc, err = time.LoadLocation("UTC") 25 | if err != nil { 26 | log.Fatal(err) 27 | } 28 | } 29 | 30 | func TimeFromLine(line string, lc *time.Location) (string, error) { 31 | parsedTS := dateRE.FindStringSubmatch(line) 32 | if parsedTS == nil { 33 | return "", fmt.Errorf("Failed to parse the timestamp from the line %q", line) 34 | } 35 | 36 | return TimeNorm(parsedTS[1], lc) 37 | } 38 | 39 | func TimeNorm(line string, loc *time.Location) (string, error) { 40 | var l string 41 | 42 | if li := strings.LastIndex(line, ` (`); li == -1 { 43 | l = line 44 | } else { 45 | l = line[:li] 46 | } 47 | 48 | l = strings.TrimSpace(strings.Replace(strings.TrimSuffix(l, " UT"), `GMT`, ``, 1)) 49 | 50 | t, er := time.Parse("Mon, 2 Jan 2006 15:04:05 -0700", l) 51 | if er == nil { 52 | return t.In(loc).Format(timestampFormat), nil 53 | } 54 | 55 | t, er = time.Parse("2 Jan 2006 15:04:05 -0700", l) 56 | if er == nil { 
57 | return t.In(loc).Format(timestampFormat), nil 58 | } 59 | 60 | t, er = time.Parse("2 Jan 2006 15:04:05 MST", l) 61 | if er == nil { 62 | return t.In(loc).Format(timestampFormat), nil 63 | } 64 | 65 | t, er = time.Parse("Mon, 2 Jan 2006 15:04:05 MST", l) 66 | if er == nil { 67 | return t.In(loc).Format(timestampFormat), nil 68 | } 69 | 70 | t, er = time.Parse("Mon, 2 Jan 2006 15:04:05", l) 71 | if er == nil { 72 | return t.In(loc).Format(timestampFormat), nil 73 | } 74 | 75 | t, er = time.Parse("2006-01-02 15:04:05 -0700", l) 76 | if er == nil { 77 | return t.In(loc).Format(timestampFormat), nil 78 | } 79 | 80 | //Wed, 6 Aug 2014 09:59:18 GMT-07:00 81 | 82 | t, er = time.Parse("Mon, 2 Jan 2006 15:04:05 Z07:00", l) 83 | if er == nil { 84 | return t.In(loc).Format(timestampFormat), nil 85 | } 86 | 87 | return "", er 88 | } 89 | 90 | // NameFromTimeUser returns a closed function used to extract 91 | // a message file name based on the message timestamp and sender's 92 | // username part of the email. 93 | // 94 | // It is an example on how to construct the file name from multiple 95 | // headers. 96 | func NameFromTimeUser(format string, errors chan error) ByLineName { 97 | const headPrefix = "From " 98 | 99 | var ( 100 | ts, fr, fr_acc, hd string 101 | in_from = false 102 | ) 103 | 104 | return func(line string, errors chan error) string { 105 | var er error 106 | 107 | if ts == "" && dateRE.MatchString(line) { 108 | ts, er = TimeFromLine(line, utc) 109 | if er != nil { 110 | errors <- fmt.Errorf( 111 | "Failed to parse the timestamp. 
// sameFileContent reports whether the files fA and fB consist of the
// same sequence of lines.
//
// The comparison is line based (bufio.ScanLines), so files that differ
// only in their line terminators, or in the presence of a final line
// terminator, are considered the same.
//
// Both files have to run out of lines at the same time: a file that is
// merely a prefix of the other one is a different file.
//
// An error is returned when either file cannot be opened or read.
func sameFileContent(fA, fB string) (bool, error) {

	// Upper bound for a single line; the bufio.Scanner default of
	// 64 KiB would fail the comparison on long lines.
	const maxLineBytes = 16 * 1024 * 1024

	inA, err := os.Open(fA)
	if err != nil {
		return false, err
	}
	defer inA.Close()

	inB, err := os.Open(fB)
	if err != nil {
		return false, err
	}
	defer inB.Close()

	scanA := bufio.NewScanner(inA)
	scanA.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	scanB := bufio.NewScanner(inB)
	scanB.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)

	for {
		moreA, moreB := scanA.Scan(), scanB.Scan()

		if moreA && moreB {
			if scanA.Text() != scanB.Text() {
				return false, nil
			}
			continue
		}

		// At least one scanner stopped: either on an error or at the
		// end of its file.
		if err := scanA.Err(); err != nil {
			return false, err
		}
		if err := scanB.Err(); err != nil {
			return false, err
		}

		// Same content only if both files ended on the same line.
		return moreA == moreB, nil
	}
}
71 | func WriteOriginal( 72 | message chan string, 73 | emlName chan string, 74 | errors chan error, 75 | dir string, 76 | admit ByLineAdmit, 77 | name ByLineName, 78 | wg *sync.WaitGroup) { 79 | 80 | if wg != nil { 81 | defer wg.Done() 82 | } 83 | 84 | var ( 85 | msgFile string 86 | allowed = true 87 | ) 88 | 89 | tempFile, err := ioutil.TempFile(dir, "_msg_") 90 | if err != nil { 91 | errors <- err 92 | return 93 | } 94 | 95 | for line := range message { 96 | 97 | allowed = admit(line, errors) 98 | 99 | tempFile.WriteString(line + crlf) 100 | if name != nil && msgFile == "" { 101 | msgFile = name(line, errors) 102 | } 103 | } 104 | 105 | if !allowed { 106 | defer func() { 107 | 108 | if err := tempFile.Close(); err != nil { 109 | errors <- MessageError( 110 | fmt.Sprintf( 111 | "Problem while closing the %s temporary file: %s", 112 | tempFile.Name(), 113 | err.Error())) 114 | } 115 | 116 | if err := os.Remove(tempFile.Name()); err != nil { 117 | errors <- MessageError( 118 | fmt.Sprintf( 119 | "Problem while deleting the %s temporary file: %s", 120 | tempFile.Name(), 121 | err.Error())) 122 | } 123 | }() 124 | return 125 | } 126 | 127 | tempFileEml := filepath.Base(tempFile.Name()) + ".eml" 128 | if name != nil && msgFile == "" { 129 | msgFile = tempFileEml 130 | errors <- MessageError( 131 | fmt.Sprintf( 132 | "File name did not constuct, moving message to the %q file", 133 | msgFile)) 134 | } 135 | 136 | msgPath := filepath.Join(dir, msgFile) 137 | 138 | _, err = os.Stat(msgPath) 139 | if err == nil { 140 | 141 | if ok, err := sameFileContent(msgFile, tempFileEml); ok && (err == nil) { 142 | if err := os.Remove(tempFileEml); err != nil { 143 | errors <- MessageError( 144 | fmt.Sprintf( 145 | "Problem while deleting the %s temporary file: %s", 146 | tempFileEml, 147 | err.Error())) 148 | } 149 | return 150 | } 151 | 152 | if msgFile != tempFileEml { 153 | 154 | msgFile = tempFileEml 155 | errors <- MessageError( 156 | fmt.Sprintf( 157 | "The message file %q 
already exists, moving message to the %q file", 158 | msgPath, 159 | msgFile)) 160 | 161 | msgPath = filepath.Join(dir, msgFile) 162 | _, err = os.Stat(msgPath) 163 | } 164 | 165 | if err == nil { 166 | errors <- MessageError( 167 | fmt.Sprintf( 168 | "The message file %q already exists, the message left in the %q file", 169 | msgPath, 170 | tempFile.Name())) 171 | emlName <- tempFile.Name() 172 | return 173 | } 174 | } 175 | 176 | tempFile.Close() 177 | err = os.Rename(tempFile.Name(), msgPath) 178 | if err != nil { 179 | errors <- MessageError( 180 | fmt.Sprintf( 181 | "Problem renaming %q into %q, the file may have either of the names. Error: %s", 182 | tempFile.Name(), 183 | msgPath, 184 | err.Error())) 185 | return 186 | } 187 | 188 | emlName <- msgPath 189 | } 190 | --------------------------------------------------------------------------------