├── .gitignore ├── CODE_OF_CONDUCT.md ├── Gopkg.lock ├── Gopkg.toml ├── README.md ├── UNLICENSE ├── charset.go ├── headers.go ├── headers_test.go ├── main.go └── main_easyjson.go /.gitignore: -------------------------------------------------------------------------------- 1 | /mail2elasticsearch 2 | /vendor 3 | /TODO 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for 
clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project owner at greg@unrelenting.technology. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project owner is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /Gopkg.lock: -------------------------------------------------------------------------------- 1 | # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. 2 | 3 | 4 | [[projects]] 5 | name = "github.com/davecgh/go-spew" 6 | packages = ["spew"] 7 | revision = "346938d642f2ec3594ed81d874461961cd0faa76" 8 | version = "v1.1.0" 9 | 10 | [[projects]] 11 | branch = "master" 12 | name = "github.com/dchest/safefile" 13 | packages = ["."] 14 | revision = "855e8d98f1852d48dde521e0522408d1fe7e836a" 15 | 16 | [[projects]] 17 | branch = "master" 18 | name = "github.com/gogits/chardet" 19 | packages = ["."] 20 | revision = "2404f777256163ea3eadb273dada5dcb037993c0" 21 | 22 | [[projects]] 23 | branch = "master" 24 | name = "github.com/mailru/easyjson" 25 | packages = [".","buffer","jlexer","jwriter"] 26 | revision = "32fa128f234d041f196a9f3e0fea5ac9772c08e1" 27 | 28 | [[projects]] 29 | branch = "master" 30 | name = "github.com/minio/blake2b-simd" 31 | packages = ["."] 32 | revision = "3f5f724cb5b182a5c278d6d3d55b40e7f8c2efb4" 33 | 34 | [[projects]] 35 | branch = "master" 36 | name = "github.com/myfreeweb/go-base64-simd" 37 | packages = ["base64"] 38 | revision = "a996ba366c4249e6fd8967e1257ba0752b9e6628" 39 | 40 | [[projects]] 41 | branch = "master" 42 | name = "github.com/myfreeweb/go-email" 43 | packages = ["email"] 44 | revision = "5ebbe4b970f65566dfaae38e30db4f5b6e8f8b2f" 45 | 46 | [[projects]] 47 | name = "github.com/olivere/elastic" 48 | packages = [".","config","uritemplates"] 49 | revision = "2963eb09b89294356e1a826068f33e5a44326ac0" 50 | version = 
"v6.1.5" 51 | 52 | [[projects]] 53 | name = "github.com/pkg/errors" 54 | packages = ["."] 55 | revision = "645ef00459ed84a119197bfb8d8205042c6df63d" 56 | version = "v0.8.0" 57 | 58 | [[projects]] 59 | name = "github.com/pmezard/go-difflib" 60 | packages = ["difflib"] 61 | revision = "792786c7400a136282c1664665ae0a8db921c6c2" 62 | version = "v1.0.0" 63 | 64 | [[projects]] 65 | name = "github.com/stretchr/testify" 66 | packages = ["assert"] 67 | revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71" 68 | version = "v1.2.1" 69 | 70 | [[projects]] 71 | name = "go.uber.org/atomic" 72 | packages = ["."] 73 | revision = "8474b86a5a6f79c443ce4b2992817ff32cf208b8" 74 | version = "v1.3.1" 75 | 76 | [[projects]] 77 | name = "go.uber.org/multierr" 78 | packages = ["."] 79 | revision = "3c4937480c32f4c13a875a1829af76c98ca3d40a" 80 | version = "v1.1.0" 81 | 82 | [[projects]] 83 | name = "go.uber.org/zap" 84 | packages = [".","buffer","internal/bufferpool","internal/color","internal/exit","zapcore"] 85 | revision = "35aad584952c3e7020db7b839f6b102de6271f89" 86 | version = "v1.7.1" 87 | 88 | [[projects]] 89 | branch = "master" 90 | name = "golang.org/x/text" 91 | packages = ["encoding","encoding/charmap","encoding/htmlindex","encoding/internal","encoding/internal/identifier","encoding/japanese","encoding/korean","encoding/simplifiedchinese","encoding/traditionalchinese","encoding/unicode","internal/gen","internal/tag","internal/utf8internal","language","runes","transform","unicode/cldr"] 92 | revision = "e19ae1496984b1c655b8044a65c0300a3c878dd3" 93 | 94 | [solve-meta] 95 | analyzer-name = "dep" 96 | analyzer-version = 1 97 | inputs-digest = "e6fe25aee4cb6ddf68f4bf5f821ffa0142e8f502cbcc90c4e930d1005238299f" 98 | solver-name = "gps-cdcl" 99 | solver-version = 1 100 | -------------------------------------------------------------------------------- /Gopkg.toml: -------------------------------------------------------------------------------- 1 | # Refer to 
https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md 2 | # for detailed Gopkg.toml documentation. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mail2elasticsearch [![unlicense](https://img.shields.io/badge/un-license-green.svg?style=flat)](http://unlicense.org) 2 | 3 | A MIME email indexer for [ElasticSearch](https://www.elastic.co/products/elasticsearch), written in Go. 4 | 5 | - preserves mail structure (nested parts) 6 | - tells ElasticSearch to index dates correctly 7 | - deduplicates attachments by storing them in a plain filesystem folder, using hashed contents as the filename (note: potential [attachment content indexing using other tools](https://blog.ambar.cloud/ingesting-documents-pdf-word-txt-etc-into-elasticsearch/)) 8 | - decodes [a ton of character sets](https://github.com/golang/text/blob/master/encoding/htmlindex/tables.go), with [autodetection](https://github.com/gogits/chardet) when needed 9 | - is observable: uses [structured logging](https://github.com/uber-go/zap), optionally exposes profiling and stats over HTTP 10 | - is fast: indexes multiple files in parallel, uses ElasticSearch's bulk index endpoint, [static JSON encoding](https://github.com/mailru/easyjson), SIMD accelerated [BLAKE2b hashing](https://github.com/minio/blake2b-simd) 11 | - is (mostly) robust: tested on a large real-world mail archive, did not crash, most mail was parsed correctly, but some messages were skipped (weird EOFs, quoted-printable errors) 12 | 13 | ## Usage 14 | 15 | ```bash 16 | $ mail2elasticsearch -h # check available flags 17 | $ mail2elasticsearch -init # setup the index 18 | 19 | $ mail2elasticsearch < /mail/cur/some.letter # stdin 20 | $ mail2elasticsearch /mail/cur/some.letter /mail/cur/other.letter # paths 21 | $ mail2elasticsearch /mail/cur # recursive walk (e.g. 
initial bulk indexing) 22 | ``` 23 | 24 | ### Development 25 | 26 | ```bash 27 | $ dep ensure 28 | $ mail2elasticsearch -srvaddr 127.0.0.1:42069 -attachdir /tmp/files ~/testmail/cur 2>&1 | humanlog 29 | $ go-torch -u http://127.0.0.1:42069/ -t 120 --binaryname=mail2elasticsearch 30 | $ expvarmon -ports="http://127.0.0.1:42069" 31 | ``` 32 | 33 | Use 34 | 35 | - [dep](https://github.com/golang/dep) to get dependencies 36 | - [humanlog](https://github.com/aybabtme/humanlog) to read logs in development 37 | - [go-torch](https://github.com/uber/go-torch) to profile with flamegraphs 38 | - [expvarmon](https://github.com/divan/expvarmon) to monitor stats 39 | - [gometalinter](https://github.com/alecthomas/gometalinter) to analyze code 40 | 41 | ## Contributing 42 | 43 | Please feel free to submit pull requests! 44 | 45 | By participating in this project you agree to follow the [Contributor Code of Conduct](http://contributor-covenant.org/version/1/4/). 46 | 47 | [The list of contributors is available on GitHub](https://github.com/myfreeweb/mail2elasticsearch/graphs/contributors). 48 | 49 | ## License 50 | 51 | This is free and unencumbered software released into the public domain. 52 | For more information, please refer to the `UNLICENSE` file or [unlicense.org](http://unlicense.org). 53 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. 
We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /charset.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "io/ioutil" 8 | 9 | "github.com/gogits/chardet" 10 | "go.uber.org/zap" 11 | "golang.org/x/text/encoding/htmlindex" 12 | ) 13 | 14 | var htmlDetector = chardet.NewHtmlDetector() 15 | var textDetector = chardet.NewTextDetector() 16 | 17 | func decodeCharset(charset string, body []byte, description string, ishtml bool, log *zap.SugaredLogger) ([]byte, string, error) { 18 | var err error 19 | if charset == "" { 20 | var detenc *chardet.Result 21 | if ishtml { 22 | detenc, err = htmlDetector.DetectBest(body) 23 | } else { 24 | detenc, err = textDetector.DetectBest(body) 25 | } 26 | if err != nil { 27 | charset = detenc.Charset 28 | log.Infow("Using detected charset", "where", description, "detected", detenc.Charset, 29 | "lang", detenc.Language, "confidence", detenc.Confidence) 30 | } else { 31 | charset = "utf-8" 32 | log.Infow("Could not detect charset, assuming UTF-8", "where", description) 33 | } 34 | } 35 | enc, err := 
htmlindex.Get(charset) 36 | if err != nil || enc == nil { 37 | return nil, charset, err 38 | } 39 | decoded, err := enc.NewDecoder().Bytes(body) 40 | if err != nil { 41 | return nil, charset, err 42 | } 43 | return decoded, charset, nil 44 | } 45 | 46 | func decodeReader(charset string, input io.Reader, log *zap.SugaredLogger) (io.Reader, error) { 47 | body, err := ioutil.ReadAll(input) 48 | if err != nil { 49 | return nil, err 50 | } 51 | decoded, _, err := decodeCharset(charset, body, fmt.Sprintf("header '%s'", body), false, log) 52 | if err != nil { 53 | return nil, err 54 | } 55 | return bytes.NewReader(decoded), nil 56 | } 57 | -------------------------------------------------------------------------------- /headers.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "regexp" 5 | "strings" 6 | ) 7 | 8 | var addrSplitRegex = regexp.MustCompile(`\s*,\s*`) 9 | 10 | func splitAddrs(vals []string) []string { 11 | result := make([]string, 0) 12 | for _, val := range vals { 13 | addrs := addrSplitRegex.Split(val, -1) 14 | result = append(result, addrs...) 15 | } 16 | return result 17 | } 18 | 19 | var addrRegex = regexp.MustCompile(`[\p{L}\d.!#$%&*+\/=?^_{|}~-]+@[\p{L}\d-.]+`) 20 | 21 | func extractOnlyAddrs(vals []string) []string { 22 | result := make([]string, 0) 23 | for _, val := range vals { 24 | result = append(result, addrRegex.FindAllString(val, -1)...) 
// whitespaceRegex matches any run of whitespace, including folded header
// line breaks.
var whitespaceRegex = regexp.MustCompile(`\s+`)

// commentRegex matches an innermost RFC 2822 comment: a parenthesised run
// that contains no further parentheses.
var commentRegex = regexp.MustCompile(`\([^()]*\)`)

// stripSpaceAndComments normalises header values (in practice the Date
// header) for ElasticSearch: RFC 2822 allows folding whitespace and
// parenthesised comments, ElasticSearch/joda-time does not.
//
// Every value has its comments removed, each whitespace run collapsed to a
// single space, and leading/trailing whitespace trimmed. The result always
// has the same length as vals and is never nil.
func stripSpaceAndComments(vals []string) []string {
	result := make([]string, 0, len(vals))
	for _, val := range vals {
		// Comments may nest, so peel them off innermost-first until a pass
		// changes nothing; a single pass would leave the outer tail behind.
		for {
			stripped := commentRegex.ReplaceAllString(val, "")
			if stripped == val {
				break
			}
			val = stripped
		}
		val = whitespaceRegex.ReplaceAllString(val, " ")
		result = append(result, strings.TrimSpace(val))
	}
	return result
}
"github.com/minio/blake2b-simd" 29 | "github.com/myfreeweb/go-email/email" 30 | elastic "github.com/olivere/elastic" 31 | zap "go.uber.org/zap" 32 | ) 33 | 34 | var attachdir = flag.String("attachdir", "files", "path to the attachments directory") 35 | var elasticUrl = flag.String("elastic", "http://127.0.0.1:9200", "URL of the ElasticSearch server") 36 | var elasticIndex = flag.String("index", "mail", "name of the ElasticSearch index") 37 | var doInit = flag.Bool("init", false, "whether to initialize the index instead of indexing mail") 38 | var srvAddr = flag.String("srvaddr", "", "address for the pprof/expvar server to listen on") 39 | 40 | const indexSettings string = `{ 41 | "settings": { 42 | "index": { 43 | "sort.field": "h.Date", 44 | "sort.order": "desc", 45 | "refresh_interval": "15s" 46 | } 47 | }, 48 | "mappings": { 49 | "msg": { 50 | "dynamic": false, 51 | "properties": { 52 | "h": { 53 | "dynamic": false, 54 | "properties": { 55 | "Date": { "type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z||dd MMM yyyy HH:mm:ss Z||dd MMM yyyy HH:mm:ss||dd MMM yyyy HH:mm", "ignore_malformed": true }, 56 | "Subject": { "type": "text" }, 57 | "From": { "type": "text" }, 58 | "To": { "type": "text" }, 59 | "Cc": { "type": "text" }, 60 | "S-From": { "type": "keyword", "ignore_above": 10922 }, 61 | "S-To": { "type": "keyword", "ignore_above": 10922 }, 62 | "Reply-To": { "type": "text" }, 63 | "Thread-Index": { "type": "keyword", "ignore_above": 10922 }, 64 | "In-Reply-To": { "type": "keyword", "ignore_above": 10922 }, 65 | "References": { "type": "keyword", "ignore_above": 10922 } 66 | } 67 | }, 68 | "t": { "type": "text" }, 69 | "p": { 70 | "dynamic": false, 71 | "properties": { 72 | "t": { "type": "text" }, 73 | "h": { 74 | "dynamic": false, 75 | "properties": { 76 | "Content-Disposition": { "type": "text" } 77 | } 78 | }, 79 | "p": { 80 | "dynamic": false, 81 | "properties": { 82 | "t": { "type": "text" }, 83 | "h": { 84 | "dynamic": false, 85 | "properties": { 86 | 
"Content-Disposition": { "type": "text" } 87 | } 88 | }, 89 | "p": { 90 | "dynamic": false, 91 | "properties": { 92 | "t": { "type": "text" }, 93 | "h": { 94 | "dynamic": false, 95 | "properties": { 96 | "Content-Disposition": { "type": "text" } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | } 103 | } 104 | }, 105 | "sub": { 106 | "dynamic": false, 107 | "properties": { 108 | "t": { "type": "text" } 109 | } 110 | } 111 | } 112 | } 113 | } 114 | }` 115 | 116 | //easyjson:json 117 | type JMessage struct { 118 | Id string 119 | Header email.Header `json:"h,omitempty"` 120 | Preamble []byte `json:"pre,omitempty"` 121 | Epilogue []byte `json:"epi,omitempty"` 122 | Parts []*JMessage `json:"p,omitempty"` 123 | SubMessage *JMessage `json:"sub,omitempty"` 124 | TextBody string `json:"t,omitempty"` 125 | Attachment string `json:"a,omitempty"` 126 | } 127 | 128 | func jsonifyMsg(msg email.Message, log *zap.SugaredLogger) JMessage { 129 | log = log.With("msgid", msg.Header.Get("Message-Id")) 130 | wordDecoder := new(mime.WordDecoder) 131 | wordDecoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { 132 | return decodeReader(charset, input, log) 133 | } 134 | result := JMessage{ 135 | Id: msg.Header.Get("Message-Id"), 136 | Header: make(map[string][]string), 137 | Preamble: msg.Preamble, 138 | Epilogue: msg.Epilogue, 139 | Parts: []*JMessage{}, 140 | SubMessage: nil, 141 | TextBody: "", 142 | Attachment: "", 143 | } 144 | //// Headers 145 | for k, vs := range msg.Header { 146 | kk := textproto.CanonicalMIMEHeaderKey(k) 147 | result.Header[kk] = vs 148 | for i, v := range vs { 149 | dec, err := wordDecoder.DecodeHeader(v) 150 | if err != nil { 151 | log.Warnw("Could not decode header", "name", kk, "index", i, "value", v, "err", err) 152 | continue 153 | } 154 | result.Header[kk][i] = dec 155 | } 156 | } 157 | delete(result.Header, "Message-Id") 158 | result.Header["Date"] = stripSpaceAndComments(result.Header["Date"]) 159 | result.Header["S-From"] = 
extractOnlyAddrs(result.Header["From"]) 160 | result.Header["S-To"] = append(append( 161 | extractOnlyAddrs(result.Header["To"]), 162 | extractOnlyAddrs(result.Header["Cc"])...), 163 | extractOnlyAddrs(result.Header["Bcc"])...) 164 | result.Header["From"] = splitAddrs(result.Header["From"]) 165 | result.Header["To"] = splitAddrs(result.Header["To"]) 166 | result.Header["Cc"] = splitAddrs(result.Header["Cc"]) 167 | result.Header["Bcc"] = splitAddrs(result.Header["Bcc"]) 168 | result.Header["Return-Path"] = splitAddrs(result.Header["Return-Path"]) 169 | result.Header["Delivered-To"] = splitAddrs(result.Header["Delivered-To"]) 170 | result.Header["X-Failed-Recipients"] = splitAddrs(result.Header["X-Failed-Recipients"]) 171 | result.Header["Thread-Index"] = splitAddrs(result.Header["Thread-Index"]) 172 | result.Header["In-Reply-To"] = splitAddrs(result.Header["In-Reply-To"]) 173 | result.Header["References"] = splitAddrs(result.Header["References"]) 174 | //// Parts 175 | if msg.SubMessage != nil { 176 | submsg := jsonifyMsg(*msg.SubMessage, log.With("submsg", true)) 177 | result.SubMessage = &submsg 178 | } 179 | for partidx, part := range msg.Parts { 180 | if part != nil { 181 | partmsg := jsonifyMsg(*part, log.With("partidx", partidx)) 182 | result.Parts = append(result.Parts, &partmsg) 183 | } 184 | } 185 | //// Body 186 | ctype := result.Header.Get("Content-Type") 187 | //// Body Transfer-Encoding 188 | if result.Header.Get("Content-Transfer-Encoding") == "quoted-printable" { 189 | decBody, err := ioutil.ReadAll(quotedprintable.NewReader(bytes.NewReader(msg.Body))) 190 | if err != nil { 191 | log.Warnw("Could not decode quoted-printable, treating like an attachment", "err", err) 192 | goto file 193 | } 194 | msg.Body = decBody 195 | } else if result.Header.Get("Content-Transfer-Encoding") == "base64" { 196 | decBody := make([]byte, base64.StdEncoding.DecodedLen(len(msg.Body))) 197 | n, err := base64.StdEncoding.Decode(decBody, msg.Body) 198 | if err != nil { 199 | 
log.Warnw("Could not decode base64, treating like an attachment", "nbytes", n, "err", err) 200 | goto file 201 | } 202 | msg.Body = decBody 203 | } 204 | //// Body Charset 205 | if strings.HasPrefix(ctype, "text") && !strings.Contains(result.Header.Get("Content-Disposition"), "attachment") { 206 | mediatype, params, err := mime.ParseMediaType(ctype) 207 | if err != nil { 208 | if strings.Contains(ctype, "html") { 209 | mediatype = "text/html" 210 | } else { 211 | mediatype = "text/plain" 212 | } 213 | params = make(map[string]string) 214 | log.Warnw("Unreadable Content-Type", "ctype", ctype, "err", err, "assumed", mediatype) 215 | } 216 | decoded, charset, err := decodeCharset( 217 | params["charset"], 218 | msg.Body, 219 | fmt.Sprintf("Content-Type: %s", ctype), 220 | strings.Contains(mediatype, "html"), 221 | log) 222 | if err != nil { 223 | log.Warnw("Could not decode charset, treating like an attachment", "charset", charset, "err", err) 224 | goto file 225 | } 226 | result.TextBody = string(decoded) 227 | return result 228 | } 229 | file: 230 | hash := blake2b.Sum256(msg.Body) 231 | path := filepath.Join(*attachdir, hex.EncodeToString(hash[:])) 232 | log = log.With("path", path) 233 | result.Attachment = path 234 | if _, err := os.Stat(path); !os.IsNotExist(err) { 235 | log.Debug("Attachment already exists") 236 | return result 237 | } 238 | f, err := safefile.Create(path, 0444) 239 | if err != nil { 240 | log.Errorw("Could not open file for attachment", "err", err) 241 | return result 242 | } 243 | defer f.Close() 244 | _, err = f.Write(msg.Body) 245 | if err != nil { 246 | log.Errorw("Could not write attachment", "err", err) 247 | return result 248 | } 249 | err = f.Commit() 250 | if err != nil { 251 | log.Errorw("Could not commit attachment", "err", err) 252 | return result 253 | } 254 | log.Info("Saved attachment") 255 | return result 256 | } 257 | 258 | func process(msgtext io.Reader, log *zap.SugaredLogger) (*JMessage, error) { 259 | msg, err := 
email.ParseMessage(msgtext) 260 | if err != nil { 261 | return nil, err 262 | } 263 | jmsg := jsonifyMsg(*msg, log) 264 | return &jmsg, nil 265 | } 266 | 267 | func main() { 268 | flag.Parse() 269 | logger, _ := zap.NewProduction() 270 | defer logger.Sync() 271 | log := logger.Sugar() 272 | if *srvAddr != "" { 273 | go func() { 274 | log.Infow("pprof/expvar server started", "result", http.ListenAndServe(*srvAddr, nil)) 275 | }() 276 | } 277 | ctx := context.Background() 278 | client, err := elastic.NewClient( 279 | elastic.SetURL(*elasticUrl), 280 | ) 281 | if err != nil { 282 | log.Fatalw("Could not create ElasticSearch client", "err", err) 283 | } 284 | if *doInit { 285 | res, err := client.CreateIndex(*elasticIndex).BodyString(indexSettings).Do(ctx) 286 | if err != nil { 287 | log.Fatalw("Could not initialize index", "err", err) 288 | } else { 289 | log.Infow("Created index", "result", res) 290 | } 291 | } else if len(flag.Args()) == 0 || flag.Arg(0) == "-" { 292 | jmsg, err := process(bufio.NewReader(os.Stdin), log.With("filename", "stdin")) 293 | if err != nil { 294 | log.Fatalw("Could not process", "err", err) 295 | } 296 | j, err := easyjson.Marshal(*jmsg) 297 | if err != nil { 298 | log.Fatalw("Could not serialize JSON", "err", err) 299 | } 300 | _, err = client.Index().Index(*elasticIndex).Type("msg").Id(jmsg.Id).BodyString(string(j)).Do(ctx) 301 | if err != nil { 302 | log.Fatalw("Could not index", "err", err) 303 | } 304 | } else { 305 | proc, err := client.BulkProcessor().Name("mail2elasticsearch").Do(ctx) 306 | if err != nil { 307 | log.Fatalw("Could not start bulk processor", "err", err) 308 | } 309 | defer proc.Close() 310 | var wg sync.WaitGroup 311 | tasks := make(chan string) 312 | for i := 0; i < runtime.GOMAXPROCS(0); i++ { 313 | go func() { 314 | for { 315 | var j []byte 316 | var jmsg *JMessage 317 | filename := <-tasks 318 | log := log.With("filename", filename) 319 | log.Debug("Processing start") 320 | file, err := os.Open(filename) 321 | if 
err != nil { 322 | log.Errorw("Could not open file", "err", err) 323 | goto done 324 | } 325 | jmsg, err = process(bufio.NewReader(file), log) 326 | if err != nil { 327 | log.Errorw("Could not process", "err", err) 328 | goto done 329 | } 330 | j, err = easyjson.Marshal(*jmsg) 331 | if err != nil { 332 | log.Errorw("Could not serialize JSON", "err", err) 333 | goto done 334 | } 335 | proc.Add(elastic.NewBulkIndexRequest().Index(*elasticIndex).Type("msg").Id(jmsg.Id).Doc(string(j))) 336 | log.Debug("Processing end") 337 | done: 338 | wg.Done() 339 | } 340 | }() 341 | } 342 | for _, filename := range flag.Args() { 343 | f, err := os.Stat(filename) 344 | if err != nil { 345 | log.Fatalw("Could not stat file", "err", err, "filename", filename) 346 | } 347 | if f.Mode().IsDir() { 348 | err = filepath.Walk(filename, func(path string, _ os.FileInfo, err error) error { 349 | if err != nil { 350 | return err 351 | } 352 | f, err := os.Stat(path) 353 | if err != nil { 354 | log.Fatalw("Could not stat file", "err", err, "filename", path) 355 | } 356 | if f.Mode().IsRegular() { 357 | wg.Add(1) 358 | tasks <- path 359 | } else { 360 | log.Infow("Not a file", "filename", path) 361 | } 362 | return nil 363 | }) 364 | if err != nil { 365 | log.Fatalw("Could not walk directory", "err", err, "filename", filename) 366 | } 367 | } else { 368 | wg.Add(1) 369 | tasks <- filename 370 | } 371 | } 372 | wg.Wait() 373 | } 374 | } 375 | -------------------------------------------------------------------------------- /main_easyjson.go: -------------------------------------------------------------------------------- 1 | // Code generated by easyjson for marshaling/unmarshaling. DO NOT EDIT. 
2 | 3 | package main 4 | 5 | import ( 6 | json "encoding/json" 7 | easyjson "github.com/mailru/easyjson" 8 | jlexer "github.com/mailru/easyjson/jlexer" 9 | jwriter "github.com/mailru/easyjson/jwriter" 10 | email "github.com/myfreeweb/go-email/email" 11 | ) 12 | 13 | // suppress unused package warning 14 | var ( 15 | _ *json.RawMessage 16 | _ *jlexer.Lexer 17 | _ *jwriter.Writer 18 | _ easyjson.Marshaler 19 | ) 20 | 21 | func easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(in *jlexer.Lexer, out *JMessage) { 22 | isTopLevel := in.IsStart() 23 | if in.IsNull() { 24 | if isTopLevel { 25 | in.Consumed() 26 | } 27 | in.Skip() 28 | return 29 | } 30 | in.Delim('{') 31 | for !in.IsDelim('}') { 32 | key := in.UnsafeString() 33 | in.WantColon() 34 | if in.IsNull() { 35 | in.Skip() 36 | in.WantComma() 37 | continue 38 | } 39 | switch key { 40 | case "Id": 41 | out.Id = string(in.String()) 42 | case "h": 43 | if in.IsNull() { 44 | in.Skip() 45 | } else { 46 | in.Delim('{') 47 | if !in.IsDelim('}') { 48 | out.Header = make(email.Header) 49 | } else { 50 | out.Header = nil 51 | } 52 | for !in.IsDelim('}') { 53 | key := string(in.String()) 54 | in.WantColon() 55 | var v1 []string 56 | if in.IsNull() { 57 | in.Skip() 58 | v1 = nil 59 | } else { 60 | in.Delim('[') 61 | if v1 == nil { 62 | if !in.IsDelim(']') { 63 | v1 = make([]string, 0, 4) 64 | } else { 65 | v1 = []string{} 66 | } 67 | } else { 68 | v1 = (v1)[:0] 69 | } 70 | for !in.IsDelim(']') { 71 | var v2 string 72 | v2 = string(in.String()) 73 | v1 = append(v1, v2) 74 | in.WantComma() 75 | } 76 | in.Delim(']') 77 | } 78 | (out.Header)[key] = v1 79 | in.WantComma() 80 | } 81 | in.Delim('}') 82 | } 83 | case "pre": 84 | if in.IsNull() { 85 | in.Skip() 86 | out.Preamble = nil 87 | } else { 88 | out.Preamble = in.Bytes() 89 | } 90 | case "epi": 91 | if in.IsNull() { 92 | in.Skip() 93 | out.Epilogue = nil 94 | } else { 95 | out.Epilogue = in.Bytes() 96 | } 97 | case "p": 98 | if in.IsNull() { 99 | in.Skip() 100 | 
out.Parts = nil 101 | } else { 102 | in.Delim('[') 103 | if out.Parts == nil { 104 | if !in.IsDelim(']') { 105 | out.Parts = make([]*JMessage, 0, 8) 106 | } else { 107 | out.Parts = []*JMessage{} 108 | } 109 | } else { 110 | out.Parts = (out.Parts)[:0] 111 | } 112 | for !in.IsDelim(']') { 113 | var v5 *JMessage 114 | if in.IsNull() { 115 | in.Skip() 116 | v5 = nil 117 | } else { 118 | if v5 == nil { 119 | v5 = new(JMessage) 120 | } 121 | (*v5).UnmarshalEasyJSON(in) 122 | } 123 | out.Parts = append(out.Parts, v5) 124 | in.WantComma() 125 | } 126 | in.Delim(']') 127 | } 128 | case "sub": 129 | if in.IsNull() { 130 | in.Skip() 131 | out.SubMessage = nil 132 | } else { 133 | if out.SubMessage == nil { 134 | out.SubMessage = new(JMessage) 135 | } 136 | (*out.SubMessage).UnmarshalEasyJSON(in) 137 | } 138 | case "t": 139 | out.TextBody = string(in.String()) 140 | case "a": 141 | out.Attachment = string(in.String()) 142 | default: 143 | in.SkipRecursive() 144 | } 145 | in.WantComma() 146 | } 147 | in.Delim('}') 148 | if isTopLevel { 149 | in.Consumed() 150 | } 151 | } 152 | func easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(out *jwriter.Writer, in JMessage) { 153 | out.RawByte('{') 154 | first := true 155 | _ = first 156 | if !first { 157 | out.RawByte(',') 158 | } 159 | first = false 160 | out.RawString("\"Id\":") 161 | out.String(string(in.Id)) 162 | if len(in.Header) != 0 { 163 | if !first { 164 | out.RawByte(',') 165 | } 166 | first = false 167 | out.RawString("\"h\":") 168 | if in.Header == nil && (out.Flags&jwriter.NilMapAsEmpty) == 0 { 169 | out.RawString(`null`) 170 | } else { 171 | out.RawByte('{') 172 | v6First := true 173 | for v6Name, v6Value := range in.Header { 174 | if !v6First { 175 | out.RawByte(',') 176 | } 177 | v6First = false 178 | out.String(string(v6Name)) 179 | out.RawByte(':') 180 | if v6Value == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 { 181 | out.RawString("null") 182 | } else { 183 | out.RawByte('[') 184 | for v7, v8 := range 
v6Value { 185 | if v7 > 0 { 186 | out.RawByte(',') 187 | } 188 | out.String(string(v8)) 189 | } 190 | out.RawByte(']') 191 | } 192 | } 193 | out.RawByte('}') 194 | } 195 | } 196 | if len(in.Preamble) != 0 { 197 | if !first { 198 | out.RawByte(',') 199 | } 200 | first = false 201 | out.RawString("\"pre\":") 202 | out.Base64Bytes(in.Preamble) 203 | } 204 | if len(in.Epilogue) != 0 { 205 | if !first { 206 | out.RawByte(',') 207 | } 208 | first = false 209 | out.RawString("\"epi\":") 210 | out.Base64Bytes(in.Epilogue) 211 | } 212 | if len(in.Parts) != 0 { 213 | if !first { 214 | out.RawByte(',') 215 | } 216 | first = false 217 | out.RawString("\"p\":") 218 | if in.Parts == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 { 219 | out.RawString("null") 220 | } else { 221 | out.RawByte('[') 222 | for v13, v14 := range in.Parts { 223 | if v13 > 0 { 224 | out.RawByte(',') 225 | } 226 | if v14 == nil { 227 | out.RawString("null") 228 | } else { 229 | (*v14).MarshalEasyJSON(out) 230 | } 231 | } 232 | out.RawByte(']') 233 | } 234 | } 235 | if in.SubMessage != nil { 236 | if !first { 237 | out.RawByte(',') 238 | } 239 | first = false 240 | out.RawString("\"sub\":") 241 | if in.SubMessage == nil { 242 | out.RawString("null") 243 | } else { 244 | (*in.SubMessage).MarshalEasyJSON(out) 245 | } 246 | } 247 | if in.TextBody != "" { 248 | if !first { 249 | out.RawByte(',') 250 | } 251 | first = false 252 | out.RawString("\"t\":") 253 | out.String(string(in.TextBody)) 254 | } 255 | if in.Attachment != "" { 256 | if !first { 257 | out.RawByte(',') 258 | } 259 | first = false 260 | out.RawString("\"a\":") 261 | out.String(string(in.Attachment)) 262 | } 263 | out.RawByte('}') 264 | } 265 | 266 | // MarshalJSON supports json.Marshaler interface 267 | func (v JMessage) MarshalJSON() ([]byte, error) { 268 | w := jwriter.Writer{} 269 | easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(&w, v) 270 | return w.Buffer.BuildBytes(), w.Error 271 | } 272 | 273 | // MarshalEasyJSON supports 
easyjson.Marshaler interface 274 | func (v JMessage) MarshalEasyJSON(w *jwriter.Writer) { 275 | easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(w, v) 276 | } 277 | 278 | // UnmarshalJSON supports json.Unmarshaler interface 279 | func (v *JMessage) UnmarshalJSON(data []byte) error { 280 | r := jlexer.Lexer{Data: data} 281 | easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(&r, v) 282 | return r.Error() 283 | } 284 | 285 | // UnmarshalEasyJSON supports easyjson.Unmarshaler interface 286 | func (v *JMessage) UnmarshalEasyJSON(l *jlexer.Lexer) { 287 | easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(l, v) 288 | } 289 | --------------------------------------------------------------------------------