├── .gitignore
├── CODE_OF_CONDUCT.md
├── Gopkg.lock
├── Gopkg.toml
├── README.md
├── UNLICENSE
├── charset.go
├── headers.go
├── headers_test.go
├── main.go
└── main_easyjson.go
/.gitignore:
--------------------------------------------------------------------------------
1 | /mail2elasticsearch
2 | /vendor
3 | /TODO
4 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project owner at greg@unrelenting.technology. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project owner is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 |
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 |
--------------------------------------------------------------------------------
/Gopkg.lock:
--------------------------------------------------------------------------------
1 | # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'.
2 |
3 |
4 | [[projects]]
5 | name = "github.com/davecgh/go-spew"
6 | packages = ["spew"]
7 | revision = "346938d642f2ec3594ed81d874461961cd0faa76"
8 | version = "v1.1.0"
9 |
10 | [[projects]]
11 | branch = "master"
12 | name = "github.com/dchest/safefile"
13 | packages = ["."]
14 | revision = "855e8d98f1852d48dde521e0522408d1fe7e836a"
15 |
16 | [[projects]]
17 | branch = "master"
18 | name = "github.com/gogits/chardet"
19 | packages = ["."]
20 | revision = "2404f777256163ea3eadb273dada5dcb037993c0"
21 |
22 | [[projects]]
23 | branch = "master"
24 | name = "github.com/mailru/easyjson"
25 | packages = [".","buffer","jlexer","jwriter"]
26 | revision = "32fa128f234d041f196a9f3e0fea5ac9772c08e1"
27 |
28 | [[projects]]
29 | branch = "master"
30 | name = "github.com/minio/blake2b-simd"
31 | packages = ["."]
32 | revision = "3f5f724cb5b182a5c278d6d3d55b40e7f8c2efb4"
33 |
34 | [[projects]]
35 | branch = "master"
36 | name = "github.com/myfreeweb/go-base64-simd"
37 | packages = ["base64"]
38 | revision = "a996ba366c4249e6fd8967e1257ba0752b9e6628"
39 |
40 | [[projects]]
41 | branch = "master"
42 | name = "github.com/myfreeweb/go-email"
43 | packages = ["email"]
44 | revision = "5ebbe4b970f65566dfaae38e30db4f5b6e8f8b2f"
45 |
46 | [[projects]]
47 | name = "github.com/olivere/elastic"
48 | packages = [".","config","uritemplates"]
49 | revision = "2963eb09b89294356e1a826068f33e5a44326ac0"
50 | version = "v6.1.5"
51 |
52 | [[projects]]
53 | name = "github.com/pkg/errors"
54 | packages = ["."]
55 | revision = "645ef00459ed84a119197bfb8d8205042c6df63d"
56 | version = "v0.8.0"
57 |
58 | [[projects]]
59 | name = "github.com/pmezard/go-difflib"
60 | packages = ["difflib"]
61 | revision = "792786c7400a136282c1664665ae0a8db921c6c2"
62 | version = "v1.0.0"
63 |
64 | [[projects]]
65 | name = "github.com/stretchr/testify"
66 | packages = ["assert"]
67 | revision = "12b6f73e6084dad08a7c6e575284b177ecafbc71"
68 | version = "v1.2.1"
69 |
70 | [[projects]]
71 | name = "go.uber.org/atomic"
72 | packages = ["."]
73 | revision = "8474b86a5a6f79c443ce4b2992817ff32cf208b8"
74 | version = "v1.3.1"
75 |
76 | [[projects]]
77 | name = "go.uber.org/multierr"
78 | packages = ["."]
79 | revision = "3c4937480c32f4c13a875a1829af76c98ca3d40a"
80 | version = "v1.1.0"
81 |
82 | [[projects]]
83 | name = "go.uber.org/zap"
84 | packages = [".","buffer","internal/bufferpool","internal/color","internal/exit","zapcore"]
85 | revision = "35aad584952c3e7020db7b839f6b102de6271f89"
86 | version = "v1.7.1"
87 |
88 | [[projects]]
89 | branch = "master"
90 | name = "golang.org/x/text"
91 | packages = ["encoding","encoding/charmap","encoding/htmlindex","encoding/internal","encoding/internal/identifier","encoding/japanese","encoding/korean","encoding/simplifiedchinese","encoding/traditionalchinese","encoding/unicode","internal/gen","internal/tag","internal/utf8internal","language","runes","transform","unicode/cldr"]
92 | revision = "e19ae1496984b1c655b8044a65c0300a3c878dd3"
93 |
94 | [solve-meta]
95 | analyzer-name = "dep"
96 | analyzer-version = 1
97 | inputs-digest = "e6fe25aee4cb6ddf68f4bf5f821ffa0142e8f502cbcc90c4e930d1005238299f"
98 | solver-name = "gps-cdcl"
99 | solver-version = 1
100 |
--------------------------------------------------------------------------------
/Gopkg.toml:
--------------------------------------------------------------------------------
1 | # Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md
2 | # for detailed Gopkg.toml documentation.
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mail2elasticsearch [Unlicense](http://unlicense.org)
2 |
3 | A MIME email indexer for [ElasticSearch](https://www.elastic.co/products/elasticsearch), written in Go.
4 |
5 | - preserves mail structure (nested parts)
6 | - tells ElasticSearch to index dates correctly
7 | - deduplicates attachments by storing them in a plain filesystem folder, using hashed contents as the filename (note: potential [attachment content indexing using other tools](https://blog.ambar.cloud/ingesting-documents-pdf-word-txt-etc-into-elasticsearch/))
8 | - decodes [a ton of character sets](https://github.com/golang/text/blob/master/encoding/htmlindex/tables.go), with [autodetection](https://github.com/gogits/chardet) when needed
9 | - is observable: uses [structured logging](https://github.com/uber-go/zap), optionally exposes profiling and stats over HTTP
10 | - is fast: indexes multiple files in parallel, uses ElasticSearch's bulk index endpoint, [static JSON encoding](https://github.com/mailru/easyjson), SIMD accelerated [BLAKE2b hashing](https://github.com/minio/blake2b-simd)
11 | - is (mostly) robust: tested on a large real-world mail archive, did not crash, most mail was parsed correctly, but some messages were skipped (weird EOFs, quoted-printable errors)
12 |
13 | ## Usage
14 |
15 | ```bash
16 | $ mail2elasticsearch -h # check available flags
17 | $ mail2elasticsearch -init # setup the index
18 |
19 | $ mail2elasticsearch < /mail/cur/some.letter # stdin
20 | $ mail2elasticsearch /mail/cur/some.letter /mail/cur/other.letter # paths
21 | $ mail2elasticsearch /mail/cur # recursive walk (e.g. initial bulk indexing)
22 | ```
23 |
24 | ### Development
25 |
26 | ```bash
27 | $ dep ensure
28 | $ mail2elasticsearch -srvaddr 127.0.0.1:42069 -attachdir /tmp/files ~/testmail/cur 2>&1 | humanlog
29 | $ go-torch -u http://127.0.0.1:42069/ -t 120 --binaryname=mail2elasticsearch
30 | $ expvarmon -ports="http://127.0.0.1:42069"
31 | ```
32 |
33 | Use
34 |
35 | - [dep](https://github.com/golang/dep) to get dependencies
36 | - [humanlog](https://github.com/aybabtme/humanlog) to read logs in development
37 | - [go-torch](https://github.com/uber/go-torch) to profile with flamegraphs
38 | - [expvarmon](https://github.com/divan/expvarmon) to monitor stats
39 | - [gometalinter](https://github.com/alecthomas/gometalinter) to analyze code
40 |
41 | ## Contributing
42 |
43 | Please feel free to submit pull requests!
44 |
45 | By participating in this project you agree to follow the [Contributor Code of Conduct](http://contributor-covenant.org/version/1/4/).
46 |
47 | [The list of contributors is available on GitHub](https://github.com/myfreeweb/mail2elasticsearch/graphs/contributors).
48 |
49 | ## License
50 |
51 | This is free and unencumbered software released into the public domain.
52 | For more information, please refer to the `UNLICENSE` file or [unlicense.org](http://unlicense.org).
53 |
--------------------------------------------------------------------------------
/UNLICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to <http://unlicense.org/>
25 |
--------------------------------------------------------------------------------
/charset.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 | "io"
7 | "io/ioutil"
8 |
9 | "github.com/gogits/chardet"
10 | "go.uber.org/zap"
11 | "golang.org/x/text/encoding/htmlindex"
12 | )
13 |
14 | var htmlDetector = chardet.NewHtmlDetector()
15 | var textDetector = chardet.NewTextDetector()
16 |
17 | func decodeCharset(charset string, body []byte, description string, ishtml bool, log *zap.SugaredLogger) ([]byte, string, error) {
18 | var err error
19 | if charset == "" {
20 | var detenc *chardet.Result
21 | if ishtml {
22 | detenc, err = htmlDetector.DetectBest(body)
23 | } else {
24 | detenc, err = textDetector.DetectBest(body)
25 | }
26 | if err != nil {
27 | charset = detenc.Charset
28 | log.Infow("Using detected charset", "where", description, "detected", detenc.Charset,
29 | "lang", detenc.Language, "confidence", detenc.Confidence)
30 | } else {
31 | charset = "utf-8"
32 | log.Infow("Could not detect charset, assuming UTF-8", "where", description)
33 | }
34 | }
35 | enc, err := htmlindex.Get(charset)
36 | if err != nil || enc == nil {
37 | return nil, charset, err
38 | }
39 | decoded, err := enc.NewDecoder().Bytes(body)
40 | if err != nil {
41 | return nil, charset, err
42 | }
43 | return decoded, charset, nil
44 | }
45 |
46 | func decodeReader(charset string, input io.Reader, log *zap.SugaredLogger) (io.Reader, error) {
47 | body, err := ioutil.ReadAll(input)
48 | if err != nil {
49 | return nil, err
50 | }
51 | decoded, _, err := decodeCharset(charset, body, fmt.Sprintf("header '%s'", body), false, log)
52 | if err != nil {
53 | return nil, err
54 | }
55 | return bytes.NewReader(decoded), nil
56 | }
57 |
--------------------------------------------------------------------------------
/headers.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 | )
7 |
// addrSplitRegex matches a comma together with any whitespace around it.
var addrSplitRegex = regexp.MustCompile(`\s*,\s*`)

// splitAddrs breaks every header value into its comma-separated items and
// returns all items in order as one flat slice. The result is never nil.
func splitAddrs(vals []string) []string {
	items := []string{}
	for i := range vals {
		pieces := addrSplitRegex.Split(vals[i], -1)
		items = append(items, pieces...)
	}
	return items
}
18 |
// addrRegex matches a bare e-mail address: a run of letters, digits and
// permitted punctuation, an '@', then letters, digits, dashes and dots.
var addrRegex = regexp.MustCompile(`[\p{L}\d.!#$%&*+\/=?^_{|}~-]+@[\p{L}\d-.]+`)

// extractOnlyAddrs collects every address matched by addrRegex in the given
// header values, dropping display names and other surrounding text.
// The result is never nil.
func extractOnlyAddrs(vals []string) []string {
	found := []string{}
	for i := range vals {
		matches := addrRegex.FindAllString(vals[i], -1)
		found = append(found, matches...)
	}
	return found
}
28 |
var (
	// whitespaceRegex matches any run of whitespace.
	whitespaceRegex = regexp.MustCompile(`\s+`)
	// commentRegex matches a parenthesized comment without nesting.
	commentRegex = regexp.MustCompile(`\([^\)]*\)`)
)

// stripSpaceAndComments normalizes header values (dates in particular):
// RFC 2822 allows whitespace and comments, ElasticSearch/joda-time does not.
// For each value, comments are removed first, whitespace runs are collapsed
// to a single space, and the ends are trimmed. The result is never nil.
func stripSpaceAndComments(vals []string) []string {
	cleaned := make([]string, len(vals))
	for i, raw := range vals {
		withoutComments := commentRegex.ReplaceAllString(raw, "")
		collapsed := whitespaceRegex.ReplaceAllString(withoutComments, " ")
		cleaned[i] = strings.TrimSpace(collapsed)
	}
	return cleaned
}
43 |
--------------------------------------------------------------------------------
/headers_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "github.com/stretchr/testify/assert"
5 | "testing"
6 | )
7 |
8 | func TestStripSpaceAndComments(t *testing.T) {
9 | assert.Equal(
10 | t,
11 | []string{ "Thu, 13 Feb 1969 23:32 -0330" },
12 | stripSpaceAndComments([]string{ ` Thu,
13 | 13
14 | Feb
15 | 1969
16 | 23:32
17 | -0330 (Newfoundland Time)` }),
18 | )
19 | }
20 | func TestSplitAddrs(t *testing.T) {
21 | assert.Equal(
22 | t,
23 | []string{ "hello world ", "nice@me.me (test)", "hi@example.com" },
24 | splitAddrs([]string{ "hello world , nice@me.me (test),hi@example.com" }),
25 | )
26 | }
27 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | //go:generate easyjson main.go
2 | package main
3 |
4 | import (
5 | "bufio"
6 | "bytes"
7 | "context"
8 | "encoding/base64"
9 | "encoding/hex"
10 | _ "expvar"
11 | "flag"
12 | "fmt"
13 | "io"
14 | "io/ioutil"
15 | "mime"
16 | "mime/quotedprintable"
17 | "net/http"
18 | _ "net/http/pprof"
19 | "net/textproto"
20 | "os"
21 | "path/filepath"
22 | "runtime"
23 | "strings"
24 | "sync"
25 |
26 | "github.com/dchest/safefile"
27 | "github.com/mailru/easyjson"
28 | blake2b "github.com/minio/blake2b-simd"
29 | "github.com/myfreeweb/go-email/email"
30 | elastic "github.com/olivere/elastic"
31 | zap "go.uber.org/zap"
32 | )
33 |
// Command-line flags; their values are valid after flag.Parse has run.
var (
	attachdir    = flag.String("attachdir", "files", "path to the attachments directory")
	elasticUrl   = flag.String("elastic", "http://127.0.0.1:9200", "URL of the ElasticSearch server")
	elasticIndex = flag.String("index", "mail", "name of the ElasticSearch index")
	doInit       = flag.Bool("init", false, "whether to initialize the index instead of indexing mail")
	srvAddr      = flag.String("srvaddr", "", "address for the pprof/expvar server to listen on")
)
39 |
// indexSettings is the JSON body sent when creating the index (the -init
// mode of main). It sorts the index by the "h.Date" field in descending
// order, sets a 15s refresh interval, and declares explicit, non-dynamic
// mappings for the "msg" type: selected headers under "h", the text body
// "t", text and Content-Disposition for parts "p" nested up to three levels
// deep, and the text of a sub-message "sub".
const indexSettings string = `{
"settings": {
"index": {
"sort.field": "h.Date",
"sort.order": "desc",
"refresh_interval": "15s"
}
},
"mappings": {
"msg": {
"dynamic": false,
"properties": {
"h": {
"dynamic": false,
"properties": {
"Date": { "type": "date", "format": "EEE, dd MMM yyyy HH:mm:ss Z||dd MMM yyyy HH:mm:ss Z||dd MMM yyyy HH:mm:ss||dd MMM yyyy HH:mm", "ignore_malformed": true },
"Subject": { "type": "text" },
"From": { "type": "text" },
"To": { "type": "text" },
"Cc": { "type": "text" },
"S-From": { "type": "keyword", "ignore_above": 10922 },
"S-To": { "type": "keyword", "ignore_above": 10922 },
"Reply-To": { "type": "text" },
"Thread-Index": { "type": "keyword", "ignore_above": 10922 },
"In-Reply-To": { "type": "keyword", "ignore_above": 10922 },
"References": { "type": "keyword", "ignore_above": 10922 }
}
},
"t": { "type": "text" },
"p": {
"dynamic": false,
"properties": {
"t": { "type": "text" },
"h": {
"dynamic": false,
"properties": {
"Content-Disposition": { "type": "text" }
}
},
"p": {
"dynamic": false,
"properties": {
"t": { "type": "text" },
"h": {
"dynamic": false,
"properties": {
"Content-Disposition": { "type": "text" }
}
},
"p": {
"dynamic": false,
"properties": {
"t": { "type": "text" },
"h": {
"dynamic": false,
"properties": {
"Content-Disposition": { "type": "text" }
}
}
}
}
}
}
}
},
"sub": {
"dynamic": false,
"properties": {
"t": { "type": "text" }
}
}
}
}
}
}`
115 |
// JMessage is the JSON document indexed for a message, or for one MIME part
// or sub-message of it. Its marshaling code lives in main_easyjson.go and is
// generated by easyjson (see the go:generate directive at the top of this
// file).
//
//easyjson:json
type JMessage struct {
	// Id is the Message-Id header value; it has no tag and is serialized as "Id".
	Id string
	// Header holds the decoded headers, keyed by canonical MIME header name.
	Header email.Header `json:"h,omitempty"`
	// Preamble and Epilogue are copied verbatim from the parsed message.
	Preamble []byte `json:"pre,omitempty"`
	Epilogue []byte `json:"epi,omitempty"`
	// Parts are the converted child parts, in their original order.
	Parts []*JMessage `json:"p,omitempty"`
	// SubMessage is the converted sub-message, if the parsed message has one.
	SubMessage *JMessage `json:"sub,omitempty"`
	// TextBody is the charset-decoded body of a textual, non-attachment part.
	TextBody string `json:"t,omitempty"`
	// Attachment is the path of the file the body was stored in, when it was
	// not indexed as text.
	Attachment string `json:"a,omitempty"`
}
127 |
128 | func jsonifyMsg(msg email.Message, log *zap.SugaredLogger) JMessage {
129 | log = log.With("msgid", msg.Header.Get("Message-Id"))
130 | wordDecoder := new(mime.WordDecoder)
131 | wordDecoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
132 | return decodeReader(charset, input, log)
133 | }
134 | result := JMessage{
135 | Id: msg.Header.Get("Message-Id"),
136 | Header: make(map[string][]string),
137 | Preamble: msg.Preamble,
138 | Epilogue: msg.Epilogue,
139 | Parts: []*JMessage{},
140 | SubMessage: nil,
141 | TextBody: "",
142 | Attachment: "",
143 | }
144 | //// Headers
145 | for k, vs := range msg.Header {
146 | kk := textproto.CanonicalMIMEHeaderKey(k)
147 | result.Header[kk] = vs
148 | for i, v := range vs {
149 | dec, err := wordDecoder.DecodeHeader(v)
150 | if err != nil {
151 | log.Warnw("Could not decode header", "name", kk, "index", i, "value", v, "err", err)
152 | continue
153 | }
154 | result.Header[kk][i] = dec
155 | }
156 | }
157 | delete(result.Header, "Message-Id")
158 | result.Header["Date"] = stripSpaceAndComments(result.Header["Date"])
159 | result.Header["S-From"] = extractOnlyAddrs(result.Header["From"])
160 | result.Header["S-To"] = append(append(
161 | extractOnlyAddrs(result.Header["To"]),
162 | extractOnlyAddrs(result.Header["Cc"])...),
163 | extractOnlyAddrs(result.Header["Bcc"])...)
164 | result.Header["From"] = splitAddrs(result.Header["From"])
165 | result.Header["To"] = splitAddrs(result.Header["To"])
166 | result.Header["Cc"] = splitAddrs(result.Header["Cc"])
167 | result.Header["Bcc"] = splitAddrs(result.Header["Bcc"])
168 | result.Header["Return-Path"] = splitAddrs(result.Header["Return-Path"])
169 | result.Header["Delivered-To"] = splitAddrs(result.Header["Delivered-To"])
170 | result.Header["X-Failed-Recipients"] = splitAddrs(result.Header["X-Failed-Recipients"])
171 | result.Header["Thread-Index"] = splitAddrs(result.Header["Thread-Index"])
172 | result.Header["In-Reply-To"] = splitAddrs(result.Header["In-Reply-To"])
173 | result.Header["References"] = splitAddrs(result.Header["References"])
174 | //// Parts
175 | if msg.SubMessage != nil {
176 | submsg := jsonifyMsg(*msg.SubMessage, log.With("submsg", true))
177 | result.SubMessage = &submsg
178 | }
179 | for partidx, part := range msg.Parts {
180 | if part != nil {
181 | partmsg := jsonifyMsg(*part, log.With("partidx", partidx))
182 | result.Parts = append(result.Parts, &partmsg)
183 | }
184 | }
185 | //// Body
186 | ctype := result.Header.Get("Content-Type")
187 | //// Body Transfer-Encoding
188 | if result.Header.Get("Content-Transfer-Encoding") == "quoted-printable" {
189 | decBody, err := ioutil.ReadAll(quotedprintable.NewReader(bytes.NewReader(msg.Body)))
190 | if err != nil {
191 | log.Warnw("Could not decode quoted-printable, treating like an attachment", "err", err)
192 | goto file
193 | }
194 | msg.Body = decBody
195 | } else if result.Header.Get("Content-Transfer-Encoding") == "base64" {
196 | decBody := make([]byte, base64.StdEncoding.DecodedLen(len(msg.Body)))
197 | n, err := base64.StdEncoding.Decode(decBody, msg.Body)
198 | if err != nil {
199 | log.Warnw("Could not decode base64, treating like an attachment", "nbytes", n, "err", err)
200 | goto file
201 | }
202 | msg.Body = decBody
203 | }
204 | //// Body Charset
205 | if strings.HasPrefix(ctype, "text") && !strings.Contains(result.Header.Get("Content-Disposition"), "attachment") {
206 | mediatype, params, err := mime.ParseMediaType(ctype)
207 | if err != nil {
208 | if strings.Contains(ctype, "html") {
209 | mediatype = "text/html"
210 | } else {
211 | mediatype = "text/plain"
212 | }
213 | params = make(map[string]string)
214 | log.Warnw("Unreadable Content-Type", "ctype", ctype, "err", err, "assumed", mediatype)
215 | }
216 | decoded, charset, err := decodeCharset(
217 | params["charset"],
218 | msg.Body,
219 | fmt.Sprintf("Content-Type: %s", ctype),
220 | strings.Contains(mediatype, "html"),
221 | log)
222 | if err != nil {
223 | log.Warnw("Could not decode charset, treating like an attachment", "charset", charset, "err", err)
224 | goto file
225 | }
226 | result.TextBody = string(decoded)
227 | return result
228 | }
229 | file:
230 | hash := blake2b.Sum256(msg.Body)
231 | path := filepath.Join(*attachdir, hex.EncodeToString(hash[:]))
232 | log = log.With("path", path)
233 | result.Attachment = path
234 | if _, err := os.Stat(path); !os.IsNotExist(err) {
235 | log.Debug("Attachment already exists")
236 | return result
237 | }
238 | f, err := safefile.Create(path, 0444)
239 | if err != nil {
240 | log.Errorw("Could not open file for attachment", "err", err)
241 | return result
242 | }
243 | defer f.Close()
244 | _, err = f.Write(msg.Body)
245 | if err != nil {
246 | log.Errorw("Could not write attachment", "err", err)
247 | return result
248 | }
249 | err = f.Commit()
250 | if err != nil {
251 | log.Errorw("Could not commit attachment", "err", err)
252 | return result
253 | }
254 | log.Info("Saved attachment")
255 | return result
256 | }
257 |
258 | func process(msgtext io.Reader, log *zap.SugaredLogger) (*JMessage, error) {
259 | msg, err := email.ParseMessage(msgtext)
260 | if err != nil {
261 | return nil, err
262 | }
263 | jmsg := jsonifyMsg(*msg, log)
264 | return &jmsg, nil
265 | }
266 |
267 | func main() {
268 | flag.Parse()
269 | logger, _ := zap.NewProduction()
270 | defer logger.Sync()
271 | log := logger.Sugar()
272 | if *srvAddr != "" {
273 | go func() {
274 | log.Infow("pprof/expvar server started", "result", http.ListenAndServe(*srvAddr, nil))
275 | }()
276 | }
277 | ctx := context.Background()
278 | client, err := elastic.NewClient(
279 | elastic.SetURL(*elasticUrl),
280 | )
281 | if err != nil {
282 | log.Fatalw("Could not create ElasticSearch client", "err", err)
283 | }
284 | if *doInit {
285 | res, err := client.CreateIndex(*elasticIndex).BodyString(indexSettings).Do(ctx)
286 | if err != nil {
287 | log.Fatalw("Could not initialize index", "err", err)
288 | } else {
289 | log.Infow("Created index", "result", res)
290 | }
291 | } else if len(flag.Args()) == 0 || flag.Arg(0) == "-" {
292 | jmsg, err := process(bufio.NewReader(os.Stdin), log.With("filename", "stdin"))
293 | if err != nil {
294 | log.Fatalw("Could not process", "err", err)
295 | }
296 | j, err := easyjson.Marshal(*jmsg)
297 | if err != nil {
298 | log.Fatalw("Could not serialize JSON", "err", err)
299 | }
300 | _, err = client.Index().Index(*elasticIndex).Type("msg").Id(jmsg.Id).BodyString(string(j)).Do(ctx)
301 | if err != nil {
302 | log.Fatalw("Could not index", "err", err)
303 | }
304 | } else {
305 | proc, err := client.BulkProcessor().Name("mail2elasticsearch").Do(ctx)
306 | if err != nil {
307 | log.Fatalw("Could not start bulk processor", "err", err)
308 | }
309 | defer proc.Close()
310 | var wg sync.WaitGroup
311 | tasks := make(chan string)
312 | for i := 0; i < runtime.GOMAXPROCS(0); i++ {
313 | go func() {
314 | for {
315 | var j []byte
316 | var jmsg *JMessage
317 | filename := <-tasks
318 | log := log.With("filename", filename)
319 | log.Debug("Processing start")
320 | file, err := os.Open(filename)
321 | if err != nil {
322 | log.Errorw("Could not open file", "err", err)
323 | goto done
324 | }
325 | jmsg, err = process(bufio.NewReader(file), log)
326 | if err != nil {
327 | log.Errorw("Could not process", "err", err)
328 | goto done
329 | }
330 | j, err = easyjson.Marshal(*jmsg)
331 | if err != nil {
332 | log.Errorw("Could not serialize JSON", "err", err)
333 | goto done
334 | }
335 | proc.Add(elastic.NewBulkIndexRequest().Index(*elasticIndex).Type("msg").Id(jmsg.Id).Doc(string(j)))
336 | log.Debug("Processing end")
337 | done:
338 | wg.Done()
339 | }
340 | }()
341 | }
342 | for _, filename := range flag.Args() {
343 | f, err := os.Stat(filename)
344 | if err != nil {
345 | log.Fatalw("Could not stat file", "err", err, "filename", filename)
346 | }
347 | if f.Mode().IsDir() {
348 | err = filepath.Walk(filename, func(path string, _ os.FileInfo, err error) error {
349 | if err != nil {
350 | return err
351 | }
352 | f, err := os.Stat(path)
353 | if err != nil {
354 | log.Fatalw("Could not stat file", "err", err, "filename", path)
355 | }
356 | if f.Mode().IsRegular() {
357 | wg.Add(1)
358 | tasks <- path
359 | } else {
360 | log.Infow("Not a file", "filename", path)
361 | }
362 | return nil
363 | })
364 | if err != nil {
365 | log.Fatalw("Could not walk directory", "err", err, "filename", filename)
366 | }
367 | } else {
368 | wg.Add(1)
369 | tasks <- filename
370 | }
371 | }
372 | wg.Wait()
373 | }
374 | }
375 |
--------------------------------------------------------------------------------
/main_easyjson.go:
--------------------------------------------------------------------------------
1 | // Code generated by easyjson for marshaling/unmarshaling. DO NOT EDIT.
2 |
3 | package main
4 |
5 | import (
6 | json "encoding/json"
7 | easyjson "github.com/mailru/easyjson"
8 | jlexer "github.com/mailru/easyjson/jlexer"
9 | jwriter "github.com/mailru/easyjson/jwriter"
10 | email "github.com/myfreeweb/go-email/email"
11 | )
12 |
// suppress unused package warning
// (blank identifiers reference each imported package so this generated file
// compiles even when a package is otherwise unused)
var (
	_ *json.RawMessage
	_ *jlexer.Lexer
	_ *jwriter.Writer
	_ easyjson.Marshaler
)
20 |
// easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch reads one JSON
// value from in and fills out with it. A JSON null leaves out untouched.
// Otherwise an object is expected: the keys "Id", "h", "pre", "epi", "p",
// "sub", "t" and "a" are stored into the matching JMessage fields, unknown
// keys are skipped, and a key whose value is null is skipped without changing
// the field. Nested messages ("p" items and "sub") are decoded recursively
// through UnmarshalEasyJSON.
func easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(in *jlexer.Lexer, out *JMessage) {
	isTopLevel := in.IsStart()
	if in.IsNull() {
		if isTopLevel {
			in.Consumed()
		}
		in.Skip()
		return
	}
	in.Delim('{')
	for !in.IsDelim('}') {
		key := in.UnsafeString()
		in.WantColon()
		// A null value is consumed here, before the per-key handling.
		if in.IsNull() {
			in.Skip()
			in.WantComma()
			continue
		}
		switch key {
		case "Id":
			out.Id = string(in.String())
		case "h":
			if in.IsNull() {
				in.Skip()
			} else {
				in.Delim('{')
				// An empty JSON object yields a nil Header map.
				if !in.IsDelim('}') {
					out.Header = make(email.Header)
				} else {
					out.Header = nil
				}
				for !in.IsDelim('}') {
					key := string(in.String())
					in.WantColon()
					var v1 []string
					if in.IsNull() {
						in.Skip()
						v1 = nil
					} else {
						in.Delim('[')
						if v1 == nil {
							if !in.IsDelim(']') {
								v1 = make([]string, 0, 4)
							} else {
								v1 = []string{}
							}
						} else {
							v1 = (v1)[:0]
						}
						for !in.IsDelim(']') {
							var v2 string
							v2 = string(in.String())
							v1 = append(v1, v2)
							in.WantComma()
						}
						in.Delim(']')
					}
					(out.Header)[key] = v1
					in.WantComma()
				}
				in.Delim('}')
			}
		case "pre":
			if in.IsNull() {
				in.Skip()
				out.Preamble = nil
			} else {
				out.Preamble = in.Bytes()
			}
		case "epi":
			if in.IsNull() {
				in.Skip()
				out.Epilogue = nil
			} else {
				out.Epilogue = in.Bytes()
			}
		case "p":
			if in.IsNull() {
				in.Skip()
				out.Parts = nil
			} else {
				in.Delim('[')
				// An existing Parts slice is truncated and refilled.
				if out.Parts == nil {
					if !in.IsDelim(']') {
						out.Parts = make([]*JMessage, 0, 8)
					} else {
						out.Parts = []*JMessage{}
					}
				} else {
					out.Parts = (out.Parts)[:0]
				}
				for !in.IsDelim(']') {
					var v5 *JMessage
					if in.IsNull() {
						in.Skip()
						v5 = nil
					} else {
						if v5 == nil {
							v5 = new(JMessage)
						}
						(*v5).UnmarshalEasyJSON(in)
					}
					out.Parts = append(out.Parts, v5)
					in.WantComma()
				}
				in.Delim(']')
			}
		case "sub":
			if in.IsNull() {
				in.Skip()
				out.SubMessage = nil
			} else {
				if out.SubMessage == nil {
					out.SubMessage = new(JMessage)
				}
				(*out.SubMessage).UnmarshalEasyJSON(in)
			}
		case "t":
			out.TextBody = string(in.String())
		case "a":
			out.Attachment = string(in.String())
		default:
			in.SkipRecursive()
		}
		in.WantComma()
	}
	in.Delim('}')
	if isTopLevel {
		in.Consumed()
	}
}
// easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch writes in to out
// as a JSON object. "Id" is always written; "h", "pre", "epi", "p", "sub",
// "t" and "a" are written only when the corresponding field is non-empty
// (non-nil for SubMessage). Preamble and Epilogue go through Base64Bytes, and
// nested messages are written recursively through MarshalEasyJSON, with nil
// entries in Parts written as null.
func easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(out *jwriter.Writer, in JMessage) {
	out.RawByte('{')
	// first tracks whether a comma is needed before the next key.
	first := true
	_ = first
	if !first {
		out.RawByte(',')
	}
	first = false
	out.RawString("\"Id\":")
	out.String(string(in.Id))
	if len(in.Header) != 0 {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"h\":")
		if in.Header == nil && (out.Flags&jwriter.NilMapAsEmpty) == 0 {
			out.RawString(`null`)
		} else {
			out.RawByte('{')
			v6First := true
			for v6Name, v6Value := range in.Header {
				if !v6First {
					out.RawByte(',')
				}
				v6First = false
				out.String(string(v6Name))
				out.RawByte(':')
				// A nil value slice is written as null unless the writer's
				// NilSliceAsEmpty flag is set.
				if v6Value == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
					out.RawString("null")
				} else {
					out.RawByte('[')
					for v7, v8 := range v6Value {
						if v7 > 0 {
							out.RawByte(',')
						}
						out.String(string(v8))
					}
					out.RawByte(']')
				}
			}
			out.RawByte('}')
		}
	}
	if len(in.Preamble) != 0 {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"pre\":")
		out.Base64Bytes(in.Preamble)
	}
	if len(in.Epilogue) != 0 {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"epi\":")
		out.Base64Bytes(in.Epilogue)
	}
	if len(in.Parts) != 0 {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"p\":")
		if in.Parts == nil && (out.Flags&jwriter.NilSliceAsEmpty) == 0 {
			out.RawString("null")
		} else {
			out.RawByte('[')
			for v13, v14 := range in.Parts {
				if v13 > 0 {
					out.RawByte(',')
				}
				if v14 == nil {
					out.RawString("null")
				} else {
					(*v14).MarshalEasyJSON(out)
				}
			}
			out.RawByte(']')
		}
	}
	if in.SubMessage != nil {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"sub\":")
		if in.SubMessage == nil {
			out.RawString("null")
		} else {
			(*in.SubMessage).MarshalEasyJSON(out)
		}
	}
	if in.TextBody != "" {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"t\":")
		out.String(string(in.TextBody))
	}
	if in.Attachment != "" {
		if !first {
			out.RawByte(',')
		}
		first = false
		out.RawString("\"a\":")
		out.String(string(in.Attachment))
	}
	out.RawByte('}')
}
265 |
// MarshalJSON supports json.Marshaler interface
// It encodes v into a fresh jwriter.Writer and returns the built bytes
// together with the writer's error.
func (v JMessage) MarshalJSON() ([]byte, error) {
	w := jwriter.Writer{}
	easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(&w, v)
	return w.Buffer.BuildBytes(), w.Error
}
272 |
// MarshalEasyJSON supports easyjson.Marshaler interface
// It encodes v into the caller-supplied writer w.
func (v JMessage) MarshalEasyJSON(w *jwriter.Writer) {
	easyjson89aae3efEncodeGithubComMyfreewebMail2elasticsearch(w, v)
}
277 |
// UnmarshalJSON supports json.Unmarshaler interface
// It decodes data into v through a new jlexer.Lexer and returns the lexer's
// error.
func (v *JMessage) UnmarshalJSON(data []byte) error {
	r := jlexer.Lexer{Data: data}
	easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(&r, v)
	return r.Error()
}
284 |
// UnmarshalEasyJSON supports easyjson.Unmarshaler interface
// It decodes into v from the caller-supplied lexer l; errors stay on l.
func (v *JMessage) UnmarshalEasyJSON(l *jlexer.Lexer) {
	easyjson89aae3efDecodeGithubComMyfreewebMail2elasticsearch(l, v)
}
289 |
--------------------------------------------------------------------------------