├── LICENSE
├── README.md
└── mailtojson.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Newsman App - www.newsmanapp.com
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MailToJson
 2 | 
 3 | Quick script to parse incoming mail and do a post with the content as JSON data.
 4 | 
 5 | ## How to use
 6 | 
 7 | The workflow is quite simple: the script reads the raw MIME mail message from STDIN, parses
 8 | it and makes a POST call with the result as raw JSON to the URL passed as a command-line argument.
 9 | 
10 | Example usage (command line):
11 | ```bash
12 | cat mail.eml | python mailtojson.py -u https://dev.url/autoreply/handle.php
13 | ```
14 | 
15 | Example usage (postfix aliases):
16 | ```bash
17 | mailtojson_autoreply: "|/nethosting/mailtojson/mailtojson.py -u https://dev.url/autoreply/handle.php"
18 | ```
19 | 
20 | ## JSON Format
21 | 
22 | ```yaml
23 | json:
24 |   headers:
25 |     header_key1: value
26 |     header_key2: value
27 |   subject: "The email subject as utf-8 string"
28 |   datetime: "2015-03-17 17:48:06"
29 |   encoding: "utf-8"
30 |   from:
31 |     - { name: "Sender Name", email: "sender@email.com" }
32 |   to:
33 |     - { name: "Recipient Name", email: "recipient@email.com" }
34 |     - { name: "Recipient Name 2", email: "recipient2@email.com" }
35 |   cc:
36 |     - { name: "Recipient Name", email: "recipient@email.com" }
37 |     - { name: "Recipient Name 2", email: "recipient2@email.com" }
38 |   parts:
39 |     - { content_type: "text/plain", content: "body of this part", "headers": { "header_key1": value, "header_key2": value } }
40 |     - { content_type: "text/html", content: "body of this part", "headers": { "header_key1": value, "header_key2": value } }
41 |   attachments:
42 |     - { filename: "invoice.pdf", content_type: "application/pdf", content: "base64 of binary data" }
43 |     - { filename: "invoice2.pdf", content_type: "application/pdf", content: "base64 of binary data" }
44 | ```
45 | 
46 | ### Handling JSON data in PHP
47 | 
48 | Here is a quick example of how to parse the JSON data posted by the mail to json script:
49 | 
50 | ```php
51 | <?php
52 | 
53 | $json_str  = file_get_contents("php://input");
54 | $json_data = json_decode($json_str, true);
55 | 
56 | var_dump($json_data);
57 | ?>
58 | ```
59 | 
60 | # License
61 | 
62 | This code is released under [MIT license](https://github.com/Newsman/MailToJson/blob/master/LICENSE) by [Newsman App - Smart Email Service Provider](https://www.newsman.app).
63 | 


--------------------------------------------------------------------------------
/mailtojson.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | ## Open Sourced by - Newsman App www.newsmanapp.com
  4 | ## (c) 2013 Newsman App
  5 | ## https://github.com/Newsman/MailToJson
  6 | 
  7 | import sys, urllib2, email, re, csv, StringIO, base64, json, datetime, pprint
  8 | from optparse import OptionParser
  9 | 
 10 | VERSION = "1.3.1"
 11 | 
 12 | ERROR_NOUSER = 67
 13 | ERROR_PERM_DENIED = 77
 14 | ERROR_TEMP_FAIL = 75
 15 | 
 16 | # regular expresion from https://github.com/django/django/blob/master/django/core/validators.py
 17 | email_re = re.compile(
 18 |     r"(^[-!#$%&'*+/=?^_`{}|~0-9A-Z]+(\.[-!#$%&'*+/=?^_`{}|~0-9A-Z]+)*"  # dot-atom
 19 |     # quoted-string, see also http://tools.ietf.org/html/rfc2822#section-3.2.5
 20 |     r'|^"([\001-\010\013\014\016-\037!#-\[\]-\177]|\\[\001-\011\013\014\016-\177])*"'
 21 |     r')@((?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)$)'  # domain
 22 |     r'|\[(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}\]$', re.IGNORECASE)
 23 | 
 24 | email_extract_re = re.compile("<(([.0-9a-z_+-=]+)@(([0-9a-z-]+\.)+[0-9a-z]{2,9}))>", re.M|re.S|re.I)
 25 | filename_re = re.compile("filename=\"(.+)\"|filename=([^;\n\r\"\']+)", re.I|re.S)
 26 | contenttype_filename_re = re.compile("name=\"(.+)\"|name=([^;\n\r\"\']+)", re.I|re.S)
 27 | 
 28 | begin_tab_re = re.compile("^\t{1,}", re.M)
 29 | begin_space_re = re.compile("^\s{1,}", re.M)
 30 | 
 31 | class MailJson:
 32 |     def __init__(self, content = None):
 33 |         self.data = {}
 34 |         self.raw_parts = []
 35 |         self.encoding = "utf-8" # output encoding
 36 |         self.setContent(content)
 37 | 
 38 |     def setEncoding(self, encoding):
 39 |         self.encoding = encoding
 40 | 
 41 |     def setContent(self, content):
 42 |         self.content = content
 43 | 
 44 |     def _fixEncodedSubject(self, subject):
 45 |         if subject is None:
 46 |             return ""
 47 | 
 48 |         subject = "%s" % subject
 49 |         subject = subject.strip()
 50 | 
 51 |         if len(subject) < 2:
 52 |             # empty string or not encoded string ?
 53 |             return subject
 54 |         if subject.find("\n") == -1:
 55 |             # is on single line
 56 |             return subject
 57 |         if subject[0:2] != "=?":
 58 |             # not encoded
 59 |             return subject
 60 | 
 61 |         subject = subject.replace("\r", "")
 62 |         subject = begin_tab_re.sub("", subject)
 63 |         subject = begin_space_re.sub("", subject)
 64 |         lines = subject.split("\n")
 65 | 
 66 |         new_subject = ""
 67 |         for l in lines:
 68 |             new_subject = "%s%s" % (new_subject, l)
 69 |             if l[-1] == "=":
 70 |                 new_subject = "%s\n " % new_subject
 71 | 
 72 |         return new_subject
 73 | 
 74 |     def _extract_email(self, s):
 75 |         ret = email_extract_re.findall(s)
 76 |         if len(ret) < 1:
 77 |             p = s.split(" ")
 78 |             for e in p:
 79 |                 e = e.strip()
 80 |                 if email_re.match(e):
 81 |                     return e
 82 | 
 83 |             return None
 84 |         else:
 85 |             return ret[0][0]
 86 | 
 87 |     def _decode_headers(self, v):
 88 |         if type(v) is not list:
 89 |             v = [ v ]
 90 | 
 91 |         ret = []
 92 |         for h in v:
 93 |             h = email.Header.decode_header(h)
 94 |             h_ret = []
 95 |             for h_decoded in h:
 96 |                 hv = h_decoded[0]
 97 |                 h_encoding = h_decoded[1]
 98 |                 if h_encoding is None:
 99 |                     h_encoding = "ascii"
100 |                 else:
101 |                     h_encoding = h_encoding.lower()
102 | 
103 |                 hv = unicode(hv, h_encoding).strip().strip("\t")
104 | 
105 | 
106 |                 h_ret.append(hv.encode(self.encoding))
107 | 
108 |             ret.append(" ".join(h_ret))
109 | 
110 |         return ret
111 | 
112 |     def _parse_recipients(self, v):
113 |         if v is None:
114 |             return None
115 | 
116 |         ret = []
117 | 
118 |         # Sometimes a list is passed, which breaks .replace()
119 |         if isinstance(v, list):
120 |             v = ",".join(v)
121 |         v = v.replace("\n", " ").replace("\r", " ").strip()
122 |         s = StringIO.StringIO(v)
123 |         c = csv.reader(s)
124 |         try:
125 |             row = c.next()
126 |         except StopIteration:
127 |             return ret
128 | 
129 |         for entry in row:
130 |             entry = entry.strip()
131 |             if email_re.match(entry):
132 |                 e = entry
133 |                 entry = ""
134 |             else:
135 |                 e = self._extract_email(entry)
136 |                 entry = entry.replace("<%s>" % e, "")
137 |                 entry = entry.strip()
138 |                 if e and entry.find(e) != -1:
139 |                     entry = entry.replace(e, "").strip()
140 | 
141 |             # If all else has failed
142 |             if entry and e is None:
143 |                 e_split = entry.split(" ")
144 |                 e = e_split[-1].replace("<", "").replace(">","")
145 |                 entry = " ".join(e_split[:-1])
146 | 
147 |             ret.append({"name": entry, "email": e})
148 | 
149 |         return ret
150 | 
151 |     def _parse_date(self, v):
152 |         if v is None:
153 |             return datetime.datetime.now()
154 | 
155 |         tt = email.utils.parsedate_tz(v)
156 | 
157 |         if tt is None:
158 |             return datetime.datetime.now()
159 | 
160 |         timestamp = email.utils.mktime_tz(tt)
161 |         date = datetime.datetime.fromtimestamp(timestamp)
162 |         return date
163 | 
164 |     def _get_content_charset(self, part, failobj = None):
165 |         """Return the charset parameter of the Content-Type header.
166 | 
167 |         The returned string is always coerced to lower case.  If there is no
168 |         Content-Type header, or if that header has no charset parameter,
169 |         failobj is returned.
170 |         """
171 |         missing = object()
172 |         charset = part.get_param("charset", missing)
173 |         if charset is missing:
174 |             return failobj
175 |         if isinstance(charset, tuple):
176 |             # RFC 2231 encoded, so decode it, and it better end up as ascii.
177 |             pcharset = charset[0] or "us-ascii"
178 |             try:
179 |                 # LookupError will be raised if the charset isn't known to
180 |                 # Python.  UnicodeError will be raised if the encoded text
181 |                 # contains a character not in the charset.
182 |                 charset = unicode(charset[2], pcharset).encode("us-ascii")
183 |             except (LookupError, UnicodeError):
184 |                 charset = charset[2]
185 |         # charset character must be in us-ascii range
186 |         try:
187 |             if isinstance(charset, unicode):
188 |                 charset = charset.encode("us-ascii")
189 |             charset = unicode(charset, "us-ascii").encode("us-ascii")
190 |         except UnicodeError:
191 |             return failobj
192 |         # RFC 2046, $4.1.2 says charsets are not case sensitive
193 |         return charset.lower()
194 | 
195 |     def _get_part_headers(self, part):
196 |         # raw headers
197 |         headers = {}
198 |         for k in part.keys():
199 |             k = k.lower()
200 |             v = part.get_all(k)
201 |             v = self._decode_headers(v)
202 | 
203 |             if len(v) == 1:
204 |                 headers[k] = v[0]
205 |             else:
206 |                 headers[k] = v
207 | 
208 |         return headers
209 | 
210 |     def parse(self):
211 |         self.msg = email.message_from_string(self.content)
212 | 
213 |         headers = self._get_part_headers(self.msg)
214 |         self.data["headers"] = headers
215 |         self.data["datetime"] = self._parse_date(headers.get("date", None)).strftime("%Y-%m-%d %H:%M:%S")
216 |         self.data["subject"] = self._fixEncodedSubject(headers.get("subject", None))
217 |         self.data["to"] = self._parse_recipients(headers.get("to", None))
218 |         self.data["reply-to"] = self._parse_recipients(headers.get("reply-to", None))
219 |         self.data["from"] = self._parse_recipients(headers.get("from", None))
220 |         self.data["cc"] = self._parse_recipients(headers.get("cc", None))
221 | 
222 |         attachments = []
223 |         parts = []
224 |         for part in self.msg.walk():
225 |             if part.is_multipart():
226 |                 continue
227 | 
228 |             content_disposition = part.get("Content-Disposition", None)
229 |             if content_disposition:
230 |                 # we have attachment
231 |                 filename = "undefined"
232 |                 
233 |                 r = filename_re.findall(content_disposition)
234 |                 if r:
235 |                     filename = sorted(r[0])[1]
236 |                 else:
237 |                     content_type = part.get("Content-Type", None)
238 |                     if content_type:
239 |                         r = contenttype_filename_re.findall(content_type)
240 |                         if r:
241 |                             filename = sorted(r[0])[1]
242 | 
243 |                 a = { "filename": filename, "content": base64.b64encode(part.get_payload(decode = True)), "content_type": part.get_content_type() }
244 |                 attachments.append(a)
245 |             else:
246 |                 try:
247 |                     p = { "content_type": part.get_content_type(), "content": unicode(part.get_payload(decode = 1), self._get_content_charset(part, "utf-8"), "ignore").encode(self.encoding), "headers": self._get_part_headers(part) }
248 |                     parts.append(p)
249 |                     self.raw_parts.append(part)
250 |                 except LookupError:
251 |                     # Sometimes an encoding isn't recognised - not much to be done
252 |                     pass
253 | 
254 |         self.data["attachments"] = attachments
255 |         self.data["parts"] = parts
256 |         self.data["encoding"] = self.encoding
257 | 
258 |         return self.get_data()
259 | 
260 |     def get_data(self):
261 |         return self.data
262 | 
263 |     def get_raw_parts(self):
264 |         return self.raw_parts
265 | 
if __name__ == "__main__":
    # Command line entry point: read one raw mail message from STDIN, parse
    # it and either print the resulting JSON (-p) or POST it to a URL (-u).
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-u", "--url", dest = "url", action = "store", help = "the url where to post the mail data as json")
    parser.add_option("-p", "--print", dest = "do_print", action = "store_true", help = "no json posting, just print the data")
    parser.add_option("-d", "--dump", dest = "do_dump", action = "store_true", help = "if present print to output the url post response")

    opt, args = parser.parse_args()

    # Without a target url and without --print there is nothing to do.
    if not (opt.url or opt.do_print):
        print(parser.format_help())
        sys.exit(1)

    content = sys.stdin.read()

    try:
        mj = MailJson(content)
        mj.parse()
        data = mj.get_data()

        out_encoding = data.get("encoding")
        payload = json.dumps(data, encoding = out_encoding)

        if opt.do_print:
            print(payload)
        else:
            # Line breaks are removed from the url before it is used.
            target = opt.url.replace("\n", "").replace("\r", "")
            http_headers = {
                "Content-Type": "application/json; charset=%s" % out_encoding,
                "User-Agent": "NewsmanApp/MailToJson %s - https://github.com/Newsman/MailToJson" % VERSION,
            }
            response_body = urllib2.urlopen(urllib2.Request(target, payload, http_headers)).read()

            print("Parsed Mail Data sent to: %s\n" % opt.url)
            if opt.do_dump:
                print(response_body)
    except Exception as inst:
        # Any failure is reported and signalled with the ERROR_TEMP_FAIL
        # exit status.
        print("ERR: %s" % inst)
        sys.exit(ERROR_TEMP_FAIL)
300 | 


--------------------------------------------------------------------------------