├── .gitignore
├── README.md
└── mhtifier.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | 
29 | # Translations
30 | *.mo
31 | 
32 | # Mr Developer
33 | .mr.developer.cfg
34 | .project
35 | .pydevproject
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MHTifier
 2 | Un/packs an MHT (MHTML) archive into/from separate files, writing/reading them in directories to match their Content-Location.
 3 | 
 4 | Whole [story](http://decodecode.net/elitist/2013/01/mhtifier/) is in my devlog.
 5 | 
 6 | # Issues
 7 | 1. Cleanest would've been to use stdin/out, but turned out inconvenient, annoying even, so added command line options.
 8 | 2. Python's stdlib module's performance (premature optimization?):
 9 | 	`email.message_from_bytes(mht.read()) # Parser is "conducive to incremental parsing of email messages, such as would be necessary when reading the text of an email message from a source that can block", so I guess it's more efficient to have it read stdin directly, rather than buffering.`
10 | 3. Encodings (ascii, UTF-8) and de/coding was painful, and probably still buggy.
11 | 4. base64 encoded binaries: my editor, Geany, suffocates, I think, when wrapping these long lines?
12 | 1. Verify index.html is present!?
 13 | 1. A few un/Pythonisms, idioms, I guess.
14 | 


--------------------------------------------------------------------------------
/mhtifier.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # Encoding: UTF-8
  3 | """mhtifier.py
  4 | Un/packs an MHT "archive" into/from separate files, writing/reading them in directories to match their Content-Location.
  5 | 
  6 | Uses part's Content-Location to name paths, or index.html for the root HTML.
  7 | Content types will be assigned according to registry of MIME types mapping to file name extensions.
  8 | 
  9 | History:
 10 | * 2013-01-11: renamed mhtifier.
 11 | * 2013-01-10: created mht2fs.py, and... done.
 12 | """
 13 | 
 14 | # Standard library modules do the heavy lifting. Ours is all simple stuff.
 15 | import base64
 16 | import email, email.message
 17 | import mimetypes
 18 | import os
 19 | import quopri
 20 | import sys
 21 | import argparse
 22 | 
 23 | # Just do it.
 24 | def main():
 25 | 	"""Convert MHT file given as command line argument (or stdin?) to files and directories in the current directory.
 26 | 
 27 | 	Usage:
 28 | 		cd foo-unpacked/
 29 | 		mht2fs.py ../foo.mht
 30 | 	"""
 31 | 	parser = argparse.ArgumentParser(description="Extract MHT archive into new directory.")
 32 | 	parser.add_argument("mht", metavar="MHT", help='path to MHT file, use "-" for stdin/stdout.')
 33 | 	parser.add_argument("d", metavar="DIR", help="directory to create to store parts in, or read them from.") #??? How to make optional, default to current dir?
 34 | 	parser.add_argument("-p", "--pack", action="store_true", help="pack file under DIR into an MHT.")
 35 | 	parser.add_argument("-u", "--unpack", action="store_true", help="unpack MHT into a new DIR.")
 36 | 	parser.add_argument("-v", "--verbose", action="store_true")
 37 | 	parser.add_argument("-q", "--quiet", action="store_true")
 38 | 	args = parser.parse_args() # --help is built-in.
 39 | 
 40 | 	# Validate command line.
 41 | 	if args.pack == args.unpack:
 42 | 		sys.stderr.write("Invalid: must specify one action, either --pack or --unpack.\n")
 43 | 		sys.exit(-1)
 44 | 
 45 | 	# File name or stdin/stdout?
 46 | 	if args.mht == "-":
 47 | 		mht = sys.stdout if args.pack else sys.stdin.buffer
 48 | 	else:
 49 | 		if args.pack and os.path.exists(args.mht):
 50 | 			# Refuse to overwrite MHT file.
 51 | 			sys.stderr.write("Error: MHT file exists, won't overwrite.\n")
 52 | 			sys.exit(-2)
 53 | 		mht = open(args.mht, "wb" if args.pack else "rb")
 54 | 
 55 | 	# New directory?
 56 | 	if args.unpack:
 57 | 		os.mkdir(args.d)
 58 | 
 59 | 	# Change directory so paths (content-location) are relative to index.html.
 60 | 	os.chdir(args.d)
 61 | 
 62 | 	# Un/pack?
 63 | 	if args.unpack:
 64 | 		if not args.quiet:
 65 | 			sys.stderr.write("Unpacking...\n")
 66 | 
 67 | 		# Read entire MHT archive -- it's a multipart(/related) message.
 68 | 		a = email.message_from_bytes(mht.read()) # Parser is "conducive to incremental parsing of email messages, such as would be necessary when reading the text of an email message from a source that can block", so I guess it's more efficient to have it read stdin directly, rather than buffering.				
 69 | 
 70 | 		parts = a.get_payload() # Multiple parts, usually?
 71 | 		if not type(parts) is list:
 72 | 			parts = [a] # Single 'str' part, so convert to list.
 73 |                 		                                                    
 74 | 		# Save all parts to files.
 75 | 		for p in parts: # walk() for a tree, but I'm guessing MHT is never nested?
 76 | 			#??? cs = p.get_charset() # Expecting "utf-8" for root HTML, None for all other parts.						
 77 | 			ct = p.get_content_type() # String coerced to lower case of the form maintype/subtype, else get_default_type().			
 78 | 			fp = p.get("content-location") or "index.html" # File path. Expecting root HTML is only part with no location.
 79 | 
 80 | 			if args.verbose:
 81 | 				sys.stderr.write("Writing %s to %s, %d bytes...\n" % (ct, fp, len(p.get_payload())))
 82 | 
 83 | 			# Create directories as necessary.
 84 | 			if os.path.dirname(fp):
 85 | 				os.makedirs(os.path.dirname(fp), exist_ok=True)
 86 | 
 87 | 			# Save part's body to a file.
 88 | 			open(fp, "wb").write(p.get_payload(decode=True))
 89 | 
 90 | 		if not args.quiet:
 91 | 			sys.stderr.write("Done.\nUnpacked %d files.\n" % (len(parts)))
 92 | 
 93 | 	else:
 94 | 		if not args.quiet:
 95 | 			sys.stderr.write("Packing...\n")
 96 | 
 97 | 		# Create archive as multipart message.
 98 | 		a = email.message.Message()
 99 | 		a["MIME-Version"] = "1.0"
100 | 		a.add_header("Content-Type", "multipart/related", type="text/html")
101 | 
102 | 		# Walk current directory.
103 | 		for (root, _, files) in os.walk("."):
104 | 			# Create message part from each file and attach them to archive.
105 | 			for f in files:
106 | 				p = os.path.join(root, f).lstrip("./")
107 | 				m = email.message.Message()
108 | 				# Encode and set type of part.
109 | 				t = mimetypes.guess_type(f)[0]
110 | 				if t:
111 | 					m["Content-Type"] = t
112 | 
113 | 				if args.verbose:
114 | 					sys.stderr.write("Reading %s as %s...\n" % (p, t))
115 | 
116 | 				if t and t.startswith("text/"):
117 | 					m["Content-Transfer-Encoding"] = "quoted-printable"
118 | 					m.set_payload(quopri.encodestring(open(p, "rt").read().encode("utf-8")).decode("ascii")) #??? WTF?
119 | 				else:
120 | 					m["Content-Transfer-Encoding"] = "base64"
121 | 					m.set_payload(base64.b64encode(open(p, "rb").read()).decode("ascii"))
122 | 					#??? My editor, Geany, suffocates, I think, when needs to wrap these long lines?
123 | 
124 | 				# Only set charset for index.html to UTF-8, and no location.
125 | 				if f == "index.html":
126 | 					m.add_header("Content-Type", "text/html", charset="utf-8")
127 | 					#??? m.set_charset("utf-8")
128 | 				else:
129 | 					m["Content-Location"] = p
130 | 				a.attach(m)
131 | 
132 | 		# Write MHT file.
133 | 		#??? verify index.html is present!?
134 | 		mht.write(bytes(a.as_string(unixfrom=False), "utf-8")) # Not an mbox file, so we don't need to mangle "From " lines, I guess?
135 | 
136 | 		if not args.quiet:
137 | 			sys.stderr.write("Done.\nPacked %d files.\n" % (len(a.get_payload())))
138 | 
if __name__ == "__main__":
	main() # Run the command-line tool only when executed as a script, not when imported as a module.
141 | 


--------------------------------------------------------------------------------