├── .gitignore ├── README.md └── mhtifier.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MHTifier 2 | Un/packs an MHT (MHTML) archive into/from separate files, writing/reading them in directories to match their Content-Location. 3 | 4 | Whole [story](http://decodecode.net/elitist/2013/01/mhtifier/) is in my devlog. 5 | 6 | # Issues 7 | 1. Cleanest would've been to use stdin/out, but turned out inconvenient, annoying even, so added command line options. 8 | 2. Python's stdlib module's performance (premature optimization?): 9 | `email.message_from_bytes(mht.read()) # Parser is "conducive to incremental parsing of email messages, such as would be necessary when reading the text of an email message from a source that can block", so I guess it's more efficient to have it read stdin directly, rather than buffering.` 10 | 3. Encodings (ascii, UTF-8) and de/coding was painful, and probably still buggy. 11 | 4. base64 encoded binaries: my editor, Geany, suffocates, I think, when wrapping these long lines? 12 | 1. Verify index.html is present!? 13 | 1. A few un/Pythonisms, idioms, I guess.
#!/usr/bin/env python3
# Encoding: UTF-8
"""mhtifier.py
Un/packs an MHT "archive" into/from separate files, writing/reading them in directories to match their Content-Location.

Uses part's Content-Location to name paths, or index.html for the root HTML.
Content types will be assigned according to registry of MIME types mapping to file name extensions.

History:
* 2013-01-11: renamed mhtifier.
* 2013-01-10: created mht2fs.py, and... done.
"""

# Standard library modules do the heavy lifting. Ours is all simple stuff.
import argparse
import base64
import email
import email.message
import mimetypes
import os
import quopri
import sys


def _parse_args():
    """Parse the command line (``sys.argv``) and return the argparse namespace."""
    parser = argparse.ArgumentParser(description="Extract MHT archive into new directory.")
    parser.add_argument("mht", metavar="MHT", help='path to MHT file, use "-" for stdin/stdout.')
    parser.add_argument("d", metavar="DIR", help="directory to create to store parts in, or read them from.")
    parser.add_argument("-p", "--pack", action="store_true", help="pack file under DIR into an MHT.")
    parser.add_argument("-u", "--unpack", action="store_true", help="unpack MHT into a new DIR.")
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("-q", "--quiet", action="store_true")
    return parser.parse_args()  # --help is built-in.


def _is_inside_cwd(path):
    """Return True if *path* names something strictly below the current directory."""
    base = os.path.abspath(os.curdir)
    target = os.path.abspath(path)  # Also collapses any ".." components.
    try:
        return target != base and os.path.commonpath([base, target]) == base
    except ValueError:
        # Raised on Windows when the two paths live on different drives.
        return False


def _unpack(mht, verbose, quiet):
    """Read an MHT archive from binary stream *mht* and write its parts below the current directory."""
    if not quiet:
        sys.stderr.write("Unpacking...\n")

    # Read entire MHT archive -- it's a multipart(/related) message.
    archive = email.message_from_bytes(mht.read())

    # walk() yields the containers too; only leaves carry a body. For the usual flat archive this
    # is exactly the list of top-level parts, and a single-part message yields just itself.
    leaves = [part for part in archive.walk() if not part.is_multipart()]

    written = 0
    for part in leaves:
        content_type = part.get_content_type()
        # Expecting root HTML is the only part with no location.
        path = part.get("content-location") or "index.html"

        # Content-Location comes from the archive, i.e. it is untrusted: never write outside DIR.
        if not _is_inside_cwd(path):
            sys.stderr.write("Skipping %s: location is outside the target directory.\n" % path)
            continue

        body = part.get_payload(decode=True) or b""

        if verbose:
            sys.stderr.write("Writing %s to %s, %d bytes...\n" % (content_type, path, len(body)))

        # Create directories as necessary.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Save part's body to a file.
        with open(path, "wb") as out:
            out.write(body)
        written += 1

    if not quiet:
        sys.stderr.write("Done.\nUnpacked %d files.\n" % written)


def _pack(mht, verbose, quiet):
    """Pack every file below the current directory into an MHT archive written to binary stream *mht*."""
    if not quiet:
        sys.stderr.write("Packing...\n")

    # Create archive as multipart message.
    archive = email.message.Message()
    archive["MIME-Version"] = "1.0"
    archive.add_header("Content-Type", "multipart/related", type="text/html")

    packed = 0
    for root, _, files in os.walk("."):
        for name in files:
            # normpath drops the leading "./" without eating the dots of hidden files.
            path = os.path.normpath(os.path.join(root, name))
            location = path.replace(os.sep, "/")  # Locations always use forward slashes.
            # Only the top-level index.html is the root document; same-named files in
            # subdirectories are ordinary parts and keep their location.
            is_root = location == "index.html"
            mime_type = mimetypes.guess_type(name)[0]

            part = email.message.Message()
            if is_root:
                # Single Content-Type header, with charset, for the root HTML.
                part.add_header("Content-Type", "text/html", charset="utf-8")
            elif mime_type:
                part["Content-Type"] = mime_type

            if verbose:
                sys.stderr.write("Reading %s as %s...\n" % (location, mime_type))

            if mime_type and mime_type.startswith("text/"):
                part["Content-Transfer-Encoding"] = "quoted-printable"
                # NOTE: text is decoded with the platform's default encoding, then stored as UTF-8.
                with open(path, "rt") as source:
                    text = source.read()
                part.set_payload(quopri.encodestring(text.encode("utf-8")).decode("ascii"))
            else:
                part["Content-Transfer-Encoding"] = "base64"
                with open(path, "rb") as source:
                    data = source.read()
                # encodebytes() wraps at 76 characters, as MIME requires for base64 bodies.
                part.set_payload(base64.encodebytes(data).decode("ascii"))

            if not is_root:
                part["Content-Location"] = location
            archive.attach(part)
            packed += 1

    # Write MHT file. Not an mbox file, so there is no need to mangle "From " lines.
    mht.write(archive.as_string(unixfrom=False).encode("utf-8"))
    mht.flush()

    if not quiet:
        sys.stderr.write("Done.\nPacked %d files.\n" % packed)


# Just do it.
def main():
    """Pack a directory into an MHT archive, or unpack an MHT archive into a new directory.

    Reads its arguments from the command line; exactly one of --pack / --unpack must be given.
    Exits with status -1 on an invalid action and -2 when packing would overwrite an existing
    MHT file. Changes the process's current directory to DIR and leaves it there.

    Usage:
        mhtifier.py --unpack foo.mht foo-unpacked/
        mhtifier.py --pack foo.mht foo-unpacked/
    """
    args = _parse_args()

    # Validate command line.
    if args.pack == args.unpack:
        sys.stderr.write("Invalid: must specify one action, either --pack or --unpack.\n")
        sys.exit(-1)

    # File name or stdin/stdout? Both directions move bytes, so use the binary layer of the
    # standard streams.
    use_std_stream = args.mht == "-"
    if use_std_stream:
        mht = sys.stdout.buffer if args.pack else sys.stdin.buffer
    else:
        if args.pack and os.path.exists(args.mht):
            # Refuse to overwrite MHT file.
            sys.stderr.write("Error: MHT file exists, won't overwrite.\n")
            sys.exit(-2)
        mht = open(args.mht, "wb" if args.pack else "rb")

    try:
        # New directory?
        if args.unpack:
            os.mkdir(args.d)

        # Change directory so paths (content-location) are relative to index.html.
        os.chdir(args.d)

        if args.unpack:
            _unpack(mht, args.verbose, args.quiet)
        else:
            _pack(mht, args.verbose, args.quiet)
    finally:
        # Close only what we opened; the standard streams belong to the interpreter.
        if not use_std_stream:
            mht.close()


if __name__ == "__main__":
    main()