# Repository layout:
#
# ├── deployer
#     ├── __init__.py
#     ├── exceptions.py
#     ├── constants.py
#     ├── utils.py
#     ├── main.py
#     └── upload.py
# ├── .gitignore
# ├── setup.cfg
# ├── pypi-release.sh
# ├── .therapist.yml
# ├── .env-dist
# ├── setup.py
# ├── README.md
# └── LICENSE
#
# --------------------------------------------------------------------------------
# /deployer/__init__.py:
# --------------------------------------------------------------------------------
# (empty file)
#
# --------------------------------------------------------------------------------
# /.gitignore:
# --------------------------------------------------------------------------------
#   stumptown_deployer.egg-info
#   build/
#   dist/
#
# --------------------------------------------------------------------------------
# /setup.cfg:
# --------------------------------------------------------------------------------
#   [bdist_wheel]
#   universal = 1
#
#   [flake8]
#   max-line-length = 88
#
# --------------------------------------------------------------------------------
# /pypi-release.sh:
# --------------------------------------------------------------------------------
#   #!/usr/bin/env bash
#   set -eo pipefail
#
#   pip install twine
#   # From https://pypi.org/project/twine/
#   rm -fr dist/
#   python setup.py sdist bdist_wheel
#   twine upload dist/*
#
# --------------------------------------------------------------------------------
# /.therapist.yml:
# --------------------------------------------------------------------------------
#   actions:
#     black:
#       run: black --check --diff {files}
#       fix: black {files}
#       include: "*.py"
#
#     flake8:
#       run: flake8 {files}
#       include: "*.py"
#
# --------------------------------------------------------------------------------
# /deployer/exceptions.py:
# --------------------------------------------------------------------------------
class CoreException(Exception):
    """Exists for the benefit of making the cli easier to catch exceptions."""


class NoGitDirectory(CoreException):
    """When trying to find a/the git directory and failing."""


class CantDryRunError(CoreException):
    """When a dry-run is requested but a prerequisite would have to be created.

    deployer/upload.py imports this name and raises it when the target bucket
    does not exist during a dry-run; without this class the import fails.
    """
# --------------------------------------------------------------------------------
# /.env-dist:
# --------------------------------------------------------------------------------
#   # If you're hacking on this locally and want to test the whole thing
#   # outside of Mozilla's infra, this setting you might want to change to
#   # something unique to you.
#   #DEPLOYER_DEFAULT_BUCKET=yari
#
#   #AWS_PROFILE=default
#
#   #S3_DEFAULT_BUCKET_LOCATION=us-east-2
#
# --------------------------------------------------------------------------------
# /deployer/constants.py:
# --------------------------------------------------------------------------------
import json
import sys
import os

from decouple import AutoConfig

config = AutoConfig(os.curdir)

DEFAULT_BUCKET = config("DEPLOYER_DEFAULT_BUCKET", "yari")

DEFAULT_NAME_PATTERN = config(
    "DEPLOYER_DEFAULT_NAME_PATTERN", "{username}-{branchname}"
)

AWS_PROFILE = config("AWS_PROFILE", default="default")

# E.g. us-east-1
S3_DEFAULT_BUCKET_LOCATION = config("S3_DEFAULT_BUCKET_LOCATION", default="")

# When uploading a bunch of files, the work is done in a thread pool.
# If you use too many "workers" it might saturate your network meaning it's
# slower.
MAX_WORKERS_PARALLEL_UPLOADS = config(
    "DEPLOYER_MAX_WORKERS_PARALLEL_UPLOADS", default=50, cast=int
)

# Cache lifetime (seconds) for regular files.
# E.g. /en-US/docs/Foo/Bar/index.html
DEFAULT_CACHE_CONTROL = config(
    "DEPLOYER_DEFAULT_CACHE_CONTROL", default=60 * 60, cast=int
)
# Cache lifetime (seconds) for files whose name contains a content hash.
# E.g. '2.02b14290.chunk.css'
HASHED_CACHE_CONTROL = config(
    "DEPLOYER_HASHED_CACHE_CONTROL", default=60 * 60 * 24 * 365, cast=int
)


def _running_in_ci():
    """Return True if the ``CI`` environment variable looks truthy.

    The value is first interpreted as JSON (so ``true``, ``1``, ``0`` and
    ``false`` behave as before). Values that are not valid JSON, such as
    ``True``, ``yes`` or an empty string, used to crash the import with a
    ``JSONDecodeError``; they are now treated as truthy unless they are one of
    the usual "off" spellings.
    """
    raw = os.environ.get("CI", "0")
    try:
        return bool(json.loads(raw))
    except ValueError:
        return raw.strip().lower() not in ("", "0", "false", "no", "off")


# Default to no progress bar when stdout is not a terminal or when in CI.
DEFAULT_NO_PROGRESS_BAR = config(
    "NO_PROGRESS_BAR",
    cast=bool,
    default=not sys.stdout.isatty() or _running_in_ci(),
)

# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
from os import path

from setuptools import find_packages, setup

_here = path.dirname(__file__)


dev_requirements = ["black==19.3b0", "flake8==3.7.8", "therapist"]

# Read the README with an explicit encoding and close the handle afterwards.
with open(path.join(_here, "README.md"), encoding="utf-8") as _readme_file:
    _long_description = _readme_file.read()

setup(
    name="stumptown-deployer",
    version="0.2.6",
    author="Mozilla MDN",
    url="https://github.com/mdn/stumptown-deployer",
    description="Deploying static Stumptown sites",
    long_description=_long_description,
    long_description_content_type="text/markdown",
    license="MPL 2.0",
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: Implementation :: CPython",
        "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
    ],
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    install_requires=["boto3", "click", "PyGithub", "GitPython", "python-decouple"],
    extras_require={"dev": dev_requirements},
    entry_points="""
        [console_scripts]
        stumptown-deployer=deployer.main:cli
    """,
    setup_requires=[],
    tests_require=["pytest"],
    keywords="git github s3 boto3 stumptown mdn",
)

# --------------------------------------------------------------------------------
# /deployer/utils.py:
# --------------------------------------------------------------------------------
import os
from pathlib import Path

import click


def _echo(parts, **style):
    """Join *parts* with spaces (after ``str()``) and echo them styled."""
    click.echo(click.style(" ".join(str(part) for part in parts), **style))


def error(*msg):
    """Print a message in red."""
    _echo(msg, fg="red")


def warning(*msg):
    """Print a message in yellow."""
    _echo(msg, fg="yellow")


def info(*msg):
    """Print a message without any colour."""
    _echo(msg)


def success(*msg):
    """Print a message in green."""
    _echo(msg, fg="green")


def ppath(path: Path, current_dir=None):
    """Return *path* relative to *current_dir* (default: ``os.curdir``).

    If *path* is not located under *current_dir* it is returned unchanged.
    """
    current_dir = current_dir or Path(os.curdir)
    p = Path(path)
    try:
        return p.relative_to(current_dir)
    except ValueError:
        # FIXME: Would be nice if it could produce something like ../../other/dir
        return path


def is_junk_file(file_path: Path):
    """Return True for files that should never be uploaded.

    Currently that is macOS ``.DS_Store`` files and names ending in ``~``.
    """
    name = file_path.name
    return name == ".DS_Store" or name.endswith("~")


def fmt_size(bytes_):
    """Format a byte count as ``B``, ``KB`` or ``MB`` (1024-based).

    The unit boundaries are inclusive so that exactly 1024 bytes is shown as
    ``1.0KB`` rather than ``1024B``.
    """
    if bytes_ >= 1024 * 1024:
        return f"{bytes_ / 1024 / 1024:.1f}MB"
    if bytes_ >= 1024:
        return f"{bytes_ / 1024:.1f}KB"
    return f"{int(bytes_)}B"


def fmt_seconds(seconds):
    """Format a duration in seconds as ms, s, ``XmYs`` or ``XhYm``."""
    if seconds < 1:
        return f"{seconds * 1000:.1f}ms"
    if seconds >= 60 * 60:
        minutes = seconds / 60
        return f"{int(minutes) // 60}h{int(minutes) % 60}m"
    if seconds >= 60:
        return f"{int(seconds) // 60}m{int(seconds) % 60}s"
    return f"{seconds:.1f}s"


# --------------------------------------------------------------------------------
# /deployer/main.py:
# --------------------------------------------------------------------------------
import functools
import pkg_resources
from pathlib import Path

import click

from .constants import (
    DEFAULT_NAME_PATTERN,
    DEFAULT_BUCKET,
    S3_DEFAULT_BUCKET_LOCATION,
    DEFAULT_NO_PROGRESS_BAR,
)
from .exceptions import CoreException
from .upload import upload_site
from .utils import error, info


def cli_wrap(fn):
    """Decorator that turns a ``CoreException`` into a tidy CLI abort.

    The exception's class name and message are printed and ``click.Abort`` is
    raised instead. The wrapped function's return value is passed through.
    """

    @functools.wraps(fn)
    def inner(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except CoreException as exception:
            info(exception.__class__.__name__)
            error(str(exception))
            raise click.Abort

    return inner


@click.group()
@click.option("--debug/--no-debug", default=False)
@click.pass_context
def cli(ctx, debug):
    """Root command group; stores the ``--debug`` flag on the context object."""
    ctx.ensure_object(dict)
    ctx.obj["debug"] = debug


@cli.command()
@click.pass_context
@cli_wrap
@click.option(
    "--bucket",
    default=DEFAULT_BUCKET,
    help=f"Name of the bucket (default {DEFAULT_BUCKET!r})",
)
@click.option(
    "--name", default=None, help=f"Name of the site (default {DEFAULT_NAME_PATTERN!r})"
)
@click.option(
    "--bucket-location",
    default=S3_DEFAULT_BUCKET_LOCATION,
    help=(
        "Location (region) of the bucket when it has to be created "
        f"(default {S3_DEFAULT_BUCKET_LOCATION!r})"
    ),
)
@click.option(
    "--refresh",
    default=False,
    help="Ignores checking if files exist already",
    show_default=True,
    is_flag=True,
)
@click.option(
    "--bucket-lifecycle-days",
    required=False,
    type=int,
    help=(
        "If specified, the number of days until uploaded objects are deleted. "
        "(Only applicable when buckets are created!)"
    ),
)
@click.option(
    "--dry-run",
    default=False,
    help="No actual uploading",
    show_default=True,
    is_flag=True,
)
@click.option(
    "--no-progress-bar",
    default=DEFAULT_NO_PROGRESS_BAR,
    help="Don't use an interactive progress bar",
    show_default=True,
    is_flag=True,
)
@click.argument("directory", type=click.Path())
def upload(ctx, directory, **kwargs):
    """Upload DIRECTORY; all options are merged into the context object."""
    p = Path(directory)
    if not p.exists():
        error(f"{directory} does not exist")
        raise click.Abort

    ctx.obj.update(kwargs)
    upload_site(directory, ctx.obj)


@cli.command()
@click.pass_context
def version(ctx):
    """Print the installed version of stumptown-deployer."""
    info(pkg_resources.get_distribution("stumptown-deployer").version)


# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # stumptown-deployer
#
# Ship a Stumptown static site for web hosting.
#
# Don't tell anyone, but for now it's all AWS as the backend but that's an
# implementation detail that shouldn't prevent us from one day moving to Google Cloud
# Platform or Azure or Fastly.
#
# ## Limitations and caveats
#
# - Redirects - in the build directory we're supposed to have `/en-us/_redirects.txt`
#
# - Preferred names - file systems might not be allowed to call a folder a certain thing
#   but that's not necessarily what we want the key to be called in S3.
#
# - GitHub integration
#
# ## How it works
#
# This project's goal is ultimately to take a big directory of files and upload them to
# S3. But there are some more advanced features such as turning `_redirects.txt` files
# into S3 redirect keys. And there might be file system names that don't match exactly
# what we need the S3 key to be called.
Also, the directory is bound to contain 24 | "junk" that should be omitted. For example, Yari produces `index.hash` files which 25 | are used to remember the checksum when it built the `index.html`. 26 | 27 | Generally, all deployments go into one and the same S3 bucket. But in that bucket 28 | you always have a "prefix" (aka. a root folder) which gets used by CloudFront so you 29 | can have N CloudFront distributions for 1 S3 bucket. For example, one prefix might 30 | be called `master` which'll be the production site. Another prefix might be 31 | `peterbe-pr12345`. 32 | 33 | So every deployment has a prefix (aka. the "name") which can be automatically 34 | generated based on the name of the current branch, which'd be known to something 35 | like TravisCI. The first thing it does is that it downloads a complete listing of 36 | every known key in the bucket under that prefix and each key's size. (That's all 37 | you get from `bucket.list_objects_v2`). Now, it starts to walk the local directory 38 | and for each _file_ it applies the following logic: 39 | 40 | - Does the S3 key _not_ exist at all? --> Upload brand new S3 key! 41 | - Does the S3 key _exist_? 42 | - Is the file size different from the S3 key size? --> Upload changed S3 key! 43 | - Is the file size exactly the same as the S3 key size? --> Download the 44 | S3 key's `Metadata->filehash`. 45 | - Is the hash exactly the same as the file's hash? --> Do nothing! 46 | - Is the hash different? --> Upload changed S3 key! 47 | 48 | When it uploads an S3 key, _always_ compute the local file's hash and include that 49 | as a piece of S3 key Metadata. 50 | 51 | ## Getting started 52 | 53 | You can install it globally or in a virtualenv environment. Whatever floats 54 | your boat. 
55 | 56 | pip install stumptown-deployer 57 | stumptown-deployer --help 58 | 59 | Please refer to the [`boto3` documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration) with regard to configuring AWS access 60 | credentials. 61 | 62 | ## Goal 63 | 64 | To be dead-easy to use and powerful at the same time. 65 | 66 | ## Contributing 67 | 68 | Clone this repo then run: 69 | 70 | pip install -e ".[dev]" 71 | 72 | That should have installed the CLI `stumptown-deployer` 73 | 74 | stumptown-deployer --help 75 | 76 | If you wanna make a PR, make sure it's formatted with `black` and passes `flake8`. 77 | 78 | You can check that all files are `flake8` fine by running: 79 | 80 | flake8 deployer 81 | 82 | And to check that all files are formatted according to `black` run: 83 | 84 | black --check deployer 85 | 86 | All of the code style stuff can be simplified by installing `therapist`. It should 87 | get installed by default, but setting it up as a `git` `pre-commit` hook is optional. 88 | Here's how you set it up once: 89 | 90 | therapist install 91 | 92 | Now, next time you try to commit a `.py` file with a `black` or `flake8` violation 93 | it will remind you and block the commit. You can override it like this: 94 | 95 | git commit -a --no-verify -m "I know what I'm doing" 96 | 97 | To run _all_ code style and lint checkers you can also use `therapist` with: 98 | 99 | therapist run --use-tracked-files 100 | 101 | Some things can't be automatically fixed, but `black` violations can for example: 102 | 103 | therapist run --use-tracked-files --fix 104 | 105 | ## Contributing and using 106 | 107 | If you'd like to use the globally installed executable `stumptown-deployer` 108 | but don't want to depend on a new PyPI release for every change you want 109 | to try, use this: 110 | 111 | # If you use a virtualenv, deactivate it first 112 | deactivate 113 | # Use the global pip (or pip3) on your system 114 | pip3 install -e . 
# (README.md, continued)
#
# If you do this, you can use this repo to install in your system.
#
# --------------------------------------------------------------------------------
# /deployer/upload.py:
# --------------------------------------------------------------------------------
import concurrent.futures
import datetime
import getpass
import hashlib
import mimetypes
import shutil
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import boto3
import git
from boto3.s3.transfer import TransferConfig
from botocore.exceptions import ClientError
from git.exc import InvalidGitRepositoryError

from .constants import (
    AWS_PROFILE,
    DEFAULT_CACHE_CONTROL,
    DEFAULT_NAME_PATTERN,
    HASHED_CACHE_CONTROL,
    MAX_WORKERS_PARALLEL_UPLOADS,
)
from .exceptions import NoGitDirectory, CantDryRunError
from .utils import fmt_seconds, fmt_size, info, is_junk_file, ppath, success, warning

# Matches a hex digest embedded in a file name, e.g. '2.02b14290.chunk.css'.
hashed_filename_regex = re.compile(r"\.[a-f0-9]{8,32}\.")

# Number of upload tasks collected before they are handed to the thread pool.
_BATCH_SIZE = 1000


def _find_git_repo(start):
    """Return the ``git.Repo`` for *start* or the nearest parent directory.

    The path is resolved to an absolute one first. Walking ``.parent`` of a
    relative path never reaches the file system root (``Path(".").parent`` is
    ``Path(".")``), which previously caused unbounded recursion.

    Raises:
        NoGitDirectory: if neither *start* nor any parent is a git repository.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        try:
            return git.Repo(candidate)
        except InvalidGitRepositoryError:
            continue
    raise NoGitDirectory


def _has_hashed_filename(fn):
    """Return True if the base name of *fn* contains a hex digest segment."""
    return hashed_filename_regex.search(os.path.basename(fn)) is not None


@dataclass()
class UploadTask:
    """All the relevant information for doing an upload"""

    key: str
    file_path: Path
    size: int
    # None until set_file_hash() has been called.
    file_hash: Optional[str]
    # True when the key exists with the same size, so only a hash comparison
    # can tell whether the content changed.
    needs_hash_check: bool

    def __repr__(self):
        return repr(self.key)

    def set_file_hash(self):
        """Compute the MD5 hex digest of the file, reading it in chunks."""
        digest = hashlib.md5()
        with open(self.file_path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                digest.update(chunk)
        self.file_hash = digest.hexdigest()


def _default_prefix_name(directory, config):
    """Build the default prefix name from the user and the current git branch.

    Raises:
        NoGitDirectory: if no git repository can be found from *directory*.
    """
    try:
        repo = _find_git_repo(directory)
    except NoGitDirectory:
        raise NoGitDirectory(
            f"From {directory} can't find its git root directory "
            "which is needed to supply a default branchname."
        ) from None
    active_branch = repo.active_branch
    # Compare the branch *name*; the branch object itself never equals a str.
    # The CLI option is stored under 'bucket_lifecycle_days'.
    lifecycle_days = config.get("bucket_lifecycle_days")
    if active_branch.name == "master" and lifecycle_days:
        warning(
            f"Warning! You're setting a lifecycle_days "
            f"({lifecycle_days} days) on a build from a 'master' repo."
        )
    return DEFAULT_NAME_PATTERN.format(
        username=getpass.getuser(),
        branchname=active_branch.name,
        date=datetime.datetime.utcnow().strftime("%Y%m%d"),
    )


def _ensure_bucket(s3, config):
    """Make sure the bucket exists, creating it (and its lifecycle) if needed.

    Raises:
        CantDryRunError: if the bucket is missing and ``dry_run`` is set.
        ClientError: for any ``head_bucket`` failure other than a 404.
    """
    try:
        s3.head_bucket(Bucket=config["bucket"])
    except ClientError as error:
        # If a client error is thrown, then check that it was a 404 error.
        # If it was a 404 error, then the bucket does not exist.
        if error.response["Error"]["Code"] != "404":
            warning(error.response)
            raise
    else:
        info(f"Bucket {config['bucket']!r} exists")
        return

    # Needs to be created.
    if config["dry_run"]:
        raise CantDryRunError(
            f"The bucket ({config['bucket']}) doesn't exist and won't be created "
            "in dry-run mode. But it needs to exist to be able to find out "
            "what files already exist."
        )
    create_kwargs = {"Bucket": config["bucket"], "ACL": "public-read"}
    # Only send a CreateBucketConfiguration when there is a location to put in
    # it; otherwise let S3 use its default region.
    if config["bucket_location"]:
        create_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": config["bucket_location"]
        }
    s3.create_bucket(**create_kwargs)
    info(f"Bucket {config['bucket']!r} created")

    if config["bucket_lifecycle_days"]:
        # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.put_bucket_lifecycle_configuration
        # https://docs.aws.amazon.com/code-samples/latest/catalog/python-s3-put_bucket_lifecyle_configuration.py.html
        s3.put_bucket_lifecycle_configuration(
            Bucket=config["bucket"],
            LifecycleConfiguration={
                "Rules": [
                    {
                        "Expiration": {"Days": config["bucket_lifecycle_days"]},
                        "Filter": {"Prefix": ""},
                        "Status": "Enabled",
                    }
                ]
            },
        )
        info(
            f"Bucket lifecycle expiration of "
            f"{config['bucket_lifecycle_days']!r} days configured."
        )


def _ensure_bucket_website(s3, config):
    """Return the bucket's website configuration, creating one if missing."""
    try:
        return s3.get_bucket_website(Bucket=config["bucket"])
    except ClientError as error:
        if error.response["Error"]["Code"] != "NoSuchWebsiteConfiguration":
            raise
    # Define the website configuration
    website_configuration = {
        "ErrorDocument": {"Key": "404.html"},
        "IndexDocument": {"Suffix": "index.html"},
        "RoutingRules": [
            {
                "Condition": {"KeyPrefixEquals": "/"},
                "Redirect": {"ReplaceKeyWith": "index.html"},
            }
        ],
    }
    website_bucket = s3.put_bucket_website(
        Bucket=config["bucket"], WebsiteConfiguration=website_configuration
    )
    info(f"Created bucket website configuration for {config['bucket']!r}")
    return website_bucket


def _list_uploaded_already(s3, config):
    """Return ``{key: object-listing}`` for everything under the prefix."""
    uploaded_already = {}
    continuation_token = None
    while True:
        # Have to do this so that 'ContinuationToken' can be omitted if falsy
        list_kwargs = dict(Bucket=config["bucket"], Prefix=config["name"])
        if continuation_token:
            list_kwargs["ContinuationToken"] = continuation_token
        response = s3.list_objects_v2(**list_kwargs)
        for obj in response.get("Contents", []):
            uploaded_already[obj["Key"]] = obj
        if not response["IsTruncated"]:
            return uploaded_already
        continuation_token = response["NextContinuationToken"]


def _is_ignored(fp):
    """Return True for junk files and files whose name starts with ``_``."""
    return is_junk_file(fp) or fp.name.startswith("_")


def upload_site(directory, config):
    """Upload every relevant file under *directory* to S3.

    Args:
        directory: ``str`` or ``Path`` of the build directory.
        config: dict with the keys ``bucket``, ``name``, ``bucket_location``,
            ``refresh``, ``bucket_lifecycle_days``, ``dry_run``,
            ``no_progress_bar`` and ``debug``. ``config["name"]`` is filled in
            from the git branch when it is empty.

    Raises:
        NoGitDirectory: no name was given and no git repository was found.
        CantDryRunError: the bucket does not exist in dry-run mode.
        ValueError: the resulting prefix name is empty.
    """
    if isinstance(directory, str):
        directory = Path(directory)
    if not config.get("name"):
        config["name"] = _default_prefix_name(directory, config)
    # 'config' is a dict; it is the *name* that must not be blank.
    if not config["name"].replace("-", "").strip():
        raise ValueError("Empty prefix name")
    info(
        f"About to upload {ppath(directory)} to prefix {config['name']!r} "
        f"into bucket {config['bucket']!r}"
    )

    session = boto3.Session(profile_name=AWS_PROFILE)
    s3 = session.client("s3")

    # First make sure the bucket exists
    _ensure_bucket(s3, config)

    website_bucket = _ensure_bucket_website(s3, config)
    if config["debug"]:
        info(f"Website bucket: {website_bucket!r}")

    uploaded_already = {}

    if config["refresh"]:
        info("Refresh, so ignoring what was previously uploaded.")
    else:
        info(
            f"Gather complete list of existing uploads under prefix "
            f"{config['name']!r}..."
        )
        t0 = time.time()
        uploaded_already = _list_uploaded_already(s3, config)
        t1 = time.time()

        warning(
            f"{len(uploaded_already):,} files already uploaded "
            f"(took {fmt_seconds(t1 - t0)})."
        )

    # First pass over the directory: only count, so progress can be reported.
    t0 = time.time()
    total_todo = sum(1 for fp in pwalk(directory) if not _is_ignored(fp))
    t1 = time.time()
    warning(
        f"{total_todo:,} files to be (maybe) uploaded "
        f"(took {fmt_seconds(t1 - t0)})."
    )

    transfer_config = TransferConfig()

    # Number of files that don't need to be uploaded because they are already uploaded
    # without a difference.
    skipped = 0

    # Number of files we deliberately chose to NOT upload. Or even attempt to.
    ignored = 0

    counts = {"uploaded": 0, "not_uploaded": 0}

    total_size = []
    total_time = []

    def update_uploaded_stats(stats):
        """Fold one batch's statistics into the totals and redraw the bar."""
        counts["uploaded"] += stats["counts"].get("uploaded")
        counts["not_uploaded"] += stats["counts"].get("not_uploaded")
        total_size.append(stats["total_size_uploaded"])
        total_time.append(stats["total_time"])
        if not config["no_progress_bar"] and total_todo:
            done = counts["uploaded"] + counts["not_uploaded"]
            percentage = 100 * done / total_todo
            columns = shutil.get_terminal_size((80, 20)).columns
            # Leave room for the 20-char counter, the brackets and the
            # percentage so the whole line fits the terminal and '\r' can
            # overwrite it in place.
            max_bar_width = max(columns - 30, 10)
            bar_width = int(max_bar_width * done / total_todo)
            print(
                f"{done:,} of {total_todo:,}".ljust(20)
                + f"[{'▋' * bar_width:<{max_bar_width}}] "
                f"{percentage:.1f}%\r",
                end="",
            )

    if config["no_progress_bar"]:
        log = info
    else:

        current_log_file_name = "upload.log"
        info(f"Logging progress into {current_log_file_name}")

        def log(line):
            # Per-file messages would garble the progress bar, so they go to
            # a log file instead.
            with open(current_log_file_name, "a") as f:
                f.write(f"{line}\n")

    def flush(tasks):
        """Upload one batch of tasks and record its statistics."""
        update_uploaded_stats(
            _start_uploads(
                s3, config, tasks, transfer_config, log=log, dry_run=config["dry_run"]
            )
        )

    batch = []

    T0 = time.time()
    for fp in pwalk(directory):
        if _is_ignored(fp):
            ignored += 1
            continue
        # This assumes that it can saved in S3 as a key that is the filename.
        key_path = fp.relative_to(directory)
        key = f"{config['name']}/{key_path}"

        size = fp.stat().st_size
        # The hash is computed lazily, in the worker thread.
        task = UploadTask(key, fp, size, None, False)
        if key not in uploaded_already or uploaded_already[key]["Size"] != size:
            # No doubt! We definitely didn't have this before or it's definitely
            # different.
            batch.append(task)
        elif _has_hashed_filename(key):
            # The key exists with the same size and the file name already
            # carries a content digest, so there is no need to check further.
            skipped += 1
            continue
        else:
            # The key exists and the size hasn't changed. That's not
            # conclusive: an edit can change the content while keeping the
            # size exactly the same. So we're going to *maybe* upload it,
            # depending on a hash comparison.
            task.needs_hash_check = True
            batch.append(task)

        if len(batch) >= _BATCH_SIZE:
            # Fire off these
            flush(batch)
            batch = []

    if batch:
        flush(batch)

    T1 = time.time()
    if not config["no_progress_bar"] and (counts["uploaded"] or counts["not_uploaded"]):
        # Move off the progress bar line (it ends in '\r', not a newline).
        print()
    success(
        f"{counts['uploaded']:,} files uploaded, "
        f"{counts['not_uploaded']:,} files didn't need to be uploaded."
    )
    info(f"Total thread-pool time: {fmt_seconds(sum(total_time))}")
    success(f"Uploaded {fmt_size(sum(total_size))}.")
    if config["dry_run"]:
        warning("Remember! In dry-run mode")
    success(f"Done in {fmt_seconds(T1 - T0)}.")


def _start_uploads(s3, config, batch, transfer_config, log=info, dry_run=False):
    """Run ``_upload_file_maybe`` for every task in *batch* in a thread pool.

    Returns:
        dict with ``counts`` (``uploaded`` / ``not_uploaded``), ``took``
        (wall-clock seconds), ``total_time`` (sum of per-task seconds) and
        ``total_size_uploaded`` (bytes).
    """
    T0 = time.time()
    futures = {}
    total_threadpool_time = []
    counts = {"uploaded": 0, "not_uploaded": 0}
    total_size_uploaded = 0
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=MAX_WORKERS_PARALLEL_UPLOADS
    ) as executor:
        bucket_name = config["bucket"]
        for task in batch:
            future = executor.submit(
                _upload_file_maybe,
                s3,
                task,
                bucket_name,
                transfer_config,
                log=log,
                dry_run=dry_run,
            )
            futures[future] = task

        for future in concurrent.futures.as_completed(futures):
            # .result() re-raises any exception from the worker thread.
            was_uploaded, took = future.result()
            task = futures[future]
            total_threadpool_time.append(took)
            if was_uploaded:
                counts["uploaded"] += 1
                total_size_uploaded += task.size
            else:
                counts["not_uploaded"] += 1

    T1 = time.time()

    return {
        "counts": counts,
        "took": T1 - T0,
        "total_time": sum(total_threadpool_time),
        "total_size_uploaded": total_size_uploaded,
    }


def pwalk(start):
    """Recursively yield a ``Path`` for every non-directory entry under *start*."""
    # The context manager closes the scandir iterator's directory handle.
    with os.scandir(start) as entries:
        for entry in entries:
            if entry.is_dir():
                yield from pwalk(entry)
            else:
                yield Path(entry)


def _upload_file_maybe(s3, task, bucket_name, transfer_config, log=info, dry_run=False):
    """Upload one file unless its hash matches what is already in S3.

    Returns:
        ``(was_uploaded, seconds_taken)``. In dry-run mode nothing is sent but
        the file is still reported as uploaded.
    """
    t0 = time.time()
    if not task.file_hash:
        task.set_file_hash()
    if task.needs_hash_check:
        try:
            object_data = s3.head_object(Bucket=bucket_name, Key=task.key)
        except ClientError as error:
            # If a client error is thrown, then check that it was a 404 error.
            # If it was a 404 error, then the key does not exist.
            if error.response["Error"]["Code"] != "404":
                raise

            # If it really was a 404, it means that the method that gathered
            # the existing list is out of sync. Fall through and upload.
        else:
            if object_data["Metadata"].get("filehash") == task.file_hash:
                # We can bail early!
                t1 = time.time()
                start = f"{fmt_size(task.size):} in {fmt_seconds(t1 - t0)}"
                log(f"Skipped {start:>19} {task.key}")
                return False, t1 - t0

    mime_type = mimetypes.guess_type(str(task.file_path))[0] or "binary/octet-stream"

    if os.path.basename(task.file_path) == "service-worker.js":
        cache_control = "no-cache"
    else:
        cache_control_seconds = DEFAULT_CACHE_CONTROL
        if _has_hashed_filename(task.file_path):
            cache_control_seconds = HASHED_CACHE_CONTROL
        cache_control = f"max-age={cache_control_seconds}, public"

    ExtraArgs = {
        "ACL": "public-read",
        "ContentType": mime_type,
        "CacheControl": cache_control,
        # Stored so the next run can compare hashes instead of re-uploading.
        "Metadata": {"filehash": task.file_hash},
    }
    if not dry_run:
        s3.upload_file(
            str(task.file_path),
            bucket_name,
            task.key,
            ExtraArgs=ExtraArgs,
            Config=transfer_config,
        )
    t1 = time.time()

    start = f"{fmt_size(task.size)} in {fmt_seconds(t1 - t0)}"
    log(f"{'Updated' if task.needs_hash_check else 'Uploaded'} {start:>20} {task.key}")
    return True, t1 - t0


# --------------------------------------------------------------------------------
# /LICENSE:
# --------------------------------------------------------------------------------
# Mozilla Public License Version 2.0
# ==================================
#
# 1.
Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. 
"Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. 
Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. 
Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. 
However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | --------------------------------------------------------------------------------