├── .github └── workflows │ └── deploy.yml ├── .travis.yml ├── CITATION.cff ├── LICENSE ├── README.md ├── config.py ├── documentation ├── .github │ └── workflows │ │ └── deploy.yml └── DCP-documentation │ ├── AWS_hygiene_scripts.md │ ├── SQS_QUEUE_information.md │ ├── _config.yml │ ├── _toc.yml │ ├── advanced_configuration.md │ ├── config_examples.md │ ├── costs.md │ ├── dashboard.md │ ├── external_buckets.md │ ├── images │ ├── AMIID.jpg │ ├── DCP-chronological_schematic.png │ ├── ECS.jpg │ ├── InstanceID.jpg │ ├── Launch.jpg │ ├── LoadDataCSV.png │ ├── Network.jpg │ ├── Snapshot.jpg │ ├── adjust_dashboard_view.png │ ├── blip_in_messagesnotvisible.png │ ├── dashboard_overview.png │ ├── expand_error_log.png │ ├── fulfilledcapacity.png │ ├── logs_comparison.png │ ├── memoryutilization.png │ ├── messages_change_slope.png │ ├── messages_deleted_received.png │ ├── sample_DCP_config_1.png │ ├── sample_DCP_config_2.png │ └── zoom_into_dashboard.png │ ├── overview.md │ ├── overview_2.md │ ├── passing_files_to_DCP.md │ ├── step_0_prep.md │ ├── step_1_configuration.md │ ├── step_2_submit_jobs.md │ ├── step_3_start_cluster.md │ ├── step_4_monitor.md │ ├── troubleshooting_runs.md │ ├── troubleshooting_start_cluster.md │ └── versions.md ├── example_project ├── README.md ├── config.py ├── demo_project_folder │ ├── images │ │ ├── 01_POS002_D.TIF │ │ ├── 01_POS002_F.TIF │ │ ├── 01_POS002_R.TIF │ │ ├── 01_POS076_D.TIF │ │ ├── 01_POS076_F.TIF │ │ ├── 01_POS076_R.TIF │ │ ├── 01_POS218_D.TIF │ │ ├── 01_POS218_F.TIF │ │ └── 01_POS218_R.TIF │ └── workspace │ │ ├── ExampleFly.cppipe │ │ └── load_data.csv └── files │ └── exampleJob.json ├── example_project_CPG ├── README.md ├── config.py ├── demo_project_folder │ └── workspace │ │ ├── ExampleCPG.cppipe │ │ └── load_data.csv └── files │ └── exampleCPGJob.json ├── files ├── ManualMetadata.py ├── batches.sh ├── exampleFleet_us-east-1.json ├── exampleFleet_us-west-2.json ├── exampleJob.json └── requirements.txt ├── lambda_function.py ├── python2worker ├── Dockerfile ├── Makefile ├── cp-worker.py ├── instance-monitor.py └── run-worker.sh ├── run.py ├── run_batch_general.py ├── setup_AWS.py └── worker ├── Dockerfile ├── Makefile ├── cp-worker.py ├── instance-monitor.py └── run-worker.sh /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy-documentation 2 | 3 | # Only run this when the master branch changes 4 | on: 5 | push: 6 | branches: 7 | - master 8 | # Only run if edits in DCP-documentation 9 | paths: 10 | - documentation/DCP-documentation/** 11 | - .github/workflows/deploy.yml 12 | 13 | # This job installs dependencies, builds the book, and pushes it to `gh-pages` 14 | jobs: 15 | deploy-book: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | # Install dependencies 21 | - name: Set up Python 3.8 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: 3.8 25 | 26 | - name: Install dependencies 27 | run: | 28 | pip install jupyter-book 29 | 30 | # Build the book 31 | - name: Build the book 32 | run: | 33 | jupyter-book build documentation/DCP-documentation/ 34 | 35 | # Push the book's HTML to github-pages 36 | - name: GitHub Pages action 37 | uses: peaceiris/actions-gh-pages@v3.9.3 38 | with: 39 | github_token: ${{ secrets.GITHUB_TOKEN }} 40 | publish_dir: ./documentation/DCP-documentation/_build/html 41 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: python 2 | matrix: 3 | include: 4 | - python: 2.7 5 | - python: 3.6 6 | allow_failures: 7 | - python: 3.6 8 | install: 9 | - pip install --upgrade pip 10 | - pip install -r requirements.txt 11 | - pip install flake8 12 | before_script: 13 | # stop the build if there are Python syntax errors or undefined names 14 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 15 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 16 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 17 | script: true # pytest 18 | notifications: 19 | email: false 20 | slack: 21 | secure: kDWVy90sDY+o3g0/ZTGX2D+PTbzhtd74Whe1AJHhcUDobTUzkch8GtY9eZxybZk4nga9lQxL6YeJ72SfBBEPaLzXcUMe0YcNaBydkQHcipKZn+Vcb8kf2FiZC6YwsUYfTvvH9MPLbkZOZvsNyd0h85z+hYMB8jHsq6Yn5gf79BA= 22 | on_failure: always 23 | on_success: change 24 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite at least McQuin et al. Also site Weisbart et al. for updated DCP functions" 3 | type: software 4 | authors: 5 | - name: 'Imaging Platform, Broad Institute of Harvard and MIT' 6 | city: Cambridge 7 | country: US 8 | repository: https://github.com/DistributedScience/Distributed-CellProfiler 9 | title: "Distributed-CellProfiler" 10 | doi: 10.1371/journal.pbio.2005970 11 | date-released: 2018-07-03 12 | preferred-citation: 13 | type: article 14 | authors: 15 | - family-names: "McQuin" 16 | given-names: "Claire" 17 | orcid: "https://orcid.org/0000-0002-3664-2318" 18 | - family-names: "Goodman" 19 | given-names: "Allen" 20 | orcid: "https://orcid.org/0000-0002-6434-2320" 21 | - family-names: "Chernyshev" 22 | given-names: "Vasiliy" 23 | orcid: "https://orcid.org/0000-0003-2372-7037" 24 | - family-names: "Kamentsky" 25 | given-names: "Lee" 26 | orcid: "https://orcid.org/0000-0002-8161-3604" 27 | - family-names: "Cimini" 28 | given-names: "Beth A." 29 | orcid: "https://orcid.org/0000-0001-9640-9318" 30 | - family-names: "Karhohs" 31 | given-names: "Kyle W." 32 | orcid: "https://orcid.org/0000-0002-5126-5805" 33 | - family-names: "Doan" 34 | given-names: "Minh" 35 | orcid: "https://orcid.org/0000-0002-3235-0457" 36 | - family-names: "Ding" 37 | given-names: "Liya" 38 | - family-names: "Rafelski" 39 | given-names: "Susanne M." 40 | orcid: "https://orcid.org/0000-0002-1399-5970" 41 | - family-names: "Thirstrup" 42 | given-names: "Derek" 43 | orcid: "https://orcid.org/0000-0002-2702-2010" 44 | - family-names: "Wiegraebe" 45 | given-names: "Winfried" 46 | orcid: "https://orcid.org/0000-0002-1099-4817" 47 | - family-names: "Singh" 48 | given-names: "Shantanu" 49 | orcid: "https://orcid.org/0000-0003-3150-3025" 50 | - family-names: "Becker" 51 | given-names: "Tim" 52 | orcid: "https://orcid.org/0000-0001-9615-0799" 53 | - family-names: "Caicedo" 54 | given-names: "Juan C." 55 | orcid: "https://orcid.org/0000-0002-1277-4631" 56 | - family-names: "Carpenter" 57 | given-names: "Anne E." 58 | orcid: "https://orcid.org/0000-0003-1555-8261" 59 | doi: "10.1371/journal.pbio.2005970" 60 | journal: "PLOS Biology" 61 | month: 7 62 | start: 0 # First page number 63 | end: 0 # Last page number 64 | title: "CellProfiler 3.0: Next-generation image processing for biology." 
65 | issue: 16 66 | volume: 7 67 | year: 2018 68 | reference: 69 | type: article 70 | authors: 71 | - family-names: "Weisbart" 72 | given-names: "Erin" 73 | orcid: "https://orcid.org/0000-0002-6437-2458" 74 | - family-names: "Cimini" 75 | given-names: "Beth A." 76 | orcid: "https://orcid.org/0000-0001-9640-9318" 77 | doi: "10.1038/s41592-023-01918-8" 78 | journal: "Nature Methods" 79 | month: 6 80 | start: 0 # First page number 81 | end: 0 # Last page number 82 | title: "Distributed-Something: scripts to leverage AWS storage and computing for distributed workflows at scale" 83 | issue: 0 84 | volume: 0 85 | year: 2023 86 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Distributed-CellProfiler is distributed under the following BSD-style license: 2 | 3 | Copyright © 2020 Broad Institute, Inc. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the Broad Institute, Inc. nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED “AS IS.” BROAD MAKES NO EXPRESS OR IMPLIED 21 | REPRESENTATIONS OR WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND 22 | COPYRIGHT, INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF 23 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY 24 | DOCUMENTATION, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER 25 | DEFECTS, WHETHER OR NOT DISCOVERABLE. IN NO EVENT SHALL BROAD, THE 26 | COPYRIGHT HOLDERS, OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 | BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 29 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 30 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 31 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 32 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF, HAVE REASON TO KNOW, OR IN 33 | FACT SHALL KNOW OF THE POSSIBILITY OF SUCH DAMAGE. 34 | 35 | If, by operation of law or otherwise, any of the aforementioned 36 | warranty disclaimers are determined inapplicable, your sole remedy, 37 | regardless of the form of action, including, but not limited to, 38 | negligence and strict liability, shall be replacement of the software 39 | with an updated version if one exists. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed-CellProfiler 2 | Run encapsulated docker containers with CellProfiler in the Amazon Web Services infrastructure. 3 | 4 | This code is an example of how to use AWS distributed infrastructure for running CellProfiler. 5 | The configuration of the AWS resources is done using boto3 and the AWS CLI. 
6 | The worker is written in Python and is encapsulated in a docker container. 7 | There are four AWS components that are minimally needed to run distributed jobs: 8 | 9 | 1. An SQS queue 10 | 2. An ECS cluster 11 | 3. An S3 bucket 12 | 4. A spot fleet of EC2 instances 13 | 14 | All of them can be managed through the AWS Management Console. 15 | However, this code helps to get started quickly and run a job autonomously if all the configuration is correct. 16 | The code prepares the infrastructure to run a distributed job. 17 | When the job is completed, the code is also able to stop resources and clean up components. 18 | It also adds logging and alarms via CloudWatch, helping the user troubleshoot runs and destroy stuck machines. 19 | 20 | ## Documentation 21 | Comprehensive documentation, including troubleshooting, is available at [Distributed CellProfiler Documentation](https://distributedscience.github.io/Distributed-CellProfiler). 22 | 23 | ## Running the code 24 | 25 | ### Step 1 26 | Edit the config.py file with all the relevant information for your job. 27 | Then, start creating the basic AWS resources by running the following script: 28 | 29 | $ python run.py setup 30 | 31 | This script initializes the resources in AWS. 32 | Notice that the docker registry is built separately, and you can modify the worker code to build your own. 33 | Any time you modify the worker code, you need to update the docker registry using the Makefile script inside the worker directory. 34 | 35 | ### Step 2 36 | After the first script runs successfully, the job can now be submitted to AWS using EITHER of the following commands: 37 | 38 | $ python run.py submitJob files/exampleJob.json 39 | 40 | OR 41 | 42 | $ python run_batch_general.py 43 | 44 | Running either script uploads the tasks that are configured in the json file. 45 | This assumes that your data is stored in S3, and the json file has the paths to find input and output directories. 46 | You have to customize the `exampleJob.json` file or the `run_batch_general.py` file with paths that make sense for your project. 47 | The tasks that compose your job are CP groups, and each one will be run in parallel. 48 | You need to define each task in your input file to guide the parallelization. 49 | 50 | ### Step 3 51 | After submitting the job to the queue, we can add computing power to process all tasks in AWS. 52 | This code starts a fleet of spot EC2 instances which will run the worker code. 53 | The worker code is encapsulated in docker containers, and the code uses ECS services to inject them in EC2. 54 | All of this is automated with the following command: 55 | 56 | $ python run.py startCluster files/exampleFleet.json 57 | 58 | After the cluster is ready, the code informs you that everything is setup, and saves the spot fleet identifier 59 | in a file for further reference. 60 | 61 | ### Step 4 62 | When the cluster is up and running, you can monitor progress using the following command: 63 | 64 | $ python run.py monitor files/APP_NAMESpotFleetRequestId.json 65 | 66 | The file APP_NAMESpotFleetRequestId.json is created after the cluster is setup in step 3. 67 | It is important to keep this monitor running if you want to automatically shutdown computing resources when there are no more tasks in the queue (recommended). 
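If you prefer to spot-check progress without the monitor output or the AWS console, the queue itself can be queried directly.
The sketch below is only an illustration: it assumes the default `APP_NAME` of 'DistributedCP' from config.py (so the queue created by `run.py setup` is named DistributedCPQueue) and the us-east-1 region, so adjust both to match your own configuration.

```python
import boto3

# Assumes APP_NAME = 'DistributedCP', so the queue name is APP_NAME + 'Queue'
sqs = boto3.client('sqs', region_name='us-east-1')
queue_url = sqs.get_queue_url(QueueName='DistributedCPQueue')['QueueUrl']
attributes = sqs.get_queue_attributes(
    QueueUrl=queue_url,
    AttributeNames=['ApproximateNumberOfMessages', 'ApproximateNumberOfMessagesNotVisible'],
)['Attributes']
print(f"Jobs waiting in queue: {attributes['ApproximateNumberOfMessages']}")
print(f"Jobs currently being processed: {attributes['ApproximateNumberOfMessagesNotVisible']}")
```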
68 | 69 | ![Distributed-CellProfiler-Workflow](documentation/DCP-documentation/images/DCP-chronological_schematic.png) 70 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # Constants (User configurable) 2 | 3 | APP_NAME = 'DistributedCP' # Used to generate derivative names unique to the application. 4 | LOG_GROUP_NAME = APP_NAME 5 | 6 | # DOCKER REGISTRY INFORMATION: 7 | DOCKERHUB_TAG = 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' 8 | 9 | # AWS GENERAL SETTINGS: 10 | AWS_REGION = 'us-east-1' 11 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 12 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 13 | AWS_BUCKET = 'your-bucket-name' # Bucket to use for logging 14 | SOURCE_BUCKET = 'bucket-name' # Bucket to download image files from 15 | WORKSPACE_BUCKET = 'bucket-name' # Bucket to download non-image files from 16 | DESTINATION_BUCKET = 'bucket-name' # Bucket to upload files to 17 | UPLOAD_FLAGS = '' # Any flags needed for upload to destination bucket 18 | 19 | # EC2 AND ECS INFORMATION: 20 | ECS_CLUSTER = 'default' 21 | CLUSTER_MACHINES = 3 22 | TASKS_PER_MACHINE = 1 23 | MACHINE_TYPE = ['m5.xlarge'] 24 | MACHINE_PRICE = 0.20 25 | EBS_VOL_SIZE = 30 # In GB. Minimum allowed is 22. 26 | DOWNLOAD_FILES = 'False' 27 | ASSIGN_IP = 'True' # If false, will overwrite setting in Fleet file 28 | 29 | # DOCKER INSTANCE RUNNING ENVIRONMENT: 30 | DOCKER_CORES = 4 # Number of CellProfiler processes to run inside a docker container 31 | CPU_SHARES = DOCKER_CORES * 1024 # ECS computing units assigned to each docker container (1024 units = 1 core) 32 | MEMORY = 15000 # Memory assigned to the docker container in MB 33 | SECONDS_TO_START = 3*60 # Wait before the next CP process is initiated to avoid memory collisions 34 | 35 | # SQS QUEUE INFORMATION: 36 | SQS_QUEUE_NAME = APP_NAME + 'Queue' 37 | SQS_MESSAGE_VISIBILITY = 1*60 # Timeout (secs) for messages in flight (average time to be processed) 38 | SQS_DEAD_LETTER_QUEUE = 'user_DeadMessages' 39 | JOB_RETRIES = 3 # Number of times to retry a job before sending it to DEAD_LETTER_QUEUE 40 | 41 | # MONITORING 42 | AUTO_MONITOR = 'True' 43 | 44 | # CLOUDWATCH DASHBOARD CREATION 45 | CREATE_DASHBOARD = 'True' # Create a dashboard in Cloudwatch for run 46 | CLEAN_DASHBOARD = 'True' # Automatically remove dashboard at end of run with Monitor 47 | 48 | # REDUNDANCY CHECKS 49 | CHECK_IF_DONE_BOOL = 'False' #True or False- should it check if there are a certain number of non-empty files and delete the job if yes? 50 | EXPECTED_NUMBER_FILES = 7 #What is the number of files that trigger skipping a job? 51 | MIN_FILE_SIZE_BYTES = 1 #What is the minimal number of bytes an object should be to "count"? 52 | NECESSARY_STRING = '' #Is there any string that should be in the file name to "count"? 53 | 54 | # CELLPROFILER SETTINGS 55 | ALWAYS_CONTINUE = 'False' # Whether or not to run CellProfiler with the --always-continue flag, which will keep CellProfiler from crashing if it errors 56 | 57 | # PLUGINS 58 | USE_PLUGINS = 'False' # True to use any plugin from CellProfiler-plugins repo 59 | UPDATE_PLUGINS = 'False' # True to download updates from CellProfiler-plugins repo 60 | PLUGINS_COMMIT = 'False' # What commit or version tag do you want to check out? If not, set to False. 61 | INSTALL_REQUIREMENTS = 'False' # True to install REQUIREMENTS defined below. 
Requirements should have all plugin dependencies. 62 | REQUIREMENTS = '' # Flag to use with install (current) or path within the CellProfiler-plugins repo to a requirements file (deprecated). 63 | -------------------------------------------------------------------------------- /documentation/.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy-documentation 2 | 3 | # Only run this when the master branch changes 4 | on: 5 | push: 6 | branches: 7 | - master 8 | # Only run if edits in DS-documentation or 9 | paths: 10 | - documentation/DCP-documentation/** 11 | - .github/workflows/deploy.yml 12 | 13 | # This job installs dependencies, builds the book, and pushes it to `gh-pages` 14 | jobs: 15 | deploy-book: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | 20 | # Install dependencies 21 | - name: Set up Python 3.8 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: 3.8 25 | 26 | - name: Install dependencies 27 | run: | 28 | pip install jupyter-book 29 | 30 | # Build the book 31 | - name: Build the book 32 | run: | 33 | jupyter-book build DCP-documentation/ 34 | 35 | # Push the book's HTML to github-pages 36 | - name: GitHub Pages action 37 | uses: peaceiris/actions-gh-pages@v3.6.1 38 | with: 39 | github_token: ${{ secrets.GITHUB_TOKEN }} 40 | publish_dir: ./DCP-documentation/_build/html 41 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/AWS_hygiene_scripts.md: -------------------------------------------------------------------------------- 1 | # AWS Hygiene Scripts 2 | 3 | See also [AUSPICES](https://github.com/broadinstitute/AuSPICES) for setting up various hygiene scripts to automatically run in your AWS account. 
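
## Clean out old dashboards

Distributed-CellProfiler can also create a CloudWatch Dashboard per run (see [the dashboard documentation](dashboard.md)), and these can accumulate if they are not cleaned up automatically.
The sketch below is not one of the scripts we routinely run; it is a hedged example of how you might remove dashboards whose names contain a project string, so double-check the matched names before deleting.

Python:

```python
import boto3

filterstring = 'MyProjectName'  # assumption: your dashboard names contain this project string
client = boto3.client('cloudwatch')
to_delete = []
kwargs = {}
while True:
    response = client.list_dashboards(**kwargs)
    for dashboard in response['DashboardEntries']:
        if filterstring in dashboard['DashboardName']:
            to_delete.append(dashboard['DashboardName'])
    if 'NextToken' not in response:
        break  # no more pages of dashboards to list
    kwargs['NextToken'] = response['NextToken']

print(f"Deleting {len(to_delete)} dashboards: {to_delete}")
if to_delete:
    client.delete_dashboards(DashboardNames=to_delete)  # DeleteDashboards accepts a list of names
```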

## Clean out old alarms

Python:

```python
import boto3
import time

filterstring = 'MyProjectName'
client = boto3.client('cloudwatch')
alarms = client.describe_alarms(AlarmTypes=['MetricAlarm'], StateValue='INSUFFICIENT_DATA')
while True:
    for eachalarm in alarms['MetricAlarms']:
        if eachalarm['StateValue'] == 'INSUFFICIENT_DATA':
            if filterstring in eachalarm['AlarmName']:
                client.delete_alarms(AlarmNames=[eachalarm['AlarmName']])
                time.sleep(1)  # avoid throttling
    if 'NextToken' not in alarms:
        break  # no more pages of alarms to fetch
    token = alarms['NextToken']
    print(token)
    alarms = client.describe_alarms(AlarmTypes=['MetricAlarm'], StateValue='INSUFFICIENT_DATA', NextToken=token)
```

## Clean out old log groups

Bash:

```sh
aws logs describe-log-groups| in2csv -f json --key logGroups > logs.csv
```

R:

(requires `dplyr` and `readr`)

```r
library(dplyr)
library(readr)
read_csv(
  "logs.csv",
  col_types = cols_only(
    storedBytes = col_integer(),
    creationTime = col_double(),
    logGroupName = col_character()
  )
) %>%
  mutate(creationTime =
           as.POSIXct(creationTime / 1000,
                      origin = "1970-01-01")) %>%
  filter(storedBytes == 0) %>%
  select(logGroupName) %>%
  write_tsv("logs_clear.txt", col_names = F)
```

Bash:

```sh
parallel aws logs delete-log-group --log-group-name {1} :::: logs_clear.txt
```
--------------------------------------------------------------------------------
/documentation/DCP-documentation/SQS_QUEUE_information.md:
--------------------------------------------------------------------------------
# SQS QUEUE Information

This is in-depth information about the configurable components in SQS QUEUE INFORMATION, a section in [Step 1: Configuration](step_1_configuration.md) of running Distributed CellProfiler.

## SQS_QUEUE_NAME

**SQS_QUEUE_NAME** is the name of the queue where all of your jobs are sent.
(A queue is exactly what it sounds like - a list of things waiting their turn.)
Each job represents one complete run through a CellProfiler pipeline, though a job may involve any number of images: an analysis run may require thousands of jobs, each making one complete CellProfiler run on a single image, while an illumination correction run may be a single job that iterates through thousands of images to produce a single output file.
You want a name that is descriptive enough to distinguish it from other queues.
We usually name our queues based on the project and the step or pipeline goal.
An example may be something like Hepatocyte_Differentiation_Illum or Lipid_Droplet_Analysis.

## SQS_DEAD_LETTER_QUEUE

**SQS_DEAD_LETTER_QUEUE** is the name of the queue where all the jobs that failed to run are sent.
If everything goes perfectly, this will always remain empty.
If jobs that are in the queue fail multiple times (our default is 10), they are moved to the dead-letter queue, which is not used to initiate jobs.
The dead-letter queue therefore functions effectively as a log so you can see if any of your jobs failed.
It is different from your other queue in that machines do not try to pull jobs from it.
Pro tip: Each member of our team maintains their own dead-letter queue so we don't have to worry about finding messages if multiple people are running jobs at the same time.
We use names like DeadMessages_Erin.
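If you just want to know whether anything has landed in your dead-letter queue without opening the console, a minimal boto3 sketch like the one below works; it assumes a dead-letter queue named DeadMessages_Erin as in the example above, so substitute your own SQS_DEAD_LETTER_QUEUE name.

```python
import boto3

sqs = boto3.client('sqs')
dlq_url = sqs.get_queue_url(QueueName='DeadMessages_Erin')['QueueUrl']  # assumed queue name
count = sqs.get_queue_attributes(
    QueueUrl=dlq_url, AttributeNames=['ApproximateNumberOfMessages']
)['Attributes']['ApproximateNumberOfMessages']
print(f"Failed jobs sitting in the dead-letter queue: {count}")

# Peek at a few failed job definitions without hiding them from other readers
for message in sqs.receive_message(
    QueueUrl=dlq_url, MaxNumberOfMessages=10, VisibilityTimeout=0
).get('Messages', []):
    print(message['Body'])
```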

If all of your jobs end up in your dead-letter queue there are many different places you could have a problem.
Hopefully, you'll keep an eye on the logs in your CloudWatch (the part of AWS used for monitoring what all your other AWS services are doing) after starting a run and catch the issue before all of your jobs fail multiple times.

If a single job ends up in your dead-letter queue while the rest of your jobs complete successfully, it is likely that an image is corrupted (a corrupted image is one that has failed to save properly or has been damaged so that it will not open).
This is true whether your pipeline processes a single image at a time (such as in analysis runs where you're interested in cellular measurements on a per-image basis) or whether your pipeline processes many images at a time (such as when making an illumination correction image on a per-plate basis).
This is the major reason why we have the dead-letter queue: you certainly don't want to pay for your cluster to indefinitely attempt to process a corrupted image.
Keeping an eye on your CloudWatch logs wouldn't necessarily help you catch this kind of error because you could have tens or hundreds of successful jobs run before an instance pulls the job for the corrupted image, or the corrupted image could be thousands of images into an illumination correction run, etc.

## SQS_MESSAGE_VISIBILITY

**SQS_MESSAGE_VISIBILITY** controls how long jobs are hidden after being pulled by a machine to run.
Jobs must be visible (i.e. not hidden) in order to be pulled by a Docker and therefore run.
In other words, the time you enter in SQS_MESSAGE_VISIBILITY is how long a job is given a chance to complete before it is unhidden and made available to be started by a different copy of CellProfiler.
It's quite important to set this time correctly - we typically say to estimate 1.5X how long the job usually takes to run (or your best guess of that if you're not sure).
To understand why, and the consequences of setting an incorrect time, let's look more carefully at the SQS queue.

The SQS queue has two categories - "Messages Available" and "Messages In Flight".
Each message is a job and regardless of the category it's in, the jobs all remain in the same queue.
In effect, "In Flight" means currently hiding and "Available" means not currently hiding.

When you submit your Config file to AWS it creates your queue in SQS but that queue starts out empty.
When you submit your Jobs file to AWS it puts all of your jobs into the queue under "Messages Available".
When you submit your Fleet file to AWS it 1) creates machines in EC2, 2) ECS puts Docker containers on those instances, and 3) those instances look in "Messages Available" in SQS for jobs to run.

Once a Docker has pulled a job, that job moves from "Available" to "In Flight".
It remains hidden ("In Flight") for the duration of time set in SQS_MESSAGE_VISIBILITY and then it becomes visible again ("Available").
Jobs are hidden so that multiple machines don't process the same job at the same time.
If the job completes successfully, the Docker tells the queue to delete that message.

If the job completes but it is not successful (e.g. CellProfiler errors), the Docker tells the queue to move the job from "In Flight" to "Available" so another Docker (with a different copy of CellProfiler) can attempt to complete the job.
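The sketch below is not the actual worker code (that lives in worker/cp-worker.py); it is only a hedged illustration of the delete-on-success, release-on-failure pattern described above, with `run_cellprofiler_job` standing in as a hypothetical helper and DistributedCPQueue as an assumed queue name.

```python
import boto3


def run_cellprofiler_job(job_body):
    """Hypothetical stand-in for running CellProfiler on one job; returns True on success."""
    print(f"Pretending to process job: {job_body}")
    return True


sqs = boto3.client('sqs')
queue_url = sqs.get_queue_url(QueueName='DistributedCPQueue')['QueueUrl']  # assumed queue name

response = sqs.receive_message(QueueUrl=queue_url, MaxNumberOfMessages=1)
for message in response.get('Messages', []):
    if run_cellprofiler_job(message['Body']):
        # Success: remove the job from the queue entirely
        sqs.delete_message(QueueUrl=queue_url, ReceiptHandle=message['ReceiptHandle'])
    else:
        # Failure: make the job visible again right away instead of waiting out the timeout
        sqs.change_message_visibility(
            QueueUrl=queue_url, ReceiptHandle=message['ReceiptHandle'], VisibilityTimeout=0
        )
```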
53 | 54 | If the SQS_MESSAGE_VISIBILITY is too short then a job will become unhidden even though it is still currently being (hopefully successfully) run by the Docker that originally picked it up. 55 | This means that another Docker may come along and start the same job and you end up paying for unnecessary compute time because both Dockers will continue to run the job until they each finish. 56 | 57 | If the SQS_MESSAGE_VISIBILITY is too long then you can end up wasting time and money waiting for the job to become available again after a crash even when the rest of your analysis is done. 58 | If anything causes a job to stop mid-run (e.g. CellProfiler crashes, the instance crashes, or the instance is removed by AWS because you are outbid), that job stays hidden until the set time. 59 | If a Docker instance goes to the queue and doesn’t find any visible jobs, then it does not try to run any more jobs in that copy of CellProfiler, limiting the effective computing power of that Docker. 60 | Therefore, some or all of your instances may hang around doing nothing (but costing money) until the job is visible again. 61 | When in doubt, it is better to have your SQS_MESSAGE_VISIBILITY set too long than too short because, while crashes can happen, it is rare that AWS takes small machines from your fleet, though we do notice it happening with larger machines. 62 | 63 | There is not an easy way to see if you have selected the appropriate amount of time for your SQS_MESSAGE_VISIBILITY on your first run through a brand new pipeline. 64 | To confirm that multiple Dockers didn’t run the same job, after the jobs are complete, you need to manually go through each log in CloudWatch and figure out how many times you got the word “SUCCESS” in each log. 65 | (This may be reasonable to do on an illumination correction run where you have a single job per plate, but it’s not so reasonable if running an analysis pipeline on thousands of individual images). 66 | To confirm that multiple Dockers are never processing the same job, you can keep an eye on your queue and make sure that you never have more jobs “In Flight” than the number of copies of CellProfiler that you have running; likewise, if your timeout time is too short, it may seem like too few jobs are “In Flight” even though the CPU usage on all your machines is high. 67 | 68 | Once you have run a pipeline once, you can check the execution time (either by noticing how long after you started your jobs that your first jobs begin to finish, or by checking the logs of individual jobs and noting the start and end time), you will then have an accurate idea of roughly how long that pipeline needs to execute, and can set your message visibility accordingly. 69 | You can even do this on the fly while jobs are currently processing; the updated visibility time won’t affect the jobs already out for processing (i.e. if the time was set to 3 hours and you change it to 1 hour, the jobs already processing will remain hidden for 3 hours or until finished), but any job that begins processing AFTER the change will use the new visibility timeout setting. 70 | 71 | ## JOB_RETRIES 72 | 73 | **JOB_RETRIES** is the number of times that a job will be retried before it is sent to the Dead Letter Queue. 74 | The count goes up every time a message is "In Flight" and after the SQS_MESSAGE_VISIBILITY times out, if the count is too high the message will not be made "Available" but will instead go to your SQS_DEAD_LETTER_QUEUE. 
75 | We recommend setting this larger than 1 because stochastic job failures are possible (e.g. the EC2 machine running the job become unavailable mid-run). 76 | Allowing large numbers of retries tends to waste compute as most failure modes are not stochastic. 77 | 78 | ## Example SQS Queue 79 | 80 | [[images/Sample_SQS_Queue.png|alt="Sample_SQS_Queue"]] 81 | 82 | This is an example of an SQS Queue. 83 | You can see that there is one active task with 64 jobs in it. 84 | In this example, we are running a fleet of 32 instances, each with a single Docker, so at this moment (right after starting the fleet), there are 32 tasks "In Flight" and 32 tasks that are still "Available." 85 | You can also see that many lab members have their own dead-letter queues which are, fortunately, all currently empty. 86 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | 3 | # Learn more at https://jupyterbook.org/customize/config.html 4 | title: DCP Documentation 5 | author: Broad Institute 6 | copyright: "2022" 7 | #logo: img/logo.svg 8 | 9 | # Only build files that are in the ToC 10 | only_build_toc_files: true 11 | 12 | # Force re-execution of notebooks on each build. 13 | # See https://jupyterbook.org/content/execute.html 14 | execute: 15 | execute_notebooks: force 16 | 17 | # Information about where the book exists on the web 18 | repository: 19 | url: https://github.com/distributedscience/distributed-cellprofiler 20 | branch: main # Which branch of the repository should be used when creating links (optional) 21 | path_to_book: DCP-documentation 22 | 23 | html: 24 | baseurl: distributedscience.github.io 25 | use_repository_button: true 26 | use_issues_button: true 27 | use_edit_page_button: true 28 | comments: 29 | hypothesis: true 30 | 31 | parse: 32 | myst_enable_extensions: 33 | # Only required if you use html 34 | - html_image 35 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/_toc.yml: -------------------------------------------------------------------------------- 1 | # Table of contents 2 | 3 | format: jb-book 4 | root: overview 5 | parts: 6 | - caption: FAQ 7 | chapters: 8 | - file: overview_2 9 | - file: costs 10 | - caption: Running DCP 11 | chapters: 12 | - file: step_0_prep 13 | - file: step_1_configuration 14 | sections: 15 | - file: config_examples 16 | - file: SQS_QUEUE_information 17 | - file: step_2_submit_jobs 18 | sections: 19 | - file: passing_files_to_DCP 20 | - file: step_3_start_cluster 21 | sections: 22 | - file: troubleshooting_start_cluster 23 | - file: step_4_monitor 24 | - caption: Technical Guides 25 | chapters: 26 | - file: dashboard 27 | - file: external_buckets 28 | - file: advanced_configuration 29 | - file: AWS_hygiene_scripts 30 | - file: troubleshooting_runs 31 | - file: versions -------------------------------------------------------------------------------- /documentation/DCP-documentation/advanced_configuration.md: -------------------------------------------------------------------------------- 1 | # Advanced Configuration of DCP 2 | 3 | We've tried very hard to make Distributed-CellProfiler light and adaptable, but keeping the configuration settings to a manageable number requires making some default assumptions. 4 | Below is a non-comprehensive list of places where you can adapt the code to your own purposes. 

***

## Changes you can make to Distributed-CellProfiler outside of the Docker container

* **Location of ECS configuration files:** By default these are placed into your bucket with a prefix of 'ecsconfigs/'.
Alternate locations can be designated in the run script.
* **Log configuration and location of exported logs:** Distributed-CellProfiler creates log groups with a default retention of 60 days (to avoid hitting the AWS limit of 250) and after finishing the run exports them into your bucket with a prefix of 'exportedlogs/LOG_GROUP_NAME/'.
These may be modified in the run script.
* **Advanced EC2 configuration:** Any additional configuration of your EC2 spot fleet (such as installing additional packages or running scripts on startup) can be done by modifying the userData parameter in the run script.
* **SQS queue detailed configuration:** Distributed-CellProfiler creates a queue where unprocessed messages will expire after 14 days (the AWS maximum).
This value can be modified in run.py.

***

## Changes that will require you to make your own Docker container

* **CellProfiler version:** We try to keep Distributed-CellProfiler up to date with the latest stable release of CellProfiler, but in case you want to use your own Dockerized version of a different CellProfiler build you can edit the Dockerfile to call that CellProfiler Docker instead.
* **Alarm names or thresholds:** These can be modified in the run-worker script.
* **Frequency or types of information included in the per-instance logs:** These can be adjusted in the instance-monitor script.
* **[CellProfiler command line flags](https://github.com/CellProfiler/CellProfiler/wiki/Adapting-CellProfiler-to-a-LIMS-environment#cmd):** These can be modified in the cp-worker script.
* **Log stream names or logging level:** These can be modified in the cp-worker script.

## Changes to CellProfiler pipeline to use Distributed-CellProfiler with RunCellpose plugin

* **Distributed-CellProfiler version:** At least CellProfiler version 4.2.4, and use the DOCKERHUB_TAG in config.py as `bethcimini/distributed-cellprofiler:2.1.0_4.2.4_plugins`.
* **Custom model:** If using a [custom User-trained model](https://cellpose.readthedocs.io/en/latest/models.html) generated using Cellpose, add the model file to S3.
We use the following structure to organize our files on S3.

```text
└── project_name
    └── workspace
        └── model
            └── custom_model_filename
```

* **RunCellpose module:**
  * Inside RunCellpose, select the "custom" Detection mode.
  In "Location of the pre-trained model file", enter the mounted bucket path to your model.
  e.g. **/home/ubuntu/bucket/projects/project_name/workspace/model/**
  * In "Pre-trained model file name", enter your custom_model_filename
--------------------------------------------------------------------------------
/documentation/DCP-documentation/config_examples.md:
--------------------------------------------------------------------------------
# config.py configuration examples

We have a handful of standard workflows we follow in a stereotyped fashion when running the Cell Painting Assay.
We have listed below the standard way that we configure `config.py` for each workflow.
5 | You can read more information about the pipelines in the context of the Cell Painting Assay [here](https://www.biorxiv.org/content/10.1101/2022.07.13.499171v1.full). 6 | 7 | - **Z-projection** creates a new image with each pixel containing the maximum value from any of the z-planes, effectively condensing the contents of multiple focal planes into one. 8 | Generally, we perform projection on all images with multiple z-planes and downstream processing and analysis is performed on the projected images. 9 | 10 | - **Illumination Correction** is batched by plate and generates a function that corrects for light path irregularities as described [here](https://onlinelibrary.wiley.com/doi/abs/10.1111/jmi.12178). 11 | Note that this pipeline depends upon having a large number of images. 12 | A standard pipeline can be found [here](https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/JUMP_production/JUMP_illum_LoadData_v1.cppipe). 13 | 14 | - **Quality Control** provides metrics on the quality of the input images. 15 | It is not a necessary step but can provide helpful information, particularly for improving wetlab workflows and for comparing across datasets. 16 | A standard pipeline can be found [here](https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/JUMP_production/JUMP_QC_Drag-and-Drop_v1.cppipe). 17 | 18 | - **Assay Dev/Segmentation** is a quick pipeline that outputs segmentation outlines overlaid on a multichannel image rescaled for visual inspection. 19 | We often stitch the output into a pseudo-plate view as described [here](https://currentprotocols.onlinelibrary.wiley.com/doi/10.1002/cpz1.89) to confirm we have chosen segmentation parameters that work across our dataset. 20 | A standard pipeline can be found [here](https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/JUMP_production/JUMP_segment_LoadData_v1.cppipe). 21 | 22 | - **Analysis** is where illumination correction is applied, actual segmentation occurs, and all of the measurements used for generating image-based profiles are taken. 23 | Note that large images may require more memory than our default parameters listed below. 24 | If you don't have enough memory, reduce the number of copies of CellProfiler running at one time by decreasing DOCKER_CORES. 25 | A standard pipeline can be found [here](https://github.com/broadinstitute/imaging-platform-pipelines/blob/master/JUMP_production/JUMP_analysis_v3.cppipe). 26 | 27 | Our internal configurations for each pipeline are as follows: 28 | 29 | | | Z-Projection | Illumination Correction | Quality Control | Assay Dev | Analysis | Notes | 30 | |---|---|---|---|---|---|---| 31 | | APP_NAME | 'PROJECT_NAME_Zproj' |'PROJECT_NAME_Illum' | 'PROJECT_NAME_QC' |' PROJECT_NAME_AssayDev' | 'PROJECT_NAME_Analysis' | If the PROJECT_NAME is excessively long you can enter a truncated version of it here but you will need to be careful to use the correct version in subsequent steps in the protocol. (e.g. 2021_06_08_WCPC_Zproj) | 32 | | LOG_GROUP_NAME | APP_NAME | APP_NAME | APP_NAME | APP_NAME |APP_NAME | We never change this. 
| 33 | | DOCKERHUB_TAG | 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' | 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' | 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' | 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' | 'cellprofiler/distributed-cellprofiler:2.2.0_4.2.8' | Ensure the CP tag number matches the version of CellProfiler for your pipeline (can easily see by opening the pipeline in a text editor and looking for the 3rd line “DateRevision: 413”). | 34 | | AWS_REGION | 'us-east-1' | 'us-east-1' | 'us-east-1' | 'us-east-1' | 'us-east-1' | | 35 | | AWS_PROFILE | 'default' | 'default' | 'default' | 'default' | 'default' | | 36 | | SSH_KEY_NAME | 'YOURPEM.pem' | 'YOURPEM.pem' | 'YOURPEM.pem' | 'YOURPEM.pem' | 'YOURPEM.pem' | | 37 | | AWS_BUCKET | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | Usually a bucket in the account that is running DCP. | 38 | | SOURCE_BUCKET | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | Can be a public bucket like cellpainting-gallery. | 39 | | WORKSPACE_BUCKET | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | If reading images from a public bucket, you might still want to read metadata from your bucket. | 40 | | DESTINATION_BUCKET | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | 'BUCKET' | Usually a bucket in the account that is running DCP. | 41 | | UPLOAD_FLAGS | '' | '' | '' | '' | '' | | 42 | | ECS_CLUSTER | 'default' | 'default' | 'default' | 'default' | 'default' | Most of the time we all just use the default cluster but if there are multiple jobs being run at once you can create your own cluster by changing default to YOURNAME so that the correct dockers go on the correct machines. | 43 | | CLUSTER_MACHINES | 100-200 | number of plates / 4 and rounded up | 25-100 | 25-100 | 100-200 | AWS has limits on the number of machines you can request at a time. 200 is generally the largest we request for a single job to ensure there is some capacity for other users in the team. For Illum, use number of plates divided by number of CPUs - we assume 4 vCPUs (as on 'c5.xlarge' machines).| 44 | | TASKS_PER_MACHINE | 1 | 1 | 1 | 1 | 1 | | 45 | | MACHINE_TYPE | ['c5.xlarge'] | ['c5.xlarge'] | ['c5.xlarge'] | ['c5.xlarge'] | ['c5.xlarge'] | Historically we have used m4.xlarge and then m5.xlarge however very recently we have been having a hard time getting m class machines so we have switched to c class. Note that they have different memory sizes so you need to make sure MEMORY is set correctly if changing between classes. | 46 | | MACHINE_PRICE | .20 | .20 | .20 | .20 | .20 | Will be different for different size/classes of machines. | 47 | | EBS_VOL_SIZE (if using S3 mounted as a file system) | 22 | 22 | 22 | 22 | 22 | Files are read directly off of S3, mounted as a file system when `DOWNLOAD_FILES = False`. | 48 | | EBS_VOL_SIZE (if downloading files) | 22 | 200 | 22 | 22 | 40 | Files are downloaded to the EBS volume when `DOWNLOAD_FILES = True`. | 49 | | DOWNLOAD_FILES | 'False' | 'False' | 'False' | 'False' | 'False' | | 50 | | ASSIGN_IP | 'True' | 'True' | 'True' | 'True' | 'True' | | 51 | | DOCKER_CORES | 4 | 4 | 4 | 4 | 3 | If using c class machines and large images (2k + pixels) then you might need to reduce this number. | 52 | | CPU_SHARES | DOCKER_CORES * 1024 | DOCKER_CORES * 1024 | DOCKER_CORES * 1024 | DOCKER_CORES * 1024 | DOCKER_CORES * 1024 | We never change this. | 53 | | MEMORY | 7500 | 7500 | 7500 | 7500 | 7500 | This must match your machine type. m class use 15000, c class use 7500. 
| 54 | | SECONDS_TO_START | 60 | 3*60 | 60 | 3*60 | 3*60 | | 55 | | SQS_QUEUE_NAME | APP_NAME + 'Queue' | APP_NAME + 'Queue' | APP_NAME + 'Queue' | APP_NAME + 'Queue' | APP_NAME + 'Queue' | We never change this. | 56 | | SQS_MESSAGE_VISIBILITY | 3*60 | 240*60 | 15*60 | 10*60 | 120*60 | About how long you expect a job to take * 1.5 in seconds | 57 | | SQS_DEAD_LETTER_QUEUE | 'YOURNAME_DEADMESSAGES' | 'YOURNAME_DEADMESSAGES' | 'YOURNAME_DEADMESSAGES' | 'YOURNAME_DEADMESSAGES' |'YOURNAME_DEADMESSAGES' | | 58 | | JOB_RETRIES | 3 | 3 | 3 | 3 | 3 | | 59 | | AUTO_MONITOR | 'True' | 'True' | 'True' | 'True' | 'True' | Can be turned off if manually running Monitor. | 60 | | CREATE_DASHBOARD | 'True' | 'True' | 'True' | 'True' | 'True' | | 61 | | CLEAN_DASHBOARD | 'True' | 'True' | 'True' | 'True' | 'True' | | 62 | | CHECK_IF_DONE_BOOL | 'False' | 'True' | 'True' | 'True' | 'True' | Can be turned off if wanting to overwrite old data. | 63 | | EXPECTED_NUMBER_FILES | 1 (can be anything, False above) | number channels + 1 (an .npy for each channel and isdone) | 3 (Experiment.csv, Image.csv, and isdone) | 1 (an image) | 5 (Experiment, Image, Cells, Nuclei, and Cytoplasm .csvs) | Better to underestimate than overestimate. | 64 | | MIN_FILE_SIZE_BYTES | 1 | 1 | 1 | 1 | 1 | Count files of any size. | 65 | | NECESSARY_STRING | '' | '' | '' | '' | '' | Not necessary for standard workflows. | 66 | | ALWAYS_CONTINUE | 'False' | 'False' | 'False' | 'False' | 'False' | Use with caution. | 67 | | USE_PLUGINS | 'False' | 'False' | 'False' | 'False' | 'False' | Not necessary for standard workflows. | 68 | | UPDATE_PLUGINS | 'False' | 'False' | 'False' | 'False' | 'False' | Not necessary for standard workflows. | 69 | | PLUGINS_COMMIT | '' | '' | '' | '' | '' | Not necessary for standard workflows. | 70 | | INSTALL_REQUIREMENTS | 'False' | 'False' | 'False' | 'False' | 'False' | Not necessary for standard workflows. | 71 | | REQUIREMENTS_FILE | '' | '' | '' | '' | '' | Not necessary for standard workflows. | 72 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/costs.md: -------------------------------------------------------------------------------- 1 | # What does Distributed-CellProfiler cost? 2 | 3 | Distributed-CellProfiler is run by a series of three commands, only one of which incurs costs at typical scale of usage: 4 | 5 | [`setup`](step_1_configuration.md) creates a queue in SQS and a cluster, service, and task definition in ECS. 6 | ECS is entirely free. 7 | SQS queues are free to create and use up to 1 million requests/month. 8 | 9 | [`submitJobs`](step_2_submit_jobs.md) places messages in the SQS queue which is free (under 1 million requests/month). 10 | 11 | [`startCluster`](step_3_start_cluster.md) is the only command that incurs costs with initiation of your spot fleet request, creating machine alarms, and optionally creating a run dashboard. 12 | 13 | The spot fleet is the major cost of running Distributed-CellProfiler, exact pricing of which depends on the number of machines, type of machines, and duration of use. 14 | Your bid is configured in the [config file](step_1_configuration.md). 15 | Simple spot fleet configurations can be minimized by: 16 | 17 | 1) Optimize `MACHINE_TYPE` and `EBS_VOL_SIZE` based on the actual memory and harddrive needs of your run. 18 | 2) When possible, mount your S3 bucket using S3FS so that you can set `DOWNLOAD_FILES = 'False'` to not incur file egress costs. 
19 | See [Step 1 Configuration](step_1_configuration.md) for more information. 20 | Data egress charges range for various reasons including traversing AWS regions and/or AWS availability zones but are [often $0.08–$0.12 per GB](https://aws.amazon.com/blogs/apn/aws-data-transfer-charges-for-server-and-serverless-architectures/). 21 | 3) Set `ASSIGN_IP = 'False'` so that you don't pay for IPv4 addresses per EC2 instance in your spot fleet. 22 | Public IPv4 costs are minimal ([$0.005/IP/hour as of February 1, 2024](https://aws.amazon.com/blogs/aws/new-aws-public-ipv4-address-charge-public-ip-insights/)) but there is no need to incur even this minimal cost unless you have a specific need for it. 23 | See [Step 1 Configuration](step_1_configuration.md) for more information. 24 | 25 | Spot fleet costs can be minimized/stopped in multiple ways: 26 | 27 | 1) We encourage the use of [`monitor`](step_4_monitor.md) during your job to help minimize the spot fleet cost as it automatically scales down your spot fleet request as your job queue empties and cancels your spot fleet request when you have no more jobs in the queue. 28 | Note that you can also perform a more aggressive downscaling of your fleet by monitor by engaging Cheapest mode (see [`more information here`](step_4_monitor.md)). 29 | 2) If your job is finished, you can still initiate [`monitor`](step_4_monitor.md) to perform the same cleanup (without the automatic scaling). 30 | 3) If you want to abort and clean up a run, you can purge your SQS queue in the [AWS SQS console](https://console.aws.amazon.com/sqs/) (by selecting your queue and pressing Actions => Purge) and then initiate [`monitor`](step_4_monitor.md) to perform the same cleanup. 31 | 4) You can stop the spot fleet request directly in the [AWS EC2 console](https://console.aws.amazon.com/ec2/) by going to Instances => Spot Requests, selecting your spot request, and pressing Actions => Cancel Spot Request. 32 | 33 | After the spot fleet has started, a Cloudwatch instance alarm is automatically placed on each instance in the fleet. 34 | Cloudwatch instance alarms [are currently $0.10/alarm/month](https://aws.amazon.com/cloudwatch/pricing/). 35 | Cloudwatch instance alarm costs can be minimized/stopped in multiple ways: 36 | 37 | 1) If you run monitor during your job, it will automatically delete Cloudwatch alarms for any instance that is no longer in use once an hour while running and at the end of a run. 38 | 2) If your job is finished, you can still initiate [`monitor`](step_4_monitor.md) to delete Cloudwatch alarms for any instance that is no longer in use. 39 | 3) In [AWS Cloudwatch console](https://console.aws.amazon.com/cloudwatch/) you can select unused alarms by going to Alarms => All alarms. Change Any State to Insufficient Data, select all alarms, and then Actions => Delete. 40 | 4) We provide a [hygiene script](hygiene.md) that will clean up old alarms for you. 41 | 42 | Cloudwatch Dashboards [are currently free](https://aws.amazon.com/cloudwatch/pricing/) for 3 Dashboards with up to 50 metrics per month and are $3 per dashboard per month after that. 43 | Cloudwatch Dashboard costs can be minimized/prevented in multiple ways: 44 | 45 | 1) You can choose not to have Distributed-CellProfiler create a Dashboard by setting `CREATE_DASHBOARD = 'False'` in your [config file](step_1_configuration.md). 
2) We encourage the use of [`monitor`](step_4_monitor.md) during your job; if you have set `CLEAN_DASHBOARD = 'True'` in your [config file](step_1_configuration.md), it will automatically delete your Dashboard when your job is done.
3) If your job is finished, you can still initiate [`monitor`](step_4_monitor.md) to perform the same cleanup (without the automatic scaling).
4) You can manually delete Dashboards in the [Cloudwatch Console](https://console.aws.amazon.com/cloudwatch/) by going to Dashboards, selecting your Dashboard, and selecting Delete.
--------------------------------------------------------------------------------
/documentation/DCP-documentation/dashboard.md:
--------------------------------------------------------------------------------
# AWS Cloudwatch Dashboard
![Cloudwatch Dashboard Overview](images/dashboard_overview.png)

AWS Cloudwatch Dashboards are "customizable home pages in the CloudWatch console that you can use to monitor your resources in a single view and create customized views of the metrics and alarms for your AWS resources."
A Dashboard is full of widgets, each of which you create and customize to report on a separate AWS metric.
Distributed-CellProfiler has the option to auto-create a Cloudwatch Dashboard for each run and the option to clean it up when you are done.
These options are set in [your config file](step_1_configuration.md).

The Dashboard setup that Distributed-CellProfiler auto-populates is helpful for monitoring a run as it is occurring or for a post-mortem to better understand a previous run.
Some things you can see include: whether your machines are sized appropriately for your jobs, how stable your spot fleet is, and whether your jobs are failing and, if so, whether they're failing in a consistent manner.
All told, this can help you understand and optimize your resource usage, thus saving you time and money.

## FulfilledCapacity
![Fulfilled Capacity widget](images/fulfilledcapacity.png)

This widget shows the number of machines in your spot fleet that are fulfilled, i.e. how many machines you actually have at any given point.
After a short spin-up time after initiating a run, you hope to see a straight line at the number of machines requested in your fleet and then a steady decrease at the end of a run as monitor scales your fleet down to match the remaining jobs.

Some number of small dips are all but inevitable as machines crash and are replaced or AWS takes some of your capacity and gives it to a higher bidder.
However, every time there is a dip, it means that a machine that was running a job is no longer running it and any progress on that job is lost.
The job will hang out as "Not Visible" in your SQS queue until it reaches the amount of time set by SQS_MESSAGE_VISIBILITY in [your config file](step_1_configuration.md).
For quick jobs, this doesn't have much of an impact, but for jobs that take many hours, this can be frustrating and potentially expensive.

If you're seeing lots of dips or very large dips, you may be able to prevent this in future runs by 1) requesting a different machine type, 2) bidding a larger amount for your machines, or 3) changing regions.
You can also check if blips coincide with AWS outages, in which case there's nothing you can do, it's just bad luck (that's what happened with the large dip in the example above).
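If you want the same fulfilled-versus-requested numbers without opening the console, a minimal boto3 sketch such as the following can report them; the spot fleet request id shown is a placeholder, and in practice you would take it from the APP_NAMESpotFleetRequestId.json file saved when you started the cluster.

```python
import boto3

ec2 = boto3.client('ec2')
fleet_id = 'sfr-00000000-0000-0000-0000-000000000000'  # placeholder id; use your own
config = ec2.describe_spot_fleet_requests(SpotFleetRequestIds=[fleet_id])[
    'SpotFleetRequestConfigs'][0]['SpotFleetRequestConfig']
print(f"Fulfilled {config['FulfilledCapacity']} of {config['TargetCapacity']} requested machines")
```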
26 | 27 | ## NumberOfMessagesReceived/Deleted 28 | 29 | ![NumberofMessagesReceived/Deleted](images/messages_deleted_received.png) 30 | 31 | This widget shows you in bulk whether your jobs are completing or erroring. 32 | NumberOfMessagesDeleted shows messages deleted from the queue after the job has successfully completed. 33 | NumberOfMessagesReceived shows both messages that are deleted from the queue as well as messages that are put back in the queue because they errored. 34 | You hope to see that the two lines track on top of each other because that means no messages are erroring. 35 | If there are often gaps between the lines then it means a fraction of your jobs are erroring and you’ll need to figure out why (see MemoryUtilization and Show Errors or look directly in your Cloudwatch Logs for insights). 36 | 37 | ## MemoryUtilization 38 | 39 | ![Memory Utilization](images/memoryutilization.png) 40 | 41 | Insufficient memory is the error that we most often encounter (as we try to use the smallest machines possible for economy’s sake) so we like to look at memory usage. 42 | Note that this is showing memory utilization in bulk for your cluster, not for individual machines. 43 | Because different machines reach memory intensive steps at different points in time, and because we’re looking at an average across 5 minute windows, the max percentage you see is likely to be much less than 100%, even if you are using all the memory in your machines at some points. 44 | 45 | # MessagesVisible/NotVisible 46 | 47 | ![MessagesVisible/NotVisible](images/messages_change_slope.png) 48 | 49 | Visible messages are messages waiting in your queue. 50 | Hidden messages (aka MessagesNotVisible) have been started and will remain hidden until either they are completed and therefore removed from the queue or they reach the time set in SQS_MESSAGE_VISIBILITY in your config file, whichever comes first. 51 | ([Read more about Message Visibility](SQS_QUEUE_information.md).) 52 | After starting your fleet (and waiting long enough for at least one round of jobs to complete), you hope to see a linear decline in total messages with the number of hidden messages equal to the number of jobs being run (fleet size * tasks per machine * docker cores). 53 | 54 | ![Blip in MessagesVisible/NotVisible](images/blip_in_messagesnotvisible.png) 55 | 56 | Sometimes you’ll see a blip where there is a rapid increase in the number of hidden messages (as pictured above). 57 | This can happen if there is an error on a machine and the hard disk gets full - it rapidly pulls jobs and puts them back until the machine error is caught and rebooted. 58 | This type of error shows in this widget as it happens. 59 | 60 | If your spot fleet loses capacity (see FulfilledCapacity), you may see a blip in MessagesVisible/NotVisible where the number of hidden messages rapidly decreases. 61 | This appears in the widget the amount of time set in SQS_MESSAGE_VISIBILITY in your config file after the capacity loss when jobs that were started (i.e. hidden) but not completed return to visible status. 62 | 63 | The relative slope of your graph can also be informative. 64 | For the run pictured at top, we discovered that a fraction of our jobs were erroring because the machines were running out of memory. 65 | Midway through 7/12 we upped the memory of the machines in our fleet and you can see from that point on a greater slope as more jobs were finishing in the same amount of time (as fewer were failing to complete because of memory errors.) 
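If you want to pull the same queue metrics outside the Dashboard, a hedged sketch like the one below retrieves them from CloudWatch with boto3; it assumes the default DistributedCPQueue queue name, so substitute your own APP_NAME-derived queue.

```python
import boto3
from datetime import datetime, timedelta

cloudwatch = boto3.client('cloudwatch')
for metric in ['ApproximateNumberOfMessagesVisible', 'ApproximateNumberOfMessagesNotVisible']:
    stats = cloudwatch.get_metric_statistics(
        Namespace='AWS/SQS',
        MetricName=metric,
        Dimensions=[{'Name': 'QueueName', 'Value': 'DistributedCPQueue'}],  # assumed queue name
        StartTime=datetime.utcnow() - timedelta(hours=6),
        EndTime=datetime.utcnow(),
        Period=300,  # 5-minute windows, matching the dashboard view
        Statistics=['Average'],
    )
    datapoints = sorted(stats['Datapoints'], key=lambda point: point['Timestamp'])
    latest = datapoints[-1]['Average'] if datapoints else 'no data'
    print(f"{metric}: {latest}")
```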
66 | 67 | ## Distinct Logs 68 | 69 | ![Logs comparison](images/logs_comparison.png) 70 | 71 | This widget shows you the number of different specific jobs that start within your given time window by plotting the number of Cloudwatch logs that have your run command in them. 72 | In this example, our run command is "cellprofiler -c". 73 | It is not necessarily informative on its own, but very helpful when compared with the following widget. 74 | 75 | ## All logs 76 | This widget shows you the number of total times that jobs are started within your log group within the given time window. 77 | Ideally, you want this number to match the number in the previous widget as it means that each job is starting in your software only once. 78 | 79 | If this number is consistently larger than the previous widget’s number, it could mean that some of your jobs are erroring and you’ll need to figure out why (see MemoryUtilization and Show Errors or look directly in your Cloudwatch Logs for insights). 80 | 81 | ## Show Errors 82 | ![Show errors](images/expand_error_log.png) 83 | 84 | This widget shows you the log entry any time that it contains “Error”. 85 | Ideally, this widget will remain empty. 86 | If it is logging errors, you can toggle each row for more information - it will show the job that errored in @logStream and the actual error message in @message. 87 | 88 | ## Interacting with a Dashboard: 89 | 90 | Once you have your Dashboard created and full of widgets, you can adjust the timescale for which the widget is reporting metrics. 91 | For any of the widgets you can set the absolute or relative time that the widget is showing by selecting the time scale from the upper right corner of the screen. 92 | Zoom in to a particular time selection on a visible widget by drawing a box around that time on the widget itself (note that zooming in doesn’t change what’s plotted, just what part of the plot you can see so metrics like Show Errors won’t update with a zoom). 93 | 94 | Some widgets allow you to select/deselect certain metrics plotted in the widget. 95 | To hide a metric without permanently removing it from the widget, simply click the X on the box next to the name of the metric in the legend. 96 | 97 | You can move the widgets around on your dashboard by hovering on the upper right or upper left corner of a widget until a 4-direction-arrow icon appears and then dragging and dropping the widget. 98 | You can change the size of a widget by hovering on the lower right corner of the widget until a diagonal arrow icon appears and then dragging the widget to the desired size. 99 | After making changes, make sure to select Save dashboard from the top menu so that they are maintained after refreshing the page. 100 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/external_buckets.md: -------------------------------------------------------------------------------- 1 | # Using External Buckets 2 | 3 | Distributed-CellProfiler can read and/or write to/from an external S3 bucket (i.e. a bucket not in the same account as you are running DCP). 4 | To do so, you will need to appropriately set your configuration in run.py. 5 | You may need additional configuration in AWS Identity and Access Management (IAM). 6 | 7 | ## Config setup 8 | 9 | * **AWS_PROFILE:** The profile you use must have appropriate permissions for running DCP as well as read/write permissions for the external bucket. 10 | See below for more information. 
11 | 12 | * **AWS_BUCKET:** The bucket to which you would like to write log files. 13 | This is generally the bucket in the account in which you are running compute. 14 | * **SOURCE_BUCKET:** The bucket where the files you will be reading are. 15 | Often, this is the same as AWS_BUCKET. 16 | * **DESTINATION_BUCKET:** The bucket where you want to write your output files. 17 | Often, this is the same as AWS_BUCKET. 18 | * **UPLOAD_FLAGS:** If you need to add flags to an AWS CLI command to upload files to your DESTINATION_BUCKET, this is where you enter them. 19 | This is typically only used if you are writing to a bucket that is not yours. 20 | If you don't need to add UPLOAD_FLAGS, keep it as the default ''. 21 | 22 | ## Example configs 23 | 24 | ### Reading from the Cell Painting Gallery 25 | 26 | ```python 27 | AWS_REGION = 'your-region' # e.g. 'us-east-1' 28 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 29 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 30 | AWS_BUCKET = 'bucket-name' # Your bucket 31 | SOURCE_BUCKET = 'cellpainting-gallery' 32 | WORKSPACE_BUCKET = 'bucket-name' # Likely your bucket 33 | DESTINATION_BUCKET = 'bucket-name' # Your bucket 34 | UPLOAD_FLAGS = '' 35 | ``` 36 | 37 | ### Read/Write to a collaborator's bucket 38 | 39 | ```python 40 | AWS_REGION = 'your-region' # e.g. 'us-east-1' 41 | AWS_PROFILE = 'role-permissions' # A profile with the permissions setup described above 42 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 43 | AWS_BUCKET = 'bucket-name' # Your bucket 44 | SOURCE_BUCKET = 'collaborator-bucket' 45 | WORKSPACE_BUCKET = 'collaborator-bucket' 46 | DESTINATION_BUCKET = 'collaborator-bucket' 47 | UPLOAD_FLAGS = '--acl bucket-owner-full-control --metadata-directive REPLACE' # Examples of flags that may be necessary 48 | ``` 49 | 50 | ## Permissions setup 51 | 52 | If you are reading from a public bucket, no additional setup is necessary. 53 | Note that, depending on the configuration of that bucket, you may not be able to mount the public bucket, so you will need to set `DOWNLOAD_FILES='True'`. 54 | 55 | If you are reading from a non-public bucket or writing to a bucket that is not yours, you will need further permissions setup. 56 | Often, access to someone else's AWS account is handled through a role that can be assumed. 57 | Learn more about AWS IAM roles [here](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html). 58 | Your collaborator will define the access limits of the role within their AWS IAM. 59 | You will also need to define role limits within your AWS IAM so that when you assume the role (giving you access to your collaborator's resource), that role also has the appropriate permissions to run DCP. 60 | 61 | ### In your AWS account 62 | 63 | In AWS IAM, for the role that has external bucket access, you will need to add all of the DCP permissions described in [Step 0](step_0_prep.md). 64 | 65 | You will also need to edit the trust relationship for the role so that ECS and EC2 can assume the role.
66 | A template is as follows: 67 | 68 | ```json 69 | { 70 | "Version": "2012-10-17", 71 | "Statement": [ 72 | { 73 | "Effect": "Allow", 74 | "Principal": { 75 | "AWS": [ 76 | "arn:aws:iam::123456789123:user/image_analyst", 77 | "arn:aws:iam::123456789123:user/image_expert" 78 | ], 79 | "Service": [ 80 | "ecs-tasks.amazonaws.com", 81 | "ec2.amazonaws.com" 82 | ] 83 | }, 84 | "Action": "sts:AssumeRole" 85 | } 86 | ] 87 | } 88 | 89 | ``` 90 | 91 | ### In your DCP instance 92 | 93 | DCP reads your AWS_PROFILE from your [control node](step_0_prep.md#the-control-node). 94 | Edit your AWS CLI configuration files for assuming that role in your control node as follows: 95 | 96 | In `~/.aws/config`, copy in the following text block at the bottom of the file, editing to your specifications, and save. 97 | 98 | [profile access-collaborator] 99 | role_arn = arn:aws:iam::123456789123:role/access-to-other-bucket 100 | source_profile = my-account-profile 101 | region = us-east-1 102 | output = json 103 | 104 | In `~/.aws/credentials`, copy in the following text block at the bottom of the file (filling in your access key info) and save. 105 | 106 | [my-account-profile] 107 | aws_access_key_id = ACCESS_KEY 108 | aws_secret_access_key = SECRET_ACCESS_KEY 109 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/AMIID.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/AMIID.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/DCP-chronological_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/DCP-chronological_schematic.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/ECS.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/ECS.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/InstanceID.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/InstanceID.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/Launch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/Launch.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/LoadDataCSV.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/LoadDataCSV.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/Network.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/Network.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/Snapshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/Snapshot.jpg -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/adjust_dashboard_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/adjust_dashboard_view.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/blip_in_messagesnotvisible.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/blip_in_messagesnotvisible.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/dashboard_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/dashboard_overview.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/expand_error_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/expand_error_log.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/fulfilledcapacity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/fulfilledcapacity.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/logs_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/logs_comparison.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/memoryutilization.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/memoryutilization.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/messages_change_slope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/messages_change_slope.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/messages_deleted_received.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/messages_deleted_received.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/sample_DCP_config_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/sample_DCP_config_1.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/sample_DCP_config_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/sample_DCP_config_2.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/images/zoom_into_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/documentation/DCP-documentation/images/zoom_into_dashboard.png -------------------------------------------------------------------------------- /documentation/DCP-documentation/overview.md: -------------------------------------------------------------------------------- 1 | # What is Distributed-CellProfiler? 2 | 3 | **How do I run CellProfiler on Amazon?** Use Distributed-CellProfiler! 4 | 5 | Distributed-CellProfiler is a series of scripts designed to help you run a Dockerized version of CellProfiler on [Amazon Web Services](https://aws.amazon.com/) (AWS) using AWS's file storage and computing systems. 6 | 7 | * Data is stored in S3 buckets. 8 | * Software is run on "Spot Fleets" of computers (or instances) in the cloud. 9 | 10 | ## What is Docker? 11 | 12 | Docker is a software platform that packages software into containers. 13 | In a container is the software that you want to run as well as everything needed to run it (e.g. your software source code, operating system libraries, and dependencies). 14 | 15 | Dockerizing a workflow has many benefits including 16 | 17 | * Ease of use: Dockerized software doesn't require the user to install anything themselves. 
18 | * Reproducibility: You don't need to worry about results being affected by the version of your software or its dependencies being used as those are fixed. 19 | 20 | ## Why would I want to use this? 21 | 22 | Using AWS allows you to create a flexible, on-demand computing infrastructure where you only have to pay for the resources you use. 23 | This can give you access to far more computing power than you may have available at your home institution, which is great when you have large datasets to process. 24 | 25 | Each piece of the infrastructure has to be added and configured separately, which can be time-consuming and confusing. 26 | 27 | Distributed-CellProfiler tries to leverage the power of the former, while minimizing the problems of the latter. 28 | 29 | ## What do I need to have to run this? 30 | 31 | Essentially all you need to run Distributed-CellProfiler is an AWS account and a terminal program; see our [page on getting set up](step_0_prep.md) for all the specific steps you'll need to take. 32 | 33 | ## Can I contribute code to Distributed-CellProfiler? 34 | 35 | Feel free! We're always looking for ways to improve. 36 | 37 | ## Who made this? 38 | 39 | Distributed-CellProfiler is a project from the [Cimini Lab](https://cimini-lab.broadinstitute.org) in the Imaging Platform at the Broad Institute in Cambridge, MA, USA. It was initially developed in what is now the [Carpenter-Singh Lab](https://carpenter-singh-lab.broadinstitute.org). 40 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/overview_2.md: -------------------------------------------------------------------------------- 1 | # What happens in AWS when I run Distributed-CellProfiler? 2 | 3 | The steps for actually running the Distributed-CellProfiler code are outlined in the repository [README](https://github.com/DistributedScience/Distributed-CellProfiler/blob/master/README.md), and details of the parameters you set in each step are on their respective Documentation pages ([Step 1: Config](step_1_configuration.md), [Step 2: Jobs](step_2_submit_jobs.md), [Step 3: Fleet](step_3_start_cluster.md), and optional [Step 4: Monitor](step_4_monitor.md)). 4 | We'll give an overview of what happens in AWS at each step here and explain what AWS does automatically once you have it set up. 5 | 6 | ![Distributed-CellProfiler Chronological Overview](images/DCP_chronological_schematic.png) 7 | 8 | **Step 1**: 9 | In the Config file you set quite a number of specifics that are used by EC2, ECS, SQS, and in making Dockers. 10 | When you run `$ python3 run.py setup` to execute the Config, it does three major things: 11 | 12 | * Creates task definitions. 13 | These are found in ECS. 14 | They define the configuration of the Dockers and include the settings you gave for **CHECK_IF_DONE_BOOL**, **DOCKER_CORES**, **EXPECTED_NUMBER_FILES**, and **MEMORY**. 15 | * Makes a queue in SQS (it is empty at this point) and sets a dead-letter queue. 16 | * Makes a service in ECS which defines how many Dockers you want. 17 | 18 | **Step 2**: 19 | In the Job file you set the location of any inputs (e.g. data and batch-specific scripts) and outputs. 20 | Additionally, you list all of the individual tasks that you want run. 21 | When you submit the Job file it adds that list of tasks to the queue in SQS (which you made in the previous step). 22 | Submit jobs with `$ python3 run.py submitJob`. 
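For orientation, the Job file itself is a small JSON document. The sketch below builds one in Python with a couple of illustrative groups; the paths and metadata values are hypothetical, and the full list of fields is described in [Step 2: Jobs](step_2_submit_jobs.md).

```python
import json

# Illustrative only - field names follow the Step 2 documentation, paths are made up.
job = {
    "pipeline": "projects/demo/workspace/analysis.cppipe",
    "data_file": "projects/demo/workspace/load_data.csv",
    "output": "projects/demo/outputs",
    "groups": [
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01"},
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A02"},
    ],
}

with open("files/my_job.json", "w") as f:
    json.dump(job, f, indent=4)

# Each entry in "groups" becomes one task in the SQS queue when you run
# `python3 run.py submitJob files/my_job.json`.
```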
23 | 24 | **Step 3**: 25 | In the Config file you set the number and size of the EC2 instances you want. 26 | This information, along with account-specific configuration in the Fleet file is used to start the fleet with `$ python3 run.py startCluster`. 27 | 28 | **After these steps are complete, a number of things happen automatically**: 29 | 30 | * ECS puts Docker containers onto EC2 instances. 31 | If there is a mismatch within your Config file and the Docker is larger than the instance it will not be placed. 32 | ECS will keep placing Dockers onto an instance until it is full, so if you accidentally create instances that are too large you may end up with more Dockers placed on it than intended. 33 | This is also why you may want multiple **ECS_CLUSTER**s so that ECS doesn't blindly place Dockers you intended for one job onto an instance you intended for another job. 34 | * When a Docker container gets placed it gives the instance it's on its own name. 35 | * Once an instance has a name, the Docker gives it an alarm that tells it to reboot if it is sitting idle for 15 minutes. 36 | * The Docker hooks the instance up to the _perinstance logs in CloudWatch. 37 | * The instances look in SQS for a job. 38 | Any time they don't have a job they go back to SQS. 39 | If SQS tells them there are no visible jobs then they shut themselves down. 40 | * When an instance finishes a job it sends a message to SQS and removes that job from the queue. 41 | 42 | ## What does an instance configuration look like? 43 | 44 | ![Example Instance Configuration](images/sample_DCP_config_1.png) 45 | 46 | This is an example of one possible instance configuration. 47 | This is one m4.16xlarge EC2 instance (64 CPUs, 250GB of RAM) with a 165 EBS volume mounted on it. 48 | A spot fleet could contain many such instances. 49 | 50 | It has 16 tasks (individual Docker containers). 51 | 52 | Each Docker container uses 10GB of hard disk space and is assigned 4 CPUs and 15 GB of RAM (which it does not share with other Docker containers). 53 | 54 | Each container shares its individual resources among 4 copies of CellProfiler. 55 | Each copy of CellProfiler runs a pipeline on one "job", which can be anything from a single image to an entire 384 well plate or timelapse movie. 56 | 57 | You can optionally stagger the start time of these 4 copies of CellProfiler, ensuring that the most memory- or disk-intensive steps aren't happening simultaneously, decreasing the likelihood of a crash. 58 | 59 | Read more about this and other configurations in [Step 1: Configuration](step_1_configuration.md). 60 | 61 | ## How do I determine my configuration? 62 | 63 | To some degree, you determine the best configuration for your needs through trial and error. 64 | 65 | * Looking at the resources your software uses on your local computer when it runs your jobs can give you a sense of roughly how much hard drive and memory space each job requires, which can help you determine your group size and what machines to use. 66 | * Prices of different machine sizes fluctuate, so the choice of which type of machines to use in your spot fleet is best determined at the time you run it. 67 | How long a job takes to run and how quickly you need the data may also affect how much you're willing to bid for any given machine. 68 | * Running a few large Docker containers (as opposed to many small ones) increases the amount of memory all the copies of your software are sharing, decreasing the likelihood you'll run out of memory if you stagger your job start times. 
69 | However, you're also at a greater risk of running out of hard disk space. 70 | 71 | Keep an eye on all of the logs the first few times you run any workflow and you'll get a sense of whether your resources are being utilized well or if you need to do more tweaking. 72 | 73 | ## What does this look like on AWS? 74 | 75 | The following five are the primary resources that Distributed-CellProfiler interacts with. 76 | After you have finished [preparing for Distributed-CellProfiler](step_0_prep), you do not need to directly interact with any of these services outside of Distributed-CellProfiler. 77 | If you would like a granular view of what Distributed-CellProfiler is doing while it runs, you can open each console in a separate tab in your browser and watch their individual behaviors, though this is not necessary, especially if you run the [monitor command](step_4_monitor.md) and/or have DS automatically create a Dashboard for you (see [Configuration](step_1_configuration.md)). 78 | 79 | * [S3 Console](https://console.aws.amazon.com/s3) 80 | * [EC2 Console](https://console.aws.amazon.com/ec2/) 81 | * [ECS Console](https://console.aws.amazon.com/ecs/) 82 | * [SQS Console](https://console.aws.amazon.com/sqs/) 83 | * [CloudWatch Console](https://console.aws.amazon.com/cloudwatch/) 84 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/passing_files_to_DCP.md: -------------------------------------------------------------------------------- 1 | # Passing Files to DCP 2 | 3 | Distributed-CellProfiler can be told what files to use through LoadData.csv, Batch Files, or file lists. 4 | 5 | ## Metadata use in DCP 6 | 7 | Distributed-CellProfiler requires metadata and grouping in order to split jobs. 8 | This means that, unlike a generic CellProfiler workflow, the inclusion of metadata and grouping is NOT optional for pipelines you wish to use in Distributed-CellProfiler. 9 | 10 | - If using LoadData, this means ensuring that your input CSV has some metadata to use for grouping and "Group images by metadata?" is set to "Yes". 11 | - If using batch files or file lists, this means ensuring that the Metadata and Groups modules are enabled, and that you are extracting metadata from file and folder names _that will also be present in your remote system_ in the Metadata module in your CellProfiler pipeline. 12 | You can pass additional metadata to CellProfiler by selecting `Add another extraction method`, setting the method to `Import from file`, and setting Metadata file location to `Default Input Folder`. 13 | Metadata of either type can be used for grouping. 14 | 15 | ## Load Data 16 | 17 | ![LoadData.csv](images/LoadDataCSV.png) 18 | 19 | LoadData.csv files are CSVs that tell CellProfiler how the images should be parsed. 20 | At a minimum, this CSV should contain PathName_{NameOfChannel} and FileName_{NameOfChannel} columns for each of your channels, as well as Metadata_{PieceOfMetadata} for each kind of metadata being used to group your image sets. 21 | It can contain any other metadata you would like to track. 22 | Some users have reported issues with using relative paths in the PathName columns; using absolute paths beginning with `/home/ubuntu/bucket/{relativepath}` may increase your odds of success. 23 | 24 | ### Creating LoadData.csv 25 | 26 | You can create this CSV yourself via your favorite scripting language; a minimal sketch is shown below.
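For example, here is a minimal Python sketch that writes such a CSV; the plate, well, site, and channel layout is hypothetical, and the image paths assume the standard `/home/ubuntu/bucket` mount point mentioned above.

```python
import csv

# Hypothetical experiment layout - replace with your own plates, wells, sites, and channels.
plate = "Plate1"
wells = ["A01", "A02"]
sites = [1, 2]
channels = ["DNA", "GFP"]

header = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"]
for channel in channels:
    header += [f"FileName_{channel}", f"PathName_{channel}"]

with open("load_data.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for well in wells:
        for site in sites:
            row = [plate, well, site]
            for channel in channels:
                # Paths must be the ones the workers will see, i.e. under the
                # /home/ubuntu/bucket mount point (folder layout here is hypothetical).
                row += [
                    f"{plate}_{well}_s{site}_{channel}.tif",
                    f"/home/ubuntu/bucket/projects/demo/images/{plate}",
                ]
            writer.writerow(row)
```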
27 | We maintain a script for creating LoadData.csv from Phenix metadata XML files called [pe2loaddata](https://github.com/broadinstitute/pe2loaddata). 28 | 29 | You can also create the LoadData.csv in a local copy of CellProfiler using the standard input modules of Images, Metadata, NamesAndTypes and Groups. 30 | More written and video information about using the input modules can be found [here](broad.io/CellProfilerInput). 31 | After loading in your images, use the `Export`->`Image Set Listing` command. 32 | You will then need to replace the local paths with the paths where the files can be found in S3 which is hardcoded to `/home/ubuntu/bucket`. 33 | If your files are nested in the same structure, this can be done with a simple find and replace in any text editing software. 34 | (e.g. Find '/Users/eweisbar/Desktop' and replace with '/home/ubuntu/bucket') 35 | 36 | More detail: The [Dockerfile](https://github.com/DistributedScience/Distributed-CellProfiler/blob/master/worker/Dockerfile) is the first script to execute in the Docker. 37 | It creates the `/home/ubuntu/` folder and then executes [run_worker.sh](https://github.com/DistributedScience/Distributed-CellProfiler/blob/master/worker/run-worker.sh) from that point. 38 | run_worker.sh makes `/home/ubuntu/bucket/` and uses S3FS to mount your S3 bucket at that location. (If you set `DOWNLOAD_FILES='True'` in your [config](step_1_configuration.md), then the S3FS mount is bypassed but files are downloaded locally to the `/home/ubuntu/bucket` path so that the paths are the same as if it was S3FS mounted.) 39 | 40 | ### Using LoadData.csv 41 | 42 | To use a LoadData.csv with submitJobs, put the path to the LoadData.csv in **data_file:**. 43 | 44 | To use a LoadData.csv with run_batch_general.py, enter the name of the LoadData.csv under **#project specific stuff** in `{STEP}name`. 45 | At the bottom of the file, make sure there are no arguments or `batch=False` in the command for the step you are running. 46 | (e.g. `MakeAnalysisJobs()` or `MakeAnalysisJobs(batch=False)`) 47 | Note that if you do not follow our standard file organization, under **#not project specific, unless you deviate from the structure** you will also need to edit `datafilepath`. 48 | 49 | ## Batch Files 50 | 51 | Batch files are an easy way to transition from running locally to distributed. 52 | A batch file is an `.h5` file created by CellProfiler which captures all the data needed to run your workflow - pipeline and file information are packaged together. 53 | To use a batch file, your data needs to have the same structure in the cloud as on your local machine. 54 | 55 | ### Creating batch files 56 | 57 | To create a batch file, load all your images into a local copy of CellProfiler using the standard input modules of Images, Metadata, NamesAndTypes and Groups. 58 | More written and video information about using the input modules can be found [here](broad.io/CellProfilerInput). 59 | Put the `CreateBatchFiles` module at the end of your pipeline and ensure that it is selected. 60 | Add a path mapping and edit the `Local root path` and `Cluster root path`. 61 | Run the CellProfiler pipeline by pressing the `Analyze Images` button; note that it won't actually run your pipeline but will instead create a batch file. 62 | More information on the `CreateBatchFiles` module can be found [here](https://cellprofiler-manual.s3.amazonaws.com/CellProfiler-4.2.4/modules/fileprocessing.html). 
63 | 64 | ### Using batch files 65 | 66 | To use a batch file with submitJobs, put the path to the `.h5` file in **data_file:** and **pipeline:**. 67 | 68 | To use a batch file with run_batch_general.py, enter the name of the batch file under **#project specific stuff** in `batchpipename{STEP}`. 69 | At the bottom of the file, set `batch=True` in the command for the step you are running. 70 | (e.g. `MakeAnalysisJobs(batch=True)`) 71 | Note that if you do not follow our standard file organization, under **#not project specific, unless you deviate from the structure** you will also need to edit `batchpath`. 72 | 73 | ## File lists 74 | 75 | You can also simply pass a list of absolute file paths (not relative paths) with one file per row in `.txt` format. 76 | These must be the absolute paths that Distributed-CellProfiler will see, aka relative to the root of your bucket (which will be mounted as `/bucket`. 77 | 78 | ### Creating File Lists 79 | 80 | Use any text editing software to create a `.txt` file where each line of the file is a path to a single image that you want to process. 81 | 82 | ### Using File Lists 83 | 84 | To use a file list with submitJobs, put the path to the `.txt` file in **data_file:**. 85 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/step_0_prep.md: -------------------------------------------------------------------------------- 1 | # Step 0: Prep 2 | 3 | There are two classes of AWS resources that Distributed-CellProfiler interacts with: 1) infrastructure that is made once per AWS account to enable any Distributed-CellProfiler implementation to run and 2) infrastructure that is made and destroyed with every run. 4 | This section describes the creation of the first class of AWS infrastructure and only needs to be followed once per account. 5 | 6 | ## AWS Configuration 7 | 8 | The AWS resources involved in running Distributed-CellProfiler are configured using the [AWS Web Console](https://aws.amazon.com/console/) and a setup script we provide ([setup_AWS.py](../../setup_AWS.py)). 9 | You need an active AWS account configured to proceed. 10 | Login into your AWS account, and make sure the following list of resources is created: 11 | 12 | ### 1.1 Manually created resources 13 | 14 | * **Security Credentials**: Get [security credentials](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html) for your account. 15 | Store your credentials in a safe place that you can access later. 16 | * **SSH Key**: You will probably need an ssh key to login into your EC2 instances (control or worker nodes). 17 | [Generate an SSH key](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) and store it in a safe place for later use. 18 | If you'd rather, you can generate a new key pair to use for this during creation of the control node; make sure to `chmod 600` the private key when you download it. 19 | * **SSH Connection**: You can use your default AWS account VPC, subnet, and security groups. 20 | You should add an inbound SSH connection from your IP address to your security group. 21 | 22 | ### 1.2 Automatically created resources 23 | 24 | * BEFORE running setup_AWS, you need to open `lambda_function.py` and edit the `BUCKET_NAME` (keeping the quotes around the name) at the top of the file to be the name of your bucket. 25 | After editing, Line 12 of `lambda_function.py` should look like `bucket = "my-bucket-name"`. 
26 | * Run setup_AWS by entering `python setup_AWS.py` from your command line. 27 | It will automatically create: 28 | * an [ecsInstanceRole](http://docs.aws.amazon.com/AmazonECS/latest/developerguide/instance_IAM_role.html) with appropriate permissions. 29 | This role is used by the EC2 instances generated by your spot fleet request and coordinated by ECS. 30 | * an [aws-ec2-spot-fleet-tagging-role](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-fleet-requests.html) with appropriate permissions. 31 | This role grants the Spot Fleet the permissions to request, launch, terminate, and tag instances. 32 | * an SNS topic that is used for triggering the auto-Monitor. 33 | * a Monitor lambda function that is used for auto-monitoring of your runs (see [Step 4: Monitor](step_4_monitor.md) for more information). 34 | 35 | ### 1.3 Auxiliary Resources 36 | 37 | *You can certainly configure Distributed-CellProfiler for use without S3, but most DS implementations use S3 for storage.* 38 | 39 | * [Create an S3 bucket](http://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) and upload your data to it. 40 | Add permissions to your bucket so that [logs can be exported to it](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/S3ExportTasksConsole.html) (Step 3, first code block). 41 | 42 | ### 1.4 Increase Spot Limits 43 | 44 | AWS initially [limits the number of spot instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-spot-limits.html) you can use at one time; you can request more through a process in the linked documentation. 45 | Depending on your workflow (your scale and how you group your jobs), this may not be necessary. 46 | 47 | ## The Control Node 48 | 49 | The control node is a machine that is used for running the Distributed-CellProfiler scripts. 50 | It can be your local machine, if it is configured properly, or it can also be a small instance in AWS. 51 | We prefer to have a small EC2 instance dedicated to controlling our Distributed-CellProfiler workflows for simplicity of access and configuration. 52 | To login in an EC2 machine you need an SSH key that can be generated in the web console. 53 | Each time you launch an EC2 instance you have to confirm having this key (which is a .pem file). 54 | This machine is needed only for submitting jobs, and does not have any special computational requirements, so you can use a micro instance to run basic scripts to proceed. 55 | (Though we recommend each user has their own control node, further control nodes can be created from an AMI after this guide has been followed to completion once.) 56 | 57 | The control node needs the following tools to successfully run Distributed-CellProfiler. 58 | These instructions assume you are using the command line in a Linux machine, but you are free to try other operating systems too. 59 | 60 | ### Create Control Node from Scratch 61 | 62 | #### 2.1 Install Python 3.8 or higher and pip 63 | 64 | Most scripts are written in Python and support Python 3.8 and 3.9. 65 | Follow installation instructions for your platform to install Python. 66 | pip should be included with the installation of Python 3.8 or 3.9, but if you do not have it installed, install pip. 67 | 68 | #### 2.2 Clone this repository and install requirements 69 | 70 | You will need the scripts in Distributed-CellProfiler locally available in your control node. 71 |
 72 |     sudo apt-get install git
 73 |     git clone https://github.com/DistributedScience/Distributed-CellProfiler.git
 74 |     cd Distributed-CellProfiler/
 75 |     git pull
 76 |     # install requirements
 77 |     cd files
 78 |     sudo pip install -r requirements.txt
 79 | 
80 | 81 | #### 2.3 Install AWS CLI 82 | 83 | The command line interface is the main mode of interaction between the local node and the resources in AWS. 84 | You need to install [awscli](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for Distributed-CellProfiler to work properly: 85 | 86 |
 87 |     sudo pip install awscli --ignore-installed six
 88 |     sudo pip install --upgrade awscli
 89 |     aws configure
 90 | 
91 | 92 | When running the last step (`aws configure`), you will need to enter your AWS credentials. 93 | Make sure to set the region correctly (i.e. us-west-1 or eu-east-1, not eu-west-2a), and set the default file type to json. 94 | 95 | #### 2.1.4 s3fs-fuse (optional) 96 | 97 | [s3fs-fuse](https://github.com/s3fs-fuse/s3fs-fuse) allows you to mount your s3 bucket as a pseudo-file system. 98 | It does not have all the performance of a real file system, but allows you to easily access all the files in your s3 bucket. 99 | Follow the instructions at the link to mount your bucket. 100 | 101 | ### Create Control Node from AMI (optional) 102 | 103 | Once you've set up the other software (and gotten a job running, so you know everything is set up correctly), you can use Amazon's web console to set this up as an Amazon Machine Instance, or AMI, to replicate the current state of the hard drive. 104 | Create future control nodes using this AMI so that you don't need to repeat the above installation. 105 | 106 | ## Removing long-term infrastructure 107 | 108 | If you decide that you never want to run Distributed-CellProfiler again and would like to remove the long-term infrastructure, follow these steps. 109 | 110 | ### Remove Roles, Lambda Monitor, and Monitor SNS 111 | 112 |
113 | python setup_AWS.py destroy
114 | 
115 | 116 | ### Remove EC2 Control node 117 | 118 | If you made your control node as an EC2 instance, while in the AWS console, select the instance. 119 | Select `Instance state` => `Terminate instance`. 120 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/step_1_configuration.md: -------------------------------------------------------------------------------- 1 | # Step 1: Configuration 2 | 3 | The first step in setting up any job is editing the values in the config.py file. 4 | Once the config file is created, simply type `python run.py setup` to set up your resources based on the configurations you've specified. 5 | 6 | *** 7 | 8 | ## Components of the config file 9 | 10 | * **APP_NAME:** This will be used to tie your clusters, tasks, services, logs, and alarms together. 11 | It need not be unique, but it should be descriptive enough that you can tell jobs apart if you're running multiple analyses (i.e. "NuclearSegmentation_Drosophila" is better than "CellProfiler"). 12 | 13 | *** 14 | 15 | * **DOCKERHUB_TAG:** This is the encapsulated version of your software your analyses will be running. 16 | 17 | *** 18 | 19 | ### AWS GENERAL SETTINGS 20 | 21 | These are settings that will allow your instances to be configured correctly and access the resources they need; see [Step 0: Prep](step_0_prep.md) for more information. 22 | 23 | Bucket configurations allow you to read from and/or write to buckets in accounts other than the one in which you are running DCP. 24 | If you are not accessing any external buckets, set AWS_BUCKET, SOURCE_BUCKET, and DESTINATION_BUCKET the same. 25 | For more information and examples, see [External Buckets](external_buckets.md). 26 | 27 | * **AWS_BUCKET:** The bucket to which you would like to write log files. 28 | This is generally the bucket in the account in which you are running compute. 29 | * **SOURCE_BUCKET:** The bucket where the image files you will be reading are. 30 | Often, this is the same as AWS_BUCKET. 31 | * **WORKSPACE:** The bucket where non-image files you will be reading are (e.g. pipeline, load_data.csv, etc.). 32 | Often, this is the same as AWS_BUCKET. 33 | * **DESTINATION_BUCKET:** The bucket where you want to write your output files. 34 | Often, this is the same as AWS_BUCKET. 35 | * **UPLOAD_FLAGS:** If you need to add flags to an AWS CLI command to upload files to your DESTINATION_BUCKET, this is where you enter them. 36 | This is typically only used if you are writing to a bucket that is not yours. 37 | If you don't need to add UPLOAD_FLAGS, keep it as the default ''. 38 | 39 | *** 40 | 41 | ### EC2 AND ECS INFORMATION 42 | 43 | * **ECS_CLUSTER:** Which ECS cluster you'd like the jobs to go into. 44 | All AWS accounts come with a "default" cluster, but you may add more clusters if you like. 45 | Distinct clusters for each job are not necessary, but if you're running multiple analyses at once it can help avoid the wrong Docker containers (such as the ones for your "NuclearSegmentation_Drosophila" job) going to the wrong instances (such as the instances that are part of your "NuclearSegmentation_HeLa" spot fleet). 46 | * **CLUSTER_MACHINES:** How many EC2 instances you want to have in your cluster. 47 | * **TASKS_PER_MACHINE:** How many Docker containers to place on each machine. 48 | * **MACHINE_TYPE:** A list of what type(s) of machines your spot fleet should contain. 49 | * **MACHINE_PRICE:** How much you're willing to pay per hour for each machine launched.
50 | AWS has a handy [price history tracker](https://console.aws.amazon.com/ec2sp/v1/spot/home) you can use to make a reasonable estimate of how much to bid. 51 | If your jobs complete quickly and/or you don't need the data immediately you can reduce your bid accordingly; jobs that may take many hours to finish or that you need results from immediately may justify a higher bid. 52 | See also [AWS on-demand pricing](https://aws.amazon.com/ec2/pricing/on-demand/) to compare the cost savings of using spot fleets. 53 | * **EBS_VOL_SIZE:** The size of the temporary hard drive associated with each EC2 instance in GB. 54 | The minimum allowed is 22. 55 | If you have multiple Dockers running per machine, each Docker will have access to (EBS_VOL_SIZE/TASKS_PER_MACHINE) - 2 GB of space. 56 | * **DOWNLOAD_FILES:** Whether or not to download the image files to the EBS volume before processing, as opposed to accessing them all from S3FS. 57 | This typically requires a larger EBS volume (depending on the size of your image sets, and how many sets are processed per group), but avoids occasional issues with S3FS that can crop up on longer runs. 58 | By default, DCP uses S3FS to mount the S3 `SOURCE_BUCKET` as a pseudo-file system on each EC2 instance in your spot fleet to avoid file download. 59 | If you are unable to mount the `SOURCE_BUCKET` (perhaps because of a permissions issue) you should proceed with `DOWNLOAD_FILES = 'True'`. 60 | * **ASSIGN_IP:** Whether or not to assign a public IPv4 address to each instance in the spot fleet. 61 | If set to 'False', it will overwrite whatever is in the Fleet file. 62 | If set to 'True', it will respect whatever is in the Fleet file. 63 | Distributed-CellProfiler originally defaulted to assigning an IP address to each instance so that one could connect to the instance for troubleshooting, but that need has been mostly obviated by the level of logging currently in DCP. 64 | 65 | *** 66 | 67 | ### DOCKER INSTANCE RUNNING ENVIRONMENT 68 | 69 | * **DOCKER_CORES:** How many copies of your script to run in each Docker container. 70 | * **CPU_SHARES:** How many CPUs each Docker container may have. 71 | * **MEMORY:** How much memory each Docker container may have. 72 | * **SECONDS_TO_START:** The time each Docker core will wait before it starts another copy of your software. 73 | This can safely be set to 0 for workflows that don't require much memory or execute quickly; for slower and/or more memory intensive pipelines we advise you to space them out by roughly the length of your most memory intensive step to make sure your software doesn't crash due to lack of memory. 74 | 75 | *** 76 | 77 | ### SQS QUEUE INFORMATION 78 | 79 | * **SQS_QUEUE_NAME:** The name of the queue where all of your jobs will be sent. 80 | * **SQS_MESSAGE_VISIBILITY:** How long each job is hidden from view before being allowed to be tried again. 81 | We recommend setting this to slightly longer than the average amount of time it takes an individual job to process: if you set it too short, you may waste resources doing the same job multiple times; if you set it too long, your instances may have to wait around a long while to access a job that was sent to an instance that stalled or has since been terminated. 82 | * **SQS_DEAD_LETTER_QUEUE:** The name of the queue to send jobs to if they fail to process correctly multiple times; this keeps a single bad job (such as one where a single file has been corrupted) from keeping your cluster active indefinitely.
83 | This queue will be automatically made if it doesn't exist already. 84 | See [Step 0: Prep](step_0_prep.md) for more information. 85 | * **JOB_RETRIES:** This is the number of times that a job will be retried before it is sent to the Dead Letter Queue. 86 | 87 | *** 88 | 89 | ### LOG GROUP INFORMATION 90 | 91 | * **LOG_GROUP_NAME:** The name to give the log group that will monitor the progress of your jobs and allow you to check performance or look for problems after the fact. 92 | 93 | *** 94 | 95 | ### MONITORING 96 | 97 | * **AUTO_MONITOR:** Whether or not to have Auto-Monitor automatically monitor your jobs. 98 | 99 | *** 100 | 101 | ### CLOUDWATCH DASHBOARD CREATION 102 | 103 | * **CREATE_DASHBOARD:** Create a Cloudwatch Dashboard that plots run metrics? 104 | * **CLEAN_DASHBOARD:** Automatically clean up the Cloudwatch Dashboard at the end of the run? 105 | 106 | *** 107 | 108 | ### REDUNDANCY CHECKS 109 | 110 | * **CHECK_IF_DONE_BOOL:** Whether or not to check the output folder before proceeding. 111 | Case-insensitive. 112 | If an analysis fails partway through (due to some of the files being in the wrong place, an AWS outage, a machine crash, etc.), setting this to 'True' allows you to resubmit the whole analysis but only reprocess jobs that haven't already been done. 113 | This saves you from having to try to parse exactly which jobs succeeded vs failed or from having to pay to rerun the entire analysis. 114 | If your software determines the correct number of files are already in the output folder it will designate that job as completed and move on to the next one. 115 | If you actually do want to overwrite files that were previously generated (such as when you have improved a pipeline and no longer want the output of the old version), set this to 'False' to process jobs whether or not there are already files in the output folder. 116 | * **EXPECTED_NUMBER_FILES:** How many files need to be in the output folder in order to mark a job as completed. 117 | * **MIN_FILE_SIZE_BYTES:** What is the minimal number of bytes an object should be to "count"? 118 | Useful when trying to detect jobs that may have exported smaller corrupted files vs larger, full-size files. 119 | * **NECESSARY_STRING:** This allows you to optionally set a string that must be included in your file to count towards the total in EXPECTED_NUMBER_FILES. 120 | 121 | *** 122 | 123 | ### CELLPROFILER SETTINGS 124 | 125 | * **ALWAYS CONTINUE:** Whether or not to run CellProfiler with the --always-continue flag, which will keep CellProfiler from crashing if it errors. 126 | Use with caution. 127 | This can be particularly helpful in jobs where a large number of files are loaded in a single run (such as during illumination correction) so that a corrupted or missing file doesn't prevent the whole job completing. 128 | However, this can make it harder to notice jobs that are not completing successfully, so it should be used with caution. 129 | We suggest using this setting in conjunction with a small number of JOB_RETRIES. 130 | 131 | *** 132 | 133 | ### PLUGINS 134 | 135 | * **USE_PLUGINS:** Whether or not you will be using external plugins from the CellProfiler-plugins repository. 136 | When True, passes the `--plugins-directory` flag to CellProfiler. 137 | Defaults to the current v1.0 `CellProfiler-plugins/active_plugins` location for plugins but will revert to the historical location of plugins in the `CellProfiler-plugins` root directory if the `active_plugins` folder is not present.
138 | * **UPDATE_PLUGINS:** Whether or not to update the plugins repository before use. 139 | (i.e. run `git fetch --all` on CellProfiler-plugins) 140 | * **PLUGINS_COMMIT:** If desired, what commit or version tag to check out. 141 | Used in the `git checkout PLUGINS_COMMIT` command in CellProfiler-plugins. 142 | If you do not want to check out a specific commit, set to False. 143 | * **INSTALL_REQUIREMENTS:** Whether or not to install requirements associated with plugins. 144 | Not all plugins require additional requirement installation. 145 | See [CellProfiler-plugins Documentation](https://plugins.cellprofiler.org/using_plugins.html) for more information on requirements. 146 | * **REQUIREMENTS:** For current v1.0 CellProfiler-plugins, a flag that will be passed to the install command (e.g. `cellpose`). 147 | See [CellProfiler-plugins Documentation](https://plugins.cellprofiler.org/using_plugins.html) for more information on supported flags. 148 | For deprecated versions of CellProfiler-plugins before v1.0, pass a path within the CellProfiler-plugins repository to the requirements file you would like to install that will be used in the `pip install -r REQUIREMENTS_FILE` command. 149 | 150 | The [CellProfiler/Distributed-CellProfiler Docker](https://hub.docker.com/r/cellprofiler/distributed-cellprofiler/tags) 2.0.0_4.2.4 and older have a clone of the CellProfiler-plugins repository with deprecated organization in them. 151 | If you would like to continue using this clone, set `USE_PLUGINS = 'True'` and `UPDATE_PLUGINS = 'False'`. 152 | Note that if you do need to install requirements with the deprecated organization, pass the path to the requirements file within the CellProfiler-plugins repository as `REQUIREMENTS`. 153 | If you would like to update the CellProfiler-plugins repository with up-to-date plugins and new structure while using the CellProfiler/Distributed-CellProfiler Docker 2.0.0_4.2.4 and older, set `UPDATE_PLUGINS = 'True'`. 154 | 155 | [CellProfiler/Distributed-CellProfiler Dockers](https://hub.docker.com/r/cellprofiler/distributed-cellprofiler/tags) newer than 2.0.0_4.2.4 have the current CellProfiler-plugins repository organization. 156 | If you need to use deprecated plugin organization you can access previous commits or version tags by passing them as `PLUGINS_COMMIT`. 157 | 158 | *** 159 | 160 | ### EXAMPLE CONFIGURATIONS 161 | 162 | ![Sample_Distributed-CellProfiler_Configuration_1](images/sample_DCP_config_1.png) 163 | 164 | This is an example of one possible configuration. 165 | It's a fairly large machine that is able to process 64 jobs at the same time. 166 | 167 | The Config settings for this example are: 168 | **TASKS_PER_MACHINE** = 16 (number of Dockers) 169 | **EBS_VOL_SIZE** = 165 170 | **MACHINE_TYPE** = ['m4.16xlarge'] 171 | 172 | **DOCKER_CORES** = 4 (copies of CellProfiler to run inside a docker) 173 | **CPU_SHARES** = 4096 (number of cores for each Docker * 1024) 174 | **MEMORY** = 15000 (MB for each Docker) 175 | 176 | ![Sample_Distributed-CellProfiler_Configuration_2](images/sample_DCP_config_2.png) 177 | 178 | This is an example of another possible configuration. 179 | When we run Distributed-CellProfiler we tend to prefer running a larger number of smaller machines. 180 | This is an example of a configuration we often use. 181 | We might use a spot fleet of 100 of these machines (**CLUSTER_MACHINES** = 100).
182 | 183 | The Config settings for this example are: 184 | **TASKS_PER_MACHINE** = 1 (number of Dockers) 185 | **EBS_VOL_SIZE** = 22 186 | **MACHINE_TYPE** = ['m4.xlarge'] 187 | 188 | **DOCKER_CORES** = 4 (copies of CellProfiler to run inside a docker) 189 | **CPU_SHARES** = 4096 (number of cores for each Docker * 1024) 190 | **MEMORY** = 15000 (MB for each Docker) 191 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/step_2_submit_jobs.md: -------------------------------------------------------------------------------- 1 | # Step 2: Submit Jobs 2 | 3 | ## Overview 4 | 5 | Distributed-CellProfiler works by breaking your analysis into a series of smaller jobs based on the metadata and groupings you've specified in your pipeline. 6 | The choice of how to group your images is largely dependent on the details of your experiment. 7 | For example, pipelines to create [illumination correction functions](http://onlinelibrary.wiley.com/doi/10.1111/jmi.12178/full) are usually run on a per-plate basis, while pipelines with a large number of memory intensive measurement steps such as [CellPainting](http://www.nature.com/nprot/journal/v11/n9/full/nprot.2016.105.html) are grouped based on plate, well, and site so that each node only analyzes one site at a time. 8 | 9 | Once you've decided on a grouping, you're ready to start configuring your job file. 10 | Once your job file is configured, simply use `python run.py submitJob files/{YourJobFile}.json` to send all the jobs to the SQS queue [specified in your config file](step_1_configuration.md). 11 | 12 | *** 13 | 14 | **Distributed-CellProfiler only works for pipelines with extracted metadata and specified groupings, and which use either LoadData CSVs or h5 containers created by the CreateBatchFiles module**, though we hope to add support for file-lists in the future. 15 | See [this page](https://github.com/CellProfiler/CellProfiler/wiki/Adapting-CellProfiler-to-a-LIMS-environment) for more information about running CellProfiler headless with file-lists versus LoadData CSVs. 16 | 17 | **The grouping specified in your pipeline MUST match what you specify here in order to successfully run jobs.** 18 | 19 | Due to CellProfiler's image-loading mechanisms, experiments with >10,000 image sites can begin to suffer from decreased performance. 20 | Breaking such experiments down into a number of smaller CSVs may increase your processing throughput. 21 | If using LoadData, make sure your "Base image location" is set to "None". 22 | 23 | *** 24 | 25 | ## Configuring your job file 26 | 27 | * **pipeline:** The path to your pipeline file. 28 | * **data_file:** The path to your LoadData.csv, batch file, or file list file. 29 | * **input:** The path to your default input directory. 30 | This is not necessary for every pipeline but can be helpful when non-image files are needed in the pipeline (such as a text file containing quality control rules for the FlagImage module or a metadata file for use with file lists). 31 | DO NOT set this to a large directory, or CellProfiler will try to scan the entire thing before running your pipeline. 32 | * **output:** The top output directory you'd like your files placed in. 33 | * **output_structure:** By default, Distributed-CellProfiler will put your output in subfolders created by hyphenating all your Metadata entries (see below) in order (e.g.
if the individual group being processed was `{"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01"}`, the output would be placed in `output_top_directory/Plate1-A01`.) 34 | If you'd like a different folder structure, you may designate one here (e.g. if you set `"output_structure": "Metadata_Plate/Metadata_Well"` then the previous example would output to `output_top_directory/Plate1/A01`). 35 | This setting is optional. 36 | Job files that don't include it will use the default structure. 37 | * **groups:** The list of all the groups of images you'd like to process. 38 | For large numbers of groups, it may be helpful to create this list separately as a .txt file you can then append into the job's JSON file. 39 | You may create this yourself in your favorite scripting language. 40 | Alternatively, you can use the following additional tools to help you create and format this list: 41 | * `batches.sh` allows you to provide a list of all the individual metadata components (plates, columns, rows, etc). 42 | It then uses [GNU parallel](https://www.gnu.org/software/parallel/parallel_tutorial.html) to create a formatted text file with all the possible combinations of the components you provided. 43 | This approach is best when you have a large number of groups and the group structure is uniform. 44 | 45 | Example: for a 96-well plate experiment where there are 3 plates and the experiment is grouped by Plate and Well, `batches.sh` would read: 46 | `parallel echo '{\"Metadata\": \"Metadata_Plate={1},Metadata_Well={2}{3}\"},' ::: Plate1 Plate2 Plate3 ::: A B C D E F G H ::: 01 02 03 04 05 06 07 08 09 10 11 12 | sort > batches.txt` 47 | * You may also use the list of groupings created by calling `cellprofiler --print-groups` from the command line (see [here](https://github.com/CellProfiler/CellProfiler/wiki/Adapting-CellProfiler-to-a-LIMS-environment#cmd) and [here](https://github.com/CellProfiler/Distributed-CellProfiler/issues/52) for more information). 48 | Note that for job files that specify groupings in this way, the `output_structure` variable is NOT optional - it must be specified or an error will be returned. 49 | 50 | ## Alternate job submission: run_batch_general.py 51 | 52 | We also support a second path besides `submitJobs` to create the list of jobs - the `run_batch_general.py` file. 53 | This file essentially serves as a "shortcut" to run many common types of stereotyped experiments we run in our lab. 54 | If your data follows a regular structure (such as N rows, N columns, N groupings, a particular structure for output, etc.), you may find it useful to take and modify this file for your own usage. 55 | We recommend new users use the `submitJobs` pathway, as it will help them understand the kinds of information Distributed-CellProfiler needs in order to run properly, but once they are comfortable with it they may find `run_batch_general.py` helps them create jobs faster in the future. 56 | 57 | As of Distributed-CellProfiler 2.2.0, `run_batch_general.py` has been reformatted as a CLI tool with greatly enhanced customizability. 58 | `run_batch_general.py` must be passed the following pieces of information: 59 | 60 | ### Required inputs 61 | 62 | * `step` is the step that you would like to make jobs for. 63 | Supported steps are `zproj`, `illum`, `qc`, `qc_persite`, `assaydev`, and `analysis`. 64 | * `identifier` is the project identifier (e.g. "cpg0000-jump-pilot" or "2024_11_07_Collaborator_Cell_Painting") 65 | * `batch` is the name of the data batch (e.g. "2020_11_04_CPJUMP1") 66 | * `platelist` is the list of plates to process. 67 | Format the list in quotes with individual plates separated by commas and no spaces (e.g. "Plate1,Plate2,Plate3") 68 | 69 | A minimal `run_batch_general.py` command may look like: 70 | ```bash 71 | run_batch_general.py analysis 2024_05_16_Segmentation_Project 2024_10_10_Batch1 "Plate1,Plate2,Plate3" 72 | ``` 73 | 74 | ### Required input for Cell Painting Gallery 75 | 76 | Runs being made off of the Cell Painting Gallery require two additional flags: 77 | 78 | * `--source`: specifies the identifier-specific source of the data. 79 | * `--path-style cpg`: sets the input and output paths to match how data is structured in the Cell Painting Gallery. 80 | All paths can be overwritten with flags (see below). 81 | 82 | A minimal `run_batch_general.py` command for a dataset on the Cell Painting Gallery may look like: 83 | ```bash 84 | run_batch_general.py analysis cpg0000-jump-pilot 2020_11_04_CPJUMP1 "BR00116991,BR00116992" --path-style cpg --source broad 85 | ``` 86 | 87 | ### Plate layout flags 88 | 89 | * `--plate-format`: if used, can be `96` or `384` and will overwrite `rows` and `columns` to produce standard 96- or 384-well plate well names (e.g. A01, A02, etc.) 90 | * `--rows`: a custom list of row labels. 91 | Will be combined with `columns` to generate well names. 92 | Separate values with commas and no spaces and surround with quotation marks (e.g. `"A,B,C,D,E,F,G"`) 93 | * `--columns`: a custom list of column labels. 94 | Will be combined with `rows` to generate well names. 95 | Separate values with commas and no spaces and surround with quotation marks (e.g. `"1,2,3,4,5,6,7,8,9,10"`) 96 | * `--wells`: a custom list of wells. 97 | Overwrites `rows` and `columns`. 98 | Separate values with commas and no spaces and surround with quotation marks (e.g. `"C02,D04,E04,N12"`) 99 | * `--no-well-digit-pad`: Formats wells without well digit padding 100 | (e.g. `A1` NOT `A01`). 101 | Applies to wells passed with `--plate-format` or `--rows` and `--columns`, but not `--wells`. 102 | * `--sites`: a custom list of sites (fields of view) to be analyzed. 103 | Default is 9 sites (1 to 9). 104 | Not used by `illum`, `qc`, or `assaydev` steps. 105 | Separate values with commas and no spaces and surround with quotation marks (e.g.
`"1,2,3,4,5,6"`) 106 | 107 | ### Overwrite structural defaults 108 | 109 | * `--output-structure `: overwrite default output structure 110 | * `--output-path `: overwrite default output path 111 | * `--input-path `: overwrite the default path to input files 112 | 113 | ### Overwrite defaults (for runs using load data .csv's and .cppipe) 114 | 115 | * `--pipeline `: overwrite the default pipeline name 116 | * `--pipeline-path `: overwrite the default path to pipelines 117 | * `--datafile-name `: overwrite the default load data .csv name 118 | * `--datafile-path `: overwrite the default path to load data files 119 | 120 | ### Overwrite defaults (for runs using .h5 batch files) 121 | 122 | * `--use-batch`: use h5 batch files instead of load data csv and .cppipe files 123 | * `--batchfile-name `: overwrite default batchfile name 124 | * `--batchfile-path `: overwrite default path to the batchfile 125 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/step_3_start_cluster.md: -------------------------------------------------------------------------------- 1 | # Step 3: Start Cluster 2 | 3 | After your jobs have been submitted to the queue, it is time to start your cluster. 4 | Once you have configured your spot fleet request per the instructions below, you may run 5 | `python run.py startCluster files/{YourFleetFile}.json` 6 | 7 | When you enter this command, the following things will happen (in this order): 8 | 9 | * Your spot fleet request will be sent to AWS. 10 | Depending on their capacity and the price that you bid, it can take anywhere from a couple of minutes to several hours for your machines to be ready. 11 | * Distributed-CellProfiler will create the APP_NAMESpotFleetRequestId.json file, which will allow you to [start your progress monitor](step_4_monitor.md). 12 | This will allow you to walk away and just let things run even if your spot fleet won't be ready for some time. 13 | 14 | Once the spot fleet is ready: 15 | 16 | * Distributed-CellProfiler will create the log groups (if they don't already exist) for your log streams to go in. 17 | * Distributed-CellProfiler will ask AWS to place Docker containers onto the instances in your spot fleet. 18 | Your job will begin shortly! 19 | 20 | *** 21 | 22 | ## Configuring your spot fleet request 23 | 24 | Definition of many of these terms and explanations of many of the individual configuration parameters of spot fleets are covered in AWS documentation [here](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-fleet.html) and [here](http://docs.aws.amazon.com/cli/latest/reference/ec2/request-spot-fleet.html). 25 | You may also configure your spot fleet request through Amazon's web interface and simply download the JSON file at the "review" page to generate the configuration file you want, though we do not recommend this as Distributed-CellProfiler assumes a certain fleet request structure and has only been tested on certain Amazon AMI's. 26 | Looking at the output of this automatically generated spot fleet request can be useful though for obtaining values like your VPC's subnet and security groups, as well the ARN ID's of your roles. 27 | 28 | Among the parameters you should/must update: 29 | 30 | * **The IamFleetRole, IamInstanceProfile, KeyName, SubnetId, and Groups:** These are account specific and you will configure these based on the [previous setup work that you did](step_0_prep.md). 
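As a rough illustration of where these values live, a trimmed fleet file entry might look like the sketch below (the ARNs, key pair name, subnet, and security group are placeholders, and the fleet file you generated in the setup step may nest or order these fields somewhat differently):

```json
{
    "IamFleetRole": "arn:aws:iam::012345678901:role/your-spot-fleet-role",
    "LaunchSpecifications": [
        {
            "IamInstanceProfile": {
                "Arn": "arn:aws:iam::012345678901:instance-profile/ecsInstanceRole"
            },
            "KeyName": "MyKeyPair",
            "NetworkInterfaces": [
                {
                    "DeviceIndex": 0,
                    "SubnetId": "subnet-0123456789abcdef0",
                    "Groups": ["sg-0123456789abcdef0"]
                }
            ]
        }
    ]
}
```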
31 | Once you've created your first complete spot fleet request, you can save a copy as a local template so that you don't have to look these up every time. 32 | 33 | * The KeyName used here should be the same used in your config file but **without** the `.pem` extension. 34 | 35 | * **ImageId and SnapshotId:** These refer to the OS and pre-installed programming that will be used by your spot fleet instances, and are both AWS region specific. 36 | We use the Amazon ECS-Optimized Amazon Linux AMI, but the Amazon Linux 2 AMI also seems to work in our limited testing. 37 | If there is no template fleet file for your region, or the one here is too out-of-date, see below for instructions on configuring these options yourself. 38 | If you have a good working configuration for a region that isn't represented or for a more up-to-date version of the AMI than we've had time to test, please feel free to create a pull request and we'll include it in the repo! 39 | 40 | ## Parameters that must be configured in the spot fleet in DCP 1 (but not current versions) 41 | 42 | These parameters were present in the spot fleet request in the first version of DCP but not subsequent versions. 43 | We provide the information here because we have not officially deprecated DCP 1; however, we strongly encourage you to use a more updated version. 44 | 45 | * **ValidFrom and ValidTo:** These should be set such that the current date is between them, and no more than 1 year apart. 46 | * **TargetCapacity:** The number of instances you want. 47 | You must make sure it aligns with the specifications from your [config file](step_1_configuration.md). 48 | * **InstanceType:** The type of instances you want. 49 | You must make sure it aligns with the specifications from your [config file](step_1_configuration.md). 50 | * **SpotPrice:** The maximum price per hour you're willing to pay for each instance. 51 | AWS has a handy [price history tracker](https://console.aws.amazon.com/ec2sp/v1/spot/home) you can use to make a reasonable estimate of how much to bid. 52 | If your jobs complete quickly and/or you don't need the data immediately you can reduce your bid accordingly. 53 | Jobs that may take many hours to finish or that you need results from immediately may justify a higher bid. 54 | 55 | ## To run in a region where a spot fleet config isn't available or is out of date 56 | 57 | * Under EC2 -> Instances select "Launch Instance" 58 | 59 | ![Launch Instance](images/Launch.jpg) 60 | 61 | * Search "ECS", then choose the "Amazon ECS-Optimized Amazon Linux AMI" 62 | 63 | ![Select ECS-Optimized](images/ECS.jpg) 64 | 65 | * Select Continue, then select any instance type (we're going to kill this after a few seconds) and click "Next: Configure Instance Details" 66 | 67 | * Choose a network and subnet in the region you wish to launch instances in, and then click "Next: Add Storage" 68 | 69 | ![Set Network and Subnet](images/Network.jpg) 70 | 71 | * On the "Add Storage" page, note down the Snapshot column for the Root volume - this is your SnapshotId. 72 | Click "Review and Launch" 73 | 74 | ![Get SnapshotId](images/Snapshot.jpg) 75 | 76 | * Click "Launch", and then select any key pair (again, we'll be killing this in a few seconds) 77 | 78 | * Once your instance has launched, click its link from the launch page. 79 | 80 | ![Click InstanceID](images/InstanceID.jpg) 81 | 82 | * In the list of information on the instance, find and note its AMI ID - this is your ImageId 83 | 84 | ![Get the AMI ID](images/AMIID.jpg) 85 | 86 | * Terminate the instance 87 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/step_4_monitor.md: -------------------------------------------------------------------------------- 1 | # Step 4: Monitor 2 | 3 | Your workflow is now submitted. 4 | Distributed-CellProfiler will keep an eye on a few things for you at this point without you having to do anything else. 5 | 6 | * Each instance is labeled with your APP_NAME, so that you can easily find your instances if you want to look at the instance metrics on the Running Instances section of the [EC2 web interface](https://console.aws.amazon.com/ec2/v2/home) to monitor performance. 7 | * You can also look at the whole-cluster CPU and memory usage statistics related to your APP_NAME in the [ECS web interface](https://console.aws.amazon.com/ecs/home). 8 | * Each instance will have an alarm placed on it so that if CPU usage dips below 1% for 15 consecutive minutes (almost always the result of a crashed machine), the instance will be automatically terminated and a new one will take its place. 9 | * Each individual job processed will create a log of the CellProfiler output, and each Docker container will create a log showing CPU, memory, and disk usage. 10 | 11 | If you choose to run the Monitor script, Distributed-CellProfiler can be even more helpful. 12 | 13 | ## Running Monitor 14 | 15 | ### Manually running Monitor 16 | 17 | Monitor can be run by entering `python run.py monitor files/APP_NAMESpotFleetRequestId.json`. 18 | The optimal time to initiate Monitor is as soon as you have triggered a run, since it downscales infrastructure as necessary, but you can run Monitor at any point in time and it will clean up whatever infrastructure remains. 19 | 20 | **Note:** You should run the monitor inside [Screen](https://www.gnu.org/software/screen/), [tmux](https://tmux.github.io/), or another comparable service to keep a network disconnection from killing your monitor; this is particularly critical the longer your run takes. 21 | 22 | ### Using Auto-Monitor 23 | 24 | Instead of manually triggering Monitor, you can have a version of Monitor automatically initiate after you [start your cluster](step_3_start_cluster.md) by setting `AUTO_MONITOR = 'True'` in your [config file](step_1_configuration.md). 25 | Auto-Monitor is an AWS Lambda function that is triggered by alarms placed on the SQS queue. 26 | Read more about the [SQS Queue](SQS_QUEUE_information.md) to better understand the alarm metrics. 27 | 28 | ## Monitor functions 29 | 30 | ### While your analysis is running 31 | 32 | * Scales down the spot fleet request to match the number of remaining jobs WITHOUT force terminating them. 33 | This happens every 10 minutes with manual Monitor or when there are no Visible Messages in your queue for Auto-Monitor. 34 | * Deletes the alarms for any instances that have been terminated in the last 24 hours (because of spot prices rising above your maximum bid, machine crashes, etc). 35 | This happens every hour with manual Monitor or when there are no Visible Messages in your queue for Auto-Monitor. 36 | 37 | ### When your queue is totally empty (there are no Visible or Not Visible messages) 38 | 39 | * Downscales the ECS service associated with your APP_NAME.
40 | * Deletes all the alarms associated with your spot fleet (both the currently running and the previously terminated instances). 41 | * Shuts down your spot fleet to keep you from incurring charges after your analysis is over. 42 | * Gets rid of the queue, service, and task definition created for this analysis. 43 | * Exports all the logs from your analysis onto your S3 bucket. 44 | * Deletes your Cloudwatch Dashboard if you created it and set `CLEAN_DASHBOARD = 'True'` in your [config file](step_1_configuration.md). 45 | 46 | ## Cheapest mode 47 | 48 | If you are manually triggering Monitor, you can run the monitor in an optional "cheapest" mode, which will downscale the number of requested machines (but not RUNNING machines) to one machine 15 minutes after the monitor is engaged. 49 | You can engage cheapest mode by adding `True` as a final configurable parameter when starting the monitor, aka `python run.py monitor files/APP_NAMESpotFleetRequestId.json True` 50 | 51 | Cheapest mode is cheapest because it will remove all but 1 machine as soon as that machine crashes and/or runs out of jobs to do; this can save you money, particularly in multi-CPU Dockers running long jobs. 52 | This mode is optional because running this way involves some inherent risks. 53 | If machines stall out due to processing errors, they will not be replaced, meaning your job will take overall longer. 54 | Additionally, if there is limited capacity for your requested configuration when you first start (e.g. you want 200 machines but AWS says it can currently only allocate you 50), more machines will not be added if and when they become available in cheapest mode as they would in normal mode. 55 | 56 | *** 57 | 58 | ## Monitor file 59 | 60 | The JSON monitor file containing all the information Distributed-CellProfiler needs will have been automatically created when you sent the instructions to start your cluster in the [previous step](step_3_start_cluster). 61 | The file itself is quite simple and contains the following information: 62 | 63 | ```json 64 | {"MONITOR_FLEET_ID" : "sfr-9999ef99-99fc-9d9d-9999-9999999e99ab", 65 | "MONITOR_APP_NAME" : "2021_12_13_Project_Analysis", 66 | "MONITOR_ECS_CLUSTER" : "default", 67 | "MONITOR_QUEUE_NAME" : "2021_12_13_Project_AnalysisQueue", 68 | "MONITOR_BUCKET_NAME" : "bucket-name", 69 | "MONITOR_LOG_GROUP_NAME" : "2021_12_13_Project_Analysis", 70 | "MONITOR_START_TIME" : "1649187798951"} 71 | ``` 72 | 73 | For any Distributed-CellProfiler run where you have run [`startCluster`](step_3_start_cluster) more than once, the most recent values will overwrite the older values in the monitor file. 74 | Therefore, if you have started multiple spot fleets (which you might do in different subnets if you are having trouble getting enough capacity in your spot fleet, for example), Monitor will only clean up the latest request unless you manually edit the `MONITOR_FLEET_ID` to match the spot fleet you have kept. 75 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/troubleshooting_runs.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | Note that this is a sparse matrix. 4 | Services/behaviors that are as expected and/or not relevant for diagnosing a problem are marked N/A. 
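Many of the Cloudwatch symptoms in the table below are easiest to confirm by pulling the logs directly with the AWS CLI rather than clicking through the console. A minimal sketch, assuming your log group names are derived from your APP_NAME as in the example configs (replace `YourAppName` with your own value):

```bash
# List the most recently active log streams for the run
aws logs describe-log-streams \
    --log-group-name YourAppName \
    --order-by LastEventTime --descending --max-items 5

# Search a run's logs for a specific error string
aws logs filter-log-events \
    --log-group-name YourAppName \
    --filter-pattern "ERROR"

# The per-instance resource logs live in a second group
aws logs describe-log-streams --log-group-name YourAppName_perInstance --max-items 5
```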
5 | 6 | | SQS | Cloudwatch | S3 | EC2/ECS | Problem | Solution | 7 | |---|---|---|---|---|---| 8 | | Messages in flight consistently < number of dockers running | CP never progresses beyond a certain module | No outputs are written to S3 | N/A | CP is stalling indefinitely on a step without throwing an error. This means there is a bug in CP. | The module that is stalling is the one after the last module that got logged. Check the Issues in the CP Github repo for reports of problems with a certain module. If you don’t see a report, make one. Use different settings within the module to avoid the bug or use a different version of DCP with the bug fixed. | 9 | | Jobs completing (total messages decreasing) much more quickly than expected. | "File not run due to > expected number of files" | No new outputs are written to S3 | N/A | CHECK_IF_DONE_BOOL is being triggered because the output folder for your job already has >= EXPECTED_NUMBER_OF_FILES. | If you want to overwrite previous runs, in your config, change CHECK_IF_DONE_BOOL to TRUE. If using the CHECK_IF_DONE_BOOL option to avoid reprocessing old jobs, make sure to account for any files that may already exist in the output folder. i.e. if your pipeline creates 5 files, but there are already 6 files in your output folder, make sure to set the EXPECTED_NUMBER_FILES to 11 (6+5), not 5. | 10 | | Jobs completing (total messages decreasing) much more quickly than expected. | "== OUT" without proceeding through CP pipeline | Batch_data.h5 files being created instead of expected output. | N/A | Your pipeline has the CreateBatchFiles module included. | Uncheck the CreateBatchFiles module in your pipeline. | 11 | | Jobs moving to dead messages | "ValueError: dictionary update sequence element #1 has length 1; 2 is required" | No outputs are written to S3 | N/A | The syntax in the groups section of your job file is incorrect. | If you are grouping based on multiple variables, make sure there are no spaces between them in your listing in your job file. e.g. "Metadata_Plate=Plate1,Metadata_Well=A01" is correct, "Metadata_Plate=Plate1, Metadata_Well=A01" is incorrect. | 12 | | N/A | Nothing happens for a long time after "cellprofiler -c -r " | N/A | N/A | 1) Your input directory is set to a folder with a large number of files and CP is trying to read the whole directory before running. 2) You are loading very large images. | 1) In your job file, change the input to a smaller folder. 2) Consider downscaling your images before running them in CP. Or just be more patient. | 13 | | N/A | Within a single log there are multiple "cellprofiler -c -r" | Expected output seen. | N/A | A single job is being processed multiple times. | SQS_MESSAGE_VISIBILITY is set too short. See [SQS_Queue_information](SQS_QUEUE_information.md) for more information. | 14 | | Jobs moving to dead messages | "ValueError: no name (Invalid arguments to routine: Bad value)" or "Encountered unrecoverable error in LoadData during startup: No name (no name)" | No outputs to S3 | N/A | There is a problem with your LoadData.csv. This is usually seen when CSVs are created with a script; accidentally having an extra comma somewhere (looks like ",,") will be invisible in Excel but generate the CP error. If you made your CSVs with pandas to_csv option, you must pass index=False or you will get this error. | Find the ",," in your CSV and remove it. If you made your CSVs with pandas dataframe’s to_csv function, check to make sure you used the index=False parameter. 
| 15 | | Jobs moving to dead messages | IndexError: index 0 is out of bounds for axis 0 with size 0 | No outputs to S3 | N/A | 1) Metadata values of 0 OR that have leading zeros (ie Metadata_Site=04, rather than Metadata_Site=4) are not handled well by CP. 2) The submitted jobs don’t make sense to CP. 3) DCP is looking for your images in the wrong location. 4) CellProfiler isn't accessing the rows of your load_data.csv that contain information about the jobs. | 1) Change your LoadData.csv so that there are no Metadata values of 0 or with 0 padding. 2) Change your job file so that your jobs match your pipeline’s expected input. 3) If using LoadData, make sure the file paths are correct in your LoadData.csv and the "Base image location" is set correctly in the LoadData module. If using BatchFiles, make sure your BatchFile paths are correct. 4) Make sure that your LoadData module has "Process just a range of rows?" as No or that the range you have set do not filter out the jobs that you are submitting. | 16 | | N/A | N/A | Pipeline output is not where expected | N/A | 1) There is a mistake in your ExportToSpreadsheet in your pipeline. 2) There is a mistake in your job file. | 1) Check that your Output File Location is as expected. Default Output Folder is typical. Default Output Folder sub-folder can cause outputs to be nested in an unusual manner. 2) Check the output path in your job file. | 17 | | Jobs moving to dead messages | "Empty image set list: no images passed the filtering criteria." | No outputs to S3 | N/A | DCP doesn’t know how to load your image set.| If you are using a .cppipe and LoadData.csv, make sure that your pipeline includes the LoadData module. | 18 | | Jobs completing (total messages decreasing) much more quickly than expected. | "==OUT, SUCCESS" | No outcome/saved files on S3 | N/A | There is a mismatch in your metadata somewhere. | Check the `Metadata_` columns in your load_data.csv for typos or a mismatch with your jobs file. The most common sources of mismatch are case and zero padding (e.g. A01 vs a01 vs A1). Check for these mismatches and edit the job file accordingly. If you use pe2loaddata to create your csvs and the plate was imaged multiple times, pay particular attention to the Metadata_Plate column as numbering reflecting this will be automatically passed into the load_data.csv | 19 | | N/A | Your specified output structure does not match the Metadata passed. | Expected output is seen. | N/A | This is not necessarily an error. If the input grouping is different than the output grouping (e.g. jobs are run by Plate-Well-Site but are all output to a single Plate folder) then this will print in the Cloudwatch log that matches the input structure but actual job progress will print in the Cloudwatch log that matches the output structure. | N/A | 20 | | Jobs moving to dead messages | Your perinstance logs have an IOError indicating that an .h5 batchfile does not exist | No outcome/saved files on S3 | N/A | Your job is configured for using a batchfile and no batchfile exists for your project | 1) Create a batch file and make sure that it is in the appropriate directory 2) Make sure that you have set your batch file location correctly in your jobs 3) If using run_batch_general.py for job creation, make sure that you passed the `--use-batch` flag | 21 | | No jobs are pulled from the queue | No logs are created | No outputs are written to S3 | Machines made in EC2 but they remain nameless. | A nameless machine means that the Dockers are not placed on the machines. 
1) There is a mismatch in your DCP config file. OR 2) You haven't set up permissions correctly. OR 3) Dockers are not being made in ECS | 1) Confirm that the MEMORY matches the MACHINE_TYPE set in your config. Confirm that there are no typos in your DOCKERHUB_TAG set in your config. 2) Check that you have set up permissions correctly for the user or role that you have set in your config under AWS_PROFILE. Confirm that your `ecsInstanceRole` is able to access the S3 bucket where your `ecsconfigs` have been uploaded. 3) Check in ECS that you see `Registered container instances`. | 22 | | Jobs moving to dead messages | Your perinstance logs have an IOError indicating that CellProfiler cannot open your pipeline | No outputs are written to S3 | N/A | You have a corrupted pipeline | Check if you can open your pipeline locally. It may have been corrupted on upload or it may have an error within the pipeline itself. | 23 | | N/A | "== ERR move failed:An error occurred (SlowDown) when calling the PutObject operation (reached max retries: 4): Please reduce your request rate." Error may not show initially and may become more prevalent with time. | N/A | N/A | Too many jobs are finishing too quickly creating a backlog of jobs waiting to upload to S3. | You can 1) check out fewer machines at a time, 2) check out smaller machines and run fewer copies of DCP at the same time, or 3) group jobs in larger groupings (e.g. by Plate instead of Well or Site). If this happens because you have many jobs finishing at the same time (but not finishing very rapidly such that it's not creating an increasing backlog) you can increase SECONDS_TO_START in config.py so there is more separation between jobs finishing. | 24 | | N/A | "/home/ubuntu/bucket: Transport endpoint is not connected" | S3 cannot be accessed by fleet. | N/A | S3FS has stochastically dropped/failed to connect. | Perform your run without using S3FS by setting DOWNLOAD_FILES = TRUE in your config.py. Note that, depending upon your job and machine setup, you may need to increase the size of your EBS volume to account for the files being downloaded. | 25 | | Jobs moving to dead messages | "SSL: certificate subject name (*.s3.amazonaws.com) does not match target host name 'xxx.yyy.s3.amazonaws.com'" | S3 cannot be accessed by fleet. | N/A | S3FS fails to mount if your bucket name has a dot (.) in it. | You can bypass S3FS usage by setting DOWNLOAD_FILES = TRUE in your config.py. Note that, depending upon your job and machine setup, you may need to increase the size of your EBS volume to account for the files being downloaded. Alternatively, you can make your own DCP Docker and edit run-worker.sh to `use_path_request_style`. If your region is not us-east-1 you also need to specify `endpoint`. See S3FS documentation for more information. | 26 | | N/A | Your logs show that files are downloading but it never moves beyond that point. | N/A | N/A | If you have set DOWNLOAD_FILES = TRUE in your config, then your files are failing to completely download because you are running out of space and it is failing silently. | Place larger volumes on your instances by increasing EBS_VOL_SIZE in your config.py | 27 | | Jobs moving to dead messages | "ValueError: The Mito image is missing from the pipeline." | No files are output to S3 | N/A | The CellProfiler pipeline is referencing a channel (in this example, "Mito") that is not being loaded in the pipeline. | Check that your load_data csv contains the FileNames and PathNames for all your images. 
This can sometimes happen when the load_data csv is being automatically generated or edited as part of a workflow. | 28 | | Jobs moving to dead messages | "Failed to prepare run for module LoadData", "ValueError: zero-size array to reduction operation maximum which has no identity" | No files are output to S3 | N/A | CellProfiler cannot read any information from your load_data.csv. | Check that your load_data.csv contains data beyond the header. This can sometimes happen when the load_data csv is being automatically generated or edited as part of a workflow. | 29 | | Jobs moving to dead messages | "CP PROBLEM: Done file reports failure." | No files are output to S3 | N/A | Something went wrong in your CellProfiler pipeline. | Read the logs above the CP PROBLEM message to see what the specific CellProfiler error is and fix that error in your pipeline. | 30 | 31 | Further hints: 32 | 33 | - The SSH_KEY_NAME in the config.py file contains the name of the key pair used to access AWS. 34 | This field is the name of the file with the .pem extension (SSH_KEY_NAME = "MyKeyPair.pem"). 35 | The same name is used in the fleet configuration file (e.g. exampleFleet.json) but without using the .pem extension ("KeyName": "MyKeyPair"). 36 | - With multi-well plates (e.g. 384-well plate), it is often better to use LoadData module in CellProfiler pipeline. 37 | Pipelines that use LoadData don't need to worry about setting the input field in exampleJob_PlateID.json UNLESS something in the pipeline (such as FlagImage, FilterObjects, SaveImages, etc) references the "Default Input Folder". 38 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/troubleshooting_start_cluster.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting startCluster 2 | 3 | If you are having problems at [Step 3 (Start Cluster)](step_3_start_cluster.md) in your Distributed-CellProfiler runs, you may find the following troubleshooting information helpful. 4 | 5 | ## IamFleetRole 6 | 7 | If there is a problem with the `IamFleetRole` in your Fleet File, you may get the following error: 8 | 9 | ```bash 10 | botocore.exceptions.ClientError: An error occurred (InvalidSpotFleetRequestConfig) when calling the RequestSpotFleet operation: Parameter: SpotFleetRequestConfig.IamFleetRole is invalid. 11 | ``` 12 | 13 | ## IamInstanceProfile 14 | 15 | If there is a problem with the `IamInstanceProfile` in your Fleet File, you may get the following error: 16 | 17 | ```bash 18 | Your spot fleet request is causing an error and is now being cancelled. Please check your configuration and try again 19 | spotFleetRequestConfigurationInvalid : c5.xlarge, ami-0f161e6034a6262d8, Linux/UNIX: Value 20 | ``` 21 | 22 | - Check your FleetFile.json. 23 | Confirm that in the `IamInstanceProfile` the `Arn` is an **instance-profile** NOT a **role** (e.g. `"arn:aws:iam::012345678901:instance-profile/ecsInstanceRole"`). 24 | This is different from the `IamFleetRole` at the top of the FleetFile.json that is a **role**. 25 | - Confirm that your ecsInstanceRole was created correctly. 26 | If you created resources manually, using either the CLI or the console, you may have missed part of the `IamInstanceProfile` creation. 27 | In your command line, run `aws iam list-instance-profiles-for-role --role-name ecsInstanceRole`. 
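On a correctly configured account, this should return a non-empty list with the role attached, roughly of the form sketched below (trimmed to the relevant fields; the account ID is a placeholder):

```json
{
    "InstanceProfiles": [
        {
            "InstanceProfileName": "ecsInstanceRole",
            "Arn": "arn:aws:iam::012345678901:instance-profile/ecsInstanceRole",
            "Roles": [
                {
                    "RoleName": "ecsInstanceRole",
                    "Arn": "arn:aws:iam::012345678901:role/ecsInstanceRole"
                }
            ]
        }
    ]
}
```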
28 | If it returns `{"InstanceProfiles": []}`, then run the following commands: 29 | 30 | ```bash 31 | aws iam create-instance-profile --instance-profile-name ecsInstanceRole 32 | 33 | aws iam add-role-to-instance-profile --role-name ecsInstanceRole --instance-profile-name ecsInstanceRole 34 | ``` 35 | 36 | ## SubnetId 37 | 38 | If there is a problem with the `SubnetId` in your Fleet File, you may get the following error: 39 | 40 | ```bash 41 | botocore.exceptions.ClientError: An error occurred (InvalidSpotFleetRequestConfig) when calling the RequestSpotFleet operation: One of the provided subnets was not valid. 42 | ``` 43 | 44 | ## Groups 45 | 46 | If there is a problem with the `Groups` in your Fleet File, you may get the following error: 47 | 48 | ```bash 49 | Your spot fleet request is causing an error and is now being cancelled. Please check your configuration and try again 50 | spotFleetRequestConfigurationInvalid : c5.xlarge, ami-0f161e6034a6262d8, Linux/UNIX: The security group 'sg-01234567890123451atest' does not exist in VPC 'vpc-0123456789012345' 51 | ``` 52 | -------------------------------------------------------------------------------- /documentation/DCP-documentation/versions.md: -------------------------------------------------------------------------------- 1 | # Versions 2 | 3 | The most current release can always be found on DockerHub at `cellprofiler/distributed-cellprofiler`. 4 | Current version is 2.2.0. 5 | Our current tag system is DCPversion_CellProfilerversion, e.g. 1.2.0_3.1.8 6 | 7 | Previous release versions can be accessed at `bethcimini/distributed-cellprofiler:versionnumber` 8 | 9 | --- 10 | 11 | ## Version History 12 | 13 | ### 2.2.0 - Released 20241105 14 | 15 | * run_batch_general overhauled to be a CLI tool with support for Cell Painting Gallery structure 16 | * Support for AWS IAM assumed roles 17 | * Improved handling of CellProfiler-plugins and update to current CellProfiler-plugins organization 18 | * Adds WORKSPACE_BUCKET to the config so that image files and metadata files can be read off different buckets 19 | * Adds JOB_RETRIES to the config so that the number of retries before sending a job to DeadMessages is configurable 20 | * Adds ALWAYS_CONTINUE to the config so that the flag can be passed to CellProfiler 21 | * Adds ASSIGN_IP to the config and defaults to false so that EC2 spot fleet instances do not automatically get assigned a private IP address 22 | 23 | ### 2.1.0 - Released 20230518 24 | 25 | * Addition of setup_AWS.py to automate AWS infrastructure setup 26 | * Addition of optional auto-monitor 27 | * Addition of auto-dashboard creation 28 | * Addition of auto-Deadletter queue creation 29 | * Improved handling of AWS credentials 30 | 31 | ### 2.0.0rc2 - Released 20201110 32 | 33 | * Add optional ability to download files to EBS rather than reading from S3 (helpful for pipelines that access many files/image sets) 34 | 35 | ### 2.0.0rc1 - Released 20201105 36 | 37 | * Remove requirement for boto and fabric, using only boto3 38 | * Add support for Python 3 and CellProfiler 4 39 | * Move cluster size, machine type, and machine price to the config file from the fleet file, eliminating mismatches between the two 40 | * Add the ability to filter only files with certain names when running CHECK_IF_DONE 41 | * Don't cancel a fleet over capacity errors 42 | * Add "cheapest" mode to the monitor, allowing you to run more cheaply (at possible expense of running more slowly) 43 | 44 | ### 1.2.2 - Released 20201103 45 | 46 | * Allow pipelines using 
batch files to also designate an input output_top_directory 47 | * Add support for multiple LaunchData specifications 48 | * Add CellProfiler-plugins 49 | * Additional way to create job submissions with run_batch_general.py 50 | 51 | ### 1.2.1 - Released 20200109, Updated through 20191002 52 | 53 | * Allow monitor to downscale machines when number of jobs < number of machines 54 | * Add a parameter to discount files when running CHECK_IF_DONE checks if less than a certain size 55 | 56 | ### 1.2.0 - Released 20181108, Updated through 20200109 57 | 58 | * Improved compatibility with CellProfiler 2 and 3 59 | * Better handling of logging when using output_structure 60 | 61 | ### 1.1.0 - Released 20170217, Updated 20170221 (bugfixes) - 20181018 62 | 63 | * Changes in this release: 64 | 65 | * Added the `output_structure` variable to the job file, which allows you to structure the output folders created by DCP (i.e. `Plate/Well-Site` rather than `Plate-Well-Site`). Job files lacking this variable will still default to the previous settings (hyphenating all Metadata items in the order they are presented in the Metadata grouping). 66 | * Added support for creating the list of groups via `cellprofiler --print-groups` - see [this issue](https://github.com/CellProfiler/Distributed-CellProfiler/issues/52) for an example and discussion. Groups listed in this way MUST use the `output_structure` variable to state their desired folder structure or an error will be returned. 67 | 68 | ### 1.0.0 - Version as of 20170213 69 | -------------------------------------------------------------------------------- /example_project/README.md: -------------------------------------------------------------------------------- 1 | # Distributed-CellProfiler Minimal Example 2 | 3 | Included in this folder are all of the resources for running a complete mini-example of Distributed-CellProfiler. 4 | It includes 3 sample image sets and a CellProfiler pipeline that identifies cells within the images and makes measurements. 5 | It also includes the Distributed-CellProfiler files pre-configured to create a queue of all 3 jobs and spin up a spot fleet of 3 instances, each of which will process a single image set. 6 | 7 | ## Running example project 8 | 9 | ### Step 0 10 | 11 | Before running this mini-example, you will need to set up your AWS infrastructure as described in our [online documentation](https://distributedscience.github.io/Distributed-CellProfiler/step_0_prep.html). 12 | This includes creating the fleet file that you will use in Step 3. 13 | 14 | Upload the `demo_project_folder` folder to the top level of your bucket.
15 | While in the `Distributed-CellProfiler` folder, use the following command, replacing `yourbucket` with your bucket name: 16 | 17 | ```bash 18 | # Copy example files to S3 19 | BUCKET=yourbucket 20 | aws s3 sync example_project/demo_project_folder s3://${BUCKET}/demo_project_folder 21 | 22 | # Replace the default config with the example config 23 | cp example_project/config.py config.py 24 | ``` 25 | 26 | ### Step 1 27 | 28 | In config.py you will need to update the following fields specific to your AWS configuration: 29 | 30 | ```python 31 | # AWS GENERAL SETTINGS: 32 | AWS_REGION = 'us-east-1' 33 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 34 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 35 | AWS_BUCKET = 'your-bucket-name' 36 | SOURCE_BUCKET = 'your-bucket-name' # Only differs from AWS_BUCKET with advanced configuration 37 | DESTINATION_BUCKET = 'your-bucket-name' # Only differs from AWS_BUCKET with advanced configuration 38 | ``` 39 | 40 | Then run `python3 run.py setup` 41 | 42 | ### Step 2 43 | 44 | This command points to the job file created for this demonstration and should be run as-is. 45 | `python3 run.py submitJob example_project/files/exampleJob.json` 46 | 47 | ### Step 3 48 | 49 | This command should point to whatever fleet file you created in Step 0 so you may need to update the `exampleFleet.json` file name. 50 | `python3 run.py startCluster files/exampleFleet.json` 51 | 52 | ### Step 4 53 | 54 | This command points to the monitor file that is automatically created with your run and should be run as-is. 55 | `python3 run.py monitor files/FlyExampleSpotFleetRequestId.json` 56 | 57 | ## Results 58 | 59 | While the run is happening, you can watch real-time metrics in your Cloudwatch Dashboard by navigating in the [Cloudwatch console](https://console.aws.amazon.com/cloudwatch). 60 | Note that the metrics update at intervals that may not be helpful with this fast, minimal example. 61 | 62 | After the run is done, you should see your CellProfiler output files in S3 at s3://${BUCKET}/project_folder/output in per-image folders. 63 | 64 | ## Cleanup 65 | 66 | The spot fleet, queue, and task definition will be automatically cleaned up after your demo is complete because you are running `monitor`. 67 | 68 | To remove everything else: 69 | 70 | ```bash 71 | # Remove files added to S3 bucket 72 | BUCKET=yourbucket 73 | aws s3 rm --recursive s3://${BUCKET}/demo_project_folder 74 | 75 | # Remove Cloudwatch logs 76 | aws logs delete-log-group --log-group-name FlyExample 77 | aws logs delete-log-group --log-group-name FlyExample_perInstance 78 | 79 | # Delete DeadMessages queue 80 | aws sqs delete-queue --queue-url ExampleProject_DeadMessages 81 | ``` 82 | -------------------------------------------------------------------------------- /example_project/config.py: -------------------------------------------------------------------------------- 1 | # Constants (User configurable) 2 | 3 | APP_NAME = 'FlyExample' # Used to generate derivative names unique to the application. 
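# Among the derivative names: the SQS queue below (APP_NAME + 'Queue'), the Cloudwatch log groups
# (FlyExample and FlyExample_perInstance) removed in the README's cleanup step, and the monitor file
# (files/FlyExampleSpotFleetRequestId.json) used in Step 4 of the README.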
4 | 5 | # DOCKER REGISTRY INFORMATION: 6 | DOCKERHUB_TAG = 'cellprofiler/distributed-cellprofiler:2.0.0_4.2.4' 7 | 8 | # AWS GENERAL SETTINGS: 9 | AWS_REGION = 'us-east-1' 10 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 11 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 12 | AWS_BUCKET = 'your-bucket-name' # Bucket to use for logging (likely all three buckets the same for this example) 13 | SOURCE_BUCKET = 'your-bucket-name' # Bucket to download files from (likely all three buckets the same for this example) 14 | DESTINATION_BUCKET = 'your-bucket-name' # Bucket to upload files to (likely all three buckets the same for this example) 15 | 16 | # EC2 AND ECS INFORMATION: 17 | ECS_CLUSTER = 'default' 18 | CLUSTER_MACHINES = 3 19 | TASKS_PER_MACHINE = 1 20 | MACHINE_TYPE = ['c4.xlarge'] 21 | MACHINE_PRICE = 0.13 22 | EBS_VOL_SIZE = 22 # In GB. Minimum allowed is 22. 23 | DOWNLOAD_FILES = 'False' 24 | ASSIGN_IP = 'False' # If false, will overwrite setting in Fleet file 25 | 26 | # DOCKER INSTANCE RUNNING ENVIRONMENT: 27 | DOCKER_CORES = 1 # Number of CellProfiler processes to run inside a docker container 28 | CPU_SHARES = DOCKER_CORES * 1024 # ECS computing units assigned to each docker container (1024 units = 1 core) 29 | MEMORY = 7000 # Memory assigned to the docker container in MB 30 | SECONDS_TO_START = 3*60 # Wait before the next CP process is initiated to avoid memory collisions 31 | 32 | # SQS QUEUE INFORMATION: 33 | SQS_QUEUE_NAME = APP_NAME + 'Queue' 34 | SQS_MESSAGE_VISIBILITY = 10*60 # Timeout (secs) for messages in flight (average time to be processed) 35 | SQS_DEAD_LETTER_QUEUE = 'ExampleProject_DeadMessages' 36 | 37 | # LOG GROUP INFORMATION: 38 | LOG_GROUP_NAME = APP_NAME 39 | 40 | # CLOUDWATCH DASHBOARD CREATION 41 | CREATE_DASHBOARD = 'True' # Create a dashboard in Cloudwatch for run 42 | CLEAN_DASHBOARD = 'True' # Automatically remove dashboard at end of run with Monitor 43 | 44 | # REDUNDANCY CHECKS 45 | CHECK_IF_DONE_BOOL = 'False' #True or False- should it check if there are a certain number of non-empty files and delete the job if yes? 46 | EXPECTED_NUMBER_FILES = 7 #What is the number of files that trigger skipping a job? 47 | MIN_FILE_SIZE_BYTES = 1 #What is the minimal number of bytes an object should be to "count"? 48 | NECESSARY_STRING = '' #Is there any string that should be in the file name to "count"? 49 | 50 | # PLUGINS 51 | USE_PLUGINS = 'False' 52 | UPDATE_PLUGINS = 'False' 53 | PLUGINS_COMMIT = '' # What commit or version tag do you want to check out? 
54 | INSTALL_REQUIREMENTS = 'False' 55 | REQUIREMENTS_FILE = '' # Path within the plugins repo to a requirements file 56 | -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS002_D.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS002_D.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS002_F.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS002_F.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS002_R.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS002_R.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS076_D.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS076_D.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS076_F.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS076_F.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS076_R.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS076_R.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS218_D.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS218_D.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS218_F.TIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS218_F.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/images/01_POS218_R.TIF: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DistributedScience/Distributed-CellProfiler/bccb4ce3e07eedd43552851d5e3534e3bccae3f6/example_project/demo_project_folder/images/01_POS218_R.TIF -------------------------------------------------------------------------------- /example_project/demo_project_folder/workspace/load_data.csv: -------------------------------------------------------------------------------- 1 | FileName_OrigBlue,FileName_OrigGreen,FileName_OrigRed,Metadata_Position,PathName_OrigBlue,PathName_OrigGreen,PathName_OrigRed 2 | 01_POS002_D.TIF,01_POS002_F.TIF,01_POS002_R.TIF,002,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images 3 | 01_POS076_D.TIF,01_POS076_F.TIF,01_POS076_R.TIF,076,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images 4 | 01_POS218_D.TIF,01_POS218_F.TIF,01_POS218_R.TIF,218,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images,/home/ubuntu/bucket/demo_project_folder/images 5 | -------------------------------------------------------------------------------- /example_project/files/exampleJob.json: -------------------------------------------------------------------------------- 1 | { 2 | "_comment1": "Paths in this file are relative to the root of your S3 bucket", 3 | "pipeline": "demo_project_folder/workspace/ExampleFly.cppipe", 4 | "data_file": "demo_project_folder/workspace/load_data.csv", 5 | "input": "demo_project_folder/workspace/", 6 | "output": "demo_project_folder/output", 7 | "output_structure": "Metadata_Position", 8 | "_comment2": "The following groups are tasks, and each will be run in parallel", 9 | "groups": [ 10 | {"Metadata": "Metadata_Position=2"}, 11 | {"Metadata": "Metadata_Position=76"}, 12 | {"Metadata": "Metadata_Position=218"} 13 | ] 14 | } 15 | 16 | -------------------------------------------------------------------------------- /example_project_CPG/README.md: -------------------------------------------------------------------------------- 1 | # CPG Example Project 2 | 3 | Included in this folder is all of the resources for running a complete mini-example of Distributed-CellProfiler. 4 | This example differs from the other example project in that it reads data hosted in the public data repository the [Cell Painting Gallery](https://github.com/broadinstitute/cellpainting-gallery) instead of reading images from your own bucket. 5 | Workspace files are hosted in your own S3 bucket, and data is output to your bucket, and compute is performed in your account. 6 | It includes the Distributed-CellProfiler files pre-configured to create a queue of 3 jobs and spin up a spot fleet of 3 instances, each of which will process a single image set. 7 | 8 | ## Running example project 9 | 10 | ### Step 0 11 | 12 | Before running this mini-example, you will need to set up your AWS infrastructure as described in our [online documentation](https://distributedscience.github.io/Distributed-CellProfiler/step_0_prep.html). 13 | This includes creating the fleet file that you will use in Step 3. 14 | 15 | Upload the 'sample_project' folder to the top level of your bucket. 
16 | While in the `Distributed-CellProfiler` folder, use the following command, replacing `yourbucket` with your bucket name: 17 | 18 | ```bash 19 | # Copy example files to S3 20 | BUCKET=yourbucket 21 | aws s3 sync example_project_CPG/demo_project_folder s3://${BUCKET}/demo_project_folder 22 | 23 | # Replace the default config with the example config 24 | cp example_project_CPG/config.py config.py 25 | ``` 26 | 27 | ### Step 1 28 | 29 | In config.py you will need to update the following fields specific to your AWS configuration: 30 | 31 | ```python 32 | # AWS GENERAL SETTINGS: 33 | AWS_REGION = 'us-east-1' 34 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 35 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 36 | AWS_BUCKET = 'your-bucket-name' 37 | WORKSPACE_BUCKET = 'your-bucket-name' # Only differs from AWS_BUCKET with advanced configuration 38 | DESTINATION_BUCKET = 'your-bucket-name' # Only differs from AWS_BUCKET with advanced configuration 39 | ``` 40 | 41 | Then run `python run.py setup` 42 | 43 | ### Step 2 44 | 45 | This command points to the job file created for this demonstration and should be run as-is. 46 | `python run.py submitJob example_project_CPG/files/exampleCPGJob.json` 47 | 48 | ### Step 3 49 | 50 | This command should point to whatever fleet file you created in Step 0 so you may need to update the `exampleFleet.json` file name. 51 | `python run.py startCluster files/exampleFleet.json` 52 | 53 | ### Step 4 54 | 55 | This command points to the monitor file that is automatically created with your run and should be run as-is. 56 | `python run.py monitor files/ExampleCPGSpotFleetRequestId.json` 57 | 58 | ## Results 59 | 60 | While a run is happening, you can watch real-time metrics in your Cloudwatch Dashboard by navigating in the [Cloudwatch console](https://console.aws.amazon.com/cloudwatch). 61 | Note that the metrics update at intervals that may not be helpful with this fast, minimal example. 62 | 63 | After the run is done, you should see your CellProfiler output files in your S3 bucket at s3://${BUCKET}/project_folder/output in per-well-and-site folders. 64 | 65 | ## Cleanup 66 | 67 | The spot fleet, queue, and task definition will be automatically cleaned up after your demo is complete because you are running `monitor`. 68 | 69 | To remove everything else: 70 | 71 | ```bash 72 | # Remove files added to S3 bucket 73 | BUCKET=yourbucket 74 | aws s3 rm --recursive s3://${BUCKET}/demo_project_folder 75 | 76 | # Remove Cloudwatch logs 77 | aws logs delete-log-group --log-group-name ExampleCPG 78 | aws logs delete-log-group --log-group-name ExampleCPG_perInstance 79 | 80 | # Delete DeadMessages queue 81 | aws sqs delete-queue --queue-url ExampleProject_DeadMessages 82 | ``` 83 | -------------------------------------------------------------------------------- /example_project_CPG/config.py: -------------------------------------------------------------------------------- 1 | # Constants (User configurable) 2 | 3 | APP_NAME = 'ExampleCPG' # Used to generate derivative names unique to the application. 
4 | 5 | # DOCKER REGISTRY INFORMATION: 6 | DOCKERHUB_TAG = 'erinweisbart/distributed-cellprofiler:2.2.0rc1_4.2.4' 7 | 8 | # AWS GENERAL SETTINGS: 9 | AWS_REGION = 'us-east-1' 10 | AWS_PROFILE = 'default' # The same profile used by your AWS CLI installation 11 | SSH_KEY_NAME = 'your-key-file.pem' # Expected to be in ~/.ssh 12 | AWS_BUCKET = 'your-bucket-name' # Bucket to use for logging 13 | SOURCE_BUCKET = 'cellpainting-gallery' # Bucket to download image files from 14 | WORKSPACE_BUCKET = 'your-bucket-name' # Bucket to download non-image files from 15 | DESTINATION_BUCKET = 'your-bucket-name' # Bucket to upload files to 16 | 17 | # EC2 AND ECS INFORMATION: 18 | ECS_CLUSTER = 'default' 19 | CLUSTER_MACHINES = 3 20 | TASKS_PER_MACHINE = 1 21 | MACHINE_TYPE = ['c4.xlarge'] 22 | MACHINE_PRICE = 0.13 23 | EBS_VOL_SIZE = 22 # In GB. Minimum allowed is 22. 24 | DOWNLOAD_FILES = 'True' 25 | ASSIGN_IP = 'False' # If false, will overwrite setting in Fleet file 26 | 27 | # DOCKER INSTANCE RUNNING ENVIRONMENT: 28 | DOCKER_CORES = 1 # Number of CellProfiler processes to run inside a docker container 29 | CPU_SHARES = DOCKER_CORES * 1024 # ECS computing units assigned to each docker container (1024 units = 1 core) 30 | MEMORY = 7000 # Memory assigned to the docker container in MB 31 | SECONDS_TO_START = 3*60 # Wait before the next CP process is initiated to avoid memory collisions 32 | 33 | # SQS QUEUE INFORMATION: 34 | SQS_QUEUE_NAME = APP_NAME + 'Queue' 35 | SQS_MESSAGE_VISIBILITY = 10*60 # Timeout (secs) for messages in flight (average time to be processed) 36 | SQS_DEAD_LETTER_QUEUE = 'ExampleProject_DeadMessages' 37 | 38 | # LOG GROUP INFORMATION: 39 | LOG_GROUP_NAME = APP_NAME 40 | 41 | # CLOUDWATCH DASHBOARD CREATION 42 | CREATE_DASHBOARD = 'True' # Create a dashboard in Cloudwatch for run 43 | CLEAN_DASHBOARD = 'True' # Automatically remove dashboard at end of run with Monitor 44 | 45 | # REDUNDANCY CHECKS 46 | CHECK_IF_DONE_BOOL = 'False' #True or False- should it check if there are a certain number of non-empty files and delete the job if yes? 47 | EXPECTED_NUMBER_FILES = 7 #What is the number of files that trigger skipping a job? 48 | MIN_FILE_SIZE_BYTES = 1 #What is the minimal number of bytes an object should be to "count"? 49 | NECESSARY_STRING = '' #Is there any string that should be in the file name to "count"? 50 | 51 | # PLUGINS 52 | USE_PLUGINS = 'False' 53 | UPDATE_PLUGINS = 'False' 54 | PLUGINS_COMMIT = '' # What commit or version tag do you want to check out? 
55 | INSTALL_REQUIREMENTS = 'False' 56 | REQUIREMENTS_FILE = '' # Path within the plugins repo to a requirements file 57 | -------------------------------------------------------------------------------- /example_project_CPG/demo_project_folder/workspace/ExampleCPG.cppipe: -------------------------------------------------------------------------------- 1 | CellProfiler Pipeline: http://www.cellprofiler.org 2 | Version:4 3 | DateRevision:426 4 | GitHash: 5 | ModuleCount:7 6 | HasImagePlaneDetails:False 7 | 8 | LoadData:[module_num:1|svn_version:'Unknown'|variable_revision_number:6|show_window:True|notes:[]|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 9 | Input data file location:Default Input Folder sub-folder|Desktop/github/Distributed-CellProfiler/example_project_CPG/project_folder/workspace 10 | Name of the file:load_data.csv 11 | Load images based on this data?:Yes 12 | Base image location:Default Input Folder| 13 | Process just a range of rows?:No 14 | Rows to process:1,100000 15 | Group images by metadata?:Yes 16 | Select metadata tags for grouping:Well,Site 17 | Rescale intensities?:Yes 18 | 19 | IdentifyPrimaryObjects:[module_num:2|svn_version:'Unknown'|variable_revision_number:15|show_window:True|notes:['Identify the nuclei from the DAPI image. ']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 20 | Select the input image:OrigDNA 21 | Name the primary objects to be identified:Nuclei 22 | Typical diameter of objects, in pixel units (Min,Max):15,90 23 | Discard objects outside the diameter range?:Yes 24 | Discard objects touching the border of the image?:Yes 25 | Method to distinguish clumped objects:Shape 26 | Method to draw dividing lines between clumped objects:Intensity 27 | Size of smoothing filter:10 28 | Suppress local maxima that are closer than this minimum allowed distance:5 29 | Speed up by using lower-resolution image to find local maxima?:Yes 30 | Fill holes in identified objects?:After declumping only 31 | Automatically calculate size of smoothing filter for declumping?:Yes 32 | Automatically calculate minimum allowed distance between local maxima?:Yes 33 | Handling of objects if excessive number of objects identified:Continue 34 | Maximum number of objects:500 35 | Use advanced settings?:Yes 36 | Threshold setting version:12 37 | Threshold strategy:Global 38 | Thresholding method:Minimum Cross-Entropy 39 | Threshold smoothing scale:1 40 | Threshold correction factor:1.0 41 | Lower and upper bounds on threshold:0.005,1 42 | Manual threshold:0.0 43 | Select the measurement to threshold with:None 44 | Two-class or three-class thresholding?:Three classes 45 | Log transform before thresholding?:No 46 | Assign pixels in the middle intensity class to the foreground or the background?:Background 47 | Size of adaptive window:10 48 | Lower outlier fraction:0.05 49 | Upper outlier fraction:0.05 50 | Averaging method:Mean 51 | Variance method:Standard deviation 52 | # of deviations:2 53 | Thresholding method:Otsu 54 | 55 | IdentifySecondaryObjects:[module_num:3|svn_version:'Unknown'|variable_revision_number:10|show_window:True|notes:['Identify the cells by using the nuclei as a "seed" region, then growing outwards until stopped by the image threshold or by a neighbor.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 56 | Select the input objects:Nuclei 57 | Name the objects to be identified:Cells 58 | Select the method to identify the secondary objects:Watershed - Image 59 | Select the input image:OrigRNA 60 | Number of 
pixels by which to expand the primary objects:10 61 | Regularization factor:0.05 62 | Discard secondary objects touching the border of the image?:No 63 | Discard the associated primary objects?:No 64 | Name the new primary objects:FilteredNuclei 65 | Fill holes in identified objects?:Yes 66 | Threshold setting version:12 67 | Threshold strategy:Global 68 | Thresholding method:Otsu 69 | Threshold smoothing scale:0 70 | Threshold correction factor:.7 71 | Lower and upper bounds on threshold:0.005,.6 72 | Manual threshold:0 73 | Select the measurement to threshold with:None 74 | Two-class or three-class thresholding?:Three classes 75 | Log transform before thresholding?:Yes 76 | Assign pixels in the middle intensity class to the foreground or the background?:Foreground 77 | Size of adaptive window:10 78 | Lower outlier fraction:0.05 79 | Upper outlier fraction:0.05 80 | Averaging method:Mean 81 | Variance method:Standard deviation 82 | # of deviations:2 83 | Thresholding method:Otsu 84 | 85 | IdentifyTertiaryObjects:[module_num:4|svn_version:'Unknown'|variable_revision_number:3|show_window:True|notes:['Identify the cytoplasm by "subtracting" the nuclei objects from the cell objects.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 86 | Select the larger identified objects:Cells 87 | Select the smaller identified objects:Nuclei 88 | Name the tertiary objects to be identified:Cytoplasm 89 | Shrink smaller object prior to subtraction?:Yes 90 | 91 | MeasureObjectSizeShape:[module_num:5|svn_version:'Unknown'|variable_revision_number:3|show_window:True|notes:['Measure morphological features from the cell, nuclei and cytoplasm objects.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 92 | Select object sets to measure:Cells, Nuclei, Cytoplasm 93 | Calculate the Zernike features?:Yes 94 | Calculate the advanced features?:No 95 | 96 | MeasureObjectIntensity:[module_num:6|svn_version:'Unknown'|variable_revision_number:4|show_window:True|notes:['Measure intensity features from nuclei, cell, and cytoplasm objects in all channels']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 97 | Select images to measure:OrigAGP, OrigDNA, OrigER, OrigMito, OrigRNA 98 | Select objects to measure:Nuclei, Cells, Cytoplasm 99 | 100 | ExportToSpreadsheet:[module_num:7|svn_version:'Unknown'|variable_revision_number:13|show_window:True|notes:['Export any measurements to a comma-delimited file (.csv). The measurements made for the nuclei, cell and cytoplasm objects will be saved to separate .csv files, in addition to the per-image .csv’s.']|batch_state:array([], dtype=uint8)|enabled:True|wants_pause:False] 101 | Select the column delimiter:Comma (",") 102 | Add image metadata columns to your object data file?:No 103 | Add image file and folder names to your object data file?:No 104 | Select the measurements to export:No 105 | Calculate the per-image mean values for object measurements?:No 106 | Calculate the per-image median values for object measurements?:No 107 | Calculate the per-image standard deviation values for object measurements?:No 108 | Output file location:Default Output Folder|. 
109 | Create a GenePattern GCT file?:No 110 | Select source of sample row name:Metadata 111 | Select the image to use as the identifier:None 112 | Select the metadata to use as the identifier:None 113 | Export all measurement types?:Yes 114 | Press button to select measurements:None|None 115 | Representation of Nan/Inf:NaN 116 | Add a prefix to file names?:No 117 | Filename prefix:MyExpt_ 118 | Overwrite existing files without warning?:Yes 119 | Data to export:Image 120 | Combine these object measurements with those of the previous object?:No 121 | File name:Image.csv 122 | Use the object name for the file name?:No 123 | Data to export:Nuclei 124 | Combine these object measurements with those of the previous object?:No 125 | File name:Nuclei.csv 126 | Use the object name for the file name?:No 127 | Data to export:Cells 128 | Combine these object measurements with those of the previous object?:No 129 | File name:Cells.csv 130 | Use the object name for the file name?:No 131 | Data to export:Cytoplasm 132 | Combine these object measurements with those of the previous object?:No 133 | File name:Cytoplasm.csv 134 | Use the object name for the file name?:No 135 | -------------------------------------------------------------------------------- /example_project_CPG/files/exampleCPGJob.json: -------------------------------------------------------------------------------- 1 | { 2 | "_comment1": "Paths in this file are relative to the root of your S3 bucket", 3 | "pipeline": "demo_project_folder/workspace/ExampleCPG.cppipe", 4 | "data_file": "demo_project_folder/workspace/load_data.csv", 5 | "input": "demo_project_folder/workspace/", 6 | "output": "demo_project_folder/output", 7 | "output_structure": "Metadata_Well-Metadata_Site", 8 | "_comment2": "The following groups are tasks, and each will be run in parallel", 9 | "groups": [ 10 | {"Metadata": "Metadata_Well=A01,Metadata_Site=1"}, 11 | {"Metadata": "Metadata_Well=A01,Metadata_Site=2"}, 12 | {"Metadata": "Metadata_Well=A02,Metadata_Site=1"} 13 | ] 14 | } 15 | 16 | -------------------------------------------------------------------------------- /files/ManualMetadata.py: -------------------------------------------------------------------------------- 1 | ''' A script to create a list of all the metadata combinations present in a given CSV 2 | This is designed to be called from the command line with 3 | $ python ManualMetadata.py pathtocsv/csvfile.csv "['Metadata_Metadata1','Metadata_Metadata2']" 4 | ''' 5 | from __future__ import print_function 6 | 7 | import pandas as pd 8 | import sys 9 | import ast 10 | 11 | csv=sys.argv[1] 12 | metadatalist=ast.literal_eval(sys.argv[2]) 13 | 14 | def manualmetadata(): 15 | incsv=pd.read_csv(csv) 16 | manmet=open(csv[:-4]+'batch.txt','w') 17 | print(incsv.shape) 18 | done=[] 19 | for i in range(incsv.shape[0]): 20 | metadatatext='{"Metadata": "' 21 | for j in metadatalist: 22 | metadatatext+=j+'='+str(incsv[j][i])+',' 23 | metadatatext=metadatatext[:-1]+'"}, \n' 24 | if metadatatext not in done: 25 | manmet.write(metadatatext) 26 | done.append(metadatatext) 27 | manmet.close() 28 | print(str(len(done)), 'batches found') 29 | manualmetadata() 30 | -------------------------------------------------------------------------------- /files/batches.sh: -------------------------------------------------------------------------------- 1 | # Command to generate batches for a single plate. 2 | # It generates 384*9 tasks, corresponding to 384 wells with 9 images each. 
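# Each emitted line is a task roughly of the form used in the "groups" list of the job files:
#   {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=1"},
# Note that as written the command enumerates two plates (Plate1 Plate2); adjust the plate
# list, the well rows/columns, and the site count to match your own experiment.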
3 | # An image is the unit of parallelization in this example. 4 | # 5 | # You need to install parallel to run this command. 6 | 7 | parallel echo '{\"Metadata\": \"Metadata_Plate={1},Metadata_Well={2}{3},Metadata_Site={4}\"},' ::: Plate1 Plate2 ::: `echo {A..P}` ::: `seq -w 24` ::: `seq -w 9` | sort > batches.txt 8 | -------------------------------------------------------------------------------- /files/exampleFleet_us-east-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "IamFleetRole": "arn:aws:iam::XXXXXXXXXXXXX:role/aws-ec2-spot-fleet-tagging-role", 3 | "AllocationStrategy": "lowestPrice", 4 | "TerminateInstancesWithExpiration": true, 5 | "LaunchSpecifications": [ 6 | { 7 | "ImageId": "ami-0ce03502c5bb5e188", 8 | "KeyName": "your_key_file_name", 9 | "IamInstanceProfile": { 10 | "Arn": "arn:aws:iam::XXXXXXXXXXXX:instance-profile/ecsInstanceRole" 11 | }, 12 | "BlockDeviceMappings": [ 13 | { 14 | "DeviceName": "/dev/xvda", 15 | "Ebs": { 16 | "DeleteOnTermination": true, 17 | "VolumeType": "gp3", 18 | "VolumeSize": 8, 19 | "SnapshotId": "snap-0a7b4ece894d62882" 20 | } 21 | }, 22 | { 23 | "DeviceName": "/dev/xvdcz", 24 | "Ebs": { 25 | "DeleteOnTermination": true, 26 | "VolumeType": "gp3" 27 | } 28 | } 29 | ], 30 | "NetworkInterfaces": [ 31 | { 32 | "DeviceIndex": 0, 33 | "SubnetId": "subnet-WWWWWWWW", 34 | "DeleteOnTermination": true, 35 | "AssociatePublicIpAddress": false, 36 | "Groups": [ 37 | "sg-ZZZZZZZZZ" 38 | ] 39 | } 40 | ] 41 | } 42 | ], 43 | "Type": "maintain" 44 | } 45 | 46 | -------------------------------------------------------------------------------- /files/exampleFleet_us-west-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "IamFleetRole": "arn:aws:iam::XXXXXXXXXXXXX:role/aws-ec2-spot-fleet-tagging-role", 3 | "AllocationStrategy": "lowestPrice", 4 | "TerminateInstancesWithExpiration": true, 5 | "LaunchSpecifications": [ 6 | { 7 | "ImageId": "ami-c9c87cb1", 8 | "KeyName": "your_key_file_name", 9 | "IamInstanceProfile": { 10 | "Arn": "arn:aws:iam::XXXXXXXXXXXX:instance-profile/ecsInstanceRole" 11 | }, 12 | "BlockDeviceMappings": [ 13 | { 14 | "DeviceName": "/dev/xvda", 15 | "Ebs": { 16 | "DeleteOnTermination": true, 17 | "VolumeType": "gp2", 18 | "VolumeSize": 8, 19 | "SnapshotId": "snap-0b52be5bdbda1ac5f" 20 | } 21 | }, 22 | { 23 | "DeviceName": "/dev/xvdcz", 24 | "Ebs": { 25 | "DeleteOnTermination": true, 26 | "VolumeType": "gp2" 27 | } 28 | } 29 | ], 30 | "NetworkInterfaces": [ 31 | { 32 | "DeviceIndex": 0, 33 | "SubnetId": "subnet-WWWWWWWW", 34 | "DeleteOnTermination": true, 35 | "AssociatePublicIpAddress": false, 36 | "Groups": [ 37 | "sg-ZZZZZZZZZ" 38 | ] 39 | } 40 | ] 41 | } 42 | ], 43 | "Type": "maintain" 44 | } 45 | 46 | -------------------------------------------------------------------------------- /files/exampleJob.json: -------------------------------------------------------------------------------- 1 | { 2 | "_comment1": "Paths in this file are relative to the root of S3 buckets", 3 | "_comment2": "pipeline, data_file, and input are relative to SOURCE_BUCKET; output to DESTINATION_BUCKET", 4 | "pipeline": "projects/analysis.cppipe", 5 | "data_file": "projects/list_of_images.csv", 6 | "input": "projects/input/", 7 | "output": "projects/output/", 8 | "output_structure": "Metadata_Plate-Metadata_Well-Metadata_Site", 9 | "_comment2": "The following groups are tasks, and each will be run in parallel", 10 | "groups": [ 11 | {"Metadata": 
"Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=1"}, 12 | {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=2"} 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /files/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.0.0 2 | -------------------------------------------------------------------------------- /lambda_function.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import datetime 3 | import botocore 4 | import json 5 | 6 | s3 = boto3.client("s3") 7 | ecs = boto3.client("ecs") 8 | ec2 = boto3.client("ec2") 9 | cloudwatch = boto3.client("cloudwatch") 10 | sqs = boto3.client("sqs") 11 | 12 | bucket = "BUCKET_NAME" 13 | 14 | 15 | def killdeadAlarms(fleetId, project): 16 | checkdates = [ 17 | datetime.datetime.now().strftime("%Y-%m-%d"), 18 | (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), 19 | ] 20 | todel = [] 21 | for eachdate in checkdates: 22 | datedead = ec2.describe_spot_fleet_request_history( 23 | SpotFleetRequestId=fleetId, StartTime=eachdate 24 | ) 25 | for eachevent in datedead["HistoryRecords"]: 26 | if eachevent["EventType"] == "instanceChange": 27 | if eachevent["EventInformation"]["EventSubType"] == "terminated": 28 | todel.append(eachevent["EventInformation"]["InstanceId"]) 29 | todel = [f"{project}_{x}" for x in todel] 30 | while len(todel) > 100: 31 | dellist = todel[:100] 32 | cloudwatch.delete_alarms(AlarmNames=dellist) 33 | todel = todel[100:] 34 | if len(todel) <= 100: 35 | cloudwatch.delete_alarms(AlarmNames=todel) 36 | print("Old alarms deleted") 37 | 38 | 39 | def seeIfLogExportIsDone(logExportId): 40 | while True: 41 | result = cloudwatch.describe_export_tasks(taskId=logExportId) 42 | if result["exportTasks"][0]["status"]["code"] != "PENDING": 43 | if result["exportTasks"][0]["status"]["code"] != "RUNNING": 44 | print(result["exportTasks"][0]["status"]["code"]) 45 | break 46 | time.sleep(30) 47 | 48 | 49 | def downscaleSpotFleet(nonvisible, spotFleetID): 50 | status = ec2.describe_spot_fleet_instances(SpotFleetRequestId=spotFleetID) 51 | if nonvisible < len(status["ActiveInstances"]): 52 | ec2.modify_spot_fleet_request( 53 | ExcessCapacityTerminationPolicy="noTermination", 54 | TargetCapacity=str(nonvisible), 55 | SpotFleetRequestId=spotFleetID, 56 | ) 57 | 58 | 59 | def check_sqs_queue(queueName): 60 | response = sqs.get_queue_url(QueueName=queueName) 61 | queueUrl = response["QueueUrl"] 62 | response = sqs.get_queue_attributes( 63 | QueueUrl=queueUrl, 64 | AttributeNames=[ 65 | "ApproximateNumberOfMessages", 66 | "ApproximateNumberOfMessagesNotVisible", 67 | ], 68 | ) 69 | visible = int(response["Attributes"]["ApproximateNumberOfMessages"]) 70 | nonvisible = int(response["Attributes"]["ApproximateNumberOfMessagesNotVisible"]) 71 | print( 72 | f"Found {visible} visible messages and {nonvisible} nonvisible messages in queue." 
73 | ) 74 | return visible, nonvisible 75 | 76 | 77 | def lambda_handler(event, lambda_context): 78 | # Triggered any time SQS queue ApproximateNumberOfMessagesVisible = 0 79 | # OR ApproximateNumberOfMessagesNotVisible = 0 80 | messagestring = event["Records"][0]["Sns"]["Message"] 81 | messagedict = json.loads(messagestring) 82 | queueName = messagedict["Trigger"]["Dimensions"][0]["value"] 83 | project = queueName.rsplit("_", 1)[0] 84 | 85 | # Download monitor file 86 | monitor_file_name = f"{queueName.split('Queue')[0]}SpotFleetRequestId.json" 87 | monitor_local_name = f"/tmp/{monitor_file_name}" 88 | monitor_on_bucket_name = f"monitors/{monitor_file_name}" 89 | 90 | with open(monitor_local_name, "wb") as f: 91 | try: 92 | s3.download_fileobj(bucket, monitor_on_bucket_name, f) 93 | except botocore.exceptions.ClientError as error: 94 | print("Error retrieving monitor file.") 95 | return 96 | with open(monitor_local_name, "r") as input: 97 | monitorInfo = json.load(input) 98 | 99 | monitorcluster = monitorInfo["MONITOR_ECS_CLUSTER"] 100 | monitorapp = monitorInfo["MONITOR_APP_NAME"] 101 | fleetId = monitorInfo["MONITOR_FLEET_ID"] 102 | loggroupId = monitorInfo["MONITOR_LOG_GROUP_NAME"] 103 | CLEAN_DASHBOARD = monitorInfo["CLEAN_DASHBOARD"] 104 | print(f"Monitor triggered for {monitorcluster} {monitorapp} {fleetId} {loggroupId}") 105 | 106 | visible, nonvisible = check_sqs_queue(queueName) 107 | 108 | # If no visible messages, downscale machines 109 | if visible == 0 and nonvisible > 0: 110 | print("No visible messages. Tidying as we go.") 111 | killdeadAlarms(fleetId, project) 112 | downscaleSpotFleet(nonvisible, fleetId) 113 | 114 | # If no messages in progress, cleanup 115 | if visible == 0 and nonvisible == 0: 116 | print("No messages in progress. Cleaning up.") 117 | ecs.update_service( 118 | cluster=monitorcluster, 119 | service=f"{monitorapp}Service", 120 | desiredCount=0, 121 | ) 122 | print("Service has been downscaled") 123 | 124 | # Delete the alarms from active machines and machines that have died. 
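        # CloudWatch delete_alarms accepts at most 100 alarm names per call, which is why
        # this block (and killdeadAlarms above) deletes alarms in batches of 100.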
125 |         active_dictionary = ec2.describe_spot_fleet_instances(
126 |             SpotFleetRequestId=fleetId
127 |         )
128 |         active_instances = []
129 |         for instance in active_dictionary["ActiveInstances"]:
130 |             active_instances.append(instance["InstanceId"])
131 |         while len(active_instances) > 100:
132 |             dellist = active_instances[:100]
133 |             cloudwatch.delete_alarms(AlarmNames=dellist)
134 |             active_instances = active_instances[100:]
135 |         if len(active_instances) <= 100:
136 |             cloudwatch.delete_alarms(AlarmNames=active_instances)
137 |         killdeadAlarms(fleetId, project)
138 | 
139 |         # Read spot fleet id and terminate all EC2 instances
140 |         ec2.cancel_spot_fleet_requests(
141 |             SpotFleetRequestIds=[fleetId], TerminateInstances=True
142 |         )
143 |         print("Fleet shut down.")
144 | 
145 |         # Remove SQS queue, ECS Task Definition, ECS Service
146 |         ECS_TASK_NAME = monitorapp + "Task"
147 |         ECS_SERVICE_NAME = monitorapp + "Service"
148 | 
149 |         print("Deleting existing queue.")
150 |         queueoutput = sqs.list_queues(QueueNamePrefix=queueName)
151 |         try:
152 |             if len(queueoutput["QueueUrls"]) == 1:
153 |                 queueUrl = queueoutput["QueueUrls"][0]
154 |             else:  # In case we have "AnalysisQueue" and "AnalysisQueue1" and only want to delete the first of those
155 |                 for eachUrl in queueoutput["QueueUrls"]:
156 |                     if eachUrl.split("/")[-1] == queueName:
157 |                         queueUrl = eachUrl
158 |             sqs.delete_queue(QueueUrl=queueUrl)
159 |         except KeyError:
160 |             print("Can't find queue to delete.")
161 | 
162 |         print("Deleting service")
163 |         try:
164 |             ecs.delete_service(cluster=monitorcluster, service=ECS_SERVICE_NAME)
165 |         except Exception:
166 |             print("Couldn't delete service.")
167 | 
168 |         print("De-registering task")
169 |         taskArns = ecs.list_task_definitions(familyPrefix=ECS_TASK_NAME)
170 |         for eachtask in taskArns["taskDefinitionArns"]:
171 |             fulltaskname = eachtask.split("/")[-1]
172 |             ecs.deregister_task_definition(taskDefinition=fulltaskname)
173 | 
174 |         print("Removing cluster if it's not the default and not otherwise in use")
175 |         if monitorcluster != "default":
176 |             result = ecs.describe_clusters(clusters=[monitorcluster])
177 |             if (
178 |                 sum(
179 |                     [
180 |                         result["clusters"][0]["pendingTasksCount"],
181 |                         result["clusters"][0]["runningTasksCount"],
182 |                         result["clusters"][0]["activeServicesCount"],
183 |                     ]
184 |                 )
185 |                 == 0
186 |             ):
187 |                 ecs.delete_cluster(cluster=monitorcluster)
188 | 
189 |         # Remove alarms that triggered monitor
190 |         print("Removing alarms that triggered Monitor")
191 |         cloudwatch.delete_alarms(
192 |             AlarmNames=[
193 |                 f"ApproximateNumberOfMessagesVisibleisZero_{monitorapp}",
194 |                 f"ApproximateNumberOfMessagesNotVisibleisZero_{monitorapp}",
195 |             ]
196 |         )
197 | 
198 |         # Remove CloudWatch dashboard if created and cleanup desired
199 |         if CLEAN_DASHBOARD.lower() == "true":
200 |             dashboard_list = cloudwatch.list_dashboards()
201 |             for entry in dashboard_list["DashboardEntries"]:
202 |                 if monitorapp in entry["DashboardName"]:
203 |                     cloudwatch.delete_dashboards(
204 |                         DashboardNames=[entry["DashboardName"]]
205 |                     )
206 | 
207 |         # Delete monitor file
208 |         s3.delete_object(Bucket=bucket, Key=monitor_on_bucket_name)
--------------------------------------------------------------------------------
/python2worker/Dockerfile:
--------------------------------------------------------------------------------
1 | #
2 | # - [ BROAD'16 ] -
3 | #
4 | # A docker instance for accessing AWS resources
5 | # This wraps the cellprofiler docker registry
6 | #
7 | 
8 | 
9 | FROM cellprofiler/cellprofiler:3.1.9
10 | 
11 | #
Install S3FS 12 | 13 | RUN apt-get -y update && \ 14 | apt-get -y upgrade && \ 15 | apt-get -y install \ 16 | automake \ 17 | autotools-dev \ 18 | g++ \ 19 | git \ 20 | libcurl4-gnutls-dev \ 21 | libfuse-dev \ 22 | libssl-dev \ 23 | libxml2-dev \ 24 | make pkg-config \ 25 | sysstat \ 26 | curl 27 | 28 | WORKDIR /usr/local/src 29 | RUN git clone https://github.com/s3fs-fuse/s3fs-fuse.git 30 | WORKDIR /usr/local/src/s3fs-fuse 31 | RUN ./autogen.sh 32 | RUN ./configure 33 | RUN make 34 | RUN make install 35 | 36 | # Install AWS CLI 37 | 38 | RUN \ 39 | pip install awscli 40 | 41 | # Install boto3 42 | 43 | RUN \ 44 | pip install -U boto3 45 | 46 | # Install watchtower for logging 47 | 48 | RUN \ 49 | pip install watchtower==0.8.0 50 | 51 | # Install pandas for optional file downloading 52 | 53 | RUN pip install pandas==0.24.2 54 | 55 | # SETUP NEW ENTRYPOINT 56 | 57 | RUN mkdir -p /home/ubuntu/ 58 | WORKDIR /home/ubuntu 59 | COPY cp-worker.py . 60 | COPY instance-monitor.py . 61 | COPY run-worker.sh . 62 | RUN chmod 755 run-worker.sh 63 | 64 | RUN git clone https://github.com/CellProfiler/CellProfiler-plugins.git 65 | WORKDIR /home/ubuntu/CellProfiler-plugins 66 | #RUN pip install -r requirements.txt 67 | 68 | WORKDIR /home/ubuntu 69 | ENTRYPOINT ["./run-worker.sh"] 70 | CMD [""] 71 | -------------------------------------------------------------------------------- /python2worker/Makefile: -------------------------------------------------------------------------------- 1 | user = cellprofiler 2 | project = distributed-cellprofiler 3 | tag = 2.0.0_3.1.9 4 | 5 | .DEFAULT_GOAL: build 6 | build: 7 | docker build -t $(user)/$(project):$(tag) . 8 | docker login -u $(user) 9 | docker push $(user)/$(project):$(tag) 10 | echo "Your new docker registry is ready at: $(user)/$(project):$(tag)" 11 | -------------------------------------------------------------------------------- /python2worker/cp-worker.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import boto3 3 | import glob 4 | import json 5 | import logging 6 | import os 7 | import re 8 | import subprocess 9 | import sys 10 | import time 11 | import watchtower 12 | import string 13 | 14 | ################################# 15 | # CONSTANT PATHS IN THE CONTAINER 16 | ################################# 17 | 18 | DATA_ROOT = '/home/ubuntu/bucket' 19 | LOCAL_OUTPUT = '/home/ubuntu/local_output' 20 | PLUGIN_DIR = '/home/ubuntu/CellProfiler-plugins' 21 | QUEUE_URL = os.environ['SQS_QUEUE_URL'] 22 | AWS_BUCKET = os.environ['AWS_BUCKET'] 23 | LOG_GROUP_NAME= os.environ['LOG_GROUP_NAME'] 24 | CHECK_IF_DONE_BOOL= os.environ['CHECK_IF_DONE_BOOL'] 25 | EXPECTED_NUMBER_FILES= os.environ['EXPECTED_NUMBER_FILES'] 26 | if 'MIN_FILE_SIZE_BYTES' not in os.environ: 27 | MIN_FILE_SIZE_BYTES = 1 28 | else: 29 | MIN_FILE_SIZE_BYTES = int(os.environ['MIN_FILE_SIZE_BYTES']) 30 | if 'USE_PLUGINS' not in os.environ: 31 | USE_PLUGINS = 'False' 32 | else: 33 | USE_PLUGINS = os.environ['USE_PLUGINS'] 34 | if 'NECESSARY_STRING' not in os.environ: 35 | NECESSARY_STRING = False 36 | else: 37 | NECESSARY_STRING = os.environ['NECESSARY_STRING'] 38 | if 'DOWNLOAD_FILES' not in os.environ: 39 | DOWNLOAD_FILES = False 40 | else: 41 | DOWNLOAD_FILES = os.environ['DOWNLOAD_FILES'] 42 | 43 | localIn = '/home/ubuntu/local_input' 44 | 45 | 46 | ################################# 47 | # CLASS TO HANDLE THE SQS QUEUE 48 | ################################# 49 | 50 | class JobQueue(): 51 | 52 | def __init__(self, 
queueURL): 53 | self.client = boto3.client('sqs') 54 | self.queueURL = queueURL 55 | 56 | def readMessage(self): 57 | response = self.client.receive_message(QueueUrl=self.queueURL, WaitTimeSeconds=20) 58 | if 'Messages' in response.keys(): 59 | data = json.loads(response['Messages'][0]['Body']) 60 | handle = response['Messages'][0]['ReceiptHandle'] 61 | return data, handle 62 | else: 63 | return None, None 64 | 65 | def deleteMessage(self, handle): 66 | self.client.delete_message(QueueUrl=self.queueURL, ReceiptHandle=handle) 67 | return 68 | 69 | def returnMessage(self, handle): 70 | self.client.change_message_visibility(QueueUrl=self.queueURL, ReceiptHandle=handle, VisibilityTimeout=60) 71 | return 72 | 73 | ################################# 74 | # AUXILIARY FUNCTIONS 75 | ################################# 76 | 77 | 78 | def monitorAndLog(process,logger): 79 | while True: 80 | output= process.stdout.readline() 81 | if output== '' and process.poll() is not None: 82 | break 83 | if output: 84 | print(output.strip()) 85 | logger.info(output) 86 | 87 | def printandlog(text,logger): 88 | print(text) 89 | logger.info(text) 90 | 91 | ################################# 92 | # RUN CELLPROFILER PROCESS 93 | ################################# 94 | 95 | def runCellProfiler(message): 96 | #List the directories in the bucket- this prevents a strange s3fs error 97 | rootlist=os.listdir(DATA_ROOT) 98 | for eachSubDir in rootlist: 99 | subDirName=os.path.join(DATA_ROOT,eachSubDir) 100 | if os.path.isdir(subDirName): 101 | trashvar=os.system('ls '+subDirName) 102 | 103 | # Configure the logs 104 | logger = logging.getLogger(__name__) 105 | 106 | # Prepare paths and parameters 107 | if type(message['Metadata'])==dict: #support for cellprofiler --print-groups output 108 | if message['output_structure']=='': 109 | watchtowerlogger=watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=str(message['Metadata'].values()),create_log_group=False) 110 | logger.addHandler(watchtowerlogger) 111 | printandlog('You must specify an output structure when passing Metadata as dictionaries',logger) 112 | logger.removeHandler(watchtowerlogger) 113 | return 'INPUT_PROBLEM' 114 | else: 115 | metadataID = message['output_structure'] 116 | metadataForCall = '' 117 | for eachMetadata in message['Metadata'].keys(): 118 | if eachMetadata not in metadataID: 119 | watchtowerlogger=watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=str(message['Metadata'].values()),create_log_group=False) 120 | logger.addHandler(watchtowerlogger) 121 | printandlog('Your specified output structure does not match the Metadata passed',logger) 122 | logger.removeHandler(watchtowerlogger) 123 | else: 124 | metadataID = string.replace(metadataID,eachMetadata,message['Metadata'][eachMetadata]) 125 | metadataForCall+=eachMetadata+'='+message['Metadata'][eachMetadata]+',' 126 | message['Metadata']=metadataForCall[:-1] 127 | elif 'output_structure' in message.keys(): 128 | if message['output_structure']!='': #support for explicit output structuring 129 | watchtowerlogger=watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=message['Metadata'],create_log_group=False) 130 | logger.addHandler(watchtowerlogger) 131 | metadataID = message['output_structure'] 132 | for eachMetadata in message['Metadata'].split(','): 133 | if eachMetadata.split('=')[0] not in metadataID: 134 | printandlog('Your specified output structure does not match the Metadata passed',logger) 135 | else: 136 | metadataID = 
string.replace(metadataID,eachMetadata.split('=')[0],eachMetadata.split('=')[1]) 137 | printandlog('metadataID ='+metadataID, logger) 138 | logger.removeHandler(watchtowerlogger) 139 | else: #backwards compatability with 1.0.0 and/or no desire to structure output 140 | metadataID = '-'.join([x.split('=')[1] for x in message['Metadata'].split(',')]) # Strip equal signs from the metadata 141 | else: #backwards compatability with 1.0.0 and/or no desire to structure output 142 | metadataID = '-'.join([x.split('=')[1] for x in message['Metadata'].split(',')]) # Strip equal signs from the metadata 143 | 144 | localOut = LOCAL_OUTPUT + '/%(MetadataID)s' % {'MetadataID': metadataID} 145 | remoteOut= os.path.join(message['output'],metadataID) 146 | replaceValues = {'PL':message['pipeline'], 'OUT':localOut, 'FL':message['data_file'], 147 | 'DATA': DATA_ROOT, 'Metadata': message['Metadata'], 'IN': message['input'], 148 | 'MetadataID':metadataID, 'PLUGINS':PLUGIN_DIR } 149 | 150 | # Start loggging now that we have a job we care about 151 | watchtowerlogger=watchtower.CloudWatchLogHandler(log_group=LOG_GROUP_NAME, stream_name=metadataID,create_log_group=False) 152 | logger.addHandler(watchtowerlogger) 153 | 154 | # See if this is a message you've already handled, if you've so chosen 155 | if CHECK_IF_DONE_BOOL.upper() == 'TRUE': 156 | try: 157 | s3client=boto3.client('s3') 158 | bucketlist=s3client.list_objects(Bucket=AWS_BUCKET,Prefix=remoteOut+'/') 159 | objectsizelist=[k['Size'] for k in bucketlist['Contents']] 160 | objectsizelist = [i for i in objectsizelist if i >= MIN_FILE_SIZE_BYTES] 161 | if NECESSARY_STRING: 162 | if NECESSARY_STRING != '': 163 | objectsizelist = [i for i in objectsizelist if NECESSARY_STRING in i] 164 | if len(objectsizelist)>=int(EXPECTED_NUMBER_FILES): 165 | printandlog('File not run due to > expected number of files',logger) 166 | logger.removeHandler(watchtowerlogger) 167 | return 'SUCCESS' 168 | except KeyError: #Returned if that folder does not exist 169 | pass 170 | 171 | csv_name = os.path.join(DATA_ROOT,message['data_file']) 172 | downloaded_files = [] 173 | 174 | # Optional- download files 175 | if DOWNLOAD_FILES: 176 | if DOWNLOAD_FILES.lower() == 'true': 177 | printandlog('Figuring which files to download', logger) 178 | import pandas 179 | s3 = boto3.resource('s3') 180 | if not os.path.exists(localIn): 181 | os.mkdir(localIn) 182 | csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file'])) 183 | csv_in=csv_in.astype('str') 184 | #Figure out what metadata fields we need in this experiment, as a dict 185 | if type(message['Metadata'])==dict: 186 | filter_dict = message['Metadata'] 187 | else: 188 | filter_dict = {} 189 | for eachMetadata in message['Metadata'].split(','): 190 | filterkey, filterval = eachMetadata.split('=') 191 | filter_dict[filterkey] = filterval 192 | #Filter our CSV to just the rows CellProfiler will process, so that we can download only what we need 193 | for eachfilter in filter_dict.keys(): 194 | csv_in = csv_in[csv_in[eachfilter] == filter_dict[eachfilter]] 195 | #Figure out the actual file names and get them 196 | channel_list = [x.split('FileName_')[1] for x in csv_in.columns if 'FileName' in x] 197 | count = 0 198 | printandlog('Downloading files', logger) 199 | for channel in channel_list: 200 | for field in range(csv_in.shape[0]): 201 | full_old_file_name = os.path.join(list(csv_in['PathName_'+channel])[field],list(csv_in['FileName_'+channel])[field]) 202 | prefix_on_bucket = full_old_file_name.split(DATA_ROOT)[1][1:] 203 | 
new_file_name = os.path.join(localIn,prefix_on_bucket) 204 | if not os.path.exists(os.path.split(new_file_name)[0]): 205 | os.makedirs(os.path.split(new_file_name)[0]) 206 | printandlog('made directory '+os.path.split(new_file_name)[0],logger) 207 | if not os.path.exists(new_file_name): 208 | s3.meta.client.download_file(AWS_BUCKET,prefix_on_bucket,new_file_name) 209 | downloaded_files.append(new_file_name) 210 | printandlog('Downloaded '+str(len(downloaded_files))+' files',logger) 211 | import random 212 | newtag = False 213 | while newtag == False: 214 | tag = str(random.randint(100000,999999)) #keep files from overwriting one another 215 | local_csv_name = os.path.join(localIn,tag,os.path.split(csv_name)[1]) 216 | if not os.path.exists(local_csv_name): 217 | if not os.path.exists(os.path.split(local_csv_name)[0]): 218 | os.makedirs(os.path.split(local_csv_name)[0]) 219 | csv_in = pandas.read_csv(os.path.join(DATA_ROOT,message['data_file'])) 220 | csv_in.replace(DATA_ROOT,localIn,regex=True, inplace=True) 221 | csv_in.to_csv(local_csv_name,index=False) 222 | print('Wrote updated CSV') 223 | newtag = True 224 | else: 225 | newtag = False 226 | csv_name = local_csv_name 227 | 228 | # Build and run CellProfiler command 229 | cpDone = localOut + '/cp.is.done' 230 | cp2 = False 231 | with open(os.path.join(replaceValues['DATA'],replaceValues['PL']), 'r') as openpipe: 232 | for line in openpipe: 233 | if 'DateRevision:2' in line: #comes from a CP2 pipeline 234 | cp2 = True 235 | cmdstem = 'cellprofiler -c -r -b ' 236 | if not cp2: 237 | cmdstem = 'cellprofiler -c -r ' 238 | if message['pipeline'][-3:]!='.h5': 239 | cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone 240 | cmd += ' --data-file='+csv_name+' ' 241 | cmd += '-g %(Metadata)s' 242 | else: 243 | cmd = cmdstem + '-p %(DATA)s/%(PL)s -i %(DATA)s/%(IN)s -o %(OUT)s -d ' + cpDone + ' -g %(Metadata)s' 244 | if USE_PLUGINS.lower() == 'true': 245 | cmd += ' --plugins-directory=%(PLUGINS)s' 246 | cmd = cmd % replaceValues 247 | print('Running', cmd) 248 | logger.info(cmd) 249 | 250 | subp = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 251 | monitorAndLog(subp,logger) 252 | 253 | # Get the outputs and move them to S3 254 | if os.path.isfile(cpDone): 255 | time.sleep(30) 256 | if len(downloaded_files) > 0: 257 | for eachfile in downloaded_files: 258 | if os.path.exists(eachfile): #Shared files are possible, and might already be cleaned up 259 | os.remove(eachfile) 260 | mvtries=0 261 | while mvtries <3: 262 | try: 263 | printandlog('Move attempt #'+str(mvtries+1),logger) 264 | cmd = 'aws s3 mv ' + localOut + ' s3://' + AWS_BUCKET + '/' + remoteOut + ' --recursive --exclude=cp.is.done' 265 | subp = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) 266 | out,err = subp.communicate() 267 | printandlog('== OUT \n'+out, logger) 268 | if err == '': 269 | break 270 | else: 271 | printandlog('== ERR \n'+err,logger) 272 | mvtries+=1 273 | except: 274 | printandlog('Move failed',logger) 275 | printandlog('== ERR \n'+err,logger) 276 | time.sleep(30) 277 | mvtries+=1 278 | if next(open(cpDone))=='Complete\n': 279 | if mvtries<3: 280 | printandlog('SUCCESS',logger) 281 | logger.removeHandler(watchtowerlogger) 282 | return 'SUCCESS' 283 | else: 284 | printandlog('OUTPUT PROBLEM. 
Giving up on '+metadataID,logger) 285 | logger.removeHandler(watchtowerlogger) 286 | return 'OUTPUT_PROBLEM' 287 | else: 288 | printandlog('CP PROBLEM: Done file reports failure',logger) 289 | logger.removeHandler(watchtowerlogger) 290 | return 'CP_PROBLEM' 291 | else: 292 | printandlog('CP PROBLEM: Done file does not exist.',logger) 293 | logger.removeHandler(watchtowerlogger) 294 | import shutil 295 | shutil.rmtree(localOut, ignore_errors=True) 296 | return 'CP_PROBLEM' 297 | 298 | 299 | ################################# 300 | # MAIN WORKER LOOP 301 | ################################# 302 | 303 | def main(): 304 | queue = JobQueue(QUEUE_URL) 305 | # Main loop. Keep reading messages while they are available in SQS 306 | while True: 307 | msg, handle = queue.readMessage() 308 | if msg is not None: 309 | result = runCellProfiler(msg) 310 | if result == 'SUCCESS': 311 | print('Batch completed successfully.') 312 | queue.deleteMessage(handle) 313 | else: 314 | print('Returning message to the queue.') 315 | queue.returnMessage(handle) 316 | else: 317 | print('No messages in the queue') 318 | break 319 | 320 | ################################# 321 | # MODULE ENTRY POINT 322 | ################################# 323 | 324 | if __name__ == '__main__': 325 | logging.basicConfig(level=logging.INFO) 326 | print('Worker started') 327 | main() 328 | print('Worker finished') 329 | 330 | -------------------------------------------------------------------------------- /python2worker/instance-monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 12 17:39:49 2016 4 | 5 | @author: bcimini 6 | """ 7 | 8 | import subprocess 9 | import time 10 | import logging 11 | 12 | def monitor(): 13 | logger = logging.getLogger(__name__) 14 | while True: 15 | cmdlist=['df -h', 'df -i -h','vmstat -a -SM 1 1', 'iostat'] 16 | for cmd in cmdlist: 17 | process=subprocess.Popen(cmd.split(),stdout=subprocess.PIPE) 18 | out,err=process.communicate() 19 | logger.info(out) 20 | time.sleep(30) 21 | 22 | if __name__=='__main__': 23 | logging.basicConfig(level=logging.INFO) 24 | monitor() 25 | -------------------------------------------------------------------------------- /python2worker/run-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Region $AWS_REGION" 4 | echo "Queue $SQS_QUEUE_URL" 5 | echo "Bucket $AWS_BUCKET" 6 | 7 | # 1. 
CONFIGURE AWS CLI 8 | aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID 9 | aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY 10 | aws configure set default.region $AWS_REGION 11 | MY_INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id) 12 | echo "Instance ID $MY_INSTANCE_ID" 13 | OWNER_ID=$(aws ec2 describe-instances --instance-ids $MY_INSTANCE_ID --output text --query 'Reservations[0].[OwnerId]') 14 | aws ec2 create-tags --resources $MY_INSTANCE_ID --tags Key=Name,Value=${APP_NAME}Worker 15 | VOL_0_ID=$(aws ec2 describe-instance-attribute --instance-id $MY_INSTANCE_ID --attribute blockDeviceMapping --output text --query BlockDeviceMappings[0].Ebs.[VolumeId]) 16 | aws ec2 create-tags --resources $VOL_0_ID --tags Key=Name,Value=${APP_NAME}Worker 17 | VOL_1_ID=$(aws ec2 describe-instance-attribute --instance-id $MY_INSTANCE_ID --attribute blockDeviceMapping --output text --query BlockDeviceMappings[1].Ebs.[VolumeId]) 18 | aws ec2 create-tags --resources $VOL_1_ID --tags Key=Name,Value=${APP_NAME}Worker 19 | 20 | # 2. MOUNT S3 21 | echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > /credentials.txt 22 | chmod 600 /credentials.txt 23 | mkdir -p /home/ubuntu/bucket 24 | mkdir -p /home/ubuntu/local_output 25 | stdbuf -o0 s3fs $AWS_BUCKET /home/ubuntu/bucket -o passwd_file=/credentials.txt 26 | 27 | # 3. SET UP ALARMS 28 | aws cloudwatch put-metric-alarm --alarm-name ${APP_NAME}_${MY_INSTANCE_ID} --alarm-actions arn:aws:swf:${AWS_REGION}:${OWNER_ID}:action/actions/AWS_EC2.InstanceId.Terminate/1.0 --statistic Maximum --period 60 --threshold 1 --comparison-operator LessThanThreshold --metric-name CPUUtilization --namespace AWS/EC2 --evaluation-periods 15 --dimensions "Name=InstanceId,Value=${MY_INSTANCE_ID}" 29 | 30 | # 4. RUN VM STAT MONITOR 31 | 32 | python instance-monitor.py & 33 | 34 | # 5. 
RUN CP WORKERS 35 | for((k=0; k<$DOCKER_CORES; k++)); do 36 | python cp-worker.py |& tee $k.out & 37 | sleep $SECONDS_TO_START 38 | done 39 | wait 40 | 41 | -------------------------------------------------------------------------------- /setup_AWS.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import boto3 3 | import json 4 | import shutil 5 | import os 6 | 7 | iam = boto3.client("iam") 8 | sns = boto3.client("sns") 9 | lmbda = boto3.client("lambda") 10 | 11 | ecsInstanceRole_policy_list = [ 12 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 13 | "arn:aws:iam::aws:policy/CloudWatchFullAccess", 14 | "arn:aws:iam::aws:policy/CloudWatchActionsEC2Access", 15 | "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role", 16 | "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceRole", 17 | ] 18 | LambdaFullAccess_policy_list = [ 19 | "arn:aws:iam::aws:policy/AWSLambda_FullAccess", 20 | "arn:aws:iam::aws:policy/AmazonSNSFullAccess", 21 | "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole", 22 | "arn:aws:iam::aws:policy/service-role/AWSLambdaSQSQueueExecutionRole", 23 | "arn:aws:iam::aws:policy/AWSLambdaExecute", 24 | "arn:aws:iam::aws:policy/AmazonECS_FullAccess", 25 | "arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole", 26 | "arn:aws:iam::aws:policy/AmazonS3FullAccess", 27 | "arn:aws:iam::aws:policy/AmazonSQSFullAccess", 28 | "arn:aws:iam::aws:policy/CloudWatchFullAccess", 29 | ] 30 | 31 | 32 | def setup(): 33 | # Create ECS Instance Role 34 | assume_role_policy_document = json.dumps( 35 | { 36 | "Version": "2008-10-17", 37 | "Statement": [ 38 | { 39 | "Sid": "", 40 | "Effect": "Allow", 41 | "Principal": {"Service": "ec2.amazonaws.com"}, 42 | "Action": "sts:AssumeRole", 43 | } 44 | ], 45 | } 46 | ) 47 | try: 48 | iam.create_role( 49 | RoleName="ecsInstanceRole", 50 | AssumeRolePolicyDocument=assume_role_policy_document, 51 | ) 52 | for arn in ecsInstanceRole_policy_list: 53 | iam.attach_role_policy( 54 | PolicyArn=arn, 55 | RoleName="ecsInstanceRole", 56 | ) 57 | print("Created ecsInstanceRole.") 58 | except iam.exceptions.EntityAlreadyExistsException: 59 | print("Skipping creation of ecsInstanceRole. Already exists.") 60 | try: 61 | iam.create_instance_profile(InstanceProfileName="ecsInstanceRole") 62 | except iam.exceptions.EntityAlreadyExistsException: 63 | print("Skipping creation of ecsInstanceProfile. Already exists.") 64 | try: 65 | iam.add_role_to_instance_profile( 66 | InstanceProfileName="ecsInstanceRole", RoleName="ecsInstanceRole" 67 | ) 68 | except iam.exceptions.LimitExceededException: 69 | print("Instance Profile already added to Instance Role") 70 | 71 | # Create EC2 Spot Fleet Tagging Role 72 | assume_role_policy_document = json.dumps( 73 | { 74 | "Version": "2012-10-17", 75 | "Statement": [ 76 | { 77 | "Sid": "", 78 | "Effect": "Allow", 79 | "Principal": {"Service": "spotfleet.amazonaws.com"}, 80 | "Action": "sts:AssumeRole", 81 | } 82 | ], 83 | } 84 | ) 85 | try: 86 | iam.create_role( 87 | RoleName="aws-ec2-spot-fleet-tagging-role", 88 | AssumeRolePolicyDocument=assume_role_policy_document, 89 | ) 90 | iam.attach_role_policy( 91 | PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole", 92 | RoleName="aws-ec2-spot-fleet-tagging-role", 93 | ) 94 | print("Created aws-ec2-spot-fleet-tagging-role.") 95 | except iam.exceptions.EntityAlreadyExistsException: 96 | print("Skipping creation of aws-ec2-spot-fleet-tagging-role. 
Already exists.") 97 | 98 | # Create Lambda Full Access Role 99 | assume_role_policy_document = json.dumps( 100 | { 101 | "Version": "2012-10-17", 102 | "Statement": [ 103 | { 104 | "Sid": "", 105 | "Effect": "Allow", 106 | "Principal": {"Service": "lambda.amazonaws.com"}, 107 | "Action": "sts:AssumeRole", 108 | } 109 | ], 110 | } 111 | ) 112 | try: 113 | iam.create_role( 114 | RoleName="LambdaFullAccess", 115 | AssumeRolePolicyDocument=assume_role_policy_document, 116 | ) 117 | for arn in LambdaFullAccess_policy_list: 118 | iam.attach_role_policy( 119 | PolicyArn=arn, 120 | RoleName="LambdaFullAccess", 121 | ) 122 | print("Created LambdaFullAccess role.") 123 | except iam.exceptions.EntityAlreadyExistsException: 124 | print("Skipping creation of LambdaFullAccess role. Already exists.") 125 | 126 | # Create SNS Monitor topic 127 | MonitorTopic = sns.create_topic(Name="Monitor") 128 | print("(Re-)Created Monitor SNS Topic.") 129 | 130 | # Create Monitor Lambda function 131 | LambdaFullAccess = iam.get_role(RoleName="LambdaFullAccess") 132 | 133 | shutil.make_archive("lambda_function", "zip", os.getcwd()) 134 | fxn = open("lambda_function.zip", "rb").read() 135 | try: 136 | MonitorFunction = lmbda.create_function( 137 | FunctionName="Monitor", 138 | Runtime="python3.9", 139 | Role=LambdaFullAccess["Role"]["Arn"], 140 | Handler="lambda_function.lambda_handler", 141 | Code={ 142 | "ZipFile": fxn, 143 | }, 144 | Description="Auto-monitor DS runs", 145 | Timeout=900, 146 | MemorySize=3008, 147 | Publish=True, 148 | PackageType="Zip", 149 | TracingConfig={"Mode": "PassThrough"}, 150 | Architectures=["x86_64"], 151 | EphemeralStorage={"Size": 512}, 152 | ) 153 | # Subscribe Monitor Lambda to Monitor Topic 154 | sns.subscribe( 155 | TopicArn=MonitorTopic["TopicArn"], 156 | Protocol="lambda", 157 | Endpoint=MonitorFunction["FunctionArn"], 158 | ) 159 | print("Created Monitor Lambda Function.") 160 | except lmbda.exceptions.ResourceConflictException: 161 | print("Skipping creation of Monitor Lambda Function. 
Already exists.") 162 | try: 163 | lmbda.add_permission( 164 | FunctionName="Monitor", 165 | StatementId="InvokeBySNS", 166 | Action="lambda:InvokeFunction", 167 | Principal="sns.amazonaws.com", 168 | ) 169 | except lmbda.exceptions.ResourceConflictException: 170 | print("Monitor Lambda Function already has SNS invoke permission.") 171 | 172 | 173 | def destroy(): 174 | # Delete roles 175 | for arn in ecsInstanceRole_policy_list: 176 | iam.detach_role_policy(RoleName="ecsInstanceRole", PolicyArn=arn) 177 | iam.remove_role_from_instance_profile( 178 | InstanceProfileName="ecsInstanceRole", RoleName="ecsInstanceRole" 179 | ) 180 | iam.delete_instance_profile(InstanceProfileName="ecsInstanceRole") 181 | iam.delete_role(RoleName="ecsInstanceRole") 182 | 183 | iam.detach_role_policy( 184 | RoleName="aws-ec2-spot-fleet-tagging-role", 185 | PolicyArn="arn:aws:iam::aws:policy/service-role/AmazonEC2SpotFleetTaggingRole", 186 | ) 187 | iam.delete_role(RoleName="aws-ec2-spot-fleet-tagging-role") 188 | 189 | for arn in LambdaFullAccess_policy_list: 190 | iam.detach_role_policy(RoleName="LambdaFullAccess", PolicyArn=arn) 191 | iam.delete_role(RoleName="LambdaFullAccess") 192 | 193 | # Delete Monitor Lambda function 194 | lmbda.delete_function(FunctionName="Monitor") 195 | 196 | # Delete Monitor SNS topic 197 | # create_topic is idempotent so we use it to return ARN since topic already exists 198 | MonitorTopic = sns.create_topic(Name="Monitor") 199 | sns.delete_topic(TopicArn=MonitorTopic["TopicArn"]) 200 | 201 | 202 | if __name__ == "__main__": 203 | if len(sys.argv) == 1: 204 | setup() 205 | else: 206 | if sys.argv[1] == "destroy": 207 | destroy() 208 | else: 209 | print("Use: setup_AWS.py or setup_AWS.py destroy") 210 | sys.exit() 211 | -------------------------------------------------------------------------------- /worker/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # - [ BROAD'16 ] - 3 | # 4 | # A docker instance for accessing AWS resources 5 | # This wraps the cellprofiler docker registry 6 | # 7 | 8 | 9 | FROM cellprofiler/cellprofiler:4.2.8 10 | 11 | # Install S3FS 12 | 13 | RUN apt-get -y update && \ 14 | apt-get -y upgrade && \ 15 | apt-get -y install \ 16 | automake \ 17 | autotools-dev \ 18 | g++ \ 19 | git \ 20 | jq \ 21 | libcurl4-gnutls-dev \ 22 | libfuse-dev \ 23 | libssl-dev \ 24 | libxml2-dev \ 25 | make pkg-config \ 26 | sysstat \ 27 | curl 28 | 29 | RUN apt-get install --only-upgrade bash 30 | 31 | WORKDIR /usr/local/src 32 | RUN git clone https://github.com/s3fs-fuse/s3fs-fuse.git 33 | WORKDIR /usr/local/src/s3fs-fuse 34 | RUN ./autogen.sh 35 | RUN ./configure 36 | RUN make 37 | RUN make install 38 | 39 | # Install AWS CLI 40 | 41 | RUN apt install unzip 42 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 43 | RUN unzip awscliv2.zip 44 | RUN ./aws/install 45 | 46 | # Install boto3 47 | 48 | RUN python3.8 -m pip install boto3 49 | 50 | # Install watchtower for logging 51 | 52 | RUN python3.8 -m pip install watchtower 53 | 54 | # Install pandas for optional file downloading 55 | 56 | RUN python3.8 -m pip install pandas 57 | 58 | # SETUP NEW ENTRYPOINT 59 | 60 | RUN mkdir -p /home/ubuntu/ 61 | WORKDIR /home/ubuntu 62 | COPY cp-worker.py . 63 | COPY instance-monitor.py . 64 | COPY run-worker.sh . 
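# run-worker.sh (made executable below) is the container entrypoint: it tags the instance,
# optionally mounts the source bucket with S3FS, and launches DOCKER_CORES copies of
# cp-worker.py, while instance-monitor.py logs disk, memory, and CPU statistics.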
65 | RUN chmod 755 run-worker.sh 66 | 67 | RUN git clone https://github.com/CellProfiler/CellProfiler-plugins.git 68 | WORKDIR /home/ubuntu/CellProfiler-plugins 69 | #RUN pip install -r requirements.txt 70 | 71 | WORKDIR /home/ubuntu 72 | ENTRYPOINT ["./run-worker.sh"] 73 | CMD [""] 74 | -------------------------------------------------------------------------------- /worker/Makefile: -------------------------------------------------------------------------------- 1 | user = cellprofiler 2 | project = distributed-cellprofiler 3 | tag = 2.2.0_4.2.8 4 | 5 | .DEFAULT_GOAL: build 6 | build: 7 | docker build -t $(user)/$(project):$(tag) . 8 | docker login -u $(user) 9 | docker push $(user)/$(project):$(tag) 10 | echo "Your new docker registry is ready at: $(user)/$(project):$(tag)" 11 | 12 | -------------------------------------------------------------------------------- /worker/instance-monitor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Oct 12 17:39:49 2016 4 | 5 | @author: bcimini 6 | """ 7 | 8 | import subprocess 9 | import time 10 | import logging 11 | 12 | def runcmd(cmd): 13 | logger = logging.getLogger(__name__) 14 | process=subprocess.Popen(cmd.split(),stdout=subprocess.PIPE) 15 | out,err=process.communicate() 16 | logger.info(out) 17 | return out 18 | 19 | def monitor(): 20 | logger = logging.getLogger(__name__) 21 | while True: 22 | out = runcmd('df -h') 23 | metrics = str(out).split('\\n')[1] 24 | metrics = [x for x in metrics.split(' ') if x] 25 | logger.info(f'Root disk usage {metrics[4]}.') 26 | if int((metrics[4].split('%')[0]))> 90: 27 | logger.warning('WARNING: High disk usage.') 28 | 29 | runcmd('df -i -h') 30 | 31 | out = runcmd('vmstat -a -SM 1 1') 32 | metrics = str(out).split('\\n')[2] 33 | metrics = [x for x in metrics.split(' ') if x] 34 | logger.info(f'Free memory {metrics[3]}. Inactive memory {metrics[4]}. Active memory {metrics[5]}') 35 | if float(metrics[4])/float(metrics[5]) < .1: 36 | logger.warning('WARNING: High memory usage.') 37 | 38 | out = runcmd('iostat') 39 | metrics = str(out).split('\\n')[3] 40 | metrics = [x for x in metrics.split(' ') if x] 41 | logger.info(f'Idle CPU {metrics[-1]}') 42 | if float(metrics[-1]) < 10: 43 | logger.warning('WARNING: Low CPU') 44 | 45 | time.sleep(30) 46 | 47 | if __name__=='__main__': 48 | logging.basicConfig(level=logging.INFO) 49 | monitor() 50 | -------------------------------------------------------------------------------- /worker/run-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "${BASH_VERSION}" 3 | echo "Region $AWS_REGION" 4 | echo "Queue $SQS_QUEUE_URL" 5 | if [[ -z "$SOURCE_BUCKET" ]]; then 6 | SOURCE_BUCKET=$AWS_BUCKET 7 | fi 8 | echo "Source Bucket $SOURCE_BUCKET" 9 | 10 | mkdir -p /home/ubuntu/bucket 11 | mkdir -p /home/ubuntu/local_output 12 | 13 | # 1. 
CONFIGURE AWS CLI
14 | echo "Configuring AWS CLI"
15 | aws configure set default.region $AWS_REGION
16 | MY_INSTANCE_ID=$(curl http://169.254.169.254/latest/meta-data/instance-id)
17 | echo "Instance ID $MY_INSTANCE_ID"
18 | OWNER_ID=$(aws ec2 describe-instances --instance-ids $MY_INSTANCE_ID --output text --query 'Reservations[0].[OwnerId]')
19 | aws ec2 create-tags --resources $MY_INSTANCE_ID --tags Key=Name,Value=${APP_NAME}Worker
20 | VOL_0_ID=$(aws ec2 describe-instance-attribute --instance-id $MY_INSTANCE_ID --attribute blockDeviceMapping --output text --query BlockDeviceMappings[0].Ebs.[VolumeId])
21 | aws ec2 create-tags --resources $VOL_0_ID --tags Key=Name,Value=${APP_NAME}Worker
22 | VOL_1_ID=$(aws ec2 describe-instance-attribute --instance-id $MY_INSTANCE_ID --attribute blockDeviceMapping --output text --query BlockDeviceMappings[1].Ebs.[VolumeId])
23 | aws ec2 create-tags --resources $VOL_1_ID --tags Key=Name,Value=${APP_NAME}Worker
24 | 
25 | # 2. MOUNT S3
26 | if [[ ${DOWNLOAD_FILES} == 'False' ]]; then
27 |     echo "Mounting S3 using S3FS"
28 |     if [[ -z "$AWS_ACCESS_KEY_ID" ]]; then
29 |         echo "Using role credentials to mount S3"
30 |         s3fs $SOURCE_BUCKET /home/ubuntu/bucket -o iam_role
31 |     else
32 |         echo "Using user credentials to mount S3"
33 |         echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > /credentials.txt
34 |         chmod 600 /credentials.txt
35 |         s3fs $SOURCE_BUCKET /home/ubuntu/bucket -o passwd_file=/credentials.txt
36 |     fi
37 | fi
38 | 
39 | # 3. SET UP ALARMS
40 | echo "Setting up instance metric alarms"
41 | aws cloudwatch put-metric-alarm --alarm-name ${APP_NAME}_${MY_INSTANCE_ID} --alarm-actions arn:aws:swf:${AWS_REGION}:${OWNER_ID}:action/actions/AWS_EC2.InstanceId.Terminate/1.0 --statistic Maximum --period 60 --threshold 1 --comparison-operator LessThanThreshold --metric-name CPUUtilization --namespace AWS/EC2 --evaluation-periods 15 --dimensions "Name=InstanceId,Value=${MY_INSTANCE_ID}"
42 | 
43 | # 4. RUN VM STAT MONITOR
44 | echo "Setting up instance monitor"
45 | python3.8 instance-monitor.py &
46 | 
47 | # 5. UPDATE AND/OR INSTALL PLUGINS
48 | if [[ ${USE_PLUGINS} == 'True' ]]; then
49 |     if [[ ${UPDATE_PLUGINS} == 'True' ]]; then
50 |         echo "Updating CellProfiler-plugins."
51 |         cd CellProfiler-plugins
52 |         git fetch --all
53 |         git pull
54 |         cd ..
55 |     fi
56 |     if [[ -z "$PLUGINS_COMMIT" ]]; then
57 |         PLUGINS_COMMIT='False'
58 |     fi
59 |     if [[ ${PLUGINS_COMMIT} != 'False' ]]; then
60 |         echo "Checking out specific CellProfiler-plugins commit."
61 |         cd CellProfiler-plugins
62 |         git checkout ${PLUGINS_COMMIT} || { echo "No such commit, branch, or version; failing here."; exit 1; }
63 |         cd ..
64 |     fi
65 |     if [[ ${INSTALL_REQUIREMENTS} == 'True' ]]; then
66 |         cd CellProfiler-plugins
67 |         if [[ -z "$REQUIREMENTS" ]]; then
68 |             REQUIREMENTS=$REQUIREMENTS_FILE
69 |         fi
70 |         if [[ -d "active_plugins" ]]; then
71 |             echo "Installing CellProfiler-plugins requirements."
72 |             pip install -e . ${REQUIREMENTS} || { echo "Requirements install failed."; exit 1; }
73 |             cd ..
74 |         else
75 |             echo "Detected deprecated CellProfiler-plugins repo organization. Installing requirements."
76 |             pip install -r ${REQUIREMENTS} || { echo "Requirements file not present or install failed; failing here."; exit 1; }
77 |             cd ..
78 |         fi
79 |     fi
80 | fi
81 | 
82 | # 6.
RUN CP WORKERS 83 | echo "Starting workers" 84 | for((k=0; k<$DOCKER_CORES; k++)); do 85 | python3.8 cp-worker.py |& tee $k.out & 86 | sleep $SECONDS_TO_START 87 | done 88 | wait 89 | --------------------------------------------------------------------------------