├── requirements.txt
├── static
│   └── TinderSecurity.png
├── scan_config.json
├── lib
│   └── logger.py
├── action_auditor.py
├── README.md
├── LICENSE.md
├── query_data.py
├── main.py
├── .gitignore
├── workflow.py
├── github_wrapper.py
└── auditor.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyYAML==6.0
2 | requests==2.26.0
3 |
--------------------------------------------------------------------------------
/static/TinderSecurity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TinderSec/gh-workflow-auditor/HEAD/static/TinderSecurity.png
--------------------------------------------------------------------------------
/scan_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "rce_risks":{
3 | "unsafe_inputs":{
4 | "user_input_body_title":"\\${{\\s*github.event.(.*).(body|title)\\s*}}",
5 | "malicious_commit_message":"\\${{\\s*(.*).head_commit.message\\s*}}",
6 | "malicious_input":"\\${{\\s*(.*)github.event.review(.*)\\s*}}",
7 | "environ_regex":"\\${{\\s*env.[A-Za-z0-9_-]*\\s*}}",
8 | "malicious_author":"\\${{\\s*github.event.(.*).author.(name|email)\\s*}}"
9 | },
10 | "malicious_commits":{
11 | "malicious_commit_referenced":"\\${{\\s*github.pull_request.head(.*)\\s*}}",
12 | "malicious_pull_request_event":"\\${{\\s*(.*)github.event.pull_request.head(.*)\\s*}}"
13 | }
14 | },
15 | "secrets":"\\${{\\s*secrets\\.[A-Za-z-_0-9]*\\s*}}",
16 | "risky_events":["pull_request_target","issues","issue_comment"]
17 | }
--------------------------------------------------------------------------------
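
A minimal sketch (not part of the repository) of how these patterns behave at runtime, assuming it is run from the repo root so `scan_config.json` resolves:

```python
import json
import re

with open('scan_config.json') as scan_file:
    config = json.load(scan_file)

# The secrets pattern matches full `${{ secrets.NAME }}` expressions.
secrets_re = re.compile(config['secrets'])
# One of the unsafe-input patterns: user-controlled body/title of an issue or PR.
title_re = re.compile(config['rce_risks']['unsafe_inputs']['user_input_body_title'])

sample = 'echo "${{ github.event.issue.title }}" && deploy ${{ secrets.DEPLOY_KEY }}'
print(secrets_re.findall(sample))                      # ['${{ secrets.DEPLOY_KEY }}']
print([m.group() for m in title_re.finditer(sample)])  # ['${{ github.event.issue.title }}']
```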
/lib/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | def parse_log_level_input(input):
4 | if input == 'debug':
5 | level = logging.DEBUG
6 | elif input == 'info':
7 | level = logging.INFO
8 | elif input == 'warning':
9 | level = logging.WARNING
10 | elif input == 'error':
11 | level = logging.ERROR
12 | elif input == 'critical':
13 | level = logging.CRITICAL
14 | else:
15 |         level = logging.INFO
16 |
17 | return level
18 |
19 | def build_logger(log_level='info'):
20 | log_format = logging.Formatter('%(levelname)s: %(message)s')
21 | logger = logging.getLogger('Audit Log')
22 | log_level = parse_log_level_input(log_level)
23 | logger.setLevel(log_level)
24 |
25 | channel = logging.StreamHandler()
26 | channel.setFormatter(log_format)
27 |
28 |         log_file = logging.FileHandler('scan.log')
29 |         log_file.setFormatter(log_format)
30 | logger.addHandler(channel)
31 | logger.addHandler(log_file)
32 | return logger
33 |
34 |
--------------------------------------------------------------------------------
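
A quick usage sketch (not part of the repository): messages go both to the console handler and to `scan.log`.

```python
from lib.logger import build_logger

logger = build_logger('debug')
logger.info("Metric: starting scan")       # printed to the console and appended to scan.log
logger.warning("Security Issue: example")  # WARNING and above also pass the default 'info' level
```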
/action_auditor.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import re
3 |
4 | def read_actions_file():
5 | array_of_usernames = []
6 | with open('actions.txt','r') as lines:
7 | for line in lines:
8 | username = line.split('/')[0]
9 | username_regex = re.compile("[A-Za-z0-9-]*")
10 | if username_regex.fullmatch(username):
11 | if username not in array_of_usernames:
12 | array_of_usernames.append(username)
13 | return array_of_usernames
14 |
15 | class ActionAuditor:
16 | def __init__(self, gh_wrapper, logger):
17 | self.gh = gh_wrapper
18 | self.logger = logger
19 |
20 | def check_usernames(self, username_list):
21 | for username in username_list:
22 |             account_exists = self.gh.stale_checker(username=username)
23 |             if not account_exists:
24 |                 self.logger.warning(f"Security Issue: Supply chain. {username} is used in workflows but no longer resolves to a GitHub user or org (possibly renamed). Try registering it at https://github.com to confirm it can be claimed.")
25 |
26 | def action_audit(self):
27 | if Path('actions.txt').exists():
28 | usernames = read_actions_file()
29 | self.check_usernames(usernames)
30 | Path('actions.txt').unlink()
31 | else:
32 | self.logger.info("No actions.txt file to scan. Supply chain scan complete.")
33 |
34 |
--------------------------------------------------------------------------------
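
For context (not part of the repository): `actions.txt` is written by `auditor.py` with one `uses:` value per line, and `read_actions_file` reduces it to unique owner names. The file contents below are hypothetical:

```python
# Hypothetical actions.txt contents:
#   actions/checkout@v2
#   someorg/custom-action@main
#   actions/setup-python@v4
from action_auditor import read_actions_file

print(read_actions_file())  # ['actions', 'someorg']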
/README.md:
--------------------------------------------------------------------------------
1 | ![TinderSecurity](static/TinderSecurity.png)
2 |
3 |
4 |
5 | # GitHub Workflow Auditor
6 | A workflow auditing tool to identify security issues in GitHub workflows
7 |
8 | # Usage
9 |
10 | ```
11 | usage: main.py [-h] [--type {repo,org,user}] [--log-level {debug,info,warning,error,critical}] input
12 |
13 | Identify vulnerabilities in GitHub Actions workflow
14 |
15 | positional arguments:
16 | input User/Org Name or Repo name (owner/repo).
17 |
18 | optional arguments:
19 | -h, --help show this help message and exit
20 | --type {repo,org,user}
21 | Type of entity that is being scanned.
22 | --log-level {debug,info,warning,error,critical}
23 | Log level for output
24 | ```
25 |
26 | Example:
27 | * org - `python3 main.py --type org google`
28 | * user - `python3 main.py --type user test_user`
29 | * repo - `python3 main.py --type repo TinderSec/gh-workflow-auditor`
30 |
31 | # Setup
32 |
33 | GitHub Workflow Auditor uses GitHub's GraphQL endpoint, so an API token is required. The program reads it from the `PAT` environment variable. You can generate a basic PAT (https://github.com/settings/tokens/new) without any scope. Note that you may have to "Configure SSO" for the token to be usable on some organizations.
34 |
35 | ```
36 | export PAT=ghp_YOUR_TOKEN
37 | ```
38 |
39 | # About
40 | GitHub Workflow Auditor identifies vulnerabilities in GitHub Workflows. It does so by scanning workflow files for anti-patterns such as ingesting user input in an unsafe manner or using malicious commits in the build process. The tool supports scanning individual repositories as well as all accessible repositories of a user or organization. The output of the scan is saved as `scan.log`.
41 |
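42 | For example, this hypothetical workflow would be flagged twice: `issue_comment` is a risky trigger, and the `run` step interpolates attacker-controlled input directly into the shell:
43 |
44 | ```yaml
45 | on: issue_comment
46 | jobs:
47 |   greet:
48 |     runs-on: ubuntu-latest
49 |     steps:
50 |       - run: echo "${{ github.event.comment.body }}"
51 | ```
52 |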
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 2022 Match Group, LLC
2 |
3 | The copyright holder grants you permission to and use or redistribute this software in source and binary forms, with or without modification, conditioned on your acceptance of, and adherence to, the following conditions:
4 |
5 | 1. Redistributions of source code, whether or not modified, must retain the above copyright notice, this list of conditions, and the following disclaimer. If modified, the source code must identify the modifications (identification in general terms is acceptable).
6 | 2. Redistributions in binary or application form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation or other materials provided with the binary or application.
7 | 3. You may not use the name of the copyright holder nor the names of the contributors to endorse or promote products derived from this software without specific prior written permission.
8 | 4. You may use this software only to scan and assess your own software and systems and may not use it for any malicious or illegal purpose. You may disclose any potential vulnerabilities you detect with this software only to the developer of the software in which you detect the potential vulnerability.
9 |
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ALL EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD HARMLESS THE COPYRIGHT HOLDER AND ALL CONTRIBUTORS AGAINST ANY CLAIMS THAT ARISE BASED ON YOUR USE, MODIFICATION, OR REDISTRIBUTION OF THIS SOFTWARE.
--------------------------------------------------------------------------------
/query_data.py:
--------------------------------------------------------------------------------
1 | def return_query(query_type,name,after=None):
2 | if query_type == 'repository':
3 | owner,name = name.split('/')
4 | return f"""query {{
5 | repository(owner: "{owner}",name: "{name}") {{
6 | nameWithOwner
7 | object(expression: "HEAD:.github/workflows/") {{
8 | ... on Tree {{
9 | entries {{
10 | name
11 | lineCount
12 | object {{
13 | ... on Blob {{
14 | text
15 | }}
16 | }}
17 | }}
18 | }}
19 | }}
20 | }}
21 | }}"""
22 | else:
23 | after_query = f",after:\"{after}\"" if after else ""
24 | return f"""query {{
25 | {query_type}(login:"{name}"){{
26 | repositories(first:100 {after_query}){{
27 | edges{{
28 | node{{
29 | nameWithOwner,
30 | object(expression: "HEAD:.github/workflows/") {{
31 | ... on Tree {{
32 | entries {{
33 | name
34 | lineCount
35 | object {{
36 | ... on Blob {{
37 | text
38 | }}
39 | }}
40 | }}
41 | }}
42 | }}
43 | }}
44 | }}
45 | pageInfo {{
46 | startCursor
47 | hasNextPage
48 | endCursor
49 | }}
50 | }}
51 | }}
52 | }}"""
53 |
54 | def validation_query(username, guess_type):
55 | return f"""query {{
56 | {guess_type}(login:"{username}"){{
57 | repositories(first:1){{
58 | edges{{
59 | node{{
60 | nameWithOwner
61 | }}
62 | }}
63 | }}
64 | }}
65 | }}"""
--------------------------------------------------------------------------------
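
A usage sketch (not part of the repository) showing how the `after` cursor drives pagination; the org name and cursor value here are hypothetical, and a real cursor would come from `pageInfo.endCursor` of the previous response:

```python
from query_data import return_query, validation_query

first_page = return_query('organization', 'example-org')
next_page = return_query('organization', 'example-org', after='Y3Vyc29yOnYyOpHOAAAAAQ==')

# Probe whether a login is a user or an organization:
print(validation_query('octocat', 'user'))
```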
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | # Local imports
4 | from auditor import content_analyzer
5 | from action_auditor import ActionAuditor
6 | from github_wrapper import GHWrapper
7 | from lib.logger import build_logger
8 |
9 |
10 | """
11 | Input:
12 |     repo_workflow - list of workflow dicts (name, content) for one repo
13 |     logger - configured logger
14 | Output:
15 | scan result (if any) in scan.log file.
16 | Summary:
17 |     For each workflow dict (name, content) in the list, this
18 |     function calls content_analyzer to audit the workflow
19 | for any potential vulnerabilities.
20 | """
21 | def repo_analysis(repo_workflow, logger):
22 | for workflow in repo_workflow:
23 | workflow_name = workflow['name']
24 | workflow_content = workflow['content']
25 | logger.info(f">> Scanning: {workflow_name}")
26 | content_analyzer(content=workflow_content, logger=logger) # will print out security issues
27 |
28 | def main():
29 | # Supporting user provided arguments: type, log level, and scan target.
30 | parser = argparse.ArgumentParser(description='Identify vulnerabilities in GitHub Actions workflow')
31 | parser.add_argument('--type',choices=['repo','org','user'],
32 | help='Type of entity that is being scanned.')
33 | parser.add_argument('--log-level',choices=['debug','info','warning','error','critical'], default='info')
34 | parser.add_argument('input',help='Org, user or repo name (owner/name)')
35 | args = parser.parse_args()
36 |
37 | target_type = args.type #repo, org, or user
38 | target_input = args.input #can be repo url, or a username for org/user
39 | log_level = args.log_level
40 |
41 | logger = build_logger(log_level)
42 | gh = GHWrapper(logger)
43 |
44 | if target_type == 'repo':
45 | repos = gh.get_single_repo(repo_name=target_input)
46 | else:
47 | count, repos = gh.get_multiple_repos(target_name=target_input,
48 | target_type=target_type)
49 | logger.info(f"Metric: Scanning total {count} repos")
50 |
51 | for repo_dict in repos:
52 | logger.info(f"> Starting audit of {repo_dict}")
53 | repo_workflows = repos[repo_dict]
54 | repo_analysis(repo_workflows, logger)
55 |
56 |     logger.info("> Checking for supply chain attacks.")
57 | action_auditor = ActionAuditor(gh, logger)
58 | action_auditor.action_audit()
59 |
60 | if __name__ == '__main__':
61 |     main()
--------------------------------------------------------------------------------
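
For reference (not part of the repository), the shape of the data `repo_analysis` consumes, as built by `GHWrapper.repo_node_parser`; the file names and contents here are hypothetical:

```python
repo_workflows = [
    {'name': 'ci.yml', 'content': 'on: push\njobs: ...'},
    {'name': 'release.yaml', 'content': 'on: release\njobs: ...'},
]
```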
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | pip-wheel-metadata/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
94 | __pypackages__/
95 |
96 | # Celery stuff
97 | celerybeat-schedule
98 | celerybeat.pid
99 |
100 | # SageMath parsed files
101 | *.sage.py
102 |
103 | # Environments
104 | .env
105 | .venv
106 | env/
107 | venv/
108 | ENV/
109 | env.bak/
110 | venv.bak/
111 |
112 | # Spyder project settings
113 | .spyderproject
114 | .spyproject
115 |
116 | # Rope project settings
117 | .ropeproject
118 |
119 | # mkdocs documentation
120 | /site
121 |
122 | # mypy
123 | .mypy_cache/
124 | .dmypy.json
125 | dmypy.json
126 |
127 | # Pyre type checker
128 | .pyre/
129 | *.sh
130 | .DS_Store
131 |
--------------------------------------------------------------------------------
/workflow.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import yaml
4 |
5 | class WorkflowParser():
6 | def __init__(self, yaml_content: str):
7 |         try:
8 |             self.parsed_content = yaml.safe_load(yaml_content) # safe_load: we don't want a vulnerability ;)
9 |         except yaml.YAMLError:
10 |             self.parsed_content = {'failed': True}
11 |
12 |     def get_event_triggers(self) -> list:
13 |         # Check what starts a workflow. PyYAML parses the unquoted `on:` key as boolean True.
14 |         triggers = self.parsed_content.get(True, None)  # can be a string, a list, or a dict
15 |         if isinstance(triggers, list):
16 |             return triggers
17 |         elif isinstance(triggers, dict):
18 |             return list(triggers.keys())
19 |         elif triggers:
20 |             return [triggers]
21 |         return []
22 | def get_jobs(self) -> dict:
23 | return self.parsed_content.get('jobs',None)
24 |
25 | def get_jobs_count(self) -> int:
26 | # list how many jobs execute. Jobs run on their own individual runners.
27 | return len(self.parsed_content['jobs'].keys())
28 |
29 | def get_steps_for_jobs(self, job_dict: dict) -> list:
30 | # return a list of steps in a given job dictionary
31 | return job_dict.get('steps',None)
32 |
33 | def analyze_step(self, step:dict) -> tuple:
34 | actions = step.get('uses',None)
35 | run_command = step.get('run',None)
36 | with_input = step.get('with',None)
37 | step_environ = step.get('env', None) # you can define environment variables per step.
38 | return actions, run_command, with_input, step_environ
39 |
40 |
41 | # Analyze various aspects of a workflow to identify whether it is risky.
42 | class WorkflowVulnAudit():
43 | def __init__(self):
44 | # get scan config regex ready
45 | self.unsafe_input = {}
46 | self.malicious_commits = {}
47 | with open('scan_config.json','r') as scan_file:
48 | scan_config = json.loads(scan_file.read())
49 | self.triggers = scan_config['risky_events']
50 | self.secrets = re.compile(scan_config['secrets'])
51 | for risky_input in scan_config['rce_risks']['unsafe_inputs']:
52 | self.unsafe_input[risky_input] = re.compile(scan_config['rce_risks']['unsafe_inputs'][risky_input])
53 | for commit_to_watch in scan_config['rce_risks']['malicious_commits']:
54 | self.malicious_commits[commit_to_watch] = re.compile(scan_config['rce_risks']['malicious_commits'][commit_to_watch])
55 | self.vulnerable = {'vulnerable':True}
56 |
57 |     def risky_command(self, command_string) -> dict:
58 |         found_matches = {}
59 |         for regex in self.unsafe_input:
60 |             # finditer always returns an iterator (always truthy), so collect matches first.
61 |             matched_commands = [command.group() for command in self.unsafe_input[regex].finditer(command_string)]
62 |             if matched_commands:
63 |                 found_matches[regex] = matched_commands
64 |         return found_matches
65 |
66 | def risky_trigger(self, trigger_name: str) -> bool:
67 |         return trigger_name in self.triggers
68 |
69 |     # Find and return every secret used in this workflow. If there is an RCE, these secrets can be pulled.
70 | def get_secrets(self, full_yaml: str) -> list:
71 | found_matches = []
72 |         if matches := self.secrets.findall(full_yaml):
73 | for match in matches:
74 | if match not in found_matches:
75 | found_matches.append(match)
76 | return found_matches
77 |
78 |     def risky_commit(self, referenced) -> dict:
79 |         found_matches = {}
80 |         for regex in self.malicious_commits:
81 |             # Same pattern as risky_command: collect finditer results before testing.
82 |             matched_commits = [commit.group() for commit in self.malicious_commits[regex].finditer(referenced)]
83 |             if matched_commits:
84 |                 found_matches[regex] = matched_commits
85 |         return found_matches
86 |
--------------------------------------------------------------------------------
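
A minimal usage sketch (not part of the repository), assuming it runs from the repo root so `WorkflowVulnAudit` can open `scan_config.json`:

```python
from workflow import WorkflowParser, WorkflowVulnAudit

yaml_text = """\
on: [pull_request_target]
jobs:
  build:
    steps:
      - run: echo "${{ github.event.pull_request.title }}"
"""
wf = WorkflowParser(yaml_text)
audit = WorkflowVulnAudit()

print(wf.get_event_triggers())                     # ['pull_request_target']
print(audit.risky_trigger('pull_request_target'))  # True
print(audit.risky_command('echo "${{ github.event.pull_request.title }}"'))
# {'user_input_body_title': ['${{ github.event.pull_request.title }}']}
```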
/github_wrapper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import requests
4 |
5 | from query_data import return_query, validation_query
6 |
7 | """
8 | Input:
9 | token - GitHub PAT. Retrieved from environment variable.
10 | logger - Configured logger
11 |
12 | Summary:
13 |     This wrapper uses GitHub's GraphQL API to retrieve the
14 |     repository (or repositories) for the provided scan target.
15 |     It is also used at the end of the scan for stale account checks.
16 | """
17 | class GHWrapper():
18 | def __init__(self, logger):
19 | self.token = os.environ.get('PAT',None)
20 | self.logger = logger
21 | if self.token is None:
22 | self.logger.warning("No GitHub token provided in the PAT env variable. Exiting.")
23 | sys.exit()
24 | if not self.validate_token():
25 | self.logger.warning("GitHub token provided in the PAT env variable is invalid. Exiting.")
26 | sys.exit()
27 |
28 |     def validate_token(self):
29 |         header = {"Authorization": f"token {self.token}"}
30 |         url = "https://api.github.com"
31 |         validation_req = requests.get(url=url, headers=header)
32 |         # Any status other than 401 Unauthorized means the token was accepted.
33 |         return validation_req.status_code != 401
38 |
39 | def call_graphql(self, query):
40 | headers = {'Authorization':f"Bearer {self.token}",
41 | 'Content-Type':'application/json'}
42 | query_request = requests.post(url='https://api.github.com/graphql',
43 | json = {'query':query},
44 | headers = headers)
45 | if query_request.status_code == 200:
46 | return query_request.json()
47 |         else:
48 |             message = query_request.text
49 |             self.logger.error(f"GitHub GraphQL Query failed: {message}")
50 |             sys.exit(1)
51 |
52 | def repo_node_parser(self,repo_node):
53 | workflow_object = repo_node['object']
54 | repo_workflows = []
55 | if workflow_object:
56 | workflows = workflow_object['entries']
57 | for workflow in workflows:
58 | workflow_name = workflow['name']
59 | if workflow.get('object',None):
60 | workflow_text = workflow['object'].get('text',None)
61 | workflow_ext = workflow_name.split('.')[-1]
62 | if workflow_ext == "yml" or workflow_ext == "yaml":
63 | repo_workflows.append({'name':workflow_name,'content':workflow_text})
64 | return repo_workflows
65 |
66 | def get_single_repo(self, repo_name):
67 | repos_all = {}
68 | repo_query = return_query('repository',
69 | repo_name)
70 | repos = self.call_graphql(repo_query)
71 | if repos.get('errors') is None:
72 | repo_node = repos['data']['repository']
73 | repo_name = repo_node['nameWithOwner']
74 | repo_workflows = self.repo_node_parser(repo_node)
75 | if repo_workflows: # this repo has workflows
76 | repos_all[repo_name] = repo_workflows
77 | else:
78 | self.logger.debug(f"Repo {repo_name} has no workflow.")
79 | return repos_all
80 |
81 | def get_multiple_repos(self,target_name,target_type='org'):
82 | self.logger.info(f"---- Getting repos for {target_name}----")
83 | repos_all = {}
84 | query_type = {'org':'organization','user':'user','repo':'repository'}
85 | try:
86 | next_cursor = None
87 | has_more = True # for pagination loop
88 | count = 0
89 | while has_more:
90 | query = return_query(query_type[target_type],
91 | target_name, next_cursor)
92 | repos = self.call_graphql(query)
93 | if repos.get('errors') is None:
94 | for repo in repos['data'][query_type[target_type]]['repositories']['edges']:
95 | repo_node = repo['node']
96 | repo_name = repo_node['nameWithOwner']
97 | repo_workflows = self.repo_node_parser(repo_node)
98 | if repo_workflows:
99 | repos_all[repo_name] = repo_workflows
100 | count += 1
101 | else:
102 | self.logger.debug(f"Repo {repo_name} has no workflow.")
103 | has_more = repos['data'][query_type[target_type]]['repositories']['pageInfo']['hasNextPage']
104 | next_cursor = repos['data'][query_type[target_type]]['repositories']['pageInfo']['endCursor']
105 |                     if has_more:
106 |                         self.logger.info("> Retrieving next batch of 100 repos.")
107 |                 else:
108 |                     self.logger.error("GraphQL response contained errors.")
109 |                     sys.exit(1)
110 | except Exception as repo_err:
111 | self.logger.debug(f"Error parsing data. Message: {str(repo_err)}")
112 | return count, repos_all
113 |
114 | def stale_checker(self,username):
115 | valid = False
116 | if username:
117 | user_query = validation_query(username, 'user')
118 | is_it_user = self.call_graphql(query=user_query)['data']['user']
119 | org_query = validation_query(username, 'organization')
120 | is_it_org = self.call_graphql(query = org_query)['data']['organization']
121 | if is_it_user or is_it_org:
122 | valid = True
123 | return valid
124 |
125 |
126 |
127 |
--------------------------------------------------------------------------------
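
A usage sketch (not part of the repository); it requires a valid token in the `PAT` environment variable, otherwise the constructor exits:

```python
import logging
from github_wrapper import GHWrapper

gh = GHWrapper(logging.getLogger('Audit Log'))
repos = gh.get_single_repo(repo_name='TinderSec/gh-workflow-auditor')
for repo_name, workflows in repos.items():
    print(repo_name, [w['name'] for w in workflows])
```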
/auditor.py:
--------------------------------------------------------------------------------
1 | from workflow import WorkflowParser, WorkflowVulnAudit
2 |
3 | vuln_analyzer = WorkflowVulnAudit()
4 |
5 | def risky_trigger_analysis(identified_triggers):
6 | return_triggers = []
7 | for trigger in identified_triggers:
8 | risky_or_not = vuln_analyzer.risky_trigger(trigger_name=trigger)
9 | if risky_or_not:
10 | return_triggers.append(trigger)
11 | return return_triggers
12 |
13 |
14 | """
15 | Input:
16 | content - YAML content read from the workflow files.
17 | logger - configured logger
18 | Output:
19 | scan result (if any) in scan.log file.
20 | Summary:
21 | This is the critical part of the whole tool. It parses the
22 | YAML content to identify security issues. It does so by:
23 | parsing YAML to JSON, identifying keys such as event triggers,
24 | jobs and steps. It then checks the identified key-value pairs
25 | against known risks through WorkflowParser and WorkflowVulnAudit.
26 | """
27 | def content_analyzer(content, logger):
28 | risky_triggers = []
29 | all_actions = []
30 | commands = []
31 | environs = {}
32 | checked_action = []
33 | workflow_client = WorkflowParser(content)
34 | if workflow_client.parsed_content and not workflow_client.parsed_content.get('failed',None): # Sanity check to make sure proper YAML was given.
35 | event_triggers = workflow_client.get_event_triggers() # Identify what event(s) will start the workflow.
36 | secrets = vuln_analyzer.get_secrets(content) # get all the secrets in the workflow. (Uses regex). This helps understand impact.
37 | all_jobs = workflow_client.get_jobs() # Identify all jobs in the workflow. Stored as dictionary
38 |
39 |         counter = 1 # Counter used to identify which job (and step) a finding belongs to.
40 | if secrets:
41 | logger.info(f">>> Secrets used in workflow: {','.join(secrets)}")
42 |
43 | # Retrieve and store all needed information for a workflow run for analysis.
44 | if all_jobs:
45 | for job in all_jobs:
46 | steps = all_jobs[job].get('steps',None)
47 | if not steps:
48 | steps = [all_jobs[job]]
49 |                 try:
50 |                     environs.update(all_jobs[job].get('env',{}))
51 |                 except (TypeError, ValueError, AttributeError):
52 |                     logger.info(">> Environ variable is malformed")
53 | for step_number,step in enumerate(steps):
54 | actions, run_command, with_input, step_environ = workflow_client.analyze_step(step)
55 | if actions:
56 | all_actions.append({f"Job{counter}.Step{step_number+1}":step})
57 | if step_environ:
58 | if isinstance(step_environ, str):
59 |                             step_environ = {f"{step_number}{step}":step_environ}  # synthesize a unique dict key for a bare string entry
60 | environs.update(step_environ)
61 | if run_command:
62 | commands.append({f"Job{counter}.Step{step_number+1}":step})
63 | counter +=1
64 |
65 | # Start analyzing the retrieved information.
66 | try:
67 | # Analyzes event triggers to see if they are user controlled.
68 | risky_triggers = risky_trigger_analysis(identified_triggers=event_triggers)
69 |
70 | # Analyzes commands called by Steps.
71 | for command in commands:
72 | for step_number, step_dict in command.items():
73 | risky_command = vuln_analyzer.risky_command(command_string=step_dict['run'])
74 | if risky_command:
75 | for regex, matched_strings in risky_command.items():
76 | if regex == 'environ_regex': # not all environments are bad. Check if this environment is user controlled.
77 | # get the key out of the matched strings. We use this to check if the environ variable stores any user controlled input.
78 | for environ_variable in matched_strings:
79 |                                     environ_variable = environ_variable.strip('${{').strip('}}').split('.')[1].strip()  # extract NAME from ${{ env.NAME }}
80 | # get environ value
81 | environ_var_value = environs.get(environ_variable,None)
82 | if environ_var_value:
83 | risky_env = vuln_analyzer.risky_command(command_string=environ_var_value)
84 | if risky_env and list(risky_env.keys())[0] != 'environ_regex':
85 | logger.warning(f">>> Security Issue: RCE detected with {regex} in {step_number}: ENV variable {environ_variable} is called through GitHub context and takes user input {environ_var_value}")
86 | else:
87 | logger.warning(f">>> Security Issue: RCE detected with {regex} in {step_number}: Usage of {','.join(matched_strings)} found.")
88 |
89 | # Some actions combined with triggers can be bad. Check for those cases.
90 |             # Using `with` ensures actions.txt is closed even if a step raises mid-loop.
91 |             with open('actions.txt','a+') as action_storage:
92 |                 for action in all_actions:
93 |                     for step_number, step_dict in action.items():
94 |                         action_name = step_dict.get('uses',None)
95 |                         action_storage.write(f"{action_name}\n")
96 |                         if 'actions/checkout' in action_name:
97 |                             # check if a specific branch or commit is checked out
98 |                             ref_value = (step_dict.get('with') or {}).get('ref',None)
99 |                             if ref_value:
100 |                                 risky_commits = vuln_analyzer.risky_commit(referenced=ref_value)
101 |                                 if risky_commits and 'pull_request_target' in risky_triggers:
102 |                                     logger.warning(f">>> Security Issue: Malicious pull request used in actions/checkout. Vulnerable step: {step_number}")
105 | except Exception as workflow_err:
106 | logger.info(f">>> Error parsing workflow. Error is {str(workflow_err)}")
--------------------------------------------------------------------------------
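
A closing usage sketch (not part of the repository): feed `content_analyzer` raw workflow YAML and it logs any findings; the workflow path here is hypothetical:

```python
from lib.logger import build_logger
from auditor import content_analyzer

logger = build_logger('info')
with open('.github/workflows/ci.yml') as workflow_file:
    content_analyzer(content=workflow_file.read(), logger=logger)  # findings land in scan.log
```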