├── scripts ├── __init__.py └── generate_heuristics_docs.py ├── ghbuster ├── service │ ├── __init__.py │ ├── github_archive.py │ └── emails_extractor.py ├── heuristics │ ├── __init__.py │ ├── user_has_only_forks.py │ ├── user_has_only_commits_from_unlinked_emails.py │ ├── base.py │ ├── user_looks_legit.py │ ├── user_metadata_basic.py │ ├── repo_has_stargazzers_who_joined_the_same_day.py │ ├── user_has_forks_from_taken_down_repos.py │ ├── user_has_low_community_activity.py │ ├── repo_commits_only_from_suspicious_unlinked_emails.py │ ├── repo_starred_by_suspicious_users.py │ └── graph.py ├── __init__.py ├── github_repo_scanner.py ├── __main__.py ├── cli.py └── output_formatter.py ├── tests ├── test_utils │ ├── __init__.py │ ├── mock_utils.py │ └── date_utils.py └── heuristics │ ├── test_user_missing_common_fields.py │ ├── test_user_just_joined.py │ ├── test_user_has_only_forks.py │ ├── test_user_has_low_community_activity.py │ ├── test_repo_has_stargazzers_who_joined_the_same_day.py │ └── test_user_has_forks_from_taken_down_repos.py ├── CODEOWNERS ├── screenshot.png ├── .gitignore ├── pyproject.toml ├── README.md └── LICENSE /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ghbuster/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @christophetd 2 | -------------------------------------------------------------------------------- /screenshot.png: 
def mock_pygithub_list(items: list) -> MagicMock:
    """Build a MagicMock that stands in for a PyGithub-style list of results.

    The mock supports iteration, ``len()`` and the ``totalCount`` attribute.

    :param items: The elements the mock should yield when iterated.
    :return: A MagicMock that can be iterated any number of times.
    """
    # Snapshot the input so later mutation of the caller's list cannot make
    # iteration, len() and totalCount disagree with each other.
    snapshot = list(items)
    mock_list = MagicMock()
    # Give MagicMock a plain list, not iter(...): a one-shot iterator would be
    # consumed by the first loop and every later iteration would be empty.
    mock_list.__iter__.return_value = snapshot
    mock_list.__len__.return_value = len(snapshot)
    mock_list.totalCount = len(snapshot)
    return mock_list
"ghbuster.__main__:cli_entrypoint" 15 | -------------------------------------------------------------------------------- /ghbuster/heuristics/__init__.py: -------------------------------------------------------------------------------- 1 | from .repo_commits_only_from_suspicious_unlinked_emails import * 2 | from .repo_has_stargazzers_who_joined_the_same_day import * 3 | from .repo_starred_by_suspicious_users import * 4 | from .user_has_only_commits_from_unlinked_emails import * 5 | from .user_has_only_forks import * 6 | from .user_looks_legit import UserLooksLegit 7 | 8 | ALL_HEURISTICS = { 9 | UserJustJoinedHeuristic(), 10 | UserMissingCommonFields(), 11 | UserHasOnlyCommitsFromUnlinkedEmails(), # can be a bit slow as it analyzes all commits from the user's repositories 12 | UserHasLowCommunityActivity(), 13 | RepoStarredBySuspiciousUsers(), 14 | RepoCommitsOnlyFromSuspiciousUnlinkedEmails(), 15 | UserHasForksFromTakenDownRepos(), 16 | UserHasOnlyForkedRepos(), 17 | RepoHasStargazersWhoJoinedOnTheSameDay() 18 | } 19 | -------------------------------------------------------------------------------- /ghbuster/__init__.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | 4 | class TargetType(enum.Enum): 5 | REPOSITORY = "repository" 6 | USER = "user" 7 | 8 | 9 | class TargetSpec: 10 | target_type: TargetType 11 | username: str = None 12 | repo_name: str = None 13 | 14 | def __init__(self, target_type: TargetType, username: str = None, repo_name: str = None): 15 | self.target_type = target_type 16 | self.username = username 17 | self.repo_name = repo_name 18 | 19 | def repo_full_name(self) -> str: 20 | if self.target_type == TargetType.REPOSITORY and self.username and self.repo_name: 21 | return f"{self.username}/{self.repo_name}" 22 | raise ValueError("Target is not a repository or missing username/repo_name") 23 | 24 | def __repr__(self): 25 | if self.target_type == TargetType.REPOSITORY: 26 | return f"GitHub 
repository {self.repo_full_name()}" 27 | elif self.target_type == TargetType.USER: 28 | return f"GitHub user {self.username}" 29 | return None 30 | -------------------------------------------------------------------------------- /ghbuster/heuristics/user_has_only_forks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import github 4 | 5 | from .base import MetadataHeuristic, HeuristicRunResult 6 | from .. import TargetType, TargetSpec 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | # e.g. https://github.com/sweetboy235 12 | class UserHasOnlyForkedRepos(MetadataHeuristic): 13 | def id(self) -> str: 14 | return 'user.repos_only_forks' 15 | 16 | def friendly_name(self) -> str: 17 | return "User has only forks" 18 | 19 | def description(self) -> str: 20 | return "Detects all of a user's repositories are forks. This may be an indication that the user is used solely to make other repositories appear legitimate." 21 | 22 | def target_type(self) -> TargetType: 23 | return TargetType.USER 24 | 25 | def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult: 26 | user = github_client.get_user(login=target_spec.username) 27 | user_repos = user.get_repos(type='owner') 28 | has_only_forks = user_repos.totalCount > 0 and not any(not repo.fork for repo in user_repos) 29 | 30 | if has_only_forks: 31 | additional_details = f"The user {target_spec.username} has only forked repositories." 
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Tests for the UserMissingCommonFields heuristic."""

    def setUp(self):
        self.heuristic = UserMissingCommonFields()

    def _run_against_profile(self, gh, name):
        """Run the heuristic on a mocked user whose only possibly-set field is ``name``."""
        spec = TargetSpec(target_type=TargetType.USER, username="user")
        profile = MagicMock(NamedUser)
        profile.bio = None
        profile.company = None
        profile.location = None
        profile.name = name
        gh.get_user.return_value = profile
        return self.heuristic.run(gh, spec)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_positive(self, gh):
        # No profile field is set at all: the heuristic must trigger.
        outcome = self._run_against_profile(gh, name=None)
        self.assertTrue(outcome.triggered)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_negative(self, gh):
        # A single populated field is enough to keep the heuristic quiet.
        outcome = self._run_against_profile(gh, name='John Doe')
        self.assertFalse(outcome.triggered)
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Tests for the UserJustJoinedHeuristic heuristic."""

    def setUp(self):
        self.heuristic = UserJustJoinedHeuristic()

    def _run_with_account_age(self, gh, username, age_in_days):
        """Run the heuristic on a mocked user created ``age_in_days`` days ago."""
        spec = TargetSpec(target_type=TargetType.USER, username=username)
        account = Mock(NamedUser)
        account.created_at = datetime.now(timezone.utc) - timedelta(days=age_in_days)
        gh.get_user.return_value = account
        return self.heuristic.run(gh, spec)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_positive(self, gh):
        # One day younger than the threshold: must trigger.
        outcome = self._run_with_account_age(gh, "newuser", UserJustJoinedHeuristic.THRESHOLD_DAYS - 1)
        self.assertTrue(outcome.triggered)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_negative(self, gh):
        # One day older than the threshold: must not trigger.
        outcome = self._run_with_account_age(gh, "olduser", UserJustJoinedHeuristic.THRESHOLD_DAYS + 1)
        self.assertFalse(outcome.triggered)
@dataclasses.dataclass
class GitHubEvent:
    """A single event row, as returned by a GitHub archive query."""
    event_type: str
    actor_login: str
    repo_name: str
    # Annotated with the class; the bare name `datetime` is the module here.
    created_at: datetime.datetime
    # The untouched row this event was built from.
    _raw: dict[str, str]

    @classmethod
    def from_dict(cls, data: dict[str, str]) -> 'GitHubEvent':
        """Build an event from a row dictionary.

        Missing string fields default to ``''``.

        :param data: Row with the keys ``event_type``, ``actor_login``,
            ``repo_name`` and ``created_at`` (an ISO-format timestamp).
        :return: A new GitHubEvent; ``data`` itself is kept in ``_raw``.
        :raises ValueError: if ``created_at`` is missing or not ISO-formatted.
        """
        # A classmethod, so an event can be built without already having one.
        return cls(
            event_type=data.get('event_type', ''),
            actor_login=data.get('actor_login', ''),
            repo_name=data.get('repo_name', ''),
            created_at=datetime.datetime.fromisoformat(data.get('created_at', '')),
            _raw=data,
        )
class HeuristicRunResult:
    """Outcome of running one heuristic against a target.

    :param triggered: True when the heuristic fired.
    :param additional_details: Free-form explanation of the outcome.
    :param heuristic: The heuristic that produced this result, if known.
    :param skipped: True when the heuristic did not reach a verdict.
    """

    def __init__(self, triggered: bool, additional_details: str = "", heuristic: 'MetadataHeuristic | None' = None,
                 skipped: bool = False):
        self.triggered = triggered
        self.additional_details = additional_details
        self.heuristic = heuristic
        self.skipped = skipped

    @staticmethod
    def TRIGGERED(additional_details: str = "") -> 'HeuristicRunResult':
        """Return a result for a heuristic that fired."""
        return HeuristicRunResult(triggered=True, additional_details=additional_details)

    @staticmethod
    def PASSED(additional_details: str = "") -> 'HeuristicRunResult':
        """Return a result for a heuristic that ran and did not fire."""
        # Forward the details; they were previously accepted and then dropped.
        return HeuristicRunResult(triggered=False, additional_details=additional_details)

    @staticmethod
    def SKIPPED() -> 'HeuristicRunResult':
        """Return a result for a heuristic that was skipped."""
        return HeuristicRunResult(triggered=False, skipped=True)
44 | """ 45 | pass 46 | 47 | @abstractmethod 48 | def id(self) -> str: 49 | pass 50 | 51 | @abstractmethod 52 | def friendly_name(self) -> str: 53 | pass 54 | 55 | @abstractmethod 56 | def description(self) -> str: 57 | pass 58 | -------------------------------------------------------------------------------- /tests/heuristics/test_user_has_only_forks.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | 4 | from github import NamedUser 5 | 6 | from ghbuster import TargetSpec, TargetType 7 | from ghbuster.heuristics.user_has_only_forks import UserHasOnlyForkedRepos 8 | from tests.test_utils.mock_utils import mock_pygithub_list 9 | 10 | 11 | class TestUserHasOnlyForks(unittest.TestCase): 12 | def setUp(self): 13 | self.heuristic = UserHasOnlyForkedRepos() 14 | 15 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 16 | def test_positive(self, gh): 17 | target_spec = TargetSpec(target_type=TargetType.USER, username="foo") 18 | user = Mock(NamedUser) 19 | user.get_repos = Mock(return_value=mock_pygithub_list([ 20 | Mock(fork=True, full_name="foo/repo1"), 21 | Mock(fork=True, full_name="foo/repo2") 22 | ])) 23 | gh.get_user.return_value = user 24 | 25 | result = self.heuristic.run(gh, target_spec) 26 | self.assertTrue(result.triggered) 27 | 28 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 29 | def test_negative(self, gh): 30 | target_spec = TargetSpec(target_type=TargetType.USER, username="foo") 31 | user = Mock(NamedUser) 32 | user.get_repos = Mock(return_value=mock_pygithub_list([ 33 | Mock(fork=False, full_name="foo/repo1"), 34 | Mock(fork=True, full_name="foo/repo2") 35 | ])) 36 | gh.get_user.return_value = user 37 | 38 | result = self.heuristic.run(gh, target_spec) 39 | self.assertFalse(result.triggered) 40 | 41 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 42 | def test_negative_empty(self, gh): 43 | target_spec = 
class UserLooksLegit(MetadataHeuristic):
    """Positive signal: triggers when a user profile shows strong signs of being authentic."""

    def id(self) -> str:
        return 'user.looks_legit'

    def friendly_name(self) -> str:
        return "User is likely legitimate"

    def description(self) -> str:
        return "The user is likely legitimate."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Check whether the targeted user passes every legitimacy criterion.

        :param github_client: GitHub client used to look the user up.
        :param target_spec: Target whose ``username`` is inspected.
        :return: TRIGGERED with a summary when all criteria hold, SKIPPED otherwise.
        """
        user = github_client.get_user(login=target_spec.username)

        # Without a creation date the account age cannot be computed, so no
        # verdict is possible (UserJustJoinedHeuristic guards the same way).
        if user.created_at is None:
            return HeuristicRunResult.SKIPPED()

        joined_days_ago = (datetime.now(timezone.utc) - user.created_at).days
        has_profile_details = (
            user.company is not None or user.location is not None or user.bio is not None
        )
        likely_legit = (
            user.public_repos > 10 and
            joined_days_ago > 365 and
            user.followers > 10 and
            user.following > 10 and
            user.name is not None and
            has_profile_details
        )
        if not likely_legit:
            return HeuristicRunResult.SKIPPED()

        # Built only on the triggering path, the only place it is used.
        additional_details = (
            "\n"
            f"- The user has {user.public_repos} public repos\n"
            f"- The user has {user.followers} followers, and is following {user.following} users.\n"
            f"- The user joined {joined_days_ago} days ago.\n"
            f"- The user has a name set on their profile ({user.name})\n"
            f"- The user has the usual fields set on their profile.\n"
        )
        return HeuristicRunResult.TRIGGERED(additional_details=additional_details)
class GitHubScanner:
    """Runs a set of heuristics against one GitHub user or repository."""

    def __init__(self, target_spec: TargetSpec, github_client: github.Github, heuristics: list[MetadataHeuristic]):
        self.target_spec = target_spec
        self.github_client = github_client
        self.heuristics = heuristics

    @staticmethod
    def _describe_error(error: github.GithubException) -> str:
        """Return the most specific description available for a GitHub API error."""
        data = error.data
        # The payload is not guaranteed to be a dict carrying 'message';
        # indexing it blindly would raise and hide the original failure.
        if isinstance(data, dict) and data.get('message'):
            return str(data['message'])
        return str(data) if data else f"status code {error.status}"

    def ensure_authenticated(self):
        """Check that the client's credentials are accepted by GitHub.

        :raises ValueError: if GitHub rejects the request.
        """
        try:
            current_user = self.github_client.get_user()
            print(f"Authenticated as {current_user.login}")
        except github.GithubException as e:
            raise ValueError(
                f"Authentication failed. Please check your GitHub token (status code {e.status})") from e

    def validate_target_spec(self):
        """Check that the targeted user or repository can be fetched.

        :raises ValueError: if the target cannot be fetched, or if its type is unsupported.
        """
        if self.target_spec.target_type == TargetType.REPOSITORY:
            full_name = f"{self.target_spec.username}/{self.target_spec.repo_name}"
            try:
                self.github_client.get_repo(full_name)
            except github.GithubException as e:
                raise ValueError(f"Invalid repository '{full_name}': {self._describe_error(e)}") from e
        elif self.target_spec.target_type == TargetType.USER:
            try:
                self.github_client.get_user(self.target_spec.username)
            except github.GithubException as e:
                raise ValueError(
                    f"Invalid user '{self.target_spec.username}': {self._describe_error(e)}") from e
        else:
            raise ValueError("Unsupported target type")

    def scan(self) -> list[HeuristicRunResult]:
        """Run every heuristic whose target type matches the target.

        :return: One result per heuristic run, each tagged with its heuristic.
        """
        results = []
        for heuristic in self.heuristics:
            if heuristic.target_type() != self.target_spec.target_type:
                continue

            logger.debug("Running heuristic %s on %s", heuristic.id(), self.target_spec)
            result = heuristic.run(self.github_client, self.target_spec)
            result.heuristic = heuristic
            results.append(result)
        return results
def generate_docs() -> str:
    """Render every registered heuristic as two Markdown tables.

    The repository heuristics come first, then the user heuristics; within
    each table the rows are ordered by heuristic ID.
    """
    ordered = sorted(ALL_HEURISTICS, key=lambda h: h.id())

    def render_section(title: str, wanted_type) -> str:
        # One heading, the table header, and one row per matching heuristic.
        parts = [
            f"### {title}\n\n",
            '| **ID** | **Name** | **Description** |\n',
            '|:-:|:-:|:-:|\n',
        ]
        for entry in ordered:
            if entry.target_type() != wanted_type:
                continue
            # Link each ID to the source file that defines the heuristic.
            source_path = entry.__module__.replace('.', '/') + '.py'
            one_line_description = entry.description().replace("\n", "")
            parts.append(
                f'| [{entry.id()}](./{source_path}) | {entry.friendly_name()} | {one_line_description} |\n'
            )
        parts.append("\n\n")
        return ''.join(parts)

    return (
        render_section("Repository heuristics", TargetType.REPOSITORY)
        + render_section("GitHub user heuristics", TargetType.USER)
    )
def resolve_heuristics(included_heuristics: set[str], excluded_heuristics: set[str]) -> list[MetadataHeuristic]:
    """Select the heuristics to run from ALL_HEURISTICS.

    :param included_heuristics: When non-empty, only heuristics with these IDs
        are kept; this takes precedence over ``excluded_heuristics``.
    :param excluded_heuristics: When non-empty (and nothing is explicitly
        included), heuristics with these IDs are dropped.
    :return: The selected heuristics; all of them when both sets are empty.
    """
    if included_heuristics:
        return [h for h in ALL_HEURISTICS if h.id() in included_heuristics]
    if excluded_heuristics:
        # Must test the exclusion set: testing the (empty) inclusion set here
        # kept every heuristic and made the exclusions a no-op.
        return [h for h in ALL_HEURISTICS if h.id() not in excluded_heuristics]
    return list(ALL_HEURISTICS)
class UserJustJoinedHeuristic(MetadataHeuristic):
    """Triggers when the user's account is younger than THRESHOLD_DAYS days."""
    THRESHOLD_DAYS = 7

    def id(self) -> str:
        return 'user.just_joined'

    def friendly_name(self) -> str:
        return "User recently joined GitHub"

    def description(self) -> str:
        return f"The GitHub user joined the platform less than {self.THRESHOLD_DAYS} days ago."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Compare the account age, in whole days, with THRESHOLD_DAYS.

        :return: TRIGGERED when the account is younger than the threshold;
            PASSED otherwise, including when no creation date is available.
        """
        account = github_client.get_user(login=target_spec.username)
        created = account.created_at
        if created is None:
            return HeuristicRunResult.PASSED()

        age_in_days = (datetime.now(timezone.utc) - created).days
        if age_in_days < self.THRESHOLD_DAYS:
            additional_details = f"User {target_spec.username} joined GitHub on {created.strftime('%Y-%m-%d')} ({age_in_days} days ago)."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)
        return HeuristicRunResult.PASSED()
# e.g. https://github.com/heidarodeer/crypto-clipper/stargazers
class RepoHasStargazersWhoJoinedOnTheSameDay(MetadataHeuristic):
    """Triggers when a large share of a repository's stargazers joined GitHub on one day."""
    # Minimum share (in percent) of analysed stargazers sharing a join day.
    THRESHOLD_PERCENT = 50
    # Repositories with fewer stargazers than this are not analysed.
    MIN_STARGAZERS = 2
    # At most this many stargazers are analysed.
    MAX_STARGAZERS = 100

    def id(self) -> str:
        return 'repo.stargazers_joined_same_day'

    def friendly_name(self) -> str:
        return "Repository has stargazers who joined the same day"

    def description(self) -> str:
        return "Detects when a repository has a large proportion of its stargazers who joined GitHub on the same day, which may indicate a coordinated effort to boost the repository's popularity."

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Group the analysed stargazers by join date and look for a dominant day.

        :param github_client: GitHub client used to fetch the repository.
        :param target_spec: Repository target to analyse.
        :return: TRIGGERED for the first join day reaching THRESHOLD_PERCENT
            of the analysed stargazers, PASSED otherwise.
        """
        # Imported locally so the block does not depend on what the module's
        # wildcard import happens to provide.
        from collections import Counter
        from itertools import islice

        repo = github_client.get_repo(full_name_or_id=target_spec.repo_full_name())
        all_stargazers = repo.get_stargazers()
        total_stargazers = all_stargazers.totalCount

        if total_stargazers < self.MIN_STARGAZERS:
            logger.debug("Repository %s has too few stargazers (%d) to analyze.", target_spec.repo_full_name(),
                         total_stargazers)
            return HeuristicRunResult.PASSED()

        if total_stargazers > self.MAX_STARGAZERS:
            logger.debug("Repository %s has too many stargazers (%d) to analyze, limiting to %d.",
                         target_spec.repo_full_name(), total_stargazers, self.MAX_STARGAZERS)

        # Take a bounded sample by iterating: a slice of the result list is not
        # guaranteed to expose totalCount, and percentages must be relative to
        # the stargazers actually analysed.
        analyzed = list(islice(all_stargazers, self.MAX_STARGAZERS))

        logger.info("Analyzing the creation date of %d stargazers", len(analyzed))
        stargazers_by_join_day = Counter(
            stargazer.created_at.strftime("%Y-%m-%d") for stargazer in analyzed
        )

        # Report the first join day (in order of appearance) reaching the threshold.
        for join_day, joined_count in stargazers_by_join_day.items():
            pct_joined_on_that_day = 100 * joined_count / len(analyzed)
            if pct_joined_on_that_day >= self.THRESHOLD_PERCENT:
                additional_details = (
                    f"Repository {target_spec.repo_full_name()} has {joined_count} stargazers "
                    f"({pct_joined_on_that_day} %) who joined on the same day, {join_day}."
                )
                return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()
Mock(return_value=Mock(totalCount=UserHasLowCommunityActivity.FOLLOWING_THRESHOLD + 1)) 34 | user.get_followers = Mock(return_value=Mock(totalCount=UserHasLowCommunityActivity.FOLLOWERS_THRESHOLD + 1)) 35 | gh.get_user.return_value = user 36 | gh.search_issues.return_value = Mock(totalCount=UserHasLowCommunityActivity.ISSUES_OR_PR_THRESHOLD + 1) 37 | result = self.heuristic.run(gh, target_spec) 38 | self.assertFalse(result.triggered) 39 | 40 | @patch('ghbuster.heuristics.user_has_low_community_activity.github.Github') 41 | def test_negative_single_attribute_ok(self, gh): 42 | target_spec = TargetSpec(target_type=TargetType.USER, username="foo") 43 | user = Mock(NamedUser) 44 | user.login = target_spec.username 45 | user.get_starred = Mock(return_value=Mock(totalCount=UserHasLowCommunityActivity.STARS_THRESHOLD + 1)) 46 | user.get_following = Mock(return_value=Mock(totalCount=UserHasLowCommunityActivity.FOLLOWING_THRESHOLD - 1)) 47 | user.get_followers = Mock(return_value=Mock(totalCount=UserHasLowCommunityActivity.FOLLOWERS_THRESHOLD - 1)) 48 | gh.get_user.return_value = user 49 | gh.search_issues.return_value = Mock(totalCount=UserHasLowCommunityActivity.ISSUES_OR_PR_THRESHOLD - 1) 50 | result = self.heuristic.run(gh, target_spec) 51 | self.assertFalse(result.triggered) 52 | -------------------------------------------------------------------------------- /ghbuster/cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from argparse import ArgumentParser 5 | 6 | from . 
def _cli() -> ArgumentParser:
    """Build the ghbuster command-line argument parser."""
    parser = ArgumentParser(
        prog="ghbuster",
        exit_on_error=False,
        description="Identify inauthentic GitHub accounts and repositories",
    )

    parser.add_argument("target", type=str,
                        help="Target GitHub repository or user to scan, e.g., 'owner/repo', `username`, or 'https://github.com/owner/repo'.")
    parser.add_argument("--github-token", type=str,
                        help="GitHub token for authentication. If not provided, the GITHUB_TOKEN environment variable is used",
                        required=False, default=os.environ.get("GITHUB_TOKEN"))
    parser.add_argument("--debug", action="store_true", help="Enable debug logging", dest="enable_debug", default=False)
    parser.add_argument("--include", nargs="+", help="Heuristics to include (any other heuristic will not be ran)",
                        default=[])
    parser.add_argument("--exclude", nargs="+", help="Heuristics to exclude", default=[])
    parser.add_argument("--force", action="store_true", default=False)
    return parser


class CliArguments:
    """Validated command-line options, populated by parse_and_validate_args()."""
    target_spec: TargetSpec  # user or repository to scan
    github_token: str  # never empty once validation succeeded
    log_level: int  # logging.DEBUG when --debug is set, logging.INFO otherwise
    excluded_heuristics: set[str]  # values passed to --exclude
    included_heuristics: set[str]  # values passed to --include
    force: bool  # value of the --force flag


def parse_and_validate_args(args) -> CliArguments:
    """Parse ``args`` and turn them into a validated CliArguments instance.

    The target is lowercased and may be given as ``username``, ``owner/repo``
    or the corresponding ``https://github.com/...`` URL; trailing slashes are
    ignored.

    Raises:
        ValueError: if the target is malformed, no GitHub token is available,
            or both --include and --exclude are given.
    """
    parsed = _cli().parse_args(args)
    cli_args = CliArguments()

    # Determine target type and parse repository or user.
    # Trailing slashes are dropped so that 'https://github.com/owner/repo/' is
    # accepted and 'username/' is not mistaken for a repository.
    normalized_target = parsed.target.strip().lower().removeprefix("https://github.com/").rstrip("/")

    if '/' in normalized_target:
        # It's a repository
        parts = normalized_target.split('/')
        if len(parts) != 2 or not all(parts):
            # also rejects empty components such as '/repo'
            raise ValueError("Invalid repository format. Expected 'owner/repo'.")
        owner, repo_name = parts
        cli_args.target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username=owner, repo_name=repo_name)
    else:
        # It's a user
        if not re.match(r'^[a-zA-Z0-9-]+$', normalized_target):
            # "Username may only contain alphanumeric characters or single hyphens, and cannot begin or end with a hyphen." (from the GitHub homepage)
            raise ValueError("Invalid GitHub username format")
        cli_args.target_spec = TargetSpec(target_type=TargetType.USER, username=normalized_target)

    # Github token
    cli_args.github_token = parsed.github_token
    if not cli_args.github_token:
        raise ValueError(
            "GitHub token is required. Please provide it via the --github-token argument or set the GITHUB_TOKEN environment variable.")

    # Log level
    cli_args.log_level = logging.DEBUG if parsed.enable_debug else logging.INFO

    # Heuristics selection
    if parsed.include and parsed.exclude:
        raise ValueError("--include and --exclude are mutually exclusive.")
    cli_args.included_heuristics = set(parsed.include)
    cli_args.excluded_heuristics = set(parsed.exclude)

    cli_args.force = parsed.force
    return cli_args
class UserHasForksFromTakenDownRepos(MetadataHeuristic):
    """User heuristic: the user owns forks whose parent repository is gone.

    Example account: https://github.com/mrrebrik3765

    Only the first ``max_forks_to_analyze`` forks returned by the API are
    checked.
    """

    def __init__(self, max_forks_to_analyze: int = 10):
        super().__init__()
        self.max_forks_to_analyze = max_forks_to_analyze
        # Memo of repo_exists() answers, scoped to this instance. A
        # functools.cache on the method would key on `self` and keep every
        # instance (and its client) alive for the lifetime of the process.
        self._repo_exists_cache: dict = {}

    def id(self) -> str:
        return 'user.forks_from_taken_down_repos'

    def friendly_name(self) -> str:
        return "User has forks of taken-down repositories"

    def description(self) -> str:
        return "Detects when a user has forks from repositories that have been taken down. This may indicate that the user is being leveraged as part of a campaign to make inauthentic repositories appear legitimate."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Check the user's forks and trigger if any parent repository no longer exists.

        Raises:
            github.GithubException: for API errors other than a blocked fork.
        """
        user = github_client.get_user(login=target_spec.username)
        user_repos = user.get_repos(type='owner')
        taken_down_repos = set()
        num_forks_analyzed = 0
        for repo in user_repos:
            if not repo.fork:
                continue
            if num_forks_analyzed >= self.max_forks_to_analyze:
                logger.debug("Reached maximum number of forks to analyze (%d), stopping further checks",
                             self.max_forks_to_analyze)
                break
            num_forks_analyzed += 1
            logger.debug("Analyzing forked repository %s owned by user %s", repo.full_name, target_spec.username)
            try:
                parent = repo.parent
                if parent is None:
                    # Nothing to look up; skip instead of failing on `.full_name`.
                    logger.debug("Forked repository %s has no parent information, ignoring", repo.full_name)
                    continue
                original_name = parent.full_name
                if not self.repo_exists(github_client, original_name):
                    taken_down_repos.add(original_name)
            except github.GithubException as e:
                if e.status in [403, 451] and e.message == 'Repository access blocked':
                    # tos violation, i.e. soft takedown
                    # TODO should probably be taken into account by the heuristic
                    logger.warning("Repository %s owned by user %s is blocked, ignoring", repo.full_name,
                                   target_spec.username)
                else:
                    raise

        if taken_down_repos:
            # sorted() keeps the message stable from one run to the next
            additional_details = f"The user {target_spec.username} has forks from taken down repositories: {', '.join(sorted(taken_down_repos))}."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()

    def repo_exists(self, github_client: github.Github, full_name: str) -> bool:
        """Return whether ``full_name`` can currently be fetched with ``github_client``.

        A 404, or a 403/451 'Repository access blocked' answer, counts as
        "does not exist". Answers are memoized per instance; errors are not.

        Raises:
            github.GithubException: for any other API error.
        """
        cache_key = (github_client, full_name)
        if cache_key in self._repo_exists_cache:
            return self._repo_exists_cache[cache_key]

        try:
            github_client.get_repo(full_name)
            exists = True
        except github.GithubException as e:
            # 404: gone; 403/451 + "access blocked": tos violation or other access issues
            is_blocked = e.status in [403, 451] and e.message == 'Repository access blocked'
            if e.status != 404 and not is_blocked:
                raise
            exists = False

        self._repo_exists_cache[cache_key] = exists
        return exists
27 | 28 | def target_type(self) -> TargetType: 29 | return TargetType.USER 30 | 31 | def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult: 32 | user = github_client.get_user(login=target_spec.username) 33 | start_date = datetime.now(timezone.utc) - timedelta(days=self.ISSUES_OR_PR_TIME_PERIOD_DAYS) 34 | stars = user.get_starred() 35 | following = user.get_following() 36 | followers = user.get_followers() 37 | has_few_stars = stars.totalCount <= self.STARS_THRESHOLD 38 | has_few_following = following.totalCount <= self.FOLLOWING_THRESHOLD 39 | has_few_followers = followers.totalCount <= self.FOLLOWERS_THRESHOLD 40 | 41 | has_few_issues_or_prs = False 42 | issue_count = 0 43 | pr_count = 0 44 | try: 45 | issues = github_client.search_issues( 46 | f"type:issue author:{target_spec.username} created:>{start_date.strftime('%Y-%m-%d')}") 47 | prs = github_client.search_issues( 48 | f"type:pr author:{target_spec.username} created:>{start_date.strftime('%Y-%m-%d')}") 49 | 50 | issue_count = issues.totalCount 51 | pr_count = prs.totalCount 52 | has_few_issues_or_prs = issue_count + pr_count <= self.ISSUES_OR_PR_THRESHOLD 53 | 54 | except github.GithubException as e: 55 | if e.status == 422 and e.message == 'Validation Failed': 56 | logger.info("User %s has their profile activity in private mode, unable to list their PRs and issues", 57 | user.login) 58 | else: 59 | raise e 60 | 61 | logger.debug("User %s has %d stars, %d following, %d followers, %d issues, and %d PRs in the last %d days.", 62 | target_spec.username, stars.totalCount, following.totalCount, followers.totalCount, 63 | issue_count, pr_count, self.ISSUES_OR_PR_TIME_PERIOD_DAYS) 64 | 65 | if not has_few_stars or not has_few_following or not has_few_followers or not has_few_issues_or_prs: 66 | return HeuristicRunResult.PASSED() 67 | 68 | reason = 'User has low community activity: ' 69 | triggered = [] 70 | if has_few_stars: 71 | triggered.append(f"{stars.totalCount} stars 
(threshold: {self.STARS_THRESHOLD})") 72 | if has_few_following: 73 | triggered.append(f"{following.totalCount} following (threshold: {self.FOLLOWING_THRESHOLD})") 74 | if has_few_followers: 75 | triggered.append(f"{followers.totalCount} followers (threshold: {self.FOLLOWERS_THRESHOLD})") 76 | if has_few_issues_or_prs: 77 | triggered.append( 78 | f"{issues.totalCount + prs.totalCount} issues/PRs in the last {self.ISSUES_OR_PR_TIME_PERIOD_DAYS} days (threshold: {self.ISSUES_OR_PR_THRESHOLD})") 79 | 80 | reason += ', '.join(triggered) 81 | return HeuristicRunResult.TRIGGERED(additional_details=reason) 82 | -------------------------------------------------------------------------------- /tests/heuristics/test_repo_has_stargazzers_who_joined_the_same_day.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime 3 | from unittest.mock import patch, Mock 4 | 5 | from github import Repository 6 | 7 | from ghbuster import TargetSpec, TargetType 8 | from ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day import RepoHasStargazersWhoJoinedOnTheSameDay 9 | from tests.test_utils.date_utils import random_date 10 | from tests.test_utils.mock_utils import mock_pygithub_list 11 | 12 | 13 | class TestUserHasLowCommunityActivity(unittest.TestCase): 14 | def setUp(self): 15 | self.heuristic = RepoHasStargazersWhoJoinedOnTheSameDay() 16 | 17 | @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github') 18 | def test_positive_all_stargazers_joined_same_day(self, gh): 19 | target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username="user", repo_name="repo") 20 | ghrepo = Mock(Repository) 21 | ghrepo.get_stargazers = Mock(return_value=mock_pygithub_list([ 22 | Mock(login="user1", created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")), 23 | Mock(login="user2", created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")), 24 
| Mock(login="user3", created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")), 25 | Mock(login="user4", created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")) 26 | ])) 27 | gh.get_repo.return_value = ghrepo 28 | 29 | result = self.heuristic.run(gh, target_spec) 30 | self.assertTrue(result.triggered) 31 | 32 | @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github') 33 | def test_positive_threshold_of_stargazers_joined_same_day(self, gh): 34 | target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username="user", repo_name="repo") 35 | ghrepo = Mock(Repository) 36 | num_users = 10 37 | num_joined_same_day = round(RepoHasStargazersWhoJoinedOnTheSameDay.THRESHOLD_PERCENT / 100 * num_users) + 2 38 | same_day_users = [ 39 | Mock(login=f"user{i}", created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")) 40 | for i in range(num_joined_same_day) 41 | ] 42 | other_users = [ 43 | Mock(login=f"user{i}", created_at=random_date()) 44 | for i in range(num_users - num_joined_same_day) 45 | ] 46 | ghrepo.get_stargazers = Mock(return_value=mock_pygithub_list(same_day_users + other_users)) 47 | gh.get_repo.return_value = ghrepo 48 | 49 | result = self.heuristic.run(gh, target_spec) 50 | self.assertTrue(result.triggered) 51 | 52 | @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github') 53 | def test_negative_not_enough_stargazers(self, gh): 54 | target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username="user", repo_name="repo") 55 | ghrepo = Mock(Repository) 56 | ghrepo.get_stargazers = Mock(return_value=mock_pygithub_list([])) 57 | gh.get_repo.return_value = ghrepo 58 | 59 | result = self.heuristic.run(gh, target_spec) 60 | self.assertFalse(result.triggered) 61 | 62 | @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github') 63 | def test_negative_too_many_stargazers(self, gh): 64 | target_spec = 
# e.g. https://github.com/al1enb1t/cheatengine-for-linux
# or https://github.com/Caztemaz/Lnk-Exploit-FileBinder-Certificate-Spoofer-Reg-Doc-Cve-Rce in the case of taken down users
class RepoCommitsOnlyFromSuspiciousUnlinkedEmails(MetadataHeuristic):
    """Repository heuristic: every inspected commit comes from a "suspiciously unlinked" email.

    A commit is considered suspicious when either:
      * it is not linked to any GitHub user and the git author name matches
        neither the owner's login nor the owner's full name, or
      * it is linked to a GitHub user that cannot be fetched anymore (404).

    At most ``MAX_COMMITS`` commits are inspected.
    """

    MAX_COMMITS = 100  # upper bound on the number of commits inspected

    def __init__(self):
        super().__init__()
        # Memo of commit_linked_to_taken_down_user() answers by GitHub user id,
        # scoped to this instance rather than to the whole process.
        self._taken_down_by_user_id: dict = {}

    def id(self) -> str:
        return 'repo.commits_suspicious_unlinked_emails'

    def friendly_name(self) -> str:
        return "Repository commits only from suspicious unlinked emails"

    def description(self) -> str:
        return "Detects when a repository has commits with unlinked emails that also don't match the owner's username or full name."

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Trigger when all inspected commits of the repository are suspicious.

        A repository for which no commit could be inspected passes.
        """
        repo = github_client.get_repo(full_name_or_id=target_spec.repo_full_name())
        user = github_client.get_user(login=target_spec.username)

        # Lowercased names under which the owner is expected to commit.
        owner_names = {user.login.lower()}
        if user.name:
            owner_names.add(user.name.lower())

        commits = repo.get_commits()
        logger.debug("Analyzing %d commits for repository %s", commits.totalCount, target_spec.repo_full_name())
        num_suspicious = 0
        num_processed = 0
        unlinked_emails = set()
        for commit in commits:
            if num_processed == self.MAX_COMMITS:
                logger.debug("Reached max commit limit of %d, stopping the processing", self.MAX_COMMITS)
                break
            num_processed += 1

            git_author = commit.commit.author
            # The git author name may be missing; treat that as "no match".
            name_matches = (git_author.name or "").lower() in owner_names
            if commit.author is None:
                # Case 1: the commit is not linked to any GitHub user based on the email
                # As it's a common misconfiguration, we only flag it if the author name in the git metadata doesn't match the user's username/name
                is_suspicious = not name_matches
            else:
                # Case 2: the commit is linked to a GitHub user that has previously been taken down, we consider it "suspiciously-unlinked" too
                is_suspicious = self.commit_linked_to_taken_down_user(github_client, commit.author)

            if is_suspicious:
                num_suspicious += 1
                if git_author.email:
                    unlinked_emails.add(git_author.email)

        if num_processed == 0:
            # "all of zero commits are suspicious" must not count as a finding
            logger.debug("No commit to analyze for repository %s", target_spec.repo_full_name())
            return HeuristicRunResult.PASSED()

        if num_suspicious == num_processed:
            # sorted() keeps the message stable from one run to the next
            additional_details = f"The repository only has commits from unlinked emails ({', '.join(sorted(unlinked_emails))})."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()

    def commit_linked_to_taken_down_user(self, github_client: github.Github, author: NamedUser) -> bool:
        """Return True when ``author`` (looked up by id) no longer exists on GitHub.

        Raises:
            github.GithubException: for any API error other than a 404.
        """
        # We know the commit is linked to a specific GitHub user, i.e. the git metadata email was linked to a specific user at the time of the commit
        # In some cases, the associated user doesn't exist anymore (e.g. taken down), so we consider the email as "currently unlinked" in this case
        # (like we'd see in the GitHub UI that the username is not clickable)
        cache = self.__dict__.setdefault("_taken_down_by_user_id", {})
        if author.id in cache:
            return cache[author.id]

        try:
            github_client.get_user_by_id(author.id)
            taken_down = False  # no exception, the user exists
        except github.GithubException as e:
            if e.status != 404:
                raise
            taken_down = True

        cache[author.id] = taken_down
        return taken_down

7 | ghbuster 8 |

9 | 10 | ## Usage 11 | 12 | Install `uv` if needed: 13 | 14 | ```bash 15 | brew install astral-sh/uv/uv 16 | # or see https://docs.astral.sh/uv/getting-started/installation/ for other platforms 17 | ``` 18 | 19 | You can then install ghbuster: 20 | 21 | ```bash 22 | uv pip install "git+https://github.com/DataDog/ghbuster.git" 23 | ``` 24 | 25 | Then run it using: 26 | 27 | ```bash 28 | export GITHUB_TOKEN= 29 | ghbuster 30 | ``` 31 | 32 | ## Heuristics 33 | 34 | 35 | ### Repository heuristics 36 | 37 | | **ID** | **Name** | **Description** | 38 | |:-:|:-:|:-:| 39 | | [repo.commits_suspicious_unlinked_emails](./ghbuster/heuristics/repo_commits_only_from_suspicious_unlinked_emails.py) | Repository commits only from suspicious unlinked emails | Detects when a repository has commits with unlinked emails that also don't match the owner's username or full name. | 40 | | [repo.stargazers_joined_same_day](./ghbuster/heuristics/repo_has_stargazzers_who_joined_the_same_day.py) | Repository has stargazers who joined the same day | Detects when a repository has a large proportion of its stargazers who joined GitHub on the same day, which may indicate a coordinated effort to boost the repository's popularity. | 41 | | [repo.starred_by_suspicious_users](./ghbuster/heuristics/repo_starred_by_suspicious_users.py) | Repository starred by suspicious users | Detects when a repository has over 80 % of stars from suspicious users matching heuristics they may be inauthentic. | 42 | 43 | 44 | ### GitHub user heuristics 45 | 46 | | **ID** | **Name** | **Description** | 47 | |:-:|:-:|:-:| 48 | | [user.commits_unlinked_emails](./ghbuster/heuristics/user_has_only_commits_from_unlinked_emails.py) | User has only commits from unlinked emails | Detects when all of a user's commits are from emails not linked to their GitHub profiles. This may indicate a threat actor leveraging distinct inauthentic accounts. 
| 49 | [user.forks_from_taken_down_repos](./ghbuster/heuristics/user_has_forks_from_taken_down_repos.py) | User has forks of taken-down repositories | Detects when a user has forks from repositories that have been taken down. This may indicate that the user is being leveraged as part of a campaign to make inauthentic repositories appear legitimate. | 50 | [user.just_joined](./ghbuster/heuristics/user_metadata_basic.py) | User recently joined GitHub | The GitHub user joined the platform less than 7 days ago. | 51 | [user.low_community_activity](./ghbuster/heuristics/user_has_low_community_activity.py) | User with low community activity | Detects when a user has very low community activity. This may indicate that the user is inauthentic. | 52 | [user.missing_common_fields](./ghbuster/heuristics/user_metadata_basic.py) | User has none of the common profile fields set | Detects when a GitHub user is missing a number of highly-common fields (name, company, bio, location) in their profile. | 53 | [user.repos_only_forks](./ghbuster/heuristics/user_has_only_forks.py) | User has only forks | Detects when all of a user's repositories are forks. This may be an indication that the user is used solely to make other repositories appear legitimate. 
class RepoStarredBySuspiciousUsers(MetadataHeuristic):
    """Repository heuristic: most stargazers trigger user-level heuristics.

    Each stargazer is first checked with ``UserLooksLegit``; the remaining
    ones are run through the user heuristics returned by
    ``get_heuristics_to_run_for_user``. The heuristic triggers when at least
    ``PERCENT_THRESHOLD`` percent of all stargazers triggered one of them.
    Repositories with more than ``MAX_STARGAZERS`` stargazers are not analyzed.
    """

    PERCENT_THRESHOLD = 80  # minimum share (in percent) of suspicious stargazers
    MAX_STARGAZERS = 101  # repositories with more stargazers are ignored

    def id(self) -> str:
        return 'repo.starred_by_suspicious_users'

    def friendly_name(self) -> str:
        return "Repository starred by suspicious users"

    def description(self) -> str:
        return f"Detects when a repository has over {round(self.PERCENT_THRESHOLD)} % of stars from suspicious users matching heuristics they may be inauthentic."

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Run the user heuristics on every stargazer of the target repository."""
        # Here we want heuristics that are quick to run
        repo_full_name = target_spec.repo_full_name()
        repo = github_client.get_repo(full_name_or_id=repo_full_name)
        all_stargazers = repo.get_stargazers()
        stargazer_count = all_stargazers.totalCount

        if stargazer_count == 0:
            logger.debug("Repository %s has no stargazers.", repo_full_name)
            return HeuristicRunResult.PASSED()
        if stargazer_count > self.MAX_STARGAZERS:
            logger.info("Repository %s has too many stargazers (%d) to analyze, ignoring it.",
                        repo_full_name, stargazer_count)
            return HeuristicRunResult.PASSED()

        logger.info("Analyzing %d stargazers for repository %s", stargazer_count, repo_full_name)
        suspicious_stargazers = {}  # mapping from username to the list of triggered heuristics for this user
        for stargazer in all_stargazers:
            user = github_client.get_user(login=stargazer.login)
            # Every user-level heuristic must be given the stargazer as its
            # target, not the repository being scanned (whose `username` is
            # the repository owner).
            user_spec = TargetSpec(target_type=TargetType.USER, username=user.login)

            if UserLooksLegit().run(github_client, user_spec).triggered:
                logger.info("The user %s exhibits strong characteristics of a legitimate user, skipping", user.login)
                continue

            user_heuristics = self.get_heuristics_to_run_for_user(user)
            logger.info("Analyzing if stargazer %s looks suspicious by running %d heuristics", user.login,
                        len(user_heuristics))
            for heuristic in user_heuristics:
                result = heuristic.run(github_client, user_spec)
                if result.triggered:
                    logger.debug("Stargazer %s triggered heuristic %s", user.login, heuristic.id())
                    suspicious_stargazers.setdefault(user.login, []).append(heuristic.id())

        # stargazer_count is known to be > 0 here (see the early return above)
        ratio = 100 * len(suspicious_stargazers) / stargazer_count
        if ratio >= self.PERCENT_THRESHOLD:
            additional_details = f"The repository has {len(suspicious_stargazers)} stargazers ({round(ratio)} %) that triggered suspicious user heuristics: {', '.join(suspicious_stargazers.keys())}."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()

    @staticmethod
    def get_heuristics_to_run_for_user(user: NamedUser) -> set[MetadataHeuristic]:
        """Return the user heuristics to apply to ``user``.

        The fork takedown check is only added for users with at most 100
        followers.
        """
        heuristics: set[MetadataHeuristic] = {
            UserJustJoinedHeuristic(),
            UserMissingCommonFields(),
            UserHasLowCommunityActivity(),
            UserHasOnlyForkedRepos()
        }

        if user.followers <= 100:
            # since this heuristic can take time, we only run it for users that have a higher chance of being inauthentic
            heuristics.add(UserHasForksFromTakenDownRepos(max_forks_to_analyze=20))

        return heuristics
repo1.parent = parent_repo1 22 | 23 | parent_repo2 = Mock(Repository, full_name='foo/repo2') # not existing anymore 24 | repo2 = Mock(spec=Repository, fork=True, full_name="fork/repo2") 25 | repo2.parent = parent_repo2 26 | user_repos = [repo1, repo2] 27 | 28 | other_repos = [ 29 | parent_repo1 30 | # note: parent_repo2 is not included here, simulating that it has been taken down 31 | ] 32 | 33 | def side_effect(repo_name): 34 | user_repos_by_name = {repo.full_name: repo for repo in user_repos} 35 | other_repos_by_name = {repo.full_name: repo for repo in other_repos} 36 | if repo_name in user_repos_by_name: 37 | return user_repos_by_name[repo_name] 38 | elif repo_name in other_repos_by_name: 39 | return other_repos_by_name[repo_name] 40 | else: 41 | raise GithubException(status=404) 42 | 43 | gh.get_repo.side_effect = side_effect 44 | gh.get_user.return_value.get_repos = Mock(return_value=mock_pygithub_list(user_repos)) 45 | result = self.heuristic.run(gh, target_spec) 46 | self.assertTrue(result.triggered) 47 | 48 | @patch('ghbuster.heuristics.user_has_forks_from_taken_down_repos.github.Github') 49 | def test_negative_no_repo(self, gh): 50 | target_spec = TargetSpec(target_type=TargetType.USER, username="fork") 51 | gh.get_user.return_value.get_repos = Mock(return_value=mock_pygithub_list([])) 52 | result = self.heuristic.run(gh, target_spec) 53 | self.assertFalse(result.triggered) 54 | 55 | @patch('ghbuster.heuristics.user_has_forks_from_taken_down_repos.github.Github') 56 | def test_negative_forks_with_existing_parents(self, gh): 57 | target_spec = TargetSpec(target_type=TargetType.USER, username="fork") 58 | 59 | parent_repo1 = Mock(Repository, full_name='foo/repo1') 60 | repo1 = Mock(spec=Repository, fork=True, full_name="fork/repo1") 61 | repo1.parent = parent_repo1 62 | 63 | parent_repo2 = Mock(Repository, full_name='foo/repo2') # still exists in this scenario: it is listed in other_repos below 64 | repo2 = Mock(spec=Repository, fork=True, full_name="fork/repo2") 65 | repo2.parent = parent_repo2 66 | 
user_repos = [repo1, repo2] 67 | 68 | other_repos = [ 69 | parent_repo1, 70 | parent_repo2 71 | ] 72 | user_repos_by_name = {repo.full_name: repo for repo in user_repos} 73 | other_repos_by_name = {repo.full_name: repo for repo in other_repos} 74 | 75 | gh.get_repo.side_effect = lambda repo_name: user_repos_by_name[ 76 | repo_name] if repo_name in user_repos_by_name else other_repos_by_name[repo_name] 77 | gh.get_user.return_value.get_repos = Mock(return_value=mock_pygithub_list(user_repos)) 78 | result = self.heuristic.run(gh, target_spec) 79 | self.assertFalse(result.triggered) 80 | 81 | @patch('ghbuster.heuristics.user_has_forks_from_taken_down_repos.github.Github') 82 | def test_negative_no_forks(self, gh): 83 | target_spec = TargetSpec(target_type=TargetType.USER, username="fork") 84 | 85 | user_repos = [ 86 | Mock(spec=Repository, fork=False, full_name="foo/repo1"), 87 | Mock(spec=Repository, fork=False, full_name="foo/repo2") 88 | ] 89 | 90 | gh.get_repo.return_value = Mock(return_value=mock_pygithub_list(user_repos)) 91 | gh.get_user.return_value.get_repos = Mock(return_value=mock_pygithub_list(user_repos)) 92 | result = self.heuristic.run(gh, target_spec) 93 | self.assertFalse(result.triggered) 94 | -------------------------------------------------------------------------------- /ghbuster/heuristics/graph.py: -------------------------------------------------------------------------------- 1 | from pyvis.network import Network 2 | 3 | from .user_has_low_community_activity import * 4 | from .user_metadata_basic import * 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | import networkx as nx 9 | 10 | 11 | # NOTE: This heuristic is unused and experimental for now. 
class Graph(MetadataHeuristic):
    """Experimental heuristic that crawls a repository's neighbourhood and renders it.

    Starting from the target repository, ``run`` walks repositories level by
    level for at most ``MAX_ITERATIONS`` levels. From each repository it
    follows the parent (when the repository is a fork) and the owned
    repositories of every stargazer and fork owner. The relationships found
    are stored in a NetworkX graph and written as a pyvis HTML page to
    ``OUTPUT_PATH``.
    """

    # Maximum number of levels explored from the target repository.
    MAX_ITERATIONS = 3
    # Location of the generated HTML visualisation.
    OUTPUT_PATH = '/tmp/graph.html'

    def id(self) -> str:
        return 'repo.graph'

    def friendly_name(self) -> str:
        # Placeholder value kept as-is while the heuristic is experimental.
        return "Test"

    def description(self) -> str:
        # Placeholder value kept as-is while the heuristic is experimental.
        return "TODO"

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    @staticmethod
    def _add_relation(graph, user_login: str, repo_full_name: str, relation: str) -> None:
        """Connect a user node to a repository node without losing earlier relations.

        ``nx.Graph`` keeps a single edge per pair of nodes, so adding a second
        relation between the same user and repository (for instance a user who
        both starred and forked it) would overwrite the ``type`` of the first
        edge. Relations are therefore accumulated, comma-separated, in the
        edge's ``type`` attribute. The user node is always tagged with
        ``type='user'``, including owners and fork owners.
        """
        graph.add_node(user_login, type='user')
        previous = graph.get_edge_data(user_login, repo_full_name, default={}).get('type')
        if previous:
            relations = previous.split(', ')
            if relation not in relations:
                relations.append(relation)
            relation = ', '.join(relations)
        graph.add_edge(user_login, repo_full_name, type=relation)

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Crawl the graph around the target repository and write it to ``OUTPUT_PATH``.

        :param github_client: client used for every GitHub API call.
        :param target_spec: target whose ``repo_full_name()`` names the starting repository.
        :return: always ``HeuristicRunResult.PASSED()``; the useful output is the HTML file.
        :raises github.GithubException: for any API error other than a blocked
            repository (403/451 "Repository access blocked") or a missing user (404).
        """
        root_repo = github_client.get_repo(full_name_or_id=target_spec.repo_full_name())
        current_iteration = 1
        current_queue = [root_repo]
        visited_repos = set()
        visited_users = set()
        graph = nx.Graph()
        while current_iteration <= self.MAX_ITERATIONS and current_queue:
            logger.debug("Starting iteration %d with %d repositories in the queue", current_iteration,
                         len(current_queue))
            next_queue = []
            for repo in current_queue:
                if repo.full_name in visited_repos:
                    continue
                visited_repos.add(repo.full_name)
                logger.debug("Processing repository %s at iteration depth %d", repo.full_name, current_iteration)

                graph.add_node(repo.full_name, type='repository')
                self._add_relation(graph, repo.owner.login, repo.full_name, 'owns')

                try:
                    all_stargazers = repo.get_stargazers_with_dates()
                    all_forks = repo.get_forks()
                    # Keep the totalCount reads inside the try block so that an
                    # API error raised while evaluating them is handled below.
                    logger.debug("Found %d stargazers and %d forks for repository %s", all_stargazers.totalCount,
                                 all_forks.totalCount, repo.full_name)
                except github.GithubException as e:
                    # Sample error when a repository is still up but has been restricted access to:
                    # github.GithubException.GithubException: Repository access blocked: 403 {"message": "Repository access blocked", "block": {"reason": "tos", "created_at": "2025-06-23T08:05:48Z", "html_url": "https://github.com/tos"}}
                    if e.status in [403, 451] and e.message == 'Repository access blocked':
                        logger.warning(
                            "Repository %s cannot be accessed, most likely disabled due to a breach of ToS. Ignoring",
                            repo.full_name)
                        continue
                    raise

                # If the current repository is a fork, make sure to visit the parent repository as well
                if repo.fork and repo.parent and repo.parent.full_name not in visited_repos:
                    logger.debug("Repository %s is a fork, adding parent repository %s to the queue", repo.full_name,
                                 repo.parent.full_name)
                    next_queue.append(repo.parent)

                users_to_visit_in_next_iteration = set()
                for stargazer in all_stargazers:
                    self._add_relation(graph, stargazer.user.login, repo.full_name, 'stars')
                    # Visit every user who starred the repo
                    users_to_visit_in_next_iteration.add(stargazer.user.login)

                for fork in all_forks:
                    self._add_relation(graph, fork.owner.login, repo.full_name, 'forks')
                    # Visit every user who forked the repo
                    users_to_visit_in_next_iteration.add(fork.owner.login)

                for user_to_visit in users_to_visit_in_next_iteration:
                    if user_to_visit in visited_users:
                        continue
                    visited_users.add(user_to_visit)
                    try:
                        user_repos = github_client.get_user(login=user_to_visit).get_repos(type='owner')
                        repos_to_visit = [r for r in user_repos if r.full_name not in visited_repos]
                        logger.debug("Will visit user %s (%d unvisited repos)", user_to_visit, len(repos_to_visit))
                        next_queue.extend(repos_to_visit)
                    except github.GithubException as e:
                        if e.status != 404:
                            raise
                        logger.warning("User %s not found (probably taken down), skipping", user_to_visit)
            current_queue = next_queue
            current_iteration += 1

        nt = Network('1000px', '100%')
        nt.from_nx(graph)
        # Show the relation type as the visible label of each edge.
        for edge in nt.edges:
            if 'type' in edge:
                edge['label'] = edge['type']
        nt.show_buttons()
        nt.write_html(self.OUTPUT_PATH, open_browser=True)

        return HeuristicRunResult.PASSED()
-------------------------------------------------------------------------------- /ghbuster/output_formatter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import List 3 | 4 | from . import TargetSpec 5 | from .heuristics.base import HeuristicRunResult 6 | 7 | 8 | class Color: 9 | """ANSI color codes for terminal output""" 10 | RED = '\033[91m' 11 | GREEN = '\033[92m' 12 | YELLOW = '\033[93m' 13 | BLUE = '\033[94m' 14 | MAGENTA = '\033[95m' 15 | CYAN = '\033[96m' 16 | WHITE = '\033[97m' 17 | BOLD = '\033[1m' 18 | UNDERLINE = '\033[4m' 19 | END = '\033[0m' 20 | 21 | @staticmethod 22 | def disable_if_not_tty(): 23 | """Disable colors if output is not a TTY (e.g., piped to file)""" 24 | if not sys.stdout.isatty(): 25 | Color.RED = Color.GREEN = Color.YELLOW = Color.BLUE = '' 26 | Color.MAGENTA = Color.CYAN = Color.WHITE = Color.BOLD = '' 27 | Color.UNDERLINE = Color.END = '' 28 | 29 | 30 | class OutputFormatter: 31 | """Rich output formatter for heuristic scan results""" 32 | 33 | def __init__(self, disable_colors: bool = False): 34 | if disable_colors: 35 | Color.disable_if_not_tty() 36 | 37 | def format_results(self, target_spec: TargetSpec, results: List[HeuristicRunResult]) -> str: 38 | """Format all heuristic results into a nice report""" 39 | Color.disable_if_not_tty() 40 | 41 | output = [] 42 | 43 | # Header 44 | output.append(self._create_header(target_spec)) 45 | output.append("") 46 | 47 | # Separate passed and failed results 48 | failed_results = [r for r in results if r.triggered] 49 | passed_results = [r for r in results if not r.triggered] 50 | 51 | # Failed heuristics section (show first if any) 52 | if failed_results: 53 | output.append(self._create_failed_section(failed_results)) 54 | output.append("") 55 | 56 | # Passed heuristics section 57 | if passed_results: 58 | output.append(self._create_passed_section(passed_results)) 59 | output.append("") 60 | 61 | # Summary 62 | 
output.append(self._create_summary(len(failed_results), len(passed_results))) 63 | 64 | return "\n".join(output) 65 | 66 | def _create_header(self, target_spec: TargetSpec) -> str: 67 | """Create formatted header section""" 68 | title = f"🔍 ghbuster scan results" 69 | target_info = f"Target: {target_spec}" 70 | 71 | border = "=" * max(len(title), len(target_info)) 72 | 73 | return f"{Color.BOLD}{Color.CYAN}{border}{Color.END}\n" \ 74 | f"{Color.BOLD}{Color.WHITE}{title}{Color.END}\n" \ 75 | f"{Color.CYAN}{target_info}{Color.END}\n" \ 76 | f"{Color.CYAN}{border}{Color.END}" 77 | 78 | def _create_failed_section(self, failed_results: List[HeuristicRunResult]) -> str: 79 | """Create section for failed heuristics""" 80 | lines = [] 81 | 82 | # Section header 83 | count = len(failed_results) 84 | lines.append(f"{Color.BOLD}{Color.RED}🚨 {count} heuristics triggered{Color.END}") 85 | lines.append("") 86 | 87 | # Individual failed heuristics 88 | for i, result in enumerate(failed_results, 1): 89 | lines.append(self._format_failed_heuristic(i, result)) 90 | if i < len(failed_results): 91 | lines.append("") # Add spacing between heuristics 92 | 93 | return "\n".join(lines) 94 | 95 | def _create_passed_section(self, passed_results: List[HeuristicRunResult]) -> str: 96 | """Create section for passed heuristics""" 97 | lines = [] 98 | 99 | # Section header 100 | lines.append(f"{Color.BOLD}{Color.GREEN}Non-triggered heuristics ({len(passed_results)}){Color.END}") 101 | lines.append("") 102 | 103 | # Show passed heuristics in a compact format 104 | for result in passed_results: 105 | heuristic_name = result.heuristic.friendly_name() 106 | lines.append(f" {Color.GREEN}✅{Color.END} {heuristic_name}") 107 | 108 | return "\n".join(lines) 109 | 110 | def _format_failed_heuristic(self, index: int, result: HeuristicRunResult) -> str: 111 | """Format a single failed heuristic with details""" 112 | heuristic_name = result.heuristic.friendly_name() 113 | 114 | lines = [] 115 | 
lines.append(f"{Color.BOLD}{Color.RED}❌ {index}. {heuristic_name}{Color.END}") 116 | 117 | description = result.heuristic.description() 118 | lines.append(f" {Color.YELLOW}📋 Description:{Color.END} {description}") 119 | 120 | # Additional details 121 | if result.additional_details: 122 | lines.append(f" {Color.CYAN}🔍 Details:{Color.END} {result.additional_details}") 123 | 124 | return "\n".join(lines) 125 | 126 | def _create_summary(self, failed_count: int, passed_count: int) -> str: 127 | """Create summary section""" 128 | total = failed_count + passed_count 129 | 130 | lines = [] 131 | lines.append(f"{Color.BOLD}{Color.CYAN}📊 SCAN SUMMARY{Color.END}") 132 | lines.append("─" * 40) 133 | lines.append(f"Total Heuristics Run: {Color.BOLD}{total}{Color.END}") 134 | lines.append( 135 | f"Heuristics triggered: {Color.BOLD}{Color.RED if failed_count > 0 else Color.GREEN}{failed_count}{Color.END}") 136 | 137 | return "\n".join(lines) 138 | 139 | def _camel_to_title(self, camel_str: str) -> str: 140 | """Convert CamelCase to Title Case with spaces""" 141 | import re 142 | # Insert space before uppercase letters (except first) 143 | spaced = re.sub(r'(? 
set[EmailResult]: 43 | emails = set[EmailResult]() 44 | 45 | if self.target_spec.target_type == TargetType.REPOSITORY: 46 | repo = self.github_client.get_repo(self.target_spec.repo_full_name()) 47 | emails = self._find_emails_from_repository(repo) 48 | elif self.target_spec.target_type == TargetType.USER: 49 | user = self.github_client.get_user(self.target_spec.username) 50 | repos = user.get_repos(type='owner') 51 | logger.debug("Found %d GitHub repositories for user %s", repos.totalCount, self.target_spec.username) 52 | for repo in repos: 53 | if repo.fork and not self.include_forks: 54 | logger.debug("Skipping forked repository %s", repo.full_name) 55 | continue 56 | repo_emails = self._find_emails_from_repository(repo) 57 | emails.update(repo_emails) 58 | 59 | return emails 60 | 61 | def _find_emails_from_repository(self, repository: github.Repository) -> set[EmailResult]: 62 | logger.debug("Identifying emails from repository %s", repository.full_name) 63 | emails = set[EmailResult]() 64 | branches = repository.get_branches() 65 | logger.debug("Found %d branches in repository %s", branches.totalCount, repository.full_name) 66 | num_commits_processed = 0 67 | for branch in branches: 68 | commits = repository.get_commits(sha=branch.name) 69 | logger.debug("Processing branch '%s' with %d commits", branch.name, commits.totalCount) 70 | for commit in commits: 71 | if num_commits_processed > self.max_commits_to_analyze_per_repo: 72 | logger.debug("Reached max processing limit of %d commits per repo for %s, going to the next one", 73 | self.max_commits_to_analyze_per_repo, repository.full_name) 74 | return emails 75 | num_commits_processed += 1 76 | is_commit_linked_to_user: bool 77 | if commit.author is None or self.commit_linked_to_taken_down_user(commit.author): 78 | is_commit_linked_to_user = False 79 | if not self.include_unlinked_emails: 80 | logger.debug("Skipping commit %s (no Git authorship information)", commit.sha[:6]) 81 | continue 82 | else: 83 | 
is_commit_linked_to_user = True 84 | if not self.is_commit_by_current_user(commit) and not self.include_emails_linked_to_other_users: 85 | logger.debug("Skipping commit %s by Git author %s (linked to another user %s)", commit.sha[:6], 86 | commit.commit.author.email, commit.author.login) 87 | continue 88 | 89 | emails.add( 90 | EmailResult(email=commit.commit.author.email.lower(), is_linked_to_user=is_commit_linked_to_user)) 91 | 92 | return emails 93 | 94 | def is_commit_by_current_user(self, commit: Commit) -> bool: 95 | # Note: we need to compare author user ID and not only usernames, because sometimes users get renamed 96 | current_user = self.github_client.get_user(self.target_spec.username) 97 | return commit.author.id == current_user.id 98 | 99 | @functools.cache 100 | def commit_linked_to_taken_down_user(self, author: NamedUser) -> bool: 101 | # We know the commit is linked to a specific GitHub user, i.e. the git metadata email was linked to a specific user at the time of the commit 102 | # In some cases, the associated user doesn't exist anymore (e.g. taken down), so we consider the email as "currently unlinked" in this case 103 | # (like we'd see in the GitHub UI that the username is not clickable) 104 | try: 105 | self.github_client.get_user_by_id(author.id) 106 | return False # no exception, the user exists 107 | except github.GithubException as e: 108 | if e.status == 404: 109 | return True 110 | raise e 111 | 112 | 113 | """ 114 | NOTE: The version below doesn't work because the GitHub API doesn't support searching with user:x directly, or it returns "The search contains only logical operators (AND / OR / NOT) without any search terms", most likely to protect against abuse. 
115 | 116 | def find_emails(self) -> set[str]: 117 | emails = set() 118 | search_query = "" 119 | if self.target_spec.target_type == TargetType.REPOSITORY: 120 | search_query = f'repo:{self.target_spec.repo_full_name()}' 121 | elif self.target_spec.target_type == TargetType.USER: 122 | search_query = f'user:{self.target_spec.username}' 123 | 124 | commits = self.github_client.search_commits(search_query) 125 | print("Search got back {} commits".format(commits.totalCount)) 126 | for commit in commits: 127 | print(commit.sha) 128 | # If we only want "linked emails", we keep only commits where the committer's email has been linked to the user profile 129 | if self.only_linked_emails and ( 130 | commit.author is None or commit.author.login is None or commit.author.login != self.target_spec.username): 131 | continue 132 | else: 133 | email = commit.commit.author.email.lower() 134 | if email not in emails: 135 | emails.add(email) 136 | return emails 137 | """ 138 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------