├── scripts ├── __init__.py └── generate_heuristics_docs.py ├── ghbuster ├── service │ ├── __init__.py │ ├── github_archive.py │ └── emails_extractor.py ├── heuristics │ ├── __init__.py │ ├── user_has_only_forks.py │ ├── user_has_only_commits_from_unlinked_emails.py │ ├── base.py │ ├── user_looks_legit.py │ ├── user_metadata_basic.py │ ├── repo_has_stargazzers_who_joined_the_same_day.py │ ├── user_has_forks_from_taken_down_repos.py │ ├── user_has_low_community_activity.py │ ├── repo_commits_only_from_suspicious_unlinked_emails.py │ ├── repo_starred_by_suspicious_users.py │ └── graph.py ├── __init__.py ├── github_repo_scanner.py ├── __main__.py ├── cli.py └── output_formatter.py ├── tests ├── test_utils │ ├── __init__.py │ ├── mock_utils.py │ └── date_utils.py └── heuristics │ ├── test_user_missing_common_fields.py │ ├── test_user_just_joined.py │ ├── test_user_has_only_forks.py │ ├── test_user_has_low_community_activity.py │ ├── test_repo_has_stargazzers_who_joined_the_same_day.py │ └── test_user_has_forks_from_taken_down_repos.py ├── CODEOWNERS ├── screenshot.png ├── .gitignore ├── pyproject.toml ├── README.md └── LICENSE /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ghbuster/service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @christophetd 2 | -------------------------------------------------------------------------------- /screenshot.png: 
def mock_pygithub_list(items: list) -> MagicMock:
    """Build a MagicMock that stands in for a PyGithub paginated list.

    The mock supports iteration, ``len()`` and the ``totalCount`` attribute.

    :param items: The elements the fake list should contain.
    :return: A MagicMock that can be iterated any number of times.
    """
    snapshot = list(items)
    mock_list = MagicMock()
    # Hand MagicMock a real list rather than a one-shot iterator: an iterator
    # would be exhausted after the first loop and every later iteration would
    # silently yield nothing.
    mock_list.__iter__.return_value = snapshot
    mock_list.__len__.return_value = len(snapshot)
    mock_list.totalCount = len(snapshot)
    return mock_list
class TargetType(enum.Enum):
    """Kinds of GitHub entities that can be scanned."""
    REPOSITORY = "repository"
    USER = "user"


class TargetSpec:
    """Describes a scan target: a GitHub user, or a repository owned by a user."""

    target_type: TargetType
    username: str = None
    repo_name: str = None

    def __init__(self, target_type: TargetType, username: str = None, repo_name: str = None):
        self.target_type = target_type
        self.username = username
        self.repo_name = repo_name

    def repo_full_name(self) -> str:
        """Return ``"<username>/<repo_name>"`` for a repository target.

        :raises ValueError: if the target is not a repository, or the username
            or repository name is missing.
        """
        if self.target_type == TargetType.REPOSITORY and self.username and self.repo_name:
            return f"{self.username}/{self.repo_name}"
        raise ValueError("Target is not a repository or missing username/repo_name")

    def __repr__(self) -> str:
        """Return a human-readable description; never raises and always returns a string."""
        # Only delegate to repo_full_name() when it cannot raise: repr() is used
        # in log statements and must not blow up on an incomplete spec.
        if self.target_type == TargetType.REPOSITORY and self.username and self.repo_name:
            return f"GitHub repository {self.repo_full_name()}"
        if self.target_type == TargetType.USER:
            return f"GitHub user {self.username}"
        # __repr__ must return a str; returning None makes repr() raise TypeError.
        return (f"TargetSpec(target_type={self.target_type!r}, "
                f"username={self.username!r}, repo_name={self.repo_name!r})")
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Runs UserMissingCommonFields against mocked GitHub user profiles.

    NOTE(review): the class name does not match the heuristic under test
    (UserMissingCommonFields); it is kept unchanged here.
    """

    def setUp(self):
        self.heuristic = UserMissingCommonFields()

    @staticmethod
    def _profile(name):
        """Return a mocked user with no bio, company or location, and the given name."""
        ghuser = MagicMock(NamedUser)
        ghuser.bio = ghuser.company = ghuser.location = None
        ghuser.name = name
        return ghuser

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_positive(self, gh):
        # None of the common fields is set, so the heuristic must trigger.
        gh.get_user.return_value = self._profile(None)
        spec = TargetSpec(target_type=TargetType.USER, username="user")

        outcome = self.heuristic.run(gh, spec)
        self.assertTrue(outcome.triggered)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_negative(self, gh):
        # A single populated field (the name) is enough to pass.
        gh.get_user.return_value = self._profile('John Doe')
        spec = TargetSpec(target_type=TargetType.USER, username="user")

        outcome = self.heuristic.run(gh, spec)
        self.assertFalse(outcome.triggered)
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Checks UserJustJoinedHeuristic on either side of its THRESHOLD_DAYS cut-off.

    NOTE(review): the class name does not match the heuristic under test
    (UserJustJoinedHeuristic); it is kept unchanged here.
    """

    def setUp(self):
        self.heuristic = UserJustJoinedHeuristic()

    @staticmethod
    def _user_created_days_ago(days):
        """Return a mocked user whose account was created `days` days before now (UTC)."""
        account = Mock(NamedUser)
        account.created_at = datetime.now(timezone.utc) - timedelta(days=days)
        return account

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_positive(self, gh):
        # One day younger than the threshold: the heuristic must trigger.
        gh.get_user.return_value = self._user_created_days_ago(UserJustJoinedHeuristic.THRESHOLD_DAYS - 1)
        spec = TargetSpec(target_type=TargetType.USER, username="newuser")

        outcome = self.heuristic.run(gh, spec)
        self.assertTrue(outcome.triggered)

    @patch('ghbuster.heuristics.user_metadata_basic.github.Github')
    def test_negative(self, gh):
        # One day older than the threshold: the heuristic must pass.
        gh.get_user.return_value = self._user_created_days_ago(UserJustJoinedHeuristic.THRESHOLD_DAYS + 1)
        spec = TargetSpec(target_type=TargetType.USER, username="olduser")

        outcome = self.heuristic.run(gh, spec)
        self.assertFalse(outcome.triggered)
class UserHasOnlyCommitsFromUnlinkedEmails(MetadataHeuristic):
    """Flags users for whom none of the extracted commit emails is linked to their own account."""

    def id(self) -> str:
        return 'user.commits_unlinked_emails'

    def friendly_name(self) -> str:
        return "User has only commits from unlinked emails"

    def description(self) -> str:
        return "Detects when all of a user's commits are from emails not linked to their GitHub profiles. This may indicate a threat actor leveraging distinct inauthentic accounts."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Trigger when commit emails exist and none of them is linked to the user.

        :param github_client: An authenticated GitHub client.
        :param target_spec: The user to analyze.
        :return: TRIGGERED with the list of unlinked addresses, otherwise PASSED.
        """
        extractor = GitHubCommitEmailExtractor(github_client, target_spec, include_forks=False,
                                               include_unlinked_emails=True,
                                               include_emails_linked_to_other_users=False)
        # Materialize once: the result is inspected several times below and may
        # be a one-shot iterable.
        emails = list(extractor.find_emails())

        # Without any commit email there is nothing to judge. Triggering here
        # would be a vacuous match, and would disagree with how
        # UserHasOnlyForkedRepos treats a user without repositories.
        if not emails:
            return HeuristicRunResult.PASSED()

        if any(email.is_linked_to_user for email in emails):
            return HeuristicRunResult.PASSED()

        email_addresses = [email.email for email in emails]
        return HeuristicRunResult.TRIGGERED(
            f"The user {target_spec.username} has only commits from unlinked emails: '{', '.join(email_addresses)}'.")
class GitHubArchive:
    """Small client that runs SQL queries against the HTTP endpoint at play.clickhouse.com."""

    def __init__(self):
        self.session = requests.Session()
        self.session.headers['User-Agent'] = 'ghbuster'

    @staticmethod
    def _parse_created_at(raw_value):
        """Return a datetime for a non-empty ISO-format string, or None when the value is absent.

        ``datetime.fromisoformat('')`` raises ValueError, so a missing or empty
        ``created_at`` column must be handled before parsing.
        """
        if not raw_value:
            return None
        return datetime.datetime.fromisoformat(raw_value)

    def query(self, query: str) -> Iterable[GitHubEvent]:
        """Execute `query` and convert every returned row into a GitHubEvent.

        :param query: The query text, sent as the POST body.
        :return: One GitHubEvent per row; fields missing from a row are None.
        :raises requests.HTTPError: if the endpoint answers with an error status.
        """
        url = "https://play.clickhouse.com/"
        url_params = {
            'user': 'explorer',
            'default_format': 'JSONStrings',
        }
        response = self.session.post(url, params=url_params, data=query)
        response.raise_for_status()
        data = response.json().get('data', [])
        logger.debug("Query executed successfully, received %d rows", len(data))
        return [
            GitHubEvent(
                event_type=row.get('event_type', None),
                actor_login=row.get('actor_login', None),
                repo_name=row.get('repo_name', None),
                created_at=self._parse_created_at(row.get('created_at')),
                _raw=row,
            )
            for row in data
        ]
class HeuristicRunResult:
    """Outcome of running a single heuristic against a target.

    Attributes are plain fields: ``triggered`` (the heuristic fired),
    ``additional_details`` (free-form explanation), ``heuristic`` (the
    heuristic that produced the result; it is not set by the factory methods
    below) and ``skipped`` (the heuristic did not reach a verdict).
    """

    def __init__(self, triggered: bool, additional_details: str = "", heuristic: 'MetadataHeuristic' = None,
                 skipped: bool = False):
        self.triggered = triggered
        self.additional_details = additional_details
        self.heuristic = heuristic
        self.skipped = skipped

    @staticmethod
    def TRIGGERED(additional_details: str = "") -> 'HeuristicRunResult':
        """Build a result for a heuristic that fired."""
        return HeuristicRunResult(triggered=True, additional_details=additional_details)

    @staticmethod
    def PASSED(additional_details: str = "") -> 'HeuristicRunResult':
        """Build a result for a heuristic that ran and did not fire."""
        # Forward the details instead of silently dropping the argument.
        return HeuristicRunResult(triggered=False, additional_details=additional_details)

    @staticmethod
    def SKIPPED() -> 'HeuristicRunResult':
        """Build a result for a heuristic that was skipped."""
        return HeuristicRunResult(triggered=False, skipped=True)
44 | """ 45 | pass 46 | 47 | @abstractmethod 48 | def id(self) -> str: 49 | pass 50 | 51 | @abstractmethod 52 | def friendly_name(self) -> str: 53 | pass 54 | 55 | @abstractmethod 56 | def description(self) -> str: 57 | pass 58 | -------------------------------------------------------------------------------- /tests/heuristics/test_user_has_only_forks.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | 4 | from github import NamedUser 5 | 6 | from ghbuster import TargetSpec, TargetType 7 | from ghbuster.heuristics.user_has_only_forks import UserHasOnlyForkedRepos 8 | from tests.test_utils.mock_utils import mock_pygithub_list 9 | 10 | 11 | class TestUserHasOnlyForks(unittest.TestCase): 12 | def setUp(self): 13 | self.heuristic = UserHasOnlyForkedRepos() 14 | 15 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 16 | def test_positive(self, gh): 17 | target_spec = TargetSpec(target_type=TargetType.USER, username="foo") 18 | user = Mock(NamedUser) 19 | user.get_repos = Mock(return_value=mock_pygithub_list([ 20 | Mock(fork=True, full_name="foo/repo1"), 21 | Mock(fork=True, full_name="foo/repo2") 22 | ])) 23 | gh.get_user.return_value = user 24 | 25 | result = self.heuristic.run(gh, target_spec) 26 | self.assertTrue(result.triggered) 27 | 28 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 29 | def test_negative(self, gh): 30 | target_spec = TargetSpec(target_type=TargetType.USER, username="foo") 31 | user = Mock(NamedUser) 32 | user.get_repos = Mock(return_value=mock_pygithub_list([ 33 | Mock(fork=False, full_name="foo/repo1"), 34 | Mock(fork=True, full_name="foo/repo2") 35 | ])) 36 | gh.get_user.return_value = user 37 | 38 | result = self.heuristic.run(gh, target_spec) 39 | self.assertFalse(result.triggered) 40 | 41 | @patch('ghbuster.heuristics.user_has_only_forks.github.Github') 42 | def test_negative_empty(self, gh): 43 | target_spec = 
class UserLooksLegit(MetadataHeuristic):
    """Strong signal that a user is authentic, used to avoid running extraneous heuristics on them."""

    # Every threshold is a strict lower bound (the value must be exceeded).
    MIN_PUBLIC_REPOS = 10
    MIN_ACCOUNT_AGE_DAYS = 365
    MIN_FOLLOWERS = 10
    MIN_FOLLOWING = 10

    def id(self) -> str:
        return 'user.looks_legit'

    def friendly_name(self) -> str:
        return "User is likely legitimate"

    def description(self) -> str:
        return "The user is likely legitimate."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Return TRIGGERED when the profile clears every legitimacy threshold, SKIPPED otherwise.

        :param github_client: An authenticated GitHub client.
        :param target_spec: The user to analyze.
        """
        user = github_client.get_user(login=target_spec.username)

        joined_days_ago = (datetime.now(timezone.utc) - user.created_at).days
        has_profile_details = (
                user.company is not None or user.location is not None or user.bio is not None
        )
        # The former trailing "public_repos > 5" clause was dead code: it is
        # implied by the stricter public-repo threshold checked first.
        likely_legit = (
                user.public_repos > self.MIN_PUBLIC_REPOS and
                joined_days_ago > self.MIN_ACCOUNT_AGE_DAYS and
                user.followers > self.MIN_FOLLOWERS and
                user.following > self.MIN_FOLLOWING and
                user.name is not None and
                has_profile_details
        )
        if not likely_legit:
            return HeuristicRunResult.SKIPPED()

        additional_details = (
            "\n"
            f"- The user has {user.public_repos} public repos\n"
            f"- The user has {user.followers} followers, and is following {user.following} users.\n"
            f"- The user joined {joined_days_ago} days ago.\n"
            f"- The user has a name set on their profile ({user.name})\n"
            f"- The user has the usual fields set on their profile.\n"
        )
        return HeuristicRunResult.TRIGGERED(additional_details=additional_details)
class GitHubScanner:
    """Runs the heuristics matching a target's type and collects their results."""

    def __init__(self, target_spec: TargetSpec, github_client: github.Github, heuristics: list[MetadataHeuristic]):
        self.target_spec = target_spec
        self.github_client = github_client
        self.heuristics = heuristics

    @staticmethod
    def _error_message(error: github.GithubException) -> str:
        """Extract a readable message from a GitHub API error without raising."""
        payload = error.data
        # The error payload is not guaranteed to be a dict carrying 'message';
        # indexing it blindly could mask the real failure with a KeyError or
        # TypeError.
        if isinstance(payload, dict) and payload.get('message'):
            return payload['message']
        return str(error)

    def ensure_authenticated(self):
        """Check that the client can read the authenticated user.

        :raises ValueError: if the GitHub API rejects the request.
        """
        try:
            current_user = self.github_client.get_user()
            print(f"Authenticated as {current_user.login}")
        except github.GithubException as e:
            raise ValueError(
                f"Authentication failed. Please check your GitHub token (status code {e.status})") from e

    def validate_target_spec(self):
        """Check that the target user or repository can be fetched.

        :raises ValueError: if the target cannot be fetched or its type is unsupported.
        """
        target_type = self.target_spec.target_type
        if target_type == TargetType.REPOSITORY:
            full_name = f"{self.target_spec.username}/{self.target_spec.repo_name}"
            try:
                self.github_client.get_repo(full_name)
            except github.GithubException as e:
                raise ValueError(f"Invalid repository '{full_name}': {self._error_message(e)}") from e
        elif target_type == TargetType.USER:
            try:
                self.github_client.get_user(self.target_spec.username)
            except github.GithubException as e:
                raise ValueError(
                    f"Invalid user '{self.target_spec.username}': {self._error_message(e)}") from e
        else:
            raise ValueError("Unsupported target type")

    def scan(self) -> list[HeuristicRunResult]:
        """Run every heuristic whose target type matches and return the results in order."""
        results = []
        for heuristic in self.heuristics:
            if heuristic.target_type() != self.target_spec.target_type:
                continue

            logger.debug("Running heuristic %s on %s", heuristic.id(), self.target_spec)
            result = heuristic.run(self.github_client, self.target_spec)
            # Attach the producer so downstream consumers can tell which heuristic made the result.
            result.heuristic = heuristic
            results.append(result)
        return results
def _heuristics_table(heading: str, wanted_type, ordered_heuristics) -> str:
    """Render one markdown section: the heading, a table header, and a row per matching heuristic."""
    chunks = [
        heading,
        '| **ID** | **Name** | **Description** |\n',
        '|:-:|:-:|:-:|\n',
    ]
    for entry in ordered_heuristics:
        if entry.target_type() != wanted_type:
            continue
        # Link each ID to the source file of the module defining the heuristic.
        source_path = entry.__module__.replace('.', '/') + '.py'
        # Newlines would break the markdown table row.
        summary = entry.description().replace("\n", "")
        chunks.append(f'| [{entry.id()}](./{source_path}) | {entry.friendly_name()} | {summary} |\n')
    chunks.append("\n\n")
    return "".join(chunks)


def generate_docs() -> str:
    """Return markdown tables listing repository heuristics, then user heuristics, sorted by ID."""
    ordered = sorted(ALL_HEURISTICS, key=lambda h: h.id())
    repository_section = _heuristics_table("### Repository heuristics\n\n", TargetType.REPOSITORY, ordered)
    user_section = _heuristics_table("### GitHub user heuristics\n\n", TargetType.USER, ordered)
    return repository_section + user_section
def resolve_heuristics(included_heuristics: set[str], excluded_heuristics: set[str]) -> list[MetadataHeuristic]:
    """Select the heuristics to run from ALL_HEURISTICS.

    :param included_heuristics: If non-empty, only heuristics with these IDs are
        selected; this takes precedence over the exclusion set.
    :param excluded_heuristics: If non-empty (and no inclusion set is given),
        heuristics with these IDs are left out.
    :return: The selected heuristics; all of them when both sets are empty.
    """
    if included_heuristics:
        return [heuristic for heuristic in ALL_HEURISTICS if heuristic.id() in included_heuristics]
    if excluded_heuristics:
        # Test against the exclusion set. The earlier code compared against the
        # (empty) inclusion set here, so exclusions were never applied.
        return [heuristic for heuristic in ALL_HEURISTICS if heuristic.id() not in excluded_heuristics]
    return list(ALL_HEURISTICS)
class UserJustJoinedHeuristic(MetadataHeuristic):
    """Triggers for accounts created fewer than THRESHOLD_DAYS whole days ago."""

    THRESHOLD_DAYS = 7

    def id(self) -> str:
        return 'user.just_joined'

    def friendly_name(self) -> str:
        return "User recently joined GitHub"

    def description(self) -> str:
        return f"The GitHub user joined the platform less than {self.THRESHOLD_DAYS} days ago."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Compare the account age in whole days against THRESHOLD_DAYS.

        :param github_client: An authenticated GitHub client.
        :param target_spec: The user to analyze.
        :return: TRIGGERED for accounts younger than the threshold, PASSED
            otherwise or when no creation date is available.
        """
        account = github_client.get_user(login=target_spec.username)
        created = account.created_at
        if created is None:
            return HeuristicRunResult.PASSED()

        age_in_days = (datetime.now(timezone.utc) - created).days
        if age_in_days < self.THRESHOLD_DAYS:
            joined_on = created.strftime('%Y-%m-%d')
            details = f"User {target_spec.username} joined GitHub on {joined_on} ({age_in_days} days ago)."
            return HeuristicRunResult.TRIGGERED(additional_details=details)

        return HeuristicRunResult.PASSED()
# e.g. https://github.com/heidarodeer/crypto-clipper/stargazers
class RepoHasStargazersWhoJoinedOnTheSameDay(MetadataHeuristic):
    """Triggers when a large share of the analyzed stargazers created their account on one day."""

    THRESHOLD_PERCENT = 50  # minimum share of analyzed stargazers sharing a join day
    MIN_STARGAZERS = 2  # below this, the repository is not analyzed
    MAX_STARGAZERS = 100  # at most this many stargazers are inspected

    def id(self) -> str:
        return 'repo.stargazers_joined_same_day'

    def friendly_name(self) -> str:
        return "Repository has stargazers who joined the same day"

    def description(self) -> str:
        return "Detects when a repository has a large proportion of its stargazers who joined GitHub on the same day, which may indicate a coordinated effort to boost the repository's popularity."

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Group the analyzed stargazers by account-creation day and compare against the threshold.

        :param github_client: An authenticated GitHub client.
        :param target_spec: The repository to analyze.
        :return: TRIGGERED when at least THRESHOLD_PERCENT of the analyzed
            stargazers joined on the same day, PASSED otherwise.
        """
        repo = github_client.get_repo(full_name_or_id=target_spec.repo_full_name())
        all_stargazers = repo.get_stargazers()
        total_stargazers = all_stargazers.totalCount

        if total_stargazers < self.MIN_STARGAZERS:
            logger.debug("Repository %s has too few stargazers (%d) to analyze.", target_spec.repo_full_name(),
                         total_stargazers)
            return HeuristicRunResult.PASSED()

        if total_stargazers > self.MAX_STARGAZERS:
            logger.debug("Repository %s has too many stargazers (%d) to analyze, limiting to %d.",
                         target_spec.repo_full_name(), total_stargazers, self.MAX_STARGAZERS)
            all_stargazers = all_stargazers[:self.MAX_STARGAZERS]

        # Materialize the (possibly sliced) sequence: a slice does not expose
        # totalCount, and the percentage below must be relative to the number
        # of stargazers actually inspected rather than the repository total.
        stargazers = list(all_stargazers)
        analyzed_count = len(stargazers)
        if analyzed_count == 0:
            return HeuristicRunResult.PASSED()

        logger.info("Analyzing the creation date of %d stargazers", analyzed_count)
        stargazers_by_join_day = {}
        for stargazer in stargazers:
            user_joined_day = stargazer.created_at.strftime("%Y-%m-%d")
            stargazers_by_join_day[user_joined_day] = stargazers_by_join_day.get(user_joined_day, 0) + 1

        # Report the first join day whose share reaches the threshold.
        for join_day, joined_count in stargazers_by_join_day.items():
            pct_joined_on_that_day = 100 * joined_count / analyzed_count
            if pct_joined_on_that_day >= self.THRESHOLD_PERCENT:
                additional_details = (
                    f"Repository {target_spec.repo_full_name()} has {joined_count} stargazers "
                    f"({pct_joined_on_that_day} %) who joined on the same day, {join_day}."
                )
                return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Tests for the `UserHasLowCommunityActivity` heuristic."""

    def setUp(self):
        self.heuristic = UserHasLowCommunityActivity()

    def _run_with_counts(self, gh, stars, following, followers, issues_or_prs):
        """Run the heuristic for a mocked user "foo" exposing the given activity counts."""
        target_spec = TargetSpec(target_type=TargetType.USER, username="foo")
        user = Mock(NamedUser)
        user.login = target_spec.username
        user.get_starred = Mock(return_value=Mock(totalCount=stars))
        user.get_following = Mock(return_value=Mock(totalCount=following))
        user.get_followers = Mock(return_value=Mock(totalCount=followers))
        gh.get_user.return_value = user
        gh.search_issues.return_value = Mock(totalCount=issues_or_prs)
        return self.heuristic.run(gh, target_spec)

    @patch('ghbuster.heuristics.user_has_low_community_activity.github.Github')
    def test_positive(self, gh):
        # Every metric is below its threshold: the heuristic must trigger
        result = self._run_with_counts(
            gh,
            stars=UserHasLowCommunityActivity.STARS_THRESHOLD - 1,
            following=UserHasLowCommunityActivity.FOLLOWING_THRESHOLD - 1,
            followers=UserHasLowCommunityActivity.FOLLOWERS_THRESHOLD - 1,
            issues_or_prs=UserHasLowCommunityActivity.ISSUES_OR_PR_THRESHOLD - 1,
        )
        self.assertTrue(result.triggered)

    @patch('ghbuster.heuristics.user_has_low_community_activity.github.Github')
    def test_negative(self, gh):
        # Every metric is above its threshold: the heuristic must pass
        result = self._run_with_counts(
            gh,
            stars=UserHasLowCommunityActivity.STARS_THRESHOLD + 1,
            following=UserHasLowCommunityActivity.FOLLOWING_THRESHOLD + 1,
            followers=UserHasLowCommunityActivity.FOLLOWERS_THRESHOLD + 1,
            issues_or_prs=UserHasLowCommunityActivity.ISSUES_OR_PR_THRESHOLD + 1,
        )
        self.assertFalse(result.triggered)

    @patch('ghbuster.heuristics.user_has_low_community_activity.github.Github')
    def test_negative_single_attribute_ok(self, gh):
        # A single metric above its threshold (stars) is enough for the heuristic to pass
        result = self._run_with_counts(
            gh,
            stars=UserHasLowCommunityActivity.STARS_THRESHOLD + 1,
            following=UserHasLowCommunityActivity.FOLLOWING_THRESHOLD - 1,
            followers=UserHasLowCommunityActivity.FOLLOWERS_THRESHOLD - 1,
            issues_or_prs=UserHasLowCommunityActivity.ISSUES_OR_PR_THRESHOLD - 1,
        )
        self.assertFalse(result.triggered)
def _cli() -> ArgumentParser:
    """Build the ghbuster command-line argument parser."""
    parser = ArgumentParser(
        prog="ghbuster",
        exit_on_error=False,
        description="Identify inauthentic GitHub accounts and repositories",
    )

    parser.add_argument("target", type=str,
                        help="Target GitHub repository or user to scan, e.g., 'owner/repo', `username`, or 'https://github.com/owner/repo'.")
    parser.add_argument("--github-token", type=str,
                        help="GitHub token for authentication. If not provided, the GITHUB_TOKEN environment variable is used",
                        required=False, default=os.environ.get("GITHUB_TOKEN"))
    parser.add_argument("--debug", action="store_true", help="Enable debug logging", dest="enable_debug", default=False)
    parser.add_argument("--include", nargs="+", help="Heuristics to include (any other heuristic will not be ran)",
                        default=[])
    parser.add_argument("--exclude", nargs="+", help="Heuristics to exclude", default=[])
    parser.add_argument("--force", action="store_true", default=False)
    return parser


class CliArguments:
    """Validated command-line options, as produced by `parse_and_validate_args`."""
    target_spec: TargetSpec  # repository or user to scan
    github_token: str  # non-empty token, from --github-token or the GITHUB_TOKEN environment variable
    log_level: int  # logging.DEBUG when --debug is set, logging.INFO otherwise
    excluded_heuristics: set[str]  # values passed to --exclude
    included_heuristics: set[str]  # values passed to --include
    force: bool  # value of the --force flag


def parse_and_validate_args(args) -> CliArguments:
    """Parse raw command-line arguments and validate them.

    Args:
        args: argument strings, handed to ``ArgumentParser.parse_args``.

    Returns:
        A populated `CliArguments` instance.

    Raises:
        ValueError: if the target is not a valid ``owner/repo`` or username, if no
            GitHub token is available, or if both --include and --exclude are given.
    """
    parsed = _cli().parse_args(args)
    cli_args = CliArguments()

    # Determine target type and parse repository or user
    normalized_target = parsed.target.strip().lower()
    github_url_prefix = "https://github.com/"
    if normalized_target.startswith(github_url_prefix):
        normalized_target = normalized_target[len(github_url_prefix):]
    # Tolerate trailing slashes, as in 'https://github.com/owner/repo/' copied from a browser
    normalized_target = normalized_target.rstrip('/')

    if '/' in normalized_target:
        # It's a repository
        parts = normalized_target.split('/')
        # Reject extra path components as well as an empty owner or repository name (e.g. '/repo')
        if len(parts) != 2 or not all(parts):
            raise ValueError("Invalid repository format. Expected 'owner/repo'.")
        owner, repo_name = parts
        cli_args.target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username=owner, repo_name=repo_name)
    else:
        # It's a user
        if not re.match(r'^[a-zA-Z0-9-]+$', normalized_target):
            # "Username may only contain alphanumeric characters or single hyphens, and cannot begin or end with a hyphen." (from the GitHub homepage)
            raise ValueError("Invalid GitHub username format")
        cli_args.target_spec = TargetSpec(target_type=TargetType.USER, username=normalized_target)

    # Github token
    cli_args.github_token = parsed.github_token
    if not cli_args.github_token:
        raise ValueError(
            "GitHub token is required. Please provide it via the --github-token argument or set the GITHUB_TOKEN environment variable.")

    # Log level
    cli_args.log_level = logging.DEBUG if parsed.enable_debug else logging.INFO

    # Heuristics selection
    if parsed.include and parsed.exclude:
        raise ValueError("--include and --exclude are mutually exclusive.")
    cli_args.included_heuristics = set(parsed.include)
    cli_args.excluded_heuristics = set(parsed.exclude)

    cli_args.force = parsed.force
    return cli_args
class UserHasForksFromTakenDownRepos(MetadataHeuristic):
    """Flag users who own forks of repositories that are no longer accessible.

    Example of a matching account: https://github.com/mrrebrik3765
    """

    def __init__(self, max_forks_to_analyze: int = 10):
        super().__init__()
        # Upper bound on the number of forks inspected per run
        self.max_forks_to_analyze = max_forks_to_analyze

    def id(self) -> str:
        return 'user.forks_from_taken_down_repos'

    def friendly_name(self) -> str:
        return "User has forks of taken-down repositories"

    def description(self) -> str:
        return "Detects when a user has forks from repositories that have been taken down. This may indicate that the user is being leveraged as part of a campaign to make inauthentic repositories appear legitimate."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Inspect up to ``max_forks_to_analyze`` forks owned by the user.

        Triggers when the parent of at least one inspected fork does not exist anymore.
        """
        user = github_client.get_user(login=target_spec.username)
        user_repos = user.get_repos(type='owner')
        taken_down_repos = set()
        num_forks_analyzed = 0
        for repo in user_repos:
            if not repo.fork:
                continue
            if num_forks_analyzed >= self.max_forks_to_analyze:
                logger.debug("Reached maximum number of forks to analyze (%d), stopping further checks",
                             self.max_forks_to_analyze)
                break
            num_forks_analyzed += 1
            logger.debug("Analyzing forked repository %s owned by user %s", repo.full_name, target_spec.username)
            try:
                parent = repo.parent
                if parent is None:
                    # No parent information available for this fork, nothing to check
                    logger.debug("Forked repository %s has no parent information, skipping", repo.full_name)
                    continue
                original_name = parent.full_name
                if not self.repo_exists(github_client, original_name):
                    taken_down_repos.add(original_name)
            except github.GithubException as e:
                if e.status in [403, 451] and e.message == 'Repository access blocked':
                    # tos violation, i.e. soft takedown
                    # TODO should probably be taken into account by the heuristic
                    logger.warning("Repository %s owned by user %s is blocked, ignoring", repo.full_name,
                                   target_spec.username)
                else:
                    raise

        if taken_down_repos:
            # Sorted so that the message is stable from one run to the next
            additional_details = f"The user {target_spec.username} has forks from taken down repositories: {', '.join(sorted(taken_down_repos))}."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()

    def repo_exists(self, github_client: github.Github, full_name: str) -> bool:
        """Return whether the repository ``full_name`` can be retrieved with ``github_client``.

        A 404, or a 403/451 "Repository access blocked" response, is reported as
        ``False``; any other API error is re-raised. Results are memoized per
        heuristic instance (errors are not cached), rather than in a process-wide
        cache that would keep every instance and client alive.
        """
        cache = self.__dict__.setdefault('_repo_exists_cache', {})
        cache_key = (github_client, full_name)
        if cache_key in cache:
            return cache[cache_key]

        try:
            github_client.get_repo(full_name)
            exists = True
        except github.GithubException as e:
            if e.status == 404:
                exists = False
            elif e.status in [403, 451] and e.message == 'Repository access blocked':
                # tos violation or other access issues
                exists = False
            else:
                raise

        cache[cache_key] = exists
        return exists
class UserHasLowCommunityActivity(MetadataHeuristic):
    """Flag users who show almost no community activity.

    The heuristic triggers only when ALL of the following are at or below their
    threshold: starred repositories, followed users, followers, and issues/PRs
    authored during the last ``ISSUES_OR_PR_TIME_PERIOD_DAYS`` days.
    """

    STARS_THRESHOLD = 1
    FOLLOWING_THRESHOLD = 1
    FOLLOWERS_THRESHOLD = 1
    ISSUES_OR_PR_THRESHOLD = 1
    ISSUES_OR_PR_TIME_PERIOD_DAYS = 30.5 * 6  # 6 months

    def id(self) -> str:
        return 'user.low_community_activity'

    def friendly_name(self) -> str:
        return "User with low community activity"

    def description(self) -> str:
        return "Detects when a user has very low community activity. This may indicate that the user is inauthentic."

    def target_type(self) -> TargetType:
        return TargetType.USER

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Run the heuristic against the user described by ``target_spec``.

        When the issue/PR search fails with a 422 "Validation Failed" error, the
        issue/PR criterion is treated as not met, so the heuristic passes.
        Any other API error is re-raised.
        """
        user = github_client.get_user(login=target_spec.username)
        start_date = datetime.now(timezone.utc) - timedelta(days=self.ISSUES_OR_PR_TIME_PERIOD_DAYS)

        # Each count is read once and reused for the checks, the log line and the message
        star_count = user.get_starred().totalCount
        following_count = user.get_following().totalCount
        followers_count = user.get_followers().totalCount
        has_few_stars = star_count <= self.STARS_THRESHOLD
        has_few_following = following_count <= self.FOLLOWING_THRESHOLD
        has_few_followers = followers_count <= self.FOLLOWERS_THRESHOLD

        has_few_issues_or_prs = False
        issue_count = 0
        pr_count = 0
        created_after = start_date.strftime('%Y-%m-%d')
        try:
            issues = github_client.search_issues(
                f"type:issue author:{target_spec.username} created:>{created_after}")
            prs = github_client.search_issues(
                f"type:pr author:{target_spec.username} created:>{created_after}")

            issue_count = issues.totalCount
            pr_count = prs.totalCount
            has_few_issues_or_prs = issue_count + pr_count <= self.ISSUES_OR_PR_THRESHOLD

        except github.GithubException as e:
            if e.status == 422 and e.message == 'Validation Failed':
                logger.info("User %s has their profile activity in private mode, unable to list their PRs and issues",
                            user.login)
            else:
                raise

        logger.debug("User %s has %d stars, %d following, %d followers, %d issues, and %d PRs in the last %d days.",
                     target_spec.username, star_count, following_count, followers_count,
                     issue_count, pr_count, self.ISSUES_OR_PR_TIME_PERIOD_DAYS)

        if not (has_few_stars and has_few_following and has_few_followers and has_few_issues_or_prs):
            return HeuristicRunResult.PASSED()

        # Past this point every criterion is met, so all of them are reported.
        # `:g` renders the period as "183" rather than "183.0".
        triggered = [
            f"{star_count} stars (threshold: {self.STARS_THRESHOLD})",
            f"{following_count} following (threshold: {self.FOLLOWING_THRESHOLD})",
            f"{followers_count} followers (threshold: {self.FOLLOWERS_THRESHOLD})",
            f"{issue_count + pr_count} issues/PRs in the last {self.ISSUES_OR_PR_TIME_PERIOD_DAYS:g} days (threshold: {self.ISSUES_OR_PR_THRESHOLD})",
        ]
        reason = 'User has low community activity: ' + ', '.join(triggered)
        return HeuristicRunResult.TRIGGERED(additional_details=reason)
# NOTE(review): the class name looks copy-pasted from the low-community-activity tests;
# these tests exercise RepoHasStargazersWhoJoinedOnTheSameDay. Name kept as-is.
class TestUserHasLowCommunityActivity(unittest.TestCase):
    """Tests for the `RepoHasStargazersWhoJoinedOnTheSameDay` heuristic."""

    def setUp(self):
        self.heuristic = RepoHasStargazersWhoJoinedOnTheSameDay()

    def _run_with_stargazers(self, gh, stargazers):
        """Run the heuristic on a mocked repository "user/repo" with the given stargazers."""
        target_spec = TargetSpec(target_type=TargetType.REPOSITORY, username="user", repo_name="repo")
        ghrepo = Mock(Repository)
        ghrepo.get_stargazers = Mock(return_value=mock_pygithub_list(stargazers))
        gh.get_repo.return_value = ghrepo
        return self.heuristic.run(gh, target_spec)

    @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github')
    def test_positive_all_stargazers_joined_same_day(self, gh):
        stargazers = [
            Mock(login=login, created_at=datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ"))
            for login in ("user1", "user2", "user3", "user4")
        ]

        result = self._run_with_stargazers(gh, stargazers)
        self.assertTrue(result.triggered)

    @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github')
    def test_positive_threshold_of_stargazers_joined_same_day(self, gh):
        num_users = 10
        # Two users more than the share required by the threshold
        num_joined_same_day = round(RepoHasStargazersWhoJoinedOnTheSameDay.THRESHOLD_PERCENT / 100 * num_users) + 2
        same_day_users = []
        for i in range(num_joined_same_day):
            joined_at = datetime.strptime("2025-08-07T00:00:00Z", "%Y-%m-%dT%H:%M:%SZ")
            same_day_users.append(Mock(login=f"user{i}", created_at=joined_at))
        other_users = []
        for i in range(num_users - num_joined_same_day):
            other_users.append(Mock(login=f"user{i}", created_at=random_date()))

        result = self._run_with_stargazers(gh, same_day_users + other_users)
        self.assertTrue(result.triggered)

    @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github')
    def test_negative_not_enough_stargazers(self, gh):
        result = self._run_with_stargazers(gh, [])
        self.assertFalse(result.triggered)

    @patch('ghbuster.heuristics.repo_has_stargazzers_who_joined_the_same_day.github.Github')
    def test_negative_too_many_stargazers(self, gh):
        # One stargazer more than the heuristic is willing to analyze
        num_stargazers = RepoHasStargazersWhoJoinedOnTheSameDay.MAX_STARGAZERS + 1
        stargazers = []
        for i in range(num_stargazers):
            stargazers.append(Mock(login=f"user{i}", created_at=random_date()))

        result = self._run_with_stargazers(gh, stargazers)
        self.assertFalse(result.triggered)
# e.g. https://github.com/al1enb1t/cheatengine-for-linux
# or https://github.com/Caztemaz/Lnk-Exploit-FileBinder-Certificate-Spoofer-Reg-Doc-Cve-Rce in the case of taken down users
class RepoCommitsOnlyFromSuspiciousUnlinkedEmails(MetadataHeuristic):
    """Flag repositories whose inspected commits all come from suspicious unlinked emails.

    A commit is counted as suspicious when it is not linked to any GitHub user and
    its git author name matches neither the owner's login nor full name, or when it
    is linked to a GitHub user that cannot be found anymore.
    """

    MAX_COMMITS = 100  # upper bound on the number of commits inspected

    def id(self) -> str:
        return 'repo.commits_suspicious_unlinked_emails'

    def friendly_name(self) -> str:
        return "Repository commits only from suspicious unlinked emails"

    def description(self) -> str:
        return "Detects when a repository has commits with unlinked emails that also don't match the owner's username or full name."

    def target_type(self) -> TargetType:
        return TargetType.REPOSITORY

    def run(self, github_client: github.Github, target_spec: TargetSpec) -> HeuristicRunResult:
        """Inspect up to ``MAX_COMMITS`` commits of the repository.

        Triggers when at least one commit was inspected and every inspected
        commit is suspicious; passes otherwise.
        """
        repo = github_client.get_repo(full_name_or_id=target_spec.repo_full_name())
        user = github_client.get_user(login=target_spec.username)
        normalized_username = user.login.lower()
        normalized_user_full_name = user.name.lower() if user.name else None
        commits = repo.get_commits()
        logger.debug("Analyzing %d commits for repository %s", commits.totalCount, target_spec.repo_full_name())
        num_suspicious = 0
        num_processed = 0
        unlinked_emails = set()
        for commit in commits:
            if num_processed == self.MAX_COMMITS:
                logger.debug("Reached max commit limit of %d, stopping the processing", self.MAX_COMMITS)
                break
            num_processed += 1
            git_author = commit.commit.author
            # A missing author name is treated as an empty name, which never matches the owner
            normalized_committer_name = (git_author.name or "").lower()
            name_matches = (normalized_committer_name in [normalized_username, normalized_user_full_name])
            if commit.author is None:
                # Case 1: the commit is not linked to any GitHub user based on the email
                # As it's a common misconfiguration, we only flag it if the author name in the git metadata doesn't match the user's username/name
                is_suspicious = not name_matches
            else:
                # Case 2: the commit is linked to a GitHub user that has previously been taken down, we consider it "suspiciously-unlinked" too
                is_suspicious = self.commit_linked_to_taken_down_user(github_client, commit.author)

            if is_suspicious:
                num_suspicious += 1
                if git_author.email:
                    unlinked_emails.add(git_author.email)

        # Require at least one inspected commit: with none, "all commits are suspicious"
        # would be vacuously true and the heuristic would trigger on no evidence.
        if num_processed > 0 and num_suspicious == num_processed:
            # Sorted so that the message is stable from one run to the next
            additional_details = f"The repository only has commits from unlinked emails ({', '.join(sorted(unlinked_emails))})."
            return HeuristicRunResult.TRIGGERED(additional_details=additional_details)

        return HeuristicRunResult.PASSED()

    def commit_linked_to_taken_down_user(self, github_client: github.Github, author: NamedUser) -> bool:
        """Return whether ``author`` cannot be retrieved by id anymore (404).

        Results are memoized per heuristic instance, keyed on the client and the
        user id; API errors other than 404 are re-raised and not cached.
        """
        # We know the commit is linked to a specific GitHub user, i.e. the git metadata email was linked to a specific user at the time of the commit
        # In some cases, the associated user doesn't exist anymore (e.g. taken down), so we consider the email as "currently unlinked" in this case
        # (like we'd see in the GitHub UI that the username is not clickable)
        cache = self.__dict__.setdefault('_taken_down_user_cache', {})
        cache_key = (github_client, author.id)
        if cache_key in cache:
            return cache[cache_key]

        try:
            github_client.get_user_by_id(author.id)
            taken_down = False  # no exception, the user exists
        except github.GithubException as e:
            if e.status != 404:
                raise
            taken_down = True

        cache[cache_key] = taken_down
        return taken_down
7 |
8 |