├── example_output.png ├── .gitignore ├── README.md ├── LICENSE.md ├── utils.sh └── archaeologit.sh /example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peterjaric/archaeologit/HEAD/example_output.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # -*- mode: gitignore; -*- 2 | *~ 3 | \#*\# 4 | /.emacs.desktop 5 | /.emacs.desktop.lock 6 | *.elc 7 | auto-save-list 8 | tramp 9 | .\#* 10 | 11 | 12 | # log files and candidate files 13 | *.log 14 | *.txt 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archaeologit 2 | This script scans the history of a user's GitHub repositories 3 | for a given pattern to find sensitive things that may have been there 4 | but have been overwritten in a later commit. For example passwords or secret tokens. 5 | 6 | ## Usage 7 | archaeologit.sh <github username or repo url> '<regular expression>' [<log file>] 8 | 9 | ## Examples 10 | archaeologit.sh USERNAME 'password.....|secret.....|passwd.....|credentials.....|creds.....|aws.?key.....|consumer.?key.....|api.?key.....|aws.?token.....|oauth.?token.....|access.?token.....|api.?token.....'
11 | archaeologit.sh peterjaric 'password|secret|token' scan.log 12 | 13 | ## Example output 14 | ![Example output when running archaeologit](example_output.png) 15 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Peter Jaric 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This file is part of archaeologit which is released under the MIT license. See LICENSE.md. 
# Archaeologit was originally written by Peter Jaric

# Fail script immediately if something goes wrong
set -e

# Use tput for terminal styling when it is installed; otherwise fall back to
# the no-op builtin ":" so the substitutions below yield empty strings.
if command -v tput > /dev/null
then
    TPUT=tput
else
    TPUT=:
fi

# tput exits non-zero when TERM is unset or lacks the capability (cron, CI,
# TERM=dumb). Swallow that, otherwise "set -e" aborts whoever sources this file.
UNDERLINE_START=$(${TPUT} smul 2> /dev/null || true)
UNDERLINE_END=$(${TPUT} rmul 2> /dev/null || true)

# Log message to stdout and, optionally, append it to a file.
# Usage: log [-d] [-n] [-r] [-f FILE] MESSAGE...
#   -d       Suppress date
#   -n       No new line
#   -r       Overwrite the current terminal line (leading carriage return, implies -n)
#   -f FILE  Also append the message, without underline codes, to FILE
# In message: _S_ -> Start underlining, _E_ -> Stop underlining
# The message is printed verbatim: backslash sequences such as \n are NOT
# interpreted, so regular expressions can be logged safely.
log() {
    local OPTIND=1
    local OPT
    local SUPPRESS_DATE=false
    local SUPPRESS_NEWLINE=false
    local OVERWRITE=false
    local FILE=""
    local MSG
    local FMSG
    local STAMP

    # The leading ":" selects silent error reporting, which is what makes
    # OPTARG carry the offending option character in the ":" and "?" arms.
    while getopts ":rndf:" OPT
    do
        case "${OPT}" in
            d) SUPPRESS_DATE=true ;;
            f) FILE=${OPTARG} ;;
            n) SUPPRESS_NEWLINE=true ;;
            r) OVERWRITE=true; SUPPRESS_NEWLINE=true ;;
            :) echo "Option -${OPTARG} to log requires an argument" >&2 ;;
            \?) echo "Invalid option to log: -${OPTARG}" >&2 ;;
        esac
    done
    shift $((OPTIND - 1))

    # MSG goes to the terminal, FMSG (no terminal codes) goes to the file
    MSG="$*"
    FMSG=${MSG}

    # Handle special strings
    MSG=${MSG//_S_/${UNDERLINE_START}}
    MSG=${MSG//_E_/${UNDERLINE_END}}
    FMSG=${FMSG//_S_/}
    FMSG=${FMSG//_E_/}

    if [ "${SUPPRESS_DATE}" != true ]
    then
        # Take the timestamp once so terminal and file always agree
        STAMP="[$(date)] "
        MSG="${STAMP}${MSG}"
        FMSG="${STAMP}${FMSG}"
    fi

    if [ "${OVERWRITE}" = true ]
    then
        printf '\r'
    fi

    printf '%s' "${MSG}"

    if [ "${SUPPRESS_NEWLINE}" != true ]
    then
        printf '\n'
    fi

    if [ -n "${FILE}" ]
    then
        printf '%s\n' "${FMSG}" >> "${FILE}"
    fi
}


# Stand-in for timeout(1) on systems that have neither timeout nor gtimeout.
# Usage: faketimeout DURATION COMMAND [ARG...]
# Drops DURATION and runs COMMAND with its arguments exactly as given (no
# re-splitting), so "faketimeout 5m bash -c '<script>'" behaves like timeout
# minus the time limit. Returns the status of COMMAND.
faketimeout() {
    shift 1
    "$@"
}

# Pick the timeout implementation: GNU timeout, Homebrew's gtimeout, or the
# no-limit fallback above.
if command -v timeout > /dev/null
then
    TIMEOUT_BIN=timeout
elif command -v gtimeout > /dev/null
then
    TIMEOUT_BIN=gtimeout
else
    TIMEOUT_BIN=faketimeout
    log -d "No timeout command found, disabling timeout functionality."
fi
# ^ end of the timeout detection in utils.sh

# Prefer GNU sed (installed as gsed on macOS/BSD) when it is available
SED_BIN=sed
if command -v gsed > /dev/null
then
    SED_BIN=gsed
fi

# --------------------------------------------------------------------------------
# /archaeologit.sh:
# --------------------------------------------------------------------------------
#!/bin/bash
# This file is part of archaeologit which is released under the MIT license. See LICENSE.md.
# Archaeologit was originally written by Peter Jaric

DESCRIPTION=\
"This script scans the history of a user's GitHub repositories
for a given pattern to find sensitive things that may have been there
but have been overwritten in a later commit. For example passwords or
secret tokens.

Example:
./archaeologit.sh USERNAME 'password.....|secret.....|passwd.....|credentials.....|creds.....|aws.?key.....|consumer.?key.....|api.?key.....|aws.?token.....|oauth.?token.....|access.?token.....|api.?token.....'
"

# Fail script immediately if something goes wrong (some commands need "|| true" after them because of this)
set -e

# Load some useful utility functions. utils.sh is resolved next to this script
# so the tool also works when it is started from another directory.
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

# Timeout of git clone and git fetch per repository (also applied to the search of each repository)
TIMEOUT=5m

# Max number of commits to scan. Will scan the latest commits if more than this limit.
MAX_COMMITS=10000

# Where to store repositories locally
WORKINGPATH=/tmp/archaeologit/repos

# Where results are logged by default
DEFAULT_LOG_FILE=archaeologit.log

# Do not show files matching this pattern
export PATH_BLACKLIST_PATTERN='.*\.md|.*\.markdown|.*\.html?|.*\.css|.*\.min.js|.*\.rst|.*\.jquery\..*\.js|.*/node_modules/.*|.*test.*|.*example.*|.*sample.*|.*rdoc|.*spec.rb'

# Get script arguments
USERNAME=$1
PATTERN=$2
LOG_FILE=$3

# PATTERN, SED_PATTERN and SED_BIN are exported because the search pipeline
# further down runs in a child "bash -c" process.
export PATTERN
# Backslash-escape |, ? and * for the sed expressions in the search pipeline.
# NOTE(review): an escaped * is a literal star in sed's basic regex syntax,
# so patterns using * may match differently in sed than in grep -E - confirm.
export SED_PATTERN=$(echo "${PATTERN}" | ${SED_BIN} 's/\([|?*]\)/\\\1/g')
export SED_BIN

# Validate arguments
if [[ -z "${USERNAME}" || -z "${PATTERN}" ]]
then
    echo "${DESCRIPTION}"
    echo "Usage: $0 <github username or repo url> '<regular expression>' [<log file>]"
    echo "Example: $0 peterjaric 'password|secret|token' scan.log"
    echo "Warning: do not use capturing groups in pattern."
    exit 1
fi


# Cleanup function for when script is interrupted
function prematurefinish {
    # LOG_FILE may still be empty if we fail before a default was chosen;
    # "log -f" would then swallow the message as its file name.
    if [[ -n "${LOG_FILE}" ]]
    then
        log -f "${LOG_FILE}" "Script exited prematurely, probably due to an error."
    else
        log "Script exited prematurely, probably due to an error."
    fi
    if [[ -n "${ONGOING_CLONE}" && -d "${ONGOING_CLONE}" ]]
    then
        log -d "Cleaning up interrupted clone folder ${ONGOING_CLONE}."
        rm -rf -- "${ONGOING_CLONE}"
    fi
}
trap prematurefinish EXIT

mkdir -p "${WORKINGPATH}"

# Create list of repos, either just one from command line, or by
# fetching it from GitHub
if [[ "${USERNAME}" == http* ]]
then
    # Check file argument and set to default if missing
    if [[ -z "${LOG_FILE}" ]]
    then
        LOG_FILE="${DEFAULT_LOG_FILE}"
    fi
    REPOS=${USERNAME}
    log -d "Writing output to _S_${LOG_FILE}_E_"
    log -f "${LOG_FILE}" "Fetching just one repo: _S_${REPOS}_E_"
else
    # Check file argument and set to default if missing
    if [[ -z "${LOG_FILE}" ]]
    then
        LOG_FILE="${USERNAME}_${DEFAULT_LOG_FILE}"
    fi

    log -d "Writing output to _S_${LOG_FILE}_E_"
    log -f "${LOG_FILE}" "Fetching _S_${USERNAME}_E_'s GitHub repos..."

    # The name may belong to a user or to an organization, so ask both endpoints.
    # Currently not paging above 100 repos
    USER_JSON=$(curl -s "https://api.github.com/users/${USERNAME}/repos?type=all&per_page=100")
    ORG_JSON=$(curl -s "https://api.github.com/orgs/${USERNAME}/repos?type=all&per_page=100")

    # Keep the two answers on separate lines (gluing them together fuses the
    # last URL of one list with the first URL of the other) and drop repos
    # that both endpoints returned, preserving the original order.
    REPOS=$(printf '%s\n%s\n' "${USER_JSON}" "${ORG_JSON}" |
        grep clone_url |
        cut -d'"' -f4 |
        awk '!seen[$0]++')
fi

LOG_FILE=$(realpath "${LOG_FILE}")

REPO_COUNT=$(echo -n "${REPOS}" | wc -w)

log -f "${LOG_FILE}" "Going to search for: /${PATTERN}/i in ${REPO_COUNT} repos."

export LOG_FILE

# REPOS is a whitespace separated list of URLs, so word splitting is intended here
for REPO in ${REPOS}
do
    # Web URL of the repository: drop a trailing slash and an optional ".git"
    # suffix (URLs given on the command line do not always end in ".git")
    REPO_URL=${REPO%/}
    export GITHUBURL=${REPO_URL%.git}

    # Create the local folder name for this repository: username/reponame,
    # i.e. the last two path components of the URL
    REPO_NAME=${GITHUBURL##*/}
    REPO_OWNER=${GITHUBURL%/*}
    REPO_OWNER=${REPO_OWNER##*/}
    CLONEPATH=${WORKINGPATH}/${REPO_OWNER}/${REPO_NAME}

    # If repository already present, just update it, otherwise clone it.
    # NOTE(review): GIT_GET is a command string that is later passed to eval;
    # it embeds the repository URL/path, so a hostile URL could inject shell code.
    if [[ -d "${CLONEPATH}" ]]
    then
        FRESH_CLONE=false
        # Update existing repository. The status of this command is that of the
        # trailing "cd -", so a failed fetch still falls through to searching
        # the local copy. NOTE(review): confirm that this is intended.
        GIT_GET="bash -c 'cd ${CLONEPATH} && git fetch -q origin HEAD:HEAD ; cd -'"
    else
        FRESH_CLONE=true
        # Clone repository
        mkdir -p "${CLONEPATH}"

        # Inject fake username and password into repo url to avoid prompting when
        # cloning private or removed repo
        REPO_WITH_CREDS=$(echo "${REPO}" | sed 's|//|//git:git@|')

        GIT_GET="git clone --bare -q ${REPO_WITH_CREDS} ${CLONEPATH}"
    fi

    log -f "${LOG_FILE}" "Getting _S_${REPO}_E_..."

    # Going to fetch this repo (prematurefinish removes this folder if we are interrupted)
    ONGOING_CLONE=${CLONEPATH}

    # Fetch repo, timeout if it takes too much time
    if eval ${TIMEOUT_BIN} ${TIMEOUT} ${GIT_GET} > /dev/null 2> /dev/null
    then
        # Done fetching this repo
        unset ONGOING_CLONE

        log "Searching repository..."
        cd "${CLONEPATH}"

        # Make a list of all commit hashes. The stderr redirection sits inside
        # the substitution so that it applies to git itself.
        ALL_COMMITS=$(git log --all --pretty=format:%h --max-count "${MAX_COMMITS}" 2> /dev/null) || true

        export ALL_COMMITS

        if [[ -n "${ALL_COMMITS}" ]]
        then
            # Grep for all occurrences of PATTERN in all commits of the repository.
            # The pipeline runs in a child bash so that the timeout covers all of it;
            # it only sees exported variables.
            if ! ${TIMEOUT_BIN} ${TIMEOUT} bash -c 'git grep -I --ignore-case --line-number --extended-regexp -e "${PATTERN}" ${ALL_COMMITS} |
                sort -u -t":" -k4 | # Remove duplicate lines
                while IFS= read -r LINE; do # Fold lines and keep only the parts with the pattern
                    HASH=$(echo "${LINE}" | cut -d: -f1)
                    GITPATH=$(echo "${LINE}" | cut -d: -f2)
                    LINENUMBER=$(echo "${LINE}" | cut -d: -f3)
                    GITHUBPATH="${GITHUBURL}/blob/${HASH}/${GITPATH}"
                    if [[ ! ${GITPATH} =~ ${PATH_BLACKLIST_PATTERN} ]]
                    then
                        echo "${LINE}" |
                            tr -d "\015" | # Remove Windows line ending character (^M)
                            cut -d: -f4- | # Remove location
                            fold -s | # Fold long lines on spaces
                            fold | # Fold again if any lines still are too long
                            grep -iE "${PATTERN}" | # Only keep lines matching the pattern
                            ${SED_BIN} "s@\(${SED_PATTERN}\)@KILLUNTILHERE\1@i" | # Mark the start of the first match
                            ${SED_BIN} "s@^.*KILLUNTILHERE\(${SED_PATTERN}\)\(.\{0,30\}\).*@\1\2~${GITHUBPATH}#L${LINENUMBER}@i" # Format line to fit the column command
                    fi
                done | # Echo to stdout
                column -t -s"~" | # Format line in columns
                tee -a "${LOG_FILE}" | # Also echo to file
                grep --color -iE "${PATTERN}" || true' # Colorize pattern in stdout
            then
                log -f "${LOG_FILE}" "Timed out after ${TIMEOUT}."
            fi
        else
            log "Empty repository, no commits!"
        fi
        cd - > /dev/null
    else
        # A clone that failed or timed out leaves its folder behind; remove it,
        # otherwise the next run mistakes it for an existing repository and
        # never retries the clone.
        if [[ "${FRESH_CLONE}" == true ]]
        then
            rm -rf -- "${CLONEPATH:?}"
        fi
        unset ONGOING_CLONE
        log -f "${LOG_FILE}" "Timed out after ${TIMEOUT} or could not fetch repository."
    fi
    log -f "${LOG_FILE}" "Done."
done

# Remove trap, we are going to exit correctly
trap - EXIT
# --------------------------------------------------------------------------------