├── .gitignore ├── listallrepos ├── listallrepos.go ├── README.md ├── git_all.sh ├── do_everything.sh └── calculate_staleness.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .envrc 2 | repos/ 3 | stats/ 4 | -------------------------------------------------------------------------------- /listallrepos: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mwinters0/github-cleanup/master/listallrepos -------------------------------------------------------------------------------- /listallrepos.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "encoding/json" 6 | "github.com/google/go-github/github" 7 | "golang.org/x/oauth2" 8 | "log" 9 | "os" 10 | ) 11 | 12 | func main() { 13 | ctx := context.Background() 14 | ts := oauth2.StaticTokenSource( 15 | &oauth2.Token{AccessToken: os.Getenv("GITHUB_TOKEN")}, 16 | ) 17 | tc := oauth2.NewClient(ctx, ts) 18 | 19 | client := github.NewClient(tc) 20 | 21 | opt := &github.RepositoryListByOrgOptions{ 22 | ListOptions: github.ListOptions{PerPage: 10}, 23 | } 24 | 25 | var allRepos []*github.Repository 26 | for { 27 | repos, resp, err := client.Repositories.ListByOrg(ctx, os.Getenv("GITHUB_ORG"), opt) 28 | fatalIf(err) 29 | allRepos = append(allRepos, repos...) 30 | if resp.NextPage == 0 { 31 | break 32 | } 33 | opt.Page = resp.NextPage 34 | } 35 | 36 | j, err := json.Marshal(allRepos) 37 | fatalIf(err) 38 | os.Stdout.Write(j) 39 | } 40 | 41 | func fatalIf(err error) { 42 | if err != nil { 43 | log.Fatal(err) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # github-cleanup 2 | 3 | You've got a lot of repos. Like, thousands. Some of them probably aren't used any more, or never were. 
How do you find the dead weight? 4 | 5 | GitHub's API is not informative enough to help you, so these scripts will: 6 | 1. Retrieve the full list of repos in your GitHub organization. 7 | 2. Clone them all locally, and check out whichever branch has the latest commit. 8 | 3. Assign a weight ("heat") to each repo for how likely it is to be useless, and aggregate those stats into a CSV file for ease of cajoling your coworkers. 9 | 10 | 11 | 12 | ## Usage 13 | 1. Clone this repo to somewhere with lots of space. 14 | 2. Have [jq](https://stedolan.github.io/jq/) somewhere on your PATH, or inside this dir. 15 | 3. Set environment variables `GITHUB_ORG` and `GITHUB_TOKEN`. 16 | 4. Run `./do_everything.sh`. 17 | 18 | The code will clone all repos to `./repos`, and generate data files in `./stats/TODAY/TODAY.csv` (where `TODAY` is the current date as `YYYYMMDD`). 19 | 20 | ## TODO 21 | - Pass "created_at" through the pipeline 22 | - Move the "ignore recently-created repos" code into calculate_staleness.sh so all stats options are in one place 23 | - Create a whitelist facility 24 | - Support stdin/out 25 | -------------------------------------------------------------------------------- /git_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Reads $DATA_FILE ($1), clones all repos to REPOS_DIR ($2), finds and pulls the 4 | # most-recent branch. 5 | 6 | #set -e 7 | # This is intentionally not set because git can fail in many marvelous ways. 8 | # TODO gracefully handle any git failures and maybe produce a report. 9 | 10 | 11 | 12 | ### Init ### 13 | 14 | DATA_FILE="$1" 15 | REPOS_DIR="$2" 16 | NUM_REPOS=$(wc -l "$DATA_FILE" | awk '{print $1}') 17 | 18 | if [ ! -r "$DATA_FILE" ]; then 19 | echo "Can't read data file '$DATA_FILE', aborting" 20 | exit 1 21 | fi 22 | 23 | if [ ! 
-w "$REPOS_DIR" ]; then 24 | echo "Can't write to '$REPOS_DIR', aborting" 25 | exit 1 26 | fi 27 | 28 | echo "Data file is $DATA_FILE" 29 | echo "Repos dir is $REPOS_DIR" 30 | 31 | 32 | ### Main ### 33 | # CURRENT_NUM is bumped where it is printed, so no 'continue' below can skip it. 34 | CURRENT_NUM=0 35 | cd "$REPOS_DIR" 36 | while read -r REPO_URL; do 37 | echo "$((++CURRENT_NUM))/${NUM_REPOS}" 38 | REPO_NAME=$(basename "$REPO_URL" .git) 39 | REPO_DIR="$REPOS_DIR/$REPO_NAME" 40 | echo "$REPO_DIR" 41 | 42 | if [[ ! -d "$REPO_DIR" ]]; then 43 | cd "$REPOS_DIR" 44 | git clone "$REPO_URL" 45 | fi 46 | cd "$REPO_DIR" || continue # clone failed; never run the git commands below in the wrong directory 47 | #find newest branch and pull it (skip the 'origin/HEAD -> ...' alias line, strip the '* ' current-branch marker) 48 | git fetch 49 | LATEST_BRANCH_NAME=$(git branch -a --sort=-committerdate | grep -v -- '->' | head -1 | sed 's/^[*+] //' | awk '{print $1}' | sed 's/remotes\/origin\///g') 50 | if [ -z "$LATEST_BRANCH_NAME" ]; then 51 | # zero branches, empty repo 52 | continue 53 | fi 54 | if ${MJS_DEBUG:-false}; then echo "LATEST_BRANCH_NAME: ${LATEST_BRANCH_NAME}"; fi # default to false: an unset variable expands to an empty command, which succeeds 55 | git checkout "$LATEST_BRANCH_NAME" 56 | git merge --ff-only 57 | cd "$REPOS_DIR" 58 | 59 | 60 | sleep 1 # just to avoid hitting any rate limits 61 | done <"${DATA_FILE}" 62 | -------------------------------------------------------------------------------- /do_everything.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Runs everything with default settings (clone to ./repos, output to ./stats/TODAY) 5 | 6 | 7 | ### Config ### 8 | # TODO getopt 9 | 10 | ### Init ### 11 | 12 | MY_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" #location of this script, for generating absolute paths 13 | TIMESTAMP=$(date +"%Y%m%d") 14 | STATS_DIR="$MY_DIR/stats/$TIMESTAMP" 15 | REPOS_DIR="$MY_DIR/repos" 16 | 17 | if [ -z "$GITHUB_ORG" ]; then 18 | echo "You must set environment variable GITHUB_ORG, aborting." 19 | exit 1 20 | fi 21 | 22 | if [ -z "$GITHUB_TOKEN" ]; then 23 | echo "You must set environment variable GITHUB_TOKEN, aborting." 24 | exit 1 25 | fi 26 | 27 | if [ ! 
-x "./listallrepos" ]; then 28 | echo "./listallrepos is not executable, aborting." 29 | exit 1 30 | fi 31 | 32 | if [ ! -x "./git_all.sh" ]; then 33 | echo "./git_all.sh is not executable, aborting." 34 | exit 1; 35 | fi 36 | 37 | if [ ! -x "./calculate_staleness.sh" ]; then 38 | echo "./calculate_staleness.sh is not executable, aborting." 39 | exit 1; 40 | fi 41 | 42 | # jq might be in ./ or might be on path 43 | JQ_CMD='./jq' 44 | if [ ! -x "$JQ_CMD" ]; then 45 | command -v jq >/dev/null 2>&1 || { 46 | echo "I require jq but it's not installed, aborting."; 47 | exit 1; 48 | } 49 | JQ_CMD='jq' 50 | fi 51 | 52 | 53 | ### Main ### 54 | 55 | echo "Stats dir is $STATS_DIR" 56 | if [ ! -d "$STATS_DIR" ]; then 57 | echo "Stats dir doesn't exist, creating..." 58 | mkdir -p "$STATS_DIR" 59 | if [ $? -ne 0 ] ; then 60 | echo "Unable to mkdir, aborting." 61 | exit 1 62 | fi 63 | fi 64 | 65 | echo "Repos dir is $REPOS_DIR" 66 | if [ ! -d "$REPOS_DIR" ]; then 67 | echo "Repos dir doesn't exist, creating..." 68 | mkdir -p "$REPOS_DIR" 69 | if [ $? -ne 0 ] ; then 70 | echo "Unable to mkdir, aborting." 71 | exit 1 72 | fi 73 | fi 74 | 75 | echo "Getting full repo list ..." 76 | ./listallrepos > "$STATS_DIR/allrepos.json" 77 | if [ $? -ne 0 ] ; then 78 | echo "There was an error in listallrepos, aborting." 79 | exit 1 80 | fi 81 | 82 | #Ignore repos created in the past 14 days (60 * 60 * 24 * 14 = 1209600) 83 | echo "Filtering to exclude recently-created repos ..." 84 | < "$STATS_DIR/allrepos.json" "$JQ_CMD" '[.[] | select((now - (.created_at | fromdate)) >= 1209600)]' > "$STATS_DIR/allrepos-exclude-recent.json" 85 | 86 | # Everything downstream runs off of this file (instead of just globbing the 87 | # directories) so that we can easily implement a whitelist / more metadata / 88 | # etc here. 89 | DATA_FILE="$STATS_DIR/ssh_urls" 90 | echo "Extracting git data ..." 
91 | < "$STATS_DIR/allrepos-exclude-recent.json" "$JQ_CMD" -r '.[] | .ssh_url' > "$DATA_FILE" 92 | 93 | echo "Cloning and pulling all ..." 94 | ./git_all.sh "$DATA_FILE" "$REPOS_DIR" 95 | 96 | ./calculate_staleness.sh "$DATA_FILE" "$REPOS_DIR" "$STATS_DIR/$TIMESTAMP.csv" 97 | -------------------------------------------------------------------------------- /calculate_staleness.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # config 5 | ENABLE_DEBUG_OUTPUT=false 6 | OLD_AGE_DAYS=1095 7 | FEW_FILES=10 8 | TINY_KB=8 9 | 10 | 11 | 12 | ### Functions ### 13 | # usage: print the expected arguments and exit 1. 14 | usage() { 15 | echo "USAGE: $0 DATA_FILE REPOS_DIR OUTPUT_FILE" 16 | echo 17 | exit 1 18 | } 19 | # writeRepoStats: emit one CSV row for the current repo (appended to $OUTPUT_FILE, or echoed in debug mode). The org in the URL comes from $GITHUB_ORG, falling back to the previously hard-coded name. 20 | # Quick n dirty! Global vars! 21 | writeRepoStats() { 22 | LINE=${HEAT},"\"https://github.com/${GITHUB_ORG:-TheWeatherCompany}/${REPO_NAME}\",\"${HEAT_REASON}\",\"${LAST_COMMIT_BRANCH}\",\"${LAST_COMMIT_DATE}\",${NUM_COMMITS},${NUM_FILES},${DISK_SIZE_CONTENT}" 23 | if $ENABLE_DEBUG_OUTPUT; then 24 | echo "$LINE" 25 | echo; 26 | else 27 | echo "$LINE" >> "$OUTPUT_FILE" 28 | fi 29 | } 30 | 31 | 32 | ### Init ### 33 | 34 | DATA_FILE="$1" 35 | REPOS_DIR="$2" 36 | OUTPUT_FILE="$3" 37 | NOW=$(date +%s) 38 | 39 | if [[ -z "$DATA_FILE" ]]; then usage; fi 40 | if [[ -z "$REPOS_DIR" ]]; then usage; fi 41 | if [[ -z "$OUTPUT_FILE" ]]; then usage; fi 42 | 43 | if [ ! -r "$DATA_FILE" ]; then 44 | echo "Can't read data file '$DATA_FILE', aborting" 45 | exit 1 46 | fi 47 | 48 | if [ ! -r "$REPOS_DIR" ]; then 49 | echo "Can't read '$REPOS_DIR', aborting" 50 | exit 1 51 | fi 52 | 53 | if [ ! -w "$(dirname "$OUTPUT_FILE")" ]; then 54 | echo "Can't write to '$(dirname "$OUTPUT_FILE")', aborting" 55 | exit 1 56 | fi 57 | 58 | 59 | ### Main ### 60 | 61 | NUM_REPOS=$(wc -l "$DATA_FILE" | awk '{print $1}') 62 | 63 | #Write CSV header into new file 64 | if ! 
$ENABLE_DEBUG_OUTPUT ; then 65 | echo '"Heat","Repo URL","Heat Reasons","Latest branch","Last commit date","Number of commits","Number of files","Size of repo content (kb)"' > "$OUTPUT_FILE" 66 | fi 67 | 68 | CURRENT_NUM=0 69 | while read REPO_URL; do 70 | CURRENT_NUM=$((++CURRENT_NUM)) 71 | 72 | REPO_NAME=$(basename "$REPO_URL" .git) 73 | echo "${CURRENT_NUM}/${NUM_REPOS} $REPO_NAME" 74 | 75 | #sanity check 76 | REPO_DIR="$REPOS_DIR/$REPO_NAME" 77 | if [[ ! -d "$REPO_DIR" ]]; then 78 | echo "Not a directory: '${REPO_DIR}', skipping" 79 | continue 80 | fi 81 | #init 82 | HEAT=0 83 | HEAT_REASON='' 84 | NUM_COMMITS=0 85 | LAST_COMMIT_DATE='' 86 | LAST_COMMIT_UNIX=0 87 | LAST_COMMIT_AGE=0 88 | LAST_COMMIT_BRANCH='' 89 | NUM_FILES=0 90 | DISK_SIZE_ACTUAL=0 91 | DISK_SIZE_GIT=0 92 | DISK_SIZE_CONTENT=0 93 | NUM_BRANCHES=0 94 | 95 | cd "$REPO_DIR" 96 | if ! git --no-pager log -1 >/dev/null 2>&1; then 97 | # No git log (non-zero error) = totally empty 98 | HEAT=100 99 | HEAT_REASON='completely empty' 100 | LAST_COMMIT_DATE='never' 101 | writeRepoStats 102 | continue 103 | fi 104 | LAST_COMMIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) 105 | NUM_COMMITS=$(git rev-list --count HEAD --) 106 | LAST_COMMIT_DATE=$(git --no-pager log -1 --format=%cI) 107 | LAST_COMMIT_UNIX=$(git --no-pager log -1 --format=%ct) 108 | LAST_COMMIT_AGE=$((($NOW - $LAST_COMMIT_UNIX) / 86400 )) #age in days 109 | NUM_FILES=$(find . -type f | grep -v '.git/' | wc -l | awk '{print $1}') 110 | DISK_SIZE_ACTUAL=$(du -k -s . 
| awk '{print $1}') 111 | DISK_SIZE_GIT=$(du -k -s .git | awk '{print $1}') 112 | DISK_SIZE_CONTENT=$(($DISK_SIZE_ACTUAL - $DISK_SIZE_GIT)) 113 | if $ENABLE_DEBUG_OUTPUT; then 114 | echo "NUM_COMMITS: ${NUM_COMMITS}" 115 | echo "LAST_COMMIT_DATE: ${LAST_COMMIT_DATE}" 116 | echo "LAST_COMMIT_UNIX: ${LAST_COMMIT_UNIX}" 117 | echo "LAST_COMMIT_AGE: ${LAST_COMMIT_AGE}" 118 | echo "NUM_FILES: ${NUM_FILES}" 119 | echo "DISK_SIZE_CONTENT: ${DISK_SIZE_CONTENT}" 120 | fi 121 | if [[ "$LAST_COMMIT_AGE" -gt "$OLD_AGE_DAYS" ]]; then 122 | HEAT=$(($HEAT + 10)) 123 | # Add 1 heat for every 30 days past OLD_AGE_DAYS 124 | BASE_AGE=$(($LAST_COMMIT_AGE - $OLD_AGE_DAYS)) 125 | HEAT=$(($HEAT + ($BASE_AGE / 30))) 126 | HEAT_REASON="$HEAT_REASON, no recent commits" 127 | fi 128 | if [[ "$NUM_FILES" -lt "$FEW_FILES" ]]; then 129 | # TODO Don't ding for few files if LAST_COMMIT_AGE is fairly recent (1yr?), or if NUM_COMMITS > 5, or DISK_SIZE_CONTENT > 16 130 | HEAT=$(($HEAT + (($FEW_FILES - $NUM_FILES) * 2) + 1)) #fewer files = more heat, 1 file = $FEW_FILES heat 131 | HEAT_REASON="$HEAT_REASON, few files" 132 | fi 133 | if [[ "$DISK_SIZE_CONTENT" -lt "$TINY_KB" ]]; then 134 | HEAT=$(($HEAT + ($TINY_KB - $DISK_SIZE_CONTENT) + 1)) #smaller = more heat, 1kb = $TINY_KB heat 135 | HEAT_REASON="$HEAT_REASON, tiny" 136 | fi 137 | cd "$REPOS_DIR" 138 | 139 | if [[ "$HEAT" -gt 0 ]]; then 140 | HEAT_REASON=${HEAT_REASON:2} # strip leading ', ' 141 | writeRepoStats 142 | fi 143 | done <${DATA_FILE} 144 | --------------------------------------------------------------------------------