├── .github └── workflows │ └── main.yml ├── .gitignore ├── .markdownlint.json ├── LICENSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.md ├── demo.json ├── llnl_config.json ├── requirements.txt ├── requirements ├── dev.txt └── production.txt ├── scraper ├── __init__.py ├── azuredevops │ ├── __init__.py │ └── models.py ├── bitbucket │ └── __init__.py ├── code_gov │ ├── __init__.py │ └── models.py ├── doecode │ └── __init__.py ├── gen_code_gov_json.py ├── github │ ├── __init__.py │ ├── queryManager.py │ └── util.py ├── gitlab │ └── __init__.py ├── tfs │ ├── __init__.py │ └── models.py └── util.py ├── scripts ├── clone_everything.py ├── codegov_compute_hours.py ├── get_stargazers.py ├── get_traffic.py ├── get_users_emails.py ├── get_year_commits.py ├── github_stats.py ├── my_repo.py ├── org_to_emails.py └── stars.py ├── setup.cfg └── setup.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: llnl-scraper 2 | 3 | on: 4 | pull_request: [] 5 | 6 | jobs: 7 | testing: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: 13 | - ubuntu-latest 14 | python-version: 15 | - "3.8" 16 | - "3.9" 17 | - "3.10" 18 | - "3.11" 19 | - "3.12" 20 | - "3.13" 21 | name: Python ${{ matrix.python-version }} Tests 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | architecture: x64 29 | - name: Python Tests 30 | run: | 31 | conda create --quiet --name test pytest 32 | export PATH="/usr/share/miniconda/bin:$PATH" 33 | source activate test 34 | pip install bandit black isort flake8 35 | pip install . 36 | npm install -g markdownlint-cli@0.33.0 37 | make test 38 | scraper -h 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | code.csv 2 | code.json 3 | config.json 4 | build/ 5 | dist/ 6 | venv/ 7 | *.pyc 8 | llnl_scraper.egg-info/ 9 | .vscode/ 10 | -------------------------------------------------------------------------------- /.markdownlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "MD013": false, 3 | "MD014": false 4 | } 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, Lawrence Livermore National Security, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements/*.txt
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | test:
2 | 	bandit -r scraper/
3 | 	flake8 scraper/
4 | 	black --check .
5 | 	isort --check .
6 |
7 | 	markdownlint '**/*.md'
8 | 	pyflakes scraper
9 |
10 | release: test
11 | 	python3 setup.py sdist bdist_wheel
12 |
13 | upload:
14 | 	twine upload --skip-existing dist/*
15 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | This work was produced under the auspices of the U.S. Department of Energy by
2 | Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
3 |
4 | This work was prepared as an account of work sponsored by an agency of the
5 | United States Government. Neither the United States Government nor Lawrence
6 | Livermore National Security, LLC, nor any of their employees makes any warranty,
7 | expressed or implied, or assumes any legal liability or responsibility for the
8 | accuracy, completeness, or usefulness of any information, apparatus, product, or
9 | process disclosed, or represents that its use would not infringe privately owned
10 | rights. Reference herein to any specific commercial product, process, or service
11 | by trade name, trademark, manufacturer, or otherwise does not necessarily
12 | constitute or imply its endorsement, recommendation, or favoring by the United
13 | States Government or Lawrence Livermore National Security, LLC. The views and
14 | opinions of authors expressed herein do not necessarily state or reflect those
15 | of the United States Government or Lawrence Livermore National Security, LLC,
16 | and shall not be used for advertising or product endorsement purposes.
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scraper
2 |
3 | Scraper is a tool for scraping and visualizing open source data from various
4 | code hosting platforms, such as GitHub.com, GitHub Enterprise, GitLab.com,
5 | hosted GitLab, and Bitbucket Server.
6 |
7 | ## Getting Started: Code.gov
8 |
9 | [Code.gov](https://code.gov) is a website of the US Federal Government that
10 | gives the public access to metadata about the government's custom-developed
11 | software. The site requires metadata to function, and this Python library can
12 | help supply it!
13 |
14 | To get started, you will need a [GitHub Personal Access
15 | Token](https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/)
16 | to make requests to the GitHub API. This should be set in your environment or
17 | shell `rc` file with the name `GITHUB_API_TOKEN`:
18 |
19 | ```shell
20 | $ export GITHUB_API_TOKEN=XYZ
21 |
22 | $ echo "export GITHUB_API_TOKEN=XYZ" >> ~/.bashrc
23 | ```
24 |
25 | Additionally, to perform the labor-hours estimation, you will need to install
26 | `cloc` into your environment. This is typically done with a [Package
27 | Manager](https://github.com/AlDanial/cloc#install-via-package-manager) such as
28 | `npm` or `homebrew`.
29 |
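For example, either of the following typically works, assuming the corresponding
package manager is already installed (any install method from the cloc
documentation is equally fine):

```shell
# Pick the one matching your package manager
$ npm install -g cloc
$ brew install cloc
```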
30 | Then, to generate a `code.json` file for your agency, you will need a
31 | `config.json` file that specifies the platforms you will connect to and scrape
32 | data from. An example config file can be found in [demo.json](/demo.json). Once
33 | you have your config file, you are ready to install and run the scraper!
34 |
35 | ```shell
36 | # Install Scraper from a local copy of this repository
37 | $ pip install -e .
38 | # OR
39 | # Install Scraper from PyPI
40 | $ pip install llnl-scraper
41 |
42 | # Run Scraper with your config file `config.json`
43 | $ scraper --config config.json
44 | ```
45 |
46 | A full example of the resulting `code.json` file can be [found
47 | here](https://gist.github.com/IanLee1521/b7d7c0c2d8c24b10dd04edd5e8cab6c4).
48 |
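If you prefer to drive Scraper from Python rather than the CLI, the following
minimal sketch mirrors roughly what the command-line entry point does, using
`process_config()` and `force_attributes()` from `scraper.code_gov` (both
included in this repository); treat it as an illustration, not the canonical
interface:

```python
import json

from scraper.code_gov import force_attributes, process_config

# Load the same config.json that the CLI consumes
with open("config.json", encoding="utf-8") as fp:
    config = json.load(fp)

# Inventory all configured platforms, then apply the org / contact /
# permission defaults from the config
metadata = force_attributes(process_config(config), config)

# Metadata.to_json() serializes the Code.gov schema (sorted keys, indented)
with open("code.json", "w", encoding="utf-8") as fp:
    fp.write(metadata.to_json())
```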
49 | ## Config File Options
50 |
51 | The configuration file is a JSON file that specifies which repository platforms
52 | to pull projects from, as well as some settings that can be used to override
53 | incomplete or inaccurate data returned by the scraping.
54 |
55 | The basic structure is:
56 |
57 | ```jsonc
58 | {
59 |     // REQUIRED
60 |     "contact_email": "...", // Used when the contact email cannot be found otherwise
61 |
62 |     // OPTIONAL
63 |     "agency": "...", // Your agency abbreviation here
64 |     "organization": "...", // The organization within the agency
65 |     "permissions": { ... }, // Object containing default values for usageType and exemptionText
66 |
67 |     // Platform configurations, described in more detail below
68 |     "GitHub": [ ... ],
69 |     "GitLab": [ ... ],
70 |     "Bitbucket": [ ... ],
71 | }
72 | ```
73 |
74 | ```jsonc
75 | "GitHub": [
76 |     {
77 |         "url": "https://github.com", // GitHub.com or GitHub Enterprise URL to inventory
78 |         "token": null, // Private token for accessing this GitHub instance
79 |         "public_only": true, // Only inventory public repositories
80 |
81 |         "connect_timeout": 4, // The timeout in seconds for connecting to the server
82 |         "read_timeout": 10, // The timeout in seconds to wait for a response from the server
83 |
84 |         "orgs": [ ... ], // List of organizations to inventory
85 |         "repos": [ ... ], // List of single repositories to inventory
86 |         "exclude": [ ... ] // List of organizations / repositories to exclude from inventory
87 |     }
88 | ],
89 | ```
90 |
91 | ```jsonc
92 | "GitLab": [
93 |     {
94 |         "url": "https://gitlab.com", // GitLab.com or hosted GitLab instance URL to inventory
95 |         "token": null, // Private token for accessing this GitLab instance
96 |         "fetch_languages": false, // Make individual API calls for language metadata. Very slow, so defaults to false. (e.g., for 191 projects on an internal server: 5 seconds when false, 12 minutes 38 seconds when true)
97 |
98 |         "orgs": [ ... ], // List of organizations to inventory
99 |         "repos": [ ... ], // List of single repositories to inventory
100 |         "exclude": [ ... ] // List of groups / repositories to exclude from inventory
101 |     }
102 | ]
103 | ```
104 |
105 | ```jsonc
106 | "Bitbucket": [
107 |     {
108 |         "url": "https://bitbucket.internal", // Base URL for a Bitbucket Server instance
109 |         "username": "", // Username to authenticate with
110 |         "password": "", // Password to authenticate with
111 |         "token": "", // Token to authenticate with; if supplied, username and password are ignored
112 |
113 |         "exclude": [ ... ] // List of projects / repositories to exclude from inventory
114 |     }
115 | ]
116 | ```
117 |
118 | ```jsonc
119 | "TFS": [
120 |     {
121 |         "url": "https://tfs.internal", // Base URL for a Team Foundation Server (TFS) or Visual Studio Team Services (VSTS) instance
122 |         "token": null, // Private token for accessing this TFS instance
123 |
124 |         "exclude": [ ... ] // List of projects / repositories to exclude from inventory
125 |     }
126 | ]
127 | ```
128 |
129 | ```jsonc
130 | "AzureDevOps": [
131 |     {
132 |         "url": "https://dev.azure.com", // Base URL for an Azure DevOps Server or Azure DevOps cloud instance
133 |         "token": null, // Personal Access Token for accessing this ADO instance
134 |         "apiVersion": "", // API version (defaults to "6.1-preview" when unset)
135 |         "exclude": [ ... ] // List of projects to exclude from inventory
136 |     }
137 | ]
138 | ```
139 |
140 | ## License
141 |
142 | Scraper is released under an MIT license. For more details see the
143 | [LICENSE](/LICENSE) file.
144 |
145 | LLNL-CODE-705597
146 |
--------------------------------------------------------------------------------
/demo.json:
--------------------------------------------------------------------------------
1 | {
2 |     "agency": "DOE",
3 |     "organization": "Lawrence Livermore National Laboratory",
4 |     "contact_email": "open-source@llnl.gov",
5 |
6 |     "permissions": {
7 |         "usageType": "exemptByAgencyMission",
8 |         "exemptionText": "This source code resides on a private server and has not been properly evaluated for releasability."
9 | }, 10 | 11 | "GitHub": [ 12 | { 13 | "url": "https://github.com", 14 | "token": null, 15 | "public_only": true, 16 | 17 | "orgs": [ 18 | "LLNL", 19 | "spack", 20 | "xbraid" 21 | ], 22 | "repos": [ 23 | "dun/conman", 24 | "dun/munge" 25 | ], 26 | "exclude": [ 27 | "LLNL", 28 | "spack/spack.io" 29 | ] 30 | } 31 | ], 32 | 33 | "GitLab": [ 34 | { 35 | "url": "https://gitlab.com", 36 | "token": null, 37 | "fetch_languages": false, 38 | 39 | "repos": [ 40 | "IanLee1521/flake8", 41 | "gnachman/iterm2", 42 | "gitlab-org/gitlab-ce" 43 | ], 44 | "exclude": [ 45 | "IanLee1521", 46 | "gitlab-org/gitlab-ce" 47 | ] 48 | } 49 | ] 50 | } 51 | -------------------------------------------------------------------------------- /llnl_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "agency": "DOE", 3 | "organization": "Lawrence Livermore National Laboratory", 4 | "contact_email": "open-source@llnl.gov", 5 | "github_orgs": [ 6 | "chaos", 7 | "esgf", 8 | "flux-framework", 9 | "glvis", 10 | "llnl", 11 | "mfem", 12 | "pruners", 13 | "rose-compiler", 14 | "spack", 15 | "uv-cdat", 16 | "zfsonlinux" 17 | ], 18 | "github_repos": [ 19 | "ceed/laghos", 20 | "dun/conman", 21 | "dun/munge", 22 | "frankieli/icenine", 23 | "hpc/dcp", 24 | "hpc/mpifileutils", 25 | "hpc/openlorenz", 26 | "hpc/spindle" 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/dev.txt 2 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r production.txt 2 | 3 | # Development tools 4 | ipython 5 | twine 6 | 7 | # Testing tools 8 | bandit 9 | black 10 | flake8 11 | isort 12 | pyflakes 13 | safety 14 | -------------------------------------------------------------------------------- /requirements/production.txt: -------------------------------------------------------------------------------- 1 | github3.py>=2.0.0 2 | msrest>=0.6.4 3 | python-dateutil>=2.7.3 4 | python-gitlab>=1.6.0 5 | pytz>=2017.3 6 | requests>=2.16 7 | setuptools>=24.2.0 8 | stashy>=0.3 9 | vsts>=0.1.25 10 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/scraper/4b0efe9cae3d062b0e6b633333e42768d56f8b57/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/azuredevops/__init__.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import os 4 | import re 5 | from typing import List 6 | 7 | import requests 8 | 9 | from scraper.azuredevops.models import AzureDevOpsCollection, AzureDevOpsProject 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class AzureDevOpsClient: 15 | def __init__(self, baseurl, api_version, token=None): 16 | self.baseurl = baseurl 17 | self.api_version = api_version 18 | self.is_cloud_ado = "dev.azure.com" in baseurl 19 | self.session = self._create_client_session(token) 20 | 21 | def get_projects_metadata(self) -> List[AzureDevOpsProject]: 22 | """ 23 | Get metadata for all projects 24 | """ 25 | collections = self._get_all_collections() 26 | return self._get_all_projects(collections) 27 | 28 | 
def _create_client_session(self, token): 29 | """ 30 | Creates the Azure DevOps Client Context with the provided token. 31 | If no token is provided, it will look for the ADO_API_TOKEN environment variable. 32 | """ 33 | if token is None: 34 | token = os.environ.get("ADO_API_TOKEN", None) 35 | 36 | if token is None: 37 | raise RuntimeError("Azure Dev Ops Token was not provided.") 38 | 39 | session = requests.Session() 40 | auth_string = f":{token}" 41 | encoded_auth = base64.b64encode(auth_string.encode("ascii")).decode("ascii") 42 | session.headers.update( 43 | {"Authorization": f"Basic {encoded_auth}", "Accept": "application/json"} 44 | ) 45 | return session 46 | 47 | def _get_all_collections(self) -> List[AzureDevOpsCollection]: 48 | """ 49 | Get all collections from the Azure DevOps API. 50 | """ 51 | collections = [] 52 | 53 | if self.is_cloud_ado: 54 | # For cloud Azure DevOps, get all organizations from the API 55 | profile_url = f"https://app.vssps.visualstudio.com/_apis/profile/profiles/me?api-version={self.api_version}" 56 | profile_response = self.session.get(profile_url) 57 | 58 | if profile_response.status_code == 200: 59 | profile = profile_response.json() 60 | 61 | # Get user's organizations/accounts 62 | accounts_url = f"https://app.vssps.visualstudio.com/_apis/accounts?memberId={profile['id']}&api-version={self.api_version}" 63 | accounts_response = self.session.get(accounts_url) 64 | 65 | if accounts_response.status_code == 200: 66 | accounts_json = accounts_response.json() 67 | 68 | if accounts_json.get("value") and len(accounts_json["value"]) > 0: 69 | for org in accounts_json["value"]: 70 | collections.append( 71 | AzureDevOpsCollection( 72 | id=org["accountId"], 73 | name=org["accountName"], 74 | url=f"https://dev.azure.com/{org['accountName']}", 75 | ) 76 | ) 77 | logger.debug( 78 | f"Found cloud organization: {org['accountName']}" 79 | ) 80 | else: 81 | logger.warning("No organizations found with your access token.") 82 | 83 | # Fallback: Try to extract organization from baseAddress 84 | org_name = self.baseurl.rstrip("/").split("/")[-1] 85 | if org_name and org_name != "dev.azure.com": 86 | collections.append( 87 | AzureDevOpsCollection( 88 | id=org_name, 89 | name=org_name, 90 | url=f"https://dev.azure.com/{org_name}", 91 | ) 92 | ) 93 | logger.debug( 94 | f"Using organization from base address: {org_name}" 95 | ) 96 | else: 97 | raise RuntimeError( 98 | f"Failed to retrieve organizations. Status Code: {accounts_response.status_code} Response: {accounts_response.text}" 99 | ) 100 | else: 101 | logger.warning( 102 | f"Failed to retrieve user profile: {profile_response.status_code} Response: {profile_response.text}" 103 | ) 104 | logger.warning( 105 | "Falling back to base address for organization extraction." 106 | ) 107 | # Fallback: Try to extract organization from baseAddress 108 | org_name = self.baseurl.rstrip("/").split("/")[-1] 109 | if org_name and org_name != "dev.azure.com": 110 | collections.append( 111 | AzureDevOpsCollection( 112 | id=org_name, 113 | name=org_name, 114 | url=f"https://dev.azure.com/{org_name}", 115 | ) 116 | ) 117 | logger.debug(f"Using organization from base address: {org_name}") 118 | else: 119 | raise RuntimeError( 120 | "Could not determine organization. Please specify organization in the baseurl." 
121 | ) 122 | else: 123 | # For on-premises, get collections via API 124 | collections_url = f"{self.baseurl}/_apis/projectcollections?api-version={self.api_version}" 125 | collections_response = self.session.get(collections_url) 126 | 127 | if collections_response.status_code == 200: 128 | collections_json = collections_response.json() 129 | for collection in collections_json.get("value", []): 130 | collections.append( 131 | AzureDevOpsCollection( 132 | id=collection["id"], 133 | name=collection["name"], 134 | url=collection["url"], 135 | ) 136 | ) 137 | else: 138 | raise RuntimeError( 139 | f"Failed to retrieve collections. Status Code: {collections_response.status_code} Response: {collections_response.text}" 140 | ) 141 | 142 | logger.debug(f"Found {len(collections)} collections/organizations") 143 | return collections 144 | 145 | def _get_web_url_from_api_url(self, api_url, project_name): 146 | """ 147 | Convert an API URL to a web-accessible URL 148 | 149 | Parameters: 150 | api_url (str): API URL for the project 151 | project_name (str): Name of the project 152 | 153 | Returns: 154 | str: Web URL for the project 155 | """ 156 | if self.is_cloud_ado: 157 | # For cloud ADO, convert URL like: 158 | # https://dev.azure.com/org-name/_apis/projects/project-id 159 | # to: https://dev.azure.com/org-name/project-name 160 | match = re.search(r"https://dev\.azure\.com/([^/]+)", api_url) 161 | if match: 162 | org_name = match.group(1) 163 | return f"https://dev.azure.com/{org_name}/{project_name}" 164 | else: 165 | # For on-premises ADO, convert URL like: 166 | # https://server/collection/_apis/projects/project-id 167 | # to: https://server/collection/project-name 168 | base_url = api_url.split("/_apis/projects")[0] 169 | return f"{base_url}/{project_name}" 170 | 171 | def _get_repo_web_url(self, api_url, project_name): 172 | """ 173 | Generate web-accessible URL for repositories page 174 | 175 | Parameters: 176 | api_url (str): API URL for the project 177 | project_name (str): Name of the project 178 | 179 | Returns: 180 | str: Web URL for the project's repositories page 181 | """ 182 | project_web_url = self._get_web_url_from_api_url(api_url, project_name) 183 | return f"{project_web_url}/_git" 184 | 185 | def _get_all_projects( 186 | self, collections: List[AzureDevOpsCollection] = None 187 | ) -> List[AzureDevOpsProject]: 188 | """ 189 | Get all projects from the provided collections or from all collections if none are provided 190 | 191 | Parameters: 192 | collections (List[AzureDevOpsCollection]): List of collections to get projects from 193 | """ 194 | if collections is None: 195 | collections = self._get_all_collections() 196 | 197 | projects = [] 198 | for collection in collections: 199 | collection_url = ( 200 | f"https://dev.azure.com/{collection.name}" 201 | if self.is_cloud_ado 202 | else f"{self.baseurl}/{collection.name}" 203 | ) 204 | logger.debug("Getting projects from collection: %s", collection_url) 205 | 206 | top = 100 207 | project_skip = 0 208 | total_projects = 0 209 | has_more_projects = True 210 | 211 | while has_more_projects: 212 | url = f"{collection_url}/_apis/projects?$top={top}&$skip={project_skip}&api-version={self.api_version}&includeCapabilities=true" 213 | 214 | response = self.session.get(url) 215 | if response.status_code != 200: 216 | raise RuntimeError( 217 | f"Failed to get projects: {response.status_code}" 218 | ) 219 | 220 | result = response.json() 221 | for project in result.get("value", []): 222 | project_api_url = project.get("url") 223 | 
project_name = project.get("name") 224 | 225 | project_web_url = self._get_web_url_from_api_url( 226 | project_api_url, project_name 227 | ) 228 | repo_web_url = self._get_repo_web_url(project_api_url, project_name) 229 | 230 | projects.append( 231 | AzureDevOpsProject( 232 | project_id=project.get("id"), 233 | project_name=project_name, 234 | project_description=project.get("description") or "", 235 | project_url=project_web_url, 236 | repo_url=repo_web_url, 237 | project_create_time="", # Not provided in API response 238 | project_last_update_time=project.get("lastUpdateTime"), 239 | collection_or_org_name=collection.name, 240 | ) 241 | ) 242 | 243 | count = len(result.get("value", [])) 244 | total_projects += count 245 | project_skip += top 246 | 247 | has_more_projects = count == top 248 | 249 | return projects 250 | -------------------------------------------------------------------------------- /scraper/azuredevops/models.py: -------------------------------------------------------------------------------- 1 | class AzureDevOpsCollection: 2 | def __init__(self, id="", name="", url=""): 3 | self.id = id 4 | self.name = name 5 | self.url = url 6 | 7 | 8 | class AzureDevOpsProject: 9 | def __init__( 10 | self, 11 | project_id="", 12 | project_name="", 13 | project_description="", 14 | project_url="", 15 | repo_url="", 16 | project_create_time="", 17 | project_last_update_time="", 18 | collection_or_org_name="", 19 | ): 20 | self.project_id = project_id 21 | self.project_name = project_name 22 | self.project_description = project_description 23 | self.project_url = project_url 24 | self.repo_url = repo_url 25 | self.project_create_time = project_create_time 26 | self.project_last_update_time = project_last_update_time 27 | self.collection_or_org_name = collection_or_org_name 28 | -------------------------------------------------------------------------------- /scraper/bitbucket/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import stashy 5 | from stashy.client import Stash 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def connect(url, username=None, password=None, token=None): 11 | """ 12 | Return a connected Bitbucket session 13 | """ 14 | if token is not None: 15 | bb_session = Stash(url, token=token) 16 | logger.info("Connected to: %s with token", url) 17 | else: 18 | bb_session = stashy.connect(url, username, password) 19 | logger.info("Connected to: %s as username %s", url, username) 20 | 21 | return bb_session 22 | 23 | 24 | def all_repos(bb_session): 25 | """ 26 | Yields Stashy repo dictionary objects for all repos in Bitbucket 27 | """ 28 | 29 | for repo in bb_session.repos.all(): 30 | all_commits = sorted( 31 | bb_session.projects[repo["project"]["key"]] 32 | .repos[repo["name"]] 33 | .commits(None), 34 | key=lambda x: x["authorTimestamp"], 35 | ) 36 | if all_commits: 37 | repo["created"] = ( 38 | datetime.datetime.fromtimestamp( 39 | all_commits[0]["authorTimestamp"] / 1000 40 | ) 41 | .date() 42 | .isoformat() 43 | ) 44 | repo["lastModified"] = ( 45 | datetime.datetime.fromtimestamp( 46 | all_commits[-1]["authorTimestamp"] / 1000 47 | ) 48 | .date() 49 | .isoformat() 50 | ) 51 | yield repo 52 | -------------------------------------------------------------------------------- /scraper/code_gov/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | 6 | from scraper import bitbucket, doecode, github, gitlab, tfs 7 | from scraper.azuredevops import AzureDevOpsClient 8 | from scraper.code_gov.models import Metadata, Project 9 | from scraper.github import gov_orgs 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def process_config(config): 15 | """ 16 | Master function to process a Scraper config file 17 | 18 | Returns a Code.gov Metadata file 19 | """ 20 | 21 | agency = config.get("agency", "UNKNOWN") 22 | logger.debug("Agency: %s", agency) 23 | 24 | method = config.get("method", "other") 25 | logger.debug("Inventory Method: %s", method) 26 | 27 | compute_labor_hours = config.get("compute_labor_hours", True) 28 | 29 | if config.get("contact_email", None) is None: 30 | # A default contact email is required to handle the (frequent) case 31 | # where a project / repository has no available contact email. 32 | logger.warning('Config file should contain a "contact_email"') 33 | 34 | logger.debug("Creating inventory from config: %s", config) 35 | code_gov_metadata = Metadata(agency, method) 36 | 37 | # Parse config for GitHub repositories 38 | github_instances = config.get("GitHub", []) 39 | if config.get("github_gov_orgs", False): 40 | github_instances.append({"url": "https://github.com", "orgs": gov_orgs()}) 41 | for instance in github_instances: 42 | timeouts = {} 43 | url = instance.get("url", "https://github.com") 44 | orgs = instance.get("orgs", []) 45 | repos = instance.get("repos", []) 46 | public_only = instance.get("public_only", True) 47 | excluded = instance.get("exclude", []) 48 | token = instance.get("token", None) 49 | connect_timeout = instance.get("connect_timeout", None) 50 | read_timeout = instance.get("read_timeout", None) 51 | 52 | if connect_timeout is not None: 53 | timeouts["default_connect_timeout"] = connect_timeout 54 | if read_timeout is not None: 55 | timeouts["default_read_timeout"] = read_timeout 56 | 57 | gh_session = github.connect(url, token, timeouts) 58 | 59 | for repo in github.query_repos(gh_session, orgs, repos, public_only): 60 | if repo.owner.login in excluded or repo.full_name in excluded: 61 | logger.info("Excluding: %s", repo.full_name) 62 | continue 63 | 64 | code_gov_project = Project.from_github3( 65 | repo, labor_hours=compute_labor_hours 66 | ) 67 | code_gov_metadata["releases"].append(code_gov_project) 68 | 69 | # Parse config for GitLab repositories 70 | gitlab_instances = config.get("GitLab", []) 71 | for instance in gitlab_instances: 72 | url = instance.get("url") 73 | # orgs = instance.get('orgs', []) 74 | repos = instance.get("repos", []) 75 | # public_only = instance.get('public_only', True) 76 | excluded = instance.get("exclude", []) 77 | token = instance.get("token", None) 78 | fetch_languages = instance.get("fetch_languages", False) 79 | 80 | gl_session = gitlab.connect(url, token) 81 | 82 | for repo in gitlab.query_repos(gl_session, repos): 83 | namespace = repo.namespace["path"] 84 | path_with_namespace = repo.path_with_namespace 85 | if namespace in excluded or path_with_namespace in excluded: 86 | logger.info("Excluding: %s", repo.path_with_namespace) 87 | continue 88 | 89 | code_gov_project = Project.from_gitlab( 90 | repo, labor_hours=compute_labor_hours, fetch_languages=fetch_languages 91 | ) 92 | code_gov_metadata["releases"].append(code_gov_project) 93 | 94 | # Parse config for Bitbucket repositories 95 | bitbucket_instances = config.get("Bitbucket", []) 96 | for instance in 
bitbucket_instances: 97 | url = instance.get("url") 98 | # orgs = instance.get('orgs', None) 99 | # public_only = instance.get('public_only', True) 100 | username = instance.get("username", None) 101 | password = instance.get("password", None) 102 | token = instance.get("token", None) 103 | excluded = instance.get("exclude", []) 104 | 105 | bb_session = bitbucket.connect(url, username, password, token) 106 | 107 | for repo in bitbucket.all_repos(bb_session): 108 | project = repo["project"]["key"] 109 | project_repo = "%s/%s" % (project, repo["slug"]) 110 | if project in excluded or project_repo in excluded: 111 | logger.info("Excluding: %s", project_repo) 112 | continue 113 | 114 | code_gov_project = Project.from_stashy( 115 | repo, labor_hours=compute_labor_hours 116 | ) 117 | code_gov_metadata["releases"].append(code_gov_project) 118 | 119 | # Parse config for TFS repositories 120 | tfs_instances = config.get("TFS", []) 121 | for instance in tfs_instances: 122 | url = instance.get("url") 123 | token = instance.get("token", None) 124 | 125 | projects = tfs.get_projects_metadata(url, token) 126 | for project in projects: 127 | code_gov_project = Project.from_tfs( 128 | project, labor_hours=compute_labor_hours 129 | ) 130 | code_gov_metadata["releases"].append(code_gov_project) 131 | 132 | # parse config for AzureDevOps repositories 133 | ado_instances = config.get("AzureDevOps", []) 134 | for instance in ado_instances: 135 | url = instance.get("url") 136 | token = instance.get("token", None) 137 | api_version = instance.get("apiVersion", "6.1-preview") 138 | excluded = instance.get("exclude", []) 139 | 140 | ado_client = AzureDevOpsClient(url, api_version, token) 141 | projects = ado_client.get_projects_metadata() 142 | for project in projects: 143 | if project.project_name in excluded: 144 | logger.info("Excluding: %s", project.project_name) 145 | continue 146 | 147 | code_gov_project = Project.from_ado( 148 | project, labor_hours=compute_labor_hours 149 | ) 150 | code_gov_metadata["releases"].append(code_gov_project) 151 | 152 | # Handle parsing of DOE CODE records 153 | 154 | doecode_config = config.get("DOE CODE", {}) 155 | doecode_json = doecode_config.get("json", None) 156 | doecode_url = doecode_config.get("url", None) 157 | doecode_key = doecode_config.get("api_key", None) 158 | 159 | for record in doecode.process(doecode_json, doecode_url, doecode_key): 160 | code_gov_project = Project.from_doecode(record) 161 | code_gov_metadata["releases"].append(code_gov_project) 162 | 163 | return code_gov_metadata 164 | 165 | 166 | def force_attributes(metadata, config): 167 | """ 168 | Forces certain fields in the Code.gov Metadata json 169 | """ 170 | 171 | organization = config.get("organization", "") 172 | logger.debug("Organization: %s", organization) 173 | 174 | contact_email = config.get("contact_email") 175 | logger.debug("Contact Email: %s", contact_email) 176 | 177 | permissions = config.get("permissions", {}) 178 | default_usage = permissions.get("usageType", "") 179 | default_exemption_text = permissions.get("exemptionText", "") 180 | logger.debug("Default usageType: %s", default_usage) 181 | logger.debug("Default exemptionText: %s", default_exemption_text) 182 | 183 | # Force certain fields 184 | if organization: 185 | logger.debug("Forcing Organization to: %s", organization) 186 | 187 | if contact_email: 188 | logger.debug("Forcing Contact Email to: %s", contact_email) 189 | 190 | for release in metadata["releases"]: 191 | if organization: 192 | release["organization"] = 
organization 193 | 194 | if contact_email: 195 | release["contact"]["email"] = contact_email 196 | 197 | if "licenses" not in release["permissions"]: 198 | release["permissions"]["licenses"] = None 199 | 200 | if "description" not in release: 201 | release["description"] = "No description available..." 202 | 203 | if "usageType" not in release["permissions"]: 204 | release["permissions"]["usageType"] = default_usage 205 | release["permissions"]["exemptionText"] = default_exemption_text 206 | 207 | return metadata 208 | -------------------------------------------------------------------------------- /scraper/code_gov/models.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import json 5 | import logging 6 | 7 | from dateutil.parser import parse as date_parse 8 | import github3 9 | import gitlab 10 | from requests.utils import requote_uri 11 | 12 | from scraper.azuredevops.models import AzureDevOpsProject 13 | from scraper.github.util import _license_obj 14 | from scraper.util import _prune_dict_null_str, labor_hours_from_url 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | POLICY_START_DATE = date_parse("2016-08-08T00:00:00Z") 19 | 20 | 21 | class Metadata(dict): 22 | """ 23 | Defines the entire contents of a Code.gov 's code.json file 24 | 25 | For details: https://code.gov/#/policy-guide/docs/compliance/inventory-code 26 | """ 27 | 28 | def __init__(self, agency, method, other_method=""): 29 | # *version: [string] The Code.gov metadata schema version 30 | self["version"] = "2.0.0" 31 | 32 | # *agency: [string] The agency acronym for Clinger Cohen Act agency, e.g. "GSA" or "DOD" 33 | self["agency"] = agency.upper() 34 | 35 | # *measurementType: [object] The description of the open source measurement method 36 | # *method [enum]: An enumerated list of methods for measuring the open source requirement 37 | # cost: Cost of software development. 38 | # systems: System certification and accreditation boundaries. 39 | # projects: A complete software solution / project. 40 | # modules: A self-contained module from a software solution. 41 | # linesOfCode: Source lines of code. 42 | # other: Another measurement method not referenced above. 43 | # ifOther: [string] A one- or two- sentence description of the measurement type used, if 'other' is selected as the value of 'method' field. 44 | self["measurementType"] = {"method": method} 45 | if method == "other": 46 | self["measurementType"]["ifOther"] = other_method 47 | 48 | # The list of source code releases 49 | self["releases"] = [] 50 | 51 | def to_json(self): 52 | return json.dumps(self, indent=4, sort_keys=True, ensure_ascii=False) 53 | 54 | 55 | class Project(dict): 56 | """ 57 | Python representation of Code.gov Metadata Schema 58 | 59 | For details: https://code.gov/#/policy-guide/docs/compliance/inventory-code 60 | """ 61 | 62 | def __init__(self): 63 | # -- REQUIRED FIELDS -- 64 | 65 | # *name: [string] The name of the release 66 | self["name"] = "" 67 | 68 | # repository: [string] The URL of the public project repository 69 | self["repositoryURL"] = "" 70 | 71 | # *description: [string] A description of the project 72 | self["description"] = "" 73 | 74 | # *permissions: [object] A description of the usage/restrictions regarding the release 75 | # * licenses: [null or array of objects] An object containing license details, if available. If not, null should be used. 
76 | # URL: [string] The URL of the release license, if available 77 | # name: [string] An abbreviation for the name of the license 78 | # * usageType: [enum] 79 | # openSource: Open source 80 | # governmentWideReuse: Government-wide reuse. 81 | # exemptByLaw: The sharing of the source code is restricted by law or regulation, including—but not limited to—patent or intellectual property law, the Export Asset Regulations, the International Traffic in Arms Regulation, and the Federal laws and regulations governing classified information. 82 | # exemptByNationalSecurity: The sharing of the source code would create an identifiable risk to the detriment of national security, confidentiality of Government information, or individual privacy. 83 | # exemptByAgencySystem: The sharing of the source code would create an identifiable risk to the stability, security, or integrity of the agency’s systems or personnel. 84 | # exemptByAgencyMission: The sharing of the source code would create an identifiable risk to agency mission, programs, or operations. 85 | # exemptByCIO: The CIO believes it is in the national interest to exempt sharing the source code. 86 | # exemptByPolicyDate: The release was created prior to the M-16-21 policy (August 8, 2016). 87 | # exemptionText: [null or string] 88 | self["permissions"] = {"licenses": None, "usageType": "", "exemptionText": None} 89 | 90 | # *laborHours: [number]: An estimate of total labor hours spent by your organization/component across all versions of this release. This includes labor performed by federal employees and contractors. 91 | self["laborHours"] = 0 92 | 93 | # *tags: [array] An array of keywords that will be helpful in discovering and searching for the release. 94 | self["tags"] = [] 95 | 96 | # *contact: [object] Information about contacting the project. 97 | # *email: [string] An email address to contact the project. 98 | # name: [string] The name of a contact or department for the project 99 | # twitter: [string] The username of the project's Twitter account 100 | # phone: [string] The phone number to contact a project. 101 | self["contact"] = {"email": ""} 102 | # TODO: Currently, the GSA Harvester requires these fields to not be present if they are empty 103 | # 'name': '', 104 | # 'URL': '', 105 | # 'phone': '', 106 | # } 107 | 108 | # -- OPTIONAL FIELDS -- 109 | 110 | # version: [string] The version for this release. For example, "1.0.0." 111 | # self['version'] = '' 112 | 113 | # organization: [string] The organization or component within the agency that the releases listed belong to. For example, "18F" or "Navy." 114 | # self['organization'] = '' 115 | 116 | # status: [string] The development status of the project 117 | # "Ideation" - brainstorming phase. 118 | # "Development" - a release is still in development. 119 | # "Alpha" - initial prototyping phase and internal testing. 120 | # "Beta" - a project is being tested in public. 121 | # "Release Candidate" - a release is nearly ready for production. 122 | # "Production" - finished project, with development and maintenance ongoing. 123 | # "Archival" - finished project, but no longer actively maintained. 124 | # self['status'] = '' 125 | 126 | # vcs: [string] A lowercase string with the name of the Version Control System in use on the project. 127 | # self['vcs'] = '' 128 | 129 | # homepageURL: [string] The URL of the public release homepage. 130 | # self['homepageURL'] = '' 131 | 132 | # downloadURL: [string] The URL where a distribution of the release can be found. 
133 | # self['downloadURL'] = '' 134 | 135 | # disclaimerText: [string] Short paragraph that includes disclaimer language to accompany the release. 136 | # self['disclaimerText'] = '' 137 | 138 | # disclaimerURL: [string] The URL where disclaimer language regarding the release can be found. 139 | # self['disclaimerURL'] = '' 140 | 141 | # languages: [array] A list of strings with the names of the programming languages in use on the release. 142 | # self['languages'] = [] 143 | 144 | # partners: [array] An array of objects including an acronym for each agency partnering on the release and the contact email at such agency. 145 | # name: [string] The acronym describing the partner agency. 146 | # email: [string] The email address for the point of contact at the partner agency. 147 | # self['partners'] = [] 148 | 149 | # relatedCode: [array] An array of affiliated government repositories that may be a part of the same project. For example, relatedCode for 'code-gov-web' would include 'code-gov-api' and 'code-gov-tools'. 150 | # name: [string] The name of the code repository, project, library or release. 151 | # URL: [string] The URL where the code repository, project, library or release can be found. 152 | # isGovernmentRepo: [boolean] True or False. Is the code repository owned or managed by a federal agency? 153 | # self['relatedCode'] = [] 154 | 155 | # reusedCode: [array] An array of government source code, libraries, frameworks, APIs, platforms or other software used in this release. For example: US Web Design Standards, cloud.gov, Federalist, Digital Services Playbook, Analytics Reporter. 156 | # name: [string] The name of the software used in this release. 157 | # URL: [string] The URL where the software can be found. 158 | # self['reusedCode'] = [] 159 | 160 | # date: [object] A date object describing the release. 161 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 162 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 163 | # metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format. 
164 | # self['date'] = { 165 | # 'created': '', 166 | # 'lastModified': '', 167 | # 'metadataLastUpdated': '' 168 | # } 169 | 170 | @classmethod 171 | def from_github3(klass, repository, labor_hours=True): 172 | """ 173 | Create CodeGovProject object from github3 Repository object 174 | """ 175 | if not isinstance(repository, github3.repos.repo._Repository): 176 | raise TypeError("Repository must be a github3 Repository object") 177 | 178 | logger.info("Processing: %s", repository.full_name) 179 | 180 | project = klass() 181 | 182 | logger.debug("GitHub3: repository=%s", repository) 183 | 184 | # -- REQUIRED FIELDS -- 185 | 186 | project["name"] = repository.name 187 | project["repositoryURL"] = repository.clone_url 188 | project["description"] = repository.description 189 | 190 | try: 191 | repo_license = repository.license() 192 | except github3.exceptions.NotFoundError: 193 | logger.debug("no license found for repo=%s", repository) 194 | repo_license = None 195 | 196 | if repo_license: 197 | license_obj = repo_license.license 198 | if license_obj: 199 | logger.debug( 200 | "license spdx=%s; url=%s", license_obj.spdx_id, license_obj.url 201 | ) 202 | if license_obj.url is None: 203 | project["permissions"]["licenses"] = [{"name": license_obj.spdx_id}] 204 | else: 205 | project["permissions"]["licenses"] = [ 206 | {"URL": license_obj.url, "name": license_obj.spdx_id} 207 | ] 208 | else: 209 | project["permissions"]["licenses"] = None 210 | 211 | public_server = repository.html_url.startswith("https://github.com") 212 | if not repository.private and public_server: 213 | project["permissions"]["usageType"] = "openSource" 214 | elif date_parse(repository.created_at) < POLICY_START_DATE: 215 | project["permissions"]["usageType"] = "exemptByPolicyDate" 216 | 217 | if labor_hours: 218 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 219 | else: 220 | project["laborHours"] = 0 221 | 222 | project["tags"] = ["github"] 223 | old_accept = repository.session.headers["Accept"] 224 | repository.session.headers["Accept"] = ( 225 | "application/vnd.github.mercy-preview+json" 226 | ) 227 | topics = repository._get(repository.url + "/topics").json() 228 | project["tags"].extend(topics.get("names", [])) 229 | repository.session.headers["Accept"] = old_accept 230 | 231 | # Hacky way to get an Organization object back with GitHub3.py >= 1.2.0 232 | owner_url = repository.owner.url 233 | owner_api_response = repository._get(owner_url) 234 | organization = repository._json(owner_api_response, 200) 235 | project["contact"]["email"] = organization["email"] 236 | project["contact"]["URL"] = organization["html_url"] 237 | 238 | # -- OPTIONAL FIELDS -- 239 | 240 | # project['version'] = '' 241 | 242 | project["organization"] = organization["name"] 243 | 244 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 245 | project["status"] = "Development" 246 | 247 | project["vcs"] = "git" 248 | 249 | project["homepageURL"] = repository.html_url 250 | 251 | project["downloadURL"] = repository.downloads_url 252 | 253 | project["languages"] = [lang for lang, _ in repository.languages()] 254 | 255 | # project['partners'] = [] 256 | 257 | # project['relatedCode'] = [] 258 | 259 | # project['reusedCode'] = [] 260 | 261 | # date: [object] A date object describing the release. 262 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 
263 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 264 | # metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format. 265 | try: 266 | created_at = repository.created_at.date() 267 | except AttributeError: 268 | created_at = date_parse(repository.created_at).date() 269 | try: 270 | updated_at = repository.updated_at.date() 271 | except AttributeError: 272 | updated_at = date_parse(repository.updated_at).date() 273 | 274 | project["date"] = { 275 | "created": created_at.isoformat(), 276 | "lastModified": updated_at.isoformat(), 277 | "metadataLastUpdated": "", 278 | } 279 | 280 | _prune_dict_null_str(project) 281 | 282 | return project 283 | 284 | @classmethod 285 | def from_gitlab(klass, repository, labor_hours=True, fetch_languages=False): 286 | """ 287 | Create CodeGovProject object from GitLab Repository 288 | """ 289 | if not isinstance(repository, gitlab.v4.objects.Project): 290 | raise TypeError("Repository must be a gitlab Repository object") 291 | 292 | project = klass() 293 | 294 | logger.debug( 295 | "GitLab: repository_id=%d path_with_namespace=%s", 296 | repository.id, 297 | repository.path_with_namespace, 298 | ) 299 | 300 | # -- REQUIRED FIELDS -- 301 | 302 | project["name"] = repository.name 303 | project["repositoryURL"] = repository.http_url_to_repo 304 | project["description"] = repository.description 305 | 306 | # TODO: Update licenses from GitLab API 307 | project["permissions"]["licenses"] = None 308 | 309 | web_url = repository.web_url 310 | public_server = web_url.startswith("https://gitlab.com") 311 | 312 | if repository.visibility in ("public") and public_server: 313 | project["permissions"]["usageType"] = "openSource" 314 | elif date_parse(repository.created_at) < POLICY_START_DATE: 315 | project["permissions"]["usageType"] = "exemptByPolicyDate" 316 | 317 | if labor_hours: 318 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 319 | else: 320 | project["laborHours"] = 0 321 | 322 | project["tags"] = ["gitlab"] + repository.tag_list 323 | 324 | project["contact"] = {"email": "", "URL": web_url} 325 | 326 | # -- OPTIONAL FIELDS -- 327 | 328 | # project['version'] = '' 329 | 330 | project["organization"] = repository.namespace["name"] 331 | 332 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 333 | project["status"] = "Development" 334 | 335 | project["vcs"] = "git" 336 | 337 | project["homepageURL"] = repository.web_url 338 | 339 | api_url = repository.manager.gitlab._url 340 | archive_suffix = "/projects/%s/repository/archive" % repository.get_id() 341 | project["downloadURL"] = api_url + archive_suffix 342 | 343 | # project['languages'] = [lang for lang, _ in repository.languages()] 344 | 345 | if fetch_languages: 346 | project["languages"] = [*repository.languages()] 347 | 348 | # project['partners'] = [] 349 | # project['relatedCode'] = [] 350 | # project['reusedCode'] = [] 351 | 352 | project["date"] = { 353 | "created": date_parse(repository.created_at).date().isoformat(), 354 | "lastModified": date_parse(repository.last_activity_at).date().isoformat(), 355 | "metadataLastUpdated": "", 356 | } 357 | 358 | _prune_dict_null_str(project) 359 | 360 | return project 361 | 362 | @classmethod 363 | def from_stashy(klass, repository, labor_hours=True): 364 | """ 365 | Handles crafting Code.gov Project for Bitbucket Server repositories 366 | """ 367 | # if not isinstance(repository, 
stashy.repos.Repository): 368 | # raise TypeError('Repository must be a stashy Repository object') 369 | if not isinstance(repository, dict): 370 | raise TypeError("Repository must be a dict") 371 | 372 | project = klass() 373 | 374 | logger.debug( 375 | "Stashy: project_key=%s repository_slug=%s", 376 | repository["name"], 377 | repository["project"]["key"], 378 | ) 379 | 380 | # -- REQUIRED FIELDS -- 381 | 382 | project["name"] = repository["name"] 383 | 384 | clone_urls = [clone["href"] for clone in repository["links"]["clone"]] 385 | for url in clone_urls: 386 | # Only rely on SSH Urls for repository urls 387 | if url.startswith("ssh://"): 388 | project["repositoryURL"] = url 389 | break 390 | 391 | description = repository["project"].get("description", "") 392 | if description: 393 | project["description"] = "Project description: %s" % description 394 | 395 | project["permissions"]["licenses"] = None 396 | 397 | web_url = repository["links"]["self"][0]["href"] 398 | public_server = web_url.startswith("https://bitbucket.org") 399 | if repository["public"] and public_server: 400 | project["permissions"]["usageType"] = "openSource" 401 | 402 | if labor_hours: 403 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 404 | else: 405 | project["laborHours"] = 0 406 | 407 | project["tags"] = ["bitbucket"] 408 | 409 | project["contact"]["email"] = "" 410 | project["contact"]["URL"] = repository["links"]["self"][0]["href"] 411 | 412 | # -- OPTIONAL FIELDS -- 413 | 414 | # project['version'] = '' 415 | 416 | # project['organization'] = organization.name 417 | 418 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 419 | project["status"] = "Development" 420 | 421 | project["vcs"] = repository["scmId"] 422 | 423 | project["homepageURL"] = repository["links"]["self"][0]["href"] 424 | 425 | # project['downloadURL'] = 426 | 427 | # project['languages'] = 428 | 429 | # project['partners'] = [] 430 | 431 | # project['relatedCode'] = [] 432 | 433 | # project['reusedCode'] = [] 434 | 435 | # date: [object] A date object describing the release. Empty if repo has no commits. 436 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 437 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 
438 | if repository.get("created", None): 439 | project["date"] = { 440 | "created": repository["created"], 441 | "lastModified": repository["lastModified"], 442 | } 443 | 444 | _prune_dict_null_str(project) 445 | 446 | return project 447 | 448 | @classmethod 449 | def from_doecode(klass, record): 450 | """ 451 | Create CodeGovProject object from DOE CODE record 452 | 453 | Handles crafting Code.gov Project 454 | """ 455 | if not isinstance(record, dict): 456 | raise TypeError("`record` must be a dict") 457 | 458 | project = klass() 459 | 460 | # -- REQUIRED FIELDS -- 461 | 462 | project["name"] = record["software_title"] 463 | logger.debug('DOE CODE: software_title="%s"', record["software_title"]) 464 | 465 | link = record.get("repository_link", "") 466 | if not link: 467 | link = record.get("landing_page") 468 | logger.warning("DOE CODE: No repositoryURL, using landing_page: %s", link) 469 | 470 | project["repositoryURL"] = link 471 | 472 | project["description"] = record["description"] 473 | 474 | licenses = set(record["licenses"]) 475 | licenses.discard(None) 476 | logger.debug("DOE CODE: licenses=%s", licenses) 477 | 478 | license_objects = [] 479 | if "Other" in licenses: 480 | licenses.remove("Other") 481 | license_objects = [{"name": "Other", "URL": record["proprietary_url"]}] 482 | 483 | if licenses: 484 | license_objects.extend( 485 | [_license_obj(license_name) for license_name in licenses] 486 | ) 487 | 488 | project["permissions"]["licenses"] = license_objects 489 | 490 | if record["open_source"]: 491 | usage_type = "openSource" 492 | else: 493 | usage_type = "exemptByLaw" 494 | project["permissions"][ 495 | "exemptionText" 496 | ] = "This source code is restricted by patent and / or intellectual property law." 497 | 498 | project["permissions"]["usageType"] = usage_type 499 | 500 | labor_hours = record.get("labor_hours") 501 | if labor_hours is not None: 502 | project["laborHours"] = labor_hours 503 | else: 504 | project["laborHours"] = 0 505 | 506 | project["tags"] = ["DOE CODE"] 507 | lab_name = record.get("lab_display_name") 508 | if lab_name is not None: 509 | project["tags"].append(lab_name) 510 | 511 | project["contact"]["email"] = record["owner"] 512 | # project['contact']['URL'] = '' 513 | # project['contact']['name'] = '' 514 | # project['contact']['phone'] = '' 515 | 516 | # -- OPTIONAL FIELDS -- 517 | 518 | if "version_number" in record and record["version_number"]: 519 | project["version"] = record["version_number"] 520 | 521 | if lab_name is not None: 522 | project["organization"] = lab_name 523 | 524 | # Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 525 | status = record.get("ever_announced") 526 | if status is None: 527 | raise ValueError('DOE CODE: Unable to determine "ever_announced" value!') 528 | 529 | project["status"] = "Production" if status else "Development" 530 | 531 | vcs = None 532 | link = project["repositoryURL"] 533 | if "github.com" in link: 534 | vcs = "git" 535 | if vcs is None: 536 | logger.debug( 537 | 'DOE CODE: Unable to determine vcs for: name="%s", repositoryURL=%s', 538 | project["name"], 539 | link, 540 | ) 541 | vcs = "" 542 | if vcs: 543 | project["vcs"] = vcs 544 | 545 | url = record.get("landing_page", "") 546 | if url: 547 | project["homepageURL"] = url 548 | 549 | # record['downloadURL'] = '' 550 | 551 | # self['disclaimerText'] = '' 552 | 553 | # self['disclaimerURL'] = '' 554 | 555 | if "programming_languages" in record: 556 | project["languages"] = record["programming_languages"] 557 
|
558 |         # self['partners'] = []
559 |         # TODO: Look into using record['contributing_organizations']
560 |
561 |         # self['relatedCode'] = []
562 |
563 |         # self['reusedCode'] = []
564 |
565 |         # date: [object] A date object describing the release.
566 |         #   created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format.
567 |         #   lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format.
568 |         #   metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format.
569 |         if "date_record_added" in record and "date_record_updated" in record:
570 |             project["date"] = {
571 |                 "created": record["date_record_added"],
572 |                 # 'lastModified': '',
573 |                 "metadataLastUpdated": record["date_record_updated"],
574 |             }
575 |
576 |         return project
577 |
578 |     @classmethod
579 |     def from_tfs(klass, tfs_project, labor_hours=True):
580 |         """
581 |         Creates CodeGovProject object from a TFS/VSTS/AzureDevOps instance
582 |         """
583 |         project = klass()
584 |         project_web_url = ""
585 |
586 |         # -- REQUIRED FIELDS --
587 |         project["name"] = tfs_project.projectInfo.name
588 |
589 |         if "web" in tfs_project.projectInfo._links.additional_properties:
590 |             if "href" in tfs_project.projectInfo._links.additional_properties["web"]:
591 |                 # URL-encode any spaces in the project name for the project web URL
592 |                 project_web_url = requote_uri(
593 |                     tfs_project.projectInfo._links.additional_properties["web"]["href"]
594 |                 )
595 |
596 |         project["repositoryURL"] = project_web_url
597 |
598 |         project["homepageURL"] = project_web_url
599 |
600 |         project["description"] = tfs_project.projectInfo.description
601 |
602 |         project["vcs"] = "TFS"
603 |
604 |         project["permissions"]["licenses"] = None
605 |
606 |         project["tags"] = []
607 |
608 |         if labor_hours:
609 |             logger.debug("Labor hour calculation is not currently supported for TFS; laborHours remains 0.")
610 |             # project['laborHours'] = labor_hours_from_url(project['repositoryURL'])
611 |         else:
612 |             project["laborHours"] = 0
613 |
614 |         if tfs_project.projectCreateInfo.last_update_time < POLICY_START_DATE:
615 |             project["permissions"]["usageType"] = "exemptByPolicyDate"
616 |         else:
617 |             project["permissions"]["usageType"] = "exemptByAgencyMission"
618 |             project["permissions"][
619 |                 "exemptionText"
620 |             ] = "This source code resides on a private server and has not been properly evaluated for releasability."
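        # NOTE (editorial): TFS does not expose a distinct creation timestamp
        # here, so projectCreateInfo.last_update_time serves as a proxy for the
        # "created" date below (and for the POLICY_START_DATE check above).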
621 | 622 | project["contact"] = {"email": "", "URL": project_web_url} 623 | 624 | project["date"] = { 625 | "lastModified": tfs_project.projectLastUpdateInfo.last_update_time.date().isoformat(), 626 | "created": tfs_project.projectCreateInfo.last_update_time.date().isoformat(), 627 | "metadataLastUpdated": "", 628 | } 629 | 630 | _prune_dict_null_str(project) 631 | 632 | return project 633 | 634 | @classmethod 635 | def from_ado(klass, ado_project: AzureDevOpsProject, labor_hours=True): 636 | """ 637 | Creates CodeGovProject object from AzureDevOps Instance 638 | """ 639 | project = klass() 640 | project_web_url = "" 641 | 642 | # -- REQUIRED FIELDS -- 643 | project["name"] = ado_project.project_name 644 | 645 | project["repositoryURL"] = requote_uri(ado_project.repo_url) 646 | 647 | project["homepageURL"] = requote_uri(ado_project.project_url) 648 | 649 | project["description"] = ado_project.project_description 650 | 651 | project["vcs"] = "AzureDevOps" 652 | 653 | project["permissions"]["license"] = None 654 | 655 | project["tags"] = [] 656 | 657 | if labor_hours: 658 | logger.debug("Sorry labor hour calculation not currently supported.") 659 | # project['laborHours'] = labor_hours_from_url(project['repositoryURL']) 660 | else: 661 | project["laborHours"] = 0 662 | 663 | last_update_time_as_date = date_parse(ado_project.project_last_update_time) 664 | if last_update_time_as_date < POLICY_START_DATE: 665 | project["permissions"]["usageType"] = "exemptByPolicyDate" 666 | else: 667 | project["permissions"]["usageType"] = "exemptByAgencyMission" 668 | project["permissions"][ 669 | "exemptionText" 670 | ] = "This source code resides on a private server and has not been properly evaluated for releaseability." 671 | 672 | project["contact"] = {"email": "", "URL": project_web_url} 673 | 674 | project["date"] = { 675 | "lastModified": last_update_time_as_date.isoformat(), 676 | "created": "", 677 | "metadataLastUpdated": "", 678 | } 679 | 680 | _prune_dict_null_str(project) 681 | 682 | return project 683 | -------------------------------------------------------------------------------- /scraper/doecode/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | import requests 5 | 6 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def process_json(filename): 12 | """ 13 | Converts a DOE CODE .json file into DOE CODE projects 14 | Yields DOE CODE records from a DOE CODE .json file 15 | """ 16 | 17 | logger.debug("Processing DOE CODE json: %s", filename) 18 | 19 | with open(filename, encoding="utf-8") as fd: 20 | doecode_json = json.load(fd) 21 | 22 | for record in doecode_json["records"]: 23 | yield record 24 | 25 | 26 | def process_url(url, key): 27 | """ 28 | Yields DOE CODE records from a DOE CODE .json URL response 29 | Converts a DOE CODE API .json URL response into DOE CODE projects 30 | """ 31 | 32 | logger.debug("Fetching DOE CODE JSON: %s", url) 33 | 34 | if key is None: 35 | raise ValueError("DOE CODE API Key value is missing!") 36 | 37 | response = requests.get( 38 | url, 39 | headers={"Authorization": "Basic " + key}, 40 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 41 | ) 42 | doecode_json = response.json() 43 | 44 | for record in doecode_json["records"]: 45 | yield record 46 | 47 | 48 | def process(filename=None, url=None, key=None): 49 | """ 50 | Yields DOE CODE records based on provided input sources 51 | 52 | param: 53 | filename (str): Path to a DOE 
CODE .json file 54 | url (str): URL for a DOE CODE server json file 55 | key (str): API Key for connecting to DOE CODE server 56 | """ 57 | 58 | if filename is not None: 59 | yield from process_json(filename) 60 | elif url and key: 61 | yield from process_url(url, key) 62 | -------------------------------------------------------------------------------- /scraper/gen_code_gov_json.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import argparse 5 | import json 6 | import logging 7 | import os 8 | 9 | from scraper import code_gov 10 | from scraper.util import configure_logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser( 17 | description="Scrape code repositories for Code.gov / DOE CODE" 18 | ) 19 | 20 | parser.add_argument( 21 | "--agency", type=str, nargs="?", default="", help='Agency Label, e.g. "DOE"' 22 | ) 23 | parser.add_argument( 24 | "--method", 25 | type=str, 26 | nargs="?", 27 | default="", 28 | help="Method of measuring open source", 29 | ) 30 | parser.add_argument( 31 | "--organization", 32 | type=str, 33 | nargs="?", 34 | default="", 35 | help="Force all repos to report a particular organization", 36 | ) 37 | parser.add_argument( 38 | "--contact-email", 39 | type=str, 40 | nargs="?", 41 | default="", 42 | help="Force all repos to report a particular contact email", 43 | ) 44 | 45 | parser.add_argument( 46 | "--config", type=str, nargs="?", default="", help="Configuration File (*.json)" 47 | ) 48 | 49 | parser.add_argument( 50 | "--github-gov-orgs", 51 | action="store_true", 52 | help="Use orgs from government.github.com/community", 53 | ) 54 | parser.add_argument( 55 | "--skip-labor-hours", 56 | action="store_true", 57 | help='Skip calculation of labor hours, assume "0"', 58 | ) 59 | 60 | parser.add_argument( 61 | "--doecode-json", 62 | type=str, 63 | nargs="?", 64 | default=None, 65 | help="Path to DOE CODE .json file", 66 | ) 67 | parser.add_argument( 68 | "--doecode-url", 69 | type=str, 70 | nargs="?", 71 | default=None, 72 | help="URL to DOE CODE .json data", 73 | ) 74 | parser.add_argument( 75 | "--doecode-api-key", 76 | type=str, 77 | nargs="?", 78 | default=None, 79 | help="DOE CODE API key for accessing --doecode-url", 80 | ) 81 | 82 | parser.add_argument( 83 | "--output-path", 84 | type=str, 85 | nargs="?", 86 | default="", 87 | help="Output path for .json file", 88 | ) 89 | parser.add_argument( 90 | "--output-filename", 91 | type=str, 92 | nargs="?", 93 | default="code.json", 94 | help="Output filename for .json file", 95 | ) 96 | 97 | parser.add_argument("--verbose", action="store_true", help="Enable verbose output") 98 | 99 | args = parser.parse_args() 100 | 101 | configure_logging(args.verbose) 102 | 103 | try: 104 | with open(args.config, encoding="utf-8") as fd: 105 | config_json = json.load(fd) 106 | except (FileNotFoundError, json.JSONDecodeError): 107 | if args.config: 108 | raise 109 | config_json = {} 110 | 111 | # Update config based on commandline arguments 112 | if args.agency: 113 | config_json["agency"] = args.agency 114 | if args.method: 115 | config_json["method"] = args.method 116 | if args.organization: 117 | config_json["organization"] = args.organization 118 | if args.contact_email: 119 | config_json["contact_email"] = args.contact_email 120 | if args.output_path: 121 | config_json["output_path"] = args.output_path 122 | if args.skip_labor_hours: 123 | 
config_json["compute_labor_hours"] = False 124 | if args.github_gov_orgs: 125 | config_json["github_gov_orgs"] = True 126 | 127 | config_json["DOE CODE"] = {} 128 | config_json["DOE CODE"]["json"] = args.doecode_json 129 | config_json["DOE CODE"]["url"] = args.doecode_url 130 | config_json["DOE CODE"]["api_key"] = args.doecode_api_key 131 | 132 | output_path = config_json.get("output_path", None) 133 | output_path = args.output_path or output_path 134 | logger.debug("Output Path: %s", output_path) 135 | 136 | if output_path is not None and not os.path.exists(output_path): 137 | raise RuntimeError( 138 | "Invalid output path argument provided! Make sure the output path exists and try again." 139 | ) 140 | 141 | code_json = code_gov.process_config(config_json) 142 | 143 | code_gov.force_attributes(code_json, config_json) 144 | 145 | logger.info("Number of Projects: %s", len(code_json["releases"])) 146 | 147 | output_filepath = args.output_filename 148 | 149 | if output_path is not None: 150 | output_filepath = os.path.join(output_path, output_filepath) 151 | 152 | with open(output_filepath, "w", encoding="utf-8") as fp: 153 | logger.info("Writing output to: %s", output_filepath) 154 | fp.write(code_json.to_json()) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /scraper/github/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | import os 6 | import time 7 | 8 | import github3 9 | import requests 10 | 11 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def gov_orgs(): 17 | """ 18 | Returns a list of the names of US Government GitHub organizations 19 | 20 | Based on: https://government.github.com/community/ 21 | 22 | Example return: 23 | {'llnl', '18f', 'gsa', 'dhs-ncats', 'spack', ...} 24 | """ 25 | us_gov_github_orgs = set() 26 | 27 | gov_orgs_json = requests.get( 28 | "https://government.github.com/organizations.json", 29 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 30 | ).json() 31 | 32 | us_gov_github_orgs.update(gov_orgs_json["governments"]["U.S. Federal"]) 33 | us_gov_github_orgs.update( 34 | gov_orgs_json["governments"]["U.S. Military and Intelligence"] 35 | ) 36 | us_gov_github_orgs.update(gov_orgs_json["research"]["U.S. Research Labs"]) 37 | 38 | return list(us_gov_github_orgs) 39 | 40 | 41 | def create_session(token=None, timeouts=None): 42 | """ 43 | Create a github3.py session connected to GitHub.com 44 | 45 | If token is not provided, will attempt to use the GITHUB_API_TOKEN 46 | environment variable if present. 47 | """ 48 | if token is None: 49 | token = os.environ.get("GITHUB_API_TOKEN", None) 50 | 51 | if timeouts is None: 52 | timeouts = {} 53 | 54 | custom_session = github3.session.GitHubSession(**timeouts) 55 | gh_session = github3.GitHub(token=token, session=custom_session) 56 | 57 | if gh_session is None: 58 | raise RuntimeError("Invalid or missing GITHUB_API_TOKEN") 59 | 60 | return gh_session 61 | 62 | 63 | def create_enterprise_session(url, token=None, timeouts=None): 64 | """ 65 | Create a github3.py session for a GitHub Enterprise instance 66 | 67 | If token is not provided, will attempt to use the GITHUB_API_TOKEN 68 | environment variable if present. 
69 | """ 70 | if timeouts is None: 71 | timeouts = {} 72 | 73 | custom_session = github3.session.GitHubSession(**timeouts) 74 | gh_session = github3.GitHubEnterprise(url=url, token=token, session=custom_session) 75 | 76 | if gh_session is None: 77 | msg = "Unable to connect to GitHub Enterprise (%s) with provided token." 78 | raise RuntimeError(msg, url) 79 | 80 | return gh_session 81 | 82 | 83 | def _num_requests_needed(num_repos, factor=2, wiggle_room=100): 84 | """ 85 | Helper function to estimate the minimum number of API requests needed 86 | """ 87 | return num_repos * factor + wiggle_room 88 | 89 | 90 | def _check_api_limits(gh_session, api_required=250): 91 | """ 92 | Simplified check for API limits 93 | 94 | If necessary, spin in place waiting for API to reset before returning. 95 | 96 | See: https://developer.github.com/v3/#rate-limiting 97 | """ 98 | api_rates = gh_session.rate_limit() 99 | 100 | api_remaining = api_rates["rate"]["remaining"] 101 | api_reset = api_rates["rate"]["reset"] 102 | logger.debug("Rate Limit - %d requests remaining", api_remaining) 103 | 104 | if api_remaining > api_required: 105 | return 106 | 107 | now_time = time.time() 108 | time_to_reset = int(api_reset - now_time) 109 | logger.warning("Rate Limit Depleted - Sleeping for %d seconds", time_to_reset) 110 | 111 | while now_time < api_reset: 112 | time.sleep(10) 113 | now_time = time.time() 114 | 115 | return 116 | 117 | 118 | def connect(url="https://github.com", token=None, timeouts=None): 119 | """ 120 | Create a GitHub session for making requests 121 | """ 122 | 123 | if timeouts is None: 124 | timeouts = {} 125 | 126 | gh_session = None 127 | if url == "https://github.com": 128 | gh_session = create_session(token, timeouts) 129 | else: 130 | gh_session = create_enterprise_session(url, token, timeouts) 131 | 132 | if gh_session is None: 133 | msg = "Unable to connect to (%s) with provided token." 134 | raise RuntimeError(msg, url) 135 | 136 | logger.info("Connected to: %s", url) 137 | 138 | return gh_session 139 | 140 | 141 | def query_repos(gh_session, orgs=None, repos=None, public_only=True): 142 | """ 143 | Yields GitHub3.py repo objects for provided orgs and repo names 144 | 145 | If orgs and repos are BOTH empty, execute special mode of getting ALL 146 | repositories from the GitHub Server. 147 | 148 | If public_only is True, will return only those repos that are marked as 149 | public. Set this to false to return all organizations that the session has 150 | permissions to access. 
151 | """ 152 | 153 | if orgs is None: 154 | orgs = [] 155 | if repos is None: 156 | repos = [] 157 | if public_only: 158 | privacy = "public" 159 | else: 160 | privacy = "all" 161 | 162 | _check_api_limits(gh_session, 10) 163 | 164 | for org_name in orgs: 165 | org = gh_session.organization(org_name) 166 | num_repos = org.public_repos_count 167 | 168 | _check_api_limits(gh_session, _num_requests_needed(num_repos)) 169 | 170 | for repo in org.repositories(type=privacy): 171 | _check_api_limits(gh_session, 10) 172 | yield repo 173 | 174 | for repo_name in repos: 175 | _check_api_limits(gh_session, 10) 176 | org, name = repo_name.split("/") 177 | yield gh_session.repository(org, name) 178 | 179 | if not (orgs or repos): 180 | for repo in gh_session.all_repositories(): 181 | yield repo 182 | -------------------------------------------------------------------------------- /scraper/github/queryManager.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for GitHub query and data management. 3 | 4 | With this module, you will be able to send GraphQL and REST queries 5 | to GitHub, as well as read and write JSON files to store data. 6 | """ 7 | 8 | from datetime import datetime 9 | import json 10 | import os 11 | import re 12 | import time 13 | 14 | import pytz 15 | import requests 16 | 17 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 18 | 19 | 20 | def _vPrint(verbose, *args, **kwargs): 21 | """Easy verbosity-control print method. 22 | 23 | Args: 24 | verbose (bool): Normal print if True, do nothing otherwise. 25 | *args: Argument list for the 'print' method. 26 | **kwargs: Keyword arguments for the 'print' method. 27 | 28 | """ 29 | if verbose: 30 | print(*args, **kwargs) 31 | 32 | 33 | class GitHubQueryManager: 34 | """GitHub query API manager.""" 35 | 36 | def __init__(self, apiToken=None, maxRetry=10, retryDelay=3): 37 | """Initialize the GitHubQueryManager object. 38 | 39 | Note: 40 | If no apiToken argument is provided, 41 | the environment variable 'GITHUB_API_TOKEN' must be set. 42 | 43 | Args: 44 | apiToken (Optional[str]): A string representing a GitHub API 45 | token. Defaults to None. 46 | maxRetry (Optional[int]): A limit on how many times to 47 | automatically retry requests. Defaults to 10. 48 | retryDelay (Optional[int]): Number of seconds to wait between 49 | automatic request retries. Defaults to 3. 50 | 51 | Raises: 52 | TypeError: If no GitHub API token is provided either via 53 | argument or environment variable 'GITHUB_API_TOKEN'. 54 | 55 | """ 56 | 57 | # Get GitHub API token 58 | if apiToken: 59 | self.__githubApiToken = apiToken 60 | else: 61 | try: 62 | self.__githubApiToken = os.environ["GITHUB_API_TOKEN"] 63 | except KeyError as error: 64 | raise TypeError( 65 | "Requires either a string argument or environment variable 'GITHUB_API_TOKEN'." 66 | ) from error 67 | 68 | # Check token validity 69 | print("Checking GitHub API token... 
", end="", flush=True) 70 | basicCheck = self._submitQuery("query { viewer { login } }") 71 | if basicCheck["statusNum"] == 401: 72 | print("FAILED.") 73 | raise ValueError( 74 | "GitHub API token is not valid.\n%s %s" 75 | % (basicCheck["statusTxt"], basicCheck["result"]) 76 | ) 77 | 78 | print("Token validated.") 79 | 80 | # Initialize private variables 81 | self.__query = None #: Cached query string 82 | self.__queryPath = None #: Path to query file 83 | self.__queryTimestamp = None #: When query file was last modified 84 | 85 | # Initialize public variables 86 | self.maxRetry = maxRetry 87 | self.retryDelay = retryDelay 88 | self.data = {} 89 | """Dict: Working data.""" 90 | 91 | @property 92 | def maxRetry(self): 93 | """int: A limit on how many times to automatically retry requests. 94 | 95 | Must be a whole integer greater than 0. 96 | """ 97 | return self.__maxRetry 98 | 99 | @maxRetry.setter 100 | def maxRetry(self, maxRetry): 101 | numIn = int(maxRetry) 102 | numIn = 1 if numIn <= 0 else numIn 103 | self.__maxRetry = numIn 104 | print("Auto-retry limit for requests set to %d." % (self.maxRetry)) 105 | 106 | @property 107 | def retryDelay(self): 108 | """int: Number of seconds to wait between automatic request retries. 109 | 110 | Must be a whole integer greater than 0. 111 | """ 112 | return self.__retryDelay 113 | 114 | @retryDelay.setter 115 | def retryDelay(self, retryDelay): 116 | numIn = int(retryDelay) 117 | numIn = 1 if numIn <= 0 else numIn 118 | self.__retryDelay = numIn 119 | print("Auto-retry delay set to %dsec." % (self.retryDelay)) 120 | 121 | def _readGQL(self, filePath, verbose=False): 122 | """Read a 'pretty' formatted GraphQL query file into a one-line string. 123 | 124 | Removes line breaks and comments. Condenses white space. 125 | 126 | Args: 127 | filePath (str): A relative or absolute path to a file containing 128 | a GraphQL query. 129 | File may use comments and multi-line formatting. 130 | .. _GitHub GraphQL Explorer: 131 | https://developer.github.com/v4/explorer/ 132 | verbose (Optional[bool]): If False, prints will be suppressed. 133 | Defaults to False. 134 | 135 | Returns: 136 | str: A single line GraphQL query. 137 | 138 | """ 139 | if not os.path.isfile(filePath): 140 | raise RuntimeError("Query file '%s' does not exist." % (filePath)) 141 | lastModified = os.path.getmtime(filePath) 142 | absPath = os.path.abspath(filePath) 143 | if absPath == self.__queryPath and lastModified == self.__queryTimestamp: 144 | _vPrint( 145 | verbose, 146 | "Using cached query '%s'" % (os.path.basename(self.__queryPath)), 147 | ) 148 | query_in = self.__query 149 | else: 150 | _vPrint(verbose, "Reading '%s' ... " % (filePath), end="", flush=True) 151 | with open(filePath, "r", encoding="utf-8") as q: 152 | # Strip comments. 153 | query_in = re.sub(r"#.*(\n|\Z)", "\n", q.read()) 154 | # Condense whitespace. 155 | query_in = re.sub(r"\s+", " ", query_in) 156 | # Remove leading and trailing whitespace. 157 | query_in = query_in.strip() 158 | _vPrint(verbose, "File read!") 159 | self.__queryPath = absPath 160 | self.__queryTimestamp = lastModified 161 | self.__query = query_in 162 | return query_in 163 | 164 | def queryGitHubFromFile(self, filePath, gitvars=None, verbosity=0, **kwargs): 165 | """Submit a GitHub GraphQL query from a file. 166 | 167 | Can only be used with GraphQL queries. 168 | For REST queries, see the 'queryGitHub' method. 169 | 170 | Args: 171 | filePath (str): A relative or absolute path to a file containing 172 | a GraphQL query. 
173 | File may use comments and multi-line formatting. 174 | .. _GitHub GraphQL Explorer: 175 | https://developer.github.com/v4/explorer/ 176 | gitvars (Optional[Dict]): All query variables. 177 | Defaults to None. 178 | GraphQL Only. 179 | verbosity (Optional[int]): Changes output verbosity levels. 180 | If < 0, all extra printouts are suppressed. 181 | If == 0, normal print statements are displayed. 182 | If > 0, additional status print statements are displayed. 183 | Defaults to 0. 184 | **kwargs: Keyword arguments for the 'queryGitHub' method. 185 | 186 | Returns: 187 | Dict: A JSON style dictionary. 188 | 189 | """ 190 | if not gitvars: 191 | gitvars = {} 192 | 193 | gitquery = self._readGQL(filePath, verbose=(verbosity >= 0)) 194 | return self.queryGitHub( 195 | gitquery, gitvars=gitvars, verbosity=verbosity, **kwargs 196 | ) 197 | 198 | def queryGitHub( 199 | self, 200 | gitquery, 201 | gitvars=None, 202 | verbosity=0, 203 | paginate=False, 204 | cursorVar=None, 205 | keysToList=None, 206 | rest=False, 207 | requestCount=0, 208 | pageNum=0, 209 | headers=None, 210 | ): 211 | """Submit a GitHub query. 212 | 213 | Args: 214 | gitquery (str): The query or endpoint itself. 215 | Examples: 216 | query: 'query { viewer { login } }' 217 | endpoint: '/user' 218 | gitvars (Optional[Dict]): All query variables. 219 | Defaults to None. 220 | GraphQL Only. 221 | verbosity (Optional[int]): Changes output verbosity levels. 222 | If < 0, all extra printouts are suppressed. 223 | If == 0, normal print statements are displayed. 224 | If > 0, additional status print statements are displayed. 225 | Defaults to 0. 226 | paginate (Optional[bool]): Pagination will be completed 227 | automatically if True. Defaults to False. 228 | cursorVar (Optional[str]): Key in 'gitvars' that represents the 229 | pagination cursor. Defaults to None. 230 | GraphQL Only. 231 | keysToList (Optional[List[str]]): Ordered list of keys needed to 232 | retrieve the list in the query results to be extended by 233 | pagination. Defaults to None. 234 | Example: 235 | ['data', 'viewer', 'repositories', 'nodes'] 236 | GraphQL Only. 237 | rest (Optional[bool]): If True, uses the REST API instead 238 | of GraphQL. Defaults to False. 239 | requestCount (Optional[int]): Counter for repeated requests. 240 | pageNum (Optional[int]): Counter for pagination. 241 | For user readable log messages only, does not affect data. 242 | headers (Optional[Dict]): Additional headers. 243 | Defaults to None. 244 | 245 | Returns: 246 | Dict: A JSON style dictionary. 247 | 248 | """ 249 | if not gitvars: 250 | gitvars = {} 251 | if not keysToList: 252 | keysToList = [] 253 | if not headers: 254 | headers = {} 255 | 256 | requestCount += 1 257 | pageNum = 0 if pageNum < 0 else pageNum # no negative page numbers 258 | pageNum += 1 259 | 260 | if paginate: 261 | _vPrint((verbosity >= 0), "Page %d" % (pageNum)) 262 | _vPrint( 263 | (verbosity >= 0), "Sending %s query..." 
% ("REST" if rest else "GraphQL") 264 | ) 265 | try: 266 | response = self._submitQuery( 267 | gitquery, 268 | gitvars=gitvars, 269 | verbose=(verbosity > 0), 270 | rest=rest, 271 | headers=headers, 272 | ) 273 | except requests.exceptions.ReadTimeout: # Handles intermittent response delays 274 | _vPrint((verbosity >= 0), "Read timed out.") 275 | _vPrint((verbosity >= 0), "Repeating query...") 276 | return self.queryGitHub( 277 | gitquery, 278 | gitvars=gitvars, 279 | verbosity=verbosity, 280 | paginate=paginate, 281 | cursorVar=cursorVar, 282 | keysToList=keysToList, 283 | rest=rest, 284 | requestCount=requestCount, 285 | pageNum=(pageNum - 1), # retry same page 286 | headers=headers, 287 | ) 288 | _vPrint((verbosity >= 0), "Checking response...") 289 | _vPrint((verbosity >= 0), "HTTP STATUS %s" % (response["statusTxt"])) 290 | statusNum = response["statusNum"] 291 | 292 | # Make sure the query limit didn't run out 293 | try: 294 | apiStatus = { 295 | "limit": int(response["headDict"]["X-RateLimit-Limit"]), 296 | "remaining": int(response["headDict"]["X-RateLimit-Remaining"]), 297 | "reset": int(response["headDict"]["X-RateLimit-Reset"]), 298 | } 299 | _vPrint((verbosity >= 0), "API Status %s" % (json.dumps(apiStatus))) 300 | if apiStatus["remaining"] <= 0: 301 | _vPrint((verbosity >= 0), "API rate limit exceeded.") 302 | self._awaitReset(apiStatus["reset"]) 303 | _vPrint((verbosity >= 0), "Repeating query...") 304 | return self.queryGitHub( 305 | gitquery, 306 | gitvars=gitvars, 307 | verbosity=verbosity, 308 | paginate=paginate, 309 | cursorVar=cursorVar, 310 | keysToList=keysToList, 311 | rest=rest, 312 | requestCount=(requestCount - 1), # not counted against retries 313 | pageNum=(pageNum - 1), # retry same page 314 | headers=headers, 315 | ) 316 | except KeyError: # Handles error responses without X-RateLimit data 317 | _vPrint((verbosity >= 0), "Failed to check API Status.") 318 | 319 | # Check for explicit API rate limit error responses 320 | if statusNum in (403, 429): 321 | _vPrint((verbosity >= 0), "API rate limit exceeded.") 322 | if requestCount >= self.maxRetry: 323 | raise RuntimeError( 324 | "Query attempted but failed %d times.\n%s\n%s" 325 | % ( 326 | self.maxRetry, 327 | response["statusTxt"], 328 | response["result"], 329 | ) 330 | ) 331 | 332 | try: # Use explicit wait time if available 333 | waitTime = int(response["headDict"]["Retry-After"]) 334 | self._countdown( 335 | waitTime, 336 | printString="Waiting %*d seconds...", 337 | verbose=(verbosity >= 0), 338 | ) 339 | except KeyError: # Handles missing Retry-After header 340 | self._countdown( 341 | # wait at least 1 min, longer on continued failure (recommended best practice) 342 | 60 * requestCount, 343 | printString="Waiting %*d seconds...", 344 | verbose=(verbosity >= 0), 345 | ) 346 | _vPrint((verbosity >= 0), "Repeating query...") 347 | return self.queryGitHub( 348 | gitquery, 349 | gitvars=gitvars, 350 | verbosity=verbosity, 351 | paginate=paginate, 352 | cursorVar=cursorVar, 353 | keysToList=keysToList, 354 | rest=rest, 355 | requestCount=requestCount, 356 | pageNum=(pageNum - 1), # retry same page 357 | headers=headers, 358 | ) 359 | # Check for accepted but not yet processed, usually due to un-cached data 360 | if statusNum == 202: 361 | if requestCount >= self.maxRetry: 362 | raise RuntimeError( 363 | "Query attempted but failed %d times.\n%s\n%s" 364 | % ( 365 | self.maxRetry, 366 | response["statusTxt"], 367 | response["result"], 368 | ) 369 | ) 370 | 371 | self._countdown( 372 | self.retryDelay, 373 | 
printString="Query accepted but not yet processed. Trying again in %*d seconds...", 374 | verbose=(verbosity >= 0), 375 | ) 376 | return self.queryGitHub( 377 | gitquery, 378 | gitvars=gitvars, 379 | verbosity=verbosity, 380 | paginate=paginate, 381 | cursorVar=cursorVar, 382 | keysToList=keysToList, 383 | rest=rest, 384 | requestCount=requestCount, 385 | pageNum=(pageNum - 1), # retry same page 386 | headers=headers, 387 | ) 388 | # Check for server error responses 389 | if statusNum in (502, 503): 390 | if requestCount >= self.maxRetry: 391 | raise RuntimeError( 392 | "Query attempted but failed %d times.\n%s\n%s" 393 | % ( 394 | self.maxRetry, 395 | response["statusTxt"], 396 | response["result"], 397 | ) 398 | ) 399 | 400 | self._countdown( 401 | self.retryDelay, 402 | printString="Server error. Trying again in %*d seconds...", 403 | verbose=(verbosity >= 0), 404 | ) 405 | return self.queryGitHub( 406 | gitquery, 407 | gitvars=gitvars, 408 | verbosity=verbosity, 409 | paginate=paginate, 410 | cursorVar=cursorVar, 411 | keysToList=keysToList, 412 | rest=rest, 413 | requestCount=requestCount, 414 | pageNum=(pageNum - 1), # retry same page 415 | headers=headers, 416 | ) 417 | # Check for other error responses 418 | if statusNum >= 400 or statusNum == 204: 419 | raise RuntimeError( 420 | "Request got an Error response.\n%s\n%s" 421 | % (response["statusTxt"], response["result"]) 422 | ) 423 | 424 | _vPrint((verbosity >= 0), "Data received!") 425 | outObj = json.loads(response["result"]) 426 | 427 | # Check for GraphQL API errors (e.g. repo not found) 428 | if not rest and "errors" in outObj: 429 | if requestCount >= self.maxRetry: 430 | raise RuntimeError( 431 | "Query attempted but failed %d times.\n%s\n%s" 432 | % ( 433 | self.maxRetry, 434 | response["statusTxt"], 435 | response["result"], 436 | ) 437 | ) 438 | 439 | if len(outObj["errors"]) == 1 and len(outObj["errors"][0]) == 1: 440 | # Poorly defined error type, usually intermittent, try again. 441 | _vPrint( 442 | (verbosity >= 0), 443 | "GraphQL API error.\n%s" % (json.dumps(outObj["errors"])), 444 | ) 445 | self._countdown( 446 | self.retryDelay, 447 | printString="Unknown API error. Trying again in %*d seconds...", 448 | verbose=(verbosity >= 0), 449 | ) 450 | return self.queryGitHub( 451 | gitquery, 452 | gitvars=gitvars, 453 | verbosity=verbosity, 454 | paginate=paginate, 455 | cursorVar=cursorVar, 456 | keysToList=keysToList, 457 | rest=rest, 458 | requestCount=requestCount, 459 | pageNum=(pageNum - 1), # retry same page 460 | headers=headers, 461 | ) 462 | 463 | raise RuntimeError( 464 | "GraphQL API error.\n%s" % (json.dumps(outObj["errors"])) 465 | ) 466 | 467 | # Pagination 468 | if paginate: 469 | if rest and response["linkDict"]: 470 | if "next" in response["linkDict"]: 471 | nextObj = self.queryGitHub( 472 | response["linkDict"]["next"], 473 | gitvars=gitvars, 474 | verbosity=verbosity, 475 | paginate=paginate, 476 | cursorVar=cursorVar, 477 | keysToList=keysToList, 478 | rest=rest, 479 | requestCount=0, 480 | pageNum=pageNum, 481 | headers=headers, 482 | ) 483 | outObj.extend(nextObj) 484 | elif not rest: 485 | if not cursorVar: 486 | raise ValueError( 487 | "Must specify argument 'cursorVar' to use GraphQL auto-pagination." 488 | ) 489 | if not len(keysToList) > 0: 490 | raise ValueError( 491 | "Must specify argument 'keysToList' as a non-empty list to use GraphQL auto-pagination." 
492 | ) 493 | aPage = outObj 494 | for key in keysToList[0:-1]: 495 | aPage = aPage[key] 496 | gitvars[cursorVar] = aPage["pageInfo"]["endCursor"] 497 | if aPage["pageInfo"]["hasNextPage"]: 498 | nextObj = self.queryGitHub( 499 | gitquery, 500 | gitvars=gitvars, 501 | verbosity=verbosity, 502 | paginate=paginate, 503 | cursorVar=cursorVar, 504 | keysToList=keysToList, 505 | rest=rest, 506 | requestCount=0, 507 | pageNum=pageNum, 508 | headers=headers, 509 | ) 510 | newPage = nextObj 511 | for key in keysToList[0:-1]: 512 | newPage = newPage[key] 513 | aPage[keysToList[-1]].extend(newPage[keysToList[-1]]) 514 | aPage.pop("pageInfo", None) 515 | 516 | return outObj 517 | 518 | def _submitQuery( 519 | self, gitquery, gitvars=None, verbose=False, rest=False, headers=None 520 | ): 521 | """Send a curl request to GitHub. 522 | 523 | Args: 524 | gitquery (str): The query or endpoint itself. 525 | Examples: 526 | query: 'query { viewer { login } }' 527 | endpoint: '/user' 528 | gitvars (Optional[Dict]): All query variables. 529 | Defaults to None. 530 | verbose (Optional[bool]): If False, stderr prints will be 531 | suppressed. Defaults to False. 532 | rest (Optional[bool]): If True, uses the REST API instead 533 | of GraphQL. Defaults to False. 534 | headers (Optional[Dict]): Additional headers. 535 | Defaults to None. 536 | 537 | Returns: 538 | { 539 | 'statusNum' (int): The HTTP status code. 540 | 'statusTxt' (str): The HTTP status message. 541 | 'headDict' (Dict[str]): The response headers. 542 | 'linkDict' (Dict[int]): Link based pagination data. 543 | 'result' (str): The body of the response. 544 | } 545 | 546 | """ 547 | if not gitvars: 548 | gitvars = {} 549 | if not headers: 550 | headers = {} 551 | 552 | authhead = {"Authorization": "bearer " + self.__githubApiToken} 553 | if not rest: 554 | gitqueryJSON = json.dumps( 555 | {"query": gitquery, "variables": json.dumps(gitvars)} 556 | ) 557 | fullResponse = requests.post( 558 | "https://api.github.com/graphql", 559 | data=gitqueryJSON, 560 | headers={**authhead, **headers}, 561 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 562 | ) 563 | else: 564 | fullResponse = requests.get( 565 | "https://api.github.com" + gitquery, 566 | headers={**authhead, **headers}, 567 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 568 | ) 569 | _vPrint( 570 | verbose, 571 | "\n%s\n%s" 572 | % (json.dumps(dict(fullResponse.headers), indent=2), fullResponse.text), 573 | ) 574 | result = fullResponse.text 575 | headDict = fullResponse.headers 576 | statusNum = int(fullResponse.status_code) 577 | statusTxt = "%d %s" % (statusNum, fullResponse.reason) 578 | 579 | # Parse any Link headers even further 580 | linkDict = None 581 | if "Link" in headDict: 582 | linkProperties = headDict["Link"].split(", ") 583 | propDict = {} 584 | for item in linkProperties: 585 | divided = re.split(r'; rel="|"', item) 586 | propDict[divided[2]] = divided[1] 587 | linkDict = propDict 588 | 589 | return { 590 | "statusNum": statusNum, 591 | "statusTxt": statusTxt, 592 | "headDict": headDict, 593 | "linkDict": linkDict, 594 | "result": result, 595 | } 596 | 597 | def _awaitReset(self, utcTimeStamp, verbose=True): 598 | """Wait until the given UTC timestamp. 599 | 600 | Args: 601 | utcTimeStamp (int): A UTC format timestamp. 602 | verbose (Optional[bool]): If False, all extra printouts will be 603 | suppressed. Defaults to True. 
604 | 605 | """ 606 | resetTime = pytz.utc.localize(datetime.utcfromtimestamp(utcTimeStamp)) 607 | _vPrint(verbose, "--- Current Timestamp") 608 | _vPrint(verbose, " %s" % (time.strftime("%c"))) 609 | now = pytz.utc.localize(datetime.utcnow()) 610 | waitTime = round((resetTime - now).total_seconds()) + 1 611 | _vPrint(verbose, "--- Current UTC Timestamp") 612 | _vPrint(verbose, " %s" % (now.strftime("%c"))) 613 | _vPrint(verbose, "--- GITHUB NEEDS A BREAK Until UTC Timestamp") 614 | _vPrint(verbose, " %s" % (resetTime.strftime("%c"))) 615 | self._countdown( 616 | waitTime, printString="--- Waiting %*d seconds...", verbose=verbose 617 | ) 618 | _vPrint(verbose, "--- READY!") 619 | 620 | def _countdown( 621 | self, waitTime=0, printString="Waiting %*d seconds...", verbose=True 622 | ): 623 | """Prints a message and waits. 624 | 625 | Args: 626 | waitTime (Optional[int]): Number of seconds to wait. Defaults to 0. 627 | printString (Optional[str]): A counter message to display. 628 | Defaults to "Waiting %*d seconds...". 629 | verbose (Optional[bool]): If False, all extra printouts will be 630 | suppressed. Defaults to True. 631 | 632 | """ 633 | if waitTime <= 0: 634 | waitTime = self.retryDelay 635 | _vPrint(verbose, printString % (len(str(waitTime)), waitTime)) 636 | time.sleep(waitTime) 637 | 638 | 639 | class DataManager: 640 | """JSON data manager.""" 641 | 642 | def __init__(self, filePath=None, loadData=False): 643 | """Initialize the DataManager object. 644 | Args: 645 | filePath (Optional[str]): Relative or absolute path to a JSON 646 | data file. Defaults to None. 647 | loadData (Optional[bool]): Loads data from the given file path 648 | if True. Defaults to False. 649 | 650 | """ 651 | self.data = {} 652 | """Dict: Working data.""" 653 | self.filePath = filePath 654 | if loadData: 655 | self.fileLoad(updatePath=False) 656 | 657 | @property 658 | def filePath(self): 659 | """str: Absolute path to a JSON format data file. 660 | 661 | Can accept relative paths, but will always convert them to 662 | the absolute path. 663 | """ 664 | if not self.__filePath: 665 | raise ValueError("Internal variable filePath has not been set.") 666 | return self.__filePath 667 | 668 | @filePath.setter 669 | def filePath(self, filePath): 670 | if filePath: 671 | if not os.path.isfile(filePath): 672 | print( 673 | "Data file '%s' does not currently exist. Saving data will create a new file." 674 | % (filePath) 675 | ) 676 | self.__filePath = os.path.abspath(filePath) 677 | print("Stored new data file path '%s'" % (self.filePath)) 678 | else: 679 | self.__filePath = None 680 | 681 | def dataReset(self): 682 | """Reset the internal JSON data dictionary.""" 683 | self.data = {} 684 | print("Stored data has been reset.") 685 | 686 | def fileLoad(self, filePath=None, updatePath=True): 687 | """Load a JSON data file into the internal JSON data dictionary. 688 | 689 | Current internal data will be overwritten. 690 | If no file path is provided, the stored data file path will be used. 691 | 692 | Args: 693 | filePath (Optional[str]): A relative or absolute path to a 694 | '.json' file. Defaults to None. 695 | updatePath (Optional[bool]): Specifies whether or not to update 696 | the stored data file path. Defaults to True. 697 | 698 | """ 699 | if not filePath: 700 | filePath = self.filePath 701 | if not os.path.isfile(filePath): 702 | raise FileNotFoundError("Data file '%s' does not exist." % (filePath)) 703 | 704 | print( 705 | "Importing existing data file '%s' ... 
" % (filePath), 706 | end="", 707 | flush=True, 708 | ) 709 | with open(filePath, "r", encoding="utf-8") as q: 710 | data_raw = q.read() 711 | print("Imported!") 712 | self.data = json.loads(data_raw) 713 | if updatePath: 714 | self.filePath = filePath 715 | 716 | def fileSave(self, filePath=None, updatePath=False, newline=None): 717 | """Write the internal JSON data dictionary to a JSON data file. 718 | 719 | If no file path is provided, the stored data file path will be used. 720 | 721 | Args: 722 | filePath (Optional[str]): A relative or absolute path to a 723 | '.json' file. Defaults to None. 724 | updatePath (Optional[bool]): Specifies whether or not to update 725 | the stored data file path. Defaults to False. 726 | newline (Optional[str]): Specifies the line endings to use when 727 | writing the file. Defaults to system default line separator. 728 | 729 | """ 730 | if not filePath: 731 | filePath = self.filePath 732 | if not os.path.isfile(filePath): 733 | print("Data file '%s' does not exist, will create new file." % (filePath)) 734 | if not os.path.exists(os.path.split(filePath)[0]): 735 | os.makedirs(os.path.split(filePath)[0]) 736 | dataJsonString = json.dumps(self.data, indent=4, sort_keys=True) 737 | print("Writing to file '%s' ... " % (filePath), end="", flush=True) 738 | with open(filePath, "w", encoding="utf-8", newline=newline) as fileout: 739 | fileout.write(dataJsonString) 740 | print("Wrote file!") 741 | if updatePath: 742 | self.filePath = filePath 743 | -------------------------------------------------------------------------------- /scraper/github/util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | def _license_obj(license_name): 7 | """ 8 | A helper function to look up license object information 9 | 10 | Use names from: https://api.github.com/licenses 11 | """ 12 | obj = None 13 | 14 | if license_name in ("MIT", "MIT License"): 15 | obj = {"URL": "https://api.github.com/licenses/mit", "name": "MIT"} 16 | elif license_name in ('BSD 2-clause "Simplified" License'): 17 | obj = { 18 | "URL": "https://api.github.com/licenses/bsd-2-clause", 19 | "name": "BSD-2-Clause", 20 | } 21 | elif license_name in ('BSD 3-clause "New" or "Revised" License'): 22 | obj = { 23 | "URL": "https://api.github.com/licenses/bsd-3-clause", 24 | "name": "BSD-3-Clause", 25 | } 26 | elif license_name in ("Apache License 2.0"): 27 | obj = { 28 | "URL": "https://api.github.com/licenses/apache-2.0", 29 | "name": "Apache-2.0", 30 | } 31 | elif license_name in ("GNU General Public License v2.1"): 32 | obj = {"URL": "https://api.github.com/licenses/gpl-2.1", "name": "GPL-2.1"} 33 | elif license_name in ("GNU General Public License v2.0"): 34 | obj = {"URL": "https://api.github.com/licenses/gpl-2.0", "name": "GPL-2.0"} 35 | elif license_name in ("GNU Lesser General Public License v2.1"): 36 | obj = {"URL": "https://api.github.com/licenses/lgpl-2.1", "name": "LGPL-2.1"} 37 | elif license_name in ("GNU General Public License v3.0"): 38 | obj = {"URL": "https://api.github.com/licenses/gpl-3.0", "name": "GPL-3.0"} 39 | elif license_name in ("GNU Lesser General Public License v3.0"): 40 | obj = {"URL": "https://api.github.com/licenses/lgpl-3.0", "name": "LGPL-3.0"} 41 | elif license_name in ("Eclipse Public License 1.0"): 42 | obj = {"URL": "https://api.github.com/licenses/epl-1.0", "name": "EPL-1.0"} 43 | elif license_name in ("Mozilla Public License 2.0"): 44 | obj = {"URL": 
"https://api.github.com/licenses/mpl-2.0", "name": "MPL-2.0"} 45 | elif license_name in ("The Unlicense"): 46 | obj = {"URL": "https://api.github.com/licenses/unlicense", "name": "Unlicense"} 47 | elif license_name in ("GNU Affero General Public License v3.0"): 48 | obj = {"URL": "https://api.github.com/licenses/agpl-3.0", "name": "AGPL-3.0"} 49 | elif license_name in ("Eclipse Public License 2.0"): 50 | obj = {"URL": "https://api.github.com/licenses/epl-2.0", "name": "EPL-2.0"} 51 | 52 | if obj is None: 53 | logger.warning("I don't understand the license: %s", license_name) 54 | raise ValueError("Aborting!") 55 | 56 | return obj 57 | -------------------------------------------------------------------------------- /scraper/gitlab/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import gitlab 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def connect(url="https://gitlab.com", token=None): 10 | """ 11 | Return a connected GitLab session 12 | 13 | ``token`` should be a ``private_token`` from Gitlab 14 | """ 15 | 16 | if token is None: 17 | token = os.environ.get("GITLAB_API_TOKEN", None) 18 | 19 | gl_session = gitlab.Gitlab(url, token) 20 | 21 | try: 22 | gl_session.version() 23 | except gitlab.exceptions.GitlabAuthenticationError as exc: 24 | raise RuntimeError("Invalid or missing GITLAB_API_TOKEN") from exc 25 | 26 | logger.info("Connected to: %s", url) 27 | 28 | return gl_session 29 | 30 | 31 | def query_repos(gl_session, repos=None): 32 | """ 33 | Yields Gitlab project objects for all projects in Bitbucket 34 | """ 35 | 36 | if repos is None: 37 | repos = [] 38 | 39 | for repo in repos: 40 | yield gl_session.projects.get(repo) 41 | 42 | if not repos: 43 | for project in gl_session.projects.list(as_list=False): 44 | yield project 45 | -------------------------------------------------------------------------------- /scraper/tfs/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | import os 6 | 7 | from msrest.authentication import BasicAuthentication 8 | from vsts.vss_connection import VssConnection 9 | 10 | from scraper.tfs.models import TFSProject 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | HARD_CODED_TOP = 10000 15 | 16 | 17 | def get_projects_metadata(baseurl, token): 18 | logger.debug("Retrieving TFS Metdata.....") 19 | return get_all_projects(baseurl, token) 20 | 21 | 22 | def create_tfs_connection(url, token): 23 | """ 24 | Creates the TFS Connection Context 25 | """ 26 | if token is None: 27 | token = os.environ.get("TFS_API_TOKEN", None) 28 | 29 | tfs_credentials = BasicAuthentication("", token) 30 | tfs_connection = VssConnection(base_url=url, creds=tfs_credentials) 31 | return tfs_connection 32 | 33 | 34 | def create_tfs_project_analysis_client(url, token=None): 35 | """ 36 | Create a project_analysis_client.py client for a Team Foundation Server Enterprise connection instance. 37 | This is helpful for understanding project languages, but currently blank for all our test conditions. 38 | 39 | If token is not provided, will attempt to use the TFS_API_TOKEN 40 | environment variable if present. 
41 | """ 42 | if token is None: 43 | token = os.environ.get("TFS_API_TOKEN", None) 44 | 45 | tfs_connection = create_tfs_connection(url, token) 46 | project_analysis_client = tfs_connection.get_client( 47 | "vsts.project_analysis.v4_1.project_analysis_client.ProjectAnalysisClient" 48 | ) 49 | 50 | if project_analysis_client is None: 51 | raise RuntimeError( 52 | "Unable to connect to TFS Enterprise (%s) with provided token." % url 53 | ) 54 | 55 | return project_analysis_client 56 | 57 | 58 | def create_tfs_core_client(url, token=None): 59 | """ 60 | Create a core_client.py client for a Team Foundation Server Enterprise connection instance 61 | 62 | If token is not provided, will attempt to use the TFS_API_TOKEN 63 | environment variable if present. 64 | """ 65 | if token is None: 66 | token = os.environ.get("TFS_API_TOKEN", None) 67 | 68 | tfs_connection = create_tfs_connection(url, token) 69 | tfs_client = tfs_connection.get_client("vsts.core.v4_1.core_client.CoreClient") 70 | 71 | if tfs_client is None: 72 | raise RuntimeError( 73 | "Unable to connect to TFS Enterprise (%s) with provided token." % url 74 | ) 75 | 76 | return tfs_client 77 | 78 | 79 | def create_tfs_git_client(url, token=None): 80 | """ 81 | Creates a TFS Git Client to pull Git repo info 82 | """ 83 | if token is None: 84 | token = os.environ.get("TFS_API_TOKEN", None) 85 | 86 | tfs_connection = create_tfs_connection(url, token) 87 | tfs_git_client = tfs_connection.get_client("vsts.git.v4_1.git_client.GitClient") 88 | 89 | if tfs_git_client is None: 90 | raise RuntimeError( 91 | "Unable to create TFS Git Client, failed to connect to TFS Enterprise (%s) with provided token." 92 | % url 93 | ) 94 | 95 | return tfs_git_client 96 | 97 | 98 | def create_tfs_tfvc_client(url, token=None): 99 | """ 100 | Creates a TFS TFVC Client to pull TFVC repo info 101 | """ 102 | if token is None: 103 | token = os.environ.get("TFS_API_TOKEN", None) 104 | 105 | tfs_connection = create_tfs_connection(url, token) 106 | tfs_tfvc_client = tfs_connection.get_client("vsts.tfvc.v4_1.tfvc_client.TfvcClient") 107 | 108 | if tfs_tfvc_client is None: 109 | raise RuntimeError( 110 | "Unable to create TFS Git Client, failed to connect to TFS Enterprise (%s) with provided token." 111 | % url 112 | ) 113 | 114 | return tfs_tfvc_client 115 | 116 | 117 | def get_all_projects(url, token, top=HARD_CODED_TOP): 118 | """ 119 | Returns a list of all projects with their collection info from the server. Currently limited functionality to only return the first 1000 projects. 120 | #TODO refactor to add multiple calls to api to retrieve all projects if more exist beyond top. 
121 | """ 122 | project_list = [] 123 | tfs_client = create_tfs_core_client(url, token) 124 | 125 | collections = tfs_client.get_project_collections(top=top) 126 | 127 | for collection in collections: 128 | collection_client = create_tfs_core_client( 129 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 130 | token, 131 | ) 132 | 133 | logger.debug("Retrieving Projects for Project Collection: %s", collection.name) 134 | # Retrieves all projects in the project collection 135 | projects = collection_client.get_projects(top=HARD_CODED_TOP) 136 | # get_projects only gets the project references, have to call get_project_history_entries to get last update info for projects 137 | # Only calling this once per collection as its an expensive API call, wil refactor later if there is a better API call to use 138 | collection_history_list = collection_client.get_project_history_entries() 139 | for project in projects: 140 | # get_projects only gets team project ref objects, 141 | # have to call get_project to get the team project object which includes the TFS Web Url for the project 142 | logger.debug("Retrieving Team Project for Project: %s", project.name) 143 | projectInfo = collection_client.get_project(project.id, True, True) 144 | 145 | tfsProject = TFSProject(projectInfo, collection) 146 | 147 | logger.debug( 148 | "Retrieving Last Updated and Created Info for Project: %s", project.name 149 | ) 150 | tfsProject.projectLastUpdateInfo = get_project_last_update_time( 151 | collection_history_list, project.id 152 | ) 153 | tfsProject.projectCreateInfo = get_project_create_time( 154 | collection_history_list, project.id 155 | ) 156 | project_list.append(tfsProject) 157 | 158 | return project_list 159 | 160 | 161 | def get_git_repos(url, token, collection, project): 162 | """ 163 | Returns a list of all git repos for the supplied project within the supplied collection 164 | """ 165 | git_client = create_tfs_git_client( 166 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 167 | token, 168 | ) 169 | logger.debug("Retrieving Git Repos for Project: %s", project.name) 170 | return git_client.get_repositories(project.id) 171 | 172 | 173 | def get_tfvc_repos(url, token, collection, project): 174 | """ 175 | Returns a list of all tfvc branches for the supplied project within the supplied collection 176 | """ 177 | branch_list = [] 178 | tfvc_client = create_tfs_tfvc_client( 179 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 180 | token, 181 | ) 182 | 183 | logger.debug("Retrieving Tfvc Branches for Project: %s}", project.name) 184 | branches = tfvc_client.get_branches(project.id, True, True, False, True) 185 | if branches: 186 | branch_list.extend(branches) 187 | else: 188 | logger.debug("No Tfvc Branches in Project: %s", project.name) 189 | 190 | return branch_list 191 | 192 | 193 | def get_project_last_update_time(collection_history_list, projectId): 194 | sorted_history_list = sorted( 195 | collection_history_list, key=lambda x: x.last_update_time, reverse=True 196 | ) 197 | return next((x for x in sorted_history_list if x.id == projectId)) 198 | 199 | 200 | def get_project_create_time(collection_history_list, projectId): 201 | sorted_history_list = sorted( 202 | collection_history_list, key=lambda x: x.last_update_time, reverse=False 203 | ) 204 | return next((x for x in sorted_history_list if x.id == projectId)) 205 | -------------------------------------------------------------------------------- 
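# Added sketch (not part of the original package): the two history lookups
# above sort the full collection history on every call. An equivalent,
# sort-free variant using min()/max() over a single filtered pass, assuming
# each history entry exposes `id` and `last_update_time` like the vsts client
# objects do:

def project_history_bounds(collection_history_list, project_id):
    entries = [x for x in collection_history_list if x.id == project_id]
    created = min(entries, key=lambda x: x.last_update_time)  # earliest entry
    updated = max(entries, key=lambda x: x.last_update_time)  # latest entry
    return created, updated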
/scraper/tfs/models.py: -------------------------------------------------------------------------------- 1 | class TFSProject: 2 | def __init__(self, projectInfo, collectionInfo): 3 | self.projectInfo = projectInfo 4 | self.collectionInfo = collectionInfo 5 | self.projectCreateInfo = {} 6 | self.projectLastUpdateInfo = {} 7 | self.gitInfo = [] 8 | self.tfvcInfo = [] 9 | -------------------------------------------------------------------------------- /scraper/util.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import logging 4 | import logging.config 5 | import os 6 | from subprocess import PIPE, Popen # nosec 7 | import tempfile 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # These mirror the defaults in github3.py sessions per: 12 | # https://github.com/sigmavirus24/github3.py/blob/ce43e6e5fdef6555f5a6b6602e2cc4b66c428aef/src/github3/session.py#L98 13 | DEFAULT_REQUESTS_TIMEOUTS = (4, 10) 14 | 15 | 16 | def execute(command, cwd=None): 17 | logger.debug("Forking command: %s", command) 18 | 19 | if cwd is None: 20 | cwd = os.getcwd() 21 | elif not os.path.isdir(cwd): 22 | raise ValueError("path does not exist: %s" % cwd) 23 | 24 | with Popen( 25 | command, cwd=cwd, stdout=PIPE, stderr=PIPE, shell=False 26 | ) as process: # nosec 27 | out, err = process.communicate() 28 | 29 | if process.returncode: 30 | logging.error( 31 | "Error Executing: command=%s, returncode=%d", 32 | " ".join(command), 33 | process.returncode, 34 | ) 35 | 36 | return out.decode("utf-8"), err.decode("utf-8") 37 | 38 | 39 | def configure_logging(verbose=False): 40 | DEFAULT_LOGGING = { 41 | "version": 1, 42 | "disable_existing_loggers": False, 43 | "formatters": { 44 | "standard": { 45 | # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 46 | # 'format': '%(levelname)s: %(message)s' 47 | "format": "%(asctime)s - %(levelname)s: %(message)s" 48 | } 49 | }, 50 | "handlers": { 51 | "default": { 52 | "level": "INFO", 53 | "formatter": "standard", 54 | "class": "logging.StreamHandler", 55 | }, 56 | "null": { 57 | "level": "INFO", 58 | "formatter": "standard", 59 | "class": "logging.NullHandler", 60 | }, 61 | }, 62 | "loggers": { 63 | "": {"handlers": ["default"], "level": "DEBUG", "propagate": False}, 64 | "github3": {"handlers": ["null"], "level": "DEBUG", "propagate": False}, 65 | "urllib3": {"handlers": ["null"], "level": "DEBUG", "propagate": False}, 66 | }, 67 | } 68 | 69 | if verbose: 70 | DEFAULT_LOGGING["handlers"]["default"]["level"] = "DEBUG" 71 | # DEFAULT_LOGGING['loggers']['']['level'] = 'DEBUG' 72 | 73 | logging.config.dictConfig(DEFAULT_LOGGING) 74 | 75 | 76 | def git_repo_to_sloc(url): 77 | """ 78 | Given a Git repository URL, returns number of lines of code based on cloc 79 | 80 | Reference: 81 | - cloc: https://github.com/AlDanial/cloc 82 | - https://www.omg.org/spec/AFP/ 83 | - Another potential way to calculation effort 84 | 85 | Sample cloc output: 86 | { 87 | "header": { 88 | "cloc_url": "github.com/AlDanial/cloc", 89 | "cloc_version": "1.74", 90 | "elapsed_seconds": 0.195950984954834, 91 | "n_files": 27, 92 | "n_lines": 2435, 93 | "files_per_second": 137.78956000769, 94 | "lines_per_second": 12426.5769858787 95 | }, 96 | "C++": { 97 | "nFiles": 7, 98 | "blank": 121, 99 | "comment": 314, 100 | "code": 371 101 | }, 102 | "C/C++ Header": { 103 | "nFiles": 8, 104 | "blank": 107, 105 | "comment": 604, 106 | "code": 191 107 | }, 108 | "CMake": { 109 | "nFiles": 11, 110 | "blank": 49, 111 | "comment": 465, 112 
| "code": 165 113 | }, 114 | "Markdown": { 115 | "nFiles": 1, 116 | "blank": 18, 117 | "comment": 0, 118 | "code": 30 119 | }, 120 | "SUM": { 121 | "blank": 295, 122 | "comment": 1383, 123 | "code": 757, 124 | "nFiles": 27 125 | } 126 | } 127 | """ 128 | 129 | with tempfile.TemporaryDirectory() as tmp_dir: 130 | logger.debug("Cloning: url=%s tmp_dir=%s", url, tmp_dir) 131 | 132 | tmp_clone = os.path.join(tmp_dir, "clone-dir") 133 | 134 | cmd = ["git", "clone", "--depth=1", url, tmp_clone] 135 | execute(cmd) 136 | 137 | cmd = ["cloc", "--json", tmp_clone] 138 | out, err = execute(cmd) 139 | 140 | if err: 141 | logger.warning( 142 | "Error encountered while analyzing: url=%s stderr=%s", url, err 143 | ) 144 | 145 | try: 146 | cloc_json = json.loads(out) 147 | sloc = cloc_json["SUM"]["code"] 148 | except json.decoder.JSONDecodeError: 149 | logger.error("Error Decoding: url=%s, out=%s", url, out) 150 | sloc = 0 151 | except KeyError: 152 | logging.error( 153 | "Missing LOC information (Is the repository empty?): url=%s, json=%s", 154 | url, 155 | json.dumps(cloc_json), 156 | ) 157 | sloc = 0 158 | 159 | logger.debug("SLOC: url=%s, sloc=%d", url, sloc) 160 | 161 | return sloc 162 | 163 | 164 | def compute_labor_hours(sloc, month_hours="cocomo_book"): 165 | """ 166 | Compute the labor hours, given a count of source lines of code 167 | 168 | The intention is to use the COCOMO II model to compute this value. 169 | 170 | References: 171 | - http://csse.usc.edu/tools 172 | - http://softwarecost.org/tools/COCOMO/ 173 | - https://www.rose-hulman.edu/class/csse/csse372/201310/Homework/CII_modelman2000.pdf 174 | """ 175 | # Calculation of hours in a month 176 | if month_hours == "hours_per_year": 177 | # Use number of working hours in a year: 178 | # (40 Hours / week) * (52 weeks / year) / (12 months / year) ~= 173.33 179 | HOURS_PER_PERSON_MONTH = 40.0 * 52 / 12 180 | else: 181 | # Use value from COCOMO II Book (month_hours=='cocomo_book'): 182 | # Reference: https://dl.acm.org/citation.cfm?id=557000 183 | # This is the value used by the Code.gov team: 184 | # https://github.com/GSA/code-gov/blob/master/docs/labor_hour_calc.md 185 | HOURS_PER_PERSON_MONTH = 152.0 186 | 187 | # Coefficients for the COCOMO II model (only the two used for person-month 188 | # calculation) 189 | co_a = 2.94 190 | co_b = 0.91 191 | 192 | # These values represent a default of "Nominal" from the established 193 | # constant values for the COCOMO II model. 
194 | scale_factors = [ 195 | 3.72, # Precedentedness 196 | 3.04, # Development Flexibility 197 | 4.24, # Architecture / Risk Resolution 198 | 3.29, # Team Cohesion 199 | 4.68, # Process Maturity 200 | ] 201 | cost_drivers = [ 202 | 1.00, # Required Software Reliability 203 | 1.00, # Data Base Size 204 | 1.00, # Product Complexity 205 | 1.00, # Developed for Reusability 206 | 1.00, # Documentation Match to Lifecycle Needs 207 | 1.00, # Analyst Capability 208 | 1.00, # Programmer Capability 209 | 1.00, # Personnel Continuity 210 | 1.00, # Application Experience 211 | 1.00, # Platform Experience 212 | 1.00, # Language and Toolset Experience 213 | 1.00, # Time Constraint 214 | 1.00, # Storage Constraint 215 | 1.00, # Platform Volatility 216 | 1.00, # Use of Software Tools 217 | 1.00, # Multisite Development 218 | 1.00, # Required Development Schedule 219 | ] 220 | 221 | # The summation (∑) of the scale factors is used in this calculation 222 | scale_factor_aggregate = co_b + 0.01 * functools.reduce( 223 | lambda x, y: x + y, scale_factors 224 | ) 225 | # The product (∏) of the cost drivers 226 | effort_adjustment_factor = functools.reduce(lambda x, y: x * y, cost_drivers) 227 | # The calculation of person-months uses KSLOC for the size of a project 228 | size = sloc / 1000 229 | 230 | # Calculate PM = A * Size^E * EAF 231 | person_months = co_a * size**scale_factor_aggregate * effort_adjustment_factor 232 | 233 | labor_hours = round(person_months * HOURS_PER_PERSON_MONTH, 1) 234 | logger.debug("sloc=%d labor_hours=%d", sloc, labor_hours) 235 | 236 | return labor_hours 237 | 238 | 239 | def labor_hours_from_url(url): 240 | sum_sloc = git_repo_to_sloc(url) 241 | logger.info("SLOC: %d", sum_sloc) 242 | 243 | labor_hours = compute_labor_hours(sum_sloc) 244 | logger.info("labor_hours: %d", labor_hours) 245 | 246 | return labor_hours 247 | 248 | 249 | def _prune_dict_null_str(dictionary): 250 | """ 251 | Prune the "None" or emptry string values from dictionary items 252 | """ 253 | for key, value in list(dictionary.items()): 254 | if value is None or str(value) == "": 255 | del dictionary[key] 256 | 257 | if isinstance(value, dict): 258 | dictionary[key] = _prune_dict_null_str(dictionary[key]) 259 | 260 | return dictionary 261 | -------------------------------------------------------------------------------- /scripts/clone_everything.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import pathlib 4 | import subprocess 5 | from timeit import default_timer as timer 6 | 7 | import requests 8 | 9 | INPUT_FILE = "https://raw.githubusercontent.com/LLNL/llnl.github.io/main/visualize/github-data/intReposInfo.json" 10 | 11 | 12 | def main(): 13 | repo_info = requests.get(INPUT_FILE).json()["data"] 14 | 15 | BACKUP_PATH = "github_backup" 16 | pathlib.Path(BACKUP_PATH).mkdir(parents=True, exist_ok=True) 17 | 18 | start = timer() 19 | 20 | for slug, data in repo_info.items(): 21 | url = data["url"] 22 | clone_path = f"{BACKUP_PATH}/{slug}" 23 | if pathlib.Path(clone_path).exists(): 24 | print(f"... updating: {url}") 25 | subprocess.run(["time", "git", "fetch"], cwd=clone_path) 26 | else: 27 | print(f"... 
27 |             print(f"... cloning: {url}")
28 |             subprocess.run(["time", "git", "clone", "--mirror", url, clone_path])
29 |             if not pathlib.Path(clone_path).exists():
30 |                 print("Something went wrong with the clone, don't try to lfs fetch...")
31 |                 continue
32 |         subprocess.run(["time", "git", "lfs", "fetch", "--all"], cwd=clone_path)
33 | 
34 |     end = timer()
35 | 
36 |     print(end - start)  # Time in seconds, e.g. 5.38091952400282
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
--------------------------------------------------------------------------------
/scripts/codegov_compute_hours.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 | 
3 | import argparse
4 | import json
5 | 
6 | from scraper.util import compute_labor_hours, git_repo_to_sloc
7 | 
8 | parser = argparse.ArgumentParser(
9 |     description="Scrape code repositories for Code.gov / DOECode"
10 | )
11 | parser.add_argument(
12 |     "filename", type=str, help="Path to locally stored `code.json` file"
13 | )
14 | args = parser.parse_args()
15 | 
16 | code_gov_json = json.load(open(args.filename))
17 | releases = code_gov_json["releases"]
18 | 
19 | repo_urls = {
20 |     release["repositoryURL"].rstrip("/")
21 |     for release in releases
22 |     if release.get("vcs", "") == "git"
23 | }
24 | 
25 | for url in repo_urls:
26 |     # print(url)
27 | 
28 |     sloc = git_repo_to_sloc(url)
29 |     # print(sloc)
30 | 
31 |     hours = compute_labor_hours(sloc)
32 |     print("-- url=%s, sloc=%d, hours=%.1f" % (url, sloc, hours))
33 | 
--------------------------------------------------------------------------------
/scripts/get_stargazers.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import getpass
3 | import os
4 | 
5 | import github3
6 | import requests
7 | 
8 | 
9 | class GitHub_Stargazers:
10 |     def __init__(self):
11 |         self.repos = {}
12 |         self.stargazers = {}
13 |         self.total_count = 0
14 | 
15 |     def get_stats(self, username="", password="", organization="llnl", force=True):
16 |         """
17 |         Retrieves the stargazers for the repositories of the given organization.
18 |         Requires an organization admin token to access the data.
19 |         """
20 |         stargazers_file_path = "../github_stats_output/stargazers.csv"
21 |         if force or not os.path.isfile(stargazers_file_path):
22 |             my_github.login(username, password)
23 |             calls_beginning = self.logged_in_gh.ratelimit_remaining + 1
24 |             print("Rate Limit: " + str(calls_beginning))
25 |             my_github.get_org(organization)
26 |             my_github.get_repos()
27 |             my_github.write_to_file(file_path=stargazers_file_path)
28 | 
29 |             calls_remaining = self.logged_in_gh.ratelimit_remaining
30 |             calls_used = calls_beginning - calls_remaining
31 |             print(
32 |                 "Rate Limit Remaining: "
33 |                 + str(calls_remaining)
34 |                 + "\nUsed "
35 |                 + str(calls_used)
36 |                 + " API calls."
37 |             )
38 | 
39 |     def login(self, username="", password=""):
40 |         """
41 |         Performs a login and sets the Github object via given credentials. If
42 |         credentials are empty or incorrect then prompts user for credentials.
43 |         Stores the authentication token in a CREDENTIALS_FILE used for future
44 |         logins. Handles Two Factor Authentication.
45 | """ 46 | try: 47 | self.token = "" 48 | id = "" 49 | if not os.path.isfile("CREDENTIALS_FILE"): 50 | if username == "" or password == "": 51 | username = raw_input("Username: ") 52 | password = getpass.getpass("Password: ") 53 | note = "GitHub Organization Stats App" 54 | note_url = "http://software.llnl.gov/" 55 | scopes = ["user", "repo"] 56 | auth = github3.authorize( 57 | username, 58 | password, 59 | scopes, 60 | note, 61 | note_url, 62 | two_factor_callback=self.prompt_2fa, 63 | ) 64 | self.token = auth.token 65 | id = auth.id 66 | with open("CREDENTIALS_FILE", "w+") as fd: 67 | fd.write(self.token + "\n") 68 | fd.write(str(id)) 69 | fd.close() 70 | else: 71 | with open("CREDENTIALS_FILE", "r") as fd: 72 | self.token = fd.readline().strip() 73 | id = fd.readline().strip() 74 | fd.close() 75 | print("Logging in.") 76 | self.logged_in_gh = github3.login( 77 | token=self.token, two_factor_callback=self.prompt_2fa 78 | ) 79 | self.logged_in_gh.user().to_json() 80 | except (ValueError, AttributeError, github3.models.GitHubError): 81 | print("Bad credentials. Try again.") 82 | self.login() 83 | 84 | def prompt_2fa(self): 85 | """ 86 | Taken from 87 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 88 | Prompts a user for their 2FA code and returns it. 89 | """ 90 | code = "" 91 | while not code: 92 | code = raw_input("Enter 2FA code: ") 93 | return code 94 | 95 | def get_org(self, organization_name=""): 96 | """ 97 | Retrieves an organization via given org name. If given 98 | empty string, prompts user for an org name. 99 | """ 100 | self.organization_name = organization_name 101 | if organization_name == "": 102 | self.organization_name = raw_input("Organization: ") 103 | print("Getting organization.") 104 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 105 | 106 | def get_repos(self): 107 | """ 108 | Gets the repos for the organization and builds the URL/headers for 109 | getting timestamps of stargazers. 110 | """ 111 | print("Getting repos.") 112 | # Uses the developer API. Note this could change. 113 | 114 | headers = { 115 | "Accept": "application/vnd.github.v3.star+json", 116 | "Authorization": "token " + self.token, 117 | } 118 | temp_count = 0 119 | for repo in self.org_retrieved.iter_repos(): 120 | temp_count += 1 121 | url = ( 122 | "https://api.github.com/repos/" 123 | + self.organization_name 124 | + "/" 125 | + repo.name 126 | ) 127 | self.repos[repo.name] = self.get_stargazers(url=url, headers=headers) 128 | self.calc_stargazers(start_count=650) 129 | print("total count: \t" + str(self.total_count)) 130 | print(str(temp_count) + " repos") 131 | 132 | def get_stargazers(self, url, headers={}): 133 | """ 134 | Return a list of the stargazers of a GitHub repo 135 | 136 | Includes both the 'starred_at' and 'user' data. 
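        The 'starred_at' timestamps are only returned when the request carries
        the 'application/vnd.github.v3.star+json' Accept header built in get_repos().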
137 | 
138 |         param: url
139 |             url is the 'stargazers_url' of the form:
140 |             https://api.github.com/repos/LLNL/spack/stargazers
141 |         """
142 |         url = url + "/stargazers?per_page=100&page=%s"
143 |         page = 1
144 |         gazers = []
145 | 
146 |         json_data = requests.get(url % page, headers=headers).json()
147 |         while json_data:
148 |             gazers.extend(json_data)
149 |             page += 1
150 |             json_data = requests.get(url % page, headers=headers).json()
151 |         return gazers
152 | 
153 |     def calc_stargazers(self, date=(datetime.date.today()), start_count=0):
154 |         for repo_json in self.repos:
155 |             for stargazer in self.repos[repo_json]:
156 |                 print(stargazer)
157 |                 date = stargazer["starred_at"][:10]
158 |                 try:
159 |                     self.stargazers[date] += 1
160 |                 except KeyError:
161 |                     self.stargazers[date] = 1
162 | 
163 |         sorted_stargazers = sorted(self.stargazers)
164 |         for stargazer in reversed(sorted_stargazers):
165 |             number_starred = self.stargazers[stargazer]
166 |             self.stargazers[stargazer] = start_count - number_starred
167 |             start_count = start_count - number_starred
168 | 
169 |     def write_to_file(
170 |         self, file_path="", date=(datetime.date.today()), organization="llnl"
171 |     ):
172 |         """
173 |         Writes stargazers data to file.
174 |         """
175 |         with open(file_path, "w+") as out:
176 |             out.write("date,organization,stargazers\n")
177 |             sorted_stargazers = sorted(self.stargazers)  # sort by date string
178 |             for star in sorted_stargazers:
179 |                 out.write(star + "," + organization + "," + str(self.stargazers[star]) + "\n")
180 |             out.close()
181 | 
182 | 
183 | if __name__ == "__main__":
184 |     my_github = GitHub_Stargazers()
185 |     my_github.get_stats()
186 | 
--------------------------------------------------------------------------------
/scripts/get_traffic.py:
--------------------------------------------------------------------------------
1 | import calendar
2 | import csv
3 | import datetime
4 | import errno
5 | import getpass
6 | import json
7 | import math
8 | import os
9 | import time
10 | 
11 | import github3
12 | import requests
13 | 
14 | 
15 | class GitHub_Traffic:
16 |     def __init__(self):
17 |         self.referrers = {}
18 |         self.referrers_lower = {}
19 |         self.views = {}
20 |         self.clones = {}
21 | 
22 |         self.referrers_json = {}
23 |         self.views_json = {}
24 |         self.clones_json = {}
25 |         self.releases_json = {}
26 | 
27 |     def get_stats(self, username="", password="", organization="llnl", force=True):
28 |         """
29 |         Retrieves the traffic for the repositories of the given organization.
30 |         Requires an organization admin token to access the data.
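        Note: the GitHub traffic API only exposes roughly the last 14 days of
        views and clones, so this is meant to be run regularly, appending to
        the CSV files already on disk.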
31 |         """
32 |         referrers_file_path = "../github_stats_output/referrers.csv"
33 |         views_file_path = "../github_stats_output/views.csv"
34 |         clones_file_path = "../github_stats_output/clones.csv"
35 |         if force or not all(map(os.path.isfile, (referrers_file_path, views_file_path, clones_file_path))):
36 |             my_github.login(username, password)
37 |             calls_beginning = self.logged_in_gh.ratelimit_remaining + 1
38 |             print("Rate Limit: " + str(calls_beginning))
39 |             my_github.get_org(organization)
40 |             my_github.get_traffic()
41 |             views_row_count = my_github.check_data_redundancy(
42 |                 file_path=views_file_path, dict_to_check=self.views
43 |             )
44 |             clones_row_count = my_github.check_data_redundancy(
45 |                 file_path=clones_file_path, dict_to_check=self.clones
46 |             )
47 |             my_github.write_to_file(
48 |                 referrers_file_path=referrers_file_path,
49 |                 views_file_path=views_file_path,
50 |                 clones_file_path=clones_file_path,
51 |                 views_row_count=views_row_count,
52 |                 clones_row_count=clones_row_count,
53 |             )
54 |             my_github.write_json(
55 |                 dict_to_write=self.referrers_json,
56 |                 path_ending_type="traffic_popular_referrers",
57 |             )
58 |             my_github.write_json(
59 |                 dict_to_write=self.views_json, path_ending_type="traffic_views"
60 |             )
61 |             my_github.write_json(
62 |                 dict_to_write=self.clones_json, path_ending_type="traffic_clones"
63 |             )
64 |             my_github.write_json(
65 |                 dict_to_write=self.releases_json, path_ending_type="releases"
66 |             )
67 |             calls_remaining = self.logged_in_gh.ratelimit_remaining
68 |             calls_used = calls_beginning - calls_remaining
69 |             print(
70 |                 "Rate Limit Remaining: "
71 |                 + str(calls_remaining)
72 |                 + "\nUsed "
73 |                 + str(calls_used)
74 |                 + " API calls."
75 |             )
76 | 
77 |     def login(self, username="", password=""):
78 |         """
79 |         Performs a login and sets the Github object via given credentials. If
80 |         credentials are empty or incorrect then prompts user for credentials.
81 |         Stores the authentication token in a CREDENTIALS_FILE used for future
82 |         logins. Handles Two Factor Authentication.
83 |         """
84 |         try:
85 |             self.token = ""
86 |             id = ""
87 |             if not os.path.isfile("CREDENTIALS_FILE_ADMIN"):
88 |                 if username == "" or password == "":
89 |                     username = raw_input("Username: ")
90 |                     password = getpass.getpass("Password: ")
91 |                 note = "GitHub Organization Stats App"
92 |                 note_url = "http://software.llnl.gov/"
93 |                 scopes = ["user", "repo"]
94 |                 auth = github3.authorize(
95 |                     username,
96 |                     password,
97 |                     scopes,
98 |                     note,
99 |                     note_url,
100 |                     two_factor_callback=self.prompt_2fa,
101 |                 )
102 |                 self.token = auth.token
103 |                 id = auth.id
104 |                 with open("CREDENTIALS_FILE_ADMIN", "w+") as fd:
105 |                     fd.write(self.token + "\n")
106 |                     fd.write(str(id))
107 |                     fd.close()
108 |             else:
109 |                 with open("CREDENTIALS_FILE_ADMIN", "r") as fd:
110 |                     self.token = fd.readline().strip()
111 |                     id = fd.readline().strip()
112 |                     fd.close()
113 |             print("Logging in.")
114 |             self.logged_in_gh = github3.login(
115 |                 token=self.token, two_factor_callback=self.prompt_2fa
116 |             )
117 |             self.logged_in_gh.user().to_json()
118 |         except (ValueError, AttributeError, github3.models.GitHubError):
119 |             print("Bad credentials. Try again.")
120 |             self.login()
121 | 
122 |     def prompt_2fa(self):
123 |         """
124 |         Taken from
125 |         http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html
126 |         Prompts a user for their 2FA code and returns it.
127 |         """
128 |         code = ""
129 |         while not code:
130 |             code = raw_input("Enter 2FA code: ")
131 |         return code
132 | 
133 |     def get_org(self, organization_name=""):
134 |         """
135 |         Retrieves an organization via given org name.
If given 136 | empty string, prompts user for an org name. 137 | """ 138 | self.organization_name = organization_name 139 | if organization_name == "": 140 | self.organization_name = raw_input("Organization: ") 141 | print("Getting organization.") 142 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 143 | 144 | def get_traffic(self): 145 | """ 146 | Retrieves the traffic for the repositories of the given organization. 147 | """ 148 | print("Getting traffic.") 149 | # Uses the developer API. Note this could change. 150 | headers = { 151 | "Accept": "application/vnd.github.spiderman-preview", 152 | "Authorization": "token " + self.token, 153 | } 154 | headers_release = {"Authorization": "token " + self.token} 155 | for repo in self.org_retrieved.iter_repos(type="public"): 156 | url = ( 157 | "https://api.github.com/repos/" 158 | + self.organization_name 159 | + "/" 160 | + repo.name 161 | ) 162 | self.get_referrers(url=url, headers=headers, repo_name=repo.name) 163 | self.get_paths(url=url, headers=headers) 164 | self.get_data( 165 | url=url, 166 | headers=headers, 167 | dict_to_store=self.views, 168 | type="views", 169 | repo_name=repo.name, 170 | ) 171 | self.get_data( 172 | url=url, 173 | headers=headers, 174 | dict_to_store=self.clones, 175 | type="clones", 176 | repo_name=repo.name, 177 | ) 178 | self.get_releases(url=url, headers=headers_release, repo_name=repo.name) 179 | 180 | def get_releases(self, url="", headers={}, repo_name=""): 181 | """ 182 | Retrieves the releases for the given repo in JSON. 183 | """ 184 | url_releases = url + "/releases" 185 | r = requests.get(url_releases, headers=headers) 186 | self.releases_json[repo_name] = r.json() 187 | 188 | def get_referrers(self, url="", headers={}, repo_name=""): 189 | """ 190 | Retrieves the total referrers and unique referrers of all repos in json 191 | and then stores it in a dict. 192 | """ 193 | # JSON 194 | url_referrers = url + "/traffic/popular/referrers" 195 | r1 = requests.get(url_referrers, headers=headers) 196 | referrers_json = r1.json() 197 | self.referrers_json[repo_name] = referrers_json 198 | # CSV 199 | for referrer in referrers_json: 200 | ref_name = referrer["referrer"] 201 | try: 202 | tuple_in = (referrer["count"], referrer["uniques"]) # curr vals 203 | tuple = ( 204 | self.referrers[ref_name][0] + tuple_in[0], # cal new vals 205 | self.referrers[ref_name][1] + tuple_in[1], 206 | ) 207 | self.referrers[ref_name] = tuple # record new vals 208 | except KeyError: 209 | tuple = self.referrers[ref_name] = ( 210 | referrer["count"], 211 | referrer["uniques"], 212 | ) 213 | self.referrers_lower[ref_name.lower()] = ref_name 214 | 215 | def get_paths(self, url="", headers={}): 216 | """ 217 | Retrieves the popular paths information in json and then stores it in a 218 | dict. 219 | """ 220 | url_paths = url + "/traffic/popular/paths" 221 | # r2 = requests.get(url_paths, headers=headers) 222 | # print 'PATHS ' + str(r2.json()) 223 | 224 | def get_data( 225 | self, 226 | url="", 227 | headers={}, 228 | date=str(datetime.date.today()), 229 | dict_to_store={}, 230 | type="", 231 | repo_name="", 232 | ): 233 | """ 234 | Retrieves data from json and stores it in the supplied dict. Accepts 235 | 'clones' or 'views' as type. 
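        Timestamps arrive from the API in milliseconds and are converted to
        seconds before being used as keys; today's (still partial) data is skipped.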
236 | """ 237 | # JSON 238 | url = url + "/traffic/" + type 239 | r3 = requests.get(url, headers=headers) 240 | json = r3.json() 241 | if type == "views": 242 | self.views_json[repo_name] = json 243 | elif type == "clones": 244 | self.clones_json[repo_name] = json 245 | # CSV 246 | for day in json[type]: 247 | timestamp_seconds = day["timestamp"] / 1000 248 | try: 249 | date_timestamp = datetime.datetime.utcfromtimestamp( 250 | timestamp_seconds 251 | ).strftime("%Y-%m-%d") 252 | # do not add todays date, some views might not be recorded yet 253 | if date_timestamp != date: 254 | tuple_in = (day["count"], day["uniques"]) 255 | tuple = ( 256 | dict_to_store[timestamp_seconds][0] + tuple_in[0], 257 | dict_to_store[timestamp_seconds][1] + tuple_in[1], 258 | ) 259 | dict_to_store[timestamp_seconds] = tuple 260 | except KeyError: 261 | tuple = dict_to_store[timestamp_seconds] = ( 262 | day["count"], 263 | day["uniques"], 264 | ) 265 | 266 | def write_json( 267 | self, 268 | date=(datetime.date.today()), 269 | organization="llnl", 270 | dict_to_write={}, 271 | path_ending_type="", 272 | ): 273 | """ 274 | Writes all traffic data to file in JSON form. 275 | """ 276 | for repo in dict_to_write: 277 | if len(dict_to_write[repo]) != 0: # don't need to write out empty lists 278 | path = ( 279 | "../github-data/" 280 | + organization 281 | + "/" 282 | + repo 283 | + "/" 284 | + path_ending_type 285 | + "/" 286 | + str(date) 287 | + ".json" 288 | ) 289 | self.checkDir(path) 290 | with open(path, "w") as out: 291 | out.write( 292 | json.dumps( 293 | dict_to_write[repo], 294 | sort_keys=True, 295 | indent=4, 296 | separators=(",", ": "), 297 | ) 298 | ) 299 | out.close() 300 | 301 | def write_to_file( 302 | self, 303 | referrers_file_path="", 304 | views_file_path="", 305 | clones_file_path="", 306 | date=(datetime.date.today()), 307 | organization="llnl", 308 | views_row_count=0, 309 | clones_row_count=0, 310 | ): 311 | """ 312 | Writes all traffic data to file. 313 | """ 314 | self.write_referrers_to_file(file_path=referrers_file_path) 315 | self.write_data_to_file( 316 | file_path=views_file_path, 317 | dict_to_write=self.views, 318 | name="views", 319 | row_count=views_row_count, 320 | ) 321 | self.write_data_to_file( 322 | file_path=clones_file_path, 323 | dict_to_write=self.clones, 324 | name="clones", 325 | row_count=clones_row_count, 326 | ) 327 | 328 | def check_data_redundancy(self, file_path="", dict_to_check={}): 329 | """ 330 | Checks the given csv file against the json data scraped for the given 331 | dict. It will remove all data retrieved that has already been recorded 332 | so we don't write redundant data to file. Returns count of rows from 333 | file. 334 | """ 335 | count = 0 336 | exists = os.path.isfile(file_path) 337 | previous_dates = {} 338 | if exists: 339 | with open(file_path, "r") as input: 340 | input.readline() # skip header line 341 | for row in csv.reader(input): 342 | timestamp = calendar.timegm(time.strptime(row[0], "%Y-%m-%d")) 343 | if timestamp in dict_to_check: # our date is already recorded 344 | del dict_to_check[timestamp] 345 | # calc current id max 346 | count += 1 347 | input.close() 348 | return count 349 | 350 | def write_data_to_file( 351 | self, 352 | file_path="", 353 | date=str(datetime.date.today()), 354 | organization="llnl", 355 | dict_to_write={}, 356 | name="", 357 | row_count=0, 358 | ): 359 | """ 360 | Writes given dict to file. 
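        Rows are appended as 'date,organization,<name>,unique_<name>,id', with
        row_count continuing the id sequence from rows already on disk.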
361 | """ 362 | exists = os.path.isfile(file_path) 363 | with open(file_path, "a") as out: 364 | if not exists: 365 | out.write("date,organization," + name + ",unique_" + name + ",id\n") 366 | sorted_dict = sorted(dict_to_write) 367 | for day in sorted_dict: 368 | day_formatted = datetime.datetime.utcfromtimestamp(day).strftime( 369 | "%Y-%m-%d" 370 | ) 371 | out.write( 372 | day_formatted 373 | + "," 374 | + organization 375 | + "," 376 | + str(dict_to_write[day][0]) 377 | + "," 378 | + str(dict_to_write[day][1]) 379 | + "," 380 | + str(row_count) 381 | + "\n" 382 | ) 383 | row_count += 1 384 | 385 | def write_referrers_to_file( 386 | self, file_path="", date=str(datetime.date.today()), organization="llnl" 387 | ): 388 | """ 389 | Writes the referrers data to file. 390 | """ 391 | self.remove_date(file_path=file_path, date=date) 392 | referrers_exists = os.path.isfile(file_path) 393 | with open(file_path, "a") as out: 394 | if not referrers_exists: 395 | out.write( 396 | "date,organization,referrer,count,count_log,uniques," 397 | + "uniques_logged\n" 398 | ) 399 | sorted_referrers = sorted(self.referrers_lower) # sort based on lowercase 400 | for referrer in sorted_referrers: 401 | ref_name = self.referrers_lower[referrer] # grab real name from 402 | count = self.referrers[ref_name][0] 403 | uniques = self.referrers[ref_name][1] 404 | if count == 1: # so we don't display 0 for count of 1 405 | count = 1.5 406 | if uniques == 1: 407 | uniques = 1.5 408 | count_logged = math.log(count) 409 | uniques_logged = math.log(uniques) 410 | out.write( 411 | date 412 | + "," 413 | + organization 414 | + "," 415 | + ref_name 416 | + "," 417 | + str(count) 418 | + "," 419 | + str(count_logged) 420 | + "," 421 | + str(uniques) 422 | + "," 423 | + str(uniques_logged) 424 | + "\n" 425 | ) 426 | out.close() 427 | 428 | def remove_date(self, file_path="", date=str(datetime.date.today())): 429 | """ 430 | Removes all rows of the associated date from the given csv file. 431 | Defaults to today. 432 | """ 433 | languages_exists = os.path.isfile(file_path) 434 | if languages_exists: 435 | with open(file_path, "rb") as inp, open("temp.csv", "wb") as out: 436 | writer = csv.writer(out) 437 | for row in csv.reader(inp): 438 | if row[0] != date: 439 | writer.writerow(row) 440 | inp.close() 441 | out.close() 442 | os.remove(file_path) 443 | os.rename("temp.csv", file_path) 444 | 445 | def checkDir(self, file_path=""): 446 | """ 447 | Checks if a directory exists. If not, it creates one with the specified 448 | file_path. 449 | """ 450 | if not os.path.exists(os.path.dirname(file_path)): 451 | try: 452 | os.makedirs(os.path.dirname(file_path)) 453 | except OSError as e: 454 | if e.errno != errno.EEXIST: 455 | raise 456 | 457 | 458 | if __name__ == "__main__": 459 | my_github = GitHub_Traffic() 460 | my_github.get_stats() 461 | -------------------------------------------------------------------------------- /scripts/get_users_emails.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | 4 | import github3 5 | 6 | 7 | class GitHub_Users_Emails: 8 | def __init__(self): 9 | self.emails = {} 10 | self.logins_lower = {} 11 | 12 | def get_stats(self, username="", password="", organization="llnl", force=True): 13 | """ 14 | Retrieves the emails for the users of the given organization. 
15 | """ 16 | file_path = "../github_stats_output/users_emails.csv" 17 | if force or not os.path.isfile(file_path): 18 | my_github.login(username, password) 19 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 20 | print("Rate Limit: " + str(calls_beginning)) 21 | my_github.get_org(organization) 22 | count_members = my_github.get_mems_of_org() 23 | my_github.write_to_file(file_path) 24 | calls_remaining = self.logged_in_gh.ratelimit_remaining 25 | calls_used = calls_beginning - calls_remaining 26 | print( 27 | "Rate Limit Remaining: " 28 | + str(calls_remaining) 29 | + "\nUsed " 30 | + str(calls_used) 31 | + " API calls." 32 | ) 33 | 34 | def login(self, username="", password=""): 35 | """ 36 | Performs a login and sets the Github object via given credentials. If 37 | credentials are empty or incorrect then prompts user for credentials. 38 | Stores the authentication token in a CREDENTIALS_FILE used for future 39 | logins. Handles Two Factor Authentication. 40 | """ 41 | try: 42 | token = "" 43 | id = "" 44 | if not os.path.isfile("CREDENTIALS_FILE"): 45 | if username == "" or password == "": 46 | username = raw_input("Username: ") 47 | password = getpass.getpass("Password: ") 48 | note = "GitHub Organization Stats App" 49 | note_url = "http://software.llnl.gov/" 50 | scopes = ["user", "repo"] 51 | auth = github3.authorize( 52 | username, 53 | password, 54 | scopes, 55 | note, 56 | note_url, 57 | two_factor_callback=self.prompt_2fa, 58 | ) 59 | token = auth.token 60 | id = auth.id 61 | with open("CREDENTIALS_FILE", "w+") as fd: 62 | fd.write(token + "\n") 63 | fd.write(str(id)) 64 | fd.close() 65 | else: 66 | with open("CREDENTIALS_FILE", "r") as fd: 67 | token = fd.readline().strip() 68 | id = fd.readline().strip() 69 | fd.close() 70 | print("Logging in.") 71 | self.logged_in_gh = github3.login( 72 | token=token, two_factor_callback=self.prompt_2fa 73 | ) 74 | self.logged_in_gh.user().to_json() 75 | except (ValueError, AttributeError, github3.models.GitHubError): 76 | print("Bad credentials. Try again.") 77 | self.login() 78 | 79 | def prompt_2fa(self): 80 | """ 81 | Taken from 82 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 83 | Prompts a user for their 2FA code and returns it. 84 | """ 85 | code = "" 86 | while not code: 87 | code = raw_input("Enter 2FA code: ") 88 | return code 89 | 90 | def get_org(self, organization_name=""): 91 | """ 92 | Retrieves an organization via given org name. If given 93 | empty string, prompts user for an org name. 94 | """ 95 | if organization_name == "": 96 | organization_name = raw_input("Organization: ") 97 | print("Getting organization.") 98 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 99 | 100 | def get_mems_of_org(self): 101 | """ 102 | Retrieves the emails of the members of the organization. Note this Only 103 | gets public emails. Private emails would need authentication for each 104 | user. 105 | """ 106 | print("Getting members' emails.") 107 | for member in self.org_retrieved.iter_members(): 108 | login = member.to_json()["login"] 109 | user_email = self.logged_in_gh.user(login).to_json()["email"] 110 | if user_email is not None: 111 | self.emails[login] = user_email 112 | else: # user has no public email 113 | self.emails[login] = "none" 114 | # used for sorting regardless of case 115 | self.logins_lower[login.lower()] = login 116 | 117 | def write_to_file(self, file_path=""): 118 | """ 119 | Writes the user emails to file. 
120 | """ 121 | with open(file_path, "w+") as out: 122 | out.write("user, email\n") 123 | sorted_names = sorted(self.logins_lower) # sort based on lowercase 124 | for login in sorted_names: 125 | out.write( 126 | self.logins_lower[login] 127 | + "," 128 | + self.emails[self.logins_lower[login]] 129 | + "\n" 130 | ) 131 | out.close() 132 | 133 | 134 | if __name__ == "__main__": 135 | my_github = GitHub_Users_Emails() 136 | my_github.get_stats() 137 | -------------------------------------------------------------------------------- /scripts/get_year_commits.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import getpass 3 | import os 4 | import time 5 | 6 | import github3 7 | 8 | 9 | class GitHub_LLNL_Year_Commits: 10 | def __init__(self): 11 | self.commits_dict_list = [] 12 | self.commits = {} 13 | self.sorted_weeks = [] 14 | 15 | def get_year_commits( 16 | self, username="", password="", organization="llnl", force=True 17 | ): 18 | """ 19 | Does setup such as login, printing API info, and waiting for GitHub to 20 | build the commit statistics. Then gets the last year of commits and 21 | prints them to file. 22 | """ 23 | file_path = "year_commits.csv" 24 | if force or not os.path.isfile(file_path): 25 | my_github.login(username, password) 26 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 27 | print("Rate Limit: " + str(calls_beginning)) 28 | my_github.get_org(organization) 29 | my_github.repos(building_stats=True) 30 | print("Letting GitHub build statistics.") 31 | time.sleep(30) 32 | print("Trying again.") 33 | my_github.repos(building_stats=False) 34 | my_github.calc_total_commits(starting_commits=35163) 35 | my_github.write_to_file() 36 | calls_remaining = self.logged_in_gh.ratelimit_remaining 37 | calls_used = calls_beginning - calls_remaining 38 | print( 39 | "Rate Limit Remaining: " 40 | + str(calls_remaining) 41 | + "\nUsed " 42 | + str(calls_used) 43 | + " API calls." 44 | ) 45 | 46 | def login(self, username="", password=""): 47 | """ 48 | Performs a login and sets the Github object via given credentials. If 49 | credentials are empty or incorrect then prompts user for credentials. 50 | Stores the authentication token in a CREDENTIALS_FILE used for future 51 | logins. Handles Two Factor Authentication. 52 | """ 53 | try: 54 | token = "" 55 | id = "" 56 | if not os.path.isfile("CREDENTIALS_FILE"): 57 | if username == "" or password == "": 58 | username = raw_input("Username: ") 59 | password = getpass.getpass("Password: ") 60 | note = "GitHub Organization Stats App" 61 | note_url = "http://software.llnl.gov/" 62 | scopes = ["user", "repo"] 63 | auth = github3.authorize( 64 | username, 65 | password, 66 | scopes, 67 | note, 68 | note_url, 69 | two_factor_callback=self.prompt_2fa, 70 | ) 71 | token = auth.token 72 | id = auth.id 73 | with open("CREDENTIALS_FILE", "w+") as fd: 74 | fd.write(token + "\n") 75 | fd.write(str(id)) 76 | fd.close() 77 | else: 78 | with open("CREDENTIALS_FILE", "r") as fd: 79 | token = fd.readline().strip() 80 | id = fd.readline().strip() 81 | fd.close() 82 | print("Logging in.") 83 | self.logged_in_gh = github3.login( 84 | token=token, two_factor_callback=self.prompt_2fa 85 | ) 86 | self.logged_in_gh.user().to_json() 87 | except (ValueError, AttributeError, github3.models.GitHubError): 88 | print("Bad credentials. 
Try again.") 89 | self.login() 90 | 91 | def prompt_2fa(self): 92 | """ 93 | Taken from 94 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 95 | Prompts a user for their 2FA code and returns it. 96 | """ 97 | code = "" 98 | while not code: 99 | code = raw_input("Enter 2FA code: ") 100 | return code 101 | 102 | def get_org(self, organization_name=""): 103 | """ 104 | Retrieves an organization via given org name. If given 105 | empty string, prompts user for an org name. 106 | """ 107 | if organization_name == "": 108 | organization_name = raw_input("Organization: ") 109 | print("Getting organization.") 110 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 111 | 112 | def repos(self, building_stats=False): 113 | """ 114 | Retrieves the last year of commits for the organization and stores them 115 | in weeks (UNIX time) associated with number of commits that week. 116 | """ 117 | print("Getting repos.") 118 | for repo in self.org_retrieved.iter_repos(): 119 | for activity in repo.iter_commit_activity(): 120 | if not building_stats: 121 | self.commits_dict_list.append(activity) 122 | 123 | def calc_total_commits(self, starting_commits=0): 124 | """ 125 | Uses the weekly commits and traverses back through the last 126 | year, each week subtracting the weekly commits and storing them. It 127 | needs an initial starting commits number, which should be taken from 128 | the most up to date number from github_stats.py output. 129 | """ 130 | for week_of_commits in self.commits_dict_list: 131 | try: 132 | self.commits[week_of_commits["week"]] -= week_of_commits["total"] 133 | except KeyError: 134 | total = self.commits[week_of_commits["week"]] = -week_of_commits[ 135 | "total" 136 | ] 137 | self.sorted_weeks = sorted(self.commits) 138 | 139 | # reverse because lower numbered weeks are older in time. 140 | # we traverse from most recent to oldest 141 | for week in reversed(self.sorted_weeks): 142 | self.commits[week] = self.commits[week] + starting_commits 143 | starting_commits = self.commits[week] 144 | 145 | def write_to_file(self): 146 | """ 147 | Writes the weeks with associated commits to file. 
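        The other columns are zero-filled placeholders; only date, organization,
        and the cumulative commit count are meaningful, and consecutive
        duplicate totals are skipped.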
148 | """ 149 | with open("../github_stats_output/last_year_commits.csv", "w+") as output: 150 | output.write( 151 | "date,organization,repos,members,teams," 152 | + "unique_contributors,total_contributors,forks," 153 | + "stargazers,pull_requests,open_issues,has_readme," 154 | + "has_license,pull_requests_open,pull_requests_closed," 155 | + "commits\n" 156 | ) 157 | # no reverse this time to print oldest first 158 | previous_commits = 0 159 | for week in self.sorted_weeks: 160 | if str(self.commits[week]) != previous_commits: # delete dups 161 | week_formatted = datetime.datetime.utcfromtimestamp(week).strftime( 162 | "%Y-%m-%d" 163 | ) 164 | output.write( 165 | week_formatted 166 | + ",llnl,0,0,0,0,0,0,0,0,0,0,0,0,0," 167 | + str(self.commits[week]) 168 | + "\n" 169 | ) 170 | previous_commits = str(self.commits[week]) 171 | 172 | 173 | if __name__ == "__main__": 174 | my_github = GitHub_LLNL_Year_Commits() 175 | my_github.get_year_commits() 176 | -------------------------------------------------------------------------------- /scripts/github_stats.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import csv 3 | import datetime 4 | import errno 5 | import getpass 6 | import json 7 | import math 8 | import os 9 | import time 10 | 11 | import github3 12 | import my_repo 13 | 14 | 15 | class GitHub_LLNL_Stats: 16 | def __init__(self): 17 | print("Initalizing.") 18 | self.unique_contributors = defaultdict(list) 19 | self.languages = {} 20 | self.languages_size = {} 21 | self.all_repos = [] 22 | self.total_repos = 0 23 | self.total_contributors = 0 24 | self.total_forks = 0 25 | self.total_stars = 0 26 | self.total_pull_reqs = 0 27 | self.total_pull_reqs_open = 0 28 | self.total_pull_reqs_closed = 0 29 | self.total_open_issues = 0 30 | self.total_closed_issues = 0 31 | self.total_issues = 0 32 | self.total_readmes = 0 33 | self.total_licenses = 0 34 | self.total_commits = 0 35 | self.search_limit = 0 36 | self.previous_language = "" 37 | 38 | # JSON vars 39 | self.repos_json = {} 40 | self.members_json = {} 41 | self.teams_json = {} 42 | self.contributors_json = defaultdict(list) 43 | self.pull_requests_json = defaultdict(list) 44 | self.issues_json = defaultdict(list) 45 | self.languages_json = defaultdict(dict) 46 | self.commits_json = defaultdict(list) 47 | 48 | def get_stats( 49 | self, 50 | username="", 51 | password="", 52 | organization="llnl", 53 | force=True, 54 | repo_type="public", 55 | ): 56 | """ 57 | Retrieves the statistics from the given organization with the given 58 | credentials. Will not retreive data if file exists and force hasn't been 59 | set to True. This is to save GH API requests. 
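        Output is written to ../github_stats_output/YYYY/YYYY-MM/YYYY-MM-DD.csv,
        with JSON snapshots stored under ../github-data/.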
60 | """ 61 | date = str(datetime.date.today()) 62 | file_path = ( 63 | "../github_stats_output/" + date[:4] + "/" + date[:7] + "/" + date + ".csv" 64 | ) 65 | if force or not os.path.isfile(file_path): 66 | my_github.login(username, password) 67 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 68 | print("Rate Limit: " + str(calls_beginning)) 69 | my_github.get_org(organization) 70 | count_members = my_github.get_mems_of_org() 71 | count_teams = my_github.get_teams_of_org() 72 | my_github.repos(repo_type=repo_type, organization=organization) 73 | # Write JSON 74 | my_github.write_org_json( 75 | dict_to_write=self.members_json, 76 | path_ending_type="members", 77 | is_list=True, 78 | ) 79 | my_github.write_org_json( 80 | dict_to_write={"singleton": self.org_retrieved.to_json()}, 81 | path_ending_type="organization", 82 | ) 83 | my_github.write_org_json( 84 | dict_to_write=self.teams_json, path_ending_type="teams", is_list=True 85 | ) 86 | 87 | my_github.write_repo_json( 88 | dict_to_write=self.repos_json, path_ending_type="repo" 89 | ) 90 | my_github.write_repo_json( 91 | dict_to_write=self.contributors_json, 92 | path_ending_type="contributors", 93 | is_list=True, 94 | ) 95 | my_github.write_repo_json( 96 | dict_to_write=self.pull_requests_json, 97 | path_ending_type="pull-requests", 98 | is_list=True, 99 | ) 100 | my_github.write_repo_json( 101 | dict_to_write=self.issues_json, path_ending_type="issues", is_list=True 102 | ) 103 | my_github.write_repo_json( 104 | dict_to_write=self.languages_json, 105 | path_ending_type="languages", 106 | is_dict=True, 107 | ) 108 | my_github.write_repo_json( 109 | dict_to_write=self.commits_json, 110 | path_ending_type="commits", 111 | is_list=True, 112 | ) 113 | # Write CSV 114 | my_github.write_to_file( 115 | file_path, date, organization, count_members, count_teams 116 | ) 117 | calls_remaining = self.logged_in_gh.ratelimit_remaining 118 | calls_used = calls_beginning - calls_remaining 119 | print( 120 | "Rate Limit Remaining: " 121 | + str(calls_remaining) 122 | + "\nUsed " 123 | + str(calls_used) 124 | + " API calls." 125 | ) 126 | 127 | def login(self, username="", password=""): 128 | """ 129 | Performs a login and sets the Github object via given credentials. If 130 | credentials are empty or incorrect then prompts user for credentials. 131 | Stores the authentication token in a CREDENTIALS_FILE used for future 132 | logins. Handles Two Factor Authentication. 
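        The CREDENTIALS_FILE stores the token on its first line and the
        authorization id on its second.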
133 | """ 134 | try: 135 | self.token = "" 136 | id = "" 137 | if not os.path.isfile("CREDENTIALS_FILE"): 138 | if username == "" or password == "": 139 | username = raw_input("Username: ") 140 | password = getpass.getpass("Password: ") 141 | note = "GitHub Organization Stats App" 142 | note_url = "http://software.llnl.gov/" 143 | scopes = ["user", "repo"] 144 | auth = github3.authorize( 145 | username, 146 | password, 147 | scopes, 148 | note, 149 | note_url, 150 | two_factor_callback=self.prompt_2fa, 151 | ) 152 | self.token = auth.token 153 | id = auth.id 154 | with open("CREDENTIALS_FILE", "w+") as fd: 155 | fd.write(self.token + "\n") 156 | fd.write(str(id)) 157 | fd.close() 158 | else: 159 | with open("CREDENTIALS_FILE", "r") as fd: 160 | self.token = fd.readline().strip() 161 | id = fd.readline().strip() 162 | fd.close() 163 | print("Logging in.") 164 | self.logged_in_gh = github3.login( 165 | token=self.token, two_factor_callback=self.prompt_2fa 166 | ) 167 | self.logged_in_gh.user().to_json() 168 | except (ValueError, AttributeError, github3.models.GitHubError): 169 | print("Bad credentials. Try again.") 170 | self.login() 171 | 172 | def prompt_2fa(self): 173 | """ 174 | Taken from 175 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 176 | Prompts a user for their 2FA code and returns it. 177 | """ 178 | code = "" 179 | while not code: 180 | code = raw_input("Enter 2FA code: ") 181 | return code 182 | 183 | def get_org(self, organization_name=""): 184 | """ 185 | Retrieves an organization via given org name. If given 186 | empty string, prompts user for an org name. 187 | """ 188 | if organization_name == "": 189 | organization_name = raw_input("Organization: ") 190 | print("Getting organization.") 191 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 192 | 193 | def get_mems_of_org(self): 194 | """ 195 | Retrieves the number of members of the organization. 196 | """ 197 | print("Getting members.") 198 | counter = 0 199 | for member in self.org_retrieved.iter_members(): 200 | self.members_json[member.id] = member.to_json() 201 | counter += 1 202 | return counter 203 | 204 | def get_teams_of_org(self): 205 | """ 206 | Retrieves the number of teams of the organization. 207 | """ 208 | print("Getting teams.") 209 | counter = 0 210 | for team in self.org_retrieved.iter_teams(): 211 | self.teams_json[team.id] = team.to_json() 212 | counter += 1 213 | return counter 214 | 215 | def repos(self, repo_type="public", organization="llnl"): 216 | """ 217 | Retrieves info about the repos of the current organization. 
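        For each repository this collects contributors, forks, stargazers, pull
        requests, issues, languages, README status, and commit counts,
        accumulating organization-wide totals along the way.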
218 | """ 219 | print("Getting repos.") 220 | for repo in self.org_retrieved.iter_repos(type=repo_type): 221 | # JSON 222 | json = repo.to_json() 223 | self.repos_json[repo.name] = json 224 | # CSV 225 | temp_repo = my_repo.My_Repo() 226 | temp_repo.name = repo.full_name 227 | self.total_repos += 1 228 | temp_repo.contributors = my_github.get_total_contributors(repo) 229 | self.total_contributors += temp_repo.contributors 230 | temp_repo.forks = repo.forks_count 231 | self.total_forks += temp_repo.forks 232 | temp_repo.stargazers = repo.stargazers 233 | self.total_stars += temp_repo.stargazers 234 | ( 235 | temp_repo.pull_requests_open, 236 | temp_repo.pull_requests_closed, 237 | ) = my_github.get_pull_reqs(repo) 238 | temp_repo.pull_requests = ( 239 | temp_repo.pull_requests_open + temp_repo.pull_requests_closed 240 | ) 241 | self.total_pull_reqs += temp_repo.pull_requests_open 242 | self.total_pull_reqs += temp_repo.pull_requests_closed 243 | self.total_pull_reqs_open += temp_repo.pull_requests_open 244 | self.total_pull_reqs_closed += temp_repo.pull_requests_closed 245 | temp_repo.open_issues = repo.open_issues_count 246 | self.total_open_issues += temp_repo.open_issues 247 | temp_repo.closed_issues = my_github.get_issues( 248 | repo, organization=organization 249 | ) 250 | temp_repo.issues = temp_repo.closed_issues + temp_repo.open_issues 251 | self.total_closed_issues += temp_repo.closed_issues 252 | self.total_issues += temp_repo.issues 253 | my_github.get_languages(repo, temp_repo) 254 | temp_repo.readme = my_github.get_readme(repo) 255 | # temp_repo.license = my_github.get_license(repo) 256 | temp_repo.commits = self.get_commits(repo=repo, organization=organization) 257 | self.total_commits += temp_repo.commits 258 | self.all_repos.append(temp_repo) 259 | 260 | def get_total_contributors(self, repo): 261 | """ 262 | Retrieves the number of contributors to a repo in the organization. 263 | Also adds to unique contributor list. 264 | """ 265 | repo_contributors = 0 266 | for contributor in repo.iter_contributors(): 267 | repo_contributors += 1 268 | self.unique_contributors[contributor.id].append(repo.name) 269 | self.contributors_json[repo.name].append(contributor.to_json()) 270 | return repo_contributors 271 | 272 | def get_pull_reqs(self, repo): 273 | """ 274 | Retrieves the number of pull requests on a repo in the organization. 275 | """ 276 | pull_reqs_open = 0 277 | pull_reqs_closed = 0 278 | for pull_request in repo.iter_pulls(state="all"): 279 | self.pull_requests_json[repo.name].append(pull_request.to_json()) 280 | if pull_request.closed_at is not None: 281 | pull_reqs_closed += 1 282 | else: 283 | pull_reqs_open += 1 284 | return pull_reqs_open, pull_reqs_closed 285 | 286 | def get_issues(self, repo, organization="llnl"): 287 | """ 288 | Retrieves the number of closed issues. 289 | """ 290 | # JSON 291 | path = "../github-data/" + organization + "/" + repo.name + "/issues" 292 | is_only_today = False 293 | if not os.path.exists(path): # no previous path, get all issues 294 | all_issues = repo.iter_issues(state="all") 295 | is_only_today = True 296 | else: 297 | files = os.listdir(path) 298 | date = str(files[-1][:-5]) 299 | if date == str(datetime.date.today()): 300 | # most recent date is actually today, get previous most recent date 301 | if len(files) > 2: 302 | date = str(files[-2][:-5]) 303 | else: 304 | # This means there is only one file, today. 
Retrieve every issue 305 | all_issues = repo.iter_issues(state="all") 306 | is_only_today = True 307 | if not is_only_today: # there's a previous saved JSON that's not today 308 | all_issues = repo.iter_issues(since=date, state="all") 309 | for issue in all_issues: 310 | self.issues_json[repo.name].append(issue.to_json()) 311 | # CSV 312 | closed_issues = 0 313 | for issue in repo.iter_issues(state="closed"): 314 | if issue is not None: 315 | closed_issues += 1 316 | return closed_issues 317 | 318 | def get_languages(self, repo, temp_repo): 319 | """ 320 | Retrieves the languages used in the repo and increments the respective 321 | counts of those languages. Only increments languages that have names. 322 | Anything else is not incremented (i.e. numbers). 323 | """ 324 | try: 325 | self.languages[repo.language] += 1 326 | except KeyError: 327 | count = self.languages[repo.language] = 1 328 | for repo_languages in repo.iter_languages(): 329 | self.languages_json[repo.name][repo_languages[0]] = repo_languages[1] 330 | for language in repo_languages: 331 | if isinstance(language, basestring): # is language 332 | temp_repo.languages.append(language) 333 | self.previous_language = language 334 | else: # record size bytes of language 335 | try: 336 | self.languages_size[self.previous_language] += language 337 | except KeyError: 338 | size = self.languages_size[self.previous_language] = language 339 | 340 | def get_readme(self, repo): 341 | """ 342 | Checks to see if the given repo has a ReadMe. MD means it has a correct 343 | Readme recognized by GitHub. 344 | """ 345 | readme_contents = repo.readme() 346 | if readme_contents is not None: 347 | self.total_readmes += 1 348 | return "MD" 349 | if self.search_limit >= 28: 350 | print("Hit search limit. Sleeping for 60 sec.") 351 | time.sleep(60) 352 | self.search_limit = 0 353 | self.search_limit += 1 354 | search_results = self.logged_in_gh.search_code( 355 | "readme" + "in:path repo:" + repo.full_name 356 | ) 357 | try: 358 | for result in search_results: 359 | path = result.path[1:] 360 | if "/" not in path and "readme" in path.lower(): 361 | self.total_readmes += 1 362 | return path 363 | return "MISS" 364 | except (github3.models.GitHubError, StopIteration): 365 | return "MISS" 366 | 367 | def get_license(self, repo): 368 | """ 369 | Checks to see if the given repo has a top level LICENSE file. 370 | """ 371 | if self.search_limit >= 28: 372 | print("Hit search limit. Sleeping for 60 sec.") 373 | time.sleep(60) 374 | self.search_limit = 0 375 | self.search_limit += 1 376 | search_results = self.logged_in_gh.search_code( 377 | "license" + "in:path repo:" + repo.full_name 378 | ) 379 | try: 380 | for result in search_results: 381 | path = result.path[1:] 382 | if "/" not in path and "license" in path.lower(): 383 | self.total_licenses += 1 384 | return path 385 | return "MISS" 386 | except StopIteration: 387 | return "MISS" 388 | 389 | def get_commits(self, repo, organization="llnl"): 390 | """ 391 | Retrieves the number of commits to a repo in the organization. If it is 392 | the first time getting commits for a repo, it will get all commits and 393 | save them to JSON. If there are previous commits saved, it will only get 394 | commits that have not been saved to disk since the last date of commits. 
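        The date of the last saved snapshot is inferred from the JSON filenames
        under ../github-data/<organization>/<repo>/commits.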
395 | """ 396 | # JSON 397 | path = "../github-data/" + organization + "/" + repo.name + "/commits" 398 | is_only_today = False 399 | if not os.path.exists(path): # no previous path, get all commits 400 | all_commits = repo.iter_commits() 401 | is_only_today = True 402 | else: 403 | files = os.listdir(path) 404 | date = str(files[-1][:-5]) 405 | if date == str(datetime.date.today()): 406 | # most recent date is actually today, get previous most recent date 407 | if len(files) > 2: 408 | date = str(files[-2][:-5]) 409 | else: 410 | # This means there is only one file, today. Retrieve every commit 411 | all_commits = repo.iter_commits() 412 | is_only_today = True 413 | if not is_only_today: # there's a previous saved JSON that's not today 414 | all_commits = repo.iter_commits(since=date) 415 | for commit in all_commits: 416 | self.commits_json[repo.name].append(commit.to_json()) 417 | # for csv 418 | count = 0 419 | for commit in repo.iter_commits(): 420 | count += 1 421 | return count 422 | 423 | def write_org_json( 424 | self, 425 | date=(datetime.date.today()), 426 | organization="llnl", 427 | dict_to_write={}, 428 | path_ending_type="", 429 | is_list=False, 430 | ): 431 | """ 432 | Writes stats from the organization to JSON. 433 | """ 434 | path = ( 435 | "../github-data/" 436 | + organization 437 | + "-org/" 438 | + path_ending_type 439 | + "/" 440 | + str(date) 441 | + ".json" 442 | ) 443 | self.checkDir(path) 444 | with open(path, "w") as out_clear: # clear old data 445 | out_clear.close() 446 | with open(path, "a") as out: 447 | if is_list: # used for list of items 448 | out.write("[") 449 | for item in dict_to_write: 450 | out.write( 451 | json.dumps( 452 | dict_to_write[item], 453 | sort_keys=True, 454 | indent=4, 455 | separators=(",", ": "), 456 | ) 457 | + "," 458 | ) 459 | out.seek(-1, os.SEEK_END) # kill last comma 460 | out.truncate() 461 | if is_list: 462 | out.write("]") 463 | out.close() 464 | 465 | def write_repo_json( 466 | self, 467 | date=(datetime.date.today()), 468 | organization="llnl", 469 | dict_to_write={}, 470 | path_ending_type="", 471 | is_list=False, 472 | is_dict=False, 473 | ): 474 | """ 475 | #Writes repo specific data to JSON. 476 | """ 477 | for repo in dict_to_write: 478 | path = ( 479 | "../github-data/" 480 | + organization 481 | + "/" 482 | + repo 483 | + "/" 484 | + path_ending_type 485 | + "/" 486 | + str(date) 487 | + ".json" 488 | ) 489 | self.checkDir(path) 490 | with open(path, "w") as out: 491 | if is_list: 492 | out.write("[") 493 | for value in dict_to_write[repo]: 494 | if is_dict: 495 | for inner_dict in value: 496 | out.write( 497 | json.dumps( 498 | inner_dict, 499 | sort_keys=True, 500 | indent=4, 501 | separators=(",", ": "), 502 | ) 503 | + "," 504 | ) 505 | else: 506 | out.write( 507 | json.dumps( 508 | value, 509 | sort_keys=True, 510 | indent=4, 511 | separators=(",", ": "), 512 | ) 513 | + "," 514 | ) 515 | out.seek(-1, os.SEEK_END) # kill last comma 516 | out.truncate() 517 | out.write("]") 518 | else: 519 | out.write( 520 | json.dumps( 521 | dict_to_write[repo], 522 | sort_keys=True, 523 | indent=4, 524 | separators=(",", ": "), 525 | ) 526 | ) 527 | out.close() 528 | 529 | def write_to_file( 530 | self, 531 | file_path="", 532 | date=str(datetime.date.today()), 533 | organization="N/A", 534 | members=0, 535 | teams=0, 536 | ): 537 | """ 538 | Writes the current organization information to file (csv). 
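        The first data row holds the organization-level counts, followed by one
        row per repository and a closing totals row.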
539 | """ 540 | self.checkDir(file_path) 541 | with open(file_path, "w+") as output: 542 | output.write( 543 | "date,organization,members,teams,unique_contributors," 544 | + "repository,contributors,forks,stargazers,pull_requests," 545 | + "open_issues,has_readme,has_license,languages,pull_requests_open," 546 | + "pull_requests_closed,commits,closed_issues,issues\n" 547 | + date 548 | + "," 549 | + organization 550 | + "," 551 | + str(members) 552 | + "," 553 | + str(teams) 554 | + "," 555 | + str(len(self.unique_contributors)) 556 | + "\n" 557 | ) 558 | for repo in self.all_repos: 559 | output.write( 560 | ",,,,," 561 | + repo.name 562 | + "," 563 | + str(repo.contributors) 564 | + "," 565 | + str(repo.forks) 566 | + "," 567 | + str(repo.stargazers) 568 | + "," 569 | + str(repo.pull_requests) 570 | + "," 571 | + str(repo.open_issues) 572 | + "," 573 | + str(repo.readme) 574 | + "," 575 | + str(repo.license) 576 | + "," 577 | + " ".join(sorted(repo.languages)) 578 | + "," 579 | + str(repo.pull_requests_open) 580 | + "," 581 | + str(repo.pull_requests_closed) 582 | + "," 583 | + str(repo.commits) 584 | + "," 585 | + str(repo.closed_issues) 586 | + "," 587 | + str(repo.issues) 588 | + "\n" 589 | ) 590 | output.write( 591 | ",,,,total," 592 | + str(self.total_repos) 593 | + "," 594 | + str(self.total_contributors) 595 | + "," 596 | + str(self.total_forks) 597 | + "," 598 | + str(self.total_stars) 599 | + "," 600 | + str(self.total_pull_reqs) 601 | + "," 602 | + str(self.total_open_issues) 603 | + "," 604 | + str(self.total_readmes) 605 | + "," 606 | + str(self.total_licenses) 607 | + ",," 608 | + str(self.total_pull_reqs_open) 609 | + "," 610 | + str(self.total_pull_reqs_closed) 611 | + "," 612 | + str(self.total_commits) 613 | + "," 614 | + str(self.total_closed_issues) 615 | + "," 616 | + str(self.total_issues) 617 | ) 618 | output.close() 619 | # Update total 620 | self.write_totals( 621 | file_path="../github_stats_output/total.csv", 622 | date=date, 623 | organization=organization, 624 | members=members, 625 | teams=teams, 626 | ) 627 | # Update language sizes 628 | self.write_languages( 629 | file_path="../github_stats_output/languages.csv", date=date 630 | ) 631 | 632 | def write_totals( 633 | self, 634 | file_path="", 635 | date=str(datetime.date.today()), 636 | organization="N/A", 637 | members=0, 638 | teams=0, 639 | ): 640 | """ 641 | Updates the total.csv file with current data. 
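        Any existing row for the same date is removed first (via
        delete_last_line) so the script can run more than once a day without
        duplicating entries.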
642 | """ 643 | 644 | total_exists = os.path.isfile(file_path) 645 | with open(file_path, "a") as out_total: 646 | if not total_exists: 647 | out_total.write( 648 | "date,organization,repos,members,teams," 649 | + "unique_contributors,total_contributors,forks," 650 | + "stargazers,pull_requests,open_issues,has_readme," 651 | + "has_license,pull_requests_open,pull_requests_closed," 652 | + "commits,id,closed_issues,issues\n" 653 | ) 654 | self.delete_last_line(date=date, file_path=file_path) 655 | out_total.close() 656 | with open(file_path, "r") as file_read: 657 | row_count = sum(1 for row in file_read) - 1 658 | file_read.close() 659 | with open(file_path, "a") as out_total: 660 | out_total.write( 661 | date 662 | + "," 663 | + organization 664 | + "," 665 | + str(self.total_repos) 666 | + "," 667 | + str(members) 668 | + "," 669 | + str(teams) 670 | + "," 671 | + str(len(self.unique_contributors)) 672 | + "," 673 | + str(self.total_contributors) 674 | + "," 675 | + str(self.total_forks) 676 | + "," 677 | + str(self.total_stars) 678 | + "," 679 | + str(self.total_pull_reqs) 680 | + "," 681 | + str(self.total_open_issues) 682 | + "," 683 | + str(self.total_readmes) 684 | + "," 685 | + str(self.total_licenses) 686 | + "," 687 | + str(self.total_pull_reqs_open) 688 | + "," 689 | + str(self.total_pull_reqs_closed) 690 | + "," 691 | + str(self.total_commits) 692 | + "," 693 | + str(row_count) 694 | + "," 695 | + str(self.total_closed_issues) 696 | + "," 697 | + str(self.total_issues) 698 | + "\n" 699 | ) 700 | out_total.close() 701 | 702 | def write_languages(self, file_path="", date=str(datetime.date.today())): 703 | """ 704 | Updates languages.csv file with current data. 705 | """ 706 | self.remove_date(file_path=file_path, date=date) 707 | languages_exists = os.path.isfile(file_path) 708 | with open(file_path, "a") as out_languages: 709 | if not languages_exists: 710 | out_languages.write("date,language,count,size,size_log\n") 711 | languages_sorted = sorted(self.languages_size) 712 | # self.delete_last_line(date=date, file_path=file_path) 713 | for language in languages_sorted: 714 | try: 715 | out_languages.write( 716 | date 717 | + "," 718 | + language 719 | + "," 720 | + str(self.languages[language]) 721 | + "," 722 | + str(self.languages_size[language]) 723 | + "," 724 | + str(math.log10(int(self.languages_size[language]))) 725 | + "\n" 726 | ) 727 | except (TypeError, KeyError): 728 | out_languages.write( 729 | date 730 | + "," 731 | + language 732 | + "," 733 | + str(0) 734 | + "," 735 | + str(self.languages_size[language]) 736 | + "," 737 | + str(math.log10(int(self.languages_size[language]))) 738 | + "\n" 739 | ) 740 | 741 | def checkDir(self, file_path=""): 742 | """ 743 | Checks if a directory exists. If not, it creates one with the specified 744 | file_path. 745 | """ 746 | if not os.path.exists(os.path.dirname(file_path)): 747 | try: 748 | os.makedirs(os.path.dirname(file_path)) 749 | except OSError as e: 750 | if e.errno != errno.EEXIST: 751 | raise 752 | 753 | def remove_date(self, file_path="", date=str(datetime.date.today())): 754 | """ 755 | Removes all rows of the associated date from the given csv file. 756 | Defaults to today. 
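        Rows are streamed through temp.csv, which then replaces the original
        file.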
757 | """ 758 | languages_exists = os.path.isfile(file_path) 759 | if languages_exists: 760 | with open(file_path, "rb") as inp, open("temp.csv", "wb") as out: 761 | writer = csv.writer(out) 762 | for row in csv.reader(inp): 763 | if row[0] != date: 764 | writer.writerow(row) 765 | inp.close() 766 | out.close() 767 | os.remove(file_path) 768 | os.rename("temp.csv", file_path) 769 | 770 | def delete_last_line(self, file_path="", date=str(datetime.date.today())): 771 | """ 772 | The following code was modified from 773 | http://stackoverflow.com/a/10289740 & 774 | http://stackoverflow.com/a/17309010 775 | It essentially will check if the total for the current date already 776 | exists in total.csv. If it does, it just removes the last line. 777 | This is so the script could be run more than once a day and not 778 | create many entries in the total.csv file for the same date. 779 | """ 780 | deleted_line = False 781 | if os.path.isfile(file_path): 782 | with open(file_path, "r+") as file: 783 | reader = csv.reader(file, delimiter=",") 784 | for row in reader: 785 | if date == row[0]: 786 | file.seek(0, os.SEEK_END) 787 | pos = file.tell() - 1 788 | while pos > 0 and file.read(1) != "\n": 789 | pos -= 1 790 | file.seek(pos, os.SEEK_SET) 791 | if pos > 0: 792 | file.seek(pos, os.SEEK_SET) 793 | file.truncate() 794 | deleted_line = True 795 | break 796 | if deleted_line: 797 | file.write("\n") 798 | file.close() 799 | 800 | 801 | if __name__ == "__main__": 802 | my_github = GitHub_LLNL_Stats() 803 | my_github.get_stats() 804 | -------------------------------------------------------------------------------- /scripts/my_repo.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | 4 | class My_Repo: 5 | def __init__(self): 6 | self.date = datetime.date.today() 7 | self.name = "N/A" 8 | self.organization = "N/A" 9 | self.contributors = 0 10 | self.forks = 0 11 | self.stargazers = 0 12 | self.pull_requests = 0 13 | self.pull_requests_open = 0 14 | self.pull_requests_closed = 0 15 | self.issues = 0 16 | self.open_issues = 0 17 | self.closed_issues = 0 18 | self.languages = [] 19 | self.readme = "MISS" 20 | self.license = "MISS" 21 | self.commits = 0 22 | -------------------------------------------------------------------------------- /scripts/org_to_emails.py: -------------------------------------------------------------------------------- 1 | #! 
16 |         emails = {
17 |             c["author"]["email"]
18 |             for e in user.events()
19 |             if e.type == "PushEvent"
20 |             for c in e.payload["commits"]
21 |         }
22 |         emails = {e for e in emails if "@llnl.gov" in e}
23 |         if emails:
24 |             print(f"{user.login}: {','.join(emails)}")
25 |
26 |
27 | if __name__ == "__main__":
28 |     print_org_members_without_2fa()
29 |
--------------------------------------------------------------------------------
/scripts/stars.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 |
 3 | import logging
 4 | import os
 5 | import re
 6 |
 7 | import requests
 8 |
 9 | logging.basicConfig(level=logging.DEBUG)
10 |
11 | github = requests.Session()
12 |
13 | NEXT_LINK_REGEX = re.compile(r'<(\S+)>(?=; rel="next")')
14 |
15 |
16 | def get_stargazers(url, session=None):
17 |     """
18 |     Return a list of the stargazers of a GitHub repo.
19 |
20 |     Includes both the 'starred_at' and 'user' data.
21 |
22 |     :param url: the repo's 'stargazers_url', of the form:
23 |         https://api.github.com/repos/LLNL/spack/stargazers
24 |     """
25 |     headers = {"Accept": "application/vnd.github.v3.star+json"}
26 |     url = url + "?per_page=100&page=%s"
27 |     page = 1
28 |     gazers = []
29 |
30 |     # Fetch 100 stargazers per page until an empty page comes back.
31 |     json_data = github.get(url % page, headers=headers).json()
32 |     while json_data:
33 |         gazers.extend(json_data)
34 |         page += 1
35 |         json_data = github.get(url % page, headers=headers).json()
36 |
37 |     return gazers
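38 |
39 |
40 | # NEXT_LINK_REGEX above could instead drive pagination from the `Link`
41 | # response header that GitHub returns. An illustrative sketch; the name
42 | # get_stargazers_by_link is hypothetical and not used elsewhere:
43 | #
44 | # def get_stargazers_by_link(url):
45 | #     headers = {"Accept": "application/vnd.github.v3.star+json"}
46 | #     next_url = url + "?per_page=100"
47 | #     gazers = []
48 | #     while next_url:
49 | #         response = github.get(next_url, headers=headers)
50 | #         gazers.extend(response.json())
51 | #         match = NEXT_LINK_REGEX.search(response.headers.get("Link", ""))
52 | #         next_url = match.group(1) if match else None
53 | #     return gazers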
54 |
55 |
56 | if __name__ == "__main__":
57 |     if "GITHUB_API_TOKEN" in os.environ:
58 |         auth = "token {}".format(os.environ["GITHUB_API_TOKEN"])
59 |         github.headers["Authorization"] = auth
60 |         # Log that auth is enabled without leaking the token itself.
61 |         logging.info("Using token auth from GITHUB_API_TOKEN")
62 |
63 |     orgs = ["llnl"]
64 |     urls = ("https://api.github.com/orgs/%s/repos?per_page=100" % org for org in orgs)
65 |
66 |     repos = []
67 |     for url in urls:
68 |         repos.extend(github.get(url).json())
69 |
70 |     stargazers = {repo["name"]: [] for repo in repos}
71 |
72 |     for repo in repos:
73 |         stargazers[repo["name"]] = get_stargazers(repo["stargazers_url"])
74 |
75 |         print(repo["name"], len(stargazers[repo["name"]]))
76 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E231, E501, W503
3 |
4 | [isort]
5 | combine_star = true
6 | force_sort_within_sections = true
7 | profile = black
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 |
 4 | from setuptools import find_packages, setup
 5 |
 6 | with open("README.md") as fh:
 7 |     long_description = fh.read()
 8 |
 9 | with open("requirements/production.txt") as fp:
10 |     # Keep only non-empty lines; comment lines are filtered out below.
11 |     lines = [line.strip() for line in fp if line.strip()]
12 |     install_reqs = [line for line in lines if not line.startswith("#")]
13 |
14 | setup(
15 |     name="llnl-scraper",
16 |     version="0.15.0",
17 |     description="Package for extracting software repository metadata",
18 |     long_description=long_description,
19 |     long_description_content_type="text/markdown",
20 |     author="Ian Lee",
21 |     author_email="lee1001@llnl.gov",
22 |     url="https://github.com/llnl/scraper",
23 |     packages=find_packages(),
24 |     install_requires=install_reqs,
25 |     python_requires=">=3.6",
26 |     entry_points={
27 |         "console_scripts": [
28 |             "scraper = scraper.gen_code_gov_json:main",
29 |         ]
30 |     },
31 |     scripts=[
32 |         "scripts/codegov_compute_hours.py",
33 |     ],
34 |     classifiers=[
35 |         "Development Status :: 4 - Beta",
36 |         "Intended Audience :: Developers",
37 |         "Operating System :: OS Independent",
38 |         "Programming Language :: Python",
39 |         "Programming Language :: Python :: 3",
40 |         "Programming Language :: Python :: 3 :: Only",
41 |         "Programming Language :: Python :: 3.6",
42 |         "Programming Language :: Python :: 3.7",
43 |         "Programming Language :: Python :: 3.8",
44 |         "Programming Language :: Python :: 3.9",
45 |         "Programming Language :: Python :: 3.10",
46 |         "Programming Language :: Python :: Implementation :: CPython",
47 |         "Programming Language :: Python :: Implementation :: PyPy",
48 |     ],
49 | )
50 |
--------------------------------------------------------------------------------