├── .github └── workflows │ └── main.yml ├── .gitignore ├── .markdownlint.json ├── LICENSE ├── MANIFEST.in ├── Makefile ├── NOTICE ├── README.md ├── demo.json ├── llnl_config.json ├── requirements.txt ├── requirements ├── dev.txt └── production.txt ├── scraper ├── __init__.py ├── azuredevops │ ├── __init__.py │ └── models.py ├── bitbucket │ └── __init__.py ├── code_gov │ ├── __init__.py │ └── models.py ├── doecode │ └── __init__.py ├── gen_code_gov_json.py ├── github │ ├── __init__.py │ ├── queryManager.py │ └── util.py ├── gitlab │ └── __init__.py ├── tfs │ ├── __init__.py │ └── models.py └── util.py ├── scripts ├── clone_everything.py ├── codegov_compute_hours.py ├── get_stargazers.py ├── get_traffic.py ├── get_users_emails.py ├── get_year_commits.py ├── github_stats.py ├── my_repo.py ├── org_to_emails.py └── stars.py ├── setup.cfg └── setup.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: llnl-scraper 2 | 3 | on: 4 | pull_request: [] 5 | 6 | jobs: 7 | testing: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: 13 | - ubuntu-latest 14 | python-version: 15 | - "3.8" 16 | - "3.9" 17 | - "3.10" 18 | - "3.11" 19 | - "3.12" 20 | - "3.13" 21 | name: Python ${{ matrix.python-version }} Tests 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | architecture: x64 29 | - name: Python Tests 30 | run: | 31 | conda create --quiet --name test pytest 32 | export PATH="/usr/share/miniconda/bin:$PATH" 33 | source activate test 34 | pip install bandit black isort flake8 35 | pip install . 36 | npm install -g markdownlint-cli@0.33.0 37 | make test 38 | scraper -h 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | code.csv 2 | code.json 3 | config.json 4 | build/ 5 | dist/ 6 | venv/ 7 | *.pyc 8 | llnl_scraper.egg-info/ 9 | .vscode/ 10 | -------------------------------------------------------------------------------- /.markdownlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "MD013": false, 3 | "MD014": false 4 | } 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, Lawrence Livermore National Security, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements/*.txt
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | test:
2 | 	bandit -r scraper/
3 | 	flake8 scraper/
4 | 	black --check .
5 | 	isort --check .
6 |
7 | 	markdownlint '**/*.md'
8 | 	pyflakes scraper
9 |
10 | release: test
11 | 	python3 setup.py sdist bdist_wheel
12 |
13 | upload:
14 | 	twine upload --skip-existing dist/*
15 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | This work was produced under the auspices of the U.S. Department of Energy by
2 | Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344.
3 |
4 | This work was prepared as an account of work sponsored by an agency of the
5 | United States Government. Neither the United States Government nor Lawrence
6 | Livermore National Security, LLC, nor any of their employees makes any warranty,
7 | expressed or implied, or assumes any legal liability or responsibility for the
8 | accuracy, completeness, or usefulness of any information, apparatus, product, or
9 | process disclosed, or represents that its use would not infringe privately owned
10 | rights. Reference herein to any specific commercial product, process, or service
11 | by trade name, trademark, manufacturer, or otherwise does not necessarily
12 | constitute or imply its endorsement, recommendation, or favoring by the United
13 | States Government or Lawrence Livermore National Security, LLC. The views and
14 | opinions of authors expressed herein do not necessarily state or reflect those
15 | of the United States Government or Lawrence Livermore National Security, LLC,
16 | and shall not be used for advertising or product endorsement purposes.
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scraper
2 |
3 | Scraper is a tool for scraping and visualizing open source data from various
4 | code hosting platforms, such as GitHub.com, GitHub Enterprise, GitLab.com,
5 | hosted GitLab, and Bitbucket Server.
6 |
7 | ## Getting Started: Code.gov
8 |
9 | [Code.gov](https://code.gov) is a website of the US Federal Government that
10 | gives the public access to metadata about the government's custom-developed
11 | software. The site requires metadata to function, and this Python library can
12 | help supply it!
13 |
14 | To get started, you will need a [GitHub Personal Access
15 | Token](https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/)
16 | to make requests to the GitHub API. This should be set in your environment or
17 | shell `rc` file with the name `GITHUB_API_TOKEN`:
18 |
19 | ```shell
20 | $ export GITHUB_API_TOKEN=XYZ
21 |
22 | $ echo "export GITHUB_API_TOKEN=XYZ" >> ~/.bashrc
23 | ```
24 |
25 | Additionally, to perform the labor-hours estimation, you will need to install
26 | `cloc` into your environment. This is typically done with a [Package
27 | Manager](https://github.com/AlDanial/cloc#install-via-package-manager) such as
28 | `npm` or `homebrew`.
29 |
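For example, either of the following typically works, assuming the corresponding
package manager is already installed (any install method from the cloc
documentation is equally fine):

```shell
# Pick the one matching your package manager
$ npm install -g cloc
$ brew install cloc
```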
30 | Then, to generate a `code.json` file for your agency, you will need a
31 | `config.json` file that specifies the platforms you will connect to and scrape
32 | data from. An example config file can be found in [demo.json](/demo.json). Once
33 | you have your config file, you are ready to install and run the scraper!
34 |
35 | ```shell
36 | # Install Scraper from a local copy of this repository
37 | $ pip install -e .
38 | # OR
39 | # Install Scraper from PyPI
40 | $ pip install llnl-scraper
41 |
42 | # Run Scraper with your config file `config.json`
43 | $ scraper --config config.json
44 | ```
45 |
46 | A full example of the resulting `code.json` file can be [found
47 | here](https://gist.github.com/IanLee1521/b7d7c0c2d8c24b10dd04edd5e8cab6c4).
48 |
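If you prefer to drive Scraper from Python rather than the CLI, the following
minimal sketch mirrors roughly what the command-line entry point does, using
`process_config()` and `force_attributes()` from `scraper.code_gov` (both
included in this repository); treat it as an illustration, not the canonical
interface:

```python
import json

from scraper.code_gov import force_attributes, process_config

# Load the same config.json that the CLI consumes
with open("config.json", encoding="utf-8") as fp:
    config = json.load(fp)

# Inventory all configured platforms, then apply the org / contact /
# permission defaults from the config
metadata = force_attributes(process_config(config), config)

# Metadata.to_json() serializes the Code.gov schema (sorted keys, indented)
with open("code.json", "w", encoding="utf-8") as fp:
    fp.write(metadata.to_json())
```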
49 | ## Config File Options
50 |
51 | The configuration file is a JSON file that specifies which repository platforms
52 | to pull projects from, as well as some settings that can be used to override
53 | incomplete or inaccurate data returned by the scraping.
54 |
55 | The basic structure is:
56 |
57 | ```jsonc
58 | {
59 |     // REQUIRED
60 |     "contact_email": "...", // Used when the contact email cannot be found otherwise
61 |
62 |     // OPTIONAL
63 |     "agency": "...", // Your agency abbreviation here
64 |     "organization": "...", // The organization within the agency
65 |     "permissions": { ... }, // Object containing default values for usageType and exemptionText
66 |
67 |     // Platform configurations, described in more detail below
68 |     "GitHub": [ ... ],
69 |     "GitLab": [ ... ],
70 |     "Bitbucket": [ ... ],
71 | }
72 | ```
73 |
74 | ```jsonc
75 | "GitHub": [
76 |     {
77 |         "url": "https://github.com", // GitHub.com or GitHub Enterprise URL to inventory
78 |         "token": null, // Private token for accessing this GitHub instance
79 |         "public_only": true, // Only inventory public repositories
80 |
81 |         "connect_timeout": 4, // The timeout in seconds for connecting to the server
82 |         "read_timeout": 10, // The timeout in seconds to wait for a response from the server
83 |
84 |         "orgs": [ ... ], // List of organizations to inventory
85 |         "repos": [ ... ], // List of single repositories to inventory
86 |         "exclude": [ ... ] // List of organizations / repositories to exclude from inventory
87 |     }
88 | ],
89 | ```
90 |
91 | ```jsonc
92 | "GitLab": [
93 |     {
94 |         "url": "https://gitlab.com", // GitLab.com or hosted GitLab instance URL to inventory
95 |         "token": null, // Private token for accessing this GitLab instance
96 |         "fetch_languages": false, // Make individual API calls for language metadata. Very slow, so defaults to false. (e.g., for 191 projects on an internal server: 5 seconds when false, 12 minutes 38 seconds when true)
97 |
98 |         "orgs": [ ... ], // List of organizations to inventory
99 |         "repos": [ ... ], // List of single repositories to inventory
100 |         "exclude": [ ... ] // List of groups / repositories to exclude from inventory
101 |     }
102 | ]
103 | ```
104 |
105 | ```jsonc
106 | "Bitbucket": [
107 |     {
108 |         "url": "https://bitbucket.internal", // Base URL for a Bitbucket Server instance
109 |         "username": "", // Username to authenticate with
110 |         "password": "", // Password to authenticate with
111 |         "token": "", // Token to authenticate with; if supplied, username and password are ignored
112 |
113 |         "exclude": [ ... ] // List of projects / repositories to exclude from inventory
114 |     }
115 | ]
116 | ```
117 |
118 | ```jsonc
119 | "TFS": [
120 |     {
121 |         "url": "https://tfs.internal", // Base URL for a Team Foundation Server (TFS) or Visual Studio Team Services (VSTS) instance
122 |         "token": null, // Private token for accessing this TFS instance
123 |
124 |         "exclude": [ ... ] // List of projects / repositories to exclude from inventory
125 |     }
126 | ]
127 | ```
128 |
129 | ```jsonc
130 | "AzureDevOps": [
131 |     {
132 |         "url": "https://dev.azure.com", // Base URL for an Azure DevOps Server or Azure DevOps cloud instance
133 |         "token": null, // Personal Access Token for accessing this ADO instance
134 |         "apiVersion": "", // API version (defaults to "6.1-preview" when unset)
135 |         "exclude": [ ... ] // List of projects to exclude from inventory
136 |     }
137 | ]
138 | ```
139 |
140 | ## License
141 |
142 | Scraper is released under an MIT license. For more details see the
143 | [LICENSE](/LICENSE) file.
144 |
145 | LLNL-CODE-705597
146 |
--------------------------------------------------------------------------------
/demo.json:
--------------------------------------------------------------------------------
1 | {
2 |     "agency": "DOE",
3 |     "organization": "Lawrence Livermore National Laboratory",
4 |     "contact_email": "open-source@llnl.gov",
5 |
6 |     "permissions": {
7 |         "usageType": "exemptByAgencyMission",
8 |         "exemptionText": "This source code resides on a private server and has not been properly evaluated for releasability."
9 | }, 10 | 11 | "GitHub": [ 12 | { 13 | "url": "https://github.com", 14 | "token": null, 15 | "public_only": true, 16 | 17 | "orgs": [ 18 | "LLNL", 19 | "spack", 20 | "xbraid" 21 | ], 22 | "repos": [ 23 | "dun/conman", 24 | "dun/munge" 25 | ], 26 | "exclude": [ 27 | "LLNL", 28 | "spack/spack.io" 29 | ] 30 | } 31 | ], 32 | 33 | "GitLab": [ 34 | { 35 | "url": "https://gitlab.com", 36 | "token": null, 37 | "fetch_languages": false, 38 | 39 | "repos": [ 40 | "IanLee1521/flake8", 41 | "gnachman/iterm2", 42 | "gitlab-org/gitlab-ce" 43 | ], 44 | "exclude": [ 45 | "IanLee1521", 46 | "gitlab-org/gitlab-ce" 47 | ] 48 | } 49 | ] 50 | } 51 | -------------------------------------------------------------------------------- /llnl_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "agency": "DOE", 3 | "organization": "Lawrence Livermore National Laboratory", 4 | "contact_email": "open-source@llnl.gov", 5 | "github_orgs": [ 6 | "chaos", 7 | "esgf", 8 | "flux-framework", 9 | "glvis", 10 | "llnl", 11 | "mfem", 12 | "pruners", 13 | "rose-compiler", 14 | "spack", 15 | "uv-cdat", 16 | "zfsonlinux" 17 | ], 18 | "github_repos": [ 19 | "ceed/laghos", 20 | "dun/conman", 21 | "dun/munge", 22 | "frankieli/icenine", 23 | "hpc/dcp", 24 | "hpc/mpifileutils", 25 | "hpc/openlorenz", 26 | "hpc/spindle" 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/dev.txt 2 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r production.txt 2 | 3 | # Development tools 4 | ipython 5 | twine 6 | 7 | # Testing tools 8 | bandit 9 | black 10 | flake8 11 | isort 12 | pyflakes 13 | safety 14 | -------------------------------------------------------------------------------- /requirements/production.txt: -------------------------------------------------------------------------------- 1 | github3.py>=2.0.0 2 | msrest>=0.6.4 3 | python-dateutil>=2.7.3 4 | python-gitlab>=1.6.0 5 | pytz>=2017.3 6 | requests>=2.16 7 | setuptools>=24.2.0 8 | stashy>=0.3 9 | vsts>=0.1.25 10 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLNL/scraper/4b0efe9cae3d062b0e6b633333e42768d56f8b57/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/azuredevops/__init__.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import os 4 | import re 5 | from typing import List 6 | 7 | import requests 8 | 9 | from scraper.azuredevops.models import AzureDevOpsCollection, AzureDevOpsProject 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class AzureDevOpsClient: 15 | def __init__(self, baseurl, api_version, token=None): 16 | self.baseurl = baseurl 17 | self.api_version = api_version 18 | self.is_cloud_ado = "dev.azure.com" in baseurl 19 | self.session = self._create_client_session(token) 20 | 21 | def get_projects_metadata(self) -> List[AzureDevOpsProject]: 22 | """ 23 | Get metadata for all projects 24 | """ 25 | collections = self._get_all_collections() 26 | return self._get_all_projects(collections) 27 | 28 | 
def _create_client_session(self, token): 29 | """ 30 | Creates the Azure DevOps Client Context with the provided token. 31 | If no token is provided, it will look for the ADO_API_TOKEN environment variable. 32 | """ 33 | if token is None: 34 | token = os.environ.get("ADO_API_TOKEN", None) 35 | 36 | if token is None: 37 | raise RuntimeError("Azure Dev Ops Token was not provided.") 38 | 39 | session = requests.Session() 40 | auth_string = f":{token}" 41 | encoded_auth = base64.b64encode(auth_string.encode("ascii")).decode("ascii") 42 | session.headers.update( 43 | {"Authorization": f"Basic {encoded_auth}", "Accept": "application/json"} 44 | ) 45 | return session 46 | 47 | def _get_all_collections(self) -> List[AzureDevOpsCollection]: 48 | """ 49 | Get all collections from the Azure DevOps API. 50 | """ 51 | collections = [] 52 | 53 | if self.is_cloud_ado: 54 | # For cloud Azure DevOps, get all organizations from the API 55 | profile_url = f"https://app.vssps.visualstudio.com/_apis/profile/profiles/me?api-version={self.api_version}" 56 | profile_response = self.session.get(profile_url) 57 | 58 | if profile_response.status_code == 200: 59 | profile = profile_response.json() 60 | 61 | # Get user's organizations/accounts 62 | accounts_url = f"https://app.vssps.visualstudio.com/_apis/accounts?memberId={profile['id']}&api-version={self.api_version}" 63 | accounts_response = self.session.get(accounts_url) 64 | 65 | if accounts_response.status_code == 200: 66 | accounts_json = accounts_response.json() 67 | 68 | if accounts_json.get("value") and len(accounts_json["value"]) > 0: 69 | for org in accounts_json["value"]: 70 | collections.append( 71 | AzureDevOpsCollection( 72 | id=org["accountId"], 73 | name=org["accountName"], 74 | url=f"https://dev.azure.com/{org['accountName']}", 75 | ) 76 | ) 77 | logger.debug( 78 | f"Found cloud organization: {org['accountName']}" 79 | ) 80 | else: 81 | logger.warning("No organizations found with your access token.") 82 | 83 | # Fallback: Try to extract organization from baseAddress 84 | org_name = self.baseurl.rstrip("/").split("/")[-1] 85 | if org_name and org_name != "dev.azure.com": 86 | collections.append( 87 | AzureDevOpsCollection( 88 | id=org_name, 89 | name=org_name, 90 | url=f"https://dev.azure.com/{org_name}", 91 | ) 92 | ) 93 | logger.debug( 94 | f"Using organization from base address: {org_name}" 95 | ) 96 | else: 97 | raise RuntimeError( 98 | f"Failed to retrieve organizations. Status Code: {accounts_response.status_code} Response: {accounts_response.text}" 99 | ) 100 | else: 101 | logger.warning( 102 | f"Failed to retrieve user profile: {profile_response.status_code} Response: {profile_response.text}" 103 | ) 104 | logger.warning( 105 | "Falling back to base address for organization extraction." 106 | ) 107 | # Fallback: Try to extract organization from baseAddress 108 | org_name = self.baseurl.rstrip("/").split("/")[-1] 109 | if org_name and org_name != "dev.azure.com": 110 | collections.append( 111 | AzureDevOpsCollection( 112 | id=org_name, 113 | name=org_name, 114 | url=f"https://dev.azure.com/{org_name}", 115 | ) 116 | ) 117 | logger.debug(f"Using organization from base address: {org_name}") 118 | else: 119 | raise RuntimeError( 120 | "Could not determine organization. Please specify organization in the baseurl." 
121 | ) 122 | else: 123 | # For on-premises, get collections via API 124 | collections_url = f"{self.baseurl}/_apis/projectcollections?api-version={self.api_version}" 125 | collections_response = self.session.get(collections_url) 126 | 127 | if collections_response.status_code == 200: 128 | collections_json = collections_response.json() 129 | for collection in collections_json.get("value", []): 130 | collections.append( 131 | AzureDevOpsCollection( 132 | id=collection["id"], 133 | name=collection["name"], 134 | url=collection["url"], 135 | ) 136 | ) 137 | else: 138 | raise RuntimeError( 139 | f"Failed to retrieve collections. Status Code: {collections_response.status_code} Response: {collections_response.text}" 140 | ) 141 | 142 | logger.debug(f"Found {len(collections)} collections/organizations") 143 | return collections 144 | 145 | def _get_web_url_from_api_url(self, api_url, project_name): 146 | """ 147 | Convert an API URL to a web-accessible URL 148 | 149 | Parameters: 150 | api_url (str): API URL for the project 151 | project_name (str): Name of the project 152 | 153 | Returns: 154 | str: Web URL for the project 155 | """ 156 | if self.is_cloud_ado: 157 | # For cloud ADO, convert URL like: 158 | # https://dev.azure.com/org-name/_apis/projects/project-id 159 | # to: https://dev.azure.com/org-name/project-name 160 | match = re.search(r"https://dev\.azure\.com/([^/]+)", api_url) 161 | if match: 162 | org_name = match.group(1) 163 | return f"https://dev.azure.com/{org_name}/{project_name}" 164 | else: 165 | # For on-premises ADO, convert URL like: 166 | # https://server/collection/_apis/projects/project-id 167 | # to: https://server/collection/project-name 168 | base_url = api_url.split("/_apis/projects")[0] 169 | return f"{base_url}/{project_name}" 170 | 171 | def _get_repo_web_url(self, api_url, project_name): 172 | """ 173 | Generate web-accessible URL for repositories page 174 | 175 | Parameters: 176 | api_url (str): API URL for the project 177 | project_name (str): Name of the project 178 | 179 | Returns: 180 | str: Web URL for the project's repositories page 181 | """ 182 | project_web_url = self._get_web_url_from_api_url(api_url, project_name) 183 | return f"{project_web_url}/_git" 184 | 185 | def _get_all_projects( 186 | self, collections: List[AzureDevOpsCollection] = None 187 | ) -> List[AzureDevOpsProject]: 188 | """ 189 | Get all projects from the provided collections or from all collections if none are provided 190 | 191 | Parameters: 192 | collections (List[AzureDevOpsCollection]): List of collections to get projects from 193 | """ 194 | if collections is None: 195 | collections = self._get_all_collections() 196 | 197 | projects = [] 198 | for collection in collections: 199 | collection_url = ( 200 | f"https://dev.azure.com/{collection.name}" 201 | if self.is_cloud_ado 202 | else f"{self.baseurl}/{collection.name}" 203 | ) 204 | logger.debug("Getting projects from collection: %s", collection_url) 205 | 206 | top = 100 207 | project_skip = 0 208 | total_projects = 0 209 | has_more_projects = True 210 | 211 | while has_more_projects: 212 | url = f"{collection_url}/_apis/projects?$top={top}&$skip={project_skip}&api-version={self.api_version}&includeCapabilities=true" 213 | 214 | response = self.session.get(url) 215 | if response.status_code != 200: 216 | raise RuntimeError( 217 | f"Failed to get projects: {response.status_code}" 218 | ) 219 | 220 | result = response.json() 221 | for project in result.get("value", []): 222 | project_api_url = project.get("url") 223 | 
project_name = project.get("name") 224 | 225 | project_web_url = self._get_web_url_from_api_url( 226 | project_api_url, project_name 227 | ) 228 | repo_web_url = self._get_repo_web_url(project_api_url, project_name) 229 | 230 | projects.append( 231 | AzureDevOpsProject( 232 | project_id=project.get("id"), 233 | project_name=project_name, 234 | project_description=project.get("description") or "", 235 | project_url=project_web_url, 236 | repo_url=repo_web_url, 237 | project_create_time="", # Not provided in API response 238 | project_last_update_time=project.get("lastUpdateTime"), 239 | collection_or_org_name=collection.name, 240 | ) 241 | ) 242 | 243 | count = len(result.get("value", [])) 244 | total_projects += count 245 | project_skip += top 246 | 247 | has_more_projects = count == top 248 | 249 | return projects 250 | -------------------------------------------------------------------------------- /scraper/azuredevops/models.py: -------------------------------------------------------------------------------- 1 | class AzureDevOpsCollection: 2 | def __init__(self, id="", name="", url=""): 3 | self.id = id 4 | self.name = name 5 | self.url = url 6 | 7 | 8 | class AzureDevOpsProject: 9 | def __init__( 10 | self, 11 | project_id="", 12 | project_name="", 13 | project_description="", 14 | project_url="", 15 | repo_url="", 16 | project_create_time="", 17 | project_last_update_time="", 18 | collection_or_org_name="", 19 | ): 20 | self.project_id = project_id 21 | self.project_name = project_name 22 | self.project_description = project_description 23 | self.project_url = project_url 24 | self.repo_url = repo_url 25 | self.project_create_time = project_create_time 26 | self.project_last_update_time = project_last_update_time 27 | self.collection_or_org_name = collection_or_org_name 28 | -------------------------------------------------------------------------------- /scraper/bitbucket/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import stashy 5 | from stashy.client import Stash 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def connect(url, username=None, password=None, token=None): 11 | """ 12 | Return a connected Bitbucket session 13 | """ 14 | if token is not None: 15 | bb_session = Stash(url, token=token) 16 | logger.info("Connected to: %s with token", url) 17 | else: 18 | bb_session = stashy.connect(url, username, password) 19 | logger.info("Connected to: %s as username %s", url, username) 20 | 21 | return bb_session 22 | 23 | 24 | def all_repos(bb_session): 25 | """ 26 | Yields Stashy repo dictionary objects for all repos in Bitbucket 27 | """ 28 | 29 | for repo in bb_session.repos.all(): 30 | all_commits = sorted( 31 | bb_session.projects[repo["project"]["key"]] 32 | .repos[repo["name"]] 33 | .commits(None), 34 | key=lambda x: x["authorTimestamp"], 35 | ) 36 | if all_commits: 37 | repo["created"] = ( 38 | datetime.datetime.fromtimestamp( 39 | all_commits[0]["authorTimestamp"] / 1000 40 | ) 41 | .date() 42 | .isoformat() 43 | ) 44 | repo["lastModified"] = ( 45 | datetime.datetime.fromtimestamp( 46 | all_commits[-1]["authorTimestamp"] / 1000 47 | ) 48 | .date() 49 | .isoformat() 50 | ) 51 | yield repo 52 | -------------------------------------------------------------------------------- /scraper/code_gov/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | 6 | from scraper import bitbucket, doecode, github, gitlab, tfs 7 | from scraper.azuredevops import AzureDevOpsClient 8 | from scraper.code_gov.models import Metadata, Project 9 | from scraper.github import gov_orgs 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def process_config(config): 15 | """ 16 | Master function to process a Scraper config file 17 | 18 | Returns a Code.gov Metadata file 19 | """ 20 | 21 | agency = config.get("agency", "UNKNOWN") 22 | logger.debug("Agency: %s", agency) 23 | 24 | method = config.get("method", "other") 25 | logger.debug("Inventory Method: %s", method) 26 | 27 | compute_labor_hours = config.get("compute_labor_hours", True) 28 | 29 | if config.get("contact_email", None) is None: 30 | # A default contact email is required to handle the (frequent) case 31 | # where a project / repository has no available contact email. 32 | logger.warning('Config file should contain a "contact_email"') 33 | 34 | logger.debug("Creating inventory from config: %s", config) 35 | code_gov_metadata = Metadata(agency, method) 36 | 37 | # Parse config for GitHub repositories 38 | github_instances = config.get("GitHub", []) 39 | if config.get("github_gov_orgs", False): 40 | github_instances.append({"url": "https://github.com", "orgs": gov_orgs()}) 41 | for instance in github_instances: 42 | timeouts = {} 43 | url = instance.get("url", "https://github.com") 44 | orgs = instance.get("orgs", []) 45 | repos = instance.get("repos", []) 46 | public_only = instance.get("public_only", True) 47 | excluded = instance.get("exclude", []) 48 | token = instance.get("token", None) 49 | connect_timeout = instance.get("connect_timeout", None) 50 | read_timeout = instance.get("read_timeout", None) 51 | 52 | if connect_timeout is not None: 53 | timeouts["default_connect_timeout"] = connect_timeout 54 | if read_timeout is not None: 55 | timeouts["default_read_timeout"] = read_timeout 56 | 57 | gh_session = github.connect(url, token, timeouts) 58 | 59 | for repo in github.query_repos(gh_session, orgs, repos, public_only): 60 | if repo.owner.login in excluded or repo.full_name in excluded: 61 | logger.info("Excluding: %s", repo.full_name) 62 | continue 63 | 64 | code_gov_project = Project.from_github3( 65 | repo, labor_hours=compute_labor_hours 66 | ) 67 | code_gov_metadata["releases"].append(code_gov_project) 68 | 69 | # Parse config for GitLab repositories 70 | gitlab_instances = config.get("GitLab", []) 71 | for instance in gitlab_instances: 72 | url = instance.get("url") 73 | # orgs = instance.get('orgs', []) 74 | repos = instance.get("repos", []) 75 | # public_only = instance.get('public_only', True) 76 | excluded = instance.get("exclude", []) 77 | token = instance.get("token", None) 78 | fetch_languages = instance.get("fetch_languages", False) 79 | 80 | gl_session = gitlab.connect(url, token) 81 | 82 | for repo in gitlab.query_repos(gl_session, repos): 83 | namespace = repo.namespace["path"] 84 | path_with_namespace = repo.path_with_namespace 85 | if namespace in excluded or path_with_namespace in excluded: 86 | logger.info("Excluding: %s", repo.path_with_namespace) 87 | continue 88 | 89 | code_gov_project = Project.from_gitlab( 90 | repo, labor_hours=compute_labor_hours, fetch_languages=fetch_languages 91 | ) 92 | code_gov_metadata["releases"].append(code_gov_project) 93 | 94 | # Parse config for Bitbucket repositories 95 | bitbucket_instances = config.get("Bitbucket", []) 96 | for instance in 
bitbucket_instances: 97 | url = instance.get("url") 98 | # orgs = instance.get('orgs', None) 99 | # public_only = instance.get('public_only', True) 100 | username = instance.get("username", None) 101 | password = instance.get("password", None) 102 | token = instance.get("token", None) 103 | excluded = instance.get("exclude", []) 104 | 105 | bb_session = bitbucket.connect(url, username, password, token) 106 | 107 | for repo in bitbucket.all_repos(bb_session): 108 | project = repo["project"]["key"] 109 | project_repo = "%s/%s" % (project, repo["slug"]) 110 | if project in excluded or project_repo in excluded: 111 | logger.info("Excluding: %s", project_repo) 112 | continue 113 | 114 | code_gov_project = Project.from_stashy( 115 | repo, labor_hours=compute_labor_hours 116 | ) 117 | code_gov_metadata["releases"].append(code_gov_project) 118 | 119 | # Parse config for TFS repositories 120 | tfs_instances = config.get("TFS", []) 121 | for instance in tfs_instances: 122 | url = instance.get("url") 123 | token = instance.get("token", None) 124 | 125 | projects = tfs.get_projects_metadata(url, token) 126 | for project in projects: 127 | code_gov_project = Project.from_tfs( 128 | project, labor_hours=compute_labor_hours 129 | ) 130 | code_gov_metadata["releases"].append(code_gov_project) 131 | 132 | # parse config for AzureDevOps repositories 133 | ado_instances = config.get("AzureDevOps", []) 134 | for instance in ado_instances: 135 | url = instance.get("url") 136 | token = instance.get("token", None) 137 | api_version = instance.get("apiVersion", "6.1-preview") 138 | excluded = instance.get("exclude", []) 139 | 140 | ado_client = AzureDevOpsClient(url, api_version, token) 141 | projects = ado_client.get_projects_metadata() 142 | for project in projects: 143 | if project.project_name in excluded: 144 | logger.info("Excluding: %s", project.project_name) 145 | continue 146 | 147 | code_gov_project = Project.from_ado( 148 | project, labor_hours=compute_labor_hours 149 | ) 150 | code_gov_metadata["releases"].append(code_gov_project) 151 | 152 | # Handle parsing of DOE CODE records 153 | 154 | doecode_config = config.get("DOE CODE", {}) 155 | doecode_json = doecode_config.get("json", None) 156 | doecode_url = doecode_config.get("url", None) 157 | doecode_key = doecode_config.get("api_key", None) 158 | 159 | for record in doecode.process(doecode_json, doecode_url, doecode_key): 160 | code_gov_project = Project.from_doecode(record) 161 | code_gov_metadata["releases"].append(code_gov_project) 162 | 163 | return code_gov_metadata 164 | 165 | 166 | def force_attributes(metadata, config): 167 | """ 168 | Forces certain fields in the Code.gov Metadata json 169 | """ 170 | 171 | organization = config.get("organization", "") 172 | logger.debug("Organization: %s", organization) 173 | 174 | contact_email = config.get("contact_email") 175 | logger.debug("Contact Email: %s", contact_email) 176 | 177 | permissions = config.get("permissions", {}) 178 | default_usage = permissions.get("usageType", "") 179 | default_exemption_text = permissions.get("exemptionText", "") 180 | logger.debug("Default usageType: %s", default_usage) 181 | logger.debug("Default exemptionText: %s", default_exemption_text) 182 | 183 | # Force certain fields 184 | if organization: 185 | logger.debug("Forcing Organization to: %s", organization) 186 | 187 | if contact_email: 188 | logger.debug("Forcing Contact Email to: %s", contact_email) 189 | 190 | for release in metadata["releases"]: 191 | if organization: 192 | release["organization"] = 
organization 193 | 194 | if contact_email: 195 | release["contact"]["email"] = contact_email 196 | 197 | if "licenses" not in release["permissions"]: 198 | release["permissions"]["licenses"] = None 199 | 200 | if "description" not in release: 201 | release["description"] = "No description available..." 202 | 203 | if "usageType" not in release["permissions"]: 204 | release["permissions"]["usageType"] = default_usage 205 | release["permissions"]["exemptionText"] = default_exemption_text 206 | 207 | return metadata 208 | -------------------------------------------------------------------------------- /scraper/code_gov/models.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import json 5 | import logging 6 | 7 | from dateutil.parser import parse as date_parse 8 | import github3 9 | import gitlab 10 | from requests.utils import requote_uri 11 | 12 | from scraper.azuredevops.models import AzureDevOpsProject 13 | from scraper.github.util import _license_obj 14 | from scraper.util import _prune_dict_null_str, labor_hours_from_url 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | POLICY_START_DATE = date_parse("2016-08-08T00:00:00Z") 19 | 20 | 21 | class Metadata(dict): 22 | """ 23 | Defines the entire contents of a Code.gov 's code.json file 24 | 25 | For details: https://code.gov/#/policy-guide/docs/compliance/inventory-code 26 | """ 27 | 28 | def __init__(self, agency, method, other_method=""): 29 | # *version: [string] The Code.gov metadata schema version 30 | self["version"] = "2.0.0" 31 | 32 | # *agency: [string] The agency acronym for Clinger Cohen Act agency, e.g. "GSA" or "DOD" 33 | self["agency"] = agency.upper() 34 | 35 | # *measurementType: [object] The description of the open source measurement method 36 | # *method [enum]: An enumerated list of methods for measuring the open source requirement 37 | # cost: Cost of software development. 38 | # systems: System certification and accreditation boundaries. 39 | # projects: A complete software solution / project. 40 | # modules: A self-contained module from a software solution. 41 | # linesOfCode: Source lines of code. 42 | # other: Another measurement method not referenced above. 43 | # ifOther: [string] A one- or two- sentence description of the measurement type used, if 'other' is selected as the value of 'method' field. 44 | self["measurementType"] = {"method": method} 45 | if method == "other": 46 | self["measurementType"]["ifOther"] = other_method 47 | 48 | # The list of source code releases 49 | self["releases"] = [] 50 | 51 | def to_json(self): 52 | return json.dumps(self, indent=4, sort_keys=True, ensure_ascii=False) 53 | 54 | 55 | class Project(dict): 56 | """ 57 | Python representation of Code.gov Metadata Schema 58 | 59 | For details: https://code.gov/#/policy-guide/docs/compliance/inventory-code 60 | """ 61 | 62 | def __init__(self): 63 | # -- REQUIRED FIELDS -- 64 | 65 | # *name: [string] The name of the release 66 | self["name"] = "" 67 | 68 | # repository: [string] The URL of the public project repository 69 | self["repositoryURL"] = "" 70 | 71 | # *description: [string] A description of the project 72 | self["description"] = "" 73 | 74 | # *permissions: [object] A description of the usage/restrictions regarding the release 75 | # * licenses: [null or array of objects] An object containing license details, if available. If not, null should be used. 
76 | # URL: [string] The URL of the release license, if available 77 | # name: [string] An abbreviation for the name of the license 78 | # * usageType: [enum] 79 | # openSource: Open source 80 | # governmentWideReuse: Government-wide reuse. 81 | # exemptByLaw: The sharing of the source code is restricted by law or regulation, including—but not limited to—patent or intellectual property law, the Export Asset Regulations, the International Traffic in Arms Regulation, and the Federal laws and regulations governing classified information. 82 | # exemptByNationalSecurity: The sharing of the source code would create an identifiable risk to the detriment of national security, confidentiality of Government information, or individual privacy. 83 | # exemptByAgencySystem: The sharing of the source code would create an identifiable risk to the stability, security, or integrity of the agency’s systems or personnel. 84 | # exemptByAgencyMission: The sharing of the source code would create an identifiable risk to agency mission, programs, or operations. 85 | # exemptByCIO: The CIO believes it is in the national interest to exempt sharing the source code. 86 | # exemptByPolicyDate: The release was created prior to the M-16-21 policy (August 8, 2016). 87 | # exemptionText: [null or string] 88 | self["permissions"] = {"licenses": None, "usageType": "", "exemptionText": None} 89 | 90 | # *laborHours: [number]: An estimate of total labor hours spent by your organization/component across all versions of this release. This includes labor performed by federal employees and contractors. 91 | self["laborHours"] = 0 92 | 93 | # *tags: [array] An array of keywords that will be helpful in discovering and searching for the release. 94 | self["tags"] = [] 95 | 96 | # *contact: [object] Information about contacting the project. 97 | # *email: [string] An email address to contact the project. 98 | # name: [string] The name of a contact or department for the project 99 | # twitter: [string] The username of the project's Twitter account 100 | # phone: [string] The phone number to contact a project. 101 | self["contact"] = {"email": ""} 102 | # TODO: Currently, the GSA Harvester requires these fields to not be present if they are empty 103 | # 'name': '', 104 | # 'URL': '', 105 | # 'phone': '', 106 | # } 107 | 108 | # -- OPTIONAL FIELDS -- 109 | 110 | # version: [string] The version for this release. For example, "1.0.0." 111 | # self['version'] = '' 112 | 113 | # organization: [string] The organization or component within the agency that the releases listed belong to. For example, "18F" or "Navy." 114 | # self['organization'] = '' 115 | 116 | # status: [string] The development status of the project 117 | # "Ideation" - brainstorming phase. 118 | # "Development" - a release is still in development. 119 | # "Alpha" - initial prototyping phase and internal testing. 120 | # "Beta" - a project is being tested in public. 121 | # "Release Candidate" - a release is nearly ready for production. 122 | # "Production" - finished project, with development and maintenance ongoing. 123 | # "Archival" - finished project, but no longer actively maintained. 124 | # self['status'] = '' 125 | 126 | # vcs: [string] A lowercase string with the name of the Version Control System in use on the project. 127 | # self['vcs'] = '' 128 | 129 | # homepageURL: [string] The URL of the public release homepage. 130 | # self['homepageURL'] = '' 131 | 132 | # downloadURL: [string] The URL where a distribution of the release can be found. 
133 | # self['downloadURL'] = '' 134 | 135 | # disclaimerText: [string] Short paragraph that includes disclaimer language to accompany the release. 136 | # self['disclaimerText'] = '' 137 | 138 | # disclaimerURL: [string] The URL where disclaimer language regarding the release can be found. 139 | # self['disclaimerURL'] = '' 140 | 141 | # languages: [array] A list of strings with the names of the programming languages in use on the release. 142 | # self['languages'] = [] 143 | 144 | # partners: [array] An array of objects including an acronym for each agency partnering on the release and the contact email at such agency. 145 | # name: [string] The acronym describing the partner agency. 146 | # email: [string] The email address for the point of contact at the partner agency. 147 | # self['partners'] = [] 148 | 149 | # relatedCode: [array] An array of affiliated government repositories that may be a part of the same project. For example, relatedCode for 'code-gov-web' would include 'code-gov-api' and 'code-gov-tools'. 150 | # name: [string] The name of the code repository, project, library or release. 151 | # URL: [string] The URL where the code repository, project, library or release can be found. 152 | # isGovernmentRepo: [boolean] True or False. Is the code repository owned or managed by a federal agency? 153 | # self['relatedCode'] = [] 154 | 155 | # reusedCode: [array] An array of government source code, libraries, frameworks, APIs, platforms or other software used in this release. For example: US Web Design Standards, cloud.gov, Federalist, Digital Services Playbook, Analytics Reporter. 156 | # name: [string] The name of the software used in this release. 157 | # URL: [string] The URL where the software can be found. 158 | # self['reusedCode'] = [] 159 | 160 | # date: [object] A date object describing the release. 161 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 162 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 163 | # metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format. 
164 | # self['date'] = { 165 | # 'created': '', 166 | # 'lastModified': '', 167 | # 'metadataLastUpdated': '' 168 | # } 169 | 170 | @classmethod 171 | def from_github3(klass, repository, labor_hours=True): 172 | """ 173 | Create CodeGovProject object from github3 Repository object 174 | """ 175 | if not isinstance(repository, github3.repos.repo._Repository): 176 | raise TypeError("Repository must be a github3 Repository object") 177 | 178 | logger.info("Processing: %s", repository.full_name) 179 | 180 | project = klass() 181 | 182 | logger.debug("GitHub3: repository=%s", repository) 183 | 184 | # -- REQUIRED FIELDS -- 185 | 186 | project["name"] = repository.name 187 | project["repositoryURL"] = repository.clone_url 188 | project["description"] = repository.description 189 | 190 | try: 191 | repo_license = repository.license() 192 | except github3.exceptions.NotFoundError: 193 | logger.debug("no license found for repo=%s", repository) 194 | repo_license = None 195 | 196 | if repo_license: 197 | license_obj = repo_license.license 198 | if license_obj: 199 | logger.debug( 200 | "license spdx=%s; url=%s", license_obj.spdx_id, license_obj.url 201 | ) 202 | if license_obj.url is None: 203 | project["permissions"]["licenses"] = [{"name": license_obj.spdx_id}] 204 | else: 205 | project["permissions"]["licenses"] = [ 206 | {"URL": license_obj.url, "name": license_obj.spdx_id} 207 | ] 208 | else: 209 | project["permissions"]["licenses"] = None 210 | 211 | public_server = repository.html_url.startswith("https://github.com") 212 | if not repository.private and public_server: 213 | project["permissions"]["usageType"] = "openSource" 214 | elif date_parse(repository.created_at) < POLICY_START_DATE: 215 | project["permissions"]["usageType"] = "exemptByPolicyDate" 216 | 217 | if labor_hours: 218 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 219 | else: 220 | project["laborHours"] = 0 221 | 222 | project["tags"] = ["github"] 223 | old_accept = repository.session.headers["Accept"] 224 | repository.session.headers["Accept"] = ( 225 | "application/vnd.github.mercy-preview+json" 226 | ) 227 | topics = repository._get(repository.url + "/topics").json() 228 | project["tags"].extend(topics.get("names", [])) 229 | repository.session.headers["Accept"] = old_accept 230 | 231 | # Hacky way to get an Organization object back with GitHub3.py >= 1.2.0 232 | owner_url = repository.owner.url 233 | owner_api_response = repository._get(owner_url) 234 | organization = repository._json(owner_api_response, 200) 235 | project["contact"]["email"] = organization["email"] 236 | project["contact"]["URL"] = organization["html_url"] 237 | 238 | # -- OPTIONAL FIELDS -- 239 | 240 | # project['version'] = '' 241 | 242 | project["organization"] = organization["name"] 243 | 244 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 245 | project["status"] = "Development" 246 | 247 | project["vcs"] = "git" 248 | 249 | project["homepageURL"] = repository.html_url 250 | 251 | project["downloadURL"] = repository.downloads_url 252 | 253 | project["languages"] = [lang for lang, _ in repository.languages()] 254 | 255 | # project['partners'] = [] 256 | 257 | # project['relatedCode'] = [] 258 | 259 | # project['reusedCode'] = [] 260 | 261 | # date: [object] A date object describing the release. 262 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 
263 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 264 | # metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format. 265 | try: 266 | created_at = repository.created_at.date() 267 | except AttributeError: 268 | created_at = date_parse(repository.created_at).date() 269 | try: 270 | updated_at = repository.updated_at.date() 271 | except AttributeError: 272 | updated_at = date_parse(repository.updated_at).date() 273 | 274 | project["date"] = { 275 | "created": created_at.isoformat(), 276 | "lastModified": updated_at.isoformat(), 277 | "metadataLastUpdated": "", 278 | } 279 | 280 | _prune_dict_null_str(project) 281 | 282 | return project 283 | 284 | @classmethod 285 | def from_gitlab(klass, repository, labor_hours=True, fetch_languages=False): 286 | """ 287 | Create CodeGovProject object from GitLab Repository 288 | """ 289 | if not isinstance(repository, gitlab.v4.objects.Project): 290 | raise TypeError("Repository must be a gitlab Repository object") 291 | 292 | project = klass() 293 | 294 | logger.debug( 295 | "GitLab: repository_id=%d path_with_namespace=%s", 296 | repository.id, 297 | repository.path_with_namespace, 298 | ) 299 | 300 | # -- REQUIRED FIELDS -- 301 | 302 | project["name"] = repository.name 303 | project["repositoryURL"] = repository.http_url_to_repo 304 | project["description"] = repository.description 305 | 306 | # TODO: Update licenses from GitLab API 307 | project["permissions"]["licenses"] = None 308 | 309 | web_url = repository.web_url 310 | public_server = web_url.startswith("https://gitlab.com") 311 | 312 | if repository.visibility in ("public") and public_server: 313 | project["permissions"]["usageType"] = "openSource" 314 | elif date_parse(repository.created_at) < POLICY_START_DATE: 315 | project["permissions"]["usageType"] = "exemptByPolicyDate" 316 | 317 | if labor_hours: 318 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 319 | else: 320 | project["laborHours"] = 0 321 | 322 | project["tags"] = ["gitlab"] + repository.tag_list 323 | 324 | project["contact"] = {"email": "", "URL": web_url} 325 | 326 | # -- OPTIONAL FIELDS -- 327 | 328 | # project['version'] = '' 329 | 330 | project["organization"] = repository.namespace["name"] 331 | 332 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 333 | project["status"] = "Development" 334 | 335 | project["vcs"] = "git" 336 | 337 | project["homepageURL"] = repository.web_url 338 | 339 | api_url = repository.manager.gitlab._url 340 | archive_suffix = "/projects/%s/repository/archive" % repository.get_id() 341 | project["downloadURL"] = api_url + archive_suffix 342 | 343 | # project['languages'] = [lang for lang, _ in repository.languages()] 344 | 345 | if fetch_languages: 346 | project["languages"] = [*repository.languages()] 347 | 348 | # project['partners'] = [] 349 | # project['relatedCode'] = [] 350 | # project['reusedCode'] = [] 351 | 352 | project["date"] = { 353 | "created": date_parse(repository.created_at).date().isoformat(), 354 | "lastModified": date_parse(repository.last_activity_at).date().isoformat(), 355 | "metadataLastUpdated": "", 356 | } 357 | 358 | _prune_dict_null_str(project) 359 | 360 | return project 361 | 362 | @classmethod 363 | def from_stashy(klass, repository, labor_hours=True): 364 | """ 365 | Handles crafting Code.gov Project for Bitbucket Server repositories 366 | """ 367 | # if not isinstance(repository, 
stashy.repos.Repository): 368 | # raise TypeError('Repository must be a stashy Repository object') 369 | if not isinstance(repository, dict): 370 | raise TypeError("Repository must be a dict") 371 | 372 | project = klass() 373 | 374 | logger.debug( 375 | "Stashy: project_key=%s repository_slug=%s", 376 | repository["name"], 377 | repository["project"]["key"], 378 | ) 379 | 380 | # -- REQUIRED FIELDS -- 381 | 382 | project["name"] = repository["name"] 383 | 384 | clone_urls = [clone["href"] for clone in repository["links"]["clone"]] 385 | for url in clone_urls: 386 | # Only rely on SSH Urls for repository urls 387 | if url.startswith("ssh://"): 388 | project["repositoryURL"] = url 389 | break 390 | 391 | description = repository["project"].get("description", "") 392 | if description: 393 | project["description"] = "Project description: %s" % description 394 | 395 | project["permissions"]["licenses"] = None 396 | 397 | web_url = repository["links"]["self"][0]["href"] 398 | public_server = web_url.startswith("https://bitbucket.org") 399 | if repository["public"] and public_server: 400 | project["permissions"]["usageType"] = "openSource" 401 | 402 | if labor_hours: 403 | project["laborHours"] = labor_hours_from_url(project["repositoryURL"]) 404 | else: 405 | project["laborHours"] = 0 406 | 407 | project["tags"] = ["bitbucket"] 408 | 409 | project["contact"]["email"] = "" 410 | project["contact"]["URL"] = repository["links"]["self"][0]["href"] 411 | 412 | # -- OPTIONAL FIELDS -- 413 | 414 | # project['version'] = '' 415 | 416 | # project['organization'] = organization.name 417 | 418 | # TODO: Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 419 | project["status"] = "Development" 420 | 421 | project["vcs"] = repository["scmId"] 422 | 423 | project["homepageURL"] = repository["links"]["self"][0]["href"] 424 | 425 | # project['downloadURL'] = 426 | 427 | # project['languages'] = 428 | 429 | # project['partners'] = [] 430 | 431 | # project['relatedCode'] = [] 432 | 433 | # project['reusedCode'] = [] 434 | 435 | # date: [object] A date object describing the release. Empty if repo has no commits. 436 | # created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format. 437 | # lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format. 
438 | if repository.get("created", None): 439 | project["date"] = { 440 | "created": repository["created"], 441 | "lastModified": repository["lastModified"], 442 | } 443 | 444 | _prune_dict_null_str(project) 445 | 446 | return project 447 | 448 | @classmethod 449 | def from_doecode(klass, record): 450 | """ 451 | Create CodeGovProject object from DOE CODE record 452 | 453 | Handles crafting Code.gov Project 454 | """ 455 | if not isinstance(record, dict): 456 | raise TypeError("`record` must be a dict") 457 | 458 | project = klass() 459 | 460 | # -- REQUIRED FIELDS -- 461 | 462 | project["name"] = record["software_title"] 463 | logger.debug('DOE CODE: software_title="%s"', record["software_title"]) 464 | 465 | link = record.get("repository_link", "") 466 | if not link: 467 | link = record.get("landing_page") 468 | logger.warning("DOE CODE: No repositoryURL, using landing_page: %s", link) 469 | 470 | project["repositoryURL"] = link 471 | 472 | project["description"] = record["description"] 473 | 474 | licenses = set(record["licenses"]) 475 | licenses.discard(None) 476 | logger.debug("DOE CODE: licenses=%s", licenses) 477 | 478 | license_objects = [] 479 | if "Other" in licenses: 480 | licenses.remove("Other") 481 | license_objects = [{"name": "Other", "URL": record["proprietary_url"]}] 482 | 483 | if licenses: 484 | license_objects.extend( 485 | [_license_obj(license_name) for license_name in licenses] 486 | ) 487 | 488 | project["permissions"]["licenses"] = license_objects 489 | 490 | if record["open_source"]: 491 | usage_type = "openSource" 492 | else: 493 | usage_type = "exemptByLaw" 494 | project["permissions"][ 495 | "exemptionText" 496 | ] = "This source code is restricted by patent and / or intellectual property law." 497 | 498 | project["permissions"]["usageType"] = usage_type 499 | 500 | labor_hours = record.get("labor_hours") 501 | if labor_hours is not None: 502 | project["laborHours"] = labor_hours 503 | else: 504 | project["laborHours"] = 0 505 | 506 | project["tags"] = ["DOE CODE"] 507 | lab_name = record.get("lab_display_name") 508 | if lab_name is not None: 509 | project["tags"].append(lab_name) 510 | 511 | project["contact"]["email"] = record["owner"] 512 | # project['contact']['URL'] = '' 513 | # project['contact']['name'] = '' 514 | # project['contact']['phone'] = '' 515 | 516 | # -- OPTIONAL FIELDS -- 517 | 518 | if "version_number" in record and record["version_number"]: 519 | project["version"] = record["version_number"] 520 | 521 | if lab_name is not None: 522 | project["organization"] = lab_name 523 | 524 | # Currently, can't be an empty string, see: https://github.com/GSA/code-gov-web/issues/370 525 | status = record.get("ever_announced") 526 | if status is None: 527 | raise ValueError('DOE CODE: Unable to determine "ever_announced" value!') 528 | 529 | project["status"] = "Production" if status else "Development" 530 | 531 | vcs = None 532 | link = project["repositoryURL"] 533 | if "github.com" in link: 534 | vcs = "git" 535 | if vcs is None: 536 | logger.debug( 537 | 'DOE CODE: Unable to determine vcs for: name="%s", repositoryURL=%s', 538 | project["name"], 539 | link, 540 | ) 541 | vcs = "" 542 | if vcs: 543 | project["vcs"] = vcs 544 | 545 | url = record.get("landing_page", "") 546 | if url: 547 | project["homepageURL"] = url 548 | 549 | # record['downloadURL'] = '' 550 | 551 | # self['disclaimerText'] = '' 552 | 553 | # self['disclaimerURL'] = '' 554 | 555 | if "programming_languages" in record: 556 | project["languages"] = record["programming_languages"] 557 
|
558 |         # self['partners'] = []
559 |         # TODO: Look into using record['contributing_organizations']
560 |
561 |         # self['relatedCode'] = []
562 |
563 |         # self['reusedCode'] = []
564 |
565 |         # date: [object] A date object describing the release.
566 |         #   created: [string] The date the release was originally created, in YYYY-MM-DD or ISO 8601 format.
567 |         #   lastModified: [string] The date the release was modified, in YYYY-MM-DD or ISO 8601 format.
568 |         #   metadataLastUpdated: [string] The date the metadata of the release was last updated, in YYYY-MM-DD or ISO 8601 format.
569 |         if "date_record_added" in record and "date_record_updated" in record:
570 |             project["date"] = {
571 |                 "created": record["date_record_added"],
572 |                 # 'lastModified': '',
573 |                 "metadataLastUpdated": record["date_record_updated"],
574 |             }
575 |
576 |         return project
577 |
578 |     @classmethod
579 |     def from_tfs(klass, tfs_project, labor_hours=True):
580 |         """
581 |         Creates CodeGovProject object from a TFS/VSTS/AzureDevOps instance
582 |         """
583 |         project = klass()
584 |         project_web_url = ""
585 |
586 |         # -- REQUIRED FIELDS --
587 |         project["name"] = tfs_project.projectInfo.name
588 |
589 |         if "web" in tfs_project.projectInfo._links.additional_properties:
590 |             if "href" in tfs_project.projectInfo._links.additional_properties["web"]:
591 |                 # URL-encode any spaces in the project name for the project web URL
592 |                 project_web_url = requote_uri(
593 |                     tfs_project.projectInfo._links.additional_properties["web"]["href"]
594 |                 )
595 |
596 |         project["repositoryURL"] = project_web_url
597 |
598 |         project["homepageURL"] = project_web_url
599 |
600 |         project["description"] = tfs_project.projectInfo.description
601 |
602 |         project["vcs"] = "TFS"
603 |
604 |         project["permissions"]["licenses"] = None
605 |
606 |         project["tags"] = []
607 |
608 |         if labor_hours:
609 |             logger.debug("Labor hour calculation is not currently supported for TFS; laborHours remains 0.")
610 |             # project['laborHours'] = labor_hours_from_url(project['repositoryURL'])
611 |         else:
612 |             project["laborHours"] = 0
613 |
614 |         if tfs_project.projectCreateInfo.last_update_time < POLICY_START_DATE:
615 |             project["permissions"]["usageType"] = "exemptByPolicyDate"
616 |         else:
617 |             project["permissions"]["usageType"] = "exemptByAgencyMission"
618 |             project["permissions"][
619 |                 "exemptionText"
620 |             ] = "This source code resides on a private server and has not been properly evaluated for releasability."
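        # NOTE (editorial): TFS does not expose a distinct creation timestamp
        # here, so projectCreateInfo.last_update_time serves as a proxy for the
        # "created" date below (and for the POLICY_START_DATE check above).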
621 | 622 | project["contact"] = {"email": "", "URL": project_web_url} 623 | 624 | project["date"] = { 625 | "lastModified": tfs_project.projectLastUpdateInfo.last_update_time.date().isoformat(), 626 | "created": tfs_project.projectCreateInfo.last_update_time.date().isoformat(), 627 | "metadataLastUpdated": "", 628 | } 629 | 630 | _prune_dict_null_str(project) 631 | 632 | return project 633 | 634 | @classmethod 635 | def from_ado(klass, ado_project: AzureDevOpsProject, labor_hours=True): 636 | """ 637 | Creates CodeGovProject object from AzureDevOps Instance 638 | """ 639 | project = klass() 640 | project_web_url = "" 641 | 642 | # -- REQUIRED FIELDS -- 643 | project["name"] = ado_project.project_name 644 | 645 | project["repositoryURL"] = requote_uri(ado_project.repo_url) 646 | 647 | project["homepageURL"] = requote_uri(ado_project.project_url) 648 | 649 | project["description"] = ado_project.project_description 650 | 651 | project["vcs"] = "AzureDevOps" 652 | 653 | project["permissions"]["license"] = None 654 | 655 | project["tags"] = [] 656 | 657 | if labor_hours: 658 | logger.debug("Sorry labor hour calculation not currently supported.") 659 | # project['laborHours'] = labor_hours_from_url(project['repositoryURL']) 660 | else: 661 | project["laborHours"] = 0 662 | 663 | last_update_time_as_date = date_parse(ado_project.project_last_update_time) 664 | if last_update_time_as_date < POLICY_START_DATE: 665 | project["permissions"]["usageType"] = "exemptByPolicyDate" 666 | else: 667 | project["permissions"]["usageType"] = "exemptByAgencyMission" 668 | project["permissions"][ 669 | "exemptionText" 670 | ] = "This source code resides on a private server and has not been properly evaluated for releaseability." 671 | 672 | project["contact"] = {"email": "", "URL": project_web_url} 673 | 674 | project["date"] = { 675 | "lastModified": last_update_time_as_date.isoformat(), 676 | "created": "", 677 | "metadataLastUpdated": "", 678 | } 679 | 680 | _prune_dict_null_str(project) 681 | 682 | return project 683 | -------------------------------------------------------------------------------- /scraper/doecode/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | import requests 5 | 6 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def process_json(filename): 12 | """ 13 | Converts a DOE CODE .json file into DOE CODE projects 14 | Yields DOE CODE records from a DOE CODE .json file 15 | """ 16 | 17 | logger.debug("Processing DOE CODE json: %s", filename) 18 | 19 | with open(filename, encoding="utf-8") as fd: 20 | doecode_json = json.load(fd) 21 | 22 | for record in doecode_json["records"]: 23 | yield record 24 | 25 | 26 | def process_url(url, key): 27 | """ 28 | Yields DOE CODE records from a DOE CODE .json URL response 29 | Converts a DOE CODE API .json URL response into DOE CODE projects 30 | """ 31 | 32 | logger.debug("Fetching DOE CODE JSON: %s", url) 33 | 34 | if key is None: 35 | raise ValueError("DOE CODE API Key value is missing!") 36 | 37 | response = requests.get( 38 | url, 39 | headers={"Authorization": "Basic " + key}, 40 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 41 | ) 42 | doecode_json = response.json() 43 | 44 | for record in doecode_json["records"]: 45 | yield record 46 | 47 | 48 | def process(filename=None, url=None, key=None): 49 | """ 50 | Yields DOE CODE records based on provided input sources 51 | 52 | param: 53 | filename (str): Path to a DOE 
CODE .json file 54 | url (str): URL for a DOE CODE server json file 55 | key (str): API Key for connecting to DOE CODE server 56 | """ 57 | 58 | if filename is not None: 59 | yield from process_json(filename) 60 | elif url and key: 61 | yield from process_url(url, key) 62 | -------------------------------------------------------------------------------- /scraper/gen_code_gov_json.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import argparse 5 | import json 6 | import logging 7 | import os 8 | 9 | from scraper import code_gov 10 | from scraper.util import configure_logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser( 17 | description="Scrape code repositories for Code.gov / DOE CODE" 18 | ) 19 | 20 | parser.add_argument( 21 | "--agency", type=str, nargs="?", default="", help='Agency Label, e.g. "DOE"' 22 | ) 23 | parser.add_argument( 24 | "--method", 25 | type=str, 26 | nargs="?", 27 | default="", 28 | help="Method of measuring open source", 29 | ) 30 | parser.add_argument( 31 | "--organization", 32 | type=str, 33 | nargs="?", 34 | default="", 35 | help="Force all repos to report a particular organization", 36 | ) 37 | parser.add_argument( 38 | "--contact-email", 39 | type=str, 40 | nargs="?", 41 | default="", 42 | help="Force all repos to report a particular contact email", 43 | ) 44 | 45 | parser.add_argument( 46 | "--config", type=str, nargs="?", default="", help="Configuration File (*.json)" 47 | ) 48 | 49 | parser.add_argument( 50 | "--github-gov-orgs", 51 | action="store_true", 52 | help="Use orgs from government.github.com/community", 53 | ) 54 | parser.add_argument( 55 | "--skip-labor-hours", 56 | action="store_true", 57 | help='Skip calculation of labor hours, assume "0"', 58 | ) 59 | 60 | parser.add_argument( 61 | "--doecode-json", 62 | type=str, 63 | nargs="?", 64 | default=None, 65 | help="Path to DOE CODE .json file", 66 | ) 67 | parser.add_argument( 68 | "--doecode-url", 69 | type=str, 70 | nargs="?", 71 | default=None, 72 | help="URL to DOE CODE .json data", 73 | ) 74 | parser.add_argument( 75 | "--doecode-api-key", 76 | type=str, 77 | nargs="?", 78 | default=None, 79 | help="DOE CODE API key for accessing --doecode-url", 80 | ) 81 | 82 | parser.add_argument( 83 | "--output-path", 84 | type=str, 85 | nargs="?", 86 | default="", 87 | help="Output path for .json file", 88 | ) 89 | parser.add_argument( 90 | "--output-filename", 91 | type=str, 92 | nargs="?", 93 | default="code.json", 94 | help="Output filename for .json file", 95 | ) 96 | 97 | parser.add_argument("--verbose", action="store_true", help="Enable verbose output") 98 | 99 | args = parser.parse_args() 100 | 101 | configure_logging(args.verbose) 102 | 103 | try: 104 | with open(args.config, encoding="utf-8") as fd: 105 | config_json = json.load(fd) 106 | except (FileNotFoundError, json.JSONDecodeError): 107 | if args.config: 108 | raise 109 | config_json = {} 110 | 111 | # Update config based on commandline arguments 112 | if args.agency: 113 | config_json["agency"] = args.agency 114 | if args.method: 115 | config_json["method"] = args.method 116 | if args.organization: 117 | config_json["organization"] = args.organization 118 | if args.contact_email: 119 | config_json["contact_email"] = args.contact_email 120 | if args.output_path: 121 | config_json["output_path"] = args.output_path 122 | if args.skip_labor_hours: 123 | 
config_json["compute_labor_hours"] = False 124 | if args.github_gov_orgs: 125 | config_json["github_gov_orgs"] = True 126 | 127 | config_json["DOE CODE"] = {} 128 | config_json["DOE CODE"]["json"] = args.doecode_json 129 | config_json["DOE CODE"]["url"] = args.doecode_url 130 | config_json["DOE CODE"]["api_key"] = args.doecode_api_key 131 | 132 | output_path = config_json.get("output_path", None) 133 | output_path = args.output_path or output_path 134 | logger.debug("Output Path: %s", output_path) 135 | 136 | if output_path is not None and not os.path.exists(output_path): 137 | raise RuntimeError( 138 | "Invalid output path argument provided! Make sure the output path exists and try again." 139 | ) 140 | 141 | code_json = code_gov.process_config(config_json) 142 | 143 | code_gov.force_attributes(code_json, config_json) 144 | 145 | logger.info("Number of Projects: %s", len(code_json["releases"])) 146 | 147 | output_filepath = args.output_filename 148 | 149 | if output_path is not None: 150 | output_filepath = os.path.join(output_path, output_filepath) 151 | 152 | with open(output_filepath, "w", encoding="utf-8") as fp: 153 | logger.info("Writing output to: %s", output_filepath) 154 | fp.write(code_json.to_json()) 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | -------------------------------------------------------------------------------- /scraper/github/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | import os 6 | import time 7 | 8 | import github3 9 | import requests 10 | 11 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def gov_orgs(): 17 | """ 18 | Returns a list of the names of US Government GitHub organizations 19 | 20 | Based on: https://government.github.com/community/ 21 | 22 | Example return: 23 | {'llnl', '18f', 'gsa', 'dhs-ncats', 'spack', ...} 24 | """ 25 | us_gov_github_orgs = set() 26 | 27 | gov_orgs_json = requests.get( 28 | "https://government.github.com/organizations.json", 29 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 30 | ).json() 31 | 32 | us_gov_github_orgs.update(gov_orgs_json["governments"]["U.S. Federal"]) 33 | us_gov_github_orgs.update( 34 | gov_orgs_json["governments"]["U.S. Military and Intelligence"] 35 | ) 36 | us_gov_github_orgs.update(gov_orgs_json["research"]["U.S. Research Labs"]) 37 | 38 | return list(us_gov_github_orgs) 39 | 40 | 41 | def create_session(token=None, timeouts=None): 42 | """ 43 | Create a github3.py session connected to GitHub.com 44 | 45 | If token is not provided, will attempt to use the GITHUB_API_TOKEN 46 | environment variable if present. 47 | """ 48 | if token is None: 49 | token = os.environ.get("GITHUB_API_TOKEN", None) 50 | 51 | if timeouts is None: 52 | timeouts = {} 53 | 54 | custom_session = github3.session.GitHubSession(**timeouts) 55 | gh_session = github3.GitHub(token=token, session=custom_session) 56 | 57 | if gh_session is None: 58 | raise RuntimeError("Invalid or missing GITHUB_API_TOKEN") 59 | 60 | return gh_session 61 | 62 | 63 | def create_enterprise_session(url, token=None, timeouts=None): 64 | """ 65 | Create a github3.py session for a GitHub Enterprise instance 66 | 67 | If token is not provided, will attempt to use the GITHUB_API_TOKEN 68 | environment variable if present. 
69 | """ 70 | if timeouts is None: 71 | timeouts = {} 72 | 73 | custom_session = github3.session.GitHubSession(**timeouts) 74 | gh_session = github3.GitHubEnterprise(url=url, token=token, session=custom_session) 75 | 76 | if gh_session is None: 77 | msg = "Unable to connect to GitHub Enterprise (%s) with provided token." 78 | raise RuntimeError(msg, url) 79 | 80 | return gh_session 81 | 82 | 83 | def _num_requests_needed(num_repos, factor=2, wiggle_room=100): 84 | """ 85 | Helper function to estimate the minimum number of API requests needed 86 | """ 87 | return num_repos * factor + wiggle_room 88 | 89 | 90 | def _check_api_limits(gh_session, api_required=250): 91 | """ 92 | Simplified check for API limits 93 | 94 | If necessary, spin in place waiting for API to reset before returning. 95 | 96 | See: https://developer.github.com/v3/#rate-limiting 97 | """ 98 | api_rates = gh_session.rate_limit() 99 | 100 | api_remaining = api_rates["rate"]["remaining"] 101 | api_reset = api_rates["rate"]["reset"] 102 | logger.debug("Rate Limit - %d requests remaining", api_remaining) 103 | 104 | if api_remaining > api_required: 105 | return 106 | 107 | now_time = time.time() 108 | time_to_reset = int(api_reset - now_time) 109 | logger.warning("Rate Limit Depleted - Sleeping for %d seconds", time_to_reset) 110 | 111 | while now_time < api_reset: 112 | time.sleep(10) 113 | now_time = time.time() 114 | 115 | return 116 | 117 | 118 | def connect(url="https://github.com", token=None, timeouts=None): 119 | """ 120 | Create a GitHub session for making requests 121 | """ 122 | 123 | if timeouts is None: 124 | timeouts = {} 125 | 126 | gh_session = None 127 | if url == "https://github.com": 128 | gh_session = create_session(token, timeouts) 129 | else: 130 | gh_session = create_enterprise_session(url, token, timeouts) 131 | 132 | if gh_session is None: 133 | msg = "Unable to connect to (%s) with provided token." 134 | raise RuntimeError(msg, url) 135 | 136 | logger.info("Connected to: %s", url) 137 | 138 | return gh_session 139 | 140 | 141 | def query_repos(gh_session, orgs=None, repos=None, public_only=True): 142 | """ 143 | Yields GitHub3.py repo objects for provided orgs and repo names 144 | 145 | If orgs and repos are BOTH empty, execute special mode of getting ALL 146 | repositories from the GitHub Server. 147 | 148 | If public_only is True, will return only those repos that are marked as 149 | public. Set this to false to return all organizations that the session has 150 | permissions to access. 
151 | """ 152 | 153 | if orgs is None: 154 | orgs = [] 155 | if repos is None: 156 | repos = [] 157 | if public_only: 158 | privacy = "public" 159 | else: 160 | privacy = "all" 161 | 162 | _check_api_limits(gh_session, 10) 163 | 164 | for org_name in orgs: 165 | org = gh_session.organization(org_name) 166 | num_repos = org.public_repos_count 167 | 168 | _check_api_limits(gh_session, _num_requests_needed(num_repos)) 169 | 170 | for repo in org.repositories(type=privacy): 171 | _check_api_limits(gh_session, 10) 172 | yield repo 173 | 174 | for repo_name in repos: 175 | _check_api_limits(gh_session, 10) 176 | org, name = repo_name.split("/") 177 | yield gh_session.repository(org, name) 178 | 179 | if not (orgs or repos): 180 | for repo in gh_session.all_repositories(): 181 | yield repo 182 | -------------------------------------------------------------------------------- /scraper/github/queryManager.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module for GitHub query and data management. 3 | 4 | With this module, you will be able to send GraphQL and REST queries 5 | to GitHub, as well as read and write JSON files to store data. 6 | """ 7 | 8 | from datetime import datetime 9 | import json 10 | import os 11 | import re 12 | import time 13 | 14 | import pytz 15 | import requests 16 | 17 | from scraper.util import DEFAULT_REQUESTS_TIMEOUTS 18 | 19 | 20 | def _vPrint(verbose, *args, **kwargs): 21 | """Easy verbosity-control print method. 22 | 23 | Args: 24 | verbose (bool): Normal print if True, do nothing otherwise. 25 | *args: Argument list for the 'print' method. 26 | **kwargs: Keyword arguments for the 'print' method. 27 | 28 | """ 29 | if verbose: 30 | print(*args, **kwargs) 31 | 32 | 33 | class GitHubQueryManager: 34 | """GitHub query API manager.""" 35 | 36 | def __init__(self, apiToken=None, maxRetry=10, retryDelay=3): 37 | """Initialize the GitHubQueryManager object. 38 | 39 | Note: 40 | If no apiToken argument is provided, 41 | the environment variable 'GITHUB_API_TOKEN' must be set. 42 | 43 | Args: 44 | apiToken (Optional[str]): A string representing a GitHub API 45 | token. Defaults to None. 46 | maxRetry (Optional[int]): A limit on how many times to 47 | automatically retry requests. Defaults to 10. 48 | retryDelay (Optional[int]): Number of seconds to wait between 49 | automatic request retries. Defaults to 3. 50 | 51 | Raises: 52 | TypeError: If no GitHub API token is provided either via 53 | argument or environment variable 'GITHUB_API_TOKEN'. 54 | 55 | """ 56 | 57 | # Get GitHub API token 58 | if apiToken: 59 | self.__githubApiToken = apiToken 60 | else: 61 | try: 62 | self.__githubApiToken = os.environ["GITHUB_API_TOKEN"] 63 | except KeyError as error: 64 | raise TypeError( 65 | "Requires either a string argument or environment variable 'GITHUB_API_TOKEN'." 66 | ) from error 67 | 68 | # Check token validity 69 | print("Checking GitHub API token... 
", end="", flush=True) 70 | basicCheck = self._submitQuery("query { viewer { login } }") 71 | if basicCheck["statusNum"] == 401: 72 | print("FAILED.") 73 | raise ValueError( 74 | "GitHub API token is not valid.\n%s %s" 75 | % (basicCheck["statusTxt"], basicCheck["result"]) 76 | ) 77 | 78 | print("Token validated.") 79 | 80 | # Initialize private variables 81 | self.__query = None #: Cached query string 82 | self.__queryPath = None #: Path to query file 83 | self.__queryTimestamp = None #: When query file was last modified 84 | 85 | # Initialize public variables 86 | self.maxRetry = maxRetry 87 | self.retryDelay = retryDelay 88 | self.data = {} 89 | """Dict: Working data.""" 90 | 91 | @property 92 | def maxRetry(self): 93 | """int: A limit on how many times to automatically retry requests. 94 | 95 | Must be a whole integer greater than 0. 96 | """ 97 | return self.__maxRetry 98 | 99 | @maxRetry.setter 100 | def maxRetry(self, maxRetry): 101 | numIn = int(maxRetry) 102 | numIn = 1 if numIn <= 0 else numIn 103 | self.__maxRetry = numIn 104 | print("Auto-retry limit for requests set to %d." % (self.maxRetry)) 105 | 106 | @property 107 | def retryDelay(self): 108 | """int: Number of seconds to wait between automatic request retries. 109 | 110 | Must be a whole integer greater than 0. 111 | """ 112 | return self.__retryDelay 113 | 114 | @retryDelay.setter 115 | def retryDelay(self, retryDelay): 116 | numIn = int(retryDelay) 117 | numIn = 1 if numIn <= 0 else numIn 118 | self.__retryDelay = numIn 119 | print("Auto-retry delay set to %dsec." % (self.retryDelay)) 120 | 121 | def _readGQL(self, filePath, verbose=False): 122 | """Read a 'pretty' formatted GraphQL query file into a one-line string. 123 | 124 | Removes line breaks and comments. Condenses white space. 125 | 126 | Args: 127 | filePath (str): A relative or absolute path to a file containing 128 | a GraphQL query. 129 | File may use comments and multi-line formatting. 130 | .. _GitHub GraphQL Explorer: 131 | https://developer.github.com/v4/explorer/ 132 | verbose (Optional[bool]): If False, prints will be suppressed. 133 | Defaults to False. 134 | 135 | Returns: 136 | str: A single line GraphQL query. 137 | 138 | """ 139 | if not os.path.isfile(filePath): 140 | raise RuntimeError("Query file '%s' does not exist." % (filePath)) 141 | lastModified = os.path.getmtime(filePath) 142 | absPath = os.path.abspath(filePath) 143 | if absPath == self.__queryPath and lastModified == self.__queryTimestamp: 144 | _vPrint( 145 | verbose, 146 | "Using cached query '%s'" % (os.path.basename(self.__queryPath)), 147 | ) 148 | query_in = self.__query 149 | else: 150 | _vPrint(verbose, "Reading '%s' ... " % (filePath), end="", flush=True) 151 | with open(filePath, "r", encoding="utf-8") as q: 152 | # Strip comments. 153 | query_in = re.sub(r"#.*(\n|\Z)", "\n", q.read()) 154 | # Condense whitespace. 155 | query_in = re.sub(r"\s+", " ", query_in) 156 | # Remove leading and trailing whitespace. 157 | query_in = query_in.strip() 158 | _vPrint(verbose, "File read!") 159 | self.__queryPath = absPath 160 | self.__queryTimestamp = lastModified 161 | self.__query = query_in 162 | return query_in 163 | 164 | def queryGitHubFromFile(self, filePath, gitvars=None, verbosity=0, **kwargs): 165 | """Submit a GitHub GraphQL query from a file. 166 | 167 | Can only be used with GraphQL queries. 168 | For REST queries, see the 'queryGitHub' method. 169 | 170 | Args: 171 | filePath (str): A relative or absolute path to a file containing 172 | a GraphQL query. 
173 | File may use comments and multi-line formatting. 174 | .. _GitHub GraphQL Explorer: 175 | https://developer.github.com/v4/explorer/ 176 | gitvars (Optional[Dict]): All query variables. 177 | Defaults to None. 178 | GraphQL Only. 179 | verbosity (Optional[int]): Changes output verbosity levels. 180 | If < 0, all extra printouts are suppressed. 181 | If == 0, normal print statements are displayed. 182 | If > 0, additional status print statements are displayed. 183 | Defaults to 0. 184 | **kwargs: Keyword arguments for the 'queryGitHub' method. 185 | 186 | Returns: 187 | Dict: A JSON style dictionary. 188 | 189 | """ 190 | if not gitvars: 191 | gitvars = {} 192 | 193 | gitquery = self._readGQL(filePath, verbose=(verbosity >= 0)) 194 | return self.queryGitHub( 195 | gitquery, gitvars=gitvars, verbosity=verbosity, **kwargs 196 | ) 197 | 198 | def queryGitHub( 199 | self, 200 | gitquery, 201 | gitvars=None, 202 | verbosity=0, 203 | paginate=False, 204 | cursorVar=None, 205 | keysToList=None, 206 | rest=False, 207 | requestCount=0, 208 | pageNum=0, 209 | headers=None, 210 | ): 211 | """Submit a GitHub query. 212 | 213 | Args: 214 | gitquery (str): The query or endpoint itself. 215 | Examples: 216 | query: 'query { viewer { login } }' 217 | endpoint: '/user' 218 | gitvars (Optional[Dict]): All query variables. 219 | Defaults to None. 220 | GraphQL Only. 221 | verbosity (Optional[int]): Changes output verbosity levels. 222 | If < 0, all extra printouts are suppressed. 223 | If == 0, normal print statements are displayed. 224 | If > 0, additional status print statements are displayed. 225 | Defaults to 0. 226 | paginate (Optional[bool]): Pagination will be completed 227 | automatically if True. Defaults to False. 228 | cursorVar (Optional[str]): Key in 'gitvars' that represents the 229 | pagination cursor. Defaults to None. 230 | GraphQL Only. 231 | keysToList (Optional[List[str]]): Ordered list of keys needed to 232 | retrieve the list in the query results to be extended by 233 | pagination. Defaults to None. 234 | Example: 235 | ['data', 'viewer', 'repositories', 'nodes'] 236 | GraphQL Only. 237 | rest (Optional[bool]): If True, uses the REST API instead 238 | of GraphQL. Defaults to False. 239 | requestCount (Optional[int]): Counter for repeated requests. 240 | pageNum (Optional[int]): Counter for pagination. 241 | For user readable log messages only, does not affect data. 242 | headers (Optional[Dict]): Additional headers. 243 | Defaults to None. 244 | 245 | Returns: 246 | Dict: A JSON style dictionary. 247 | 248 | """ 249 | if not gitvars: 250 | gitvars = {} 251 | if not keysToList: 252 | keysToList = [] 253 | if not headers: 254 | headers = {} 255 | 256 | requestCount += 1 257 | pageNum = 0 if pageNum < 0 else pageNum # no negative page numbers 258 | pageNum += 1 259 | 260 | if paginate: 261 | _vPrint((verbosity >= 0), "Page %d" % (pageNum)) 262 | _vPrint( 263 | (verbosity >= 0), "Sending %s query..." 
% ("REST" if rest else "GraphQL") 264 | ) 265 | try: 266 | response = self._submitQuery( 267 | gitquery, 268 | gitvars=gitvars, 269 | verbose=(verbosity > 0), 270 | rest=rest, 271 | headers=headers, 272 | ) 273 | except requests.exceptions.ReadTimeout: # Handles intermittent response delays 274 | _vPrint((verbosity >= 0), "Read timed out.") 275 | _vPrint((verbosity >= 0), "Repeating query...") 276 | return self.queryGitHub( 277 | gitquery, 278 | gitvars=gitvars, 279 | verbosity=verbosity, 280 | paginate=paginate, 281 | cursorVar=cursorVar, 282 | keysToList=keysToList, 283 | rest=rest, 284 | requestCount=requestCount, 285 | pageNum=(pageNum - 1), # retry same page 286 | headers=headers, 287 | ) 288 | _vPrint((verbosity >= 0), "Checking response...") 289 | _vPrint((verbosity >= 0), "HTTP STATUS %s" % (response["statusTxt"])) 290 | statusNum = response["statusNum"] 291 | 292 | # Make sure the query limit didn't run out 293 | try: 294 | apiStatus = { 295 | "limit": int(response["headDict"]["X-RateLimit-Limit"]), 296 | "remaining": int(response["headDict"]["X-RateLimit-Remaining"]), 297 | "reset": int(response["headDict"]["X-RateLimit-Reset"]), 298 | } 299 | _vPrint((verbosity >= 0), "API Status %s" % (json.dumps(apiStatus))) 300 | if apiStatus["remaining"] <= 0: 301 | _vPrint((verbosity >= 0), "API rate limit exceeded.") 302 | self._awaitReset(apiStatus["reset"]) 303 | _vPrint((verbosity >= 0), "Repeating query...") 304 | return self.queryGitHub( 305 | gitquery, 306 | gitvars=gitvars, 307 | verbosity=verbosity, 308 | paginate=paginate, 309 | cursorVar=cursorVar, 310 | keysToList=keysToList, 311 | rest=rest, 312 | requestCount=(requestCount - 1), # not counted against retries 313 | pageNum=(pageNum - 1), # retry same page 314 | headers=headers, 315 | ) 316 | except KeyError: # Handles error responses without X-RateLimit data 317 | _vPrint((verbosity >= 0), "Failed to check API Status.") 318 | 319 | # Check for explicit API rate limit error responses 320 | if statusNum in (403, 429): 321 | _vPrint((verbosity >= 0), "API rate limit exceeded.") 322 | if requestCount >= self.maxRetry: 323 | raise RuntimeError( 324 | "Query attempted but failed %d times.\n%s\n%s" 325 | % ( 326 | self.maxRetry, 327 | response["statusTxt"], 328 | response["result"], 329 | ) 330 | ) 331 | 332 | try: # Use explicit wait time if available 333 | waitTime = int(response["headDict"]["Retry-After"]) 334 | self._countdown( 335 | waitTime, 336 | printString="Waiting %*d seconds...", 337 | verbose=(verbosity >= 0), 338 | ) 339 | except KeyError: # Handles missing Retry-After header 340 | self._countdown( 341 | # wait at least 1 min, longer on continued failure (recommended best practice) 342 | 60 * requestCount, 343 | printString="Waiting %*d seconds...", 344 | verbose=(verbosity >= 0), 345 | ) 346 | _vPrint((verbosity >= 0), "Repeating query...") 347 | return self.queryGitHub( 348 | gitquery, 349 | gitvars=gitvars, 350 | verbosity=verbosity, 351 | paginate=paginate, 352 | cursorVar=cursorVar, 353 | keysToList=keysToList, 354 | rest=rest, 355 | requestCount=requestCount, 356 | pageNum=(pageNum - 1), # retry same page 357 | headers=headers, 358 | ) 359 | # Check for accepted but not yet processed, usually due to un-cached data 360 | if statusNum == 202: 361 | if requestCount >= self.maxRetry: 362 | raise RuntimeError( 363 | "Query attempted but failed %d times.\n%s\n%s" 364 | % ( 365 | self.maxRetry, 366 | response["statusTxt"], 367 | response["result"], 368 | ) 369 | ) 370 | 371 | self._countdown( 372 | self.retryDelay, 373 | 
printString="Query accepted but not yet processed. Trying again in %*d seconds...", 374 | verbose=(verbosity >= 0), 375 | ) 376 | return self.queryGitHub( 377 | gitquery, 378 | gitvars=gitvars, 379 | verbosity=verbosity, 380 | paginate=paginate, 381 | cursorVar=cursorVar, 382 | keysToList=keysToList, 383 | rest=rest, 384 | requestCount=requestCount, 385 | pageNum=(pageNum - 1), # retry same page 386 | headers=headers, 387 | ) 388 | # Check for server error responses 389 | if statusNum in (502, 503): 390 | if requestCount >= self.maxRetry: 391 | raise RuntimeError( 392 | "Query attempted but failed %d times.\n%s\n%s" 393 | % ( 394 | self.maxRetry, 395 | response["statusTxt"], 396 | response["result"], 397 | ) 398 | ) 399 | 400 | self._countdown( 401 | self.retryDelay, 402 | printString="Server error. Trying again in %*d seconds...", 403 | verbose=(verbosity >= 0), 404 | ) 405 | return self.queryGitHub( 406 | gitquery, 407 | gitvars=gitvars, 408 | verbosity=verbosity, 409 | paginate=paginate, 410 | cursorVar=cursorVar, 411 | keysToList=keysToList, 412 | rest=rest, 413 | requestCount=requestCount, 414 | pageNum=(pageNum - 1), # retry same page 415 | headers=headers, 416 | ) 417 | # Check for other error responses 418 | if statusNum >= 400 or statusNum == 204: 419 | raise RuntimeError( 420 | "Request got an Error response.\n%s\n%s" 421 | % (response["statusTxt"], response["result"]) 422 | ) 423 | 424 | _vPrint((verbosity >= 0), "Data received!") 425 | outObj = json.loads(response["result"]) 426 | 427 | # Check for GraphQL API errors (e.g. repo not found) 428 | if not rest and "errors" in outObj: 429 | if requestCount >= self.maxRetry: 430 | raise RuntimeError( 431 | "Query attempted but failed %d times.\n%s\n%s" 432 | % ( 433 | self.maxRetry, 434 | response["statusTxt"], 435 | response["result"], 436 | ) 437 | ) 438 | 439 | if len(outObj["errors"]) == 1 and len(outObj["errors"][0]) == 1: 440 | # Poorly defined error type, usually intermittent, try again. 441 | _vPrint( 442 | (verbosity >= 0), 443 | "GraphQL API error.\n%s" % (json.dumps(outObj["errors"])), 444 | ) 445 | self._countdown( 446 | self.retryDelay, 447 | printString="Unknown API error. Trying again in %*d seconds...", 448 | verbose=(verbosity >= 0), 449 | ) 450 | return self.queryGitHub( 451 | gitquery, 452 | gitvars=gitvars, 453 | verbosity=verbosity, 454 | paginate=paginate, 455 | cursorVar=cursorVar, 456 | keysToList=keysToList, 457 | rest=rest, 458 | requestCount=requestCount, 459 | pageNum=(pageNum - 1), # retry same page 460 | headers=headers, 461 | ) 462 | 463 | raise RuntimeError( 464 | "GraphQL API error.\n%s" % (json.dumps(outObj["errors"])) 465 | ) 466 | 467 | # Pagination 468 | if paginate: 469 | if rest and response["linkDict"]: 470 | if "next" in response["linkDict"]: 471 | nextObj = self.queryGitHub( 472 | response["linkDict"]["next"], 473 | gitvars=gitvars, 474 | verbosity=verbosity, 475 | paginate=paginate, 476 | cursorVar=cursorVar, 477 | keysToList=keysToList, 478 | rest=rest, 479 | requestCount=0, 480 | pageNum=pageNum, 481 | headers=headers, 482 | ) 483 | outObj.extend(nextObj) 484 | elif not rest: 485 | if not cursorVar: 486 | raise ValueError( 487 | "Must specify argument 'cursorVar' to use GraphQL auto-pagination." 488 | ) 489 | if not len(keysToList) > 0: 490 | raise ValueError( 491 | "Must specify argument 'keysToList' as a non-empty list to use GraphQL auto-pagination." 
492 | ) 493 | aPage = outObj 494 | for key in keysToList[0:-1]: 495 | aPage = aPage[key] 496 | gitvars[cursorVar] = aPage["pageInfo"]["endCursor"] 497 | if aPage["pageInfo"]["hasNextPage"]: 498 | nextObj = self.queryGitHub( 499 | gitquery, 500 | gitvars=gitvars, 501 | verbosity=verbosity, 502 | paginate=paginate, 503 | cursorVar=cursorVar, 504 | keysToList=keysToList, 505 | rest=rest, 506 | requestCount=0, 507 | pageNum=pageNum, 508 | headers=headers, 509 | ) 510 | newPage = nextObj 511 | for key in keysToList[0:-1]: 512 | newPage = newPage[key] 513 | aPage[keysToList[-1]].extend(newPage[keysToList[-1]]) 514 | aPage.pop("pageInfo", None) 515 | 516 | return outObj 517 | 518 | def _submitQuery( 519 | self, gitquery, gitvars=None, verbose=False, rest=False, headers=None 520 | ): 521 | """Send a curl request to GitHub. 522 | 523 | Args: 524 | gitquery (str): The query or endpoint itself. 525 | Examples: 526 | query: 'query { viewer { login } }' 527 | endpoint: '/user' 528 | gitvars (Optional[Dict]): All query variables. 529 | Defaults to None. 530 | verbose (Optional[bool]): If False, stderr prints will be 531 | suppressed. Defaults to False. 532 | rest (Optional[bool]): If True, uses the REST API instead 533 | of GraphQL. Defaults to False. 534 | headers (Optional[Dict]): Additional headers. 535 | Defaults to None. 536 | 537 | Returns: 538 | { 539 | 'statusNum' (int): The HTTP status code. 540 | 'statusTxt' (str): The HTTP status message. 541 | 'headDict' (Dict[str]): The response headers. 542 | 'linkDict' (Dict[int]): Link based pagination data. 543 | 'result' (str): The body of the response. 544 | } 545 | 546 | """ 547 | if not gitvars: 548 | gitvars = {} 549 | if not headers: 550 | headers = {} 551 | 552 | authhead = {"Authorization": "bearer " + self.__githubApiToken} 553 | if not rest: 554 | gitqueryJSON = json.dumps( 555 | {"query": gitquery, "variables": json.dumps(gitvars)} 556 | ) 557 | fullResponse = requests.post( 558 | "https://api.github.com/graphql", 559 | data=gitqueryJSON, 560 | headers={**authhead, **headers}, 561 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 562 | ) 563 | else: 564 | fullResponse = requests.get( 565 | "https://api.github.com" + gitquery, 566 | headers={**authhead, **headers}, 567 | timeout=DEFAULT_REQUESTS_TIMEOUTS, 568 | ) 569 | _vPrint( 570 | verbose, 571 | "\n%s\n%s" 572 | % (json.dumps(dict(fullResponse.headers), indent=2), fullResponse.text), 573 | ) 574 | result = fullResponse.text 575 | headDict = fullResponse.headers 576 | statusNum = int(fullResponse.status_code) 577 | statusTxt = "%d %s" % (statusNum, fullResponse.reason) 578 | 579 | # Parse any Link headers even further 580 | linkDict = None 581 | if "Link" in headDict: 582 | linkProperties = headDict["Link"].split(", ") 583 | propDict = {} 584 | for item in linkProperties: 585 | divided = re.split(r'; rel="|"', item) 586 | propDict[divided[2]] = divided[1] 587 | linkDict = propDict 588 | 589 | return { 590 | "statusNum": statusNum, 591 | "statusTxt": statusTxt, 592 | "headDict": headDict, 593 | "linkDict": linkDict, 594 | "result": result, 595 | } 596 | 597 | def _awaitReset(self, utcTimeStamp, verbose=True): 598 | """Wait until the given UTC timestamp. 599 | 600 | Args: 601 | utcTimeStamp (int): A UTC format timestamp. 602 | verbose (Optional[bool]): If False, all extra printouts will be 603 | suppressed. Defaults to True. 
604 | 605 | """ 606 | resetTime = pytz.utc.localize(datetime.utcfromtimestamp(utcTimeStamp)) 607 | _vPrint(verbose, "--- Current Timestamp") 608 | _vPrint(verbose, " %s" % (time.strftime("%c"))) 609 | now = pytz.utc.localize(datetime.utcnow()) 610 | waitTime = round((resetTime - now).total_seconds()) + 1 611 | _vPrint(verbose, "--- Current UTC Timestamp") 612 | _vPrint(verbose, " %s" % (now.strftime("%c"))) 613 | _vPrint(verbose, "--- GITHUB NEEDS A BREAK Until UTC Timestamp") 614 | _vPrint(verbose, " %s" % (resetTime.strftime("%c"))) 615 | self._countdown( 616 | waitTime, printString="--- Waiting %*d seconds...", verbose=verbose 617 | ) 618 | _vPrint(verbose, "--- READY!") 619 | 620 | def _countdown( 621 | self, waitTime=0, printString="Waiting %*d seconds...", verbose=True 622 | ): 623 | """Prints a message and waits. 624 | 625 | Args: 626 | waitTime (Optional[int]): Number of seconds to wait. Defaults to 0. 627 | printString (Optional[str]): A counter message to display. 628 | Defaults to "Waiting %*d seconds...". 629 | verbose (Optional[bool]): If False, all extra printouts will be 630 | suppressed. Defaults to True. 631 | 632 | """ 633 | if waitTime <= 0: 634 | waitTime = self.retryDelay 635 | _vPrint(verbose, printString % (len(str(waitTime)), waitTime)) 636 | time.sleep(waitTime) 637 | 638 | 639 | class DataManager: 640 | """JSON data manager.""" 641 | 642 | def __init__(self, filePath=None, loadData=False): 643 | """Initialize the DataManager object. 644 | Args: 645 | filePath (Optional[str]): Relative or absolute path to a JSON 646 | data file. Defaults to None. 647 | loadData (Optional[bool]): Loads data from the given file path 648 | if True. Defaults to False. 649 | 650 | """ 651 | self.data = {} 652 | """Dict: Working data.""" 653 | self.filePath = filePath 654 | if loadData: 655 | self.fileLoad(updatePath=False) 656 | 657 | @property 658 | def filePath(self): 659 | """str: Absolute path to a JSON format data file. 660 | 661 | Can accept relative paths, but will always convert them to 662 | the absolute path. 663 | """ 664 | if not self.__filePath: 665 | raise ValueError("Internal variable filePath has not been set.") 666 | return self.__filePath 667 | 668 | @filePath.setter 669 | def filePath(self, filePath): 670 | if filePath: 671 | if not os.path.isfile(filePath): 672 | print( 673 | "Data file '%s' does not currently exist. Saving data will create a new file." 674 | % (filePath) 675 | ) 676 | self.__filePath = os.path.abspath(filePath) 677 | print("Stored new data file path '%s'" % (self.filePath)) 678 | else: 679 | self.__filePath = None 680 | 681 | def dataReset(self): 682 | """Reset the internal JSON data dictionary.""" 683 | self.data = {} 684 | print("Stored data has been reset.") 685 | 686 | def fileLoad(self, filePath=None, updatePath=True): 687 | """Load a JSON data file into the internal JSON data dictionary. 688 | 689 | Current internal data will be overwritten. 690 | If no file path is provided, the stored data file path will be used. 691 | 692 | Args: 693 | filePath (Optional[str]): A relative or absolute path to a 694 | '.json' file. Defaults to None. 695 | updatePath (Optional[bool]): Specifies whether or not to update 696 | the stored data file path. Defaults to True. 697 | 698 | """ 699 | if not filePath: 700 | filePath = self.filePath 701 | if not os.path.isfile(filePath): 702 | raise FileNotFoundError("Data file '%s' does not exist." % (filePath)) 703 | 704 | print( 705 | "Importing existing data file '%s' ... 
" % (filePath), 706 | end="", 707 | flush=True, 708 | ) 709 | with open(filePath, "r", encoding="utf-8") as q: 710 | data_raw = q.read() 711 | print("Imported!") 712 | self.data = json.loads(data_raw) 713 | if updatePath: 714 | self.filePath = filePath 715 | 716 | def fileSave(self, filePath=None, updatePath=False, newline=None): 717 | """Write the internal JSON data dictionary to a JSON data file. 718 | 719 | If no file path is provided, the stored data file path will be used. 720 | 721 | Args: 722 | filePath (Optional[str]): A relative or absolute path to a 723 | '.json' file. Defaults to None. 724 | updatePath (Optional[bool]): Specifies whether or not to update 725 | the stored data file path. Defaults to False. 726 | newline (Optional[str]): Specifies the line endings to use when 727 | writing the file. Defaults to system default line separator. 728 | 729 | """ 730 | if not filePath: 731 | filePath = self.filePath 732 | if not os.path.isfile(filePath): 733 | print("Data file '%s' does not exist, will create new file." % (filePath)) 734 | if not os.path.exists(os.path.split(filePath)[0]): 735 | os.makedirs(os.path.split(filePath)[0]) 736 | dataJsonString = json.dumps(self.data, indent=4, sort_keys=True) 737 | print("Writing to file '%s' ... " % (filePath), end="", flush=True) 738 | with open(filePath, "w", encoding="utf-8", newline=newline) as fileout: 739 | fileout.write(dataJsonString) 740 | print("Wrote file!") 741 | if updatePath: 742 | self.filePath = filePath 743 | -------------------------------------------------------------------------------- /scraper/github/util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | 6 | def _license_obj(license_name): 7 | """ 8 | A helper function to look up license object information 9 | 10 | Use names from: https://api.github.com/licenses 11 | """ 12 | obj = None 13 | 14 | if license_name in ("MIT", "MIT License"): 15 | obj = {"URL": "https://api.github.com/licenses/mit", "name": "MIT"} 16 | elif license_name in ('BSD 2-clause "Simplified" License'): 17 | obj = { 18 | "URL": "https://api.github.com/licenses/bsd-2-clause", 19 | "name": "BSD-2-Clause", 20 | } 21 | elif license_name in ('BSD 3-clause "New" or "Revised" License'): 22 | obj = { 23 | "URL": "https://api.github.com/licenses/bsd-3-clause", 24 | "name": "BSD-3-Clause", 25 | } 26 | elif license_name in ("Apache License 2.0"): 27 | obj = { 28 | "URL": "https://api.github.com/licenses/apache-2.0", 29 | "name": "Apache-2.0", 30 | } 31 | elif license_name in ("GNU General Public License v2.1"): 32 | obj = {"URL": "https://api.github.com/licenses/gpl-2.1", "name": "GPL-2.1"} 33 | elif license_name in ("GNU General Public License v2.0"): 34 | obj = {"URL": "https://api.github.com/licenses/gpl-2.0", "name": "GPL-2.0"} 35 | elif license_name in ("GNU Lesser General Public License v2.1"): 36 | obj = {"URL": "https://api.github.com/licenses/lgpl-2.1", "name": "LGPL-2.1"} 37 | elif license_name in ("GNU General Public License v3.0"): 38 | obj = {"URL": "https://api.github.com/licenses/gpl-3.0", "name": "GPL-3.0"} 39 | elif license_name in ("GNU Lesser General Public License v3.0"): 40 | obj = {"URL": "https://api.github.com/licenses/lgpl-3.0", "name": "LGPL-3.0"} 41 | elif license_name in ("Eclipse Public License 1.0"): 42 | obj = {"URL": "https://api.github.com/licenses/epl-1.0", "name": "EPL-1.0"} 43 | elif license_name in ("Mozilla Public License 2.0"): 44 | obj = {"URL": 
"https://api.github.com/licenses/mpl-2.0", "name": "MPL-2.0"} 45 | elif license_name in ("The Unlicense"): 46 | obj = {"URL": "https://api.github.com/licenses/unlicense", "name": "Unlicense"} 47 | elif license_name in ("GNU Affero General Public License v3.0"): 48 | obj = {"URL": "https://api.github.com/licenses/agpl-3.0", "name": "AGPL-3.0"} 49 | elif license_name in ("Eclipse Public License 2.0"): 50 | obj = {"URL": "https://api.github.com/licenses/epl-2.0", "name": "EPL-2.0"} 51 | 52 | if obj is None: 53 | logger.warning("I don't understand the license: %s", license_name) 54 | raise ValueError("Aborting!") 55 | 56 | return obj 57 | -------------------------------------------------------------------------------- /scraper/gitlab/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import gitlab 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def connect(url="https://gitlab.com", token=None): 10 | """ 11 | Return a connected GitLab session 12 | 13 | ``token`` should be a ``private_token`` from Gitlab 14 | """ 15 | 16 | if token is None: 17 | token = os.environ.get("GITLAB_API_TOKEN", None) 18 | 19 | gl_session = gitlab.Gitlab(url, token) 20 | 21 | try: 22 | gl_session.version() 23 | except gitlab.exceptions.GitlabAuthenticationError as exc: 24 | raise RuntimeError("Invalid or missing GITLAB_API_TOKEN") from exc 25 | 26 | logger.info("Connected to: %s", url) 27 | 28 | return gl_session 29 | 30 | 31 | def query_repos(gl_session, repos=None): 32 | """ 33 | Yields Gitlab project objects for all projects in Bitbucket 34 | """ 35 | 36 | if repos is None: 37 | repos = [] 38 | 39 | for repo in repos: 40 | yield gl_session.projects.get(repo) 41 | 42 | if not repos: 43 | for project in gl_session.projects.list(as_list=False): 44 | yield project 45 | -------------------------------------------------------------------------------- /scraper/tfs/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | import logging 5 | import os 6 | 7 | from msrest.authentication import BasicAuthentication 8 | from vsts.vss_connection import VssConnection 9 | 10 | from scraper.tfs.models import TFSProject 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | HARD_CODED_TOP = 10000 15 | 16 | 17 | def get_projects_metadata(baseurl, token): 18 | logger.debug("Retrieving TFS Metdata.....") 19 | return get_all_projects(baseurl, token) 20 | 21 | 22 | def create_tfs_connection(url, token): 23 | """ 24 | Creates the TFS Connection Context 25 | """ 26 | if token is None: 27 | token = os.environ.get("TFS_API_TOKEN", None) 28 | 29 | tfs_credentials = BasicAuthentication("", token) 30 | tfs_connection = VssConnection(base_url=url, creds=tfs_credentials) 31 | return tfs_connection 32 | 33 | 34 | def create_tfs_project_analysis_client(url, token=None): 35 | """ 36 | Create a project_analysis_client.py client for a Team Foundation Server Enterprise connection instance. 37 | This is helpful for understanding project languages, but currently blank for all our test conditions. 38 | 39 | If token is not provided, will attempt to use the TFS_API_TOKEN 40 | environment variable if present. 
41 | """ 42 | if token is None: 43 | token = os.environ.get("TFS_API_TOKEN", None) 44 | 45 | tfs_connection = create_tfs_connection(url, token) 46 | project_analysis_client = tfs_connection.get_client( 47 | "vsts.project_analysis.v4_1.project_analysis_client.ProjectAnalysisClient" 48 | ) 49 | 50 | if project_analysis_client is None: 51 | raise RuntimeError( 52 | "Unable to connect to TFS Enterprise (%s) with provided token." % url 53 | ) 54 | 55 | return project_analysis_client 56 | 57 | 58 | def create_tfs_core_client(url, token=None): 59 | """ 60 | Create a core_client.py client for a Team Foundation Server Enterprise connection instance 61 | 62 | If token is not provided, will attempt to use the TFS_API_TOKEN 63 | environment variable if present. 64 | """ 65 | if token is None: 66 | token = os.environ.get("TFS_API_TOKEN", None) 67 | 68 | tfs_connection = create_tfs_connection(url, token) 69 | tfs_client = tfs_connection.get_client("vsts.core.v4_1.core_client.CoreClient") 70 | 71 | if tfs_client is None: 72 | raise RuntimeError( 73 | "Unable to connect to TFS Enterprise (%s) with provided token." % url 74 | ) 75 | 76 | return tfs_client 77 | 78 | 79 | def create_tfs_git_client(url, token=None): 80 | """ 81 | Creates a TFS Git Client to pull Git repo info 82 | """ 83 | if token is None: 84 | token = os.environ.get("TFS_API_TOKEN", None) 85 | 86 | tfs_connection = create_tfs_connection(url, token) 87 | tfs_git_client = tfs_connection.get_client("vsts.git.v4_1.git_client.GitClient") 88 | 89 | if tfs_git_client is None: 90 | raise RuntimeError( 91 | "Unable to create TFS Git Client, failed to connect to TFS Enterprise (%s) with provided token." 92 | % url 93 | ) 94 | 95 | return tfs_git_client 96 | 97 | 98 | def create_tfs_tfvc_client(url, token=None): 99 | """ 100 | Creates a TFS TFVC Client to pull TFVC repo info 101 | """ 102 | if token is None: 103 | token = os.environ.get("TFS_API_TOKEN", None) 104 | 105 | tfs_connection = create_tfs_connection(url, token) 106 | tfs_tfvc_client = tfs_connection.get_client("vsts.tfvc.v4_1.tfvc_client.TfvcClient") 107 | 108 | if tfs_tfvc_client is None: 109 | raise RuntimeError( 110 | "Unable to create TFS Git Client, failed to connect to TFS Enterprise (%s) with provided token." 111 | % url 112 | ) 113 | 114 | return tfs_tfvc_client 115 | 116 | 117 | def get_all_projects(url, token, top=HARD_CODED_TOP): 118 | """ 119 | Returns a list of all projects with their collection info from the server. Currently limited functionality to only return the first 1000 projects. 120 | #TODO refactor to add multiple calls to api to retrieve all projects if more exist beyond top. 
121 | """ 122 | project_list = [] 123 | tfs_client = create_tfs_core_client(url, token) 124 | 125 | collections = tfs_client.get_project_collections(top=top) 126 | 127 | for collection in collections: 128 | collection_client = create_tfs_core_client( 129 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 130 | token, 131 | ) 132 | 133 | logger.debug("Retrieving Projects for Project Collection: %s", collection.name) 134 | # Retrieves all projects in the project collection 135 | projects = collection_client.get_projects(top=HARD_CODED_TOP) 136 | # get_projects only gets the project references, have to call get_project_history_entries to get last update info for projects 137 | # Only calling this once per collection as its an expensive API call, wil refactor later if there is a better API call to use 138 | collection_history_list = collection_client.get_project_history_entries() 139 | for project in projects: 140 | # get_projects only gets team project ref objects, 141 | # have to call get_project to get the team project object which includes the TFS Web Url for the project 142 | logger.debug("Retrieving Team Project for Project: %s", project.name) 143 | projectInfo = collection_client.get_project(project.id, True, True) 144 | 145 | tfsProject = TFSProject(projectInfo, collection) 146 | 147 | logger.debug( 148 | "Retrieving Last Updated and Created Info for Project: %s", project.name 149 | ) 150 | tfsProject.projectLastUpdateInfo = get_project_last_update_time( 151 | collection_history_list, project.id 152 | ) 153 | tfsProject.projectCreateInfo = get_project_create_time( 154 | collection_history_list, project.id 155 | ) 156 | project_list.append(tfsProject) 157 | 158 | return project_list 159 | 160 | 161 | def get_git_repos(url, token, collection, project): 162 | """ 163 | Returns a list of all git repos for the supplied project within the supplied collection 164 | """ 165 | git_client = create_tfs_git_client( 166 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 167 | token, 168 | ) 169 | logger.debug("Retrieving Git Repos for Project: %s", project.name) 170 | return git_client.get_repositories(project.id) 171 | 172 | 173 | def get_tfvc_repos(url, token, collection, project): 174 | """ 175 | Returns a list of all tfvc branches for the supplied project within the supplied collection 176 | """ 177 | branch_list = [] 178 | tfvc_client = create_tfs_tfvc_client( 179 | "{url}/{collection_name}".format(url=url, collection_name=collection.name), 180 | token, 181 | ) 182 | 183 | logger.debug("Retrieving Tfvc Branches for Project: %s}", project.name) 184 | branches = tfvc_client.get_branches(project.id, True, True, False, True) 185 | if branches: 186 | branch_list.extend(branches) 187 | else: 188 | logger.debug("No Tfvc Branches in Project: %s", project.name) 189 | 190 | return branch_list 191 | 192 | 193 | def get_project_last_update_time(collection_history_list, projectId): 194 | sorted_history_list = sorted( 195 | collection_history_list, key=lambda x: x.last_update_time, reverse=True 196 | ) 197 | return next((x for x in sorted_history_list if x.id == projectId)) 198 | 199 | 200 | def get_project_create_time(collection_history_list, projectId): 201 | sorted_history_list = sorted( 202 | collection_history_list, key=lambda x: x.last_update_time, reverse=False 203 | ) 204 | return next((x for x in sorted_history_list if x.id == projectId)) 205 | -------------------------------------------------------------------------------- 
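# Added sketch (not part of the original package): the two history lookups
# above sort the full collection history on every call. An equivalent,
# sort-free variant using min()/max() over a single filtered pass, assuming
# each history entry exposes `id` and `last_update_time` like the vsts client
# objects do:

def project_history_bounds(collection_history_list, project_id):
    entries = [x for x in collection_history_list if x.id == project_id]
    created = min(entries, key=lambda x: x.last_update_time)  # earliest entry
    updated = max(entries, key=lambda x: x.last_update_time)  # latest entry
    return created, updated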
/scraper/tfs/models.py: -------------------------------------------------------------------------------- 1 | class TFSProject: 2 | def __init__(self, projectInfo, collectionInfo): 3 | self.projectInfo = projectInfo 4 | self.collectionInfo = collectionInfo 5 | self.projectCreateInfo = {} 6 | self.projectLastUpdateInfo = {} 7 | self.gitInfo = [] 8 | self.tfvcInfo = [] 9 | -------------------------------------------------------------------------------- /scraper/util.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import logging 4 | import logging.config 5 | import os 6 | from subprocess import PIPE, Popen # nosec 7 | import tempfile 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # These mirror the defaults in github3.py sessions per: 12 | # https://github.com/sigmavirus24/github3.py/blob/ce43e6e5fdef6555f5a6b6602e2cc4b66c428aef/src/github3/session.py#L98 13 | DEFAULT_REQUESTS_TIMEOUTS = (4, 10) 14 | 15 | 16 | def execute(command, cwd=None): 17 | logger.debug("Forking command: %s", command) 18 | 19 | if cwd is None: 20 | cwd = os.getcwd() 21 | elif not os.path.isdir(cwd): 22 | raise ValueError("path does not exist: %s" % cwd) 23 | 24 | with Popen( 25 | command, cwd=cwd, stdout=PIPE, stderr=PIPE, shell=False 26 | ) as process: # nosec 27 | out, err = process.communicate() 28 | 29 | if process.returncode: 30 | logging.error( 31 | "Error Executing: command=%s, returncode=%d", 32 | " ".join(command), 33 | process.returncode, 34 | ) 35 | 36 | return out.decode("utf-8"), err.decode("utf-8") 37 | 38 | 39 | def configure_logging(verbose=False): 40 | DEFAULT_LOGGING = { 41 | "version": 1, 42 | "disable_existing_loggers": False, 43 | "formatters": { 44 | "standard": { 45 | # 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' 46 | # 'format': '%(levelname)s: %(message)s' 47 | "format": "%(asctime)s - %(levelname)s: %(message)s" 48 | } 49 | }, 50 | "handlers": { 51 | "default": { 52 | "level": "INFO", 53 | "formatter": "standard", 54 | "class": "logging.StreamHandler", 55 | }, 56 | "null": { 57 | "level": "INFO", 58 | "formatter": "standard", 59 | "class": "logging.NullHandler", 60 | }, 61 | }, 62 | "loggers": { 63 | "": {"handlers": ["default"], "level": "DEBUG", "propagate": False}, 64 | "github3": {"handlers": ["null"], "level": "DEBUG", "propagate": False}, 65 | "urllib3": {"handlers": ["null"], "level": "DEBUG", "propagate": False}, 66 | }, 67 | } 68 | 69 | if verbose: 70 | DEFAULT_LOGGING["handlers"]["default"]["level"] = "DEBUG" 71 | # DEFAULT_LOGGING['loggers']['']['level'] = 'DEBUG' 72 | 73 | logging.config.dictConfig(DEFAULT_LOGGING) 74 | 75 | 76 | def git_repo_to_sloc(url): 77 | """ 78 | Given a Git repository URL, returns number of lines of code based on cloc 79 | 80 | Reference: 81 | - cloc: https://github.com/AlDanial/cloc 82 | - https://www.omg.org/spec/AFP/ 83 | - Another potential way to calculation effort 84 | 85 | Sample cloc output: 86 | { 87 | "header": { 88 | "cloc_url": "github.com/AlDanial/cloc", 89 | "cloc_version": "1.74", 90 | "elapsed_seconds": 0.195950984954834, 91 | "n_files": 27, 92 | "n_lines": 2435, 93 | "files_per_second": 137.78956000769, 94 | "lines_per_second": 12426.5769858787 95 | }, 96 | "C++": { 97 | "nFiles": 7, 98 | "blank": 121, 99 | "comment": 314, 100 | "code": 371 101 | }, 102 | "C/C++ Header": { 103 | "nFiles": 8, 104 | "blank": 107, 105 | "comment": 604, 106 | "code": 191 107 | }, 108 | "CMake": { 109 | "nFiles": 11, 110 | "blank": 49, 111 | "comment": 465, 112 
| "code": 165 113 | }, 114 | "Markdown": { 115 | "nFiles": 1, 116 | "blank": 18, 117 | "comment": 0, 118 | "code": 30 119 | }, 120 | "SUM": { 121 | "blank": 295, 122 | "comment": 1383, 123 | "code": 757, 124 | "nFiles": 27 125 | } 126 | } 127 | """ 128 | 129 | with tempfile.TemporaryDirectory() as tmp_dir: 130 | logger.debug("Cloning: url=%s tmp_dir=%s", url, tmp_dir) 131 | 132 | tmp_clone = os.path.join(tmp_dir, "clone-dir") 133 | 134 | cmd = ["git", "clone", "--depth=1", url, tmp_clone] 135 | execute(cmd) 136 | 137 | cmd = ["cloc", "--json", tmp_clone] 138 | out, err = execute(cmd) 139 | 140 | if err: 141 | logger.warning( 142 | "Error encountered while analyzing: url=%s stderr=%s", url, err 143 | ) 144 | 145 | try: 146 | cloc_json = json.loads(out) 147 | sloc = cloc_json["SUM"]["code"] 148 | except json.decoder.JSONDecodeError: 149 | logger.error("Error Decoding: url=%s, out=%s", url, out) 150 | sloc = 0 151 | except KeyError: 152 | logging.error( 153 | "Missing LOC information (Is the repository empty?): url=%s, json=%s", 154 | url, 155 | json.dumps(cloc_json), 156 | ) 157 | sloc = 0 158 | 159 | logger.debug("SLOC: url=%s, sloc=%d", url, sloc) 160 | 161 | return sloc 162 | 163 | 164 | def compute_labor_hours(sloc, month_hours="cocomo_book"): 165 | """ 166 | Compute the labor hours, given a count of source lines of code 167 | 168 | The intention is to use the COCOMO II model to compute this value. 169 | 170 | References: 171 | - http://csse.usc.edu/tools 172 | - http://softwarecost.org/tools/COCOMO/ 173 | - https://www.rose-hulman.edu/class/csse/csse372/201310/Homework/CII_modelman2000.pdf 174 | """ 175 | # Calculation of hours in a month 176 | if month_hours == "hours_per_year": 177 | # Use number of working hours in a year: 178 | # (40 Hours / week) * (52 weeks / year) / (12 months / year) ~= 173.33 179 | HOURS_PER_PERSON_MONTH = 40.0 * 52 / 12 180 | else: 181 | # Use value from COCOMO II Book (month_hours=='cocomo_book'): 182 | # Reference: https://dl.acm.org/citation.cfm?id=557000 183 | # This is the value used by the Code.gov team: 184 | # https://github.com/GSA/code-gov/blob/master/docs/labor_hour_calc.md 185 | HOURS_PER_PERSON_MONTH = 152.0 186 | 187 | # Coefficients for the COCOMO II model (only the two used for person-month 188 | # calculation) 189 | co_a = 2.94 190 | co_b = 0.91 191 | 192 | # These values represent a default of "Nominal" from the established 193 | # constant values for the COCOMO II model. 
194 | scale_factors = [ 195 | 3.72, # Precedentedness 196 | 3.04, # Development Flexibility 197 | 4.24, # Architecture / Risk Resolution 198 | 3.29, # Team Cohesion 199 | 4.68, # Process Maturity 200 | ] 201 | cost_drivers = [ 202 | 1.00, # Required Software Reliability 203 | 1.00, # Data Base Size 204 | 1.00, # Product Complexity 205 | 1.00, # Developed for Reusability 206 | 1.00, # Documentation Match to Lifecycle Needs 207 | 1.00, # Analyst Capability 208 | 1.00, # Programmer Capability 209 | 1.00, # Personnel Continuity 210 | 1.00, # Application Experience 211 | 1.00, # Platform Experience 212 | 1.00, # Language and Toolset Experience 213 | 1.00, # Time Constraint 214 | 1.00, # Storage Constraint 215 | 1.00, # Platform Volatility 216 | 1.00, # Use of Software Tools 217 | 1.00, # Multisite Development 218 | 1.00, # Required Development Schedule 219 | ] 220 | 221 | # The summation (∑) of the scale factors is used in this calculation 222 | scale_factor_aggregate = co_b + 0.01 * functools.reduce( 223 | lambda x, y: x + y, scale_factors 224 | ) 225 | # The product (∏) of the cost drivers 226 | effort_adjustment_factor = functools.reduce(lambda x, y: x * y, cost_drivers) 227 | # The calculation of person-months uses KSLOC for the size of a project 228 | size = sloc / 1000 229 | 230 | # Calculate PM = A * Size^E * EAF 231 | person_months = co_a * size**scale_factor_aggregate * effort_adjustment_factor 232 | 233 | labor_hours = round(person_months * HOURS_PER_PERSON_MONTH, 1) 234 | logger.debug("sloc=%d labor_hours=%d", sloc, labor_hours) 235 | 236 | return labor_hours 237 | 238 | 239 | def labor_hours_from_url(url): 240 | sum_sloc = git_repo_to_sloc(url) 241 | logger.info("SLOC: %d", sum_sloc) 242 | 243 | labor_hours = compute_labor_hours(sum_sloc) 244 | logger.info("labor_hours: %d", labor_hours) 245 | 246 | return labor_hours 247 | 248 | 249 | def _prune_dict_null_str(dictionary): 250 | """ 251 | Prune the "None" or emptry string values from dictionary items 252 | """ 253 | for key, value in list(dictionary.items()): 254 | if value is None or str(value) == "": 255 | del dictionary[key] 256 | 257 | if isinstance(value, dict): 258 | dictionary[key] = _prune_dict_null_str(dictionary[key]) 259 | 260 | return dictionary 261 | -------------------------------------------------------------------------------- /scripts/clone_everything.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import pathlib 4 | import subprocess 5 | from timeit import default_timer as timer 6 | 7 | import requests 8 | 9 | INPUT_FILE = "https://raw.githubusercontent.com/LLNL/llnl.github.io/main/visualize/github-data/intReposInfo.json" 10 | 11 | 12 | def main(): 13 | repo_info = requests.get(INPUT_FILE).json()["data"] 14 | 15 | BACKUP_PATH = "github_backup" 16 | pathlib.Path(BACKUP_PATH).mkdir(parents=True, exist_ok=True) 17 | 18 | start = timer() 19 | 20 | for slug, data in repo_info.items(): 21 | url = data["url"] 22 | clone_path = f"{BACKUP_PATH}/{slug}" 23 | if pathlib.Path(clone_path).exists(): 24 | print(f"... updating: {url}") 25 | subprocess.run(["time", "git", "fetch"], cwd=clone_path) 26 | else: 27 | print(f"... 
27 |             print(f"... cloning: {url}")
28 |             subprocess.run(["time", "git", "clone", "--mirror", url, clone_path])
29 |             if not pathlib.Path(clone_path).exists():
30 |                 print("Something went wrong with the clone, don't try to lfs fetch...")
31 |                 continue
32 |         subprocess.run(["time", "git", "lfs", "fetch", "--all"], cwd=clone_path)
33 | 
34 |     end = timer()
35 | 
36 |     print(end - start)  # Time in seconds, e.g. 5.38091952400282
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     main()
41 | 
--------------------------------------------------------------------------------
/scripts/codegov_compute_hours.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 | 
3 | import argparse
4 | import json
5 | 
6 | from scraper.util import compute_labor_hours, git_repo_to_sloc
7 | 
8 | parser = argparse.ArgumentParser(
9 |     description="Scrape code repositories for Code.gov / DOECode"
10 | )
11 | parser.add_argument(
12 |     "filename", type=str, help="Path to locally stored `code.json` file"
13 | )
14 | args = parser.parse_args()
15 | 
16 | code_gov_json = json.load(open(args.filename))
17 | releases = code_gov_json["releases"]
18 | 
19 | repo_urls = {
20 |     release["repositoryURL"].rstrip("/")
21 |     for release in releases
22 |     if release.get("vcs", "") == "git"
23 | }
24 | 
25 | for url in repo_urls:
26 |     # print(url)
27 | 
28 |     sloc = git_repo_to_sloc(url)
29 |     # print(sloc)
30 | 
31 |     hours = compute_labor_hours(sloc)
32 |     print("-- url=%s, sloc=%d, hours=%.1f" % (url, sloc, hours))
33 | 
--------------------------------------------------------------------------------
/scripts/get_stargazers.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import getpass
3 | import os
4 | 
5 | import github3
6 | import requests
7 | 
8 | 
9 | class GitHub_Stargazers:
10 |     def __init__(self):
11 |         self.repos = {}
12 |         self.stargazers = {}
13 |         self.total_count = 0
14 | 
15 |     def get_stats(self, username="", password="", organization="llnl", force=True):
16 |         """
17 |         Retrieves the stargazers for the repositories of the given organization.
18 |         Requires an organization admin token to access the data.
19 |         """
20 |         stargazers_file_path = "../github_stats_output/stargazers.csv"
21 |         if force or not os.path.isfile(stargazers_file_path):
22 |             my_github.login(username, password)
23 |             calls_beginning = self.logged_in_gh.ratelimit_remaining + 1
24 |             print("Rate Limit: " + str(calls_beginning))
25 |             my_github.get_org(organization)
26 |             my_github.get_repos()
27 |             my_github.write_to_file(file_path=stargazers_file_path)
28 | 
29 |             calls_remaining = self.logged_in_gh.ratelimit_remaining
30 |             calls_used = calls_beginning - calls_remaining
31 |             print(
32 |                 "Rate Limit Remaining: "
33 |                 + str(calls_remaining)
34 |                 + "\nUsed "
35 |                 + str(calls_used)
36 |                 + " API calls."
37 |             )
38 | 
39 |     def login(self, username="", password=""):
40 |         """
41 |         Performs a login and sets the Github object via given credentials. If
42 |         credentials are empty or incorrect then prompts user for credentials.
43 |         Stores the authentication token in a CREDENTIALS_FILE used for future
44 |         logins. Handles Two Factor Authentication.
45 | """ 46 | try: 47 | self.token = "" 48 | id = "" 49 | if not os.path.isfile("CREDENTIALS_FILE"): 50 | if username == "" or password == "": 51 | username = raw_input("Username: ") 52 | password = getpass.getpass("Password: ") 53 | note = "GitHub Organization Stats App" 54 | note_url = "http://software.llnl.gov/" 55 | scopes = ["user", "repo"] 56 | auth = github3.authorize( 57 | username, 58 | password, 59 | scopes, 60 | note, 61 | note_url, 62 | two_factor_callback=self.prompt_2fa, 63 | ) 64 | self.token = auth.token 65 | id = auth.id 66 | with open("CREDENTIALS_FILE", "w+") as fd: 67 | fd.write(self.token + "\n") 68 | fd.write(str(id)) 69 | fd.close() 70 | else: 71 | with open("CREDENTIALS_FILE", "r") as fd: 72 | self.token = fd.readline().strip() 73 | id = fd.readline().strip() 74 | fd.close() 75 | print("Logging in.") 76 | self.logged_in_gh = github3.login( 77 | token=self.token, two_factor_callback=self.prompt_2fa 78 | ) 79 | self.logged_in_gh.user().to_json() 80 | except (ValueError, AttributeError, github3.models.GitHubError): 81 | print("Bad credentials. Try again.") 82 | self.login() 83 | 84 | def prompt_2fa(self): 85 | """ 86 | Taken from 87 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 88 | Prompts a user for their 2FA code and returns it. 89 | """ 90 | code = "" 91 | while not code: 92 | code = raw_input("Enter 2FA code: ") 93 | return code 94 | 95 | def get_org(self, organization_name=""): 96 | """ 97 | Retrieves an organization via given org name. If given 98 | empty string, prompts user for an org name. 99 | """ 100 | self.organization_name = organization_name 101 | if organization_name == "": 102 | self.organization_name = raw_input("Organization: ") 103 | print("Getting organization.") 104 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 105 | 106 | def get_repos(self): 107 | """ 108 | Gets the repos for the organization and builds the URL/headers for 109 | getting timestamps of stargazers. 110 | """ 111 | print("Getting repos.") 112 | # Uses the developer API. Note this could change. 113 | 114 | headers = { 115 | "Accept": "application/vnd.github.v3.star+json", 116 | "Authorization": "token " + self.token, 117 | } 118 | temp_count = 0 119 | for repo in self.org_retrieved.iter_repos(): 120 | temp_count += 1 121 | url = ( 122 | "https://api.github.com/repos/" 123 | + self.organization_name 124 | + "/" 125 | + repo.name 126 | ) 127 | self.repos[repo.name] = self.get_stargazers(url=url, headers=headers) 128 | self.calc_stargazers(start_count=650) 129 | print("total count: \t" + str(self.total_count)) 130 | print(str(temp_count) + " repos") 131 | 132 | def get_stargazers(self, url, headers={}): 133 | """ 134 | Return a list of the stargazers of a GitHub repo 135 | 136 | Includes both the 'starred_at' and 'user' data. 
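        The 'starred_at' timestamps are only returned when the request carries
        the 'application/vnd.github.v3.star+json' Accept header built in get_repos().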
137 | 
138 |         param: url
139 |             url is the 'stargazers_url' of the form:
140 |             https://api.github.com/repos/LLNL/spack/stargazers
141 |         """
142 |         url = url + "/stargazers?per_page=100&page=%s"
143 |         page = 1
144 |         gazers = []
145 | 
146 |         json_data = requests.get(url % page, headers=headers).json()
147 |         while json_data:
148 |             gazers.extend(json_data)
149 |             page += 1
150 |             json_data = requests.get(url % page, headers=headers).json()
151 |         return gazers
152 | 
153 |     def calc_stargazers(self, date=(datetime.date.today()), start_count=0):
154 |         for repo_json in self.repos:
155 |             for stargazer in self.repos[repo_json]:
156 |                 print(stargazer)
157 |                 date = stargazer["starred_at"][:10]
158 |                 try:
159 |                     self.stargazers[date] += 1
160 |                 except KeyError:
161 |                     self.stargazers[date] = 1
162 | 
163 |         sorted_stargazers = sorted(self.stargazers)
164 |         for stargazer in reversed(sorted_stargazers):
165 |             number_starred = self.stargazers[stargazer]
166 |             self.stargazers[stargazer] = start_count - number_starred
167 |             start_count = start_count - number_starred
168 | 
169 |     def write_to_file(
170 |         self, file_path="", date=(datetime.date.today()), organization="llnl"
171 |     ):
172 |         """
173 |         Writes stargazers data to file.
174 |         """
175 |         with open(file_path, "w+") as out:
176 |             out.write("date,organization,stargazers\n")
177 |             sorted_stargazers = sorted(self.stargazers)  # sort by date string
178 |             for star in sorted_stargazers:
179 |                 out.write(star + "," + organization + "," + str(self.stargazers[star]) + "\n")
180 |             out.close()
181 | 
182 | 
183 | if __name__ == "__main__":
184 |     my_github = GitHub_Stargazers()
185 |     my_github.get_stats()
186 | 
--------------------------------------------------------------------------------
/scripts/get_traffic.py:
--------------------------------------------------------------------------------
1 | import calendar
2 | import csv
3 | import datetime
4 | import errno
5 | import getpass
6 | import json
7 | import math
8 | import os
9 | import time
10 | 
11 | import github3
12 | import requests
13 | 
14 | 
15 | class GitHub_Traffic:
16 |     def __init__(self):
17 |         self.referrers = {}
18 |         self.referrers_lower = {}
19 |         self.views = {}
20 |         self.clones = {}
21 | 
22 |         self.referrers_json = {}
23 |         self.views_json = {}
24 |         self.clones_json = {}
25 |         self.releases_json = {}
26 | 
27 |     def get_stats(self, username="", password="", organization="llnl", force=True):
28 |         """
29 |         Retrieves the traffic for the repositories of the given organization.
30 |         Requires an organization admin token to access the data.
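        Note: the GitHub traffic API only exposes roughly the last 14 days of
        views and clones, so this is meant to be run regularly, appending to
        the CSV files already on disk.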
31 |         """
32 |         referrers_file_path = "../github_stats_output/referrers.csv"
33 |         views_file_path = "../github_stats_output/views.csv"
34 |         clones_file_path = "../github_stats_output/clones.csv"
35 |         if force or not all(map(os.path.isfile, (referrers_file_path, views_file_path, clones_file_path))):
36 |             my_github.login(username, password)
37 |             calls_beginning = self.logged_in_gh.ratelimit_remaining + 1
38 |             print("Rate Limit: " + str(calls_beginning))
39 |             my_github.get_org(organization)
40 |             my_github.get_traffic()
41 |             views_row_count = my_github.check_data_redundancy(
42 |                 file_path=views_file_path, dict_to_check=self.views
43 |             )
44 |             clones_row_count = my_github.check_data_redundancy(
45 |                 file_path=clones_file_path, dict_to_check=self.clones
46 |             )
47 |             my_github.write_to_file(
48 |                 referrers_file_path=referrers_file_path,
49 |                 views_file_path=views_file_path,
50 |                 clones_file_path=clones_file_path,
51 |                 views_row_count=views_row_count,
52 |                 clones_row_count=clones_row_count,
53 |             )
54 |             my_github.write_json(
55 |                 dict_to_write=self.referrers_json,
56 |                 path_ending_type="traffic_popular_referrers",
57 |             )
58 |             my_github.write_json(
59 |                 dict_to_write=self.views_json, path_ending_type="traffic_views"
60 |             )
61 |             my_github.write_json(
62 |                 dict_to_write=self.clones_json, path_ending_type="traffic_clones"
63 |             )
64 |             my_github.write_json(
65 |                 dict_to_write=self.releases_json, path_ending_type="releases"
66 |             )
67 |             calls_remaining = self.logged_in_gh.ratelimit_remaining
68 |             calls_used = calls_beginning - calls_remaining
69 |             print(
70 |                 "Rate Limit Remaining: "
71 |                 + str(calls_remaining)
72 |                 + "\nUsed "
73 |                 + str(calls_used)
74 |                 + " API calls."
75 |             )
76 | 
77 |     def login(self, username="", password=""):
78 |         """
79 |         Performs a login and sets the Github object via given credentials. If
80 |         credentials are empty or incorrect then prompts user for credentials.
81 |         Stores the authentication token in a CREDENTIALS_FILE used for future
82 |         logins. Handles Two Factor Authentication.
83 |         """
84 |         try:
85 |             self.token = ""
86 |             id = ""
87 |             if not os.path.isfile("CREDENTIALS_FILE_ADMIN"):
88 |                 if username == "" or password == "":
89 |                     username = raw_input("Username: ")
90 |                     password = getpass.getpass("Password: ")
91 |                 note = "GitHub Organization Stats App"
92 |                 note_url = "http://software.llnl.gov/"
93 |                 scopes = ["user", "repo"]
94 |                 auth = github3.authorize(
95 |                     username,
96 |                     password,
97 |                     scopes,
98 |                     note,
99 |                     note_url,
100 |                     two_factor_callback=self.prompt_2fa,
101 |                 )
102 |                 self.token = auth.token
103 |                 id = auth.id
104 |                 with open("CREDENTIALS_FILE_ADMIN", "w+") as fd:
105 |                     fd.write(self.token + "\n")
106 |                     fd.write(str(id))
107 |                     fd.close()
108 |             else:
109 |                 with open("CREDENTIALS_FILE_ADMIN", "r") as fd:
110 |                     self.token = fd.readline().strip()
111 |                     id = fd.readline().strip()
112 |                     fd.close()
113 |             print("Logging in.")
114 |             self.logged_in_gh = github3.login(
115 |                 token=self.token, two_factor_callback=self.prompt_2fa
116 |             )
117 |             self.logged_in_gh.user().to_json()
118 |         except (ValueError, AttributeError, github3.models.GitHubError):
119 |             print("Bad credentials. Try again.")
120 |             self.login()
121 | 
122 |     def prompt_2fa(self):
123 |         """
124 |         Taken from
125 |         http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html
126 |         Prompts a user for their 2FA code and returns it.
127 |         """
128 |         code = ""
129 |         while not code:
130 |             code = raw_input("Enter 2FA code: ")
131 |         return code
132 | 
133 |     def get_org(self, organization_name=""):
134 |         """
135 |         Retrieves an organization via given org name.
If given 136 | empty string, prompts user for an org name. 137 | """ 138 | self.organization_name = organization_name 139 | if organization_name == "": 140 | self.organization_name = raw_input("Organization: ") 141 | print("Getting organization.") 142 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 143 | 144 | def get_traffic(self): 145 | """ 146 | Retrieves the traffic for the repositories of the given organization. 147 | """ 148 | print("Getting traffic.") 149 | # Uses the developer API. Note this could change. 150 | headers = { 151 | "Accept": "application/vnd.github.spiderman-preview", 152 | "Authorization": "token " + self.token, 153 | } 154 | headers_release = {"Authorization": "token " + self.token} 155 | for repo in self.org_retrieved.iter_repos(type="public"): 156 | url = ( 157 | "https://api.github.com/repos/" 158 | + self.organization_name 159 | + "/" 160 | + repo.name 161 | ) 162 | self.get_referrers(url=url, headers=headers, repo_name=repo.name) 163 | self.get_paths(url=url, headers=headers) 164 | self.get_data( 165 | url=url, 166 | headers=headers, 167 | dict_to_store=self.views, 168 | type="views", 169 | repo_name=repo.name, 170 | ) 171 | self.get_data( 172 | url=url, 173 | headers=headers, 174 | dict_to_store=self.clones, 175 | type="clones", 176 | repo_name=repo.name, 177 | ) 178 | self.get_releases(url=url, headers=headers_release, repo_name=repo.name) 179 | 180 | def get_releases(self, url="", headers={}, repo_name=""): 181 | """ 182 | Retrieves the releases for the given repo in JSON. 183 | """ 184 | url_releases = url + "/releases" 185 | r = requests.get(url_releases, headers=headers) 186 | self.releases_json[repo_name] = r.json() 187 | 188 | def get_referrers(self, url="", headers={}, repo_name=""): 189 | """ 190 | Retrieves the total referrers and unique referrers of all repos in json 191 | and then stores it in a dict. 192 | """ 193 | # JSON 194 | url_referrers = url + "/traffic/popular/referrers" 195 | r1 = requests.get(url_referrers, headers=headers) 196 | referrers_json = r1.json() 197 | self.referrers_json[repo_name] = referrers_json 198 | # CSV 199 | for referrer in referrers_json: 200 | ref_name = referrer["referrer"] 201 | try: 202 | tuple_in = (referrer["count"], referrer["uniques"]) # curr vals 203 | tuple = ( 204 | self.referrers[ref_name][0] + tuple_in[0], # cal new vals 205 | self.referrers[ref_name][1] + tuple_in[1], 206 | ) 207 | self.referrers[ref_name] = tuple # record new vals 208 | except KeyError: 209 | tuple = self.referrers[ref_name] = ( 210 | referrer["count"], 211 | referrer["uniques"], 212 | ) 213 | self.referrers_lower[ref_name.lower()] = ref_name 214 | 215 | def get_paths(self, url="", headers={}): 216 | """ 217 | Retrieves the popular paths information in json and then stores it in a 218 | dict. 219 | """ 220 | url_paths = url + "/traffic/popular/paths" 221 | # r2 = requests.get(url_paths, headers=headers) 222 | # print 'PATHS ' + str(r2.json()) 223 | 224 | def get_data( 225 | self, 226 | url="", 227 | headers={}, 228 | date=str(datetime.date.today()), 229 | dict_to_store={}, 230 | type="", 231 | repo_name="", 232 | ): 233 | """ 234 | Retrieves data from json and stores it in the supplied dict. Accepts 235 | 'clones' or 'views' as type. 
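        Timestamps arrive from the API in milliseconds and are converted to
        seconds before being used as keys; today's (still partial) data is skipped.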
236 | """ 237 | # JSON 238 | url = url + "/traffic/" + type 239 | r3 = requests.get(url, headers=headers) 240 | json = r3.json() 241 | if type == "views": 242 | self.views_json[repo_name] = json 243 | elif type == "clones": 244 | self.clones_json[repo_name] = json 245 | # CSV 246 | for day in json[type]: 247 | timestamp_seconds = day["timestamp"] / 1000 248 | try: 249 | date_timestamp = datetime.datetime.utcfromtimestamp( 250 | timestamp_seconds 251 | ).strftime("%Y-%m-%d") 252 | # do not add todays date, some views might not be recorded yet 253 | if date_timestamp != date: 254 | tuple_in = (day["count"], day["uniques"]) 255 | tuple = ( 256 | dict_to_store[timestamp_seconds][0] + tuple_in[0], 257 | dict_to_store[timestamp_seconds][1] + tuple_in[1], 258 | ) 259 | dict_to_store[timestamp_seconds] = tuple 260 | except KeyError: 261 | tuple = dict_to_store[timestamp_seconds] = ( 262 | day["count"], 263 | day["uniques"], 264 | ) 265 | 266 | def write_json( 267 | self, 268 | date=(datetime.date.today()), 269 | organization="llnl", 270 | dict_to_write={}, 271 | path_ending_type="", 272 | ): 273 | """ 274 | Writes all traffic data to file in JSON form. 275 | """ 276 | for repo in dict_to_write: 277 | if len(dict_to_write[repo]) != 0: # don't need to write out empty lists 278 | path = ( 279 | "../github-data/" 280 | + organization 281 | + "/" 282 | + repo 283 | + "/" 284 | + path_ending_type 285 | + "/" 286 | + str(date) 287 | + ".json" 288 | ) 289 | self.checkDir(path) 290 | with open(path, "w") as out: 291 | out.write( 292 | json.dumps( 293 | dict_to_write[repo], 294 | sort_keys=True, 295 | indent=4, 296 | separators=(",", ": "), 297 | ) 298 | ) 299 | out.close() 300 | 301 | def write_to_file( 302 | self, 303 | referrers_file_path="", 304 | views_file_path="", 305 | clones_file_path="", 306 | date=(datetime.date.today()), 307 | organization="llnl", 308 | views_row_count=0, 309 | clones_row_count=0, 310 | ): 311 | """ 312 | Writes all traffic data to file. 313 | """ 314 | self.write_referrers_to_file(file_path=referrers_file_path) 315 | self.write_data_to_file( 316 | file_path=views_file_path, 317 | dict_to_write=self.views, 318 | name="views", 319 | row_count=views_row_count, 320 | ) 321 | self.write_data_to_file( 322 | file_path=clones_file_path, 323 | dict_to_write=self.clones, 324 | name="clones", 325 | row_count=clones_row_count, 326 | ) 327 | 328 | def check_data_redundancy(self, file_path="", dict_to_check={}): 329 | """ 330 | Checks the given csv file against the json data scraped for the given 331 | dict. It will remove all data retrieved that has already been recorded 332 | so we don't write redundant data to file. Returns count of rows from 333 | file. 334 | """ 335 | count = 0 336 | exists = os.path.isfile(file_path) 337 | previous_dates = {} 338 | if exists: 339 | with open(file_path, "r") as input: 340 | input.readline() # skip header line 341 | for row in csv.reader(input): 342 | timestamp = calendar.timegm(time.strptime(row[0], "%Y-%m-%d")) 343 | if timestamp in dict_to_check: # our date is already recorded 344 | del dict_to_check[timestamp] 345 | # calc current id max 346 | count += 1 347 | input.close() 348 | return count 349 | 350 | def write_data_to_file( 351 | self, 352 | file_path="", 353 | date=str(datetime.date.today()), 354 | organization="llnl", 355 | dict_to_write={}, 356 | name="", 357 | row_count=0, 358 | ): 359 | """ 360 | Writes given dict to file. 
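        Rows are appended as 'date,organization,<name>,unique_<name>,id', with
        row_count continuing the id sequence from rows already on disk.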
361 | """ 362 | exists = os.path.isfile(file_path) 363 | with open(file_path, "a") as out: 364 | if not exists: 365 | out.write("date,organization," + name + ",unique_" + name + ",id\n") 366 | sorted_dict = sorted(dict_to_write) 367 | for day in sorted_dict: 368 | day_formatted = datetime.datetime.utcfromtimestamp(day).strftime( 369 | "%Y-%m-%d" 370 | ) 371 | out.write( 372 | day_formatted 373 | + "," 374 | + organization 375 | + "," 376 | + str(dict_to_write[day][0]) 377 | + "," 378 | + str(dict_to_write[day][1]) 379 | + "," 380 | + str(row_count) 381 | + "\n" 382 | ) 383 | row_count += 1 384 | 385 | def write_referrers_to_file( 386 | self, file_path="", date=str(datetime.date.today()), organization="llnl" 387 | ): 388 | """ 389 | Writes the referrers data to file. 390 | """ 391 | self.remove_date(file_path=file_path, date=date) 392 | referrers_exists = os.path.isfile(file_path) 393 | with open(file_path, "a") as out: 394 | if not referrers_exists: 395 | out.write( 396 | "date,organization,referrer,count,count_log,uniques," 397 | + "uniques_logged\n" 398 | ) 399 | sorted_referrers = sorted(self.referrers_lower) # sort based on lowercase 400 | for referrer in sorted_referrers: 401 | ref_name = self.referrers_lower[referrer] # grab real name from 402 | count = self.referrers[ref_name][0] 403 | uniques = self.referrers[ref_name][1] 404 | if count == 1: # so we don't display 0 for count of 1 405 | count = 1.5 406 | if uniques == 1: 407 | uniques = 1.5 408 | count_logged = math.log(count) 409 | uniques_logged = math.log(uniques) 410 | out.write( 411 | date 412 | + "," 413 | + organization 414 | + "," 415 | + ref_name 416 | + "," 417 | + str(count) 418 | + "," 419 | + str(count_logged) 420 | + "," 421 | + str(uniques) 422 | + "," 423 | + str(uniques_logged) 424 | + "\n" 425 | ) 426 | out.close() 427 | 428 | def remove_date(self, file_path="", date=str(datetime.date.today())): 429 | """ 430 | Removes all rows of the associated date from the given csv file. 431 | Defaults to today. 432 | """ 433 | languages_exists = os.path.isfile(file_path) 434 | if languages_exists: 435 | with open(file_path, "rb") as inp, open("temp.csv", "wb") as out: 436 | writer = csv.writer(out) 437 | for row in csv.reader(inp): 438 | if row[0] != date: 439 | writer.writerow(row) 440 | inp.close() 441 | out.close() 442 | os.remove(file_path) 443 | os.rename("temp.csv", file_path) 444 | 445 | def checkDir(self, file_path=""): 446 | """ 447 | Checks if a directory exists. If not, it creates one with the specified 448 | file_path. 449 | """ 450 | if not os.path.exists(os.path.dirname(file_path)): 451 | try: 452 | os.makedirs(os.path.dirname(file_path)) 453 | except OSError as e: 454 | if e.errno != errno.EEXIST: 455 | raise 456 | 457 | 458 | if __name__ == "__main__": 459 | my_github = GitHub_Traffic() 460 | my_github.get_stats() 461 | -------------------------------------------------------------------------------- /scripts/get_users_emails.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | 4 | import github3 5 | 6 | 7 | class GitHub_Users_Emails: 8 | def __init__(self): 9 | self.emails = {} 10 | self.logins_lower = {} 11 | 12 | def get_stats(self, username="", password="", organization="llnl", force=True): 13 | """ 14 | Retrieves the emails for the users of the given organization. 
15 | """ 16 | file_path = "../github_stats_output/users_emails.csv" 17 | if force or not os.path.isfile(file_path): 18 | my_github.login(username, password) 19 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 20 | print("Rate Limit: " + str(calls_beginning)) 21 | my_github.get_org(organization) 22 | count_members = my_github.get_mems_of_org() 23 | my_github.write_to_file(file_path) 24 | calls_remaining = self.logged_in_gh.ratelimit_remaining 25 | calls_used = calls_beginning - calls_remaining 26 | print( 27 | "Rate Limit Remaining: " 28 | + str(calls_remaining) 29 | + "\nUsed " 30 | + str(calls_used) 31 | + " API calls." 32 | ) 33 | 34 | def login(self, username="", password=""): 35 | """ 36 | Performs a login and sets the Github object via given credentials. If 37 | credentials are empty or incorrect then prompts user for credentials. 38 | Stores the authentication token in a CREDENTIALS_FILE used for future 39 | logins. Handles Two Factor Authentication. 40 | """ 41 | try: 42 | token = "" 43 | id = "" 44 | if not os.path.isfile("CREDENTIALS_FILE"): 45 | if username == "" or password == "": 46 | username = raw_input("Username: ") 47 | password = getpass.getpass("Password: ") 48 | note = "GitHub Organization Stats App" 49 | note_url = "http://software.llnl.gov/" 50 | scopes = ["user", "repo"] 51 | auth = github3.authorize( 52 | username, 53 | password, 54 | scopes, 55 | note, 56 | note_url, 57 | two_factor_callback=self.prompt_2fa, 58 | ) 59 | token = auth.token 60 | id = auth.id 61 | with open("CREDENTIALS_FILE", "w+") as fd: 62 | fd.write(token + "\n") 63 | fd.write(str(id)) 64 | fd.close() 65 | else: 66 | with open("CREDENTIALS_FILE", "r") as fd: 67 | token = fd.readline().strip() 68 | id = fd.readline().strip() 69 | fd.close() 70 | print("Logging in.") 71 | self.logged_in_gh = github3.login( 72 | token=token, two_factor_callback=self.prompt_2fa 73 | ) 74 | self.logged_in_gh.user().to_json() 75 | except (ValueError, AttributeError, github3.models.GitHubError): 76 | print("Bad credentials. Try again.") 77 | self.login() 78 | 79 | def prompt_2fa(self): 80 | """ 81 | Taken from 82 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 83 | Prompts a user for their 2FA code and returns it. 84 | """ 85 | code = "" 86 | while not code: 87 | code = raw_input("Enter 2FA code: ") 88 | return code 89 | 90 | def get_org(self, organization_name=""): 91 | """ 92 | Retrieves an organization via given org name. If given 93 | empty string, prompts user for an org name. 94 | """ 95 | if organization_name == "": 96 | organization_name = raw_input("Organization: ") 97 | print("Getting organization.") 98 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 99 | 100 | def get_mems_of_org(self): 101 | """ 102 | Retrieves the emails of the members of the organization. Note this Only 103 | gets public emails. Private emails would need authentication for each 104 | user. 105 | """ 106 | print("Getting members' emails.") 107 | for member in self.org_retrieved.iter_members(): 108 | login = member.to_json()["login"] 109 | user_email = self.logged_in_gh.user(login).to_json()["email"] 110 | if user_email is not None: 111 | self.emails[login] = user_email 112 | else: # user has no public email 113 | self.emails[login] = "none" 114 | # used for sorting regardless of case 115 | self.logins_lower[login.lower()] = login 116 | 117 | def write_to_file(self, file_path=""): 118 | """ 119 | Writes the user emails to file. 
120 | """ 121 | with open(file_path, "w+") as out: 122 | out.write("user, email\n") 123 | sorted_names = sorted(self.logins_lower) # sort based on lowercase 124 | for login in sorted_names: 125 | out.write( 126 | self.logins_lower[login] 127 | + "," 128 | + self.emails[self.logins_lower[login]] 129 | + "\n" 130 | ) 131 | out.close() 132 | 133 | 134 | if __name__ == "__main__": 135 | my_github = GitHub_Users_Emails() 136 | my_github.get_stats() 137 | -------------------------------------------------------------------------------- /scripts/get_year_commits.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import getpass 3 | import os 4 | import time 5 | 6 | import github3 7 | 8 | 9 | class GitHub_LLNL_Year_Commits: 10 | def __init__(self): 11 | self.commits_dict_list = [] 12 | self.commits = {} 13 | self.sorted_weeks = [] 14 | 15 | def get_year_commits( 16 | self, username="", password="", organization="llnl", force=True 17 | ): 18 | """ 19 | Does setup such as login, printing API info, and waiting for GitHub to 20 | build the commit statistics. Then gets the last year of commits and 21 | prints them to file. 22 | """ 23 | file_path = "year_commits.csv" 24 | if force or not os.path.isfile(file_path): 25 | my_github.login(username, password) 26 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 27 | print("Rate Limit: " + str(calls_beginning)) 28 | my_github.get_org(organization) 29 | my_github.repos(building_stats=True) 30 | print("Letting GitHub build statistics.") 31 | time.sleep(30) 32 | print("Trying again.") 33 | my_github.repos(building_stats=False) 34 | my_github.calc_total_commits(starting_commits=35163) 35 | my_github.write_to_file() 36 | calls_remaining = self.logged_in_gh.ratelimit_remaining 37 | calls_used = calls_beginning - calls_remaining 38 | print( 39 | "Rate Limit Remaining: " 40 | + str(calls_remaining) 41 | + "\nUsed " 42 | + str(calls_used) 43 | + " API calls." 44 | ) 45 | 46 | def login(self, username="", password=""): 47 | """ 48 | Performs a login and sets the Github object via given credentials. If 49 | credentials are empty or incorrect then prompts user for credentials. 50 | Stores the authentication token in a CREDENTIALS_FILE used for future 51 | logins. Handles Two Factor Authentication. 52 | """ 53 | try: 54 | token = "" 55 | id = "" 56 | if not os.path.isfile("CREDENTIALS_FILE"): 57 | if username == "" or password == "": 58 | username = raw_input("Username: ") 59 | password = getpass.getpass("Password: ") 60 | note = "GitHub Organization Stats App" 61 | note_url = "http://software.llnl.gov/" 62 | scopes = ["user", "repo"] 63 | auth = github3.authorize( 64 | username, 65 | password, 66 | scopes, 67 | note, 68 | note_url, 69 | two_factor_callback=self.prompt_2fa, 70 | ) 71 | token = auth.token 72 | id = auth.id 73 | with open("CREDENTIALS_FILE", "w+") as fd: 74 | fd.write(token + "\n") 75 | fd.write(str(id)) 76 | fd.close() 77 | else: 78 | with open("CREDENTIALS_FILE", "r") as fd: 79 | token = fd.readline().strip() 80 | id = fd.readline().strip() 81 | fd.close() 82 | print("Logging in.") 83 | self.logged_in_gh = github3.login( 84 | token=token, two_factor_callback=self.prompt_2fa 85 | ) 86 | self.logged_in_gh.user().to_json() 87 | except (ValueError, AttributeError, github3.models.GitHubError): 88 | print("Bad credentials. 
Try again.") 89 | self.login() 90 | 91 | def prompt_2fa(self): 92 | """ 93 | Taken from 94 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 95 | Prompts a user for their 2FA code and returns it. 96 | """ 97 | code = "" 98 | while not code: 99 | code = raw_input("Enter 2FA code: ") 100 | return code 101 | 102 | def get_org(self, organization_name=""): 103 | """ 104 | Retrieves an organization via given org name. If given 105 | empty string, prompts user for an org name. 106 | """ 107 | if organization_name == "": 108 | organization_name = raw_input("Organization: ") 109 | print("Getting organization.") 110 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 111 | 112 | def repos(self, building_stats=False): 113 | """ 114 | Retrieves the last year of commits for the organization and stores them 115 | in weeks (UNIX time) associated with number of commits that week. 116 | """ 117 | print("Getting repos.") 118 | for repo in self.org_retrieved.iter_repos(): 119 | for activity in repo.iter_commit_activity(): 120 | if not building_stats: 121 | self.commits_dict_list.append(activity) 122 | 123 | def calc_total_commits(self, starting_commits=0): 124 | """ 125 | Uses the weekly commits and traverses back through the last 126 | year, each week subtracting the weekly commits and storing them. It 127 | needs an initial starting commits number, which should be taken from 128 | the most up to date number from github_stats.py output. 129 | """ 130 | for week_of_commits in self.commits_dict_list: 131 | try: 132 | self.commits[week_of_commits["week"]] -= week_of_commits["total"] 133 | except KeyError: 134 | total = self.commits[week_of_commits["week"]] = -week_of_commits[ 135 | "total" 136 | ] 137 | self.sorted_weeks = sorted(self.commits) 138 | 139 | # reverse because lower numbered weeks are older in time. 140 | # we traverse from most recent to oldest 141 | for week in reversed(self.sorted_weeks): 142 | self.commits[week] = self.commits[week] + starting_commits 143 | starting_commits = self.commits[week] 144 | 145 | def write_to_file(self): 146 | """ 147 | Writes the weeks with associated commits to file. 
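        The other columns are zero-filled placeholders; only date, organization,
        and the cumulative commit count are meaningful, and consecutive
        duplicate totals are skipped.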
148 | """ 149 | with open("../github_stats_output/last_year_commits.csv", "w+") as output: 150 | output.write( 151 | "date,organization,repos,members,teams," 152 | + "unique_contributors,total_contributors,forks," 153 | + "stargazers,pull_requests,open_issues,has_readme," 154 | + "has_license,pull_requests_open,pull_requests_closed," 155 | + "commits\n" 156 | ) 157 | # no reverse this time to print oldest first 158 | previous_commits = 0 159 | for week in self.sorted_weeks: 160 | if str(self.commits[week]) != previous_commits: # delete dups 161 | week_formatted = datetime.datetime.utcfromtimestamp(week).strftime( 162 | "%Y-%m-%d" 163 | ) 164 | output.write( 165 | week_formatted 166 | + ",llnl,0,0,0,0,0,0,0,0,0,0,0,0,0," 167 | + str(self.commits[week]) 168 | + "\n" 169 | ) 170 | previous_commits = str(self.commits[week]) 171 | 172 | 173 | if __name__ == "__main__": 174 | my_github = GitHub_LLNL_Year_Commits() 175 | my_github.get_year_commits() 176 | -------------------------------------------------------------------------------- /scripts/github_stats.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import csv 3 | import datetime 4 | import errno 5 | import getpass 6 | import json 7 | import math 8 | import os 9 | import time 10 | 11 | import github3 12 | import my_repo 13 | 14 | 15 | class GitHub_LLNL_Stats: 16 | def __init__(self): 17 | print("Initalizing.") 18 | self.unique_contributors = defaultdict(list) 19 | self.languages = {} 20 | self.languages_size = {} 21 | self.all_repos = [] 22 | self.total_repos = 0 23 | self.total_contributors = 0 24 | self.total_forks = 0 25 | self.total_stars = 0 26 | self.total_pull_reqs = 0 27 | self.total_pull_reqs_open = 0 28 | self.total_pull_reqs_closed = 0 29 | self.total_open_issues = 0 30 | self.total_closed_issues = 0 31 | self.total_issues = 0 32 | self.total_readmes = 0 33 | self.total_licenses = 0 34 | self.total_commits = 0 35 | self.search_limit = 0 36 | self.previous_language = "" 37 | 38 | # JSON vars 39 | self.repos_json = {} 40 | self.members_json = {} 41 | self.teams_json = {} 42 | self.contributors_json = defaultdict(list) 43 | self.pull_requests_json = defaultdict(list) 44 | self.issues_json = defaultdict(list) 45 | self.languages_json = defaultdict(dict) 46 | self.commits_json = defaultdict(list) 47 | 48 | def get_stats( 49 | self, 50 | username="", 51 | password="", 52 | organization="llnl", 53 | force=True, 54 | repo_type="public", 55 | ): 56 | """ 57 | Retrieves the statistics from the given organization with the given 58 | credentials. Will not retreive data if file exists and force hasn't been 59 | set to True. This is to save GH API requests. 
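        Output is written to ../github_stats_output/YYYY/YYYY-MM/YYYY-MM-DD.csv,
        with JSON snapshots stored under ../github-data/.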
60 | """ 61 | date = str(datetime.date.today()) 62 | file_path = ( 63 | "../github_stats_output/" + date[:4] + "/" + date[:7] + "/" + date + ".csv" 64 | ) 65 | if force or not os.path.isfile(file_path): 66 | my_github.login(username, password) 67 | calls_beginning = self.logged_in_gh.ratelimit_remaining + 1 68 | print("Rate Limit: " + str(calls_beginning)) 69 | my_github.get_org(organization) 70 | count_members = my_github.get_mems_of_org() 71 | count_teams = my_github.get_teams_of_org() 72 | my_github.repos(repo_type=repo_type, organization=organization) 73 | # Write JSON 74 | my_github.write_org_json( 75 | dict_to_write=self.members_json, 76 | path_ending_type="members", 77 | is_list=True, 78 | ) 79 | my_github.write_org_json( 80 | dict_to_write={"singleton": self.org_retrieved.to_json()}, 81 | path_ending_type="organization", 82 | ) 83 | my_github.write_org_json( 84 | dict_to_write=self.teams_json, path_ending_type="teams", is_list=True 85 | ) 86 | 87 | my_github.write_repo_json( 88 | dict_to_write=self.repos_json, path_ending_type="repo" 89 | ) 90 | my_github.write_repo_json( 91 | dict_to_write=self.contributors_json, 92 | path_ending_type="contributors", 93 | is_list=True, 94 | ) 95 | my_github.write_repo_json( 96 | dict_to_write=self.pull_requests_json, 97 | path_ending_type="pull-requests", 98 | is_list=True, 99 | ) 100 | my_github.write_repo_json( 101 | dict_to_write=self.issues_json, path_ending_type="issues", is_list=True 102 | ) 103 | my_github.write_repo_json( 104 | dict_to_write=self.languages_json, 105 | path_ending_type="languages", 106 | is_dict=True, 107 | ) 108 | my_github.write_repo_json( 109 | dict_to_write=self.commits_json, 110 | path_ending_type="commits", 111 | is_list=True, 112 | ) 113 | # Write CSV 114 | my_github.write_to_file( 115 | file_path, date, organization, count_members, count_teams 116 | ) 117 | calls_remaining = self.logged_in_gh.ratelimit_remaining 118 | calls_used = calls_beginning - calls_remaining 119 | print( 120 | "Rate Limit Remaining: " 121 | + str(calls_remaining) 122 | + "\nUsed " 123 | + str(calls_used) 124 | + " API calls." 125 | ) 126 | 127 | def login(self, username="", password=""): 128 | """ 129 | Performs a login and sets the Github object via given credentials. If 130 | credentials are empty or incorrect then prompts user for credentials. 131 | Stores the authentication token in a CREDENTIALS_FILE used for future 132 | logins. Handles Two Factor Authentication. 
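        The CREDENTIALS_FILE stores the token on its first line and the
        authorization id on its second.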
133 | """ 134 | try: 135 | self.token = "" 136 | id = "" 137 | if not os.path.isfile("CREDENTIALS_FILE"): 138 | if username == "" or password == "": 139 | username = raw_input("Username: ") 140 | password = getpass.getpass("Password: ") 141 | note = "GitHub Organization Stats App" 142 | note_url = "http://software.llnl.gov/" 143 | scopes = ["user", "repo"] 144 | auth = github3.authorize( 145 | username, 146 | password, 147 | scopes, 148 | note, 149 | note_url, 150 | two_factor_callback=self.prompt_2fa, 151 | ) 152 | self.token = auth.token 153 | id = auth.id 154 | with open("CREDENTIALS_FILE", "w+") as fd: 155 | fd.write(self.token + "\n") 156 | fd.write(str(id)) 157 | fd.close() 158 | else: 159 | with open("CREDENTIALS_FILE", "r") as fd: 160 | self.token = fd.readline().strip() 161 | id = fd.readline().strip() 162 | fd.close() 163 | print("Logging in.") 164 | self.logged_in_gh = github3.login( 165 | token=self.token, two_factor_callback=self.prompt_2fa 166 | ) 167 | self.logged_in_gh.user().to_json() 168 | except (ValueError, AttributeError, github3.models.GitHubError): 169 | print("Bad credentials. Try again.") 170 | self.login() 171 | 172 | def prompt_2fa(self): 173 | """ 174 | Taken from 175 | http://github3py.readthedocs.io/en/master/examples/two_factor_auth.html 176 | Prompts a user for their 2FA code and returns it. 177 | """ 178 | code = "" 179 | while not code: 180 | code = raw_input("Enter 2FA code: ") 181 | return code 182 | 183 | def get_org(self, organization_name=""): 184 | """ 185 | Retrieves an organization via given org name. If given 186 | empty string, prompts user for an org name. 187 | """ 188 | if organization_name == "": 189 | organization_name = raw_input("Organization: ") 190 | print("Getting organization.") 191 | self.org_retrieved = self.logged_in_gh.organization(organization_name) 192 | 193 | def get_mems_of_org(self): 194 | """ 195 | Retrieves the number of members of the organization. 196 | """ 197 | print("Getting members.") 198 | counter = 0 199 | for member in self.org_retrieved.iter_members(): 200 | self.members_json[member.id] = member.to_json() 201 | counter += 1 202 | return counter 203 | 204 | def get_teams_of_org(self): 205 | """ 206 | Retrieves the number of teams of the organization. 207 | """ 208 | print("Getting teams.") 209 | counter = 0 210 | for team in self.org_retrieved.iter_teams(): 211 | self.teams_json[team.id] = team.to_json() 212 | counter += 1 213 | return counter 214 | 215 | def repos(self, repo_type="public", organization="llnl"): 216 | """ 217 | Retrieves info about the repos of the current organization. 
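        For each repository this collects contributors, forks, stargazers, pull
        requests, issues, languages, README status, and commit counts,
        accumulating organization-wide totals along the way.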
218 | """ 219 | print("Getting repos.") 220 | for repo in self.org_retrieved.iter_repos(type=repo_type): 221 | # JSON 222 | json = repo.to_json() 223 | self.repos_json[repo.name] = json 224 | # CSV 225 | temp_repo = my_repo.My_Repo() 226 | temp_repo.name = repo.full_name 227 | self.total_repos += 1 228 | temp_repo.contributors = my_github.get_total_contributors(repo) 229 | self.total_contributors += temp_repo.contributors 230 | temp_repo.forks = repo.forks_count 231 | self.total_forks += temp_repo.forks 232 | temp_repo.stargazers = repo.stargazers 233 | self.total_stars += temp_repo.stargazers 234 | ( 235 | temp_repo.pull_requests_open, 236 | temp_repo.pull_requests_closed, 237 | ) = my_github.get_pull_reqs(repo) 238 | temp_repo.pull_requests = ( 239 | temp_repo.pull_requests_open + temp_repo.pull_requests_closed 240 | ) 241 | self.total_pull_reqs += temp_repo.pull_requests_open 242 | self.total_pull_reqs += temp_repo.pull_requests_closed 243 | self.total_pull_reqs_open += temp_repo.pull_requests_open 244 | self.total_pull_reqs_closed += temp_repo.pull_requests_closed 245 | temp_repo.open_issues = repo.open_issues_count 246 | self.total_open_issues += temp_repo.open_issues 247 | temp_repo.closed_issues = my_github.get_issues( 248 | repo, organization=organization 249 | ) 250 | temp_repo.issues = temp_repo.closed_issues + temp_repo.open_issues 251 | self.total_closed_issues += temp_repo.closed_issues 252 | self.total_issues += temp_repo.issues 253 | my_github.get_languages(repo, temp_repo) 254 | temp_repo.readme = my_github.get_readme(repo) 255 | # temp_repo.license = my_github.get_license(repo) 256 | temp_repo.commits = self.get_commits(repo=repo, organization=organization) 257 | self.total_commits += temp_repo.commits 258 | self.all_repos.append(temp_repo) 259 | 260 | def get_total_contributors(self, repo): 261 | """ 262 | Retrieves the number of contributors to a repo in the organization. 263 | Also adds to unique contributor list. 264 | """ 265 | repo_contributors = 0 266 | for contributor in repo.iter_contributors(): 267 | repo_contributors += 1 268 | self.unique_contributors[contributor.id].append(repo.name) 269 | self.contributors_json[repo.name].append(contributor.to_json()) 270 | return repo_contributors 271 | 272 | def get_pull_reqs(self, repo): 273 | """ 274 | Retrieves the number of pull requests on a repo in the organization. 275 | """ 276 | pull_reqs_open = 0 277 | pull_reqs_closed = 0 278 | for pull_request in repo.iter_pulls(state="all"): 279 | self.pull_requests_json[repo.name].append(pull_request.to_json()) 280 | if pull_request.closed_at is not None: 281 | pull_reqs_closed += 1 282 | else: 283 | pull_reqs_open += 1 284 | return pull_reqs_open, pull_reqs_closed 285 | 286 | def get_issues(self, repo, organization="llnl"): 287 | """ 288 | Retrieves the number of closed issues. 289 | """ 290 | # JSON 291 | path = "../github-data/" + organization + "/" + repo.name + "/issues" 292 | is_only_today = False 293 | if not os.path.exists(path): # no previous path, get all issues 294 | all_issues = repo.iter_issues(state="all") 295 | is_only_today = True 296 | else: 297 | files = os.listdir(path) 298 | date = str(files[-1][:-5]) 299 | if date == str(datetime.date.today()): 300 | # most recent date is actually today, get previous most recent date 301 | if len(files) > 2: 302 | date = str(files[-2][:-5]) 303 | else: 304 | # This means there is only one file, today. 
Retrieve every issue 305 | all_issues = repo.iter_issues(state="all") 306 | is_only_today = True 307 | if not is_only_today: # there's a previous saved JSON that's not today 308 | all_issues = repo.iter_issues(since=date, state="all") 309 | for issue in all_issues: 310 | self.issues_json[repo.name].append(issue.to_json()) 311 | # CSV 312 | closed_issues = 0 313 | for issue in repo.iter_issues(state="closed"): 314 | if issue is not None: 315 | closed_issues += 1 316 | return closed_issues 317 | 318 | def get_languages(self, repo, temp_repo): 319 | """ 320 | Retrieves the languages used in the repo and increments the respective 321 | counts of those languages. Only increments languages that have names. 322 | Anything else is not incremented (i.e. numbers). 323 | """ 324 | try: 325 | self.languages[repo.language] += 1 326 | except KeyError: 327 | count = self.languages[repo.language] = 1 328 | for repo_languages in repo.iter_languages(): 329 | self.languages_json[repo.name][repo_languages[0]] = repo_languages[1] 330 | for language in repo_languages: 331 | if isinstance(language, basestring): # is language 332 | temp_repo.languages.append(language) 333 | self.previous_language = language 334 | else: # record size bytes of language 335 | try: 336 | self.languages_size[self.previous_language] += language 337 | except KeyError: 338 | size = self.languages_size[self.previous_language] = language 339 | 340 | def get_readme(self, repo): 341 | """ 342 | Checks to see if the given repo has a ReadMe. MD means it has a correct 343 | Readme recognized by GitHub. 344 | """ 345 | readme_contents = repo.readme() 346 | if readme_contents is not None: 347 | self.total_readmes += 1 348 | return "MD" 349 | if self.search_limit >= 28: 350 | print("Hit search limit. Sleeping for 60 sec.") 351 | time.sleep(60) 352 | self.search_limit = 0 353 | self.search_limit += 1 354 | search_results = self.logged_in_gh.search_code( 355 | "readme" + "in:path repo:" + repo.full_name 356 | ) 357 | try: 358 | for result in search_results: 359 | path = result.path[1:] 360 | if "/" not in path and "readme" in path.lower(): 361 | self.total_readmes += 1 362 | return path 363 | return "MISS" 364 | except (github3.models.GitHubError, StopIteration): 365 | return "MISS" 366 | 367 | def get_license(self, repo): 368 | """ 369 | Checks to see if the given repo has a top level LICENSE file. 370 | """ 371 | if self.search_limit >= 28: 372 | print("Hit search limit. Sleeping for 60 sec.") 373 | time.sleep(60) 374 | self.search_limit = 0 375 | self.search_limit += 1 376 | search_results = self.logged_in_gh.search_code( 377 | "license" + "in:path repo:" + repo.full_name 378 | ) 379 | try: 380 | for result in search_results: 381 | path = result.path[1:] 382 | if "/" not in path and "license" in path.lower(): 383 | self.total_licenses += 1 384 | return path 385 | return "MISS" 386 | except StopIteration: 387 | return "MISS" 388 | 389 | def get_commits(self, repo, organization="llnl"): 390 | """ 391 | Retrieves the number of commits to a repo in the organization. If it is 392 | the first time getting commits for a repo, it will get all commits and 393 | save them to JSON. If there are previous commits saved, it will only get 394 | commits that have not been saved to disk since the last date of commits. 
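        The date of the last saved snapshot is inferred from the JSON filenames
        under ../github-data/<organization>/<repo>/commits.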
395 | """ 396 | # JSON 397 | path = "../github-data/" + organization + "/" + repo.name + "/commits" 398 | is_only_today = False 399 | if not os.path.exists(path): # no previous path, get all commits 400 | all_commits = repo.iter_commits() 401 | is_only_today = True 402 | else: 403 | files = os.listdir(path) 404 | date = str(files[-1][:-5]) 405 | if date == str(datetime.date.today()): 406 | # most recent date is actually today, get previous most recent date 407 | if len(files) > 2: 408 | date = str(files[-2][:-5]) 409 | else: 410 | # This means there is only one file, today. Retrieve every commit 411 | all_commits = repo.iter_commits() 412 | is_only_today = True 413 | if not is_only_today: # there's a previous saved JSON that's not today 414 | all_commits = repo.iter_commits(since=date) 415 | for commit in all_commits: 416 | self.commits_json[repo.name].append(commit.to_json()) 417 | # for csv 418 | count = 0 419 | for commit in repo.iter_commits(): 420 | count += 1 421 | return count 422 | 423 | def write_org_json( 424 | self, 425 | date=(datetime.date.today()), 426 | organization="llnl", 427 | dict_to_write={}, 428 | path_ending_type="", 429 | is_list=False, 430 | ): 431 | """ 432 | Writes stats from the organization to JSON. 433 | """ 434 | path = ( 435 | "../github-data/" 436 | + organization 437 | + "-org/" 438 | + path_ending_type 439 | + "/" 440 | + str(date) 441 | + ".json" 442 | ) 443 | self.checkDir(path) 444 | with open(path, "w") as out_clear: # clear old data 445 | out_clear.close() 446 | with open(path, "a") as out: 447 | if is_list: # used for list of items 448 | out.write("[") 449 | for item in dict_to_write: 450 | out.write( 451 | json.dumps( 452 | dict_to_write[item], 453 | sort_keys=True, 454 | indent=4, 455 | separators=(",", ": "), 456 | ) 457 | + "," 458 | ) 459 | out.seek(-1, os.SEEK_END) # kill last comma 460 | out.truncate() 461 | if is_list: 462 | out.write("]") 463 | out.close() 464 | 465 | def write_repo_json( 466 | self, 467 | date=(datetime.date.today()), 468 | organization="llnl", 469 | dict_to_write={}, 470 | path_ending_type="", 471 | is_list=False, 472 | is_dict=False, 473 | ): 474 | """ 475 | #Writes repo specific data to JSON. 476 | """ 477 | for repo in dict_to_write: 478 | path = ( 479 | "../github-data/" 480 | + organization 481 | + "/" 482 | + repo 483 | + "/" 484 | + path_ending_type 485 | + "/" 486 | + str(date) 487 | + ".json" 488 | ) 489 | self.checkDir(path) 490 | with open(path, "w") as out: 491 | if is_list: 492 | out.write("[") 493 | for value in dict_to_write[repo]: 494 | if is_dict: 495 | for inner_dict in value: 496 | out.write( 497 | json.dumps( 498 | inner_dict, 499 | sort_keys=True, 500 | indent=4, 501 | separators=(",", ": "), 502 | ) 503 | + "," 504 | ) 505 | else: 506 | out.write( 507 | json.dumps( 508 | value, 509 | sort_keys=True, 510 | indent=4, 511 | separators=(",", ": "), 512 | ) 513 | + "," 514 | ) 515 | out.seek(-1, os.SEEK_END) # kill last comma 516 | out.truncate() 517 | out.write("]") 518 | else: 519 | out.write( 520 | json.dumps( 521 | dict_to_write[repo], 522 | sort_keys=True, 523 | indent=4, 524 | separators=(",", ": "), 525 | ) 526 | ) 527 | out.close() 528 | 529 | def write_to_file( 530 | self, 531 | file_path="", 532 | date=str(datetime.date.today()), 533 | organization="N/A", 534 | members=0, 535 | teams=0, 536 | ): 537 | """ 538 | Writes the current organization information to file (csv). 
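        The first data row holds the organization-level counts, followed by one
        row per repository and a closing totals row.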
539 | """ 540 | self.checkDir(file_path) 541 | with open(file_path, "w+") as output: 542 | output.write( 543 | "date,organization,members,teams,unique_contributors," 544 | + "repository,contributors,forks,stargazers,pull_requests," 545 | + "open_issues,has_readme,has_license,languages,pull_requests_open," 546 | + "pull_requests_closed,commits,closed_issues,issues\n" 547 | + date 548 | + "," 549 | + organization 550 | + "," 551 | + str(members) 552 | + "," 553 | + str(teams) 554 | + "," 555 | + str(len(self.unique_contributors)) 556 | + "\n" 557 | ) 558 | for repo in self.all_repos: 559 | output.write( 560 | ",,,,," 561 | + repo.name 562 | + "," 563 | + str(repo.contributors) 564 | + "," 565 | + str(repo.forks) 566 | + "," 567 | + str(repo.stargazers) 568 | + "," 569 | + str(repo.pull_requests) 570 | + "," 571 | + str(repo.open_issues) 572 | + "," 573 | + str(repo.readme) 574 | + "," 575 | + str(repo.license) 576 | + "," 577 | + " ".join(sorted(repo.languages)) 578 | + "," 579 | + str(repo.pull_requests_open) 580 | + "," 581 | + str(repo.pull_requests_closed) 582 | + "," 583 | + str(repo.commits) 584 | + "," 585 | + str(repo.closed_issues) 586 | + "," 587 | + str(repo.issues) 588 | + "\n" 589 | ) 590 | output.write( 591 | ",,,,total," 592 | + str(self.total_repos) 593 | + "," 594 | + str(self.total_contributors) 595 | + "," 596 | + str(self.total_forks) 597 | + "," 598 | + str(self.total_stars) 599 | + "," 600 | + str(self.total_pull_reqs) 601 | + "," 602 | + str(self.total_open_issues) 603 | + "," 604 | + str(self.total_readmes) 605 | + "," 606 | + str(self.total_licenses) 607 | + ",," 608 | + str(self.total_pull_reqs_open) 609 | + "," 610 | + str(self.total_pull_reqs_closed) 611 | + "," 612 | + str(self.total_commits) 613 | + "," 614 | + str(self.total_closed_issues) 615 | + "," 616 | + str(self.total_issues) 617 | ) 618 | output.close() 619 | # Update total 620 | self.write_totals( 621 | file_path="../github_stats_output/total.csv", 622 | date=date, 623 | organization=organization, 624 | members=members, 625 | teams=teams, 626 | ) 627 | # Update language sizes 628 | self.write_languages( 629 | file_path="../github_stats_output/languages.csv", date=date 630 | ) 631 | 632 | def write_totals( 633 | self, 634 | file_path="", 635 | date=str(datetime.date.today()), 636 | organization="N/A", 637 | members=0, 638 | teams=0, 639 | ): 640 | """ 641 | Updates the total.csv file with current data. 
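        Any existing row for the same date is removed first (via
        delete_last_line) so the script can run more than once a day without
        duplicating entries.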
642 | """ 643 | 644 | total_exists = os.path.isfile(file_path) 645 | with open(file_path, "a") as out_total: 646 | if not total_exists: 647 | out_total.write( 648 | "date,organization,repos,members,teams," 649 | + "unique_contributors,total_contributors,forks," 650 | + "stargazers,pull_requests,open_issues,has_readme," 651 | + "has_license,pull_requests_open,pull_requests_closed," 652 | + "commits,id,closed_issues,issues\n" 653 | ) 654 | self.delete_last_line(date=date, file_path=file_path) 655 | out_total.close() 656 | with open(file_path, "r") as file_read: 657 | row_count = sum(1 for row in file_read) - 1 658 | file_read.close() 659 | with open(file_path, "a") as out_total: 660 | out_total.write( 661 | date 662 | + "," 663 | + organization 664 | + "," 665 | + str(self.total_repos) 666 | + "," 667 | + str(members) 668 | + "," 669 | + str(teams) 670 | + "," 671 | + str(len(self.unique_contributors)) 672 | + "," 673 | + str(self.total_contributors) 674 | + "," 675 | + str(self.total_forks) 676 | + "," 677 | + str(self.total_stars) 678 | + "," 679 | + str(self.total_pull_reqs) 680 | + "," 681 | + str(self.total_open_issues) 682 | + "," 683 | + str(self.total_readmes) 684 | + "," 685 | + str(self.total_licenses) 686 | + "," 687 | + str(self.total_pull_reqs_open) 688 | + "," 689 | + str(self.total_pull_reqs_closed) 690 | + "," 691 | + str(self.total_commits) 692 | + "," 693 | + str(row_count) 694 | + "," 695 | + str(self.total_closed_issues) 696 | + "," 697 | + str(self.total_issues) 698 | + "\n" 699 | ) 700 | out_total.close() 701 | 702 | def write_languages(self, file_path="", date=str(datetime.date.today())): 703 | """ 704 | Updates languages.csv file with current data. 705 | """ 706 | self.remove_date(file_path=file_path, date=date) 707 | languages_exists = os.path.isfile(file_path) 708 | with open(file_path, "a") as out_languages: 709 | if not languages_exists: 710 | out_languages.write("date,language,count,size,size_log\n") 711 | languages_sorted = sorted(self.languages_size) 712 | # self.delete_last_line(date=date, file_path=file_path) 713 | for language in languages_sorted: 714 | try: 715 | out_languages.write( 716 | date 717 | + "," 718 | + language 719 | + "," 720 | + str(self.languages[language]) 721 | + "," 722 | + str(self.languages_size[language]) 723 | + "," 724 | + str(math.log10(int(self.languages_size[language]))) 725 | + "\n" 726 | ) 727 | except (TypeError, KeyError): 728 | out_languages.write( 729 | date 730 | + "," 731 | + language 732 | + "," 733 | + str(0) 734 | + "," 735 | + str(self.languages_size[language]) 736 | + "," 737 | + str(math.log10(int(self.languages_size[language]))) 738 | + "\n" 739 | ) 740 | 741 | def checkDir(self, file_path=""): 742 | """ 743 | Checks if a directory exists. If not, it creates one with the specified 744 | file_path. 745 | """ 746 | if not os.path.exists(os.path.dirname(file_path)): 747 | try: 748 | os.makedirs(os.path.dirname(file_path)) 749 | except OSError as e: 750 | if e.errno != errno.EEXIST: 751 | raise 752 | 753 | def remove_date(self, file_path="", date=str(datetime.date.today())): 754 | """ 755 | Removes all rows of the associated date from the given csv file. 756 | Defaults to today. 
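        Rows are streamed through temp.csv, which then replaces the original
        file.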
757 | """ 758 | languages_exists = os.path.isfile(file_path) 759 | if languages_exists: 760 | with open(file_path, "rb") as inp, open("temp.csv", "wb") as out: 761 | writer = csv.writer(out) 762 | for row in csv.reader(inp): 763 | if row[0] != date: 764 | writer.writerow(row) 765 | inp.close() 766 | out.close() 767 | os.remove(file_path) 768 | os.rename("temp.csv", file_path) 769 | 770 | def delete_last_line(self, file_path="", date=str(datetime.date.today())): 771 | """ 772 | The following code was modified from 773 | http://stackoverflow.com/a/10289740 & 774 | http://stackoverflow.com/a/17309010 775 | It essentially will check if the total for the current date already 776 | exists in total.csv. If it does, it just removes the last line. 777 | This is so the script could be run more than once a day and not 778 | create many entries in the total.csv file for the same date. 779 | """ 780 | deleted_line = False 781 | if os.path.isfile(file_path): 782 | with open(file_path, "r+") as file: 783 | reader = csv.reader(file, delimiter=",") 784 | for row in reader: 785 | if date == row[0]: 786 | file.seek(0, os.SEEK_END) 787 | pos = file.tell() - 1 788 | while pos > 0 and file.read(1) != "\n": 789 | pos -= 1 790 | file.seek(pos, os.SEEK_SET) 791 | if pos > 0: 792 | file.seek(pos, os.SEEK_SET) 793 | file.truncate() 794 | deleted_line = True 795 | break 796 | if deleted_line: 797 | file.write("\n") 798 | file.close() 799 | 800 | 801 | if __name__ == "__main__": 802 | my_github = GitHub_LLNL_Stats() 803 | my_github.get_stats() 804 | -------------------------------------------------------------------------------- /scripts/my_repo.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | 4 | class My_Repo: 5 | def __init__(self): 6 | self.date = datetime.date.today() 7 | self.name = "N/A" 8 | self.organization = "N/A" 9 | self.contributors = 0 10 | self.forks = 0 11 | self.stargazers = 0 12 | self.pull_requests = 0 13 | self.pull_requests_open = 0 14 | self.pull_requests_closed = 0 15 | self.issues = 0 16 | self.open_issues = 0 17 | self.closed_issues = 0 18 | self.languages = [] 19 | self.readme = "MISS" 20 | self.license = "MISS" 21 | self.commits = 0 22 | -------------------------------------------------------------------------------- /scripts/org_to_emails.py: -------------------------------------------------------------------------------- 1 | #! 
16 |         emails = {
17 |             c["author"]["email"]
18 |             for e in user.events()
19 |             if e.type == "PushEvent"
20 |             for c in e.payload["commits"]
21 |         }
22 |         emails = {e for e in emails if "@llnl.gov" in e}
23 |         if emails:
24 |             print(f"{user.login}: {','.join(emails)}")
25 |
26 |
27 | if __name__ == "__main__":
28 |     print_org_members_without_2fa()
29 |
--------------------------------------------------------------------------------
/scripts/stars.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 |
 3 | import logging
 4 | import os
 5 | import re
 6 |
 7 | import requests
 8 |
 9 | logging.basicConfig(level=logging.DEBUG)
10 |
11 | github = requests.Session()
12 |
13 | NEXT_LINK_REGEX = re.compile(r'<(\S+)>(?=; rel="next")')
14 |
15 |
16 | def get_stargazers(url, session=None):
17 |     """
18 |     Return a list of the stargazers of a GitHub repo.
19 |
20 |     Includes both the 'starred_at' and 'user' data.
21 |
22 |     :param url: the repo's 'stargazers_url', of the form:
23 |         https://api.github.com/repos/LLNL/spack/stargazers
24 |     """
25 |     headers = {"Accept": "application/vnd.github.v3.star+json"}
26 |     url = url + "?per_page=100&page=%s"
27 |     page = 1
28 |     gazers = []
29 |
30 |     # Fetch 100 stargazers per page until an empty page comes back.
31 |     json_data = github.get(url % page, headers=headers).json()
32 |     while json_data:
33 |         gazers.extend(json_data)
34 |         page += 1
35 |         json_data = github.get(url % page, headers=headers).json()
36 |
37 |     return gazers
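38 |
39 |
40 | # NEXT_LINK_REGEX above could instead drive pagination from the `Link`
41 | # response header that GitHub returns. An illustrative sketch; the name
42 | # get_stargazers_by_link is hypothetical and not used elsewhere:
43 | #
44 | # def get_stargazers_by_link(url):
45 | #     headers = {"Accept": "application/vnd.github.v3.star+json"}
46 | #     next_url = url + "?per_page=100"
47 | #     gazers = []
48 | #     while next_url:
49 | #         response = github.get(next_url, headers=headers)
50 | #         gazers.extend(response.json())
51 | #         match = NEXT_LINK_REGEX.search(response.headers.get("Link", ""))
52 | #         next_url = match.group(1) if match else None
53 | #     return gazers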
54 |
55 |
56 | if __name__ == "__main__":
57 |     if "GITHUB_API_TOKEN" in os.environ:
58 |         auth = "token {}".format(os.environ["GITHUB_API_TOKEN"])
59 |         github.headers["Authorization"] = auth
60 |         # Log that auth is enabled without leaking the token itself.
61 |         logging.info("Using token auth from GITHUB_API_TOKEN")
62 |
63 |     orgs = ["llnl"]
64 |     urls = ("https://api.github.com/orgs/%s/repos?per_page=100" % org for org in orgs)
65 |
66 |     repos = []
67 |     for url in urls:
68 |         repos.extend(github.get(url).json())
69 |
70 |     stargazers = {repo["name"]: [] for repo in repos}
71 |
72 |     for repo in repos:
73 |         stargazers[repo["name"]] = get_stargazers(repo["stargazers_url"])
74 |
75 |         print(repo["name"], len(stargazers[repo["name"]]))
76 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E231, E501, W503
3 |
4 | [isort]
5 | combine_star = true
6 | force_sort_within_sections = true
7 | profile = black
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 |
 4 | from setuptools import find_packages, setup
 5 |
 6 | with open("README.md") as fh:
 7 |     long_description = fh.read()
 8 |
 9 | with open("requirements/production.txt") as fp:
10 |     # Keep only non-empty lines; comment lines are filtered out below.
11 |     lines = [line.strip() for line in fp if line.strip()]
12 |     install_reqs = [line for line in lines if not line.startswith("#")]
13 |
14 | setup(
15 |     name="llnl-scraper",
16 |     version="0.15.0",
17 |     description="Package for extracting software repository metadata",
18 |     long_description=long_description,
19 |     long_description_content_type="text/markdown",
20 |     author="Ian Lee",
21 |     author_email="lee1001@llnl.gov",
22 |     url="https://github.com/llnl/scraper",
23 |     packages=find_packages(),
24 |     install_requires=install_reqs,
25 |     python_requires=">=3.6",
26 |     entry_points={
27 |         "console_scripts": [
28 |             "scraper = scraper.gen_code_gov_json:main",
29 |         ]
30 |     },
31 |     scripts=[
32 |         "scripts/codegov_compute_hours.py",
33 |     ],
34 |     classifiers=[
35 |         "Development Status :: 4 - Beta",
36 |         "Intended Audience :: Developers",
37 |         "Operating System :: OS Independent",
38 |         "Programming Language :: Python",
39 |         "Programming Language :: Python :: 3",
40 |         "Programming Language :: Python :: 3 :: Only",
41 |         "Programming Language :: Python :: 3.6",
42 |         "Programming Language :: Python :: 3.7",
43 |         "Programming Language :: Python :: 3.8",
44 |         "Programming Language :: Python :: 3.9",
45 |         "Programming Language :: Python :: 3.10",
46 |         "Programming Language :: Python :: Implementation :: CPython",
47 |         "Programming Language :: Python :: Implementation :: PyPy",
48 |     ],
49 | )
50 |
--------------------------------------------------------------------------------