├── .gitignore
├── .pre-commit-config.yaml
├── README.md
├── requirements.txt
└── scripts
    ├── bq-to-raw-url.sh
    ├── filter_popularity.py
    ├── github_downloader.py
    └── json-to-raw-url.sh

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  # NOTE: mutable revs ("stable", "master") are deprecated by pre-commit;
  # pinning concrete tags is recommended
  - repo: https://github.com/ambv/black
    rev: stable
    hooks:
      - id: black

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: master
    hooks:
      - id: mypy
        args: [--no-strict-optional, --ignore-missing-imports]

  - repo: https://github.com/asottile/blacken-docs
    rev: master
    hooks:
      - id: blacken-docs
        additional_dependencies: [black]
        args: ["--skip-errors"]

  - repo: https://github.com/timothycrosley/isort
    rev: master
    hooks:
      - id: isort
        additional_dependencies: [toml]

  - repo: git@github.com:humitos/mirrors-autoflake.git
    rev: master
    hooks:
      - id: autoflake
        args: ["--in-place", "--remove-all-unused-imports", "--remove-unused-variables"]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Scripts for scraping Sourcegraph search results. `scripts/json-to-raw-url.sh` extracts raw GitHub file URLs from the JSON output of [src-cli](https://github.com/sourcegraph/src-cli/), and `scripts/github_downloader.py` downloads those files from GitHub. `scripts/bq-to-raw-url.sh` builds the same raw URLs from JSON rows with `repo_name` and `path` fields (as in BigQuery's public GitHub dataset), and `scripts/filter_popularity.py` filters URLs down to owners who have at least one sufficiently starred repository.

## Example Usage

```sh
$ src search -stream -json '${{github.event.comment.body}} file:.github/workflows COUNT:100000' | ./scripts/json-to-raw-url.sh | python3 scripts/github_downloader.py
```
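`scripts/filter_popularity.py` slots into the same pipeline: it reads raw URLs (or bare usernames) on stdin and passes through only those whose owner has at least one repository meeting the star threshold (`-s`, default 100). A sketch, assuming a GitHub API token in a placeholder `GITHUB_TOKEN` environment variable to raise the API rate limit:

```sh
$ src search -stream -json '${{github.event.comment.body}} file:.github/workflows COUNT:100000' |
    ./scripts/json-to-raw-url.sh |
    python3 scripts/filter_popularity.py -s 100 -token "$GITHUB_TOKEN" |
    python3 scripts/github_downloader.py
```

Likewise, `scripts/bq-to-raw-url.sh` converts newline-delimited JSON objects carrying `repo_name` and `path` fields (the column names in BigQuery's public `github_repos` dataset) into raw URLs. The following is an illustrative sketch rather than a tested query; the `bq` invocation and the `jq -c '.[]'` array-to-lines step are assumptions to adapt:

```sh
$ bq query --nouse_legacy_sql --format=json \
    'SELECT repo_name, path FROM `bigquery-public-data.github_repos.files` WHERE path LIKE ".github/workflows/%"' |
    jq -c '.[]' |
    ./scripts/bq-to-raw-url.sh |
    python3 scripts/github_downloader.py
```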
## Why is this so useful?

This allows security researchers to run static analysis tools on a mass of GitHub repos fetched from Sourcegraph. Here's an example of running [Semgrep](https://semgrep.dev) over the downloaded files:

```sh
$ semgrep --config "p/github-actions" out
```

The output will include full repository file paths, allowing us to easily identify the vulnerable repositories.
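Since `github_downloader.py` mirrors each URL under `out/USERNAME/REPO/HEAD/PATH`, the affected `owner/repo` pairs can be extracted straight from the finding paths. A minimal sketch using Semgrep's JSON output:

```sh
$ semgrep --config "p/github-actions" --json out | jq -r '.results[].path' | cut -d '/' -f 2-3 | sort -u
```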
## How to install

```sh
$ git clone https://github.com/KarimPwnz/sourcegraph-scripts.git
$ cd sourcegraph-scripts
$ pip install -r requirements.txt
```

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.8.4
aiohttp_retry==2.8.3
aiopath==0.5.12

--------------------------------------------------------------------------------
/scripts/bq-to-raw-url.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Build raw GitHub URLs from JSON rows with repo_name and path fields
# (e.g., BigQuery's public GitHub dataset)
jq -r '"https://raw.githubusercontent.com/\(.repo_name)/HEAD/\(.path)"'

--------------------------------------------------------------------------------
/scripts/filter_popularity.py:
--------------------------------------------------------------------------------
import argparse
import asyncio
import logging
import sys

import aiohttp


class PopularityChecker:
    def __init__(self, client, min_stars):
        self._client = client
        self.min_stars = min_stars
        self._usernames = {}  # username -> task resolving to a popularity verdict

    @staticmethod
    def _gh_url_to_username(url):
        # https://raw.githubusercontent.com/USERNAME/...
        return url.split("/")[3]

    async def get_repos(self, username):
        # Note: only the first page of the user's repos (30 by default) is fetched
        async with self._client.get(
            f"https://api.github.com/users/{username}/repos?type=owner"
        ) as resp:
            repos = await resp.json()
        # On error, the GitHub API returns an object with a "message" key
        # instead of a list of repos
        if isinstance(repos, dict):
            logging.error(
                "Got error while getting repos of %s: %s", username, repos.get("message")
            )
            return []
        return repos

    async def check_popularity(self, username):
        logging.info("Getting repos of %s", username)
        try:
            repos = await self.get_repos(username)
        except Exception as e:
            logging.exception("Exception occurred: %s", str(e))
            return False
        return any(
            repo.get("stargazers_count", -1) >= self.min_stars for repo in repos
        )

    async def is_popular(self, query):
        if query.startswith("https://raw.githubusercontent.com/"):
            username = self._gh_url_to_username(query)
        else:
            username = query
        if username not in self._usernames:
            # Cache the task itself so concurrent queries for the same username
            # share a single API lookup; awaiting a finished task just returns
            # its result again
            self._usernames[username] = asyncio.ensure_future(
                self.check_popularity(username)
            )
        return await self._usernames[username]

    async def print_if_popular(self, query):
        logging.info("Checking %s", query)
        if await self.is_popular(query):
            print(query, flush=True)


async def main():
    logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        dest="stars",
        help="Minimum number of stars (default: 100)",
        default=100,
        type=int,
    )
    parser.add_argument(
        "-l",
        dest="limit",
        help="Concurrent requests limit (default: 20)",
        default=20,
        type=int,
    )
    parser.add_argument(
        "-token", dest="token", help="GitHub token", required=False, type=str
    )
    args = parser.parse_args()
    headers = {}
    if args.token:
        headers["Authorization"] = f"Token {args.token}"
    conn = aiohttp.TCPConnector(limit=args.limit)
    # Pass the headers to the session so the token is actually sent
    async with aiohttp.ClientSession(connector=conn, headers=headers) as client:
        checker = PopularityChecker(client=client, min_stars=args.stars)
        queries = (line.rstrip("\n") for line in sys.stdin)
        tasks = [checker.print_if_popular(query) for query in queries]
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/scripts/github_downloader.py:
--------------------------------------------------------------------------------
import argparse
import asyncio
import logging
import sys

import aiohttp
from aiopath import AsyncPath


class GitHubDownloader:
    def __init__(self, client, output_path, proxy):
        self._client = client
        self.output_path = AsyncPath(output_path)
        self.proxy = proxy

    @staticmethod
    def _url_to_path(url):
        return url.replace("https://raw.githubusercontent.com/", "")

    async def download(self, url):
        async with self._client.get(url, proxy=self.proxy) as resp:
            resp.raise_for_status()
            return await resp.text()

    async def save(self, url, content):
        # Mirror the URL's USERNAME/REPO/REF/PATH layout under the output directory
        path = self.output_path / self._url_to_path(url)
        await path.parent.mkdir(parents=True, exist_ok=True)
        await path.write_text(content)

    async def download_and_save(self, url, retry_sleep=30):
        logging.info("Downloading %s", url)
        try:
            content = await self.download(url)
        except aiohttp.ClientResponseError as e:
            if e.status == 429:
                # Rate limited; retry with exponential backoff
                logging.warning(
                    "Retrying in %s seconds (got 429-ed): %s", retry_sleep, url
                )
                await asyncio.sleep(retry_sleep)
                return await self.download_and_save(url, retry_sleep * 2)
            logging.error("Status code %s != 200 received: %s", e.status, url)
            return
        except Exception as e:
            logging.exception("Exception occurred while downloading: %s", str(e))
            return
        logging.info("Saving %s", url)
        try:
            await self.save(url, content)
        except Exception as e:
            logging.exception("Exception occurred while saving: %s", str(e))


async def main():
    logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--output", help="Output directory path (default: 'out')", default="out"
    )
    parser.add_argument(
        "-l",
        "--limit",
        dest="limit",
        help="Concurrent requests limit (default: 100)",
        default=100,
        type=int,
    )
    parser.add_argument(
        "-p",
        "--proxy",
        dest="proxy",
        help="Proxy to use with each request; proxy domain resolved per request",
        default=None,
    )
    args = parser.parse_args()
    conn = aiohttp.TCPConnector(limit=args.limit)
    async with aiohttp.ClientSession(connector=conn) as client:
        downloader = GitHubDownloader(
            client=client, output_path=args.output, proxy=args.proxy
        )
        tasks = []
        urls = (line.rstrip("\n") for line in sys.stdin)
        for url in urls:
            tasks.append(downloader.download_and_save(url))
            # Gather in batches so the pending task list doesn't grow unboundedly
            if len(tasks) == args.limit:
                await asyncio.gather(*tasks)
                tasks = []
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/scripts/json-to-raw-url.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Convert src-cli JSON results into raw GitHub URLs:
# strip the leading "github.com/" and prepend the raw host
jq -r '. | select(.repository != null) | "\(.repository)/HEAD/\(.path)"' |
    cut -d '/' -f 2- |
    awk '{print "https://raw.githubusercontent.com/"$1}'
--------------------------------------------------------------------------------