├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── README.rst ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── amo_test.py ├── conftest.py ├── metadata_test.py └── webext_test.py └── webextaware ├── __init__.py ├── amo.py ├── database.py ├── main.py ├── metadata.py ├── modes ├── __init__.py ├── get.py ├── grep.py ├── info.py ├── libs.py ├── manifest.py ├── meta.py ├── query.py ├── runmode.py ├── scan.py ├── shell.py ├── stats.py ├── sync.py └── unzip.py ├── package.json ├── scanner.py └── webext.py /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | include 3 | lib 4 | man 5 | dist 6 | pip-selfcheck.json 7 | .Python 8 | venv 9 | .idea/ 10 | __pycache__/ 11 | webextaware.egg-info/ 12 | .eggs/ 13 | node_modules 14 | webext_hashfs 15 | Pipfile 16 | Pipfile.lock 17 | *.pyc 18 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. 
"Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. 
"You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. 
You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. 
Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | 375 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-exclude *.py[co] 2 | global-exclude __pycache__ 3 | include LICENSE.txt 4 | include README.rst 5 | include webextaware/package.json 6 | exclude README.md 7 | exclude tests/* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Web Extension Aware 2 | 3 | [![PyPI Package version](https://badge.fury.io/py/webextaware.svg)](https://pypi.python.org/pypi/webextaware) 4 | 5 | 6 | ## Requirements 7 | WebExtAware depends on other command line tools and Python modules, some of which require 8 | libraries for building. You need to ensure that the following dependencies are met: 9 | * grep 10 | 11 | ## Installation for users 12 | WebExtAware requires **Python 3** to run. If that's what you have, you're good to go: 13 | ``` 14 | $ pip3 install [--user] webextaware 15 | $ webextaware --help 16 | ``` 17 | 18 | Whether or not the `--user` flag is required depends on your Python installation. It is usually 19 | not required on Macs with Homebrew while most Linux distributions can't do without it. 20 | 21 | To use the `scan` sub command, you need to have a recent version of `node` and `npm` installed. 22 | Check that they are installed and available: 23 | ``` 24 | $ node --version 25 | v7.10.0 26 | $ npm --version 27 | 4.2.0 28 | ``` 29 | 30 | ## Installation for developers 31 | 32 | Locally clone the repo, then cd there. Create a virtualenv and install with 33 | 34 | ``` 35 | virtualenv --always-copy --python=python3 venv 36 | . venv/bin/activate 37 | pip install -e .[dev] 38 | ``` 39 | 40 | As long as the virtualenv is active, the ```webextaware``` command is available. 
41 | 42 | ## Metadata update 43 | 44 | Sync all the AMO data with 45 | 46 | ``` 47 | webextaware sync 48 | ``` 49 | 50 | You may run into AMO's occasional 504s. In any case, re-run the following 51 | command until all you get are persistent 404 errors: 52 | 53 | ``` 54 | webextaware sync -n 55 | ``` 56 | 57 | ## Usage examples 58 | 59 | Most commands accept selectors for selecting packages. Valid selectors are: 60 | 61 | * any AMO ID like *737717* 62 | * any extension file ID (sha256 hashes) like *2c8fc1861903551dac72bdbe9ec389bff8c417ba7217f6c738ac8d968939fc30* 63 | * the keyword *all* for selecting the whole metadata set 64 | * the keyword *orphans* for selecting extensions not referenced by the metadata set 65 | * a regular expression that is matched against extension names 66 | 67 | ### info, query 68 | 69 | Get some info on the cache state with 70 | 71 | ``` 72 | webextaware info 73 | ``` 74 | 75 | Query the metadata for known hashes or IDs: 76 | 77 | ``` 78 | webextaware query 835644 79 | ``` 80 | 81 | ### stats 82 | 83 | Write CSV with stats for all the extensions to terminal with 84 | 85 | ``` 86 | webextaware stats 87 | ``` 88 | 89 | ### manifest, get, unzip 90 | 91 | Show the manifests and paths of cache files associated with a specific AMO ID and 92 | unzip them to the /tmp folder with 93 | 94 | ``` 95 | webextaware manifest 728674 96 | webextaware get 728674 97 | webextaware unzip 728674 -o /tmp 98 | ``` 99 | 100 | The last command prints a list of extracted folders. 101 | 102 | Pass `-r` to the `manifest` subcommand to dump raw manifests. Pass `-t` to the manifest command 103 | to get manifests in a grep-friendly line-based format. 104 | 105 | ``` 106 | webextaware manifest all -t | grep /optional_permissions: 107 | ``` 108 | 109 | You can unzip all the extensions to a specific folder with 110 | 111 | ``` 112 | webextaware unzip all -o /tmp/exts 113 | ``` 114 | 115 | It will print a list of folders where the extensions were extracted. 
116 | 117 | ### grep 118 | 119 | Grep for a regular expression in all or specific extensions with 120 | 121 | ``` 122 | webextaware grep optional_permissions 123 | webextaware grep optional_permissions 739662 -A 10 124 | ``` 125 | 126 | The `grep` subcommand is equivalent to `grep -E -r `. 127 | Any arguments following the regular expression and package selectors will be 128 | passed transparently to grep. If you need more fancy grepping capabilities or a 129 | huge performance boost, consider running `webextaware unzip all` first. 130 | 131 | ### scan 132 | 133 | Scan a web extension with retire.js and scanjs with 134 | 135 | ``` 136 | webextaware scan 739662 137 | ``` 138 | 139 | The result is formatted in JSON. 140 | 141 | ### libs 142 | 143 | Get some framework / library statistics with 144 | 145 | ``` 146 | webextaware libs amazon -e 147 | webextaware libs -H 148 | ``` 149 | 150 | The former syntax provides the data in JSON format and groups libraries per extension. 151 | The latter syntax provides complete library statistics in a human-readable format. 152 | There is also a `--traverse` mode for `--perext` (in short `-et`) to provide a grep-friendly format. 153 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | WebExtAware 2 | =========== 3 | 4 | `WebExtAware `_ is a bulk analysis tool for maintaining a local 5 | mirror of all `web extensions `_ published on 6 | `addons.mozilla.org `_ and performing various scans and other security-related 7 | operations on them. 8 | 9 | -------- 10 | 11 | The WebExtAware Python package has non-Python dependencies that need to be satisfied before the script can run. 12 | For installation instructions, usage information, and issue tracking, please visit the `Project Page 13 | `_ on GitHub. 
14 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | 4 | [aliases] 5 | test = pytest 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | from setuptools import setup, find_packages 6 | 7 | PACKAGE_NAME = "webextaware" 8 | PACKAGE_VERSION = "1.2.5" 9 | 10 | INSTALL_REQUIRES = [ 11 | "coloredlogs", 12 | "gevent", 13 | "grequests", 14 | "hashfs", 15 | "ipython", 16 | "json-cfg", 17 | "requests", 18 | "pynpm", 19 | "python-magic", 20 | "urllib3" 21 | ] 22 | 23 | TESTS_REQUIRE = [ 24 | "coverage", 25 | "mock", 26 | "pytest", 27 | "pytest-runner" 28 | ] 29 | 30 | DEV_REQUIRES = [ 31 | "coverage", 32 | "mock", 33 | "nose", 34 | "pycodestyle", 35 | "pytest", 36 | "pytest-runner", 37 | "radon" 38 | ] 39 | 40 | setup( 41 | name=PACKAGE_NAME, 42 | version=PACKAGE_VERSION, 43 | description="Mozilla WebExtensions Security Analyzer", 44 | classifiers=[ 45 | "Development Status :: 4 - Beta", 46 | "Environment :: Console", 47 | "Intended Audience :: Developers", 48 | "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", 49 | "Natural Language :: English", 50 | "Operating System :: MacOS :: MacOS X", 51 | "Operating System :: POSIX :: Linux", 52 | "Programming Language :: Python :: 3", 53 | "Programming Language :: Python :: 3 :: Only", 54 | "Topic :: Security", 55 | "Topic :: Software Development :: Quality Assurance", 56 | "Topic :: Software Development :: Testing", 57 | "Topic :: Utilities" 58 | ], 59 | keywords=["mozilla", "firefox", "browser", 
"addons", "web extensions", "testing", "security"], 60 | author="Christiane Ruetten", 61 | author_email="cr@mozilla.com", 62 | url="https://github.com/cr/webextaware", 63 | download_url="https://github.com/cr/webextaware/archive/latest.tar.gz", 64 | license="MPL2", 65 | packages=find_packages(exclude=["tests"]), 66 | include_package_data=True, # See MANIFEST.in 67 | zip_safe=False, 68 | use_2to3=False, 69 | install_requires=INSTALL_REQUIRES, 70 | tests_require=TESTS_REQUIRE, 71 | extras_require={"dev": DEV_REQUIRES}, # For `pip install -e .[dev]` 72 | entry_points={ 73 | "console_scripts": [ 74 | "webextaware = webextaware.main:main" 75 | ] 76 | } 77 | ) 78 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | 6 | class ArgsMock(object): 7 | """ 8 | Mock used for testing functionality that 9 | requires access to an args-style object. 10 | """ 11 | def __init__(self, **kwargs): 12 | self.kwargs = kwargs 13 | 14 | def __getattr__(self, attr): 15 | try: 16 | return self.kwargs[attr] 17 | except KeyError: 18 | return None 19 | -------------------------------------------------------------------------------- /tests/amo_test.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | 6 | def test_amo_metadata_downloader(raw_meta): 7 | """AMO metadata downloader""" 8 | # Operates on data downloaded once per session by the raw_meta fixture in tests/conftest.py 9 | assert type(raw_meta) is list and type(raw_meta[0]) is dict, "delivers expected format" 10 | assert len(raw_meta) == 100, "delivers expected number of extensions" 11 | assert "id" in raw_meta[0] and "current_version" in raw_meta[0] and "file" in raw_meta[0]["current_version"], \ 12 | "metadata entries have expected format" 13 | assert raw_meta[0]["current_version"]["file"]["hash"].startswith("sha256:"), "hashes are SHA256" 14 | 15 | 16 | def test_amo_extension_downloader(raw_meta, ext_db): 17 | """AMO extension downloader""" 18 | # Operates on data downloaded once per session by the raw_meta and ext_db fixtures in tests/conftest.py 19 | 20 | # Collect the file hashes of all extensions listed in the metadata 21 | all_extensions = set() 22 | for ext in raw_meta: 23 | f = ext["current_version"]["file"] 24 | h = f["hash"].split(":")[1] 25 | all_extensions.add(h) 26 | 27 | # See which extensions were actually downloaded 28 | downloaded_extensions = set() 29 | for f in ext_db: 30 | addr = ext_db.get(f) 31 | downloaded_extensions.add(addr.id) 32 | 33 | assert len(downloaded_extensions) > 10, "extensions are downloaded" 34 | assert all_extensions >= downloaded_extensions, "only extensions from metadata are downloaded" 35 | assert len(downloaded_extensions) > 0.5 * len(all_extensions), "at least 50% of extensions are downloaded" 36 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | import hashfs 6 | import pytest 7 | 8 | from webextaware import amo 9 | from webextaware import metadata as md 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def raw_meta(): 14 | """Raw AMO metadata session fixture""" 15 | return amo.download_metadata(max_pages=2) 16 | 17 | 18 | @pytest.fixture(scope="session") 19 | def hfs_tmp(tmpdir_factory): 20 | """Session-wide temporary directory""" 21 | return tmpdir_factory.mktemp("hashfs_extension") 22 | 23 | 24 | @pytest.fixture(scope="session") 25 | def ext_db(raw_meta, hfs_tmp): 26 | edb = hashfs.HashFS(hfs_tmp, depth=4, width=1, algorithm='sha256') 27 | meta = md.Metadata(data=raw_meta) 28 | amo.update_files(meta, edb) 29 | return edb 30 | -------------------------------------------------------------------------------- /tests/metadata_test.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
# Plain `assert` statements are used instead of nose.tools helpers: the tests
# are run by pytest (see the fixtures they request) and the sibling
# webext_test.py already asserts this way.
import bz2
import copy
import json
import os

from webextaware import metadata as md


def test_metadata_object(tmpdir, raw_meta):
    """Metadata cache object instantiation"""
    md_file = tmpdir.join("md.bz2")
    meta = md.Metadata(data=raw_meta, filename=md_file)
    assert type(meta) is md.Metadata, "has correct type"
    assert len(meta) > 0 and len(meta) > 0.5 * len(raw_meta), "contains metadata on extensions"
    assert not os.path.isfile(md_file), "metadata cache file is only written on demand"
    meta.save()
    assert os.path.isfile(md_file), "metadata cache file is written on demand"

    meta_again = md.Metadata(filename=md_file)
    assert len(meta) == len(meta_again), "restores data from cache file"


def test_metadata_extensions(raw_meta):
    """Metadata extension objects"""
    meta = md.Metadata(data=raw_meta)
    for e in meta:
        assert type(e) is md.Extension, "iterating yields Extension objects"
        assert type(e.id) is int and 0 < e.id < 100000000, "iterating yields Extensions with AMO IDs"
        assert type(e.name) is str and len(e.name) > 0, "extensions have names"
        host_perm, api_perm = e.permissions
        assert type(host_perm) is set or host_perm is None, "host permissions come as sets or None"
        assert type(api_perm) is set or api_perm is None, "api permissions come as sets or None"
        assert e.is_webextension(), "there are only web extensions in cache"

        for f in e.files():
            assert "hash" in f and "url" in f, "extension files have hashes and URLs"
            assert f["hash"].startswith("sha256:"), "hashes are sha256"

        for h in e.file_hashes():
            assert type(h) is str and h.isalnum() and len(h) == 64, "file IDs look like SHA256 hashes"


def test_metadata_id_handling(raw_meta):
    """Metadata ID handling"""
    meta = md.Metadata(data=raw_meta)
    assert len(meta) > 0, "there is metadata to work with"

    for e in meta:
        amo_id = e.id
        hashes = meta.id_to_hashes(amo_id)
        # The checks after the inner loop (and after the outer loop) use the
        # variables bound inside it, so make the precondition explicit.
        assert len(hashes) > 0, "web extensions have at least one file hash"

        assert meta.is_known_id(amo_id), "AMO IDs are recognized"
        ext_amo = meta.get_by_id(amo_id)
        ext_amo_get = meta.get(amo_id)
        assert type(ext_amo) is md.Extension and type(ext_amo_get) is md.Extension, "can get by AMO IDs"
        assert ext_amo.id == ext_amo_get.id == e.id, "gets yield identical results for identical AMO IDs"

        for hash_id in hashes:
            assert meta.is_known_hash(hash_id), "hash IDs are recognized"
            ext_hash = meta.get_by_hash(hash_id)
            ext_hash_get = meta.get(hash_id)
            assert type(ext_hash) is md.Extension and type(ext_hash_get) is md.Extension, "can get by hash IDs"
            assert ext_hash.id == ext_hash_get.id == amo_id, "gets yield same result for same hash ID"
            assert meta.hash_to_id(hash_id) == amo_id, "hash IDs resolve to extension"
            assert hash_id in meta.id_to_hashes(amo_id), "hash IDs are associated with AMO ID"

        assert type(ext_amo) is md.Extension and type(ext_hash) is md.Extension, "can get by AMO and hash ID"
        assert type(ext_amo_get) is md.Extension and type(ext_hash_get) is md.Extension, "can get by any ID"
        assert ext_amo.id == ext_hash.id == ext_amo_get.id == ext_hash_get.id, "get method yield same result"

    # Test failure cases
    # amo_id and hash_id still hold valid IDs
    assert not meta.is_known_id(hash_id), "invalid AMO IDs are rejected"
    assert not meta.is_known_hash(amo_id), "invalid hash IDs are rejected"
    assert meta.get_by_id(hash_id) is None, "invalid AMO IDs yield None"
    assert meta.get_by_hash(amo_id) is None, "invalid hash IDs yield None"
    assert meta.get("invalid foo") is None, "invalid get yields None"


def test_metadata_migration_from_amo_v3_to_v5(tmpdir, raw_meta):
    """Metadata in the old AMO API v3 layout is still understood"""
    raw_meta_old_v3 = copy.deepcopy(raw_meta)
    for raw_ext in raw_meta_old_v3:
        # Move "file" (from AMO API v5) to first item of "files" array (=AMO API v3).
        raw_ext["current_version"]["files"] = [raw_ext["current_version"].pop("file")]
        raw_ext["current_version"]["files"][0]["is_webextension"] = True

    # AMO API v3 also had a is_webextension property that could be False.
    raw_meta_old_v3[0]["current_version"]["files"][0]["is_webextension"] = False

    number_of_webextensions_in_meta = len(raw_meta_old_v3) - 1

    md_file = tmpdir.join("md_with_amo_api_v3_data.bz2")
    with bz2.open(md_file, "w") as f:
        f.write(json.dumps(raw_meta_old_v3).encode("utf-8"))

    # Test importing from disk (this can happen if someone has a cache from early 2021).
    meta = md.Metadata(filename=md_file)
    assert len(meta) == number_of_webextensions_in_meta, "found all WebExtensions in metadata file"
    for e in meta:
        assert e.is_webextension(), "there are only web extensions in cache"
        assert len(list(e.file_hashes())) == 1, "got file hash"

    # For good measure, also test the load-as-data variant:
    meta = md.Metadata(data=raw_meta_old_v3)
    assert len(meta) == number_of_webextensions_in_meta, "found all WebExtensions in metadata"
    for e in meta:
        assert e.is_webextension(), "there are only web extensions in cache"
        assert len(list(e.file_hashes())) == 1, "got file hash"


# --------------------------------------------------------------------------------
# /tests/webext_test.py:
# --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import json

from webextaware import webext as we


def test_webext_object(ext_db):
    """WebExtension object instantiation"""

    assert len(ext_db) > 0, "there are web extension files to work with"

    for archive_path in ext_db:
        with we.WebExtension(archive_path) as ext:
            assert type(ext) is we.WebExtension, "can be instantiated"
            assert "manifest.json" in ext.ls(), "each has manifest.json"
            parsed = ext.manifest()
            assert type(parsed) is we.Manifest, "can yield Manifest objects"
            listing = ext.ls()
            assert "manifest.json" in listing, "manifest.json is among zipped_files"


def test_webext_manifest(ext_db):
    """WebExtension Manifest objects"""

    assert len(ext_db) > 0, "there are web extension files to work with"
    for archive_path in ext_db:
        with we.WebExtension(archive_path) as ext:
            parsed = ext.manifest()
            assert "manifest_version" in parsed and "version" in parsed and "name" in parsed, \
                "objects have mandatory fields"
            as_text = str(parsed)
            assert len(as_text) > 0, "string representation is not empty"

            reparsed = json.loads(as_text)
            assert "manifest_version" in reparsed and "version" in reparsed and "name" in reparsed, \
                "strings representations are valid JSON"


# --------------------------------------------------------------------------------
# /webextaware/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/cr/webextaware/932555290efbec2a817c0bd6e6c4a2cbee149f18/webextaware/__init__.py
# --------------------------------------------------------------------------------
# /webextaware/amo.py:
# --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0.
# If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import grequests
from io import BytesIO
import logging
import math
import requests


logger = logging.getLogger(__name__)
amo_server = "https://addons.mozilla.org"
MAX_CONCURRENT_REQUESTS = 10


def download_metadata(max_pages=(2 << 31), max_ext=(2 << 31), page_size=50, min_users=0, max_users=0):
    """
    Retrieves the metadata for all public extensions.
    If specified, limit to extensions with at least |min_users| users.
    If specified, limit to extensions with less than |max_users| users.

    Returns an array of addon results from the AMO API as described at
    https://addons-server.readthedocs.io/en/latest/topics/api/addons.html#addon-detail-object
    truncated to at most |max_ext| entries, or None if some result pages
    could not be fetched.
    """
    # Maximum page_size seems to be 50 right now, 25 is AMO's current default.
    url = amo_server + "/api/v5/addons/search/"
    search_params = "sort=created" \
                    "&type=extension" \
                    "&app=firefox" \
                    "&page_size=%d" % page_size
    if min_users:
        search_params += "&users__gte=%d" % min_users
    if max_users:
        search_params += "&users__lt=%d" % max_users
    logger.debug("Search parameters for AMO query: %s" % search_params)

    extra_desc = ""
    if min_users and max_users:
        extra_desc += ", with at least %d and less than %d users" % (min_users, max_users)
    elif min_users:
        extra_desc += ", with at least %d users" % min_users
    elif max_users:
        extra_desc += ", with less than %d users" % max_users

    # Grab page_size and count from first result page and calculate num_pages from that
    first_page = requests.get("%s?%s" % (url, search_params), verify=True).json()
    logger.info("There are currently %d web extensions listed%s" % (first_page["count"], extra_desc))
    supported_page_size = int(first_page["page_size"])
    if page_size != supported_page_size:
        logger.warning("Requested size %d is greater than supported size %d" % (page_size, supported_page_size))
    num_pages = min(max_pages, int(math.ceil(first_page["count"] / supported_page_size)))
    max_pages_in_api = first_page["page_count"]
    if num_pages > max_pages_in_api:
        actual_result_count = max_pages_in_api * supported_page_size
        # Only an unfiltered, unlimited query is split; the split queries pass
        # min_users/max_users and therefore never recurse into the work-around.
        if not min_users and not max_users and num_pages <= max_pages and first_page["count"] < max_ext:
            logger.info("Splitting query to avoid truncation to %d results" % actual_result_count)
            return download_metadata_workaround_limit(max_pages, max_ext, supported_page_size, first_page["count"])
        logger.warning("Truncating results to %d pages (%d results) due to API limitation"
                       % (max_pages_in_api, actual_result_count))
        num_pages = max_pages_in_api
    logger.info("Fetching %d pages of AMO metadata" % num_pages)
    pages_to_get = ["%s?%s&page=%d" % (url, search_params, n) for n in range(2, num_pages + 1)]

    # NOTE: The logic below assumes the result set to be stable during the query.
    # If an item is deleted during the query, another item may be missing or
    # appear multiple times due to shifted items during pagination.

    session = create_request_session()
    metadata = first_page["results"]
    while True:
        # Every round re-requests all pages still pending; the loop ends once
        # only pages answered with a 4xx status remain (or none at all).
        fatal_errors = 0
        unsent_requests = [grequests.get(page_url, verify=True, session=session) for page_url in pages_to_get]
        for response in grequests.imap(unsent_requests, size=MAX_CONCURRENT_REQUESTS):
            if 200 <= response.status_code < 400:
                logger.debug("Downloaded %d bytes from `%s`" % (len(response.content), response.url))
                metadata += response.json()["results"]
                try:
                    original_url = response.history[0].url
                except IndexError:
                    # There was no redirect
                    original_url = response.url
                pages_to_get.remove(original_url)
            else:
                logger.error("Unable to download `%s`, status code %d" % (response.url, response.status_code))
                if 400 <= response.status_code < 500:
                    fatal_errors += 1
            if len(pages_to_get) % 25 == 0:
                logger.info("%d pages to go" % len(pages_to_get))
        if len(pages_to_get) == fatal_errors:
            break

    if len(pages_to_get) > 0:
        logger.error("Unable to fetch %d pages. Please try again later" % len(pages_to_get))
        return None

    if len(metadata) != first_page["count"]:
        logger.warning("Got %d instead of the expected %d results" % (len(metadata), first_page["count"]))

    return metadata[0:min(len(metadata), max_ext)]


def download_metadata_workaround_limit(max_pages, max_ext, page_size, total_count):
    """
    Fetch the metadata in two user-count partitions and merge the results.

    Returns the merged list truncated to at most |max_ext| entries, or None
    if either partial download failed (download_metadata returned None).
    """
    # The AMO API is limited to 30k, but there are more extensions. To work
    # around this limit, we run two queries with logically disjoint results and
    # merge them. In May 2023, the total number of public extensions is 32k,
    # of which 14k have at least 10 users (=user_count_for_split).
    #
    # The work-around here depends on the ability to partition the results in
    # subsets. Ideally the AMO API should not have a cap on the result window:
    # https://github.com/mozilla/addons-server/issues/20640
    user_count_for_split = 10
    logger.info("Part 1 of 2: Looking up extensions with at least %d users" % user_count_for_split)
    metadata_part_1 = download_metadata(max_pages, max_ext, page_size, user_count_for_split, 0)
    if metadata_part_1 is None:
        # download_metadata already logged which pages failed.
        logger.error("Part 1 of 2 could not be downloaded, giving up")
        return None
    logger.info("Part 2 of 2: Looking up extensions with less than %d users" % user_count_for_split)
    metadata_part_2 = download_metadata(max_pages, max_ext, page_size, 0, user_count_for_split)
    if metadata_part_2 is None:
        logger.error("Part 2 of 2 could not be downloaded, giving up")
        return None
    logger.info("Merging %d and %d and expecting %d results"
                % (len(metadata_part_1), len(metadata_part_2), total_count))

    id_seen = set()
    metadata = []
    for metadata_part in [metadata_part_1, metadata_part_2]:
        for ext in metadata_part:
            amo_id = ext["id"]
            if amo_id in id_seen:
                # In theory, the user count could update while the query is
                # running, and an addon can appear in both lists.
                logger.warning("Ignoring duplicate entry for AMO ID %s (addon ID %s)" % (amo_id, ext["guid"]))
                continue
            id_seen.add(amo_id)
            metadata.append(ext)

    if len(metadata) != total_count:
        # Could happen for several reasons, including but not limited to:
        # - An extension was added or removed while querying.
        # - user count updated, extension no longer in metadata_part_2.
        logger.warning("Got %d instead of the expected %d results after combining two result sets"
                       % (len(metadata), total_count))
    return metadata[0:min(len(metadata), max_ext)]


def __as_chunks(flat_list, chunk_size):
    """Yield consecutive slices of flat_list holding at most chunk_size items each."""
    for i in range(0, len(flat_list), chunk_size):
        yield flat_list[i:i + chunk_size]


def update_files(metadata, hash_fs):
    """
    Download every extension file referenced by |metadata| that is not yet
    stored in |hash_fs| and put it there.

    Files are looked up in |hash_fs| by the sha256 hash from the metadata.
    Downloads that fail with a non-4xx status are requested again in the next
    round; 4xx responses and files that cannot be stored are given up on.
    """
    urls_to_get = []
    urls_queued = set()
    for ext in metadata:
        for ext_file in ext.files():
            ext_file_hash_type, ext_file_hash = ext_file["hash"].split(":")
            assert ext_file_hash_type == "sha256"
            if hash_fs.get(ext_file_hash) is None:
                if ext_file["url"] in urls_queued:
                    # Queue each URL once; fetching it twice stores the same content.
                    logger.warning("Duplicate URL in metadata: %s" % ext_file["url"])
                    continue
                urls_queued.add(ext_file["url"])
                urls_to_get.append(ext_file["url"])
            else:
                logger.debug("`%s` is already cached locally" % ext_file_hash)

    logger.info("Fetching %d uncached web extensions from AMO" % len(urls_to_get))

    session = create_request_session()

    while True:
        fatal_errors = 0
        unsent_requests = [grequests.get(file_url, verify=True, session=session) for file_url in urls_to_get]
        for response in grequests.imap(unsent_requests, size=MAX_CONCURRENT_REQUESTS):
            if response.status_code == 200:
                logger.debug("Downloaded %d bytes from `%s`" % (len(response.content), response.url))
                try:
                    hash_fs.put(BytesIO(response.content), ".zip")
                except ValueError as err:
                    # probably the mysterious ValueError: embedded null byte
                    logger.error("Unable to store `%s` in local cache: %s" % (response.url, str(err)))
                    # The URL stays in urls_to_get, so it must count as fatal:
                    # otherwise the termination check below can never succeed
                    # and the same file is downloaded over and over.
                    fatal_errors += 1
                    continue
                try:
                    original_url = response.history[0].url
                except IndexError:
                    # There was no redirect
                    original_url = response.url
                urls_to_get.remove(original_url)
            else:
                logger.error("Unable to download `%s`, status code %d" % (response.url, response.status_code))
                if 400 <= response.status_code < 500:
                    fatal_errors += 1
            if len(urls_to_get) % 100 == 0:
                logger.info("%d extensions to go" % len(urls_to_get))

        if len(urls_to_get) == fatal_errors:
            break

    if len(urls_to_get) > 0:
        logger.warning("Unable to fetch %d extensions, likely deleted add-ons" % len(urls_to_get))


def create_request_session():
    """Return a requests.Session whose HTTPS pool holds MAX_CONCURRENT_REQUESTS connections."""
    # Share connections between requests to avoid overusing file descriptors.
    a = requests.adapters.HTTPAdapter(pool_maxsize=MAX_CONCURRENT_REQUESTS)
    session = requests.Session()
    session.mount('https://', a)
    return session


# --------------------------------------------------------------------------------
# /webextaware/database.py:
# --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import hashfs
import logging
import os
import re

from . import amo
from . import metadata as md
from . import webext as we
# NOTE: the first words of this dump line are the tail of the module's last
# import statement (`from . import webext as we`), which starts on the
# preceding dump line together with the other imports of database.py.


logger = logging.getLogger(__name__)


class Database(object):
    """Class for bundling high-level AMO and web extension functionality"""

    def __init__(self, args, files=None, metadata=None):
        """
        :param args: parsed command line arguments; `workdir` is read when
            `files` or `metadata` is not given
        :param files: HashFS-like file store; created below `args.workdir` if None
        :param metadata: Metadata object; loaded from the metadata file if None
        """
        self.args = args
        self.meta = metadata
        self.file_db = files

        if self.file_db is None:
            db_dir = os.path.join(self.args.workdir, "webext_data")
            os.makedirs(db_dir, exist_ok=True)
            self.file_db = hashfs.HashFS(db_dir, depth=4, width=1, algorithm='sha256')

        if self.meta is None:
            self.meta = md.Metadata(filename=md.get_metadata_file(self.args))

    def sync(self):
        """Refresh the metadata from AMO (unless args.nometa) and fetch missing files."""
        if self.args.nometa:
            logger.warning("Using cached AMO metadata, not updating")
        else:
            logger.info("Downloading current metadata set from AMO")
            self.meta = md.Metadata(filename=md.get_metadata_file(self.args),
                                    data=amo.download_metadata())
            self.meta.save()
        logger.info("Metadata set contains %d web extensions" % len(self.meta))
        logger.info("Downloading missing web extensions")
        amo.update_files(self.meta, self.file_db)

    def match(self, selectors):
        """
        Resolve selectors to extension files.

        A selector is `all`, `orphans`, an AMO ID, a file hash, or a regular
        expression matched (case-insensitively, from the start) against
        extension names. A single selector may be passed without a list.

        :return: dict mapping AMO ID to a set of file hashes; files in the
            local store that are unknown to the metadata are listed under
            the key None
        """
        if not isinstance(selectors, (list, tuple)):
            selectors = [selectors]
        logger.debug("Matching for %s" % repr(selectors))

        selection = {}

        def select_all_files(amo_ext):
            # An extension without file hashes does not get an (empty) entry.
            for ext_id in amo_ext.file_hashes():
                selection.setdefault(amo_ext.id, set()).add(ext_id)

        for selector in selectors:
            if selector == "all":
                for amo_ext in self.meta:
                    select_all_files(amo_ext)
            elif selector == "orphans":
                for file_path in self.file_db:
                    ext_id = self.file_db.get(file_path).id
                    if not self.meta.is_known_hash(ext_id):
                        selection.setdefault(None, set()).add(ext_id)
            elif self.meta.is_known_id(selector):
                select_all_files(self.meta.get_by_id(selector))
            elif self.meta.is_known_hash(selector):
                amo_ext = self.meta.get_by_hash(selector)
                selection.setdefault(amo_ext.id, set()).add(selector)
            elif isinstance(selector, str) and len(selector) == 64 and self.file_db.get(selector) is not None:
                # Hash of a stored file that the metadata does not know about
                selection.setdefault(None, set()).add(selector)
            else:
                try:
                    m = re.compile(selector, re.IGNORECASE)
                except (re.error, TypeError):
                    # TypeError: the selector is not a string at all,
                    # e.g. an unknown numeric AMO ID
                    logger.error("Invalid selector `%s`", selector)
                    continue
                for amo_ext in self.meta:
                    if m.match(amo_ext.name) is not None:
                        select_all_files(amo_ext)

        return selection

    def get_meta(self, selectors):
        """Return a dict mapping each matched AMO ID to its metadata entry (or None)."""
        meta = {}
        match = self.match(selectors)
        for amo_id in match:
            meta[amo_id] = self.meta.get_by_id(amo_id)
        return meta

    def get_ext(self, selectors):
        """
        Return {AMO ID: {file hash: WebExtension}} for all matched files that
        are present in the local file store; missing files are logged and skipped.
        """
        extensions = {}
        match = self.match(selectors)
        for amo_id in match:
            for ext_id in match[amo_id]:
                file_ref = self.file_db.get(ext_id)
                if file_ref is None:
                    logger.warning("Cache miss for ID %s - %s" % (amo_id, ext_id))
                    continue
                file_path = file_ref.abspath
                if amo_id not in extensions:
                    extensions[amo_id] = {}
                extensions[amo_id][ext_id] = we.WebExtension(file_path)
        return extensions

    def grep(self, pattern, selectors=None):
        """Not implemented."""
        pass

    def unzip(self, directories=None, selectors=None):
        """Not implemented."""
        pass

    # Backward-compatible alias for the original, misspelled method name.
    unizp = unzip

    def scan(self, scanners=None, directories=None, selectors=None):
        """Not implemented."""
        pass


# --------------------------------------------------------------------------------
# /webextaware/main.py:
# --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse
import coloredlogs
import logging
import os
import pkg_resources as pkgr
import resource
import sys

from . import modes


# Initialize coloredlogs
logger = logging.getLogger(__name__)
coloredlogs.DEFAULT_LOG_FORMAT = "%(asctime)s %(levelname)s %(threadName)s %(name)s %(message)s"
coloredlogs.install(level="INFO")


def get_args(argv=None):
    """
    Argument parsing
    :param argv: list of command line arguments, defaults to sys.argv[1:]
    :return: parsed arguments (argparse namespace)
    """
    if argv is None:
        argv = sys.argv[1:]

    pkg_version = pkgr.require("webextaware")[0].version
    home = os.path.expanduser("~")

    parser = argparse.ArgumentParser(prog="webextaware")
    parser.add_argument("--version", action="version", version="%(prog)s " + pkg_version)

    parser.add_argument("-d", "--debug",
                        help="Enable debug",
                        action="store_true",
                        default=False)

    parser.add_argument("-w", "--workdir",
                        help="Path to working directory",
                        type=os.path.abspath,
                        action="store",
                        default=os.path.join(home, ".webextaware"))

    # Set up subparsers, one for each mode
    subparsers = parser.add_subparsers(help="run mode", dest="mode")
    modes_list = modes.list_modes()
    for mode_name in modes_list:
        mode_class = modes_list[mode_name]
        sub_parser = subparsers.add_parser(mode_name, help=mode_class.help)
        mode_class.setup_args(sub_parser)

    return parser.parse_args(argv)


def _raise_open_file_limit(wanted=10000):
    """Raise the soft limit on open files to `wanted`, capped by the hard limit."""
    from_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    (soft_limit, hard_limit) = from_limit
    # RLIM_INFINITY is not guaranteed to compare greater than real limits,
    # so it has to be handled explicitly instead of through min().
    if hard_limit == resource.RLIM_INFINITY:
        new_soft_limit = wanted
    else:
        new_soft_limit = min(wanted, hard_limit)
    if soft_limit == resource.RLIM_INFINITY or soft_limit >= new_soft_limit:
        # Never lower a limit that is already high enough.
        logger.debug("Keeping open file limit at %s" % repr(from_limit))
        return
    to_limit = (new_soft_limit, hard_limit)
    logger.debug("Raising open file limit from %s to %s" % (repr(from_limit), repr(to_limit)))
    try:
        resource.setrlimit(resource.RLIMIT_NOFILE, to_limit)
    except (ValueError, OSError) as err:
        # Best effort: keep running with the current limit.
        logger.warning("Unable to raise open file limit: %s" % err)


# This is the entry point used in setup.py
def main():
    """Parse the command line, run the selected mode and return its result code."""
    args = get_args()

    if args.debug:
        coloredlogs.install(level="DEBUG")

    logger.debug("Command arguments: %s" % args)

    # Adjust file limits
    _raise_open_file_limit()

    try:
        result = modes.run(args)

    except KeyboardInterrupt:
        logger.critical("User abort")
        return 5

    if result != 0:
        logger.error("Command failed")
    return result


# --------------------------------------------------------------------------------
# /webextaware/metadata.py:
# --------------------------------------------------------------------------------
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import bz2
import json
import logging
import os


logger = logging.getLogger(__name__)


def get_metadata_file(args):
    """Return the path of the metadata cache file inside args.workdir."""
    return os.path.join(args.workdir, "amo_metadata.json.bz2")


def create_directory_path(amo_id, ext_id, base=None):
    """
    Return the relative path `amo_id/ext_id`, below `base` if it is given.

    Both IDs are converted with str(), so numeric AMO IDs are accepted.
    """
    if base is None:
        return os.path.join(str(amo_id), str(ext_id))
    else:
        return os.path.join(base, str(amo_id), str(ext_id))


class Metadata(object):
    """
    Collection of Extension objects with lookup by AMO ID and by file hash.

    Only entries for which Extension.is_webextension() is true are kept.
    """

    def __init__(self, filename=None, data=None):
        """
        :param filename: bz2-compressed JSON cache file, used by save() and
            read on construction when no `data` is given
        :param data: iterable of raw AMO addon dicts; takes precedence over
            the content of `filename`
        """
        self.__ext = []
        if data is not None:
            self.__ext = self._webextensions_in(data)
        self.__filename = filename
        self.__hash_index = {}
        self.__id_index = {}
        if data is None and filename is not None:
            self.load(filename)
        self.generate_index()

    @staticmethod
    def _webextensions_in(raw_entries):
        """Wrap raw addon dicts as Extension objects, dropping non-WebExtensions."""
        wrapped = (Extension(e) for e in raw_entries)
        return [ext for ext in wrapped if ext.is_webextension()]

    def raw_data(self):
        """Return the internal list of Extension objects."""
        return self.__ext

    def load(self, metadata_filename):
        """
        Replace the content with the entries stored in `metadata_filename`.

        A missing file is logged and leaves the collection empty.
        """
        self.__ext = []
        try:
            with bz2.open(metadata_filename, "r") as f:
                logger.debug("Retrieving metadata state from `%s`" % metadata_filename)
                self.__ext = self._webextensions_in(json.load(f))
        except FileNotFoundError:
            logger.warning("No metadata state stored in `%s`" % metadata_filename)
        # Keep the lookup tables in step with the freshly loaded entries.
        self.generate_index()

    def save(self):
        """Write the entries as bz2-compressed JSON to the configured filename."""
        logger.debug("Writing metadata state to `%s`" % self.__filename)
        with bz2.open(self.__filename, "w") as f:
            f.write(json.dumps(self.__ext).encode("utf-8"))

    def generate_index(self):
        """Rebuild the AMO-ID and file-hash lookup tables."""
        self.__id_index = {}
        self.__hash_index = {}
        for ext in self.__ext:
            self.__id_index[ext.id] = ext
            for h in ext.file_hashes():
                self.__hash_index[h] = ext

    def is_known_id(self, amo_id):
        """Return True if `amo_id` (int or numeric string) is a known AMO ID."""
        try:
            amo_id = int(amo_id)
        except (ValueError, TypeError):
            return False
        return amo_id in self.__id_index

    def is_known_hash(self, hash_id):
        """Return True if `hash_id` is the hash of a known extension file."""
        return hash_id in self.__hash_index

    def get_by_id(self, amo_id):
        """Return the Extension with the given AMO ID, or None."""
        try:
            amo_id = int(amo_id)
        except (ValueError, TypeError):
            return None
        return self.__id_index.get(amo_id)

    def get_by_hash(self, hash_id):
        """Return the Extension that owns the file with the given hash, or None."""
        return self.__hash_index.get(hash_id)

    def get(self, amo_or_hash_id):
        """Return the Extension for an AMO ID or a file hash, or None."""
        if self.is_known_id(amo_or_hash_id):
            return self.get_by_id(amo_or_hash_id)
        elif self.is_known_hash(amo_or_hash_id):
            return self.get_by_hash(amo_or_hash_id)
        else:
            return None

    def id_to_hashes(self, amo_id):
        """Return the list of file hashes for an AMO ID, or None if unknown."""
        ext = self.get_by_id(amo_id)
        if ext is None:
            return None
        return list(ext.file_hashes())

    def hash_to_id(self, hash_id):
        """Return the AMO ID for a file hash, or None if the hash is unknown."""
        ext = self.get_by_hash(hash_id)
        if ext is None:
            return None
        return ext.id

    def __iter__(self):
        for ext in self.__ext:
            yield ext

    def __len__(self):
        return len(self.__ext)

    def iter_files(self):
        """Yield the file entries of all extensions."""
        for ext in self:
            # files is a method and has to be called to obtain the generator
            for f in ext.files():
                yield f


class Extension(dict):
    """
    Raw AMO addon dict with convenience accessors.

    Iterating an Extension yields its file entries, not its dict keys.
    """

    __language_priority = ['en-US', 'en-GB', 'uk', 'de', 'fr', 'pl', 'es', 'it', 'nl']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def id(self):
        return self["id"]

    @property
    def name(self):
        """Name in the first available priority language, else in the first listed one."""
        for lang in self.__language_priority:
            if lang in self["name"]:
                return self["name"][lang]
        lang = list(self["name"].keys())[0]
        return self["name"][lang]

    @property
    def permissions(self):
        """
        Tuple (host permissions, API permissions) aggregated over all files.

        Each element is a set, or None when there is no such permission.
        Permissions containing `/`, `:` or `<` count as host permissions.
        """
        aggregate_api_permissions = set()
        aggregate_host_permissions = set()
        for f in self.files():
            # A file entry without (or with a null) permission list adds nothing
            for p in f.get("permissions") or ():
                if "/" in p or ":" in p or "<" in p:
                    aggregate_host_permissions.add(p)
                else:
                    aggregate_api_permissions.add(p)
        return aggregate_host_permissions or None, aggregate_api_permissions or None

    def is_webextension(self):
        """Return True if the entry has at least one WebExtension file."""
        # True in practice, unless the metadata was downloaded before sep 2021.
        for _ in self.files():
            return True
        return False

    def files(self):
        """Yield the WebExtension file entries of the current version."""
        current_version = self.get("current_version")
        if not current_version:
            return

        # AMO API v5 (latest)
        if "file" in current_version:
            yield current_version["file"]
            return

        # AMO API v3 (from disk by webextaware<=1.2.5)
        # Until 2021, "files" could contain multiple items.
        # Since sep 2021, "files" can only have one item:
        # https://github.com/mozilla/addons-server/issues/17839
        for f in current_version.get("files") or ():
            # If the metadata was downloaded before sep 2021, is_webextension could be false.
            # After sep 2021, all addons on AMO are WebExtensions:
            # https://github.com/mozilla/addons-server/issues/17946
            if f.get("is_webextension"):
                yield f

    def file_hashes(self):
        """Yield the hash of every file, without the `sha256:` style prefix."""
        for f in self.files():
            yield f["hash"].split(":")[1]

    def __iter__(self):
        return self.files()


# The remainder of this dump line is the beginning of the next file. It is kept
# verbatim as comments because it is not part of metadata.py:
# --------------------------------------------------------------------------------
# /webextaware/modes/__init__.py:
# --------------------------------------------------------------------------------
# 1 | # This Source Code Form is subject to the terms of the Mozilla Public
# 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file,
# 3 | # You can obtain one at http://mozilla.org/MPL/2.0/.
# 4 |
# 5 | from . import get
# 6 | from . import grep
# 7 | from . import info
# 8 | from . import libs
# 9 | from . import manifest
# 10 | from . import meta
# 11 | from . import query
# 12 | from .
class GetMode(RunMode):
    """
    Mode to get associated files in cache
    """

    name = "get"
    help = "get associated files in cache"

    @staticmethod
    def setup_args(parser):
        """
        Register the positional selector arguments (one or more).

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="+",
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all`")

    def run(self):
        """
        Print one tab-separated line of AMO ID and absolute cache file path
        for every extension matched by the selectors.

        :return: 10 when nothing matched, 0 otherwise
        """
        matches = self.db.match(self.args.selectors)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        for amo_id in matches:
            for ext_id in matches[amo_id]:
                file_ref = self.files.get(ext_id)
                if file_ref is None:
                    # %s rather than %d: the AMO ID is not guaranteed to be an
                    # int here (the success path below prints it via repr()),
                    # and %d would raise TypeError for e.g. None.
                    logger.warning("Cache miss for AMO ID %s file %s" % (amo_id, ext_id))
                    continue
                file_path = file_ref.abspath
                print("%s\t%s" % (repr(amo_id), file_path))

        return 0
class GrepMode(RunMode):
    """
    Mode to search for patterns in extension content
    """

    name = "grep"
    help = "search extension content for pattern"

    @staticmethod
    def setup_args(parser):
        """
        Register the pattern, the optional selectors and the pass-through
        arguments for `grep -E`.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("regexp",
                            action="store",
                            help="regular expression for `grep -E`")

        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="*",
                            default=["all"],
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all` (default)")

        parser.add_argument('grepargs',
                            nargs=argparse.REMAINDER,
                            help="additional arguments for `grep -E`")

    def setup(self):
        """
        Run the generic setup, then make sure a `grep` binary is known.

        :return: bool, False when the parent setup fails or `grep` is missing
        """
        if not super().setup():
            return False
        grep_binary = webext.WebExtension.grep_exe
        if grep_binary is None:
            logger.critical("Missing `grep` binary")
            return False
        logger.debug("Using `%s` for grepping" % grep_binary)
        return True

    def run(self):
        """
        Grep all matched extensions in parallel and print every hit.

        :return: 10 when nothing matched, 0 otherwise
        """
        matches = self.db.match(self.args.selectors)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        work_list = []
        for amo_id in matches:
            for ext_id in matches[amo_id]:
                work_list.append((amo_id, ext_id))

        for _amo_id, _ext_id, hits in parallel_grep(work_list, self):
            # Workers report None for extensions they could not process
            if hits is not None:
                for hit in hits:
                    print(hit)

        return 0
def grep(work_item):
    """
    Worker function: grep one cached extension for the configured pattern.

    The database, file cache, regexp and extra grep arguments are read from
    the module-level `mp_mode`.

    :param work_item: tuple of (amo_id, ext_id)
    :return: tuple of (amo_id, ext_id, lines); `lines` is None when the
        extension cannot be looked up or its file is not in the cache
    """
    amo_id, ext_id = work_item
    try:
        ext = mp_mode.db.get_ext(ext_id)[amo_id][ext_id]
    except KeyError:
        # %s rather than %d: this branch is reached precisely when amo_id is
        # not a key of the lookup result, and %d raises TypeError for any
        # non-int ID (e.g. None).
        logger.debug("Missing cache file for %s - %s" % (amo_id, ext_id))
        return amo_id, ext_id, None
    file_ref = mp_mode.files.get(ext_id)
    if file_ref is None:
        logger.warning("Cache miss for ID %s - %s" % (amo_id, ext_id))
        return amo_id, ext_id, None

    logger.debug("Grepping in %s, %s" % (amo_id, ext_id))
    lines = []
    try:
        # Substituted for the placeholder that ext.grep() leaves in its output
        package_id = "%s%s%s" % (amo_id, os.path.sep, ext_id)
        for line in ext.grep(mp_mode.args.regexp, mp_mode.args.grepargs, color=sys.stdout.isatty()):
            lines.append(line.replace("<%= PACKAGE_ID %>", package_id))
    finally:
        ext.cleanup()
    return amo_id, ext_id, lines
class InfoMode(RunMode):
    """
    Mode to provide info on the current state of webextaware
    """

    name = "info"
    help = "print info on state of local cache"

    def run(self):
        """
        Print how many AMO IDs and extensions the `all` selector matches,
        how many files the cache holds, and the difference between file and
        extension count as the number of orphans.

        :return: 0
        """
        matches = self.db.match("all")
        known_amo_ids = set(matches)
        known_ext_ids = {ext_id for amo_id in matches for ext_id in matches[amo_id]}

        print("AMO IDs in local cache: %d" % len(known_amo_ids))
        referenced = len(known_ext_ids)
        print("Referenced extensions in cache: %d" % referenced)
        cached = len(self.files)
        print("Total files in cache: %d" % cached)
        print("Orphans in cache: %d" % (cached - referenced))
        return 0
class LibsMode(RunMode):
    """
    Mode to detect libraries and frameworks
    """

    name = "libs"
    help = "collect statistics on libraries and frameworks"

    @staticmethod
    def setup_args(parser):
        """
        Register the output format switches and the optional selectors.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("-e", "--perext",
                            action="store_true",
                            help="use web extension-centric output format")

        parser.add_argument("-t", "--traverse",
                            action="store_true",
                            help="produce a grep-friendly output format")

        parser.add_argument("-H", "--human",
                            action="store_true",
                            help="print human-readable output format")

        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="*",
                            default=["all"],
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all` (default)")

    def run(self):
        """
        Scan the matched extensions with the retire scanner and print the
        detections, either per extension or grouped by component.

        :return: 5 when the node environment is unavailable, 10 when nothing
            matched, 20 when the scanner's dependencies are missing, else 0
        """
        node_dir = check_npm_install(self.args)
        if node_dir is None:
            return 5
        matches = self.db.match(self.args.selectors)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        retire_instance = scanner.RetireScanner(node_dir=node_dir)
        if not retire_instance.dependencies():
            return 20

        work_list = [(amo_id, ext_id) for amo_id in matches for ext_id in matches[amo_id]]
        results = {}
        for amo_id, ext_id, result in parallel_scan(work_list, self, retire_instance):
            results.setdefault(amo_id, {})[ext_id] = result

        if self.args.perext:
            if self.args.human:
                logger.warning("Human-readable output not implemented for per-extension results")
            # Remove entries with empty results
            for amo_id in results:
                for ext_id in results[amo_id]:
                    detections = results[amo_id][ext_id]
                    if detections is None:
                        # A scan can report None instead of a list (the
                        # by-component path below skips such entries too);
                        # filtering None would raise TypeError.
                        results[amo_id][ext_id] = []
                        continue
                    results[amo_id][ext_id] = [r for r in detections
                                               if "results" in r and len(r["results"]) > 0]
            if not self.args.traverse:
                print(json.dumps(results, indent=4))
            else:
                for line in traverse(results):
                    print(line.lstrip("/"))

        else:
            components = by_components(results)

            if not self.args.human:
                print(json.dumps(components, indent=4))

            else:
                aggregate = aggregate_counts(components)
                amo_count = len(matches)
                ext_count = sum([len(matches[amo_id]) for amo_id in matches])
                # One line per component and (sub-)version with absolute and
                # relative counts of AMO IDs and extensions
                for component in sorted(aggregate.keys()):
                    for version in sorted(aggregate[component].keys()):
                        print("%40s\t%-15s\t%7d (%.1f%%)\t%7d (%.1f%%)" % (
                            component,
                            version,
                            aggregate[component][version]["amo_ids"],
                            aggregate[component][version]["amo_ids"] * 100.0 / amo_count,
                            aggregate[component][version]["ext_ids"],
                            aggregate[component][version]["ext_ids"] * 100.0 / ext_count
                        ))

        return 0
def by_components(results):
    """
    Regroup per-extension scan results by detected component and version.

    :param results: dict of dicts, results[amo_id][ext_id] being either
        None (skipped) or a list of detections. A detection containing a
        `results` key contributes each of its results together with its
        `file`; any other detection is iterated as a sequence of file-less
        `component`/`version` records.
    :return: dict components[name][version] holding the data of the first
        record seen for that version (without `detection` for file-based
        results) plus `matches`, which maps amo_id -> ext_id -> list of
        files (None for file-less records)
    """
    components = {}

    def files_for(info, strip_detection):
        # Return the list collecting files of the current amo_id/ext_id for
        # info's component/version, creating all missing levels on the way.
        c_name = info["component"]
        c_version = info["version"]
        versions = components.setdefault(c_name, {})
        if c_version not in versions:
            entry = dict(info)
            if strip_detection:
                entry.pop("detection", None)
            entry["matches"] = {}
            versions[c_version] = entry
        per_amo = versions[c_version]["matches"].setdefault(amo_id, {})
        return per_amo.setdefault(ext_id, [])

    for amo_id, per_ext in results.items():
        for ext_id, detections in per_ext.items():
            if detections is None:
                continue
            for detection in detections:
                if "results" in detection:
                    # This is a regular result with `file` and `results` keys
                    for hit in detection["results"]:
                        files_for(hit, True).append(detection["file"])
                else:
                    # This is a file-less result with just `component` and `version`
                    for hit in detection:
                        files_for(hit, False).append(None)
    return components
class MetadataMode(RunMode):
    """
    Mode to print the manifests of selected extensions
    """

    name = "manifest"
    help = "print manifests as JSON"

    @staticmethod
    def setup_args(parser):
        """
        Register the output format switches and the selector arguments.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("-r", "--raw",
                            action="store_true",
                            help="dump raw manifests instead of digested JSON")

        parser.add_argument("-t", "--traverse",
                            action="store_true",
                            help="use a grep-friendly output format")

        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="+",
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all`")

    @staticmethod
    def check_args(args):
        """
        Reject the combination of `--raw` and `--traverse`.

        :param args: parsed arguments object
        :return: bool
        """
        if args.raw and args.traverse:
            logger.critical("Cannot combine `--raw` and `--traverse`")
            return False
        return True

    def run(self):
        """
        Print the manifests of all selected extensions, as grep-friendly
        lines (`--traverse`), as raw bytes (`--raw`) or as one JSON document.

        :return: 10 when nothing matched, 0 otherwise
        """
        exts = self.db.get_ext(self.args.selectors)
        if len(exts) == 0:
            logger.warning("No results")
            return 10

        def selected():
            # Flatten the two-level mapping into (amo_id, ext_id, extension)
            for amo in exts:
                for ext in exts[amo]:
                    yield amo, ext, exts[amo][ext]

        if self.args.traverse:
            for amo_id, ext_id, extension in selected():
                for line in extension.manifest().traverse():
                    print("%s/%s/manifest.json%s" % (amo_id, ext_id, line))
            return 0

        if self.args.raw:
            out = sys.stdout.buffer
            for _amo_id, _ext_id, extension in selected():
                raw = extension.manifest().raw
                out.write(raw)
                # Terminate every manifest with a newline
                if not raw.endswith(b"\n"):
                    out.write(b"\n")
                sys.stdout.flush()
            return 0

        manifests = {}
        for amo_id, ext_id, extension in selected():
            per_amo = manifests.setdefault(amo_id, {})
            try:
                parsed = extension.manifest()
            except Exception as e:
                logger.warning("Unable to parse extension manifest of %d - %s: %s" % (amo_id, ext_id, str(e)))
                per_amo[ext_id] = None
            else:
                per_amo[ext_id] = parsed.json

        print(json.dumps(manifests, sort_keys=True, indent=4))

        return 0
class QueryMode(RunMode):
    """
    Mode to query AMO and web extension IDs
    """

    name = "query"
    help = "query relations between AMO IDs and web extension IDs"

    @staticmethod
    def setup_args(parser):
        """
        Register the selector arguments (one or more), stored as `ids`.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("ids",
                            metavar="selector",
                            nargs="+",
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all`")

    def run(self):
        """
        Print one tab-separated line of AMO ID, extension ID and extension
        name for every match.

        :return: 10 when nothing matched, 0 otherwise
        """
        matches = self.db.match(self.args.ids)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        for amo_id in matches:
            for ext_id in matches[amo_id]:
                if amo_id is not None:
                    record = self.db.get_meta(amo_id)[amo_id]
                    label = record.name
                else:
                    # Orphans are not referenced in current metadata
                    cached_path = self.files.get(ext_id).abspath
                    label = we.WebExtension(cached_path).manifest()["name"]
                print("%s\t%s\t%s" % (repr(amo_id), ext_id, label))

        return 0
31 | 32 | :param parser: parent argparser to add to 33 | :return: None 34 | """ 35 | pass 36 | 37 | @staticmethod 38 | def check_args(args): 39 | """ 40 | Validate mode args 41 | 42 | :param args: parsed arguments object 43 | :return: bool 44 | """ 45 | del args 46 | return True 47 | 48 | def __init__(self, args, files=None, metadata=None, database=None): 49 | self.args = args 50 | 51 | if not os.path.isdir(args.workdir): 52 | os.makedirs(args.workdir) 53 | self.workdir = args.workdir 54 | 55 | self.files = files 56 | self.meta = metadata 57 | self.db = database 58 | 59 | def setup(self): 60 | """ 61 | Performs all the setup shared among multiple runs of the mode. 62 | Put everything here that takes too long for __init__(). 63 | :return: None 64 | """ 65 | if self.files is None: 66 | db_dir = os.path.join(self.workdir, "webext_data") 67 | if not os.path.isdir(db_dir): 68 | os.makedirs(db_dir) 69 | self.files = hashfs.HashFS(db_dir, depth=4, width=1, algorithm='sha256') 70 | 71 | if self.meta is None: 72 | self.meta = md.Metadata(filename=md.get_metadata_file(self.args)) 73 | 74 | if self.db is None: 75 | self.db = db.Database(self.args, files=self.files, metadata=self.meta) 76 | 77 | return True 78 | 79 | def run(self): 80 | """ 81 | Executes the the steps that constitutes the actual run. 82 | Results are kept internally in the class instance. 83 | :return: None 84 | """ 85 | pass 86 | 87 | def teardown(self): 88 | """ 89 | Clean up steps required after runs were performed. 
def run(args):
    """
    Instantiate the run mode selected by `args.mode` and execute it.

    `args.mode` defaults to `info` when it is None. The mode's teardown()
    is called whenever setup() was attempted, no matter how the run ends.

    :param args: parsed arguments object
    :return: 5 for an unknown mode or rejected arguments, 10 when the
        mode's setup fails, otherwise whatever the mode's run() returns
    """
    all_modes = list_modes()

    if args.mode is None:
        args.mode = "info"

    # Only the lookup is guarded, so that a KeyError raised inside a mode's
    # constructor is not misreported as an unknown mode.
    try:
        mode_class = all_modes[args.mode]
    except KeyError:
        logger.critical("Unknown run mode `%s`" % args.mode)
        return 5
    current_mode = mode_class(args)

    if not current_mode.check_args(args):
        return 5

    try:
        logger.debug("Running mode .setup()")
        if not current_mode.setup():
            logger.critical("Setup failed")
            return 10
        logger.debug("Running mode .run()")
        return current_mode.run()
    finally:
        # Runs on success, on setup failure and on every exception, not just
        # KeyboardInterrupt; the exception then propagates unchanged.
        logger.debug("Running mode .teardown()")
        current_mode.teardown()
class ScanMode(RunMode):
    """
    Mode to run security scanners
    """

    name = "scan"
    help = "run security scanners on extensions"

    @staticmethod
    def setup_args(parser):
        """
        Register the repeatable scanner option and the optional selectors.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("-s", "--scanner",
                            action="append",
                            choices=sorted(scanner.list_scanners().keys()),
                            help="scanner to use (`retire` or `scanjs`; default: all)")

        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="*",
                            default=["all"],
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all` (default)")

    def run(self):
        """
        Run the chosen scanners over all matched extensions and print the
        collected results as JSON.

        :return: 5 when the node environment is unavailable, 10 when nothing
            matched, 20 when a scanner's dependencies are missing, else 0
        """
        node_dir = check_npm_install(self.args)
        if node_dir is None:
            return 5
        matches = self.db.match(self.args.selectors)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        available = scanner.list_scanners()
        requested = self.args.scanner
        if requested is None or len(requested[0]) == 0:
            # No explicit choice: use every scanner there is
            chosen = available
        else:
            chosen = {wanted: available[wanted] for wanted in requested if wanted in available}

        scanners = []
        for scanner_name in chosen:
            instance = chosen[scanner_name](node_dir=node_dir)
            if not instance.dependencies():
                return 20
            scanners.append(instance)

        work_list = []
        for amo_id in matches:
            for ext_id in matches[amo_id]:
                work_list.append((amo_id, ext_id))

        results = {}
        for amo_id, ext_id, outcome in parallel_scan(work_list, self, scanners):
            results.setdefault(amo_id, {})[ext_id] = outcome

        print(json.dumps(results, indent=4))

        return 0
def scan(work_item):
    """
    Worker function: run all configured scanners on one cached extension.

    The database and file cache are read from the module-level `mp_mode`,
    the scanner instances from `mp_scanners`.

    :param work_item: tuple of (amo_id, ext_id)
    :return: tuple of (amo_id, ext_id, result); `result` maps scanner name
        to that scanner's result, or is None when the extension cannot be
        looked up or its file is not in the cache
    """
    amo_id, ext_id = work_item
    try:
        ext = mp_mode.db.get_ext(ext_id)[amo_id][ext_id]
    except KeyError:
        # %s rather than %d: this branch is reached precisely when amo_id is
        # not a key of the lookup result, and %d raises TypeError for any
        # non-int ID (e.g. None).
        logger.debug("Missing cache file for %s - %s" % (amo_id, ext_id))
        return amo_id, ext_id, None
    file_ref = mp_mode.files.get(ext_id)
    if file_ref is None:
        logger.warning("Cache miss for ID %s - %s" % (amo_id, ext_id))
        return amo_id, ext_id, None

    result = {}
    for scanner_instance in mp_scanners:
        logger.info("Running %s scan on %s, %s" % (scanner_instance.name, amo_id, ext_id))
        scanner_instance.scan(extension=ext)
        result[scanner_instance.name] = scanner_instance.result

    return amo_id, ext_id, result
class ShellMode(RunMode):
    """
    Mode to run an IPython shell
    """

    name = "shell"
    help = "drop into an IPython shell"

    @staticmethod
    def setup_args(parser):
        """
        Register the optional selector arguments (default: `all`).

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("selectors",
                            metavar="selector",
                            nargs="*",
                            default=["all"],
                            help="AMO IDs, extension IDs, regexp, `orphans`, `all` (default)")

    def run(self):
        """
        Resolve the selectors, then open an embedded IPython shell.

        :return: 10 when nothing matched, 0 after the shell was left
        """

        matches = self.db.match(self.args.selectors)
        if len(matches) == 0:
            logger.warning("No results")
            return 10

        # Just for convenience
        # NOTE(review): these locals are never read in this method;
        # presumably ipy.embed() exposes the calling frame's namespace, so
        # they (and `matches`) are what the user works with inside the
        # shell. Confirm against the IPython docs before removing or
        # renaming them.
        meta = self.meta
        files = self.files
        db = self.db

        ipy.embed()

        return 0
class StatsMode(RunMode):
    """
    Mode to generate extension statistics
    """

    name = "stats"
    help = "print CSV of web extension statistics"

    @staticmethod
    def setup_args(parser):
        """
        Register the optional output file argument.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("-o", "--output",
                            help="file for CSV output (default: stdout)",
                            action="store",
                            default=None)

    def run(self):
        """
        Write one CSV row per AMO metadata record, preceded by a header
        row, to the output file or to stdout.

        :return: 0
        """
        field_names = [
            "amo_id",
            "name",
            "average_daily_users",
            "weekly_downloads",
            "host_permissions",
            "api_permissions"
        ]
        output_file = None
        if self.args.output is not None:
            # The csv module asks for files opened with newline="";
            # otherwise extra carriage returns end up in the output on
            # platforms that translate line endings.
            output_file = open(self.args.output, "w", newline="")
        try:
            stream = sys.stdout if output_file is None else output_file
            csv_writer = csv.DictWriter(stream, fieldnames=field_names)
            csv_writer.writeheader()

            metas = self.db.get_meta("all")
            for amo_id in metas.keys():
                ext = metas[amo_id]
                host_permissions, api_permissions = ext.permissions
                csv_writer.writerow({
                    "amo_id": ext.id,
                    "name": ext.name,
                    "average_daily_users": ext["average_daily_users"],
                    "weekly_downloads": ext["weekly_downloads"],
                    "host_permissions": host_permissions,
                    "api_permissions": api_permissions
                })

            if output_file is None:
                sys.stdout.flush()
        finally:
            # Close the file even when reading metadata or writing a row fails
            if output_file is not None:
                output_file.close()

        return 0
class SyncMode(RunMode):
    """
    Mode to update local AMO metadata and web extension file cache
    """

    name = "sync"
    help = "update local AMO metadata and web extension file cache"

    @staticmethod
    def setup_args(parser):
        """
        Register the switch that skips the metadata update.

        :param parser: argparse parser to add to
        :return: None
        """
        parser.add_argument("-n", "--nometa",
                            help="do not update metadata, just download extensions",
                            action="store_true",
                            default=False)

    def run(self):
        """
        Replace and save the metadata with a fresh download from AMO unless
        `--nometa` was given, then update the cached extension files.

        :return: 0
        """
        if not self.args.nometa:
            logger.info("Downloading current metadata set from AMO")
            metadata_file = md.get_metadata_file(self.args)
            fresh_data = amo.download_metadata()
            self.meta = md.Metadata(filename=metadata_file, data=fresh_data)
            self.meta.save()
            logger.info("Downloaded metadata set contains %d web extensions" % len(self.meta))
        else:
            logger.warning("Using cached AMO metadata, not updating")

        logger.info("Downloading missing web extensions")
        amo.update_files(self.meta, self.files)
        return 0
logger = logging.getLogger(__name__)


class UnzipMode(RunMode):
    """
    Mode to extract extensions
    """

    name = "unzip"
    help = "extract extensions"

    @staticmethod
    def setup_args(parser):
        """Register the unzip options and the positional selectors."""
        parser.add_argument(
            "-n", "--nooverwrite",
            help="do not overwrite existing extension directories",
            action="store_true",
        )
        parser.add_argument(
            "-o", "--outdir",
            help="root path for extraction (default: $PWD/ext)",
            default="ext",
            action="store",
        )
        parser.add_argument(
            "-p", "--prune",
            help="delete obsolete (unreferenced) extensions from outdir",
            action="store_true",
        )
        parser.add_argument(
            "selectors",
            help="AMO IDs, extension IDs, regexp, `orphans`, `all`",
            nargs="+",
            metavar="selector",
        )

    def run(self):
        """
        Extract every selected extension below the output directory and print
        each extraction path; optionally prune orphans afterwards.

        :return: 10 when the selectors match nothing, otherwise 0
        """
        selected = self.db.get_ext(self.args.selectors)
        if not len(selected):
            logger.warning("No results")
            return 10

        for amo_id in selected:
            for ext_id in selected[amo_id]:
                target = create_directory_path(str(amo_id), ext_id, base=self.args.outdir)
                logger.debug("Considering to unzip %d to %s" % (amo_id, target))

                # Creating the directory doubles as the existence check
                existed_before = False
                try:
                    os.makedirs(target, exist_ok=False)
                except FileExistsError:
                    existed_before = True
                if existed_before and self.args.nooverwrite:
                    logger.info("Skipping existing directory %s" % target)
                    continue

                with selected[amo_id][ext_id] as ext:
                    ext.unzip(target)
                print(target)

        if self.args.prune:
            self._prune_orphans()

        return 0

    def _prune_orphans(self):
        """Delete output directories that match the IDs of orphaned extensions."""
        logger.info("Pruning orphans from output directory")
        orphans = self.db.get_ext("orphans")[0]
        for ext_id in orphans:
            # A wildcard in place of the AMO ID matches the orphan under any ID
            pattern = create_directory_path("*", ext_id, base=self.args.outdir)
            for match in glob.glob(pattern):
                logger.info("Pruning `%s`" % match)
                rmtree(match)
-------------------------------------------------------------------------------- /webextaware/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webextaware", 3 | "version": "1.2.1", 4 | "description": "Dummy package for installing webextaware node dependencies", 5 | "dependencies": { 6 | "eslint-config-scanjs": "mozfreddyb/eslint-config-scanjs", 7 | "eslint-plugin-no-unsafe-innerhtml": "1.0", 8 | "eslint": "4.6", 9 | "retire": "1.4" 10 | }, 11 | "devDependencies": {}, 12 | "scripts": { 13 | "retire": "retire", 14 | "eslint": "eslint", 15 | "test": "echo \"Error: no test specified\" && exit 1" 16 | }, 17 | "repository": { 18 | "type": "git", 19 | "url": "git+https://github.com/cr/webextaware.git" 20 | }, 21 | "keywords": [ 22 | "dummy" 23 | ], 24 | "author": "cr@mozilla.com", 25 | "license": "MPL-2.0", 26 | "bugs": { 27 | "url": "https://github.com/cr/webextaware/issues" 28 | }, 29 | "homepage": "https://github.com/cr/webextaware#readme" 30 | } 31 | -------------------------------------------------------------------------------- /webextaware/scanner.py: -------------------------------------------------------------------------------- 1 | # This Source Code Form is subject to the terms of the Mozilla Public 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 3 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
logger = logging.getLogger(__name__)


class Scanner(object):
    """
    Base class of all scanners.

    Subclasses are discovered by `list_scanners()` and registered under
    their `name` attribute. The keyword arguments given to the constructor
    are kept in `self.args`; the outcome of the last scan is kept in
    `self.result`.
    """

    name = "dummy"

    def __init__(self, **kwargs):
        """Store the keyword arguments and reset the result to None."""
        self.args = kwargs
        self.result = None

    def dependencies(self):
        """Return True; the dummy scanner needs no external tools."""
        return True

    def scan(self, directory=None, extension=None):
        """Dummy scan: ignores both arguments and stores an empty result."""
        self.result = {}

    def is_scanning(self):
        """Return True as long as no result is stored (`self.result` is None)."""
        return self.result is None

    def wait(self):
        """
        Poll every 0.1 seconds until a result is stored, then return it.

        NOTE(review): the scanners in this module also store None when a
        scan fails, so waiting on a failed scan would not return -- confirm
        how callers use this.
        """
        while self.is_scanning():
            sleep(0.1)
        # `result` is plain data; it must be returned, not called.
        return self.result


def __subclasses_of(cls):
    """Return the direct subclasses of `cls` followed by all indirect ones."""
    sub_classes = cls.__subclasses__()
    sub_sub_classes = []
    for sub_cls in sub_classes:
        sub_sub_classes += __subclasses_of(sub_cls)
    return sub_classes + sub_sub_classes


def list_scanners():
    """Return a dict mapping scanner names to scanner classes"""
    return {scanner.name: scanner for scanner in __subclasses_of(Scanner)}


def _npm_query(subcommand, node_dir):
    """
    Run `npm <subcommand>` in `node_dir`.

    :return: first whitespace-delimited token of the command output, or
        None (after logging a critical message) when the command cannot
        be run, fails, or prints nothing
    """
    try:
        output = subprocess.check_output(["npm", subcommand], cwd=node_dir)
    except FileNotFoundError:
        logger.critical("Node Package Manager not found")
        return None
    except subprocess.CalledProcessError as e:
        logger.critical("Error running `npm %s`: `%s`" % (subcommand, str(e)))
        return None
    tokens = output.decode("utf-8").split()
    if not tokens:
        logger.critical("No output from `npm %s`" % subcommand)
        return None
    return tokens[0]


def _resolve_binary(path, label):
    """
    Return `path` or `path` + ".exe", whichever is an existing file.

    :return: the existing path, or None after logging a critical message
    """
    for candidate in (path, "%s.exe" % path):
        logger.debug("Checking `%s`" % candidate)
        if os.path.isfile(candidate):
            return candidate
    logger.critical("Unable to find %s binary" % label)
    return None


def _binary_runs(binary, node_dir, label):
    """Return True if `<binary> --version` exits successfully in `node_dir`."""
    try:
        subprocess.check_call([binary, "--version"], cwd=node_dir,
                              stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (OSError, subprocess.CalledProcessError) as e:
        logger.critical("Error running %s binary: `%s`" % (label, str(e)))
        return False
    return True


class RetireScanner(Scanner):
    """
    Scanner running the retire.js binary on an unzipped extension.

    Reads `node_dir` and, optionally, `retire_bin` from `self.args`.
    """

    name = "retire"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def dependencies(self):
        """
        Locate the retire.js binary and check that it can be executed.

        Uses `self.args["retire_bin"]` when present, otherwise looks for
        `retire` in the directory reported by `npm bin`. On success the
        resolved path is stored in `self.args["retire_bin"]`.

        :return: True when the binary was found and runs, False otherwise
        """
        # Kept from the original implementation: checking dependencies
        # resets the stored result to an empty dict.
        self.result = {}
        if "retire_bin" in self.args:
            retire_bin = self.args["retire_bin"]
        else:
            node_bin_path = _npm_query("bin", self.args["node_dir"])
            if node_bin_path is None:
                return False
            retire_bin = os.path.join(node_bin_path, "retire")
        retire_bin = _resolve_binary(retire_bin, "retire.js")
        if retire_bin is None:
            return False
        self.args["retire_bin"] = retire_bin
        logger.debug("Using retire.js binary at `%s`" % retire_bin)
        return _binary_runs(retire_bin, self.args["node_dir"], "retire.js")

    def scan(self, unzip_dir=None, extension=None, verbose=False):
        """
        Run retire.js on `unzip_dir` and store the parsed JSON output.

        :param unzip_dir: directory to scan; a temporary directory is
            created (and removed afterwards) when None
        :param extension: if given, its `unzip()` method is called with the
            scan directory before scanning
        :param verbose: pass `--verbose` to retire.js
        :return: None; the outcome is stored in `self.result`, which is
            None when the output cannot be decoded
        """
        rm_unzip_dir = unzip_dir is None
        if rm_unzip_dir:
            unzip_dir = tempfile.mkdtemp()
        try:
            if extension is not None:
                extension.unzip(unzip_dir)
            cmd = [self.args["retire_bin"], "--outputformat", "json", "--outputpath", "/dev/stdout",
                   "--js", "--jspath", unzip_dir]
            if verbose:
                # List all detected frameworks, not just vulnerable
                cmd.append("--verbose")
            logger.debug("Running shell command `%s`" % " ".join(cmd))
            cmd_output = subprocess.run(cmd, cwd=self.args["node_dir"], check=False, stdout=subprocess.PIPE,
                                        stderr=subprocess.DEVNULL).stdout
            logger.debug("Shell command output: `%s`" % cmd_output)
        finally:
            # Remove the temporary directory even when unzipping or the
            # subprocess call raises.
            if rm_unzip_dir:
                shutil.rmtree(unzip_dir, ignore_errors=True)
        try:
            result = json.loads(cmd_output.decode("utf-8"))
        except (UnicodeDecodeError, json.decoder.JSONDecodeError):
            logger.warning("retirejs call failed, probably due to network failure")
            logger.warning("Failing output is `%s`" % cmd_output)
            self.result = None
            return
        # Make file paths relative
        for r in result:
            if "file" not in r:
                continue
            if r["file"].startswith(unzip_dir):
                r["file"] = os.path.relpath(r["file"], start=unzip_dir)
        self.result = result


class ScanJSScanner(Scanner):
    """
    Scanner running eslint with the scanjs configuration on an unzipped
    extension.

    Reads `node_dir` and, optionally, `eslint_bin` from `self.args`.
    """

    name = "scanjs"

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def dependencies(self):
        """
        Locate the eslint binary and the scanjs `.eslintrc` file.

        Uses `self.args["eslint_bin"]` when present, otherwise looks for
        `eslint` in the directory reported by `npm bin`. The config file is
        expected at `eslint-config-scanjs/.eslintrc` below the directory
        reported by `npm root`. On success both paths are stored in
        `self.args` (`eslint_bin`, `eslint_rc`).

        :return: True when everything was found and eslint runs, False
            otherwise
        """
        node_root_path = _npm_query("root", self.args["node_dir"])
        if node_root_path is None:
            return False
        if "eslint_bin" in self.args:
            eslint_bin = self.args["eslint_bin"]
        else:
            node_bin_path = _npm_query("bin", self.args["node_dir"])
            if node_bin_path is None:
                return False
            eslint_bin = os.path.join(node_bin_path, "eslint")
        eslint_bin = _resolve_binary(eslint_bin, "eslint")
        if eslint_bin is None:
            return False
        self.args["eslint_bin"] = eslint_bin
        logger.debug("Using eslint binary at `%s`" % eslint_bin)
        if not _binary_runs(eslint_bin, self.args["node_dir"], "eslint"):
            return False
        eslint_rc = os.path.join(node_root_path, "eslint-config-scanjs", ".eslintrc")
        logger.debug("Checking `%s`" % eslint_rc)
        if not os.path.isfile(eslint_rc):
            logger.critical("You must install the `eslint-plugin-scanjs-rules` node module")
            return False
        self.args["eslint_rc"] = eslint_rc
        logger.debug("Using scanjs config at `%s`" % eslint_rc)
        return True

    def scan(self, unzip_dir=None, extension=None):
        """
        Run eslint on `unzip_dir` and store the parsed JSON output.

        :param unzip_dir: directory to scan; a temporary directory is
            created (and removed afterwards) when None
        :param extension: if given, its `unzip()` method is called with the
            scan directory before scanning
        :return: None; the outcome is stored in `self.result`, which is
            None when eslint prints nothing or its output cannot be decoded
        """
        rm_unzip_dir = unzip_dir is None
        if rm_unzip_dir:
            unzip_dir = tempfile.mkdtemp()
        try:
            if extension is not None:
                extension.unzip(unzip_dir)
            cmd = [self.args["eslint_bin"],
                   "--no-eslintrc",
                   "--no-inline-config",
                   "--ignore-pattern", "__MACOSX",
                   "--quiet",  # suppresses warnings
                   "-c", self.args["eslint_rc"],
                   "-f", "json",
                   unzip_dir]
            logger.debug("Running shell command `%s`" % " ".join(cmd))
            cmd_output = subprocess.run(cmd, cwd=self.args["node_dir"], check=False, stdout=subprocess.PIPE,
                                        stderr=subprocess.DEVNULL).stdout
            logger.debug("Shell command output: `%s`" % cmd_output)
        finally:
            # Remove the temporary directory even when unzipping or the
            # subprocess call raises.
            if rm_unzip_dir:
                shutil.rmtree(unzip_dir, ignore_errors=True)
        if len(cmd_output) == 0:
            self.result = None
            return
        try:
            result = json.loads(cmd_output.decode("utf-8"))
        except (UnicodeDecodeError, json.decoder.JSONDecodeError) as err:
            logger.error("Failed to decode eslint output: %s" % str(err))
            logger.error("Failing output: %s" % cmd_output)
            self.result = None
            return

        for r in result:
            # Make file paths relative
            if r["filePath"].startswith(unzip_dir):
                r["filePath"] = os.path.relpath(r["filePath"], start=unzip_dir)
            # Strip those massive `source` keys
            r["source"] = "/* stripped from results */"
            for m in r["messages"]:
                m["source"] = "/* stripped from results */"
        self.result = result
logger = logging.getLogger(__name__)


class WebExtension(object):
    """
    Wrapper around a web extension archive (a zip file) on disk.

    Can be used as a context manager; leaving the context calls
    `cleanup()`.
    """

    # Path of the grep binary, looked up once when the class is created;
    # None when neither `grep` nor `grep.exe` is found on the search path.
    # `shutil.which` replaces the deprecated `distutils.spawn.find_executable`.
    grep_exe = shutil.which("grep")
    if grep_exe is None:
        grep_exe = shutil.which("grep.exe")

    def __init__(self, filename):
        """Remember the archive path; nothing is opened or extracted yet."""
        self.filename = filename
        self.unzip_folder = None
        self.unzip_folder_is_temp = False

    def __str__(self):
        """Return a short description built from the manifest's name and version."""
        manifest = self.manifest()
        return "<WebExtension %s %s>" % (manifest["name"], manifest["version"])

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.cleanup()

    def _open_ZipFile(self):
        """Open the archive for reading."""
        return zipfile.ZipFile(self.filename)

    def manifest(self):
        """Read `manifest.json` from the archive and return it as a `Manifest`."""
        logger.debug("Preparing manifest for %s" % self.filename)
        with self._open_ZipFile() as z:
            manifest = z.read("manifest.json")
        return Manifest(manifest)

    def ls(self):
        """Return the list of member names in the archive."""
        with self._open_ZipFile() as z:
            return z.namelist()

    def unzip(self, unzip_folder=None):
        """
        Extract the archive and return the extraction folder.

        When the extension was already extracted and that folder still
        exists, it is returned as is and `unzip_folder` is ignored.

        :param unzip_folder: target folder (created if missing); a temporary
            folder is created when None
        :return: path of the folder holding the extracted files
        """
        if self.unzip_folder is not None and os.path.isdir(self.unzip_folder):
            return self.unzip_folder
        if unzip_folder is None:
            self.unzip_folder = tempfile.mkdtemp(prefix="webextaware_unzip_")
            self.unzip_folder_is_temp = True
        else:
            self.unzip_folder = unzip_folder
            self.unzip_folder_is_temp = False
            os.makedirs(self.unzip_folder, exist_ok=True)
        with self._open_ZipFile() as z:
            z.extractall(self.unzip_folder)
        return self.unzip_folder

    def is_unzipped(self):
        """Return True when an extraction folder is recorded."""
        return self.unzip_folder is not None

    def cleanup(self):
        """Remove the extraction folder, but only if it is a temporary one."""
        if self.unzip_folder is not None and self.unzip_folder_is_temp:
            shutil.rmtree(self.unzip_folder)
            self.unzip_folder = None
            self.unzip_folder_is_temp = False

    def find(self, glob_pattern):
        """Return the archive member names matching `glob_pattern` (fnmatch rules)."""
        return [file_name for file_name in self.ls() if fnmatch.fnmatch(file_name, glob_pattern)]

    def grep(self, regexp, grep_args=None, color=False):
        """
        Run `grep -E -r` for `regexp` over the extracted extension.

        :param regexp: extended regular expression to search for
        :param grep_args: list of additional grep command line arguments
        :param color: force colored grep output when True
        :return: None when no grep binary is available, otherwise a list of
            result lines in which the extraction folder prefix is replaced
            by `<%= PACKAGE_ID %>` (empty when nothing matches or the
            output is not valid UTF-8)
        """
        if self.grep_exe is None:
            logger.critical("Can't find the `grep` binary.")
            return None
        if grep_args is None:
            grep_args = []
        if color:
            color_arg = ["--color=always"]
        else:
            color_arg = ["--color=never"]
        folder = self.unzip()
        # `-e` marks the pattern explicitly, so a regexp starting with `-`
        # is not mistaken for a command line option.
        cmd = [self.grep_exe, "-E", "-e", regexp] + grep_args + color_arg + ["-r", folder]
        logger.debug("Running shell command `%s`" % " ".join(cmd))
        grep_result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if grep_result.stderr is not None and len(grep_result.stderr) > 0:
            logger.warning("Shell command yielded errors: `%s`" % grep_result.stderr.decode("utf-8"))
        if grep_result.stdout is None or len(grep_result.stdout) == 0:
            return []
        results = []
        try:
            decoded_result = grep_result.stdout.decode("utf-8")
        except UnicodeDecodeError as err:
            logger.warning("Error decoding grep results in `%s`: %s" % (self.unzip_folder, err))
            return results
        binary_prefix = "Binary file "
        binary_suffix = " matches"
        for line in decoded_result.splitlines():
            if line.startswith(folder):
                # Replace only the leading folder, not later occurrences in
                # the matched content.
                results.append(line.replace(folder, "<%= PACKAGE_ID %>", 1))
            elif line.startswith(binary_prefix) and line.endswith(binary_suffix):
                filename = line[len(binary_prefix):-len(binary_suffix)]
                if not os.path.isfile(filename) or not filename.startswith(folder):
                    logger.warning("Unexpected grep output: `%s`" % line)
                results.append("%s: Binary file matches" % filename.replace(folder, "<%= PACKAGE_ID %>", 1))
        return results


class Manifest(object):
    """
    Parsed extension manifest.

    `raw` holds the undecoded manifest bytes; `json` holds the parsed
    content, or None when the manifest cannot be decoded or parsed.
    """

    def __init__(self, content):
        """
        Decode and parse the manifest.

        :param content: manifest as bytes; decoded as UTF-8 with an optional
            byte order mark
        """
        self.raw = content
        try:
            utf_content = content.decode('utf-8-sig')
        except UnicodeDecodeError as e:
            # This should not be happening, but AMO lists several
            # extensions with non-standard manifest encoding.
            logger.warning("Unicode error in manifest: %s: %s" % (repr(content), str(e)))
            self.json = None
            return

        try:
            self.json = json.loads(utf_content)
        except ValueError as e:
            # There is lots of broken JSON in the wild. Most are using comments,
            # so will retry with a more relaxed parser.
            self.json = None
            logger.debug("Manifest can't be regularly parsed: %s: %s" % (repr(utf_content), str(e)))

        if self.json is None:
            logger.debug("Retrying with relaxed parser")
            try:
                self.json = jsoncfg.loads(utf_content)
            except jsoncfg.parser.JSONConfigParserException as e:
                # Give up when even relaxed parsing does not work.
                logger.error("Manifest can't be parsed: %s" % str(e))
                self.json = None

    def traverse(self):
        """Return the list of `path:value` lines of the manifest, or [] if unparsed."""
        if self.json is None:
            logger.warning("Manifest contains invalid JSON")
            return []
        return list(traverse(self.json))

    def __getitem__(self, item):
        """Return the top-level manifest entry `item`; KeyError if unparsed or missing."""
        if self.json is None:
            raise KeyError(item)
        return self.json[item]

    def __contains__(self, item):
        """Return True when the parsed manifest contains `item`."""
        if self.json is None:
            return False
        return item in self.json

    def __str__(self):
        """Return the parsed manifest as JSON text indented by four spaces."""
        return json.dumps(self.json, indent=4)


def _walk(node, path):
    """Yield one `path:repr(value)` line per leaf value below `node`."""
    if isinstance(node, dict):
        for key in node:
            yield from _walk(node[key], "%s/%s" % (path, key))
    elif isinstance(node, list):
        # List items share the path of the list itself
        for item in node:
            yield from _walk(item, path)
    else:
        yield ":".join([path, repr(node)])


def traverse(obj, ptr=None, path=""):
    """
    Yield a `path:repr(value)` line for every leaf value of a JSON-like
    structure of dicts and lists.

    :param obj: root of the structure
    :param ptr: node to start from; the root when None
    :param path: path prefix for the generated lines
    """
    if ptr is None:
        ptr = obj
    # The recursion runs in a helper without the "None means root" default,
    # so that null values inside the structure are reported as leaves
    # instead of restarting the traversal at the root.
    yield from _walk(ptr, path)