├── .gitignore ├── .gitmodules ├── COPYING ├── Makefile ├── NOTICE ├── README.md ├── art ├── architecture.png ├── art.go ├── art.sh ├── results.png ├── webui.png ├── yesiscan.png └── yesiscan.svg ├── backend ├── askalono.go ├── askalono │ ├── .gitignore │ └── askalono.go ├── bitbake.go ├── cran.go ├── cran_test.go ├── cran_test_cases │ ├── test_case0.error │ ├── test_case0.input │ ├── test_case0.output │ ├── test_case1.error │ ├── test_case1.input │ ├── test_case1.output │ ├── test_case2.error │ ├── test_case2.input │ ├── test_case2.output │ ├── test_case3.error │ ├── test_case3.input │ ├── test_case3.output │ ├── test_case4.error │ ├── test_case4.input │ ├── test_case4.output │ ├── test_case5.error │ ├── test_case5.input │ └── test_case5.output ├── licenseclassifier.go ├── pom.go ├── regexp.go ├── regexpcore.go ├── scancode.go └── spdx.go ├── cmd └── yesiscan │ ├── .gitignore │ ├── .goreleaser.yaml │ ├── Makefile │ ├── main.go │ └── web.go ├── examples ├── DESCRIPTION ├── big5.json ├── config.json ├── pom.xml ├── regexp.json └── ssh.config ├── go.mod ├── go.sum ├── interfaces └── interfaces.go ├── iterator ├── bzip2.go ├── fs.go ├── git.go ├── gzip.go ├── http.go ├── iterator.go ├── tar.go ├── util.go ├── util_test.go └── zip.go ├── lib ├── lib.go ├── main.go ├── profiles.go └── results.go ├── parser └── parser.go ├── s3 ├── s3.go └── screenshot-s3-public-bucket.png ├── util ├── ansi │ └── ansi.go ├── errwrap │ └── errwrap.go ├── licenses │ ├── licenses.go │ └── licenses_test.go ├── safepath │ └── safepath.go └── util.go └── web ├── static ├── 4a90d9.jpg ├── icons8-checkmark.svg └── icons8-search.svg └── web.go /.gitignore: -------------------------------------------------------------------------------- 1 | old/ 2 | tmp/ 3 | .autoConfigURI 4 | .autoConfigCookiePath 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule 
"util/licenses/license-list-data"] 2 | path = util/licenses/license-list-data 3 | url = https://github.com/spdx/license-list-data/ 4 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build gofmt 2 | 3 | all: build 4 | 5 | build: 6 | $(MAKE) --quiet -C cmd/yesiscan 7 | 8 | gofmt: 9 | # TODO: remove gofmt once goimports has a -s option 10 | find . -maxdepth 9 -type f -name '*.go' -not -path './old/*' -not -path './tmp/*' -not -path './vendor/*' -exec gofmt -s -w {} \; 11 | find . -maxdepth 9 -type f -name '*.go' -not -path './old/*' -not -path './tmp/*' -not -path './vendor/*' -exec goimports -w {} \; 12 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com Inc or its affiliates and the project contributors 2 | Written by James Shubin and the project contributors 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use 5 | this file except in compliance with the License. 
You may obtain a copy of the 6 | License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software distributed 11 | under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | specific language governing permissions and limitations under the License. 14 | 15 | We will never require a CLA to submit a patch. All contributions follow the 16 | `inbound == outbound` rule. 17 | 18 | This is not an official Amazon product. Amazon does not offer support for this 19 | project. 20 | -------------------------------------------------------------------------------- /art/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/architecture.png -------------------------------------------------------------------------------- /art/art.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. 
All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package art 25 | 26 | import ( 27 | _ "embed" 28 | ) 29 | 30 | //go:embed yesiscan.svg 31 | var YesiscanSvg []byte 32 | -------------------------------------------------------------------------------- /art/art.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | inkscape --without-gui --export-png "yesiscan.png" --export-width 400 "yesiscan.svg" 4 | -------------------------------------------------------------------------------- /art/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/results.png -------------------------------------------------------------------------------- /art/webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/webui.png -------------------------------------------------------------------------------- /art/yesiscan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/art/yesiscan.png -------------------------------------------------------------------------------- /backend/askalono.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "encoding/json" 31 | "fmt" 32 | "os" 33 | "os/exec" 34 | "strings" 35 | "syscall" 36 | 37 | "github.com/awslabs/yesiscan/backend/askalono" 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/licenses" 41 | "github.com/awslabs/yesiscan/util/safepath" 42 | ) 43 | 44 | const ( 45 | // AskalonoConfidenceError is the error string askalono returns for when 46 | // it doesn't have high enough confidence in a file. 47 | AskalonoConfidenceError = "Confidence threshold not high enough for any known license" 48 | ) 49 | 50 | // Askalono is based on the rust askalono project. It uses the Sørensen–Dice 51 | // coefficient for license comparison. 
It would be pretty easy, and preferable 52 | // to use one of the many pre-existing golang Sørensen–Dice implementations and 53 | // to have a pure golang solution for this, however it would be good to have at 54 | // least one backend that exec's out to a remote process, and since this one is 55 | // fairly self-contained, it is a good example to use before we try and wrap 56 | // something more complicated like scancode. 57 | // See: https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient 58 | type Askalono struct { 59 | Debug bool 60 | Logf func(format string, v ...interface{}) 61 | Prefix safepath.AbsDir 62 | 63 | // binary is the path of the executable to run. 64 | binary string 65 | } 66 | 67 | func (obj *Askalono) String() string { 68 | return "askalono" 69 | } 70 | 71 | func (obj *Askalono) Setup(ctx context.Context) error { 72 | // This runs --help to check this is in the path and running properly. 73 | // It also unpacks the embedded askalono binary if we have one to use! 74 | 75 | name, err := askalono.GetExpectedName() // what the binary expected to be named 76 | if err != nil { 77 | return err 78 | } 79 | obj.binary = name.Path() // the default 80 | 81 | relDir := safepath.UnsafeParseIntoRelDir("askalono/") 82 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 83 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 84 | return err 85 | } 86 | 87 | if size, absFile, err := askalono.InstallBinary(prefix); err != nil { 88 | // not a permanent error, we can fall back to anything built-in 89 | obj.Logf("unpacking binary failed: %v", err) 90 | } else { 91 | obj.binary = absFile.Path() // use this specific path instead! 
92 | // TODO: change to human readable bytes 93 | obj.Logf("installed: %d bytes to disk at %s", size, obj.binary) 94 | } 95 | 96 | args := []string{"--help"} 97 | 98 | prog := fmt.Sprintf("%s %s", obj.binary, strings.Join(args, " ")) 99 | 100 | obj.Logf("running: %s", prog) 101 | 102 | // TODO: do we need to do the ^C handling? 103 | // XXX: is the ^C context cancellation propagating into this correctly? 104 | cmd := exec.CommandContext(ctx, obj.binary, args...) 105 | cmd.Dir = "" 106 | cmd.Env = []string{} 107 | cmd.SysProcAttr = &syscall.SysProcAttr{ 108 | Setpgid: true, 109 | Pgid: 0, 110 | } 111 | 112 | if err := cmd.Run(); err != nil { 113 | if e, ok := err.(*exec.Error); ok && e.Err == exec.ErrNotFound { 114 | // TODO: this error message is CLI specific, but should be generalized 115 | obj.Logf("either run with --no-backend-askalono or install askalono into your $PATH") 116 | } 117 | 118 | obj.Logf("your %s doesn't seem to be working properly, check how it was installed?", obj.binary) 119 | return errwrap.Wrapf(err, "error running: %s", prog) 120 | } 121 | 122 | return nil 123 | } 124 | 125 | func (obj *Askalono) ScanPath(ctx context.Context, path safepath.Path, info *interfaces.Info) (*interfaces.Result, error) { 126 | 127 | if info.FileInfo.IsDir() { // path.IsDir() should be the same. 128 | return nil, nil // skip 129 | } 130 | if info.FileInfo.Size() == 0 { 131 | return nil, nil // skip 132 | } 133 | 134 | filename := path.Path() 135 | 136 | ctx, cancel := context.WithCancel(ctx) 137 | defer cancel() 138 | 139 | // yes the args need to go in this order, nothing else works... 140 | args := []string{"--format", "json", "identify", "--optimize", filename} 141 | 142 | prog := fmt.Sprintf("%s %s", obj.binary, strings.Join(args, " ")) 143 | 144 | // TODO: add a progress bar of some sort somewhere 145 | if obj.Debug { 146 | obj.Logf("running: %s", prog) 147 | } 148 | 149 | // TODO: do we need to do the ^C handling? 
150 | // XXX: is the ^C context cancellation propagating into this correctly? 151 | cmd := exec.CommandContext(ctx, obj.binary, args...) 152 | 153 | cmd.Dir = "" 154 | cmd.Env = []string{} 155 | 156 | // ignore signals sent to parent process (we're in our own group) 157 | cmd.SysProcAttr = &syscall.SysProcAttr{ 158 | Setpgid: true, 159 | Pgid: 0, 160 | } 161 | 162 | out, reterr := cmd.Output() 163 | if reterr != nil { 164 | if obj.Debug { 165 | obj.Logf("error running: %s", prog) 166 | } 167 | // XXX: bug: https://github.com/jpeddicord/askalono/issues/74 168 | // don't error here because it might be askalono erroring but 169 | // still returning output as an error message... it should not 170 | // have been written this way, but askalono team probably won't 171 | // change things now. 172 | //return nil, errwrap.Wrapf(reterr, "error running: %s", prog) 173 | } 174 | 175 | buffer := bytes.NewBuffer(out) 176 | if buffer.Len() == 0 { 177 | // XXX: bug: https://github.com/jpeddicord/askalono/issues/74 178 | obj.Logf("askalono EOF bug, skipped: %s", filename) 179 | return nil, nil // skip, unfortunately 180 | } 181 | decoder := json.NewDecoder(buffer) 182 | 183 | var askalonoOutput AskalonoOutput // this gets populated during decode 184 | if err := decoder.Decode(&askalonoOutput); err != nil { 185 | // programming error, report this to us please 186 | return nil, errwrap.Wrapf(err, "error decoding askalono json output") 187 | } 188 | 189 | if askalonoOutput.Path != "" && askalonoOutput.Path != filename { 190 | // programming error (probably in askalono) 191 | if obj.Debug { 192 | obj.Logf("expected: %s", filename) 193 | obj.Logf("got path: %s", askalonoOutput.Path) 194 | } 195 | return nil, fmt.Errorf("path did not match what was expected") 196 | } 197 | 198 | if reterr != nil && askalonoOutput.Error == "" { 199 | // probably a bug in askalono 200 | return nil, errwrap.Wrapf(reterr, "askalono bug, error running: %s", prog) 201 | } 202 | 203 | if reterr != nil && 
askalonoOutput.Error == AskalonoConfidenceError { 204 | return nil, nil // skip 205 | } 206 | 207 | if e := askalonoOutput.Error; reterr != nil && e != "" { 208 | return nil, fmt.Errorf("unhandled askalono error: %s", e) 209 | } 210 | 211 | if askalonoOutput.Result == nil { 212 | return nil, nil // didn't find anything 213 | } 214 | 215 | return askalonoResultHelper(askalonoOutput.Result) 216 | } 217 | 218 | // AskalonoOutput is modelled after the askalono output format. 219 | // 220 | // example: 221 | // 222 | // { 223 | // "path": "/home/ANT.AMAZON.COM/purple/code/license-finder-repo/spdx.go", 224 | // "result": { 225 | // "score": 0.9310345, 226 | // "license": { 227 | // "name": "MIT", 228 | // "kind":"original", 229 | // "aliases": [] 230 | // }, 231 | // "containing": [ 232 | // { 233 | // "score":0.993865, 234 | // "license": { 235 | // "name":"MIT", 236 | // "kind":"original", 237 | // "aliases": [] 238 | // }, 239 | // "line_range":[17,26] 240 | // } 241 | // ] 242 | // } 243 | // } 244 | type AskalonoOutput struct { 245 | // Path is an absolute file path to the file being scanned. 246 | Path string `json:"path"` 247 | 248 | // Result specifies what it found. 249 | Result *AskalonoResultContaining `json:"result"` 250 | 251 | // Error is a string returned instead of Result on askalono error. 252 | Error string 253 | } 254 | 255 | // AskalonoResult is the generic result format returned by askalono. It is 256 | // usually augmented by an additional field. That can be found in 257 | // AskalonoResultRanged or AskalonoResultContaining. 258 | type AskalonoResult struct { 259 | // Score is the matching score found. A 1.00 is a perfect match. 260 | Score float64 `json:"score"` 261 | 262 | // License points to the license information attached with this find. 263 | License *AskalonoLicense `json:"license"` 264 | } 265 | 266 | // AskalonoResultRanged is a version of the AskalonoResult that also contains 267 | // the line range information. 
268 | type AskalonoResultRanged struct { 269 | *AskalonoResult 270 | 271 | // LineRangeRaw specifies where the match was found. 272 | LineRangeRaw []int64 `json:"line_range"` 273 | 274 | // TODO: add LineRangeStart and LineRangeEnd and Unmarshall into there! 275 | } 276 | 277 | // AskalonoResultContaining is a version of the AskalonoResult that also 278 | // contains a list of additional AskalonoResultRanged matches. 279 | type AskalonoResultContaining struct { 280 | *AskalonoResult 281 | 282 | // Containing has some further information about the output. It isn't 283 | // always populated, and I think it is only used when --optimize is used 284 | // *and* it didn't find an exact match. It lists all the other matches 285 | // it found. 286 | Containing []*AskalonoResultRanged `json:"containing"` 287 | } 288 | 289 | // AskalonoLicense is the format of the license struct returned by askalono. 290 | type AskalonoLicense struct { 291 | // Name is the SPDX name of the license found. 292 | Name string `json:"name"` 293 | 294 | // Kind is some sort of license tag. So far I've found "original". 295 | Kind string `json:"kind"` 296 | 297 | // Aliases is probably aliases for this license. I've not found this 298 | // output anywhere atm, so I've left it as an interface. 
299 | Aliases []interface{} `json:"aliases"` 300 | } 301 | 302 | func askalonoResultHelper(result *AskalonoResultContaining) (*interfaces.Result, error) { 303 | if result == nil { 304 | return nil, fmt.Errorf("got nil result") 305 | } 306 | 307 | if result.AskalonoResult != nil && result.AskalonoResult.License != nil { 308 | return askalonoLicenseHelper(result.AskalonoResult.License, result.Score) 309 | } 310 | 311 | if len(result.Containing) == 0 { 312 | // programming error (probably in askalono) 313 | return nil, fmt.Errorf("got nil license") 314 | } 315 | 316 | // TODO: add file content ranges 317 | // XXX: askalono can't currently find more than one license at a time, 318 | // so we don't handle that more complicated case for now. More info: 319 | // https://github.com/jpeddicord/askalono/issues/40 320 | r := result.Containing[0].AskalonoResult 321 | return askalonoLicenseHelper(r.License, r.Score) 322 | } 323 | 324 | func askalonoLicenseHelper(input *AskalonoLicense, confidence float64) (*interfaces.Result, error) { 325 | if input == nil { 326 | return nil, fmt.Errorf("got nil license") 327 | } 328 | 329 | license := &licenses.License{ 330 | SPDX: input.Name, 331 | // TODO: populate other fields here (eg: found license text) 332 | } 333 | // FIXME: If license is not in SPDX, add a custom entry. 
334 | if err := license.Validate(); err != nil { 335 | //return nil, err 336 | license = &licenses.License{ 337 | //SPDX: "", 338 | Origin: "askalono.jpeddicord.github.com", 339 | Custom: input.Name, 340 | // TODO: populate other fields here (eg: found license text) 341 | } 342 | } 343 | return &interfaces.Result{ 344 | Licenses: []*licenses.License{ 345 | license, 346 | }, 347 | Confidence: confidence, 348 | }, nil 349 | } 350 | -------------------------------------------------------------------------------- /backend/askalono/.gitignore: -------------------------------------------------------------------------------- 1 | askalono-* 2 | -------------------------------------------------------------------------------- /backend/askalono/askalono.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package askalono 25 | 26 | import ( 27 | "archive/zip" 28 | "bytes" 29 | "crypto/sha256" 30 | _ "embed" 31 | "fmt" 32 | "io" 33 | "os" 34 | "runtime" 35 | 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/safepath" 38 | ) 39 | 40 | const ( 41 | // AskalonoVersion is the version string used in the git tag. These can 42 | // be seen here: https://github.com/jpeddicord/askalono/releases/ 43 | AskalonoVersion = "0.4.6" 44 | ) 45 | 46 | // AskalonoHashes maps version number, os, and then sha256sum. These are the 47 | // hashes of the actual binaries, not the .zip files they are in. We ultimately 48 | // care just about the integrity of the binary, so that's all we need to check. 49 | // We don't also need to check the hash of the zip files, since we aren't 50 | // worried about opening a zip file being dangerous. 51 | // FIXME: We don't support different architectures for now. (eg: runtime.GOARCH) 52 | var AskalonoHashes = map[string]map[string]string{ 53 | "0.4.6": { 54 | "linux": "a089146694cf433a4580c3da414cf43c70722ba6398d214fe41ca27b53deb476", 55 | "darwin": "1e006e6c61ec4abd714ae930a94b2f447c57392d621a6e8367c7aaa4cb4f427c", 56 | "windows": "89f477e6e70e9bb58caf3b1f6a22fc6566e182ff81c3a920d49b6e6947ee97a1", 57 | }, 58 | } 59 | 60 | //go:embed askalono-0.4.6-Linux.zip 61 | var Askalono046Linux []byte 62 | 63 | //go:embed askalono-0.4.6-macOS.zip 64 | var Askalono046macOS []byte 65 | 66 | //go:embed askalono-0.4.6-Windows.zip 67 | var Askalono046Windows []byte 68 | 69 | func init() { 70 | if _, err := GetExpectedHash(); err != nil { 71 | panic(fmt.Sprintf("error with askalono hash lookup: %v", err)) 72 | } 73 | } 74 | 75 | // GetExpectedName returns the expected name of the binary for a given platform. 76 | // This happens to also be the path it is expected to be found in the zip file 77 | // because the packages contain that single file in the root. 
If this ever 78 | // changes, then we need to add an additional GetExpectedPath method and change 79 | // the logic. 80 | func GetExpectedName() (safepath.RelFile, error) { 81 | switch os := runtime.GOOS; os { 82 | case "linux": 83 | return safepath.ParseIntoRelFile("askalono") 84 | case "darwin": 85 | return safepath.ParseIntoRelFile("askalono") 86 | case "windows": 87 | return safepath.ParseIntoRelFile("askalono.exe") // lol, windows 88 | default: 89 | return safepath.RelFile{}, fmt.Errorf("unsupported os: %s", os) 90 | } 91 | } 92 | 93 | // GetExpectedHash returns the expected hash of the binary for this version and 94 | // OS. 95 | func GetExpectedHash() (string, error) { 96 | m, exists := AskalonoHashes[AskalonoVersion] 97 | if !exists { 98 | return "", fmt.Errorf("no askalono hash found for version: %s", AskalonoVersion) 99 | } 100 | h, exists := m[runtime.GOOS] 101 | if !exists { 102 | return "", fmt.Errorf("no askalono hash found for os: %s", runtime.GOOS) 103 | } 104 | if h == "" { 105 | return "", fmt.Errorf("empty hash") 106 | } 107 | // the null hash, you can get this by running: `sha256sum /dev/null` 108 | if h == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" { 109 | return "", fmt.Errorf("null hash") 110 | } 111 | return h, nil 112 | } 113 | 114 | // GetZip returns the correct zipped package for this OS and ARCH. If it doesn't 115 | // have one available, then it errors. 116 | func GetZip() ([]byte, error) { 117 | if arch := runtime.GOARCH; arch != "amd64" { 118 | return nil, fmt.Errorf("unsupported arch: %s", arch) 119 | } 120 | 121 | var b []byte 122 | switch os := runtime.GOOS; os { 123 | case "linux": 124 | b = Askalono046Linux 125 | case "darwin": 126 | b = Askalono046macOS 127 | case "windows": 128 | b = Askalono046Windows 129 | default: 130 | return nil, fmt.Errorf("unsupported os: %s", os) 131 | } 132 | 133 | if len(b) == 0 { 134 | // Was it built/downloaded correctly? 
135 | return nil, fmt.Errorf("empty binary") 136 | } 137 | return b, nil 138 | } 139 | 140 | // InstallBinary installs an askalono binary into this dir if it's not there 141 | // already or if it has the wrong hash. It then returns its extracted size, and 142 | // its complete path. 143 | func InstallBinary(absDir safepath.AbsDir) (int64, safepath.AbsFile, error) { 144 | // NOTE: see this comment in the docs for this function. If the way the 145 | // zip files is built changes, we might need to change this for a 146 | // GetExpectedPath function call instead. 147 | relFileExpected, err := GetExpectedName() 148 | if err != nil { 149 | return 0, safepath.AbsFile{}, err 150 | } 151 | 152 | // this is where the output file will be stored 153 | absFile := safepath.JoinToAbsFile(absDir, relFileExpected) 154 | 155 | // First check the hash of the file at this location... If it's okay, 156 | // then we're done early! 157 | 158 | expectedHash, err := GetExpectedHash() 159 | if err != nil { 160 | // programming error, this was checked in init() 161 | return 0, safepath.AbsFile{}, err 162 | } 163 | 164 | if f, err := os.Open(absFile.Path()); err != nil && !os.IsNotExist(err) { 165 | // serious filesystem problem 166 | return 0, safepath.AbsFile{}, err 167 | } else if err == nil { 168 | // check the sha256 sum 169 | h := sha256.New() 170 | if _, err := io.Copy(h, f); err != nil { 171 | f.Close() // close it when we exit this block 172 | return 0, safepath.AbsFile{}, err 173 | } 174 | f.Close() // close it when we exit this block 175 | 176 | if fmt.Sprintf("%x", h.Sum(nil)) != expectedHash { 177 | // The expected binary destination file is invalid. So 178 | // delete it. We will re-write it later. This is safest. 
179 | if err := os.Remove(absFile.Path()); err != nil { 180 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error deleting invalid file: %s", absFile.Path()) 181 | } 182 | } 183 | } 184 | 185 | b, err := GetZip() 186 | if err != nil { 187 | return 0, safepath.AbsFile{}, err 188 | } 189 | 190 | // Open the zip archive for reading. 191 | // FIXME: use a variant that can take a context 192 | z, err := zip.NewReader(bytes.NewReader(b), int64(len(b))) 193 | if err != nil { 194 | return 0, safepath.AbsFile{}, err 195 | } 196 | //defer z.Close() // no close method exists 197 | if z.Comment != "" { 198 | //obj.Logf("zip has comment: %s", z.Comment) 199 | } 200 | 201 | // Iterate through the files in the archive. 202 | // XXX: can a child directory appear before a parent? 203 | // TODO: add a recurring progress logf if it takes longer than 30 sec 204 | var x *zip.File 205 | for _, x = range z.File { 206 | // TODO: obj.Debug ? 207 | //obj.Logf("zip: %s", x.Name) 208 | 209 | if x.FileInfo().IsDir() { 210 | continue 211 | } 212 | 213 | relFile, err := safepath.ParseIntoRelFile(x.Name) 214 | if err != nil { 215 | // programming error 216 | return 0, safepath.AbsFile{}, err 217 | } 218 | 219 | if relFileExpected.Cmp(relFile) == nil { 220 | break // found 221 | } 222 | } 223 | if x == nil { 224 | return 0, safepath.AbsFile{}, fmt.Errorf("did not file %s in zip archive", relFileExpected.Path()) 225 | } 226 | 227 | // NOTE: On the difference between absDir and absFile.Dir()... If they 228 | // differ, that's because the relfile has a parent relDir component. 229 | 230 | // XXX: which mode method? 231 | if err := os.MkdirAll(absFile.Dir().Path(), os.ModePerm); err != nil { 232 | return 0, safepath.AbsFile{}, err 233 | } 234 | 235 | // open the actual source file 236 | // we need to read this into a buffer, because this is a ReadCloser, not 237 | // a ReadSeekCloser. We want to make sure it passes the hash, before we 238 | // write it out to disk. 
239 | f, err := x.Open() 240 | if err != nil { 241 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error opening file %s", x.Name) 242 | } 243 | // don't `defer` close here because we want to free in the loop 244 | 245 | data, err := io.ReadAll(f) 246 | if err != nil { 247 | f.Close() // close file on error! 248 | return 0, safepath.AbsFile{}, err 249 | } 250 | f.Close() 251 | 252 | sum := sha256.Sum256(data) 253 | if h := fmt.Sprintf("%x", sum); h != expectedHash { 254 | return 0, safepath.AbsFile{}, fmt.Errorf("unexpected askalono binary hash of: %s", h) 255 | } 256 | 257 | // At this point, we can write out the file... 258 | // XXX: which mode method? 259 | if err := os.WriteFile(absFile.Path(), data, os.ModePerm); err != nil { 260 | return 0, safepath.AbsFile{}, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile.Path()) 261 | } 262 | 263 | return int64(len(data)), absFile, nil // this is where the new binary was copied to 264 | } 265 | -------------------------------------------------------------------------------- /backend/bitbake.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. 
All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "sort" 32 | "strings" 33 | 34 | "github.com/awslabs/yesiscan/interfaces" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | "github.com/awslabs/yesiscan/util/licenses" 37 | ) 38 | 39 | const ( 40 | // BitbakeMaxBytesLine sets a larger maximum for file line scanning than 41 | // the default of bufio.MaxScanTokenSize which is sort of small. 42 | BitbakeMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 43 | 44 | // BitbakeLicensePrefix is the string we look for when trying to find a 45 | // license. 46 | BitbakeLicensePrefix = `LICENSE = "` 47 | 48 | // BitbakeLicenseSuffix is the terminating string at the end of the 49 | // line. We must not include the newline here. 50 | BitbakeLicenseSuffix = `"` 51 | 52 | // BitbakeFilenameSuffix is the file extension used by the bitbake 53 | // files. 54 | BitbakeFilenameSuffix = ".bb" 55 | ) 56 | 57 | // Bitbake is a license backend for the bitbake .bb files which are very 58 | // commonly seen in the yocto project. We use a trivial string parser for 59 | // finding these-- this could be improved significantly if people write fancier 60 | // .bb files, but this should get us 99% of the way there. 
61 | type Bitbake struct { 62 | Debug bool 63 | Logf func(format string, v ...interface{}) 64 | } 65 | 66 | func (obj *Bitbake) String() string { 67 | return "bitbake" 68 | } 69 | 70 | func (obj *Bitbake) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 71 | if !strings.HasSuffix(info.FileInfo.Name(), BitbakeFilenameSuffix) { 72 | return nil, nil // skip 73 | } 74 | 75 | if info.FileInfo.IsDir() { 76 | return nil, nil // skip 77 | } 78 | if len(data) == 0 { 79 | return nil, nil // skip 80 | } 81 | 82 | ctx, cancel := context.WithCancel(ctx) 83 | defer cancel() 84 | 85 | licenseMap := make(map[string]struct{}) 86 | 87 | reader := bytes.NewReader(data) 88 | scanner := bufio.NewScanner(reader) 89 | buf := []byte{} // create a buffer for very long lines 90 | scanner.Buffer(buf, BitbakeMaxBytesLine) // set the max size of that buffer 91 | for scanner.Scan() { 92 | // In an effort to short-circuit things if needed, we run a 93 | // check ourselves and break out early if we see that we have 94 | // cancelled early. 95 | select { 96 | case <-ctx.Done(): 97 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 98 | default: 99 | } 100 | 101 | s := scanner.Text() // newlines will be stripped here 102 | if !strings.HasPrefix(s, BitbakeLicensePrefix) { 103 | continue 104 | } 105 | if !strings.HasSuffix(s, BitbakeLicenseSuffix) { 106 | continue 107 | } 108 | 109 | license := s[len(BitbakeLicensePrefix) : len(s)-len(BitbakeLicenseSuffix)] 110 | if license == "" { 111 | // TODO: should we warn here? 112 | continue 113 | } 114 | 115 | // XXX: i've only seen & in between license strings for now... 116 | // example: https://git.yoctoproject.org/poky/tree/meta/recipes-devtools/btrfs-tools/btrfs-tools_5.16.2.bb#n10 117 | lids := strings.Split(license, "&") // lid is licenseID 118 | for _, x := range lids { 119 | lid := strings.TrimSpace(x) 120 | // TODO: should we normalize case here? 
121 | licenseMap[lid] = struct{}{} 122 | } 123 | } 124 | var skip error 125 | scannerErr := scanner.Err() 126 | if scannerErr == bufio.ErrTooLong { 127 | skip = scannerErr // add to ignored files... 128 | scannerErr = nil // reset 129 | } 130 | 131 | ids := []string{} 132 | for id := range licenseMap { 133 | ids = append(ids, id) 134 | } 135 | sort.Strings(ids) // deterministic order 136 | 137 | licenseList := []*licenses.License{} 138 | 139 | for _, id := range ids { 140 | license := &licenses.License{ 141 | SPDX: id, 142 | // TODO: populate other fields here? 143 | } 144 | 145 | // If we find an unknown SPDX ID, we don't want to error, 146 | // because that would allow someone to put junk in their code to 147 | // prevent us scanning it. Instead, create an invalid license 148 | // but return it anyways. If we ever want to check validity, we 149 | // know to expect failures. 150 | if err := license.Validate(); err != nil { 151 | //return nil, err 152 | license = &licenses.License{ 153 | //SPDX: "", 154 | Origin: "", // unknown! 155 | Custom: id, 156 | // TODO: populate other fields here (eg: found license text) 157 | } 158 | } 159 | 160 | licenseList = append(licenseList, license) 161 | } 162 | 163 | if len(licenseMap) == 0 && skip == nil { 164 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 165 | // because in this scenario, we're comfortable (ish) the parser 166 | // is exhaustive at finding a license with this methodology. 167 | // We want to return nil, but we error only if Scanner.Err() did 168 | // and so normally this returns nil, nil. 169 | return nil, errwrap.Wrapf(scannerErr, "bitbake scanner error") 170 | } 171 | 172 | result := &interfaces.Result{ 173 | Licenses: licenseList, 174 | Confidence: 1.0, // TODO: what should we put here? 
175 | Skip: skip, 176 | } 177 | 178 | // We perform the strange task of processing any partial results, and 179 | // returning some even if we errored, because the spdx code seems to 180 | // think this is better than no results. I'll do the same, but there is 181 | // no guarantee the calling iterator will use these. (Currently it does 182 | // not!) 183 | return result, errwrap.Wrapf(scannerErr, "bitbake scanner error") 184 | } 185 | -------------------------------------------------------------------------------- /backend/cran.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "errors" 31 | "net/mail" 32 | "regexp" 33 | "sort" 34 | "strings" 35 | 36 | "github.com/awslabs/yesiscan/interfaces" 37 | "github.com/awslabs/yesiscan/util/errwrap" 38 | "github.com/awslabs/yesiscan/util/licenses" 39 | ) 40 | 41 | const ( 42 | // CranLicensePrefix is the string we look for when trying to find a 43 | // license. 44 | CranLicensePrefix = "License" 45 | 46 | // CranFilename is the filename used by the R metadata files. 47 | CranFilename = "DESCRIPTION" 48 | ) 49 | 50 | var ( 51 | // ErrInvalidLicenseFormat is an error used in the 52 | // CranDescriptionFileSubParser when licenses with invalid format are 53 | // found. 54 | ErrInvalidLicenseFormat = errors.New("invalid format in License(s)") 55 | 56 | // stripTrashCran is used to replace all strings which include file and 57 | // a filename and sometimes have a + or | before it. For example: 58 | // "| file LICENSE". This also replaces newline characters. source: 59 | // https://cran.rstudio.com/doc/manuals/r-devel/R-exts.html#Licensing 60 | stripTrashCran = regexp.MustCompile(`(([+,|]?([\n ])*)file([\n ])+\w+\b([\n ])*)|\n`) 61 | ) 62 | 63 | // Cran is a backend for DESCRIPTION files which store R package metadata. We 64 | // are getting the license names from the License field in the text file. 65 | type Cran struct { 66 | Debug bool 67 | Logf func(format string, v ...interface{}) 68 | } 69 | 70 | // String method returns the name of the backend. 71 | func (obj *Cran) String() string { 72 | return "cran" 73 | } 74 | 75 | // ScanData is used to extract license ids from data and return licenses based 76 | // on the license ids. 77 | func (obj *Cran) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 78 | // This check is taking place with the assumption that the file that 79 | // will be scanned will be named "DESCRIPTION". 
80 | if info.FileInfo.Name() != CranFilename { 81 | return nil, nil // skip 82 | } 83 | if info.FileInfo.IsDir() { 84 | return nil, nil // skip 85 | } 86 | if len(data) == 0 { 87 | return nil, nil // skip 88 | } 89 | 90 | // Appending a newline to the data because the parser needs to have a 91 | // SECOND trailing newline for it to work properly. Who knows why... 92 | data = append(data, "\n"...) 93 | reader := bytes.NewReader(data) 94 | // Parse the DESCRIPTION file using RFC5322 which is also used for mail. 95 | parsed, err := mail.ReadMessage(reader) 96 | if err != nil { 97 | return nil, errwrap.Wrapf(err, "parse error") 98 | } 99 | 100 | // Getting license information from License field. 101 | cranlicenseFields, ok := parsed.Header[CranLicensePrefix] 102 | if !ok { 103 | // This would mean we did not have a License field in the 104 | // DESCRIPTION file. 105 | return nil, nil 106 | } 107 | licenseMap := make(map[string]struct{}) 108 | var subErr error 109 | for _, license := range cranlicenseFields { 110 | lids, err := CranDescriptionFileSubParser(license) // lid is licenseID 111 | if err != nil { 112 | subErr = errwrap.Append(subErr, err) // store for later 113 | } 114 | // Our parser might have partial results even when it errors. 115 | for _, lid := range lids { 116 | // TODO: should we normalize case here? 117 | licenseMap[lid] = struct{}{} 118 | } 119 | } 120 | 121 | ids := []string{} 122 | for id := range licenseMap { 123 | ids = append(ids, id) 124 | } 125 | sort.Strings(ids) // deterministic order 126 | 127 | licenseList := []*licenses.License{} 128 | 129 | for _, id := range ids { 130 | license := &licenses.License{ 131 | SPDX: id, 132 | // TODO: populate other fields here? 133 | } 134 | 135 | // If we find an unknown SPDX ID, we don't want to error, 136 | // because that would allow someone to put junk in their code to 137 | // prevent us scanning it. Instead, create an invalid license 138 | // but return it anyways. 
If we ever want to check validity, we 139 | // know to expect failures. 140 | // XXX: Some Cran licenses are not SPDX, therefore we might want 141 | // to add an alias matcher in the future. 142 | if err := license.Validate(); err != nil { 143 | //return nil, err 144 | license = &licenses.License{ 145 | //SPDX: "", 146 | Origin: "", // unknown! 147 | Custom: id, 148 | // TODO: populate other fields here 149 | // (eg: found license text) 150 | } 151 | } 152 | 153 | licenseList = append(licenseList, license) 154 | } 155 | 156 | // We return any partial results, and even if we errored, because we can 157 | // now notify the user of these issues separately. 158 | result := &interfaces.Result{ 159 | Licenses: licenseList, 160 | Confidence: 1.0, // TODO: what should we put here? 161 | Skip: errwrap.Wrapf(subErr, "cran sub-parser error"), 162 | } 163 | 164 | return result, nil 165 | } 166 | 167 | // CranDescriptionFileSubParser is used to parse the License field in 168 | // DESCRIPTION files. 169 | func CranDescriptionFileSubParser(input string) ([]string, error) { 170 | if input == "" { 171 | return nil, ErrInvalidLicenseFormat 172 | } 173 | // Removing all files and new line characters from input. 174 | input = stripTrashCran.ReplaceAllString(input, "") 175 | if input == "" { 176 | // We are returning nil, nil here because the input only 177 | // consisted of files for Licenses. 178 | return nil, nil 179 | } 180 | var result []string 181 | var err error 182 | // TODO: I have only seen | in between license strings for now. 
source: 183 | // https://cran.rstudio.com/doc/manuals/r-devel/R-exts.html#Licensing 184 | listLicenseNames := strings.Split(input, "|") 185 | for _, x := range listLicenseNames { 186 | license := strings.TrimSpace(x) 187 | if license == "" { 188 | err = ErrInvalidLicenseFormat 189 | continue 190 | } 191 | result = append(result, license) 192 | } 193 | return result, err 194 | } 195 | -------------------------------------------------------------------------------- /backend/cran_test.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | 26 | package backend_test 27 | 28 | import ( 29 | "context" 30 | "fmt" 31 | "io/fs" 32 | "io/ioutil" 33 | "os" 34 | "path/filepath" 35 | "reflect" 36 | "strings" 37 | "testing" 38 | "time" 39 | 40 | "github.com/awslabs/yesiscan/backend" 41 | "github.com/awslabs/yesiscan/interfaces" 42 | "github.com/awslabs/yesiscan/iterator" 43 | "github.com/awslabs/yesiscan/util/licenses" 44 | ) 45 | 46 | // cranFileInfo struct helps make any input file to be DESCRIPTION files. 47 | type cranFileInfo struct { 48 | fileInfo fs.FileInfo 49 | } 50 | 51 | func (obj *cranFileInfo) Name() string { return backend.CranFilename } 52 | func (obj *cranFileInfo) Size() int64 { return obj.fileInfo.Size() } 53 | func (obj *cranFileInfo) Mode() fs.FileMode { return obj.fileInfo.Mode() } 54 | func (obj *cranFileInfo) ModTime() time.Time { return obj.fileInfo.ModTime() } 55 | func (obj *cranFileInfo) IsDir() bool { return obj.fileInfo.IsDir() } 56 | func (obj *cranFileInfo) Sys() interface{} { return obj.fileInfo.Sys() } 57 | 58 | // CranDescriptionFileSubParser parses the string in the License field to get 59 | // License names from DESCRIPTION files. If no Licenses are found nil is 60 | // returned and any files mentioned are ignored by the parser. 
61 | func TestCranDescriptionFileSubParser(t *testing.T) { 62 | errVal := backend.ErrInvalidLicenseFormat 63 | tests := []struct { 64 | input string 65 | output []string 66 | err error 67 | }{ 68 | {"", nil, errVal}, 69 | {"||||||", nil, errVal}, 70 | {"++--###", []string{"++--###"}, nil}, 71 | {"file LICENSE", nil, nil}, 72 | {"file any", nil, nil}, 73 | {"MIT + file LICENSE", []string{"MIT"}, nil}, 74 | {"MIT + file LICENSE | file LICENSE", []string{"MIT"}, nil}, 75 | {"Artistic-2.0 | AGPL-3 + file LICENSE", []string{"Artistic-2.0", "AGPL-3"}, nil}, 76 | {"GPL-2 | \n file LICENSE", []string{"GPL-2"}, nil}, 77 | {"MIT + file LICENSE | file LICENSE | AGPL-3 + file anything", []string{"MIT", "AGPL-3"}, nil}, 78 | {"Artistic-2.0 | AGPL-3 + file any | MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 79 | {"Artistic-2.0 | | MIT +file LICENSE", []string{"Artistic-2.0", "MIT"}, errVal}, 80 | {"Artistic-2.0 | \n AGPL-3 + file any | \n MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 81 | {"Artistic-2.0 | \n AGPL-3 \n + file any | \n MI\nT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "MIT"}, nil}, 82 | {"Artistic-2.0 | \n AGPL-3 \n + file any | -+-+##& | \n MIT + file LICENSE", []string{"Artistic-2.0", "AGPL-3", "-+-+##&", "MIT"}, nil}, 83 | } 84 | 85 | for i, test := range tests { 86 | out, err := backend.CranDescriptionFileSubParser(test.input) 87 | if err != test.err { 88 | t.Errorf("err: %v, exp err: %v", err, test.err) 89 | continue 90 | } 91 | if !reflect.DeepEqual(out, test.output) { 92 | t.Errorf("out: %v, exp out: %v", out, test.output) 93 | continue 94 | } 95 | t.Logf("test# %d succeeded!", i) 96 | } 97 | } 98 | 99 | // TestCranBackend tests whether the cran backend runs as intended. 
100 | func TestCranBackend(t *testing.T) { 101 | inputfilePaths, err := filepath.Glob("./cran_test_cases/*.input") 102 | if err != nil { 103 | t.Errorf("error getting input files: %v", err) 104 | return 105 | } 106 | cranBackend := &backend.Cran{ 107 | Debug: false, 108 | Logf: func(format string, v ...interface{}) { 109 | t.Logf("backend: "+format, v...) 110 | }, 111 | } 112 | for _, path := range inputfilePaths { 113 | inputFileInfo, err := os.Stat(path) 114 | if err != nil { 115 | t.Errorf("error getting FileInfo: %v", err) 116 | continue 117 | } 118 | data, err := ioutil.ReadFile(path) 119 | if err != nil { 120 | t.Errorf("error reading input file: %v", err) 121 | continue 122 | } 123 | fileInfo := &cranFileInfo{ 124 | fileInfo: inputFileInfo, 125 | } 126 | info := &interfaces.Info{ 127 | FileInfo: fileInfo, 128 | UID: iterator.FileScheme + path, 129 | } 130 | 131 | outputFilePath := strings.TrimSuffix(path, ".input") + ".output" 132 | errorFilePath := strings.TrimSuffix(path, ".input") + ".error" 133 | // TODO: if there is no error file, assume we expect no error 134 | outputContents, outputFileErr := ioutil.ReadFile(outputFilePath) 135 | if outputFileErr != nil { 136 | t.Errorf("error reading output file: %v", outputFileErr) 137 | } 138 | errorContents, errorFileErr := ioutil.ReadFile(errorFilePath) 139 | if errorFileErr != nil { 140 | t.Errorf("error reading error file: %v", errorFileErr) 141 | } 142 | if outputFileErr != nil || errorFileErr != nil { 143 | // give both statements a chance to tell us what's 144 | // missing before we go on to the next test case 145 | continue 146 | } 147 | 148 | expOut := strings.TrimSuffix(string(outputContents), "\n") 149 | var expErr error 150 | if s := strings.TrimSuffix(string(errorContents), "\n"); s != "" { 151 | expErr = fmt.Errorf(s) 152 | } 153 | 154 | result, err := cranBackend.ScanData(context.Background(), data, info) 155 | if (err == nil) != (expErr == nil) { // xor 156 | t.Errorf("filename: %v, err: %v", path, 
err) 157 | t.Errorf("filename: %v, exp: %v", path, expErr) 158 | continue 159 | } 160 | if err != nil && expErr != nil { 161 | if err.Error() != expErr.Error() { // compare the strings 162 | t.Errorf("filename: %v, err: %v", path, err) 163 | t.Errorf("filename: %v, exp: %v", path, expErr) 164 | continue 165 | } 166 | } 167 | 168 | var out string 169 | if result != nil { 170 | out = licenses.Join(result.Licenses) 171 | } 172 | if out != expOut { 173 | t.Errorf("filename: %v, out: %v", path, out) 174 | t.Errorf("filename: %v, exp: %v", path, expOut) 175 | continue 176 | } 177 | 178 | t.Logf("Success!") 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | Depends: 4 | R (>= 3.4) 5 | Imports: 6 | cli, 7 | utils 8 | Suggests: 9 | callr, 10 | covr 11 | Config/testthat/edition: 3 12 | Encoding: UTF-8 13 | Language: en-US 14 | Collate: 15 | 'assertions.R' 16 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case0.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 
1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: Apache-2.0 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case1.output: -------------------------------------------------------------------------------- 1 | Apache-2.0, GPL-2.0, MIT 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: Apache-2 4 | Depends: 5 | R (>= 3.4) 6 | Imports: 7 | cli, 8 | utils 9 | Suggests: 10 | callr, 11 | covr 12 | Config/testthat/edition: 3 13 | Encoding: UTF-8 14 | Language: en-US 15 | Collate: 16 | 'assertions.R' 17 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case2.output: -------------------------------------------------------------------------------- 1 | Apache-2(unknown) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.error: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.input: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/backend/cran_test_cases/test_case3.input -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case3.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.error: -------------------------------------------------------------------------------- 1 | cran sub-parser error: invalid format in License(s) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case4.output: -------------------------------------------------------------------------------- 1 | GPL-2.0, MIT 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.error: -------------------------------------------------------------------------------- 1 | cran sub-parser error: invalid format in License(s) 2 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.input: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: 4 | Depends: 5 | R (>= 3.4) 6 | Imports: 7 | cli, 8 | utils 9 | 
Suggests: 10 | callr, 11 | covr 12 | Config/testthat/edition: 3 13 | Encoding: UTF-8 14 | Language: en-US 15 | Collate: 16 | 'assertions.R' 17 | -------------------------------------------------------------------------------- /backend/cran_test_cases/test_case5.output: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /backend/licenseclassifier.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "context" 29 | "fmt" 30 | "sort" 31 | 32 | "github.com/awslabs/yesiscan/interfaces" 33 | "github.com/awslabs/yesiscan/util/errwrap" 34 | "github.com/awslabs/yesiscan/util/licenses" 35 | "github.com/awslabs/yesiscan/util/safepath" 36 | 37 | "github.com/google/licenseclassifier" 38 | "github.com/google/licenseclassifier/tools/identify_license/backend" 39 | "github.com/google/licenseclassifier/tools/identify_license/results" 40 | ) 41 | 42 | // LicenseClassifier is based on the licenseclassifier project. 43 | type LicenseClassifier struct { 44 | // This was chosen as it's easier to have the first backend be based on 45 | // a native golang project, rather than having to play the exec games 46 | // right away. Some code within is based on their cli code that wraps 47 | // their lib. 48 | 49 | Debug bool 50 | Logf func(format string, v ...interface{}) 51 | 52 | // XXX: also match with .header files 53 | // XXX: what default value do we want here? 54 | // XXX: what exactly does this do? 55 | IncludeHeaders bool 56 | 57 | // UseDefaultConfidence specifies whether we should use the default 58 | // confidence threshold that this library seems to use all the time. 59 | // I've noticed that without it, it misidentifies a lot of things. But 60 | // with it, it misses some things entirely, even if it incorrectly 61 | // identifies them. 62 | UseDefaultConfidence bool 63 | 64 | // SkipZeroResults tells this backend to avoid erroring when we aren't 65 | // able to determine if a file matches a known license. Since this 66 | // particular backend is not good at general file identification, and 67 | // only good at being presented with actual licenses, this is useful if 68 | // file filtering is not enabled. 
69 | SkipZeroResults bool 70 | } 71 | 72 | func (obj *LicenseClassifier) String() string { 73 | return "licenseclassifier" 74 | } 75 | 76 | func (obj *LicenseClassifier) ScanPath(ctx context.Context, path safepath.Path, info *interfaces.Info) (*interfaces.Result, error) { 77 | 78 | if info.FileInfo.IsDir() { // path.IsDir() should be the same. 79 | return nil, nil // skip 80 | } 81 | if info.FileInfo.Size() == 0 { 82 | return nil, nil // skip 83 | } 84 | 85 | filenames := []string{path.Path()} 86 | 87 | threshold := 0.0 // we decide acceptability downstream 88 | if obj.UseDefaultConfidence { 89 | threshold = licenseclassifier.DefaultConfidenceThreshold 90 | } 91 | forbiddenOnly := true // identify using forbidden licenses archive 92 | be, err := backend.New(threshold, forbiddenOnly) 93 | if err != nil { 94 | be.Close() 95 | return nil, errwrap.Wrapf(err, "cannot create license classifier") 96 | } 97 | 98 | // XXX: bug: https://github.com/google/licenseclassifier/issues/28 99 | ctx, cancel := context.WithCancel(ctx) 100 | defer cancel() 101 | if errs := be.ClassifyLicensesWithContext(ctx, filenames, obj.IncludeHeaders); errs != nil { 102 | be.Close() 103 | for _, err := range errs { 104 | if obj.Debug { 105 | obj.Logf("classify license failed: %v", err) 106 | } 107 | } 108 | return nil, fmt.Errorf("cannot classify licenses") 109 | } 110 | 111 | results := be.GetResults() 112 | if len(results) == 0 { 113 | be.Close() 114 | return nil, nil 115 | } 116 | 117 | sort.Sort(results) 118 | // A match identifies the result of matching a string against a known value. 119 | // Name string // Name of known value that was matched. 120 | // Confidence float64 // Confidence percentage. 121 | // Offset int // The offset into the unknown string the match was made. 122 | // Extent int // The length from the offset into the unknown string. 
123 | //for _, r := range results { 124 | // log.Printf("%s: %s (confidence: %v, offset: %v, extent: %v)", 125 | // r.Filename, r.Name, r.Confidence, r.Offset, r.Extent) 126 | // // licenses/AGPL-3.0.txt: AGPL-3.0 (confidence: 0.9999677086024283, offset: 0, extent: 30968) 127 | //} 128 | be.Close() 129 | // This can give us multiple results, sorted by most confident. 130 | result, err := licenseclassifierResultHelper(results[0]) 131 | if err != nil { 132 | return nil, err 133 | } 134 | 135 | // Add more info about the others possibilities to the result. 136 | more := []*interfaces.Result{} 137 | for i := 1; i < len(results); i++ { 138 | r, err := licenseclassifierResultHelper(results[i]) 139 | if err != nil { 140 | return nil, err 141 | } 142 | more = append(more, r) 143 | } 144 | if len(more) > 0 { 145 | result.More = more 146 | } 147 | 148 | return result, nil 149 | } 150 | 151 | func licenseclassifierResultHelper(result *results.LicenseType) (*interfaces.Result, error) { 152 | if result == nil { 153 | return nil, fmt.Errorf("got nil result") 154 | } 155 | 156 | // XXX: This backend seems to return names that aren't valid SPDX ID's. 157 | // It's also not necessarily guaranteed that the SPDX ID's they do 158 | // return correspond to the exact same license texts that we expect. We 159 | // need to (1) ensure the mapping is the same, and (2) check when one of 160 | // these licenses is not in our SPDX list, and tag it separately. 161 | license := &licenses.License{ 162 | SPDX: result.Name, 163 | // TODO: populate other fields here (eg: found license text) 164 | } 165 | // FIXME: If license is not in SPDX, add a custom entry. 
166 | // FIXME: https://github.com/google/licenseclassifier/issues/31 167 | if err := license.Validate(); err != nil { 168 | //return nil, err 169 | license = &licenses.License{ 170 | //SPDX: "", 171 | Origin: "licenseclassifier.google.github.com", 172 | Custom: result.Name, 173 | // TODO: populate other fields here (eg: found license text) 174 | } 175 | } 176 | return &interfaces.Result{ 177 | Licenses: []*licenses.License{ 178 | license, 179 | }, 180 | Confidence: result.Confidence, 181 | }, nil 182 | } 183 | -------------------------------------------------------------------------------- /backend/pom.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 
25 | package backend 26 | 27 | import ( 28 | "context" 29 | "encoding/xml" 30 | "sort" 31 | 32 | "github.com/awslabs/yesiscan/interfaces" 33 | "github.com/awslabs/yesiscan/util/errwrap" 34 | "github.com/awslabs/yesiscan/util/licenses" 35 | ) 36 | 37 | const ( 38 | // PomFilename is the file name used by the pomfiles. 39 | PomFilename = "pom.xml" 40 | ) 41 | 42 | // Pom is a backend for Pom or Project Object Model files. It is an xml file 43 | // commonly used by the Maven Project under the name pom.xml. We are getting the 44 | // license names by parsing the pom.xml file. 45 | type Pom struct { 46 | Debug bool 47 | Logf func(format string, v ...interface{}) 48 | } 49 | 50 | // String method returns the name of the backend. 51 | func (obj *Pom) String() string { 52 | return "pom" 53 | } 54 | 55 | // ScanData method is used to extract license ids from data and return licenses 56 | // based on the license ids. 57 | func (obj *Pom) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 58 | // This check is taking place with the assumption that the file that will be 59 | // scanned will have to be named "pom.xml". 60 | if info.FileInfo.Name() != PomFilename { 61 | return nil, nil // skip 62 | } 63 | if info.FileInfo.IsDir() { 64 | return nil, nil // skip 65 | } 66 | if len(data) == 0 { 67 | return nil, nil // skip 68 | } 69 | 70 | licenseMap := make(map[string]struct{}) 71 | var pomFileLicenses PomLicenses 72 | 73 | // parsing pom.xml file to get license names in struct 74 | if err := xml.Unmarshal(data, &pomFileLicenses); err != nil { 75 | // There is a parse error with the file, so we can't properly 76 | // examine it for licensing information with this pom scanner. 77 | result := &interfaces.Result{ 78 | Confidence: 1.0, // TODO: what should we put here? 
79 | Skip: errwrap.Wrapf(err, "parse error"), 80 | } 81 | return result, nil 82 | } 83 | 84 | if len(pomFileLicenses.Names) == 0 { 85 | // If we did not get any license names from the pom file we return nil, nil. 86 | return nil, nil 87 | } 88 | 89 | // lid is license id 90 | for _, lid := range pomFileLicenses.Names { 91 | licenseMap[lid] = struct{}{} 92 | } 93 | 94 | ids := []string{} 95 | for id := range licenseMap { 96 | ids = append(ids, id) 97 | } 98 | sort.Strings(ids) // deterministic order 99 | 100 | licenseList := []*licenses.License{} 101 | 102 | for _, id := range ids { 103 | license := &licenses.License{ 104 | SPDX: id, 105 | // TODO: populate other fields here? 106 | } 107 | 108 | // If we find an unknown SPDX ID, we don't want to error, because that would 109 | // allow someone to put junk in their code to prevent us scanning it. Instead, 110 | // create an invalid license but return it anyways. If we ever want to check 111 | // validity, we know to expect failures. 112 | // XXX: Many Pom licenses are not SPDX, therefore we might want to add an alias 113 | // matcher in the future. 114 | if err := license.Validate(); err != nil { 115 | //return nil, err 116 | license = &licenses.License{ 117 | //SPDX: "", 118 | Origin: "", // unknown! 119 | Custom: id, 120 | // TODO: populate other fields here (eg: found license text) 121 | } 122 | } 123 | 124 | licenseList = append(licenseList, license) 125 | } 126 | 127 | result := &interfaces.Result{ 128 | Licenses: licenseList, 129 | Confidence: 1.0, // TODO: what should we put here? 130 | } 131 | 132 | return result, nil 133 | } 134 | 135 | // PomLicenses is a struct that helps store license names from the licenses 136 | // field in a pom.xml file. 137 | type PomLicenses struct { 138 | // Names is a variable that will store the license names from pom.xml. 
139 | Names []string `xml:"licenses>license>name"` 140 | } 141 | -------------------------------------------------------------------------------- /backend/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bytes" 29 | "context" 30 | "encoding/json" 31 | "fmt" 32 | "os" 33 | 34 | "github.com/awslabs/yesiscan/interfaces" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | ) 37 | 38 | // Regexp is a simple backend that uses regular expressions to find certain 39 | // license strings. It wraps the RegexpCore backend and adds the file input 40 | // code. 41 | type Regexp struct { 42 | *RegexpCore 43 | 44 | // Filename is an absolute path to a file that we will read the patterns 45 | // from. The struct is described below and an example is available in 46 | // the examples folder. 
47 | Filename string 48 | } 49 | 50 | func (obj *Regexp) String() string { 51 | return obj.RegexpCore.String() 52 | } 53 | 54 | func (obj *Regexp) Setup(ctx context.Context) error { 55 | b, err := os.ReadFile(obj.Filename) 56 | if err != nil { 57 | // TODO: this error message is CLI specific, but should be generalized 58 | obj.Logf("either run with --no-backend-regexp or create your regexp pattern file at %s", obj.Filename) 59 | return errwrap.Wrapf(err, "could not read config file: %s", obj.Filename) 60 | } 61 | 62 | buffer := bytes.NewBuffer(b) 63 | if buffer.Len() == 0 { 64 | // TODO: should this be an error, or just a silent ignore? 65 | return fmt.Errorf("empty input file") 66 | } 67 | decoder := json.NewDecoder(buffer) 68 | 69 | var regexpConfig RegexpConfig // this gets populated during decode 70 | if err := decoder.Decode(®expConfig); err != nil { 71 | return errwrap.Wrapf(err, "error decoding regexp json output") 72 | } 73 | 74 | obj.RegexpCore.Rules = regexpConfig.Rules 75 | obj.RegexpCore.Origin = regexpConfig.Origin 76 | 77 | return obj.RegexpCore.Setup(ctx) 78 | } 79 | 80 | func (obj *Regexp) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 81 | return obj.RegexpCore.ScanData(ctx, data, info) 82 | } 83 | 84 | // RegexpConfig is the structure of the pattern config file. 85 | type RegexpConfig struct { 86 | // Rules is the list of regexp and license id rules. 87 | Rules []*RegexpLicenseRule `json:"rules"` 88 | 89 | // Origin is the SPDX origin string if we want to have a custom 90 | // namespace for non-SPDX license ID's. 91 | Origin string `json:"origin"` 92 | 93 | // Comment adds a user friendly comment for this file. We could use it 94 | // to add a version string or maybe that could be a separate field. 
95 | Comment string `json:"comment"` 96 | } 97 | -------------------------------------------------------------------------------- /backend/regexpcore.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "regexp" 32 | "sort" 33 | "strings" 34 | 35 | "github.com/awslabs/yesiscan/interfaces" 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/licenses" 38 | ) 39 | 40 | const ( 41 | // RegexpMaxBytesLine sets a larger maximum for file line scanning than 42 | // the default of bufio.MaxScanTokenSize which is sort of small. 43 | RegexpMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 44 | ) 45 | 46 | // RegexpCore is a simple backend that uses regular expressions to find certain 47 | // license strings. 
You should probably not use this backend directly, but wrap 48 | // it with one of the other ones like Regexp. 49 | type RegexpCore struct { 50 | Debug bool 51 | Logf func(format string, v ...interface{}) 52 | 53 | // Rules is a list of regexp license rules. 54 | Rules []*RegexpLicenseRule 55 | 56 | // Origin is the license field origin which is used if a non-SPDX ID is 57 | // specified. You can use this blank if you want. These are commonly 58 | // expressed in "reverse-dns" notation to provide a unique identifier 59 | // when naming your license. Eg: "yesiscan.awslabs.github.com". 60 | Origin string 61 | 62 | // MultipleMatch is set to true if you want the same regexp to be 63 | // allowed to match more than once in the same file. This is useful if 64 | // you want to be able to pull out every range where the pattern is 65 | // seen, even if you will keep getting the same license answer. Most of 66 | // the time you probably want to leave this as false. 67 | MultipleMatch bool 68 | 69 | // compiledRegexps is compiled list of the above Rules field. This is 70 | // done for performance reasons. 
71 | compiledRegexps []*regexp.Regexp 72 | } 73 | 74 | func (obj *RegexpCore) String() string { 75 | return "regexp" 76 | } 77 | 78 | func (obj *RegexpCore) Setup(ctx context.Context) error { 79 | for i, x := range obj.Rules { 80 | r, err := regexp.Compile(x.Pattern) 81 | if err != nil { 82 | return errwrap.Wrapf(err, "regexp compile failed at index: %d", i) 83 | } 84 | obj.compiledRegexps = append(obj.compiledRegexps, r) 85 | } 86 | 87 | return nil 88 | } 89 | 90 | func (obj *RegexpCore) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 91 | if info.FileInfo.IsDir() { 92 | return nil, nil // skip 93 | } 94 | if len(data) == 0 { 95 | return nil, nil // skip 96 | } 97 | 98 | ctx, cancel := context.WithCancel(ctx) 99 | defer cancel() 100 | 101 | licenseMap := make(map[string]struct{}) 102 | 103 | reader := bytes.NewReader(data) 104 | scanner := bufio.NewScanner(reader) 105 | buf := []byte{} // create a buffer for very long lines 106 | scanner.Buffer(buf, RegexpMaxBytesLine) // set the max size of that buffer 107 | for scanner.Scan() { 108 | // In an effort to short-circuit things if needed, we run a 109 | // check ourselves and break out early if we see that we have 110 | // cancelled early. 111 | select { 112 | case <-ctx.Done(): 113 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 114 | default: 115 | } 116 | 117 | s := scanner.Text() // newlines will be stripped here 118 | s = strings.TrimSpace(s) 119 | if s == "" { 120 | continue 121 | } 122 | 123 | for i, r := range obj.compiledRegexps { 124 | loc := r.FindStringIndex(s) // (loc []int) 125 | if loc == nil { // no match 126 | continue 127 | } 128 | if obj.Debug { 129 | obj.Logf("matched: %s", string(s[loc[0]:loc[1]])) 130 | } 131 | 132 | lid := obj.Rules[i].ID 133 | // TODO: replace this with a generic license parser and 134 | // alias matcher. 
135 | split := strings.Split(lid, " AND ") 136 | for _, l := range split { 137 | l = strings.TrimSpace(l) 138 | licenseMap[l] = struct{}{} 139 | } 140 | if !obj.MultipleMatch { 141 | break // just break this inner loop 142 | } 143 | } 144 | } 145 | var skip error 146 | scannerErr := scanner.Err() 147 | if scannerErr == bufio.ErrTooLong { 148 | skip = scannerErr // add to ignored files... 149 | scannerErr = nil // reset 150 | } 151 | 152 | ids := []string{} 153 | for id := range licenseMap { 154 | ids = append(ids, id) 155 | } 156 | sort.Strings(ids) // deterministic order 157 | 158 | licenseList := []*licenses.License{} 159 | 160 | for _, id := range ids { 161 | license := &licenses.License{ 162 | SPDX: id, 163 | // TODO: populate other fields here? 164 | } 165 | 166 | // If we find an unknown SPDX ID, we don't want to error, 167 | // because that would allow someone to put junk in their code to 168 | // prevent us scanning it. Instead, create an invalid license 169 | // but return it anyways. If we ever want to check validity, we 170 | // know to expect failures. It *must* be valid because it's an 171 | // explicit SPDX scanner. 172 | if err := license.Validate(); err != nil { 173 | //return nil, err 174 | license = &licenses.License{ 175 | //SPDX: "", 176 | Origin: obj.Origin, 177 | Custom: id, 178 | // TODO: populate other fields here (eg: found license text) 179 | } 180 | } 181 | 182 | licenseList = append(licenseList, license) 183 | } 184 | 185 | if len(licenseMap) == 0 && skip == nil { 186 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 187 | // because in this scenario, we're comfortable (ish) the parser 188 | // is exhaustive at finding a license with this methodology. 189 | // We want to return nil, but we error only if Scanner.Err() did 190 | // and so normally this returns nil, nil. 
191 | return nil, errwrap.Wrapf(scannerErr, "regexp scanner error") 192 | } 193 | 194 | result := &interfaces.Result{ 195 | Licenses: licenseList, 196 | Confidence: 1.0, // TODO: what should we put here? 197 | Skip: skip, 198 | } 199 | 200 | // We perform the strange task of processing any partial results, and 201 | // returning some even if we errored, because the spdx code seems to 202 | // think this is better than no results. I'll do the same, but there is 203 | // no guarantee the calling iterator will use these. (Currently it does 204 | // not!) 205 | return result, errwrap.Wrapf(scannerErr, "regexp scanner error") 206 | } 207 | 208 | // RegexpLicenseRule represents the data required for a regexp license rule. 209 | // Reminder, you can use backticks to quote golang strings, which is 210 | // particularly helpful when entering regular expressions into structs. 211 | type RegexpLicenseRule struct { 212 | // Pattern is the expression we want to match. This uses the stock 213 | // golang regexp engine. 214 | Pattern string `json:"pattern"` 215 | 216 | // ID is the license ID we should use when the above pattern matches. It 217 | // should be an SPDX ID, but other strings are supported, they just 218 | // won't be treated as SPDX if they aren't in our database of allowed 219 | // license identifiers. 220 | ID string `json:"id"` 221 | 222 | // TODO: add a comment field? 223 | //Comment string `json:"comment"` 224 | } 225 | -------------------------------------------------------------------------------- /backend/spdx.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | // TODO: should this be a subpackage? 25 | package backend 26 | 27 | import ( 28 | "bufio" 29 | "bytes" 30 | "context" 31 | "regexp" 32 | "sort" 33 | "strings" 34 | 35 | "github.com/awslabs/yesiscan/interfaces" 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | "github.com/awslabs/yesiscan/util/licenses" 38 | ) 39 | 40 | const ( 41 | // SpdxMaxBytesLine sets a larger maximum for file line scanning than 42 | // the default of bufio.MaxScanTokenSize which is sort of small. 43 | SpdxMaxBytesLine = 1024 * 1024 * 8 // 8 MiB 44 | 45 | // magicStringSPDX is the string we look for when trying to find an ID. 46 | magicStringSPDX = "SPDX-License-Identifier:" 47 | 48 | // magicNumberSPDX is... a bad parser hack that SPDX recommends. 49 | magicNumberSPDX = 5 50 | ) 51 | 52 | var ( 53 | // stripTrashSPDX is taken from the spdx tools repository. 54 | stripTrashSPDX = regexp.MustCompile(`[^\w\s\d.\-\+()]+`) 55 | ) 56 | 57 | // Spdx is based on the Software Package Data Exchange project. It is built 58 | // with a slightly objectionable parser as prescribed in the official tools 59 | // repo. 
60 | type Spdx struct { 61 | Debug bool 62 | Logf func(format string, v ...interface{}) 63 | } 64 | 65 | func (obj *Spdx) String() string { 66 | return "spdx" 67 | } 68 | 69 | func (obj *Spdx) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) { 70 | if info.FileInfo.IsDir() { 71 | return nil, nil // skip 72 | } 73 | if len(data) == 0 { 74 | return nil, nil // skip 75 | } 76 | 77 | ctx, cancel := context.WithCancel(ctx) 78 | defer cancel() 79 | 80 | licenseMap := make(map[string]struct{}) 81 | 82 | // An official parser for SPDX ID's seems be: 83 | // https://github.com/spdx/tools-golang/blob/a16d50ee155238df280a68252acc25e9afb7acea/idsearcher/idsearcher.go#L269 84 | // If it's meant to be that simplistic, we'll implement something 85 | // similar. Please report bugs over there before you report them here =D 86 | 87 | reader := bytes.NewReader(data) 88 | scanner := bufio.NewScanner(reader) 89 | buf := []byte{} // create a buffer for very long lines 90 | scanner.Buffer(buf, SpdxMaxBytesLine) // set the max size of that buffer 91 | for scanner.Scan() { 92 | // In an effort to short-circuit things if needed, we run a 93 | // check ourselves and break out early if we see that we have 94 | // cancelled early. 95 | select { 96 | case <-ctx.Done(): 97 | return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early") 98 | default: 99 | } 100 | 101 | s := scanner.Text() // newlines will be stripped here 102 | strs := strings.SplitN(s, magicStringSPDX, 2) // max split of 2 103 | if len(strs) == 1 { // no split happened, string not found 104 | continue 105 | } 106 | 107 | // weird way to parse, but whatever: 108 | // "if prefixed by more than n characters, it's probably not a 109 | // short-form ID; it's probably code to detect short-form IDs." 
110 | if len(stripTrash(strs[0])) > magicNumberSPDX { // arbitrary wat 111 | continue 112 | } 113 | 114 | // spdx says: "stop before trailing */ if it is present" 115 | lid := strings.Split(strs[1], "*/")[0] // lid is licenseID 116 | lid = strings.TrimSpace(lid) 117 | lid = stripTrash(lid) 118 | 119 | licenseMap[lid] = struct{}{} 120 | } 121 | var skip error 122 | scannerErr := scanner.Err() 123 | if scannerErr == bufio.ErrTooLong { 124 | skip = scannerErr // add to ignored files... 125 | scannerErr = nil // reset 126 | } 127 | 128 | ids := []string{} 129 | for id := range licenseMap { 130 | ids = append(ids, id) 131 | } 132 | sort.Strings(ids) // deterministic order 133 | 134 | licenseList := []*licenses.License{} 135 | 136 | for _, id := range ids { 137 | license := &licenses.License{ 138 | SPDX: id, 139 | // TODO: populate other fields here? 140 | } 141 | 142 | // If we find an unknown SPDX ID, we don't want to error, 143 | // because that would allow someone to put junk in their code to 144 | // prevent us scanning it. Instead, create an invalid license 145 | // but return it anyways. If we ever want to check validity, we 146 | // know to expect failures. It *must* be valid because it's an 147 | // explicit SPDX scanner. 148 | if err := license.Validate(); err != nil { 149 | //return nil, err 150 | license = &licenses.License{ 151 | //SPDX: "", 152 | Origin: "", // unknown! 153 | Custom: id, 154 | // TODO: populate other fields here (eg: found license text) 155 | } 156 | } 157 | 158 | licenseList = append(licenseList, license) 159 | } 160 | 161 | if len(licenseMap) == 0 && skip == nil { 162 | // NOTE: this is NOT the same as interfaces.ErrUnknownLicense 163 | // because in this scenario, we're comfortable (ish) the parser 164 | // is exhaustive at finding a license with this methodology. 165 | // We want to return nil, but we error only if Scanner.Err() did 166 | // and so normally this returns nil, nil. 
167 | return nil, errwrap.Wrapf(scannerErr, "spdx scanner error") 168 | } 169 | 170 | result := &interfaces.Result{ 171 | Licenses: licenseList, 172 | Confidence: 1.0, // TODO: what should we put here? 173 | Skip: skip, 174 | } 175 | 176 | // We perform the strange task of processing any partial results, and 177 | // returning some even if we errored, because the spdx code seems to 178 | // think this is better than no results. I'll do the same, but there is 179 | // no guarantee the calling iterator will use these. (Currently it does 180 | // not!) 181 | return result, errwrap.Wrapf(scannerErr, "spdx scanner error") 182 | } 183 | 184 | // stripTrash is an improved version of the identically named function in the 185 | // SPDX tools repository. 186 | func stripTrash(lid string) string { 187 | return stripTrashSPDX.ReplaceAllString(lid, "") 188 | } 189 | -------------------------------------------------------------------------------- /cmd/yesiscan/.gitignore: -------------------------------------------------------------------------------- 1 | # if you build the binary in this dir 2 | yesiscan 3 | .program 4 | .version 5 | dist/ 6 | .envrc 7 | -------------------------------------------------------------------------------- /cmd/yesiscan/.goreleaser.yaml: -------------------------------------------------------------------------------- 1 | # Make sure to check the documentation at https://goreleaser.com 2 | before: 3 | hooks: 4 | # You may remove this if you don't use go modules. 5 | #- go mod tidy 6 | - go generate ./... 
7 | builds: 8 | - env: 9 | - CGO_ENABLED=0 10 | goos: 11 | - linux 12 | - darwin 13 | # - windows 14 | 15 | goarch: 16 | - amd64 17 | - arm64 18 | 19 | ignore: 20 | - goarch: 386 21 | 22 | ldflags: 23 | - '-s -w -X main.program={{.ProjectName}} -X main.version={{.ShortCommit}}' 24 | 25 | archives: 26 | - format: binary 27 | # - replacements: 28 | # darwin: Darwin 29 | # linux: Linux 30 | # windows: Windows 31 | # amd64: x86_64 32 | checksum: 33 | name_template: 'checksums.txt' 34 | snapshot: 35 | name_template: "{{ incpatch .Version }}-next" 36 | changelog: 37 | sort: asc 38 | filters: 39 | exclude: 40 | - '^docs:' 41 | - '^test:' 42 | -------------------------------------------------------------------------------- /cmd/yesiscan/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build release 2 | 3 | # These values get pulled in from these magic dot files to make builds that 4 | # already contain these values. They set these config values automatically. 5 | AUTO := $(shell cat ../../.autoConfigURI 2>/dev/null || echo '') 6 | COOKIE := $(shell cat ../../.autoConfigCookiePath 2>/dev/null || echo '') 7 | 8 | all: build 9 | 10 | build: 11 | #@go build && echo "built binary to: $(PWD)/yesiscan" 12 | @go build -ldflags="-X main.autoConfigURI=$(AUTO) -X main.autoConfigCookiePath=$(COOKIE)" && echo "built binary to: $(PWD)/yesiscan" 13 | 14 | release: 15 | goreleaser release --skip-validate --rm-dist 16 | -------------------------------------------------------------------------------- /cmd/yesiscan/web.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package main 25 | 26 | import ( 27 | "context" 28 | "fmt" 29 | "os" 30 | "os/signal" 31 | "strings" 32 | 33 | "github.com/awslabs/yesiscan/util/ansi" 34 | "github.com/awslabs/yesiscan/web" 35 | 36 | cli "github.com/urfave/cli/v2" // imports as package "cli" 37 | ) 38 | 39 | // Web is the general entry point for running this software as an http web 40 | // server. 41 | // TODO: replace the *cli.Context with a more general context that can be used 42 | // by all the different frontends. 43 | func Web(c *cli.Context, program, version string, debug bool) error { 44 | logf := (&ansi.Logf{ 45 | Prefix: "main: ", 46 | Ellipsis: "...", 47 | Enable: false, 48 | Prefixes: []string{}, 49 | }).Init() 50 | logf("Hello from purpleidea! This is %s, version: %s", program, version) 51 | defer logf("Done!") 52 | 53 | server := &web.Server{ 54 | Program: program, 55 | Version: version, 56 | 57 | Debug: debug, 58 | Logf: func(format string, v ...interface{}) { 59 | //logf(format, v...) // XXX: replaced for now b/c of gin logs 60 | fmt.Printf(strings.TrimRight(format, "\n")+"\n", v...) 
// avoid prefixing for now 61 | }, 62 | 63 | Profiles: c.StringSlice("profile"), 64 | Listen: c.String("listen"), 65 | } 66 | 67 | ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) 68 | defer stop() 69 | 70 | return server.Run(ctx) 71 | } 72 | -------------------------------------------------------------------------------- /examples/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Example DESCRIPTION Files 2 | Version: 1.4.1.9000 3 | License: MIT + 4 | file 5 | LICENSE | GPL-2.0 6 | License: Apache-2.0 7 | Depends: 8 | R (>= 3.4) 9 | Imports: 10 | cli, 11 | utils 12 | Suggests: 13 | callr, 14 | covr 15 | Config/testthat/edition: 3 16 | Encoding: UTF-8 17 | Language: en-US 18 | Collate: 19 | 'assertions.R' 20 | -------------------------------------------------------------------------------- /examples/big5.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "an example profile for the big5, non-proliferation friendly licenses", 3 | "licenses": [ 4 | "AGPL-3.0-or-later", 5 | "GPL-3.0-or-later", 6 | "LGPL-3.0-or-later", 7 | "AGPL-3.0-only", 8 | "GPL-3.0-only", 9 | "LGPL-3.0-only", 10 | "AGPL-3.0", 11 | "GPL-3.0", 12 | "LGPL-3.0", 13 | "AGPL-3.0+", 14 | "GPL-3.0+", 15 | "LGPL-3.0+", 16 | 17 | "LGPL-2.1-or-later", 18 | "LGPL-2.1-only", 19 | "LGPL-2.1", 20 | "LGPL-2.1+", 21 | 22 | "GPL-2.0-or-later", 23 | "LGPL-2.0-or-later", 24 | "GPL-2.0-only", 25 | "LGPL-2.0-only", 26 | "GPL-2.0", 27 | "LGPL-2.0", 28 | "GPL-2.0+", 29 | "LGPL-2.0+", 30 | 31 | "Apache-2.0", 32 | "MIT", 33 | "MIT-0" 34 | ], 35 | "exclude": true 36 | } 37 | -------------------------------------------------------------------------------- /examples/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "quiet": false, 3 | "profiles": ["big5", "default", "fsf"], 4 | "regexp-path": "", 5 | "output-type": "html", 6 | 
"output-s3bucket": "yesiscan-test", 7 | "region": "ca-central-1", 8 | "backends": { 9 | "licenseclassifier": false, 10 | "cran": true, 11 | "pom": true, 12 | "spdx": true, 13 | "askalono": true, 14 | "scancode": true, 15 | "bitbake": true, 16 | "regexp": true 17 | }, 18 | "configs": { 19 | "~/.config/yesiscan/profiles/big5.json": "https://raw.githubusercontent.com/awslabs/yesiscan/main/examples/big5.json" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /examples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.neo4j.build.plugins 6 | license-maven-plugin 7 | 4-SNAPSHOT 8 | maven-plugin 9 | 10 | 11 | 1.6 12 | 1.6 13 | 14 | 15 | 16 | 17 | The Apache Software License, Version 2.0 18 | http://www.apache.org/licenses/LICENSE-2.0.txt 19 | repo 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /examples/regexp.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "", 3 | "rules": [ 4 | { 5 | "id": "AGPL-3.0-or-later", 6 | "pattern": "AGPL" 7 | }, 8 | { 9 | "id": "AGPL-3.0-or-later", 10 | "pattern": "AGPLv3" 11 | }, 12 | { 13 | "id": "GPL-3.0-or-later", 14 | "pattern": "GPL" 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /examples/ssh.config: -------------------------------------------------------------------------------- 1 | # run yesiscan on a remote host over ssh 2 | # put this in your ~/.ssh/config 3 | # then ssh yesiscan 4 | # then run ./yesiscan web 5 | # in your local webbrowser you can connect to http://localhost:8000 6 | Host yesiscan 7 | Hostname 8 | User 9 | 10 | LocalForward 8000 localhost:8000 11 | GSSAPIAuthentication no 12 | RequestTTY yes 13 | RemoteCommand screen -xRR 14 | 15 | -------------------------------------------------------------------------------- /go.mod: 
-------------------------------------------------------------------------------- 1 | module github.com/awslabs/yesiscan 2 | 3 | go 1.16 4 | 5 | require ( 6 | github.com/aws/aws-sdk-go-v2 v1.16.11 // indirect 7 | github.com/aws/aws-sdk-go-v2/config v1.17.1 // indirect 8 | github.com/aws/aws-sdk-go-v2/service/s3 v1.27.5 // indirect 9 | github.com/fatih/color v1.13.0 // indirect 10 | github.com/gin-contrib/multitemplate v0.0.0-20220705015713-e21a0ba39de3 // indirect 11 | github.com/gin-gonic/gin v1.8.1 // indirect 12 | github.com/go-git/go-git/v5 v5.3.0 // indirect 13 | github.com/go-playground/validator/v10 v10.10.0 // indirect 14 | github.com/google/licenseclassifier v0.0.0-20210325184830-bb04aff29e72 // indirect 15 | github.com/hashicorp/go-multierror v1.1.1 // indirect 16 | github.com/mitchellh/go-homedir v1.1.0 // indirect 17 | github.com/pkg/errors v0.9.1 // indirect 18 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect 19 | github.com/ssgelm/cookiejarparser v1.0.1 // indirect 20 | github.com/urfave/cli/v2 v2.14.1 // indirect 21 | golang.org/x/term v0.1.0 // indirect 22 | ) 23 | -------------------------------------------------------------------------------- /iterator/bzip2.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package iterator 25 | 26 | import ( 27 | "compress/bzip2" 28 | "context" 29 | "crypto/sha256" 30 | "fmt" 31 | "io" 32 | "os" 33 | "strconv" 34 | "strings" 35 | "sync" 36 | "time" 37 | 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/safepath" 41 | ) 42 | 43 | var ( 44 | // Bzip2Extensions is a list of valid extensions. 45 | Bzip2Extensions = []string{ 46 | ".bz", 47 | ".bz2", 48 | //".bzip" // not actually an extension that's used! 49 | ".bzip2", 50 | ".tbz", 51 | ".tbz2", 52 | //".tbzip2", // not actually an extension that's used! 53 | //".tar.bz", 54 | //".tar.bz2", 55 | //".tar.bzip2", 56 | } 57 | 58 | bzip2MapMutex *sync.Mutex 59 | bzip2Mutexes map[string]*sync.Mutex 60 | ) 61 | 62 | func init() { 63 | bzip2MapMutex = &sync.Mutex{} 64 | bzip2Mutexes = make(map[string]*sync.Mutex) 65 | } 66 | 67 | // Bzip2 is an iterator that takes a .bz or similar URI to open and performs the 68 | // decompress operation. It will eventually return an Fs iterator since there's 69 | // no need for it to know how to walk through a filesystem tree itself and it's 70 | // going to return a single file here. It can use a local cache so that future 71 | // calls to the same URI won't have to waste cycles, but only in cases when we 72 | // can determine it will be the same file. 73 | type Bzip2 struct { 74 | Debug bool 75 | Logf func(format string, v ...interface{}) 76 | Prefix safepath.AbsDir 77 | 78 | // Parser is a pointer to the parser that returned this. 
If it wasn't 79 | // returned by a parser, leave this nil. If this iterator came from an 80 | // iterator, then the Iterator handle should be filled instead. 81 | Parser interfaces.Parser 82 | 83 | // Iterator is a pointer to the iterator that returned this. If it 84 | // wasn't returned by an iterator, leave this nil. If this iterator came 85 | // from a parser, then the Parser handle should be filled instead. 86 | Iterator interfaces.Iterator 87 | 88 | // Path is the location of the file to gunzip. 89 | Path safepath.AbsFile 90 | 91 | // AllowAnyExtension specifies whether we will attempt to run if the 92 | // Path does not end with the correct bzip2 extension. 93 | AllowAnyExtension bool 94 | 95 | // AllowedExtensions specifies a list of extensions that we are allowed 96 | // to try to decode from. If this is empty, then we allow only the 97 | // defaults above because allowing no extensions at all would make no 98 | // sense. If AllowAnyExtension is set, then this has no effect. All the 99 | // matches are case insensitive. 100 | AllowedExtensions []string 101 | 102 | // iterators store the list of which iterators we created, so we know 103 | // which ones we have to close! 104 | iterators []interfaces.Iterator 105 | 106 | // unlock is a function that should be called as part of the Close 107 | // method once this resource is finished. It can be defined when 108 | // building this iterator in case we want a mechanism for the caller of 109 | // this iterator to tell the child when to unlock any in-use resources. 110 | // It must be safe to call this function more than once if necessary. 111 | // This is currently used privately. 112 | unlock func() 113 | } 114 | 115 | // String returns a human-readable representation of the bzip2 path we're 116 | // looking at. The output of this format is not guaranteed to be constant, so 117 | // don't try to parse it. 
118 | func (obj *Bzip2) String() string { 119 | return fmt.Sprintf("bzip2: %s", obj.Path) 120 | } 121 | 122 | // Validate runs some checks to ensure this iterator was built correctly. 123 | func (obj *Bzip2) Validate() error { 124 | if obj.Logf == nil { 125 | return fmt.Errorf("the Logf function must be specified") 126 | } 127 | if err := obj.Prefix.Validate(); err != nil { 128 | return err 129 | } 130 | 131 | if obj.Path.Path() == "" { 132 | return fmt.Errorf("must specify a Path") 133 | } 134 | 135 | return obj.validateExtension() 136 | } 137 | 138 | // validateExtension is a helper function to process our extension validation. 139 | func (obj *Bzip2) validateExtension() error { 140 | if obj.AllowAnyExtension { 141 | return nil 142 | } 143 | if len(obj.AllowedExtensions) == 0 { 144 | for _, x := range Bzip2Extensions { 145 | if obj.Path.HasExtInsensitive(x) { 146 | return nil 147 | } 148 | } 149 | } 150 | 151 | for _, x := range obj.AllowedExtensions { 152 | if obj.Path.HasExtInsensitive(x) { 153 | return nil 154 | } 155 | } 156 | 157 | if len(obj.AllowedExtensions) == 0 { 158 | return fmt.Errorf("a valid bzip2 extension is required without the allow any extension option") 159 | } 160 | 161 | return fmt.Errorf("an allowed extension is required to run this iterator") 162 | } 163 | 164 | // GetParser returns a handle to the parent parser that built this iterator if 165 | // there is one. 166 | func (obj *Bzip2) GetParser() interfaces.Parser { return obj.Parser } 167 | 168 | // GetIterator returns a handle to the parent iterator that built this iterator 169 | // if there is one. 170 | func (obj *Bzip2) GetIterator() interfaces.Iterator { return obj.Iterator } 171 | 172 | // Recurse runs a simple iterator that is responsible for uncompressing a bzip2 173 | // URI into a local filesystem path. If this happens successfully, it will 174 | // return a new FsIterator that is initialized to this root path. 
175 | func (obj *Bzip2) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 176 | relDir := safepath.UnsafeParseIntoRelDir("bzip2/") 177 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 178 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 179 | return nil, err 180 | } 181 | 182 | // make a unique ID for the directory 183 | // XXX: we can consider different algorithms or methods here later... 184 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 185 | sum := sha256.Sum256([]byte(obj.Path.Path() + now)) 186 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 187 | if err != nil { 188 | return nil, err 189 | } 190 | // ensure it gets put into a folder so it doesn't explode current dir 191 | bzip2AbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 192 | 193 | bzip2MapMutex.Lock() 194 | mu, exists := bzip2Mutexes[obj.Path.Path()] 195 | if !exists { 196 | mu = &sync.Mutex{} 197 | bzip2Mutexes[obj.Path.Path()] = mu 198 | } 199 | bzip2MapMutex.Unlock() 200 | 201 | if obj.Debug { 202 | obj.Logf("locking: %s", obj.String()) 203 | } 204 | mu.Lock() // locking happens here (unlock on all errors/returns!) 205 | once := &sync.Once{} 206 | obj.unlock = func() { 207 | fn := func() { 208 | if obj.Debug { 209 | obj.Logf("unlocking: %s", obj.String()) 210 | } 211 | mu.Unlock() 212 | } 213 | once.Do(fn) 214 | } 215 | 216 | // XXX: unlock when context closes? 217 | 218 | // XXX: If the destination dir has contents, consider removing them 219 | // first. This is one reason why we have a mutex. 220 | 221 | // Open the bzip2 file for reading. 
222 | // FIXME: use a variant that can take a context 223 | f, err := os.Open(obj.Path.Path()) 224 | if err != nil { 225 | obj.unlock() 226 | return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path) 227 | } 228 | defer f.Close() 229 | 230 | z := bzip2.NewReader(f) 231 | 232 | bytesTotal := int64(0) 233 | // Iterate through the files in the archive. 234 | // TODO: add a recurring progress logf if it takes longer than 30 sec 235 | 236 | // TODO: obj.Debug ? 237 | 238 | newName := "unknown" 239 | p := obj.Path.Path() 240 | suffix := WhichSuffixInsensitive(p, Bzip2Extensions) 241 | p = strings.TrimSuffix(p, suffix) 242 | ix := strings.LastIndex(p, "/") 243 | if ix != -1 { 244 | p = p[ix+1:] 245 | if len(p) > 0 { 246 | newName = p 247 | } 248 | } 249 | 250 | obj.Logf("bzip2: %s", newName) 251 | 252 | // add in a .tar if it's an embedded tar file 253 | if p := strings.ToLower(obj.Path.Path()); strings.HasSuffix(p, ".tbz") || strings.HasSuffix(p, ".tbz2") { 254 | newName += ".tar" 255 | } 256 | relFile, err := safepath.ParseIntoRelFile(newName) 257 | if err != nil { 258 | // programming error 259 | obj.unlock() 260 | return nil, err 261 | } 262 | 263 | // this is where the output file will be stored 264 | absFile := safepath.JoinToAbsFile(bzip2AbsDir, relFile) 265 | 266 | // XXX: sanity check (is output in the dir?) 
267 | // TODO: we could add this, but safepath automatically does this 268 | // if absFile is not inside of bzip2AbsDir then error 269 | 270 | absDir := absFile.Dir() // get the absDir that absFile is in 271 | 272 | if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil { 273 | // programming error 274 | obj.unlock() 275 | return nil, err 276 | } 277 | 278 | // write to this location 279 | dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) 280 | if err != nil { 281 | obj.unlock() 282 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 283 | } 284 | // don't `defer` close here because we want to free in the loop 285 | 286 | // FIXME: use a variant that can take a context 287 | size, err := io.Copy(dest, z) 288 | if e, ok := err.(bzip2.StructuralError); ok { 289 | dest.Close() // close dest file on error! 290 | obj.unlock() 291 | return nil, &interfaces.IteratorError{ 292 | Path: obj.Path.Path(), 293 | Err: errwrap.Wrapf(e, "error decompressing bzip2"), 294 | } 295 | 296 | } else if err != nil { 297 | dest.Close() // close dest file on error! 298 | obj.unlock() 299 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 300 | } 301 | obj.Logf("uncompressed: %d bytes to disk at %s", size, absFile) 302 | 303 | dest.Close() // close dest file on error! 304 | 305 | bytesTotal += int64(size) 306 | 307 | // TODO: change to human readable bytes 308 | obj.Logf("uncompressed from %s into %s (%d bytes)", obj.String(), bzip2AbsDir, bytesTotal) 309 | 310 | obj.iterators = []interfaces.Iterator{} 311 | 312 | // if it's a single bzip2 file we return an fs iterator and let the fs 313 | // iterator sort that out... 314 | iterator := &Fs{ 315 | Debug: obj.Debug, 316 | Logf: func(format string, v ...interface{}) { 317 | obj.Logf(format, v...) // TODO: add a prefix? 
318 | }, 319 | Prefix: obj.Prefix, 320 | 321 | Iterator: obj, 322 | 323 | Path: bzip2AbsDir, 324 | 325 | //Unlock: unlock, 326 | } 327 | obj.iterators = append(obj.iterators, iterator) 328 | 329 | return obj.iterators, nil 330 | } 331 | 332 | // Close shuts down the iterator and/or performs clean up after the Recurse 333 | // method has run. This must be called if you run Recurse. 334 | func (obj *Bzip2) Close() error { 335 | if obj.unlock != nil { 336 | obj.unlock() 337 | } 338 | var errs error 339 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 340 | if err := obj.iterators[i].Close(); err != nil { 341 | errs = errwrap.Append(errs, err) 342 | } 343 | } 344 | return errs 345 | } 346 | -------------------------------------------------------------------------------- /iterator/gzip.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package iterator 25 | 26 | import ( 27 | "compress/gzip" 28 | "context" 29 | "crypto/sha256" 30 | "fmt" 31 | "io" 32 | "os" 33 | "strconv" 34 | "strings" 35 | "sync" 36 | "time" 37 | 38 | "github.com/awslabs/yesiscan/interfaces" 39 | "github.com/awslabs/yesiscan/util/errwrap" 40 | "github.com/awslabs/yesiscan/util/safepath" 41 | ) 42 | 43 | var ( 44 | // GzipExtensions is a list of valid extensions. 45 | GzipExtensions = []string{ 46 | ".gz", 47 | ".gzip", 48 | ".tgz", 49 | //".tar.gz", 50 | //".tar.gzip", 51 | } 52 | 53 | gzipMapMutex *sync.Mutex 54 | gzipMutexes map[string]*sync.Mutex 55 | ) 56 | 57 | func init() { 58 | gzipMapMutex = &sync.Mutex{} 59 | gzipMutexes = make(map[string]*sync.Mutex) 60 | } 61 | 62 | // Gzip is an iterator that takes a .gz or similar URI to open and performs the 63 | // decompress operation. It will eventually return an Fs iterator since there's 64 | // no need for it to know how to walk through a filesystem tree itself and it's 65 | // going to return a single file here. It can use a local cache so that future 66 | // calls to the same URI won't have to waste cycles, but only in cases when we 67 | // can determine it will be the same file. This does _not_ support gzip 68 | // multistream, but it could be added if we find a use-case for it. 69 | type Gzip struct { 70 | Debug bool 71 | Logf func(format string, v ...interface{}) 72 | Prefix safepath.AbsDir 73 | 74 | // Parser is a pointer to the parser that returned this. If it wasn't 75 | // returned by a parser, leave this nil. If this iterator came from an 76 | // iterator, then the Iterator handle should be filled instead. 77 | Parser interfaces.Parser 78 | 79 | // Iterator is a pointer to the iterator that returned this. If it 80 | // wasn't returned by an iterator, leave this nil. If this iterator came 81 | // from a parser, then the Parser handle should be filled instead. 
82 | Iterator interfaces.Iterator 83 | 84 | // Path is the location of the file to gunzip. 85 | Path safepath.AbsFile 86 | 87 | // AllowAnyExtension specifies whether we will attempt to run if the 88 | // Path does not end with the correct gzip extension. 89 | AllowAnyExtension bool 90 | 91 | // AllowedExtensions specifies a list of extensions that we are allowed 92 | // to try to decode from. If this is empty, then we allow only the 93 | // defaults above because allowing no extensions at all would make no 94 | // sense. If AllowAnyExtension is set, then this has no effect. All the 95 | // matches are case insensitive. 96 | AllowedExtensions []string 97 | 98 | // iterators store the list of which iterators we created, so we know 99 | // which ones we have to close! 100 | iterators []interfaces.Iterator 101 | 102 | // unlock is a function that should be called as part of the Close 103 | // method once this resource is finished. It can be defined when 104 | // building this iterator in case we want a mechanism for the caller of 105 | // this iterator to tell the child when to unlock any in-use resources. 106 | // It must be safe to call this function more than once if necessary. 107 | // This is currently used privately. 108 | unlock func() 109 | } 110 | 111 | // String returns a human-readable representation of the gzip path we're looking 112 | // at. The output of this format is not guaranteed to be constant, so don't try 113 | // to parse it. 114 | func (obj *Gzip) String() string { 115 | return fmt.Sprintf("gzip: %s", obj.Path) 116 | } 117 | 118 | // Validate runs some checks to ensure this iterator was built correctly. 
119 | func (obj *Gzip) Validate() error { 120 | if obj.Logf == nil { 121 | return fmt.Errorf("the Logf function must be specified") 122 | } 123 | if err := obj.Prefix.Validate(); err != nil { 124 | return err 125 | } 126 | 127 | if obj.Path.Path() == "" { 128 | return fmt.Errorf("must specify a Path") 129 | } 130 | 131 | return obj.validateExtension() 132 | } 133 | 134 | // validateExtension is a helper function to process our extension validation. 135 | func (obj *Gzip) validateExtension() error { 136 | if obj.AllowAnyExtension { 137 | return nil 138 | } 139 | if len(obj.AllowedExtensions) == 0 { 140 | for _, x := range GzipExtensions { 141 | if obj.Path.HasExtInsensitive(x) { 142 | return nil 143 | } 144 | } 145 | } 146 | 147 | for _, x := range obj.AllowedExtensions { 148 | if obj.Path.HasExtInsensitive(x) { 149 | return nil 150 | } 151 | } 152 | 153 | if len(obj.AllowedExtensions) == 0 { 154 | return fmt.Errorf("a valid gzip extension is required without the allow any extension option") 155 | } 156 | 157 | return fmt.Errorf("an allowed extension is required to run this iterator") 158 | } 159 | 160 | // GetParser returns a handle to the parent parser that built this iterator if 161 | // there is one. 162 | func (obj *Gzip) GetParser() interfaces.Parser { return obj.Parser } 163 | 164 | // GetIterator returns a handle to the parent iterator that built this iterator 165 | // if there is one. 166 | func (obj *Gzip) GetIterator() interfaces.Iterator { return obj.Iterator } 167 | 168 | // Recurse runs a simple iterator that is responsible for uncompressing a gzip 169 | // URI into a local filesystem path. If this happens successfully, it will 170 | // return a new FsIterator that is initialized to this root path. 
171 | func (obj *Gzip) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 172 | relDir := safepath.UnsafeParseIntoRelDir("gzip/") 173 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 174 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 175 | return nil, err 176 | } 177 | 178 | // make a unique ID for the directory 179 | // XXX: we can consider different algorithms or methods here later... 180 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 181 | sum := sha256.Sum256([]byte(obj.Path.Path() + now)) 182 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 183 | if err != nil { 184 | return nil, err 185 | } 186 | // ensure it gets put into a folder so it doesn't explode current dir 187 | gzipAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 188 | 189 | gzipMapMutex.Lock() 190 | mu, exists := gzipMutexes[obj.Path.Path()] 191 | if !exists { 192 | mu = &sync.Mutex{} 193 | gzipMutexes[obj.Path.Path()] = mu 194 | } 195 | gzipMapMutex.Unlock() 196 | 197 | if obj.Debug { 198 | obj.Logf("locking: %s", obj.String()) 199 | } 200 | mu.Lock() // locking happens here (unlock on all errors/returns!) 201 | once := &sync.Once{} 202 | obj.unlock = func() { 203 | fn := func() { 204 | if obj.Debug { 205 | obj.Logf("unlocking: %s", obj.String()) 206 | } 207 | mu.Unlock() 208 | } 209 | once.Do(fn) 210 | } 211 | 212 | // XXX: unlock when context closes? 213 | 214 | // XXX: If the destination dir has contents, consider removing them 215 | // first. This is one reason why we have a mutex. 216 | 217 | // Open the gzip file for reading. 
	// FIXME: use a variant that can take a context
	f, err := os.Open(obj.Path.Path())
	if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path)
	}
	defer f.Close()

	z, err := gzip.NewReader(f)
	if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error reading gzip %s", obj.Path)
	}
	defer z.Close()
	z.Multistream(false) // TODO: do we ever want to allow this here?

	filesTotal := 0
	bytesTotal := int64(0)
	// Iterate through the files in the archive.
	// TODO: add a recurring progress logf if it takes longer than 30 sec
	for {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			obj.unlock()
			return nil, errwrap.Wrapf(ctx.Err(), "ended decompressing early")
		default:
		}

		if s := z.Header.Comment; s != "" {
			obj.Logf("gzip has comment: %s", s)
		}

		// TODO: obj.Debug ?
		name := z.Header.Name
		newName := name
		if name != "" {
			obj.Logf("gzip: %s", name)
		} else {
			// a .tgz might have no name string for example
			obj.Logf("gzip name is empty")
			newName = "unknown"
			// Fall back to the basename of the archive itself with
			// the recognized gzip suffix stripped off.
			p := obj.Path.Path()
			suffix := WhichSuffixInsensitive(p, GzipExtensions)
			p = strings.TrimSuffix(p, suffix)
			ix := strings.LastIndex(p, "/")
			if ix != -1 {
				p = p[ix+1:]
				if len(p) > 0 {
					newName = p
				}
				obj.Logf("gzip basename: %s", newName)
			}
		}

		// add in a .tar if it's an embedded tar file
		if strings.HasSuffix(strings.ToLower(obj.Path.Path()), ".tgz") {
			newName += ".tar"
		}
		relFile, err := safepath.ParseIntoRelFile(newName)
		if err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// this is where the output file will be stored
		absFile := safepath.JoinToAbsFile(gzipAbsDir, relFile)

		// XXX: sanity check (is output in the dir?)
		// TODO: we could add this, but safepath automatically does this
		// if absFile is not inside of gzipAbsDir then error

		absDir := absFile.Dir() // get the absDir that absFile is in

		if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// write to this location
		dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
		if err != nil {
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}
		// don't `defer` close here because we want to free in the loop

		// FIXME: use a variant that can take a context
		size, err := io.Copy(dest, z)
		if err != nil {
			dest.Close() // close dest file on error!
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}
		obj.Logf("uncompressed: %d bytes to disk at %s", size, absFile)

		dest.Close() // success path close (we avoided defer inside the loop)

		filesTotal++
		bytesTotal += int64(size)

		break // TODO: remove if we ever do multistream
	}

	// TODO: change to human readable bytes
	obj.Logf("uncompressed: %d files from %s into %s (%d bytes)", filesTotal, obj.String(), gzipAbsDir, bytesTotal)

	obj.iterators = []interfaces.Iterator{}

	// if it's a single gzip file we return an fs iterator and let the fs
	// iterator sort that out...
	iterator := &Fs{
		Debug: obj.Debug,
		Logf: func(format string, v ...interface{}) {
			obj.Logf(format, v...) // TODO: add a prefix?
		},
		Prefix: obj.Prefix,

		Iterator: obj,

		Path: gzipAbsDir,

		//Unlock: unlock,
	}
	obj.iterators = append(obj.iterators, iterator)

	return obj.iterators, nil
}

// Close shuts down the iterator and/or performs clean up after the Recurse
// method has run. This must be called if you run Recurse.
func (obj *Gzip) Close() error {
	if obj.unlock != nil {
		obj.unlock() // safe to call more than once (guarded by sync.Once)
	}
	var errs error
	for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!)
		if err := obj.iterators[i].Close(); err != nil {
			errs = errwrap.Append(errs, err)
		}
	}
	return errs
}
--------------------------------------------------------------------------------
/iterator/http.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"context"
	"crypto/sha256"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/errwrap"
	"github.com/awslabs/yesiscan/util/safepath"
)

const (
	// HttpScheme is the standard prefix used for http URL's.
	HttpScheme = "http://"

	// HttpsScheme is the standard prefix used for https URL's.
	HttpsScheme = "https://"

	// HttpSchemeRaw is the standard prefix used for http URL's but without
	// the scheme protocol separator which is "://".
	HttpSchemeRaw = "http"

	// HttpsSchemeRaw is the standard prefix used for https URL's but
	// without the scheme protocol separator which is "://".
	HttpsSchemeRaw = "https"

	// UnknownFileName is the filename used when the URL doesn't have an
	// obvious filename at the end that we can use.
	// TODO: is there a better name we can use? This is mostly arbitrary.
	UnknownFileName = ".unknown"
)

var (
	// httpMapMutex guards access to the httpMutexes map below.
	httpMapMutex *sync.Mutex

	// httpMutexes holds one mutex per URL, so that concurrent Recurse
	// calls downloading the same URL serialize instead of racing on disk.
	httpMutexes map[string]*sync.Mutex
)

func init() {
	httpMapMutex = &sync.Mutex{}
	httpMutexes = make(map[string]*sync.Mutex)
}

// Http is an iterator that takes an http URL to download and performs the
// download operation. It will eventually return an Fs iterator since there's no
// need for it to know how to walk through a filesystem tree itself. It can use
// a local cache so that future calls to the same URL won't have to waste
// bandwidth or cycles again but only in cases when we can determine it will be
// the same file. Please note this is named http, but we obviously support https
// as the most common form of this.
type Http struct {
	// Debug enables more verbose logging.
	Debug bool

	// Logf is the logging function used by this iterator. It must be set.
	Logf func(format string, v ...interface{})

	// Prefix is the working directory under which downloads are stored.
	Prefix safepath.AbsDir

	// Parser is a pointer to the parser that returned this. If it wasn't
	// returned by a parser, leave this nil. If this iterator came from an
	// iterator, then the Iterator handle should be filled instead.
	Parser interfaces.Parser

	// Iterator is a pointer to the iterator that returned this. If it
	// wasn't returned by an iterator, leave this nil. If this iterator came
	// from a parser, then the Parser handle should be filled instead.
	Iterator interfaces.Iterator

	// URL is the http URL of the file that we want to download.
	// TODO: consider doing some clever parsing of well-known paths like
	// github-style URL's or internal company code repository URL's.
	URL string

	// AllowHttp specifies whether we're allowed to download http
	// (unencrypted) URLs.
	AllowHttp bool

	// iterators store the list of which iterators we created, so we know
	// which ones we have to close!
	iterators []interfaces.Iterator

	// unlock is a function that should be called as part of the Close
	// method once this resource is finished. It can be defined when
	// building this iterator in case we want a mechanism for the caller of
	// this iterator to tell the child when to unlock any in-use resources.
	// It must be safe to call this function more than once if necessary.
	// This is currently used privately.
	unlock func()
}

// String returns a human-readable representation of the http URL we're looking
// at. The output of this format is not guaranteed to be constant, so don't try
// to parse it.
func (obj *Http) String() string {
	return fmt.Sprintf("http: %s", obj.URL)
}

// Validate runs some checks to ensure this iterator was built correctly.
127 | func (obj *Http) Validate() error { 128 | if obj.Logf == nil { 129 | return fmt.Errorf("the Logf function must be specified") 130 | } 131 | if err := obj.Prefix.Validate(); err != nil { 132 | return err 133 | } 134 | 135 | if obj.URL == "" { 136 | return fmt.Errorf("must specify a URL") 137 | } 138 | 139 | if _, err := url.Parse(obj.URL); err != nil { 140 | return err // not that url.Parse ever really errors :/ 141 | } 142 | 143 | isHttp := strings.HasPrefix(strings.ToLower(obj.URL), HttpScheme) 144 | isHttps := strings.HasPrefix(strings.ToLower(obj.URL), HttpsScheme) 145 | if !isHttp && !isHttps { 146 | return fmt.Errorf("invalid scheme") 147 | } 148 | 149 | if isHttp && !obj.AllowHttp { 150 | // did you mean https ? 151 | return fmt.Errorf("the http scheme is not allowed without the allow http option") 152 | } 153 | 154 | return nil 155 | } 156 | 157 | // GetParser returns a handle to the parent parser that built this iterator if 158 | // there is one. 159 | func (obj *Http) GetParser() interfaces.Parser { return obj.Parser } 160 | 161 | // GetIterator returns a handle to the parent iterator that built this iterator 162 | // if there is one. 163 | func (obj *Http) GetIterator() interfaces.Iterator { return obj.Iterator } 164 | 165 | // Recurse runs a simple iterator that is responsible for downloading an http 166 | // url into a local filesystem path. If this happens successfully, it 167 | // will return a new FsIterator that is initialized to this root path. 168 | func (obj *Http) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) { 169 | relDir := safepath.UnsafeParseIntoRelDir("http/") 170 | prefix := safepath.JoinToAbsDir(obj.Prefix, relDir) 171 | if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil { 172 | return nil, err 173 | } 174 | 175 | // make a unique ID for the directory 176 | // XXX: we can consider different algorithms or methods here later... 
177 | now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64 178 | sum := sha256.Sum256([]byte(obj.URL + now)) 179 | hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum)) 180 | if err != nil { 181 | return nil, err 182 | } 183 | httpAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir) 184 | 185 | httpMapMutex.Lock() 186 | mu, exists := httpMutexes[obj.URL] 187 | if !exists { 188 | mu = &sync.Mutex{} 189 | httpMutexes[obj.URL] = mu 190 | } 191 | httpMapMutex.Unlock() 192 | 193 | if obj.Debug { 194 | obj.Logf("locking: %s", obj.String()) 195 | } 196 | mu.Lock() // locking happens here (unlock on all errors/returns!) 197 | once := &sync.Once{} 198 | obj.unlock = func() { 199 | fn := func() { 200 | if obj.Debug { 201 | obj.Logf("unlocking: %s", obj.String()) 202 | } 203 | mu.Unlock() 204 | } 205 | once.Do(fn) 206 | } 207 | 208 | // XXX: unlock when context closes? 209 | 210 | u, err := url.Parse(obj.URL) 211 | if err != nil { 212 | // programming error 213 | obj.unlock() 214 | return nil, errwrap.Wrapf(err, "error parsing URL %s", obj.URL) 215 | } 216 | segments := strings.Split(u.Path, "/") 217 | fileName := UnknownFileName // default 218 | if len(segments) > 0 { 219 | fileName = segments[len(segments)-1] 220 | } 221 | 222 | relFile, err := safepath.ParseIntoRelFile(fileName) 223 | if err != nil { 224 | // programming error 225 | obj.unlock() 226 | return nil, err 227 | } 228 | 229 | //directory := httpAbsDir.Path() 230 | fullFileNameAbsFile := safepath.JoinToAbsFile(httpAbsDir, relFile) 231 | fullFileName := fullFileNameAbsFile.Path() 232 | 233 | // make the dir we put the downloaded file into 234 | if err := os.MkdirAll(httpAbsDir.Path(), interfaces.Umask); err != nil { 235 | obj.unlock() 236 | return nil, err 237 | } 238 | 239 | // This is one reason why we have a mutex. 
240 | if _, err := os.Stat(fullFileName); err == nil { 241 | obj.Logf("file %s already exists, overwriting", obj.String()) 242 | } 243 | 244 | // create blank file 245 | file, err := os.Create(fullFileName) 246 | if err != nil { 247 | obj.unlock() 248 | return nil, errwrap.Wrapf(err, "error writing file %s", fullFileNameAbsFile) 249 | } 250 | defer file.Close() 251 | 252 | obj.Logf("downloading %s into %s as %s", obj.URL, httpAbsDir, fileName) 253 | 254 | req, err := http.NewRequestWithContext(ctx, "GET", obj.URL, nil) // XXX: nil? 255 | if err != nil { 256 | obj.unlock() 257 | return nil, errwrap.Wrapf(err, "error building request for %s", obj.URL) 258 | } 259 | 260 | //tr := &http.Transport{ 261 | // IdleConnTimeout: 30 * time.Second, 262 | //} 263 | client := &http.Client{ 264 | //Transport: tr, 265 | 266 | // If CheckRedirect is nil, the Client uses its default policy, 267 | // which is to stop after 10 consecutive requests. 268 | // CheckRedirect func(req *Request, via []*Request) error 269 | CheckRedirect: nil, 270 | } 271 | 272 | // TODO: add a recurring progress logf if it takes longer than 30 sec 273 | resp, err := client.Do(req) 274 | if err != nil { 275 | obj.unlock() 276 | return nil, errwrap.Wrapf(err, "error do-ing request for %s", obj.URL) 277 | } 278 | defer resp.Body.Close() 279 | 280 | // TODO: should we allow others? 
281 | if resp.StatusCode != 200 { 282 | obj.unlock() 283 | return nil, fmt.Errorf("bad status code of: %d", resp.StatusCode) 284 | } 285 | 286 | // FIXME: add a variant that can take a context 287 | size, err := io.Copy(file, resp.Body) 288 | if err != nil { 289 | obj.unlock() 290 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", fullFileNameAbsFile) 291 | } 292 | obj.Logf("copied: %d bytes to disk at %s", size, fullFileNameAbsFile) 293 | 294 | obj.iterators = []interfaces.Iterator{} 295 | 296 | if strings.HasPrefix(obj.URL, HttpScheme) { 297 | u.Scheme = HttpSchemeRaw 298 | } 299 | if strings.HasPrefix(obj.URL, HttpsScheme) { 300 | u.Scheme = HttpsSchemeRaw 301 | } 302 | u.Opaque = "" // encoded opaque data 303 | if _, has := u.User.Password(); has { // redact password 304 | u.User = url.UserPassword(u.User.Username(), "") 305 | } 306 | //u.Host = ? // host or host:port 307 | 308 | u.RawPath = "" // encoded path hint (see EscapedPath method) 309 | u.ForceQuery = false // append a query ('?') even if RawQuery is empty 310 | v := url.Values{} 311 | v.Set("now", now) 312 | u.RawQuery = v.Encode() // encoded query values, without '?' 313 | u.Fragment = "" // fragment for references, without '#' 314 | u.RawFragment = "" // encoded fragment hint (see EscapedFragment method) 315 | 316 | // XXX: if it's a single zip file do we return a zip iterator here or do 317 | // we let the fs iterator sort that out... 318 | iterator := &Fs{ 319 | Debug: obj.Debug, 320 | Logf: func(format string, v ...interface{}) { 321 | obj.Logf(format, v...) // TODO: add a prefix? 322 | }, 323 | Prefix: obj.Prefix, 324 | 325 | Iterator: obj, 326 | 327 | // XXX: what path? 
328 | Path: httpAbsDir, 329 | 330 | GenUID: func(safePath safepath.Path) (string, error) { 331 | if !safepath.HasPrefix(safePath, httpAbsDir) { 332 | // programming error 333 | return "", fmt.Errorf("path doesn't have prefix") 334 | } 335 | 336 | p := "" 337 | // remove httpAbsDir prefix from safePath to get a relPath 338 | relPath, err := safepath.StripPrefix(safePath, httpAbsDir) 339 | if err == nil { 340 | p = relPath.String() 341 | } else if err != nil && safePath.String() != httpAbsDir.String() { 342 | // programming error 343 | return "", errwrap.Wrapf(err, "problem stripping prefix") 344 | } 345 | 346 | x := *u // copy 347 | x.Path += "/" + p 348 | 349 | return x.String(), nil 350 | }, 351 | 352 | //Unlock: unlock, 353 | } 354 | obj.iterators = append(obj.iterators, iterator) 355 | 356 | return obj.iterators, nil 357 | } 358 | 359 | // Close shuts down the iterator and/or performs clean up after the Recurse 360 | // method has run. This must be called if you run Recurse. 361 | func (obj *Http) Close() error { 362 | if obj.unlock != nil { 363 | obj.unlock() 364 | } 365 | var errs error 366 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 367 | if err := obj.iterators[i].Close(); err != nil { 368 | errs = errwrap.Append(errs, err) 369 | } 370 | } 371 | return errs 372 | } 373 | -------------------------------------------------------------------------------- /iterator/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
// You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"fmt"
	"io/fs"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/safepath"
)

var (
	// SkipPathExtensions is a list of file extensions to not scan. This
	// list is alphabetical and has a comment for each element.
	SkipPathExtensions = []string{
		".bmp",       // image format
		".csv",       // data format
		".cvsignore", // csv ignore file
		".doc",       // document format
		".eps",       // image format
		".gif",       // image format
		".gitignore", // git ignore file
		".jpeg",      // image format with weird naming
		".jpg",       // image format
		".ico",       // icon file format
		".pdf",       // document format
		".png",       // image format
		".ppt",       // presentation format (microsoft)
		".svg",       // image format
		".odp",       // presentation format (libreoffice)
		".ods",       // spreadsheet format (libreoffice)
		".odt",       // document format (libreoffice)
		".xls",       // spreadsheet format
	}

	// SkipDirPaths is a list of relative dir paths to not scan. This list
	// is alphabetical and has a comment for each element.
	SkipDirPaths = []string{
		".git/",    // internal git folder
		".github/", // github specific stuff
		".svn/",    // internal svn folder
		//".eggs/", // python ??? directory
	}
)

// SkipPath takes an input path and file info struct, and returns whether we
// should skip over it or not. To skip it, return true and no error. To skip a
// directory, return interfaces.SkipDir as the error. Lastly, if anything goes
// wrong, you can return your own error, but minimizing this chance is ideal.
// The stuff that gets skipped in here *must* be common for all iterators, as
// this function is shared by all of them. Individual backends can have their
// own file skip detection as well. For example, one particular backend might
// not know how to scan *.go files, where as a different one might specialize in
// this. Lastly, a design decision was made to make this a "pure, stateless"
// function. In other words, the decision to skip a file or not should be based
// entirely on the input arguments, and more complicated skip functions that
// might take into account more complex logic, such as the existence of multiple
// file paths is not possible. For example, if someone were to invent a file
// called `.legalignore` that worked like `.gitignore` but told software which
// files copyrights wouldn't apply from, we'd be unable to detect those and skip
// over them with this skip function since it only has a view into individual
// files and doesn't get a stateful, full directory tree view.
func SkipPath(path safepath.Path, info fs.FileInfo) (bool, error) {

	// TODO: This could be built with a list of rules that we pass into the
	// iterator, so that it could be configurable as needed.

	if !path.IsAbs() { // the walk func gives us absolutes
		return false, fmt.Errorf("path %s was not absolute", path.String())
	}

	if info.IsDir() { // path.IsDir()
		absDir, ok := path.(safepath.AbsDir)
		if !ok { // should not happen unless bug
			return false, fmt.Errorf("expected AbsDir")
		}

		for _, dir := range SkipDirPaths {
			relDir := safepath.UnsafeParseIntoRelDir(dir)
			if absDir.HasDir(relDir) {
				return true, interfaces.SkipDir
			}
		}

		return false, nil // don't skip
	}

	absFile, ok := path.(safepath.AbsFile)
	if !ok { // should not happen unless bug
		return false, fmt.Errorf("expected AbsFile")
	}

	for _, ext := range SkipPathExtensions {
		// Make sure we have at least one char in the file name (x.foo)
		// and insensitive match on extensions like .foo that we skip.
		// NOTE(review): this guard compares len(ext) against the length
		// of the full absolute path, not the basename, so for absolute
		// paths it is effectively always true and a file literally
		// named ".gitignore" is still skipped. That seems intended for
		// the ignore-file entries in the list above, but confirm intent
		// before changing this to a basename comparison.
		if absFile.HasExtInsensitive(ext) && len(ext) != len(absFile.Path()) { // case insensitive
			return true, nil
		}
	}

	return false, nil // don't skip
}
--------------------------------------------------------------------------------
/iterator/util.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"strings"
)

// WhichSuffixInsensitive returns the first suffix with the longest match that
// is found in the input string from the list provided. If none are found, then
// the empty string is returned. The comparisons are done in lower case, but
// the returned suffix is in the original case from the input list.
func WhichSuffixInsensitive(s string, suffixList []string) string {
	suffix := ""
	length := 0
	for _, x := range suffixList {
		if strings.HasSuffix(strings.ToLower(s), strings.ToLower(x)) {
			// a longer match always wins over an earlier shorter one
			if l := len(x); l > length {
				suffix = x
				length = l
			}
		}
	}
	return suffix
}
--------------------------------------------------------------------------------
/iterator/util_test.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator_test

import (
	"testing"

	"github.com/awslabs/yesiscan/iterator"
)

// TestWhichSuffixInsensitive checks that matching is case insensitive (".FoO"
// matches ".foO") and that the matched suffix is returned in the original case
// from the list, not from the input string.
func TestWhichSuffixInsensitive(t *testing.T) {
	has := ".FoO"
	exp := ".foO"
	suffixList := []string{
		".fooo", // longer, but not a suffix of `has`
		exp,
		".bar",
		".BAZ",
	}
	if s := iterator.WhichSuffixInsensitive(has, suffixList); s != exp {
		t.Errorf("exp: %s", exp)
		t.Errorf("got: %s", s)
		return
	}
}
--------------------------------------------------------------------------------
/iterator/zip.go:
--------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product.
// Amazon does not offer support for
// this project.
//
// SPDX-License-Identifier: Apache-2.0

package iterator

import (
	"archive/zip"
	"context"
	"crypto/sha256"
	"fmt"
	"io"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/awslabs/yesiscan/interfaces"
	"github.com/awslabs/yesiscan/util/errwrap"
	"github.com/awslabs/yesiscan/util/safepath"
)

const (
	// ZipExtension is the standard extension used for zip URI's.
	ZipExtension = ".zip"

	// JarExtension is used for java .jar files. This is included here since
	// they are just zip files that are named differently.
	JarExtension = ".jar"

	// WhlExtension is used for python .whl files. This is included here since
	// they are just zip files that are named differently.
	WhlExtension = ".whl"
)

var (
	// zipMapMutex guards access to the zipMutexes map below.
	zipMapMutex *sync.Mutex

	// zipMutexes holds one mutex per archive path, so that concurrent
	// Recurse calls unzipping the same file serialize instead of racing
	// on the shared output directory.
	zipMutexes map[string]*sync.Mutex
)

func init() {
	zipMapMutex = &sync.Mutex{}
	zipMutexes = make(map[string]*sync.Mutex)
}

// Zip is an iterator that takes a .zip URI to open and performs the unzip
// operation. It will eventually return an Fs iterator since there's no need for
// it to know how to walk through a filesystem tree itself. It can use a local
// cache so that future calls to the same URI won't have to waste cycles, but
// only in cases when we can determine it will be the same file.
type Zip struct {
	// Debug enables more verbose logging.
	Debug bool

	// Logf is the logging function used by this iterator. It must be set.
	Logf func(format string, v ...interface{})

	// Prefix is the working directory under which extraction happens.
	Prefix safepath.AbsDir

	// Parser is a pointer to the parser that returned this. If it wasn't
	// returned by a parser, leave this nil. If this iterator came from an
	// iterator, then the Iterator handle should be filled instead.
	Parser interfaces.Parser

	// Iterator is a pointer to the iterator that returned this. If it
	// wasn't returned by an iterator, leave this nil. If this iterator came
	// from a parser, then the Parser handle should be filled instead.
	Iterator interfaces.Iterator

	// Path is the location of the file to unzip.
	Path safepath.AbsFile

	// FIXME: add zip max file limit field to prevent zip bombs

	// TODO: add zip password field

	// AllowAnyExtension specifies whether we will attempt to run if the
	// Path does not end with the correct zip extension.
	AllowAnyExtension bool

	// AllowedExtensions specifies a list of extensions that we are allowed
	// to try to decode from. If this is empty, then we allow only the
	// default of zip because allowing no extensions at all would make no
	// sense. If AllowAnyExtension is set, then this has no effect. All the
	// matches are case insensitive.
	AllowedExtensions []string

	// iterators store the list of which iterators we created, so we know
	// which ones we have to close!
	iterators []interfaces.Iterator

	// unlock is a function that should be called as part of the Close
	// method once this resource is finished. It can be defined when
	// building this iterator in case we want a mechanism for the caller of
	// this iterator to tell the child when to unlock any in-use resources.
	// It must be safe to call this function more than once if necessary.
	// This is currently used privately.
	unlock func()
}

// String returns a human-readable representation of the zip path we're looking
// at. The output of this format is not guaranteed to be constant, so don't try
// to parse it.
func (obj *Zip) String() string {
	return fmt.Sprintf("zip: %s", obj.Path)
}

// Validate runs some checks to ensure this iterator was built correctly.
func (obj *Zip) Validate() error {
	if obj.Logf == nil {
		return fmt.Errorf("the Logf function must be specified")
	}
	if err := obj.Prefix.Validate(); err != nil {
		return err
	}

	if obj.Path.Path() == "" {
		return fmt.Errorf("must specify a Path")
	}

	return obj.validateExtension()
}

// validateExtension is a helper function to process our extension validation.
// The decision order matters: AllowAnyExtension wins outright; next, a .zip
// extension is accepted only when no explicit allow-list was given; otherwise
// the path must match one of the AllowedExtensions entries.
func (obj *Zip) validateExtension() error {
	if obj.AllowAnyExtension {
		return nil
	}
	if obj.Path.HasExtInsensitive(ZipExtension) && len(obj.AllowedExtensions) == 0 {
		return nil
	}

	for _, x := range obj.AllowedExtensions {
		if obj.Path.HasExtInsensitive(x) {
			return nil
		}
	}

	// pick the error message that matches how we were configured
	if len(obj.AllowedExtensions) == 0 {
		return fmt.Errorf("the zip extension is required without the allow any extension option")
	}

	return fmt.Errorf("an allowed extension is required to run this iterator")
}

// GetParser returns a handle to the parent parser that built this iterator if
// there is one.
func (obj *Zip) GetParser() interfaces.Parser { return obj.Parser }

// GetIterator returns a handle to the parent iterator that built this iterator
// if there is one.
func (obj *Zip) GetIterator() interfaces.Iterator { return obj.Iterator }

// Recurse runs a simple iterator that is responsible for unzipping a zip URI
// into a local filesystem path. If this happens successfully, it will return a
// new FsIterator that is initialized to this root path.
func (obj *Zip) Recurse(ctx context.Context, scan interfaces.ScanFunc) ([]interfaces.Iterator, error) {
	relDir := safepath.UnsafeParseIntoRelDir("zip/")
	prefix := safepath.JoinToAbsDir(obj.Prefix, relDir)
	if err := os.MkdirAll(prefix.Path(), interfaces.Umask); err != nil {
		return nil, err
	}

	// make a unique ID for the directory
	// XXX: we can consider different algorithms or methods here later...
	now := strconv.FormatInt(time.Now().UnixMilli(), 10) // itoa but int64
	sum := sha256.Sum256([]byte(obj.Path.Path() + now))
	hashRelDir, err := safepath.ParseIntoRelDir(fmt.Sprintf("%x", sum))
	if err != nil {
		return nil, err
	}
	// ensure it gets put into a folder so it doesn't explode current dir
	zipAbsDir := safepath.JoinToAbsDir(prefix, hashRelDir)

	// One mutex per archive path: concurrent unzips of the same file
	// serialize here.
	zipMapMutex.Lock()
	mu, exists := zipMutexes[obj.Path.Path()]
	if !exists {
		mu = &sync.Mutex{}
		zipMutexes[obj.Path.Path()] = mu
	}
	zipMapMutex.Unlock()

	if obj.Debug {
		obj.Logf("locking: %s", obj.String())
	}
	mu.Lock() // locking happens here (unlock on all errors/returns!)
	once := &sync.Once{}
	obj.unlock = func() {
		fn := func() {
			if obj.Debug {
				obj.Logf("unlocking: %s", obj.String())
			}
			mu.Unlock()
		}
		once.Do(fn)
	}

	// XXX: unlock when context closes?

	// XXX: If the destination dir has contents, consider removing them
	// first. This is one reason why we have a mutex.

	// Open the zip archive for reading.
	// FIXME: use a variant that can take a context
	z, err := zip.OpenReader(obj.Path.Path())
	if err == zip.ErrFormat || err == zip.ErrAlgorithm || err == zip.ErrChecksum {
		obj.unlock()
		// Return an "iterator error" instead! This is a magic error
		// that tells the caller that we don't want to nuke the entire
		// scan for one unimportant error! Instead we bubble up and
		// collect this information to return to the user.
		return nil, &interfaces.IteratorError{
			Path: obj.Path.Path(),
			Err:  err,
		}

	} else if err != nil {
		obj.unlock()
		return nil, errwrap.Wrapf(err, "error opening path %s", obj.Path)
	}
	defer z.Close()
	if z.Comment != "" {
		obj.Logf("zip has comment: %s", z.Comment)
	}

	filesTotal := 0
	bytesTotal := int64(0)
	// Iterate through the files in the archive.
	// XXX: can a child directory appear before a parent?
	// TODO: add a recurring progress logf if it takes longer than 30 sec
	for _, x := range z.File {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			obj.unlock()
			return nil, errwrap.Wrapf(ctx.Err(), "ended unzipping early")
		default:
		}

		// TODO: obj.Debug ?
		obj.Logf("zip: %s", x.Name)

		if x.FileInfo().IsDir() {
			relDir, err := safepath.ParseIntoRelDir(x.Name)
			if err != nil {
				// programming error
				obj.unlock()
				return nil, err
			}

			// this is where the new dir will be created
			absDir := safepath.JoinToAbsDir(zipAbsDir, relDir)

			// XXX: sanity check (is output in the dir?)
			// TODO: we could add this, but safepath automatically does this
			// if absDir is not inside of zipAbsDir then error

			// XXX: which mode method?
			//if err := os.MkdirAll(absDir.Path(), x.Mode()); err != nil {
			if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
				// programming error
				obj.unlock()
				return nil, err
			}

			continue
		}

		relFile, err := safepath.ParseIntoRelFile(x.Name)
		if err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// this is where the output file will be stored
		absFile := safepath.JoinToAbsFile(zipAbsDir, relFile)

		// XXX: sanity check (is output in the dir?)
		// TODO: we could add this, but safepath automatically does this
		// if absFile is not inside of zipAbsDir then error

		absDir := absFile.Dir() // get the absDir that absFile is in

		// XXX: which mode to use? Maybe we are assuming a mode here
		// because we haven't seen that dir yet! Maybe if we pre-sort
		// all of the zip file entries first...
		//if err := os.MkdirAll(absDir.Path(), x.Mode()); err != nil {
		if err := os.MkdirAll(absDir.Path(), os.ModePerm); err != nil {
			// programming error
			obj.unlock()
			return nil, err
		}

		// write to this location
		// XXX: which mode method?
		//dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, x.Mode())
		dest, err := os.OpenFile(absFile.Path(), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
		if err != nil {
			obj.unlock()
			return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile)
		}

		// open the actual source file
		f, err := x.Open()
		if err != nil {
			dest.Close() // close dest file on error!
325 | obj.unlock() 326 | return nil, errwrap.Wrapf(err, "error opening file %s", x.Name) 327 | } 328 | // don't `defer` close here because we want to free in the loop 329 | 330 | // FIXME: use a variant that can take a context 331 | size, err := io.Copy(dest, f) 332 | if err != nil { 333 | f.Close() // close file on error! 334 | dest.Close() // close dest file on error! 335 | obj.unlock() 336 | return nil, errwrap.Wrapf(err, "error writing our file to disk at %s", absFile) 337 | } 338 | obj.Logf("unzipped: %d bytes to disk at %s", size, absFile) 339 | 340 | f.Close() // close on success to save memory! 341 | dest.Close() // close dest file on error! 342 | 343 | filesTotal++ 344 | bytesTotal += int64(size) 345 | } 346 | 347 | // TODO: change to human readable bytes 348 | obj.Logf("unzipped: %d files from %s into %s (%d bytes)", filesTotal, obj.String(), zipAbsDir, bytesTotal) 349 | 350 | obj.iterators = []interfaces.Iterator{} 351 | 352 | // if it's a single zip file we return an fs iterator and let the fs 353 | // iterator sort that out... 354 | iterator := &Fs{ 355 | Debug: obj.Debug, 356 | Logf: func(format string, v ...interface{}) { 357 | obj.Logf(format, v...) // TODO: add a prefix? 358 | }, 359 | Prefix: obj.Prefix, 360 | 361 | Iterator: obj, 362 | 363 | Path: zipAbsDir, 364 | 365 | //Unlock: unlock, 366 | } 367 | obj.iterators = append(obj.iterators, iterator) 368 | 369 | return obj.iterators, nil 370 | } 371 | 372 | // Close shuts down the iterator and/or performs clean up after the Recurse 373 | // method has run. This must be called if you run Recurse. 374 | func (obj *Zip) Close() error { 375 | if obj.unlock != nil { 376 | obj.unlock() 377 | } 378 | var errs error 379 | for i := len(obj.iterators) - 1; i >= 0; i-- { // reverse order (stacks!) 
380 | if err := obj.iterators[i].Close(); err != nil { 381 | errs = errwrap.Append(errs, err) 382 | } 383 | } 384 | return errs 385 | } 386 | -------------------------------------------------------------------------------- /lib/profiles.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package lib 25 | 26 | import ( 27 | "fmt" 28 | "sort" 29 | "strings" 30 | 31 | "github.com/awslabs/yesiscan/interfaces" 32 | "github.com/awslabs/yesiscan/util" 33 | "github.com/awslabs/yesiscan/util/licenses" 34 | 35 | colour "github.com/fatih/color" 36 | ) 37 | 38 | const ( 39 | // UseColour specifies whether we use ANSI/HTML colours or not. 40 | UseColour = true 41 | 42 | // DefaultProfileName is the name given to the built-in "include all" 43 | // profile. 
	DefaultProfileName = "default"
)

// ProfileConfig is the datastructure representing the profile config that is
// used for the .json files on disk.
type ProfileConfig struct {

	// Licenses is the list of license SPDX ID's to match.
	Licenses []string `json:"licenses"`

	// Exclude these licenses from match instead of including by default.
	Exclude bool `json:"exclude"`

	// Comment adds a user friendly comment for this file.
	Comment string `json:"comment"`
}

// ProfileData is the parsed version of ProfileConfig with real license structs.
type ProfileData struct {

	// Licenses is the list of license SPDX ID's to match.
	Licenses []*licenses.License

	// Exclude these licenses from match instead of including by default.
	Exclude bool
}

// SimpleProfiles is a simple way to filter the results. This is the first
// filter function created and is mostly used for an initial POC. It is the
// more complicated successor to the SimpleResults function. Style can be
// `ansi`, `html`, or `text`.
//
// NOTE(review): several HTML string literals in this function appear to have
// had their markup stripped in this copy of the file (empty ``/"" literals and
// strings broken across lines) — restore them from upstream before relying on
// the "html" style output.
func SimpleProfiles(results interfaces.ResultSet, passes []string, warnings map[string]error, profile *ProfileData, summary bool, backendWeights map[interfaces.Backend]float64, style string) (string, error) {
	// only the three supported render styles are accepted
	if style != "ansi" && style != "html" && style != "text" {
		return "", fmt.Errorf("invalid style: %s", style)
	}

	// redString renders text in "alert" form: bold red for ansi, a span for
	// html (markup looks stripped here), plain passthrough for text.
	redString := func(format string, a ...interface{}) string {
		if style == "ansi" {
			return colour.New(colour.FgRed).Add(colour.Bold).Sprintf(format, a...)
		}
		if style == "html" {
			return `` + fmt.Sprintf(format, a...) + ""
		}
		return fmt.Sprintf(format, a...)
	}
	// boldString is the emphasis variant of redString.
	boldString := func(format string, a ...interface{}) string {
		if style == "ansi" {
			return colour.New(colour.Bold).Sprintf(format, a...)
		}
		if style == "html" {
			return `` + fmt.Sprintf(format, a...) + ""
		}
		return fmt.Sprintf(format, a...)
	}
	str := ""

	// number of skipped files/dirs; highlighted when non-zero
	countStr := fmt.Sprintf("%d", len(passes))
	if len(passes) > 0 {
		countStr = redString(countStr)
	}

	hasResults := false                  // do we have anything to show?
	licenseMap := make(map[string]int64) // for computing a summary
	errorMap := make(map[string]struct {
		backend string
		err     error
	}) // for recording found skip errors
	// XXX: handle dir's in here specially and merge in their weights with child paths!
Loop:
	for uri, m := range results { // FIXME: sort and process properly
		bs := []*AnnotatedBackend{}
		ttl := 0.0      // total weight for the set of backends at this uri
		skipUri := true // assume we skip
		innerLicenseMap := make(map[string]int64)
		// plus bumps the per-uri count for a license name
		plus := func(name string) {
			val, _ := innerLicenseMap[name] // defaults to zero!
			innerLicenseMap[name] = val + 1
		}
		for backend, result := range m {
			if result.Skip != nil {
				// record the skip error for the error report below
				errorMap[uri] = struct {
					backend string
					err     error
				}{
					backend: backend.String(),
					err:     result.Skip,
				}
			}
			// accounting for licenses summary
			for _, x := range result.Licenses {
				plus(x.String())
			}

			if profile == nil {
				// no profile: show everything
				skipUri = false
			} else {
				// TODO: memoize this for performance
				count := len(licenses.Union(profile.Licenses, result.Licenses))
				// are there licenses that match in our profile?
				if count > 0 && !profile.Exclude {
					skipUri = false
				}

				// are there licenses we didn't account for?
				if len(result.Licenses) > count && profile.Exclude {
					skipUri = false
				}
			}

			weight, exists := backendWeights[backend]
			if !exists {
				return "", fmt.Errorf("no weight found for backend: %s", backend.String())
			}
			b := &AnnotatedBackend{
				Backend: backend,
				Weight:  weight,
			}
			bs = append(bs, b)
			ttl += weight
		}
		if skipUri { // we don't want to display this Uri (this file)
			continue Loop
		}
		f := 0.0 // NOTE: confidence *if* the different results agree!
		//for backend, result := range m {
		for _, b := range bs { // for backend, result := range m
			backend := b.Backend
			weight := b.Weight // backendWeights[backend]
			result := m[backend]
			scale := weight / ttl
			b.ScaledConfidence = result.Confidence * scale
			f = f + b.ScaledConfidence
		}

		// merge into to parent accounting
		for k, v := range innerLicenseMap { // map[string]int64
			val, _ := licenseMap[k] // defaults to zero!
			licenseMap[k] = val + v
		}

		// start table row here after the above continue...
		if style == "html" {
			str += ""
		}

		// display highest scaled confidence first
		sort.Sort(sort.Reverse(SortedBackends(bs)))
		smartURI := util.SmartURI(uri) // make it useful to click on
		if style == "ansi" {
			hyperlink := util.ShellHyperlinkEncode(uri, smartURI)
			str += fmt.Sprintf("%s (%.2f%%)\n", hyperlink, f*100.0)
		}
		if style == "html" {
			hyperlink := util.HtmlHyperlinkEncode(uri, smartURI)
			str += fmt.Sprintf("%s (%.2f%%)", hyperlink, f*100.0)
		}
		if style == "text" {
			// TODO: can we do better for text output?
			str += fmt.Sprintf("%s (%.2f%%)\n", uri, f*100.0)
		}
		hasResults = true

		if style == "html" {
			// NOTE(review): literal appears corrupted (markup stripped)
			str += "
    "
		}
		for _, b := range bs { // for backend, result := range m
			backend := b.Backend
			weight := b.Weight // backendWeights[backend]
			result := m[backend]

			l := licenses.Join(result.Licenses)
			if UseColour && profile != nil {
				ll := []string{}
				// only colour the matched ones!
				for _, x := range result.Licenses {
					r := x.String()
					inList := licenses.InList(x, profile.Licenses)
					if inList && !profile.Exclude || !inList && profile.Exclude {
						r = x.String()
						r = redString(r)
					}

					ll = append(ll, r)
				}
				l = strings.Join(ll, ", ")
			}

			s := ""
			if style == "ansi" {
				s = fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}
			if style == "html" {
				// NOTE(review): literal appears corrupted (markup stripped)
				s = fmt.Sprintf("
  • %s (%.2f/%.2f) %s (%.2f%%)
  • ", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}
			if style == "text" {
				s = fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0)
			}

			str += s
			hasResults = true
			if !debug {
				continue
			}
			// walk the iterator chain back to its origin for tracing
			it := result.Meta.Iterator // at least one must be present
			for {
				str += fmt.Sprintf(" %s\n", it)
				hasResults = true
				newIt := it.GetIterator()
				if newIt == nil {
					break
				}
				it = newIt
			}
			if parser := it.GetParser(); parser != nil {
				str += fmt.Sprintf(" %s\n", parser)
				hasResults = true
			}
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			str += "
 "
			str += ""
		}
	}

	skippedStr := ""
	if style == "ansi" {
		skippedStr = fmt.Sprintf("skipped: %s files/directories\n", countStr)
	}
	if style == "html" {
		// NOTE(review): literals appear corrupted (markup stripped)
		s := ``
		s += fmt.Sprintf("", countStr)
		s += "
skipped: %s files/directories
"
		skippedStr = s
	}
	if style == "text" {
		// BUG(review): countStr is a string, so %d renders as
		// %!d(string=...); this should be %s like the ansi branch.
		skippedStr = fmt.Sprintf("skipped: %d files/directories\n", countStr)
	}

	erroredStr := ""
	if len(errorMap) > 0 { // keep it in scope
		// sort uris for deterministic output
		names := []string{}
		for k := range errorMap { // map[string]error
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			s := "errors:\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %s (%s)\n", x, redString(errorMap[x].err.Error()), errorMap[x].backend)
			}
			erroredStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += `
errors:
`
			for _, x := range names {
				s += fmt.Sprintf("
%s%s (%s)
", x, redString(errorMap[x].err.Error()), errorMap[x].backend)
			}

			s += "
"
			erroredStr = s
		}
	}

	warningStr := ""
	if len(warnings) > 0 { // keep it in scope
		// sort keys for deterministic output
		names := []string{}
		for k := range warnings { // map[string]error
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			// NOTE(review): heading reads "errors:" but this section
			// renders the warnings map — likely a copy-paste from the
			// errored section above; confirm intent.
			s := "errors:\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %s\n", x, redString(warnings[x].Error()))
			}
			warningStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += `
errors:
`
			for _, x := range names {
				s += fmt.Sprintf("
%s%s
", x, redString(warnings[x].Error()))
			}

			s += "
"
			warningStr = s
		}
	}

	noResultsStr := ""
	if !hasResults {
		noResultsStr = ""
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += "
no results
"
			noResultsStr = s
		}
	}

	summaryStr := ""
	if summary {
		// sort license names for deterministic output
		names := []string{}
		for k := range licenseMap { // map[string]int64
			names = append(names, k)
		}
		sort.Strings(names)
		if style == "ansi" || style == "text" {
			s := boldString("summary:") + "\n"
			for _, x := range names {
				s += fmt.Sprintf("%s: %d\n", x, licenseMap[x])
			}
			summaryStr = s
		}
		if style == "html" {
			// NOTE(review): literals appear corrupted (markup stripped)
			s := ``
			s += fmt.Sprintf(`
%s
`, boldString("summary:"))
			for _, x := range names {
				s += fmt.Sprintf("
%s%d
", x, licenseMap[x])
			}

			s += "
"
			summaryStr = s
		}
	}

	if !hasResults {
		summaryStr = ""
	}
	// glue it all together
	str = skippedStr + warningStr + erroredStr + summaryStr + noResultsStr + str

	return str, nil
}
-------------------------------------------------------------------------------- /lib/results.go: --------------------------------------------------------------------------------
// Copyright Amazon.com Inc or its affiliates and the project contributors
// Written by James Shubin and the project contributors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
//
// We will never require a CLA to submit a patch. All contributions follow the
// `inbound == outbound` rule.
//
// This is not an official Amazon product. Amazon does not offer support for
// this project.
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package lib 25 | 26 | import ( 27 | "fmt" 28 | "sort" 29 | 30 | "github.com/awslabs/yesiscan/interfaces" 31 | "github.com/awslabs/yesiscan/util" 32 | "github.com/awslabs/yesiscan/util/licenses" 33 | ) 34 | 35 | const ( 36 | debug = false 37 | ) 38 | 39 | type AnnotatedBackend struct { 40 | Backend interfaces.Backend 41 | Weight float64 42 | ScaledConfidence float64 43 | } 44 | 45 | type SortedBackends []*AnnotatedBackend 46 | 47 | func (obj SortedBackends) Len() int { return len(obj) } 48 | func (obj SortedBackends) Swap(i, j int) { obj[i], obj[j] = obj[j], obj[i] } 49 | func (obj SortedBackends) Less(i, j int) bool { 50 | return obj[i].ScaledConfidence < obj[j].ScaledConfidence 51 | } 52 | 53 | //func (obj SortedBackends) Sort() { sort.Sort(obj) } 54 | 55 | // SimpleResults is a simple way to format the results. This is the first 56 | // display function created and is mostly used for debugging and initial POC. 57 | func SimpleResults(results interfaces.ResultSet, backendWeights map[interfaces.Backend]float64) (string, error) { 58 | if len(results) == 0 { 59 | return "", fmt.Errorf("no results obtained") 60 | } 61 | 62 | str := "" 63 | // XXX: handle dir's in here specially and merge in their weights with child paths! 64 | for uri, m := range results { // FIXME: sort and process properly 65 | bs := []*AnnotatedBackend{} 66 | ttl := 0.0 // total weight for the set of backends at this uri 67 | for backend := range m { 68 | weight, exists := backendWeights[backend] 69 | if !exists { 70 | return "", fmt.Errorf("no weight found for backend: %s", backend.String()) 71 | } 72 | b := &AnnotatedBackend{ 73 | Backend: backend, 74 | Weight: weight, 75 | } 76 | bs = append(bs, b) 77 | ttl += weight 78 | } 79 | f := 0.0 // NOTE: confidence *if* the different results agree! 
80 | //for backend, result := range m { 81 | for _, b := range bs { // for backend, result := range m 82 | backend := b.Backend 83 | weight := b.Weight // backendWeights[backend] 84 | result := m[backend] 85 | scale := weight / ttl 86 | b.ScaledConfidence = result.Confidence * scale 87 | f = f + b.ScaledConfidence 88 | } 89 | 90 | sort.Sort(sort.Reverse(SortedBackends(bs))) 91 | display := uri // show the URI 92 | smartURI := util.SmartURI(uri) 93 | hyperlink := util.ShellHyperlinkEncode(display, smartURI) 94 | str += fmt.Sprintf("%s (%.2f%%)\n", hyperlink, f*100.0) 95 | for _, b := range bs { // for backend, result := range m 96 | backend := b.Backend 97 | weight := b.Weight // backendWeights[backend] 98 | result := m[backend] 99 | l := licenses.Join(result.Licenses) 100 | str += fmt.Sprintf(" %s (%.2f/%.2f) %s (%.2f%%)\n", backend.String(), weight, ttl, l, result.Confidence*100.0) 101 | if !debug { 102 | continue 103 | } 104 | it := result.Meta.Iterator // at least one must be present 105 | for { 106 | str += fmt.Sprintf(" %s\n", it) 107 | newIt := it.GetIterator() 108 | if newIt == nil { 109 | break 110 | } 111 | it = newIt 112 | } 113 | if parser := it.GetParser(); parser != nil { 114 | str += fmt.Sprintf(" %s\n", parser) 115 | } 116 | } 117 | } 118 | return str, nil 119 | } 120 | -------------------------------------------------------------------------------- /parser/parser.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. 
You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package parser 25 | 26 | import ( 27 | "fmt" 28 | "net/url" 29 | "os" 30 | "path/filepath" 31 | "strings" 32 | 33 | "github.com/awslabs/yesiscan/interfaces" 34 | "github.com/awslabs/yesiscan/iterator" 35 | "github.com/awslabs/yesiscan/util/errwrap" 36 | "github.com/awslabs/yesiscan/util/safepath" 37 | "github.com/go-git/go-git/v5/plumbing" 38 | ) 39 | 40 | // TrivialURIParser takes input as a single string. It expects either a URL or a 41 | // Path component as the input. 42 | type TrivialURIParser struct { 43 | Debug bool 44 | Logf func(format string, v ...interface{}) 45 | Prefix safepath.AbsDir 46 | 47 | Input string 48 | } 49 | 50 | func (obj *TrivialURIParser) String() string { 51 | return fmt.Sprintf("trivialuriparser(%s)", obj.Input) 52 | } 53 | 54 | func (obj *TrivialURIParser) Parse() ([]interfaces.Iterator, error) { 55 | if obj.Input == "" { 56 | return nil, fmt.Errorf("empty input") 57 | } 58 | 59 | iterators := []interfaces.Iterator{} 60 | 61 | // NOTE: it's unlikely that the url.Parse method ever errors. 
62 | u, err := url.Parse(obj.Input) 63 | if err != nil { 64 | return nil, errwrap.Wrapf(err, "could not parse URL") 65 | } 66 | s := u.String() 67 | 68 | if obj.Debug { 69 | obj.Logf("scheme: %s", u.Scheme) 70 | obj.Logf("host: %s", u.Host) 71 | obj.Logf("path: %s", u.Path) 72 | } 73 | 74 | // TODO: consider allowing HttpSchemeRaw as well (with a flag) 75 | if strings.ToLower(u.Scheme) == iterator.HttpSchemeRaw { 76 | return nil, fmt.Errorf("plain http is currently blocked, did you mean https?") 77 | } 78 | 79 | // this is a bit of a heuristic, but we'll go with it for now 80 | // this is because we get https:// urls that are really github git URI's 81 | isTar := strings.HasSuffix(strings.ToLower(s), iterator.TarExtension) 82 | if strings.ToLower(u.Scheme) == iterator.HttpsSchemeRaw && (isZip(s) || isGzip(s) || isTar || isBzip2(s)) { 83 | iterator := &iterator.Http{ 84 | Debug: obj.Debug, 85 | Logf: func(format string, v ...interface{}) { 86 | obj.Logf("iterator: "+format, v...) 87 | }, 88 | Prefix: obj.Prefix, 89 | URL: s, // TODO: pass a *net.URL instead? 90 | AllowHttp: false, // allow non-https ? 91 | 92 | Parser: obj, // store a handle to the originator 93 | } 94 | iterators = append(iterators, iterator) 95 | return iterators, nil 96 | } 97 | 98 | if isGit(u) { 99 | // TODO: for now, just assume it can only be a git iterator... 100 | // Checking if commit hash exists at the end of the URL. 101 | // examples of URLs of different hosts containing commit hashes: 102 | // github: https://github.com/awslabs/yesiscan/commit/496d080bc7fe835511d7220f127e118d0881b792 103 | // webrtc: https://webrtc.googlesource.com/src.git/+/c276aee4eda7b1a466b139838f20e790bd746309 104 | // TODO: Might need to be generalized in the future as we add more URL patterns. 
105 | hash := "" 106 | index := strings.LastIndex(u.Path, "/") 107 | pathSuffix := u.Path[index+1:] 108 | if plumbing.IsHash(pathSuffix) { 109 | hash = pathSuffix 110 | // Here we are removing the parts of the URL which are there because 111 | // of a commit hash such that the repository can be cloned properly. 112 | u.Path = u.Path[:index] 113 | index := strings.LastIndex(u.Path, "/") 114 | u.Path = u.Path[:index] 115 | s = u.String() 116 | } 117 | iterator := &iterator.Git{ 118 | Debug: obj.Debug, 119 | Logf: func(format string, v ...interface{}) { 120 | obj.Logf("iterator: "+format, v...) 121 | }, 122 | Prefix: obj.Prefix, 123 | URL: s, // TODO: pass a *net.URL instead? 124 | TrimGitSuffix: true, 125 | Hash: hash, 126 | Parser: obj, // store a handle to the originator 127 | } 128 | iterators = append(iterators, iterator) 129 | return iterators, nil 130 | } 131 | 132 | // path component (absolute or relative, file or dir) 133 | if u.Scheme == "" { 134 | // XXX: we could auto-detect the dir bit 135 | isDir := strings.HasSuffix(obj.Input, "/") 136 | info, err := os.Stat(obj.Input) // XXX: stat or Lstat? 137 | if err != nil { 138 | return nil, err 139 | } 140 | if isDir != info.IsDir() { 141 | return nil, fmt.Errorf("input path must end with a trailing slash if it's a dir") 142 | } 143 | 144 | p, err := filepath.Abs(obj.Input) 145 | if err != nil { 146 | return nil, err 147 | } 148 | if isDir { 149 | p += "/" // filepath.Abs calls filepath.Clean which strips this 150 | } 151 | 152 | path, err := safepath.ParseIntoPath(p, isDir) 153 | if err != nil { 154 | return nil, err 155 | } 156 | iterator := &iterator.Fs{ 157 | Debug: obj.Debug, 158 | Logf: func(format string, v ...interface{}) { 159 | obj.Logf("iterator: "+format, v...) 
160 | }, 161 | Prefix: obj.Prefix, 162 | Path: path, 163 | 164 | Parser: obj, // store a handle to the originator 165 | } 166 | iterators = append(iterators, iterator) 167 | return iterators, nil 168 | } 169 | 170 | obj.Logf("i'm not sure how to parse this URI, please report this if you think I should be able to!") 171 | return nil, fmt.Errorf("i'm not sure how to parse this uri") 172 | } 173 | 174 | // isGit is a small helper to decide if we should run the git iterator or not. 175 | // TODO: we should expand this function as it's a heuristic. maybe we can do 176 | // better overall and not need a heuristic. time will tell... 177 | func isGit(u *url.URL) bool { 178 | if strings.ToLower(u.Scheme) == iterator.GitSchemeRaw { 179 | return true 180 | } 181 | if strings.ToLower(u.Scheme) == iterator.HttpsSchemeRaw { 182 | hosts := []string{"github.com", "webrtc.googlesource.com"} 183 | urlHost := strings.ToLower(u.Host) 184 | for _, host := range hosts { 185 | if urlHost == host { 186 | return true 187 | } 188 | } 189 | } 190 | 191 | return false 192 | } 193 | 194 | // isZip is a helper method to determine whether a string has a Zip extension 195 | // suffix. 196 | func isZip(input string) bool { 197 | extensions := []string{iterator.ZipExtension, iterator.JarExtension, iterator.WhlExtension} 198 | for _, extension := range extensions { 199 | if strings.HasSuffix(strings.ToLower(input), extension) { 200 | return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | // isGzip is a helper method to determine whether a string has a Gzip extension 207 | // suffix. 208 | func isGzip(input string) bool { 209 | for _, extension := range iterator.GzipExtensions { 210 | if strings.HasSuffix(strings.ToLower(input), extension) { 211 | return true 212 | } 213 | } 214 | return false 215 | } 216 | 217 | // isBzip2 is a helper method to determine whether a string has a Bzip2 218 | // extension suffix. 
219 | func isBzip2(input string) bool { 220 | for _, extension := range iterator.Bzip2Extensions { 221 | if strings.HasSuffix(strings.ToLower(input), extension) { 222 | return true 223 | } 224 | } 225 | return false 226 | } 227 | -------------------------------------------------------------------------------- /s3/s3.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package s3 25 | 26 | import ( 27 | "bytes" 28 | "context" 29 | "crypto/md5" 30 | "encoding/base64" 31 | "errors" 32 | "fmt" 33 | "io" 34 | "time" 35 | 36 | "github.com/awslabs/yesiscan/util/errwrap" 37 | 38 | "github.com/aws/aws-sdk-go-v2/aws" 39 | s3config "github.com/aws/aws-sdk-go-v2/config" 40 | "github.com/aws/aws-sdk-go-v2/service/s3" 41 | s3types "github.com/aws/aws-sdk-go-v2/service/s3/types" 42 | ) 43 | 44 | const ( 45 | // GrantReadAllUsers is the constant used to give read access to all. 
46 | GrantReadAllUsers = "uri=http://acs.amazonaws.com/groups/global/AllUsers" 47 | 48 | // DefaultRegion is a region to use if none are specified. 49 | DefaultRegion = "ca-central-1" // yul 50 | ) 51 | 52 | // PubURL returns the public URL for an object in a given region and bucket. 53 | // This depends on you setting the appropriate permissions and choosing valid 54 | // input parameters. No validation is done, this is just templating. 55 | func PubURL(region, bucket, object string) string { 56 | return fmt.Sprintf("https://%s.s3.%s.amazonaws.com/%s", bucket, region, object) 57 | } 58 | 59 | // Inputs is the set of information required to use the Store method. 60 | type Inputs struct { 61 | // Region is the region where we will push the data. 62 | Region string 63 | 64 | // BucketName is the name of the bucket. 65 | BucketName string 66 | 67 | // CreateBucket is true if we wish to create the bucket if it's missing. 68 | CreateBucket bool 69 | 70 | // ObjectName is the name of the object. 71 | ObjectName string 72 | 73 | // GrantReadAllUsers specifies that all users read access will be set on 74 | // this object. Only use this if you are certain you want anyone on the 75 | // internet to be able to read this object. 76 | GrantReadAllUsers bool 77 | 78 | // ContentType is what is set for the object if it is non-nil. 79 | ContentType *string 80 | 81 | // Data is the actual data to store. 82 | Data []byte 83 | 84 | Debug bool 85 | Logf func(format string, v ...interface{}) 86 | } 87 | 88 | // Store takes some inputs and stores the data into s3. If successful, it 89 | // returns a presign URL that can be shared to give access to the object. If you 90 | // chose to make the object public, then it can also be accessed using the 91 | // well-known public URL as obtained by the PubURL function. This depends on you 92 | // having appropriate AWS credentials set up on your machine for the account you 93 | // want to use. 
94 | func Store(ctx context.Context, inputs *Inputs) (string, error) { 95 | if inputs.Debug { 96 | inputs.Logf("begin s3...") 97 | defer inputs.Logf("done s3") 98 | } 99 | 100 | // TODO: check if region is valid? 101 | if inputs.Region == "" { 102 | return "", fmt.Errorf("empty region") 103 | } 104 | 105 | cfg, err := s3config.LoadDefaultConfig(ctx, s3config.WithRegion(inputs.Region)) 106 | if err != nil { 107 | return "", errwrap.Wrapf(err, "config error") 108 | } 109 | cfg.Region = inputs.Region 110 | client := s3.NewFromConfig(cfg) 111 | 112 | if inputs.CreateBucket { 113 | if inputs.Debug { 114 | inputs.Logf("creating bucket...") 115 | } 116 | createBucketInput := &s3.CreateBucketInput{ 117 | Bucket: &inputs.BucketName, 118 | 119 | // The configuration information for the bucket. 120 | CreateBucketConfiguration: &s3types.CreateBucketConfiguration{ 121 | // Specifies the Region where the bucket will be 122 | // created. If you don't specify a Region, the 123 | // bucket is created in the US East 124 | // (N. Virginia) Region (us-east-1). 125 | //LocationConstraint: s3types.BucketLocationConstraintCaCentral1, 126 | // it's a string region 127 | LocationConstraint: s3types.BucketLocationConstraint(inputs.Region), 128 | }, 129 | } 130 | 131 | _, err := client.CreateBucket(ctx, createBucketInput) 132 | //*CreateBucketOutput 133 | if err == nil { 134 | inputs.Logf("bucket created") 135 | } 136 | 137 | // ignore the error if it shows bucket already exists 138 | var bucketErr error 139 | for err != nil { 140 | bucketErr = err // we have an error! 141 | if _, ok := err.(*s3types.BucketAlreadyOwnedByYou); ok { 142 | bucketErr = nil // ignore me! 
143 | break 144 | } 145 | err = errors.Unwrap(err) 146 | } 147 | if bucketErr != nil { 148 | return "", errwrap.Wrapf(bucketErr, "bucket creation issue") 149 | } 150 | if inputs.Debug { 151 | inputs.Logf("bucket should exist") 152 | } 153 | } 154 | 155 | body := bytes.NewReader(inputs.Data) // support seek 156 | 157 | // we hash this to make idempotent puts avoid copying the data again... 158 | h := md5.New() 159 | if _, err := io.Copy(h, body); err != nil { 160 | return "", errwrap.Wrapf(err, "copy to hash error") 161 | } 162 | // rewind after hashing 163 | if _, err := body.Seek(0, io.SeekStart); err != nil { 164 | return "", errwrap.Wrapf(err, "seek error") 165 | } 166 | 167 | md5s := base64.StdEncoding.EncodeToString(h.Sum(nil)) 168 | if inputs.Debug { 169 | inputs.Logf("md5s: %s", md5s) 170 | } 171 | 172 | putObjectInput := &s3.PutObjectInput{ 173 | Bucket: &inputs.BucketName, // this member is required 174 | 175 | Key: &inputs.ObjectName, // this member is required 176 | 177 | // For using values that are not seekable (io.Seeker) see, 178 | // https://aws.github.io/aws-sdk-go-v2/docs/sdk-utilisties/s3/#unseekable-streaming-input 179 | Body: body, // io.Reader 180 | 181 | ContentMD5: &md5s, 182 | 183 | ContentType: inputs.ContentType, 184 | 185 | StorageClass: s3types.StorageClassStandard, 186 | } 187 | if inputs.GrantReadAllUsers { // give all users on internet read access! 188 | putObjectInput.GrantRead = aws.String(GrantReadAllUsers) 189 | } 190 | 191 | inputs.Logf("putting object...") 192 | if _, err := client.PutObject(ctx, putObjectInput); err != nil { 193 | return "", errwrap.Wrapf(err, "put error") 194 | } 195 | 196 | // X-Amz-Expires must be less than a week (in seconds); that is, the 197 | // given X-Amz-Expires must be less than 604800 seconds. (equal is okay) 198 | // TODO: i suppose we could allow the user to specify the expiry time, 199 | // but the maximum is so short, we'll hardcode this in here for now. 
200 | presignClient := s3.NewPresignClient(client, s3.WithPresignExpires(7*24*time.Hour)) 201 | 202 | presignResult, err := presignClient.PresignGetObject(ctx, &s3.GetObjectInput{ 203 | Bucket: aws.String(inputs.BucketName), 204 | Key: aws.String(inputs.ObjectName), 205 | }) 206 | 207 | if err != nil { 208 | return "", errwrap.Wrapf(err, "presign error") 209 | } 210 | 211 | return presignResult.URL, nil 212 | } 213 | -------------------------------------------------------------------------------- /s3/screenshot-s3-public-bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purpleidea/yesiscan/a9f8980f4f152aa15610bec090c7c6503db8ee6f/s3/screenshot-s3-public-bucket.png -------------------------------------------------------------------------------- /util/ansi/ansi.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package ansi 25 | 26 | import ( 27 | "fmt" 28 | "os" 29 | "strings" 30 | "sync" 31 | 32 | "golang.org/x/term" 33 | ) 34 | 35 | // Logf is a complex printing thing to do some ansi terminal escape sequence 36 | // magic. 37 | // FIXME: there might be bugs if Ellipsis is very big and Width is very small. 38 | type Logf struct { 39 | // Prefix is a prefix to append to each message. You can leave this 40 | // empty. 41 | Prefix string 42 | 43 | // Ellipsis is what is appended to the end of each message when 44 | // truncating. You can leave this empty. 45 | Ellipsis string 46 | 47 | // Enable specifies whether you want to turn this on or not. 48 | Enable bool 49 | 50 | // Prefixes are a list of string prefixes to match when deciding to 51 | // delete a previous entry. 52 | Prefixes []string 53 | 54 | mutex *sync.Mutex 55 | previous string 56 | isTerminal bool 57 | width int 58 | } 59 | 60 | // Init must be called once before Logf is used. As a convenience, this returns 61 | // the Logf function that you should use! 62 | func (obj *Logf) Init() func(format string, v ...interface{}) { 63 | obj.mutex = &sync.Mutex{} 64 | //obj.previous = "" 65 | obj.isTerminal = term.IsTerminal(0) 66 | var err error 67 | obj.width, _, err = term.GetSize(0) 68 | if err != nil { 69 | obj.isTerminal = false // keep it simple, who cares 70 | } 71 | 72 | return obj.Logf 73 | } 74 | 75 | // Logf is the actual Logf function you should use. You must run Init before 76 | // you use this. 77 | func (obj *Logf) Logf(format string, v ...interface{}) { 78 | s := fmt.Sprintf(format, v...) 79 | 80 | if obj.isTerminal { 81 | // TODO: what about multi-char width UTF-8 stuff? 
82 | if len(s) > obj.width-len(obj.Prefix) { // truncate/ellipsize 83 | s = s[0:obj.width-len(obj.Prefix)-len(obj.Ellipsis)] + obj.Ellipsis 84 | } 85 | } 86 | s = s + "\n" // add the newline in 87 | 88 | obj.mutex.Lock() // for safety 89 | validPrefix := false 90 | for _, p := range obj.Prefixes { 91 | b := strings.HasPrefix(obj.previous, p) 92 | validPrefix = validPrefix || b 93 | } 94 | 95 | if obj.Enable && obj.previous != "" && validPrefix { 96 | // move up 1 line, clear to left 97 | fmt.Fprint(os.Stderr, "\033[1A\033[K") // not 1K as you'd think 98 | } 99 | fmt.Fprint(os.Stderr, obj.Prefix+s) // actually print 100 | 101 | obj.previous = s // save for later 102 | obj.mutex.Unlock() 103 | } 104 | -------------------------------------------------------------------------------- /util/errwrap/errwrap.go: -------------------------------------------------------------------------------- 1 | // Mgmt 2 | // Copyright (C) 2013-2021+ James Shubin and the project contributors 3 | // Written by James Shubin and the project contributors 4 | // 5 | // This program is free software: you can redistribute it and/or modify 6 | // it under the terms of the GNU General Public License as published by 7 | // the Free Software Foundation, either version 3 of the License, or 8 | // (at your option) any later version. 9 | // 10 | // This program is distributed in the hope that it will be useful, 11 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | // GNU General Public License for more details. 14 | // 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program. If not, see . 17 | 18 | // NOTE: This was copied from https://github.com/purpleidea/mgmt/ but the author 19 | // has allowed it to be distributed as LGPL-3.0+ for easier use in this project. 
20 | // SPDX-License-Identifier: LGPL-3.0-linking-exception 21 | 22 | // Package errwrap contains some error helpers. 23 | package errwrap 24 | 25 | import ( 26 | "github.com/hashicorp/go-multierror" 27 | "github.com/pkg/errors" 28 | ) 29 | 30 | // Wrapf adds a new error onto an existing chain of errors. If the new error to 31 | // be added is nil, then the old error is returned unchanged. 32 | func Wrapf(err error, format string, args ...interface{}) error { 33 | return errors.Wrapf(err, format, args...) 34 | } 35 | 36 | // Append can be used to safely append an error onto an existing one. If you 37 | // pass in a nil error to append, the existing error will be returned unchanged. 38 | // If the existing error is already nil, then the new error will be returned 39 | // unchanged. This makes it easy to use Append as a safe `reterr += err`, when 40 | // you don't know if either is nil or not. 41 | func Append(reterr, err error) error { 42 | if reterr == nil { // keep it simple, pass it through 43 | return err // which might even be nil 44 | } 45 | if err == nil { // no error, so don't do anything 46 | return reterr 47 | } 48 | // both are real errors 49 | return multierror.Append(reterr, err) 50 | } 51 | 52 | // String returns a string representation of the error. In particular, if the 53 | // error is nil, it returns an empty string instead of panicing. 54 | func String(err error) string { 55 | if err == nil { 56 | return "" 57 | } 58 | return err.Error() 59 | } 60 | 61 | // Cause returns the top-most error that we can print directly to the end-user. 
// Cause returns the top-most error that we can print directly to the end-user.
// It delegates to the pkg/errors Cause helper, which unwraps the chain built
// by Wrapf down to the original root error.
func Cause(err error) error {
	return errors.Cause(err)
}
30 | package licenses 31 | 32 | import ( 33 | "bytes" 34 | "embed" 35 | "encoding/json" 36 | "fmt" 37 | "strings" 38 | "sync" 39 | ) 40 | 41 | // licensesJson is populated automatically at build-time from the official spdx 42 | // licenses.json file, which is linked into this repository as a git submodule. 43 | // 44 | //go:embed license-list-data/json/licenses.json 45 | var licensesJSON []byte 46 | 47 | //go:embed license-list-data/json/details/*.json 48 | var licensesTextJSON embed.FS 49 | 50 | //go:embed license-list-data/json/exceptions.json 51 | var exceptionsJson []byte 52 | 53 | //go:embed license-list-data/json/exceptions/*.json 54 | var exceptionsTextJSON embed.FS 55 | 56 | var ( 57 | once sync.Once 58 | LicenseList LicenseListSPDX // this gets populated during init() 59 | ) 60 | 61 | func init() { 62 | once.Do(decode) 63 | } 64 | 65 | // TODO: import the exceptions if we ever decide we want to look at those. 66 | func decode() { 67 | buffer := bytes.NewBuffer(licensesJSON) 68 | decoder := json.NewDecoder(buffer) 69 | if err := decoder.Decode(&LicenseList); err != nil { 70 | panic(fmt.Sprintf("error decoding spdx license list: %+v", err)) 71 | } 72 | if len(LicenseList.Licenses) == 0 { 73 | panic(fmt.Sprintf("could not find any licenses to decode")) 74 | } 75 | 76 | // debug 77 | //dirEntry, err := licensesTextJSON.ReadDir("license-list-data/json/details") 78 | //if err != nil { 79 | // panic(fmt.Sprintf("error: %+v", err)) 80 | //} 81 | //for _, x := range dirEntry { 82 | // fmt.Printf("Name: %+v\n", x.Name()) 83 | //} 84 | 85 | for _, license := range LicenseList.Licenses { 86 | //fmt.Printf("ID: %+v\n", license.LicenseID) // debug 87 | 88 | f := "license-list-data/json/details/" + strings.TrimPrefix(license.Reference, "./") 89 | data, err := licensesTextJSON.ReadFile(f) 90 | if err != nil { 91 | panic(fmt.Sprintf("error reading spdx license file: %s, error: %+v", f, err)) 92 | } 93 | //fmt.Printf("Data: %s\n", string(data)) // debug 94 | buffer := 
bytes.NewBuffer(data) 95 | decoder := json.NewDecoder(buffer) 96 | 97 | if err := decoder.Decode(&license); err != nil { 98 | panic(fmt.Sprintf("error decoding spdx license text: %+v", err)) 99 | } 100 | //fmt.Printf("Text: %+v\n", license.Text) // debug 101 | if license.Text == "" { 102 | panic(fmt.Sprintf("could not find any license text for: %s", license.LicenseID)) 103 | } 104 | } 105 | } 106 | 107 | // LicenseListSPDX is modelled after the official SPDX licenses.json file. 108 | type LicenseListSPDX struct { 109 | Version string `json:"licenseListVersion"` 110 | 111 | Licenses []*LicenseSPDX `json:"licenses"` 112 | } 113 | 114 | // LicenseSPDX is modelled after the official SPDX license entries. It also 115 | // includes fields from the referenced fields, which include the full text. 116 | type LicenseSPDX struct { 117 | // Reference is a link to the full license .json file. 118 | Reference string `json:"reference"` 119 | IsDeprecated bool `json:"isDeprecatedLicenseId"` 120 | DetailsURL string `json:"detailsUrl"` 121 | // ReferenceNumber is an index number for the license. I wouldn't 122 | // consider this to be stable over time. 123 | ReferenceNumber int64 `json:"referenceNumber"` 124 | // Name is a friendly name for the license. 125 | Name string `json:"name"` 126 | // LicenseID is the SPDX ID for the license. 127 | LicenseID string `json:"licenseId"` 128 | SeeAlso []string `json:"seeAlso"` 129 | IsOSIApproved bool `json:"isOsiApproved"` 130 | 131 | //IsDeprecated bool `json:"isDeprecatedLicenseId"` // appears again 132 | IsFSFLibre bool `json:"isFsfLibre"` 133 | Text string `json:"licenseText"` 134 | } 135 | 136 | // License is a representation of a license. It's better than a simple SPDX ID 137 | // as a string, because it allows us to store alternative representations to an 138 | // internal or different representation, as well as any other information that 139 | // we want to have associated here. 
140 | type License struct { 141 | // SPDX is the well-known SPDX ID for the license. 142 | SPDX string 143 | 144 | // Origin shows a different license provenance, and associated custom 145 | // name. It should probably be a "reverse-dns" style unique identifier. 146 | Origin string 147 | // Custom is a custom string that is a unique identifier for the license 148 | // in the aforementioned Origin namespace. 149 | Custom string 150 | } 151 | 152 | // String returns a string representation of whatever license is specified. 153 | func (obj *License) String() string { 154 | if obj.Origin != "" && obj.Custom != "" { 155 | return fmt.Sprintf("%s(%s)", obj.Custom, obj.Origin) 156 | } 157 | 158 | if obj.Origin == "" && obj.Custom != "" { 159 | return fmt.Sprintf("%s(unknown)", obj.Custom) // TODO: display this differently? 160 | } 161 | 162 | // TODO: replace with a different short name if one exists 163 | return obj.SPDX 164 | } 165 | 166 | // Validate returns an error if the license doesn't have a valid representation. 167 | // For example, if you express the license as an SPDX ID, this will validate 168 | // that it is among the known licenses. 169 | func (obj *License) Validate() error { 170 | if obj.SPDX != "" { 171 | // if an SPDX ID is specified, we validate based on it! 172 | _, err := ID(obj.SPDX) 173 | return err 174 | } 175 | 176 | // valid, but from an unknown origin 177 | if obj.Origin != "" && obj.Custom != "" { 178 | return nil 179 | } 180 | 181 | if obj.Origin == "" && obj.Custom != "" { 182 | return fmt.Errorf("unknown custom license: %s", obj.Custom) 183 | } 184 | 185 | return fmt.Errorf("unknown license format") 186 | } 187 | 188 | // Cmp compares two licenses and determines if they are identical. 
189 | func (obj *License) Cmp(license *License) error { 190 | if obj.SPDX != license.SPDX { 191 | return fmt.Errorf("the SPDX field differs") 192 | } 193 | if obj.Origin != license.Origin { 194 | return fmt.Errorf("the Origin field differs") 195 | } 196 | if obj.Custom != license.Custom { 197 | return fmt.Errorf("the Custom field differs") 198 | } 199 | 200 | return nil 201 | } 202 | 203 | // ID looks up the license from the imported list. Do not modify the result as 204 | // it is the global database that everyone is using. 205 | func ID(spdx string) (*LicenseSPDX, error) { 206 | for _, license := range LicenseList.Licenses { 207 | if spdx == license.LicenseID { 208 | return license, nil 209 | } 210 | } 211 | return nil, fmt.Errorf("license ID (%s) not found", spdx) 212 | } 213 | 214 | // StringToLicense takes an input string and returns a license struct. This can 215 | // handle both normal SPDX ID's and the origin strings in the `name(origin)` 216 | // format. It rarely returns an error unless you pass it an obviously fake 217 | // license identifier. 218 | // TODO: add some tests 219 | func StringToLicense(name string) (*License, error) { 220 | license := &License{ 221 | SPDX: name, 222 | } 223 | 224 | if err := license.Validate(); err == nil { 225 | return license, nil 226 | } 227 | 228 | // assume this for now... 
229 | license = &License{ 230 | //SPDX: "", 231 | Origin: "", // unknown 232 | Custom: name, 233 | } 234 | 235 | // parse the licenseName(origin) syntax 236 | ix := strings.Index(name, "(") 237 | if ix > -1 && strings.HasSuffix(name, ")") && (ix+1) < (len(name)-1) { 238 | license = &License{ 239 | //SPDX: "", 240 | Origin: name[ix+1 : len(name)-1], 241 | Custom: name[0:ix], 242 | } 243 | } 244 | 245 | lhs := strings.Count(name, "(") 246 | rhs := strings.Count(name, ")") 247 | if lhs != rhs { 248 | return nil, fmt.Errorf("unbalanced parenthesis") 249 | } 250 | if lhs != 0 && lhs != 1 { 251 | return nil, fmt.Errorf("invalid parenthesis count") 252 | } 253 | 254 | return license, nil 255 | } 256 | 257 | // StringsToLicenses converts a list of input strings and converts them into the 258 | // matching list of license structs. It accepts non-SPDX license names in the 259 | // standard SPDX format of `name(origin)`. 260 | func StringsToLicenses(inputs []string) ([]*License, error) { 261 | licenses := []*License{} 262 | 263 | for _, x := range inputs { 264 | license, err := StringToLicense(x) 265 | if err != nil { 266 | return nil, err 267 | } 268 | licenses = append(licenses, license) 269 | } 270 | 271 | return licenses, nil 272 | } 273 | 274 | // Join joins the string representations of a list of licenses with comma space. 275 | func Join(licenses []*License) string { 276 | xs := []string{} 277 | for _, license := range licenses { 278 | xs = append(xs, license.String()) 279 | } 280 | return strings.Join(xs, ", ") 281 | } 282 | 283 | // InList returns true if a license exists inside a list, otherwise false. It 284 | // uses the license Cmp method to determine equality. 285 | func InList(needle *License, haystack []*License) bool { 286 | for _, x := range haystack { 287 | if needle.Cmp(x) == nil { 288 | return true 289 | } 290 | } 291 | return false 292 | } 293 | 294 | // Union returns the union of licenses in both input lists. 
It uses the pointers 295 | // from the first list in the results. It does not try to remove duplicates so 296 | // if either list has duplicates, you may end up with duplicates in the result. 297 | // It uses the license Cmp method to determine equality. 298 | func Union(haystack1 []*License, haystack2 []*License) []*License { 299 | union := []*License{} 300 | for _, x := range haystack1 { 301 | if InList(x, haystack2) { 302 | union = append(union, x) 303 | } 304 | } 305 | return union 306 | } 307 | -------------------------------------------------------------------------------- /util/licenses/licenses_test.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 
21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package licenses_test 25 | 26 | import ( 27 | "testing" 28 | 29 | "github.com/awslabs/yesiscan/util/licenses" 30 | ) 31 | 32 | func TestValidate(t *testing.T) { 33 | license := licenses.License{ 34 | SPDX: "AGPL-3.0-or-later", 35 | } 36 | if err := license.Validate(); err != nil { 37 | t.Errorf("err: %+v", err) 38 | return 39 | } 40 | } 41 | 42 | func TestID(t *testing.T) { 43 | if _, err := licenses.ID("AGPL-3.0-or-later"); err != nil { 44 | t.Errorf("err: %+v", err) 45 | return 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /util/util.go: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com Inc or its affiliates and the project contributors 2 | // Written by James Shubin and the project contributors 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 | // use this file except in compliance with the License. You may obtain a copy of 6 | // the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 | // License for the specific language governing permissions and limitations under 14 | // the License. 15 | // 16 | // We will never require a CLA to submit a patch. All contributions follow the 17 | // `inbound == outbound` rule. 18 | // 19 | // This is not an official Amazon product. Amazon does not offer support for 20 | // this project. 21 | // 22 | // SPDX-License-Identifier: Apache-2.0 23 | 24 | package util 25 | 26 | import ( 27 | "fmt" 28 | "net/url" 29 | "sort" 30 | "strings" 31 | ) 32 | 33 | // StrInList returns true if a string exists inside a list, otherwise false. 
// StrInList returns true if a string exists inside a list, otherwise false.
func StrInList(needle string, haystack []string) bool {
	for _, x := range haystack {
		if needle == x {
			return true
		}
	}
	return false
}

// ShellHyperlinkEncode takes a string, and a uri and returns a shell encoded
// representation of a hyperlink using the modern shell escaping sequence. Idea
// from: https://purpleidea.com/blog/2018/06/29/hyperlinks-in-gnome-terminal/
func ShellHyperlinkEncode(display string, uri string) string {
	x := uri // XXX: how do we escape correctly?
	//x := url.QueryEscape(uri) // XXX: this is the wrong escaping

	return "\033]8;;" + x + "\a" + display + "\033]8;;\a"
}

// HtmlHyperlinkEncode takes a string, and a uri and returns an html
// representation of a hyperlink using the normal anchor tags.
// Fix: the previous version ignored the uri entirely and emitted no anchor
// markup at all, returning just the display string.
// NOTE(review): neither display nor uri is html-escaped here (mirroring the
// unescaped shell variant above); callers must pass trusted values, or this
// should grow html.EscapeString calls -- confirm which is intended.
func HtmlHyperlinkEncode(display string, uri string) string {
	return "<a href=\"" + uri + "\">" + display + "</a>"
}

// SmartURI returns a "smart" URI given an internal UID that we have. The UID
// is the special string that's the unique identifier that's returned from each
// backend. We convert this into a "better" URI if we can. If we can't, we just
// return the uid unchanged.
// TODO: the different helper functions that are called within could be
// provided by each backend, instead of us writing them here and assuming how
// they work.
func SmartURI(uid string) string {
	// is this a github URI?
	if s, err := smartGithubURI(uid); err == nil {
		return s
	}

	return uid
}

// smartGithubURI attempts to return a useful URI from an internal Github UID.
// If we don't detect this as a github UID, then we error.
func smartGithubURI(uid string) (string, error) {
	u, err := url.Parse(uid)
	if err != nil {
		return "", err
	}

	if u.Scheme != "git" && u.Scheme != "https" {
		return "", fmt.Errorf("invalid scheme")
	}
	u.Scheme = "https" // make it user clickable

	if u.Host != "github.com" {
		return "", fmt.Errorf("wrong hostname")
	}

	// the commit hash is carried in a ?sha1=... query parameter
	q := u.Query()
	sha1s := q["sha1"]
	if len(sha1s) != 1 {
		return "", fmt.Errorf("wrong length of sha1s")
	}
	sha1 := sha1s[0]
	if sha1 == "" {
		return "", fmt.Errorf("unknown sha1")
	}
	u.RawQuery = "" // erase it

	// path must be at least user/repo to build a blob URL
	p := strings.TrimPrefix(u.Path, "/")
	ps := strings.Split(p, "/")
	if len(ps) < 2 {
		return "", fmt.Errorf("invalid path")
	}

	u.Path = ps[0] + "/" + ps[1] + "/blob/" + sha1 + "/" + strings.Join(ps[2:], "/")

	u.RawPath = ""       // encoded path hint (see EscapedPath method)
	u.ForceQuery = false // append a query ('?') even if RawQuery is empty

	// TODO: add support for line number ranges, eg: #L13-L42 or just #L42

	u.Fragment = ""    // fragment for references, without '#'
	u.RawFragment = "" // encoded fragment hint (see EscapedFragment method)

	return u.String(), nil
}
// NamedArgsTemplate takes a format string that contains named args wrapped in
// curly brackets, and templates them in. For example, "hello {name}!" will
// turn into "hello world!" if you pass a map with "name" => "world" into it.
func NamedArgsTemplate(format string, replacements map[string]interface{}) string {
	// sort the keys so the replacer is built deterministically
	keys := make([]string, 0, len(replacements))
	for k := range replacements {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	pairs := make([]string, 0, 2*len(keys))
	for _, k := range keys {
		pairs = append(pairs, "{"+k+"}", fmt.Sprint(replacements[k]))
	}

	return strings.NewReplacer(pairs...).Replace(format)
}